1 //===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // RTL for CUDA machine 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include <algorithm> 14 #include <cassert> 15 #include <cstddef> 16 #include <cuda.h> 17 #include <list> 18 #include <memory> 19 #include <mutex> 20 #include <string> 21 #include <unordered_map> 22 #include <vector> 23 24 #include "Debug.h" 25 #include "DeviceEnvironment.h" 26 #include "omptarget.h" 27 #include "omptargetplugin.h" 28 29 #define TARGET_NAME CUDA 30 #define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL" 31 32 #include "MemoryManager.h" 33 34 #include "llvm/Frontend/OpenMP/OMPConstants.h" 35 36 // Utility for retrieving and printing CUDA error string. 
37 #ifdef OMPTARGET_DEBUG 38 #define CUDA_ERR_STRING(err) \ 39 do { \ 40 if (getDebugLevel() > 0) { \ 41 const char *errStr = nullptr; \ 42 CUresult errStr_status = cuGetErrorString(err, &errStr); \ 43 if (errStr_status == CUDA_ERROR_INVALID_VALUE) \ 44 REPORT("Unrecognized CUDA error code: %d\n", err); \ 45 else if (errStr_status == CUDA_SUCCESS) \ 46 REPORT("CUDA error is: %s\n", errStr); \ 47 else { \ 48 REPORT("Unresolved CUDA error code: %d\n", err); \ 49 REPORT("Unsuccessful cuGetErrorString return status: %d\n", \ 50 errStr_status); \ 51 } \ 52 } else { \ 53 const char *errStr = nullptr; \ 54 CUresult errStr_status = cuGetErrorString(err, &errStr); \ 55 if (errStr_status == CUDA_SUCCESS) \ 56 REPORT("%s \n", errStr); \ 57 } \ 58 } while (false) 59 #else // OMPTARGET_DEBUG 60 #define CUDA_ERR_STRING(err) \ 61 do { \ 62 const char *errStr = nullptr; \ 63 CUresult errStr_status = cuGetErrorString(err, &errStr); \ 64 if (errStr_status == CUDA_SUCCESS) \ 65 REPORT("%s \n", errStr); \ 66 } while (false) 67 #endif // OMPTARGET_DEBUG 68 69 #define BOOL2TEXT(b) ((b) ? "Yes" : "No") 70 71 #include "elf_common.h" 72 73 /// Keep entries table per device. 74 struct FuncOrGblEntryTy { 75 __tgt_target_table Table; 76 std::vector<__tgt_offload_entry> Entries; 77 }; 78 79 /// Use a single entity to encode a kernel and a set of flags. 80 struct KernelTy { 81 CUfunction Func; 82 83 // execution mode of kernel 84 llvm::omp::OMPTgtExecModeFlags ExecutionMode; 85 86 /// Maximal number of threads per block for this kernel. 
87 int MaxThreadsPerBlock = 0; 88 89 KernelTy(CUfunction _Func, llvm::omp::OMPTgtExecModeFlags _ExecutionMode) 90 : Func(_Func), ExecutionMode(_ExecutionMode) {} 91 }; 92 93 namespace { 94 bool checkResult(CUresult Err, const char *ErrMsg) { 95 if (Err == CUDA_SUCCESS) 96 return true; 97 98 REPORT("%s", ErrMsg); 99 CUDA_ERR_STRING(Err); 100 return false; 101 } 102 103 int memcpyDtoD(const void *SrcPtr, void *DstPtr, int64_t Size, 104 CUstream Stream) { 105 CUresult Err = 106 cuMemcpyDtoDAsync((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, Size, Stream); 107 108 if (Err != CUDA_SUCCESS) { 109 DP("Error when copying data from device to device. Pointers: src " 110 "= " DPxMOD ", dst = " DPxMOD ", size = %" PRId64 "\n", 111 DPxPTR(SrcPtr), DPxPTR(DstPtr), Size); 112 CUDA_ERR_STRING(Err); 113 return OFFLOAD_FAIL; 114 } 115 116 return OFFLOAD_SUCCESS; 117 } 118 119 int recordEvent(void *EventPtr, __tgt_async_info *AsyncInfo) { 120 CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue); 121 CUevent Event = reinterpret_cast<CUevent>(EventPtr); 122 123 CUresult Err = cuEventRecord(Event, Stream); 124 if (Err != CUDA_SUCCESS) { 125 DP("Error when recording event. stream = " DPxMOD ", event = " DPxMOD "\n", 126 DPxPTR(Stream), DPxPTR(Event)); 127 CUDA_ERR_STRING(Err); 128 return OFFLOAD_FAIL; 129 } 130 131 return OFFLOAD_SUCCESS; 132 } 133 134 int syncEvent(void *EventPtr) { 135 CUevent Event = reinterpret_cast<CUevent>(EventPtr); 136 137 CUresult Err = cuEventSynchronize(Event); 138 if (Err != CUDA_SUCCESS) { 139 DP("Error when syncing event = " DPxMOD "\n", DPxPTR(Event)); 140 CUDA_ERR_STRING(Err); 141 return OFFLOAD_FAIL; 142 } 143 144 return OFFLOAD_SUCCESS; 145 } 146 147 namespace { 148 149 // Structure contains per-device data 150 struct DeviceDataTy { 151 /// List that contains all the kernels. 
152 std::list<KernelTy> KernelsList; 153 154 std::list<FuncOrGblEntryTy> FuncGblEntries; 155 156 CUcontext Context = nullptr; 157 // Device properties 158 int ThreadsPerBlock = 0; 159 int BlocksPerGrid = 0; 160 int WarpSize = 0; 161 // OpenMP properties 162 int NumTeams = 0; 163 int NumThreads = 0; 164 }; 165 166 /// Resource allocator where \p T is the resource type. 167 /// Functions \p create and \p destroy return OFFLOAD_SUCCESS and OFFLOAD_FAIL 168 /// accordingly. The implementation should not raise any exception. 169 template <typename T> struct AllocatorTy { 170 AllocatorTy(CUcontext C) noexcept : Context(C) {} 171 using ElementTy = T; 172 173 virtual ~AllocatorTy() {} 174 175 /// Create a resource and assign to R. 176 virtual int create(T &R) noexcept = 0; 177 /// Destroy the resource. 178 virtual int destroy(T) noexcept = 0; 179 180 protected: 181 CUcontext Context; 182 }; 183 184 /// Allocator for CUstream. 185 struct StreamAllocatorTy final : public AllocatorTy<CUstream> { 186 StreamAllocatorTy(CUcontext C) noexcept : AllocatorTy<CUstream>(C) {} 187 188 /// See AllocatorTy<T>::create. 189 int create(CUstream &Stream) noexcept override { 190 if (!checkResult(cuCtxSetCurrent(Context), 191 "Error returned from cuCtxSetCurrent\n")) 192 return OFFLOAD_FAIL; 193 194 if (!checkResult(cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING), 195 "Error returned from cuStreamCreate\n")) 196 return OFFLOAD_FAIL; 197 198 return OFFLOAD_SUCCESS; 199 } 200 201 /// See AllocatorTy<T>::destroy. 202 int destroy(CUstream Stream) noexcept override { 203 if (!checkResult(cuCtxSetCurrent(Context), 204 "Error returned from cuCtxSetCurrent\n")) 205 return OFFLOAD_FAIL; 206 if (!checkResult(cuStreamDestroy(Stream), 207 "Error returned from cuStreamDestroy\n")) 208 return OFFLOAD_FAIL; 209 210 return OFFLOAD_SUCCESS; 211 } 212 }; 213 214 /// Allocator for CUevent. 
struct EventAllocatorTy final : public AllocatorTy<CUevent> {
  EventAllocatorTy(CUcontext C) noexcept : AllocatorTy<CUevent>(C) {}

  /// See AllocatorTy<T>::create.
  /// NOTE(review): unlike StreamAllocatorTy, no cuCtxSetCurrent(Context) is
  /// issued here — presumably the caller's current context is relied upon;
  /// confirm this is intentional.
  int create(CUevent &Event) noexcept override {
    if (!checkResult(cuEventCreate(&Event, CU_EVENT_DEFAULT),
                     "Error returned from cuEventCreate\n"))
      return OFFLOAD_FAIL;

    return OFFLOAD_SUCCESS;
  }

  /// See AllocatorTy<T>::destroy.
  int destroy(CUevent Event) noexcept override {
    if (!checkResult(cuEventDestroy(Event),
                     "Error returned from cuEventDestroy\n"))
      return OFFLOAD_FAIL;

    return OFFLOAD_SUCCESS;
  }
};

/// A generic pool of resources where \p T is the resource type.
/// \p T should be copyable as the object is stored in \p std::vector .
template <typename AllocTy> class ResourcePoolTy {
  using ElementTy = typename AllocTy::ElementTy;
  /// Index of the next available resource.
  size_t Next = 0;
  /// Mutex to guard the pool.
  std::mutex Mutex;
  /// Pool of resources.
  std::vector<ElementTy> Resources;
  /// The allocator used to create/destroy pool elements.
  AllocTy Allocator;

  /// If `Resources` is used up, we will fill in more resources. It assumes
  /// that the new size `Size` should be always larger than the current size.
  /// Returns false (leaving the pool partially grown) if any single
  /// allocation fails.
  bool resize(size_t Size) {
    auto CurSize = Resources.size();
    assert(Size > CurSize && "Unexpected smaller size");
    Resources.reserve(Size);
    for (auto I = CurSize; I < Size; ++I) {
      ElementTy NewItem;
      int Ret = Allocator.create(NewItem);
      if (Ret != OFFLOAD_SUCCESS)
        return false;
      Resources.push_back(NewItem);
    }
    return true;
  }

public:
  /// Take ownership of \p A and optionally pre-allocate \p Size resources.
  /// A pre-allocation failure is ignored here; acquire() will retry.
  ResourcePoolTy(AllocTy &&A, size_t Size = 0) noexcept
      : Allocator(std::move(A)) {
    if (Size)
      (void)resize(Size);
  }

  ~ResourcePoolTy() noexcept { clear(); }

  /// Get a resource from the pool. `Next` always points to the next
  /// available resource: `[0, Next-1]` have been handed out, `[Next, size)`
  /// are still available. If nothing is left the pool doubles (1 if empty).
  /// Each successful acquisition advances `Next` by one.
  int acquire(ElementTy &R) noexcept {
    std::lock_guard<std::mutex> LG(Mutex);
    if (Next == Resources.size()) {
      auto NewSize = Resources.size() ? Resources.size() * 2 : 1;
      if (!resize(NewSize))
        return OFFLOAD_FAIL;
    }

    assert(Next < Resources.size());

    R = Resources[Next++];

    return OFFLOAD_SUCCESS;
  }

  /// Return the resource back to the pool by decrementing `Next` and storing
  /// \p R there. Resources may come back in a different order than they were
  /// handed out, so the pool may transiently hold duplicates on either side
  /// of `Next`; the stale copy left of `Next` is overwritten on a later
  /// release. NOTE(review): assumes releases are balanced with acquires —
  /// `--Next` underflows otherwise.
  void release(ElementTy R) noexcept {
    std::lock_guard<std::mutex> LG(Mutex);
    Resources[--Next] = R;
  }

  /// Destroy all stored resources and clear the pool.
  /// Note: This function is not thread safe. Be sure to guard it if necessary.
  void clear() noexcept {
    for (auto &R : Resources)
      (void)Allocator.destroy(R);
    Resources.clear();
  }
};

} // namespace

/// Top-level plugin state: per-device data, resource pools, and the
/// environment-derived launch defaults.
class DeviceRTLTy {
  int NumberOfDevices;
  // OpenMP environment properties (-1 means "not set").
  int EnvNumTeams;
  int EnvTeamLimit;
  int EnvTeamThreadLimit;
  // OpenMP requires flags
  int64_t RequiresFlags;
  // Amount of dynamic shared memory to use at launch.
  uint64_t DynamicMemorySize;

  /// Number of initial streams for each device.
  int NumInitialStreams = 32;

  /// Number of initial events for each device.
  int NumInitialEvents = 8;

  static constexpr const int32_t HardThreadLimit = 1024;
  static constexpr const int32_t DefaultNumTeams = 128;
  static constexpr const int32_t DefaultNumThreads = 128;

  using StreamPoolTy = ResourcePoolTy<StreamAllocatorTy>;
  std::vector<std::unique_ptr<StreamPoolTy>> StreamPool;

  using EventPoolTy = ResourcePoolTy<EventAllocatorTy>;
  std::vector<std::unique_ptr<EventPoolTy>> EventPool;

  std::vector<DeviceDataTy> DeviceData;
  std::vector<CUmodule> Modules;

  /// Vector of flags indicating the initialization status of all associated
  /// devices.
  std::vector<bool> InitializedFlags;

  /// A class responsible for interacting with device native runtime library to
  /// allocate and free memory.
  class CUDADeviceAllocatorTy : public DeviceAllocatorTy {
    const int DeviceId;
    const std::vector<DeviceDataTy> &DeviceData;
    /// Tracks host-pinned allocations so free() can route them to
    /// cuMemFreeHost instead of cuMemFree.
    std::unordered_map<void *, TargetAllocTy> HostPinnedAllocs;

  public:
    CUDADeviceAllocatorTy(int DeviceId, std::vector<DeviceDataTy> &DeviceData)
        : DeviceId(DeviceId), DeviceData(DeviceData) {}

    /// Allocate \p Size bytes of kind \p Kind on this device (or pinned/
    /// managed host memory). Returns nullptr on failure or when Size == 0.
    void *allocate(size_t Size, void *, TargetAllocTy Kind) override {
      if (Size == 0)
        return nullptr;

      CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
      if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
        return nullptr;

      void *MemAlloc = nullptr;
      switch (Kind) {
      case TARGET_ALLOC_DEFAULT:
      case TARGET_ALLOC_DEVICE:
        CUdeviceptr DevicePtr;
        Err = cuMemAlloc(&DevicePtr, Size);
        MemAlloc = (void *)DevicePtr;
        if (!checkResult(Err, "Error returned from cuMemAlloc\n"))
          return nullptr;
        break;
      case TARGET_ALLOC_HOST:
        void *HostPtr;
        Err = cuMemAllocHost(&HostPtr, Size);
        MemAlloc = HostPtr;
        if (!checkResult(Err, "Error returned from cuMemAllocHost\n"))
          return nullptr;
        // Remember this pointer so free() uses cuMemFreeHost for it.
        HostPinnedAllocs[MemAlloc] = Kind;
        break;
      case TARGET_ALLOC_SHARED:
        CUdeviceptr SharedPtr;
        Err = cuMemAllocManaged(&SharedPtr, Size, CU_MEM_ATTACH_GLOBAL);
        MemAlloc = (void *)SharedPtr;
        if (!checkResult(Err, "Error returned from cuMemAllocManaged\n"))
          return nullptr;
        break;
      }

      return MemAlloc;
    }

    /// Free \p TgtPtr, dispatching on whether it was host-pinned.
    int free(void *TgtPtr) override {
      CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
      if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
        return OFFLOAD_FAIL;

      // Host pinned memory must be freed differently.
      TargetAllocTy Kind =
          (HostPinnedAllocs.find(TgtPtr) == HostPinnedAllocs.end())
              ? TARGET_ALLOC_DEFAULT
              : TARGET_ALLOC_HOST;
      switch (Kind) {
      case TARGET_ALLOC_DEFAULT:
      case TARGET_ALLOC_DEVICE:
      case TARGET_ALLOC_SHARED:
        // Device and managed allocations are both released via cuMemFree.
        Err = cuMemFree((CUdeviceptr)TgtPtr);
        if (!checkResult(Err, "Error returned from cuMemFree\n"))
          return OFFLOAD_FAIL;
        break;
      case TARGET_ALLOC_HOST:
        Err = cuMemFreeHost(TgtPtr);
        if (!checkResult(Err, "Error returned from cuMemFreeHost\n"))
          return OFFLOAD_FAIL;
        break;
      }

      return OFFLOAD_SUCCESS;
    }
  };

  /// A vector of device allocators
  std::vector<CUDADeviceAllocatorTy> DeviceAllocators;

  /// A vector of memory managers. Since the memory manager is non-copyable and
  // non-removable, we wrap them into std::unique_ptr.
  std::vector<std::unique_ptr<MemoryManagerTy>> MemoryManagers;

  /// Whether use memory manager
  bool UseMemoryManager = true;

  // Record entry point associated with device
  void addOffloadEntry(const int DeviceId, const __tgt_offload_entry entry) {
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
    E.Entries.push_back(entry);
  }

  // Return a pointer to the entry associated with the pointer, or nullptr if
  // it is not present in the most recent entries table.
  const __tgt_offload_entry *getOffloadEntry(const int DeviceId,
                                             const void *Addr) const {
    for (const __tgt_offload_entry &Itr :
         DeviceData[DeviceId].FuncGblEntries.back().Entries)
      if (Itr.addr == Addr)
        return &Itr;

    return nullptr;
  }

  // Return the pointer to the target entries table (nullptr when empty).
  __tgt_target_table *getOffloadEntriesTable(const int DeviceId) {
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();

    if (E.Entries.empty())
      return nullptr;

    // Update table info according to the entries and return the pointer
    E.Table.EntriesBegin = E.Entries.data();
    E.Table.EntriesEnd = E.Entries.data() + E.Entries.size();

    return &E.Table;
  }

  // Clear entries table for a device
  void
  clearOffloadEntriesTable(const int DeviceId) {
    // Start a fresh entries table; previous tables stay alive in the list so
    // earlier pointers handed to libomptarget remain valid.
    DeviceData[DeviceId].FuncGblEntries.emplace_back();
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
    E.Entries.clear();
    E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr;
  }

public:
  /// Return the stream bound to \p AsyncInfo, lazily acquiring one from the
  /// device's stream pool on first use. Returns nullptr if acquisition fails.
  CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfo) const {
    assert(AsyncInfo && "AsyncInfo is nullptr");

    if (!AsyncInfo->Queue) {
      CUstream S;
      if (StreamPool[DeviceId]->acquire(S) != OFFLOAD_SUCCESS)
        return nullptr;

      AsyncInfo->Queue = S;
    }

    return reinterpret_cast<CUstream>(AsyncInfo->Queue);
  }

  // This class should not be copied
  DeviceRTLTy(const DeviceRTLTy &) = delete;
  DeviceRTLTy(DeviceRTLTy &&) = delete;

  /// Initialize the CUDA driver, count devices, size the per-device vectors,
  /// and parse launch-related environment variables. On any early failure
  /// NumberOfDevices stays 0 and the plugin reports no devices.
  DeviceRTLTy()
      : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1),
        EnvTeamThreadLimit(-1), RequiresFlags(OMP_REQ_UNDEFINED),
        DynamicMemorySize(0) {

    DP("Start initializing CUDA\n");

    CUresult Err = cuInit(0);
    if (Err == CUDA_ERROR_INVALID_HANDLE) {
      // Can't call cuGetErrorString if dlsym failed
      DP("Failed to load CUDA shared library\n");
      return;
    }
    if (!checkResult(Err, "Error returned from cuInit\n")) {
      return;
    }

    Err = cuDeviceGetCount(&NumberOfDevices);
    if (!checkResult(Err, "Error returned from cuDeviceGetCount\n"))
      return;

    if (NumberOfDevices == 0) {
      DP("There are no devices supporting CUDA.\n");
      return;
    }

    DeviceData.resize(NumberOfDevices);
    StreamPool.resize(NumberOfDevices);
    EventPool.resize(NumberOfDevices);

    // Get environment variables regarding teams
    if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) {
      // OMP_TEAM_LIMIT has been set
      EnvTeamLimit = std::stoi(EnvStr);
      DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
    }
    if (const char *EnvStr = getenv("OMP_TEAMS_THREAD_LIMIT")) {
      // OMP_TEAMS_THREAD_LIMIT has been set
      EnvTeamThreadLimit = std::stoi(EnvStr);
      DP("Parsed OMP_TEAMS_THREAD_LIMIT=%d\n", EnvTeamThreadLimit);
    }
    if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) {
      // OMP_NUM_TEAMS has been set
      EnvNumTeams = std::stoi(EnvStr);
      DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
    }
    if (const char *EnvStr = getenv("LIBOMPTARGET_SHARED_MEMORY_SIZE")) {
      // LIBOMPTARGET_SHARED_MEMORY_SIZE has been set
      DynamicMemorySize = std::stoi(EnvStr);
      DP("Parsed LIBOMPTARGET_SHARED_MEMORY_SIZE = %" PRIu64 "\n",
         DynamicMemorySize);
    }
    if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS")) {
      // LIBOMPTARGET_NUM_INITIAL_STREAMS has been set
      NumInitialStreams = std::stoi(EnvStr);
      DP("Parsed LIBOMPTARGET_NUM_INITIAL_STREAMS=%d\n", NumInitialStreams);
    }

    for (int I = 0; I < NumberOfDevices; ++I)
      DeviceAllocators.emplace_back(I, DeviceData);

    // Get the size threshold from environment variable
    std::pair<size_t, bool> Res = MemoryManagerTy::getSizeThresholdFromEnv();
    UseMemoryManager = Res.second;
    size_t MemoryManagerThreshold = Res.first;

    if (UseMemoryManager)
      for (int I = 0; I < NumberOfDevices; ++I)
        MemoryManagers.emplace_back(std::make_unique<MemoryManagerTy>(
            DeviceAllocators[I], MemoryManagerThreshold));

    // We lazily initialize all devices later.
    InitializedFlags.assign(NumberOfDevices, false);
  }

  ~DeviceRTLTy() {
    // deinitDevice is a no-op for devices that were never initialized.
    for (int DeviceId = 0; DeviceId < NumberOfDevices; ++DeviceId)
      deinitDevice(DeviceId);
  }

  // Check whether a given DeviceId is valid
  bool isValidDeviceId(const int DeviceId) const {
    return DeviceId >= 0 && DeviceId < NumberOfDevices;
  }

  int getNumOfDevices() const { return NumberOfDevices; }

  void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; }

  /// Initialize device \p DeviceId: retain its primary context, create the
  /// stream/event pools, and derive launch limits (blocks/grid, threads/
  /// block, warp size) from device attributes and environment variables.
  /// Returns OFFLOAD_SUCCESS/OFFLOAD_FAIL. Must be called at most once per
  /// device (asserted below).
  int initDevice(const int DeviceId) {
    CUdevice Device;

    DP("Getting device %d\n", DeviceId);
    CUresult Err = cuDeviceGet(&Device, DeviceId);
    if (!checkResult(Err, "Error returned from cuDeviceGet\n"))
      return OFFLOAD_FAIL;

    assert(InitializedFlags[DeviceId] == false && "Reinitializing device!");
    InitializedFlags[DeviceId] = true;

    // Query the current flags of the primary context and set its flags if
    // it is inactive
    unsigned int FormerPrimaryCtxFlags = 0;
    int FormerPrimaryCtxIsActive = 0;
    Err = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags,
                                     &FormerPrimaryCtxIsActive);
    if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxGetState\n"))
      return OFFLOAD_FAIL;

    if (FormerPrimaryCtxIsActive) {
      DP("The primary context is active, no change to its flags\n");
      if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) !=
          CU_CTX_SCHED_BLOCKING_SYNC)
        DP("Warning the current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n");
    } else {
      DP("The primary context is inactive, set its flags to "
         "CU_CTX_SCHED_BLOCKING_SYNC\n");
      Err = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC);
      if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxSetFlags\n"))
        return OFFLOAD_FAIL;
    }

    // Retain the per device primary context and save it to use whenever this
    // device is selected.
    Err = cuDevicePrimaryCtxRetain(&DeviceData[DeviceId].Context, Device);
    if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxRetain\n"))
      return OFFLOAD_FAIL;

    Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    // Initialize the stream pool.
    if (!StreamPool[DeviceId])
      StreamPool[DeviceId] = std::make_unique<StreamPoolTy>(
          StreamAllocatorTy(DeviceData[DeviceId].Context), NumInitialStreams);

    // Initialize the event pool.
    if (!EventPool[DeviceId])
      EventPool[DeviceId] = std::make_unique<EventPoolTy>(
          EventAllocatorTy(DeviceData[DeviceId].Context), NumInitialEvents);

    // Query attributes to determine number of threads/block and blocks/grid.
    int MaxGridDimX;
    Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
                               Device);
    if (Err != CUDA_SUCCESS) {
      DP("Error getting max grid dimension, use default value %d\n",
         DeviceRTLTy::DefaultNumTeams);
      DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::DefaultNumTeams;
    } else {
      DP("Using %d CUDA blocks per grid\n", MaxGridDimX);
      DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX;
    }

    // We are only exploiting threads along the x axis.
    int MaxBlockDimX;
    Err = cuDeviceGetAttribute(&MaxBlockDimX,
                               CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device);
    if (Err != CUDA_SUCCESS) {
      DP("Error getting max block dimension, use default value %d\n",
         DeviceRTLTy::DefaultNumThreads);
      DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads;
    } else {
      DP("Using %d CUDA threads per block\n", MaxBlockDimX);
      DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX;

      // Cap by the user-requested limit first, then the hard limit.
      if (EnvTeamThreadLimit > 0 &&
          DeviceData[DeviceId].ThreadsPerBlock > EnvTeamThreadLimit) {
        DP("Max CUDA threads per block %d exceeds the thread limit %d set by "
           "OMP_TEAMS_THREAD_LIMIT, capping at the limit\n",
           DeviceData[DeviceId].ThreadsPerBlock, EnvTeamThreadLimit);
        DeviceData[DeviceId].ThreadsPerBlock = EnvTeamThreadLimit;
      }
      if (DeviceData[DeviceId].ThreadsPerBlock > DeviceRTLTy::HardThreadLimit) {
        DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
           "capping at the hard limit\n",
           DeviceData[DeviceId].ThreadsPerBlock, DeviceRTLTy::HardThreadLimit);
        DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit;
      }
    }

    // Get and set warp size
    int WarpSize;
    Err =
        cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device);
    if (Err != CUDA_SUCCESS) {
      DP("Error getting warp size, assume default value 32\n");
      DeviceData[DeviceId].WarpSize = 32;
    } else {
      DP("Using warp size %d\n", WarpSize);
      DeviceData[DeviceId].WarpSize = WarpSize;
    }

    // Adjust teams to the env variables
    if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) {
      DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
         EnvTeamLimit);
      DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit;
    }

    // Apply user-provided stack/heap limits, or read back the defaults so
    // they can be reported below.
    size_t StackLimit;
    size_t HeapLimit;
    if (const char *EnvStr = getenv("LIBOMPTARGET_STACK_SIZE")) {
      StackLimit = std::stol(EnvStr);
      if
      (cuCtxSetLimit(CU_LIMIT_STACK_SIZE, StackLimit) != CUDA_SUCCESS)
        return OFFLOAD_FAIL;
    } else {
      if (cuCtxGetLimit(&StackLimit, CU_LIMIT_STACK_SIZE) != CUDA_SUCCESS)
        return OFFLOAD_FAIL;
    }
    if (const char *EnvStr = getenv("LIBOMPTARGET_HEAP_SIZE")) {
      HeapLimit = std::stol(EnvStr);
      if (cuCtxSetLimit(CU_LIMIT_MALLOC_HEAP_SIZE, HeapLimit) != CUDA_SUCCESS)
        return OFFLOAD_FAIL;
    } else {
      if (cuCtxGetLimit(&HeapLimit, CU_LIMIT_MALLOC_HEAP_SIZE) != CUDA_SUCCESS)
        return OFFLOAD_FAIL;
    }

    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
         "Device supports up to %d CUDA blocks and %d threads with a "
         "warp size of %d\n",
         DeviceData[DeviceId].BlocksPerGrid,
         DeviceData[DeviceId].ThreadsPerBlock, DeviceData[DeviceId].WarpSize);
    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
         "Device heap size is %d Bytes, device stack size is %d Bytes per "
         "thread\n",
         (int)HeapLimit, (int)StackLimit);

    // Set default number of teams
    if (EnvNumTeams > 0) {
      DP("Default number of teams set according to environment %d\n",
         EnvNumTeams);
      DeviceData[DeviceId].NumTeams = EnvNumTeams;
    } else {
      DeviceData[DeviceId].NumTeams = DeviceRTLTy::DefaultNumTeams;
      DP("Default number of teams set according to library's default %d\n",
         DeviceRTLTy::DefaultNumTeams);
    }

    if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) {
      DP("Default number of teams exceeds device limit, capping at %d\n",
         DeviceData[DeviceId].BlocksPerGrid);
      DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid;
    }

    // Set default number of threads
    DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads;
    DP("Default number of threads set according to library's default %d\n",
       DeviceRTLTy::DefaultNumThreads);
    if (DeviceData[DeviceId].NumThreads >
        DeviceData[DeviceId].ThreadsPerBlock) {
      DP("Default number of threads exceeds device limit, capping at %d\n",
         DeviceData[DeviceId].ThreadsPerBlock);
      DeviceData[DeviceId].NumThreads = DeviceData[DeviceId].ThreadsPerBlock;
    }

    return OFFLOAD_SUCCESS;
  }

  /// Tear down device \p DeviceId: release the memory manager, unload its
  /// module, drain the stream/event pools, and release the primary context.
  /// Safe to call on never-initialized devices (returns immediately).
  int deinitDevice(const int DeviceId) {
    auto IsInitialized = InitializedFlags[DeviceId];
    if (!IsInitialized)
      return OFFLOAD_SUCCESS;
    InitializedFlags[DeviceId] = false;

    if (UseMemoryManager)
      MemoryManagers[DeviceId].release();

    // Close module
    // NOTE(review): Modules grows by push_back in loadBinary, so indexing it
    // by DeviceId assumes one image loaded per device in id order — confirm.
    if (CUmodule &M = Modules[DeviceId])
      checkResult(cuModuleUnload(M), "Error returned from cuModuleUnload\n");

    StreamPool[DeviceId].reset();
    EventPool[DeviceId].reset();

    // Destroy context
    DeviceDataTy &D = DeviceData[DeviceId];
    if (D.Context) {
      if (checkResult(cuCtxSetCurrent(D.Context),
                      "Error returned from cuCtxSetCurrent\n")) {
        CUdevice Device;
        if (checkResult(cuCtxGetDevice(&Device),
                        "Error returned from cuCtxGetDevice\n"))
          checkResult(cuDevicePrimaryCtxRelease(Device),
                      "Error returned from cuDevicePrimaryCtxRelease\n");
      }
    }
    return OFFLOAD_SUCCESS;
  }

  /// Load a device image on \p DeviceId: create a CUmodule from it and build
  /// the offload entries table (globals and kernels) by symbol lookup.
  /// Returns the table, or nullptr on any failure.
  __tgt_target_table *loadBinary(const int DeviceId,
                                 const __tgt_device_image *Image) {
    // Set the context we are using
    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return nullptr;

    // Clear the offload table as we are going to create a new one.
    clearOffloadEntriesTable(DeviceId);

    // Create the module and extract the function pointers.
    CUmodule Module;
    DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart));
    Err = cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr);
    if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n"))
      return nullptr;

    DP("CUDA module successfully loaded!\n");

    Modules.push_back(Module);

    // Find the symbols in the module by name.
823 const __tgt_offload_entry *HostBegin = Image->EntriesBegin; 824 const __tgt_offload_entry *HostEnd = Image->EntriesEnd; 825 826 std::list<KernelTy> &KernelsList = DeviceData[DeviceId].KernelsList; 827 for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) { 828 if (!E->addr) { 829 // We return nullptr when something like this happens, the host should 830 // have always something in the address to uniquely identify the target 831 // region. 832 DP("Invalid binary: host entry '<null>' (size = %zd)...\n", E->size); 833 return nullptr; 834 } 835 836 if (E->size) { 837 __tgt_offload_entry Entry = *E; 838 CUdeviceptr CUPtr; 839 size_t CUSize; 840 Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name); 841 // We keep this style here because we need the name 842 if (Err != CUDA_SUCCESS) { 843 REPORT("Loading global '%s' Failed\n", E->name); 844 CUDA_ERR_STRING(Err); 845 return nullptr; 846 } 847 848 if (CUSize != E->size) { 849 DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name, 850 CUSize, E->size); 851 return nullptr; 852 } 853 854 DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n", 855 DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr)); 856 857 Entry.addr = (void *)(CUPtr); 858 859 // Note: In the current implementation declare target variables 860 // can either be link or to. This means that once unified 861 // memory is activated via the requires directive, the variable 862 // can be used directly from the host in both cases. 863 // TODO: when variables types other than to or link are added, 864 // the below condition should be changed to explicitly 865 // check for to and link variables types: 866 // (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags & 867 // OMP_DECLARE_TARGET_LINK || e->flags == OMP_DECLARE_TARGET_TO)) 868 if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) { 869 // If unified memory is present any target link or to variables 870 // can access host addresses directly. 
There is no longer a
          // need for device copies.
          // NOTE(review): the result of this cuMemcpyHtoD is not checked —
          // a failed patch of the link pointer would go unnoticed.
          cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *));
          DP("Copy linked variable host address (" DPxMOD
             ") to device address (" DPxMOD ")\n",
             DPxPTR(*((void **)E->addr)), DPxPTR(CUPtr));
        }

        addOffloadEntry(DeviceId, Entry);

        continue;
      }

      // Not a global with storage: treat the entry as a kernel and resolve
      // its CUfunction in the freshly loaded module.
      CUfunction Func;
      Err = cuModuleGetFunction(&Func, Module, E->name);
      // We keep this style here because we need the name
      if (Err != CUDA_SUCCESS) {
        REPORT("Loading '%s' Failed\n", E->name);
        CUDA_ERR_STRING(Err);
        return nullptr;
      }

      DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
         DPxPTR(E - HostBegin), E->name, DPxPTR(Func));

      // default value GENERIC (in case symbol is missing from cubin file)
      // NOTE(review): ExecModeVal is declared without an initializer; in the
      // symbol-missing branch below it is stored into KernelsList while still
      // uninitialized, despite this comment promising a GENERIC default —
      // confirm whether it should be initialized to OMP_TGT_EXEC_MODE_GENERIC.
      llvm::omp::OMPTgtExecModeFlags ExecModeVal;
      // Each kernel publishes its execution mode in a companion global named
      // "<kernel>_exec_mode".
      std::string ExecModeNameStr(E->name);
      ExecModeNameStr += "_exec_mode";
      const char *ExecModeName = ExecModeNameStr.c_str();

      CUdeviceptr ExecModePtr;
      size_t CUSize;
      Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName);
      if (Err == CUDA_SUCCESS) {
        if (CUSize != sizeof(llvm::omp::OMPTgtExecModeFlags)) {
          DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
             ExecModeName, CUSize, sizeof(llvm::omp::OMPTgtExecModeFlags));
          return nullptr;
        }

        Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize);
        if (Err != CUDA_SUCCESS) {
          REPORT("Error when copying data from device to host. Pointers: "
                 "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
                 DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize);
          CUDA_ERR_STRING(Err);
          return nullptr;
        }
      } else {
        DP("Loading global exec_mode '%s' - symbol missing, using default "
           "value GENERIC (1)\n",
           ExecModeName);
      }

      // Record the kernel; the offload entry's addr is redirected to our
      // KernelTy record so later launches can find Func + ExecutionMode.
      KernelsList.emplace_back(Func, ExecModeVal);

      __tgt_offload_entry Entry = *E;
      Entry.addr = &KernelsList.back();
      addOffloadEntry(DeviceId, Entry);
    }

    // send device environment data to the device
    {
      // TODO: The device ID used here is not the real device ID used by OpenMP.
      DeviceEnvironmentTy DeviceEnv{0, static_cast<uint32_t>(NumberOfDevices),
                                    static_cast<uint32_t>(DeviceId),
                                    static_cast<uint32_t>(DynamicMemorySize)};

      if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG"))
        DeviceEnv.DebugKind = std::stoi(EnvStr);

      const char *DeviceEnvName = "omptarget_device_environment";
      CUdeviceptr DeviceEnvPtr;
      size_t CUSize;

      Err = cuModuleGetGlobal(&DeviceEnvPtr, &CUSize, Module, DeviceEnvName);
      if (Err == CUDA_SUCCESS) {
        if (CUSize != sizeof(DeviceEnv)) {
          // NOTE(review): the message formats sizeof(int32_t) although the
          // comparison above uses sizeof(DeviceEnv), and CUDA_ERR_STRING is
          // invoked while Err == CUDA_SUCCESS — both look like defects.
          REPORT(
              "Global device_environment '%s' - size mismatch (%zu != %zu)\n",
              DeviceEnvName, CUSize, sizeof(int32_t));
          CUDA_ERR_STRING(Err);
          return nullptr;
        }

        Err = cuMemcpyHtoD(DeviceEnvPtr, &DeviceEnv, CUSize);
        if (Err != CUDA_SUCCESS) {
          REPORT("Error when copying data from host to device. Pointers: "
                 "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
                 DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize);
          CUDA_ERR_STRING(Err);
          return nullptr;
        }

        DP("Sending global device environment data %zu bytes\n", CUSize);
      } else {
        // Symbol missing is tolerated: older device RTLs have no environment.
        DP("Finding global device environment '%s' - symbol missing.\n",
           DeviceEnvName);
        DP("Continue, considering this is a device RTL which does not accept "
           "environment setting.\n");
      }
    }

    return getOffloadEntriesTable(DeviceId);
  }

  // Allocate Size bytes on DeviceId. For default/device allocations the
  // pooled memory manager is used when enabled; host/shared allocations
  // always go straight to the per-device allocator.
  void *dataAlloc(const int DeviceId, const int64_t Size,
                  const TargetAllocTy Kind) {
    switch (Kind) {
    case TARGET_ALLOC_DEFAULT:
    case TARGET_ALLOC_DEVICE:
      if (UseMemoryManager)
        return MemoryManagers[DeviceId]->allocate(Size, nullptr);
      else
        return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
    case TARGET_ALLOC_HOST:
    case TARGET_ALLOC_SHARED:
      return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
    }

    REPORT("Invalid target data allocation kind or requested allocator not "
           "implemented yet\n");

    return nullptr;
  }

  // Asynchronously copy Size bytes host->device on the stream attached to
  // AsyncInfo. Returns OFFLOAD_SUCCESS/OFFLOAD_FAIL; the copy may still be
  // in flight until the stream is synchronized.
  int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr,
                 const int64_t Size, __tgt_async_info *AsyncInfo) const {
    assert(AsyncInfo && "AsyncInfo is nullptr");

    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    CUstream Stream = getStream(DeviceId, AsyncInfo);

    Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream);
    if (Err != CUDA_SUCCESS) {
      DP("Error when copying data from host to device. 
Pointers: host "
         "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
         DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
      CUDA_ERR_STRING(Err);
      return OFFLOAD_FAIL;
    }

    return OFFLOAD_SUCCESS;
  }

  // Asynchronously copy Size bytes device->host on the stream attached to
  // AsyncInfo. Mirror image of dataSubmit.
  int dataRetrieve(const int DeviceId, void *HstPtr, const void *TgtPtr,
                   const int64_t Size, __tgt_async_info *AsyncInfo) const {
    assert(AsyncInfo && "AsyncInfo is nullptr");

    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    CUstream Stream = getStream(DeviceId, AsyncInfo);

    Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
    if (Err != CUDA_SUCCESS) {
      DP("Error when copying data from device to host. Pointers: host "
         "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
         DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
      CUDA_ERR_STRING(Err);
      return OFFLOAD_FAIL;
    }

    return OFFLOAD_SUCCESS;
  }

  // Copy Size bytes between two devices. Prefers peer-to-peer
  // (cuMemcpyPeerAsync) when both devices can access each other; any failure
  // along the P2P path degrades gracefully to a staged device-to-device copy
  // via memcpyDtoD rather than failing the exchange.
  int dataExchange(int SrcDevId, const void *SrcPtr, int DstDevId, void *DstPtr,
                   int64_t Size, __tgt_async_info *AsyncInfo) const {
    assert(AsyncInfo && "AsyncInfo is nullptr");

    CUresult Err = cuCtxSetCurrent(DeviceData[SrcDevId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    CUstream Stream = getStream(SrcDevId, AsyncInfo);

    // If they are two devices, we try peer to peer copy first
    if (SrcDevId != DstDevId) {
      int CanAccessPeer = 0;
      Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId);
      if (Err != CUDA_SUCCESS) {
        REPORT("Error returned from cuDeviceCanAccessPeer. src = %" PRId32
               ", dst = %" PRId32 "\n",
               SrcDevId, DstDevId);
        CUDA_ERR_STRING(Err);
        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
      }

      if (!CanAccessPeer) {
        DP("P2P memcpy not supported so fall back to D2D memcpy");
        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
      }

      Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0);
      if (Err != CUDA_SUCCESS) {
        REPORT("Error returned from cuCtxEnablePeerAccess. src = %" PRId32
               ", dst = %" PRId32 "\n",
               SrcDevId, DstDevId);
        CUDA_ERR_STRING(Err);
        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
      }

      Err = cuMemcpyPeerAsync((CUdeviceptr)DstPtr, DeviceData[DstDevId].Context,
                              (CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context,
                              Size, Stream);
      if (Err == CUDA_SUCCESS)
        return OFFLOAD_SUCCESS;

      DP("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD
         ", src_id =%" PRId32 ", dst_ptr = " DPxMOD ", dst_id =%" PRId32 "\n",
         DPxPTR(SrcPtr), SrcDevId, DPxPTR(DstPtr), DstDevId);
      CUDA_ERR_STRING(Err);
    }

    return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
  }

  // Free a previous dataAlloc; routed through the same allocator (memory
  // manager vs. raw device allocator) that handed the pointer out.
  int dataDelete(const int DeviceId, void *TgtPtr) {
    if (UseMemoryManager)
      return MemoryManagers[DeviceId]->free(TgtPtr);

    return DeviceAllocators[DeviceId].free(TgtPtr);
  }

  // Launch the kernel described by TgtEntryPtr (a KernelTy*) with ArgNum
  // arguments. TeamNum/ThreadLimit <= 0 mean "pick a default"; LoopTripCount
  // (when known) is used to size the grid for combined constructs.
  int runTargetTeamRegion(const int DeviceId, void *TgtEntryPtr, void **TgtArgs,
                          ptrdiff_t *TgtOffsets, const int ArgNum,
                          const int TeamNum, const int ThreadLimit,
                          const unsigned int LoopTripCount,
                          __tgt_async_info *AsyncInfo) const {
    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    // All args are references.
    // cuLaunchKernel expects an array of pointers to the argument values, so
    // materialize each offset-adjusted pointer in Ptrs and point Args at it.
    std::vector<void *> Args(ArgNum);
    std::vector<void *> Ptrs(ArgNum);

    for (int I = 0; I < ArgNum; ++I) {
      Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
      Args[I] = &Ptrs[I];
    }

    KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr);

    const bool IsSPMDGenericMode =
        KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD;
    const bool IsSPMDMode =
        KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_SPMD;
    const bool IsGenericMode =
        KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC;

    // Pick the block size: explicit thread_limit wins, otherwise the device
    // default; then clamp to the device and per-kernel hardware limits.
    int CudaThreadsPerBlock;
    if (ThreadLimit > 0) {
      DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
      CudaThreadsPerBlock = ThreadLimit;
      // Add master warp if necessary
      if (IsGenericMode) {
        DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize);
        CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
      }
    } else {
      DP("Setting CUDA threads per block to default %d\n",
         DeviceData[DeviceId].NumThreads);
      CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads;
    }

    if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) {
      DP("Threads per block capped at device limit %d\n",
         DeviceData[DeviceId].ThreadsPerBlock);
      CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock;
    }

    // Lazily cache the kernel's own max-threads attribute (0 means unqueried).
    if (!KernelInfo->MaxThreadsPerBlock) {
      Err = cuFuncGetAttribute(&KernelInfo->MaxThreadsPerBlock,
                               CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                               KernelInfo->Func);
      if (!checkResult(Err, "Error returned from cuFuncGetAttribute\n"))
        return OFFLOAD_FAIL;
    }

    if (KernelInfo->MaxThreadsPerBlock < CudaThreadsPerBlock) {
      DP("Threads per block capped at kernel limit %d\n",
         KernelInfo->MaxThreadsPerBlock);
      CudaThreadsPerBlock = KernelInfo->MaxThreadsPerBlock;
    }

    // Pick the grid size: explicit team_num wins; otherwise derive it from
    // the loop trip count (unless LIBOMPTARGET_NUM_TEAMS overrides via
    // EnvNumTeams); finally clamp to the device limit.
    unsigned int CudaBlocksPerGrid;
    if (TeamNum <= 0) {
      if (LoopTripCount > 0 && EnvNumTeams < 0) {
        if (IsSPMDGenericMode) {
          // If we reach this point, then we are executing a kernel that was
          // transformed from Generic-mode to SPMD-mode. This kernel has
          // SPMD-mode execution, but needs its blocks to be scheduled
          // differently because the current loop trip count only applies to the
          // `teams distribute` region and will create way too few blocks using
          // the regular SPMD-mode method.
          CudaBlocksPerGrid = LoopTripCount;
        } else if (IsSPMDMode) {
          // We have a combined construct, i.e. `target teams distribute
          // parallel for [simd]`. We launch so many teams so that each thread
          // will execute one iteration of the loop. round up to the nearest
          // integer
          CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1;
        } else if (IsGenericMode) {
          // If we reach this point, then we have a non-combined construct, i.e.
          // `teams distribute` with a nested `parallel for` and each team is
          // assigned one iteration of the `distribute` loop. E.g.:
          //
          // #pragma omp target teams distribute
          // for(...loop_tripcount...) {
          //   #pragma omp parallel for
          //   for(...) {}
          // }
          //
          // Threads within a team will execute the iterations of the `parallel`
          // loop.
          CudaBlocksPerGrid = LoopTripCount;
        } else {
          REPORT("Unknown execution mode: %d\n",
                 static_cast<int8_t>(KernelInfo->ExecutionMode));
          return OFFLOAD_FAIL;
        }
        DP("Using %d teams due to loop trip count %" PRIu32
           " and number of threads per block %d\n",
           CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock);
      } else {
        DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams);
        CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams;
      }
    } else {
      DP("Using requested number of teams %d\n", TeamNum);
      CudaBlocksPerGrid = TeamNum;
    }

    if (CudaBlocksPerGrid > DeviceData[DeviceId].BlocksPerGrid) {
      DP("Capping number of teams to team limit %d\n",
         DeviceData[DeviceId].BlocksPerGrid);
      CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid;
    }

    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
         "Launching kernel %s with %d blocks and %d threads in %s mode\n",
         (getOffloadEntry(DeviceId, TgtEntryPtr))
             ? getOffloadEntry(DeviceId, TgtEntryPtr)->name
             : "(null)",
         CudaBlocksPerGrid, CudaThreadsPerBlock,
         (!IsSPMDMode ? (IsGenericMode ? "Generic" : "SPMD-Generic") : "SPMD"));

    // 1-D launch only; the grid/block Y and Z dimensions are always 1.
    CUstream Stream = getStream(DeviceId, AsyncInfo);
    Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1,
                         /* gridDimZ */ 1, CudaThreadsPerBlock,
                         /* blockDimY */ 1, /* blockDimZ */ 1,
                         DynamicMemorySize, Stream, &Args[0], nullptr);
    if (!checkResult(Err, "Error returned from cuLaunchKernel\n"))
      return OFFLOAD_FAIL;

    DP("Launch of entry point at " DPxMOD " successful!\n",
       DPxPTR(TgtEntryPtr));

    return OFFLOAD_SUCCESS;
  }

  // Block until all work queued on AsyncInfo's stream has finished. The
  // stream is returned to the pool (and AsyncInfo reset) regardless of the
  // synchronization outcome.
  int synchronize(const int DeviceId, __tgt_async_info *AsyncInfo) const {
    CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue);
    CUresult Err = cuStreamSynchronize(Stream);

    // Once the stream is synchronized, return it to stream pool and reset
    // AsyncInfo. This is to make sure the synchronization only works for its
    // own tasks.
    StreamPool[DeviceId]->release(reinterpret_cast<CUstream>(AsyncInfo->Queue));
    AsyncInfo->Queue = nullptr;

    if (Err != CUDA_SUCCESS) {
      DP("Error when synchronizing stream. stream = " DPxMOD
         ", async info ptr = " DPxMOD "\n",
         DPxPTR(Stream), DPxPTR(AsyncInfo));
      CUDA_ERR_STRING(Err);
    }
    return (Err == CUDA_SUCCESS) ? 
OFFLOAD_SUCCESS : OFFLOAD_FAIL; 1254 } 1255 1256 void printDeviceInfo(int32_t device_id) { 1257 char TmpChar[1000]; 1258 std::string TmpStr; 1259 size_t TmpSt; 1260 int TmpInt, TmpInt2, TmpInt3; 1261 1262 CUdevice Device; 1263 checkResult(cuDeviceGet(&Device, device_id), 1264 "Error returned from cuCtxGetDevice\n"); 1265 1266 cuDriverGetVersion(&TmpInt); 1267 printf(" CUDA Driver Version: \t\t%d \n", TmpInt); 1268 printf(" CUDA Device Number: \t\t%d \n", device_id); 1269 checkResult(cuDeviceGetName(TmpChar, 1000, Device), 1270 "Error returned from cuDeviceGetName\n"); 1271 printf(" Device Name: \t\t\t%s \n", TmpChar); 1272 checkResult(cuDeviceTotalMem(&TmpSt, Device), 1273 "Error returned from cuDeviceTotalMem\n"); 1274 printf(" Global Memory Size: \t\t%zu bytes \n", TmpSt); 1275 checkResult(cuDeviceGetAttribute( 1276 &TmpInt, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, Device), 1277 "Error returned from cuDeviceGetAttribute\n"); 1278 printf(" Number of Multiprocessors: \t\t%d \n", TmpInt); 1279 checkResult( 1280 cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, Device), 1281 "Error returned from cuDeviceGetAttribute\n"); 1282 printf(" Concurrent Copy and Execution: \t%s \n", BOOL2TEXT(TmpInt)); 1283 checkResult(cuDeviceGetAttribute( 1284 &TmpInt, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, Device), 1285 "Error returned from cuDeviceGetAttribute\n"); 1286 printf(" Total Constant Memory: \t\t%d bytes\n", TmpInt); 1287 checkResult( 1288 cuDeviceGetAttribute( 1289 &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, Device), 1290 "Error returned from cuDeviceGetAttribute\n"); 1291 printf(" Max Shared Memory per Block: \t%d bytes \n", TmpInt); 1292 checkResult( 1293 cuDeviceGetAttribute( 1294 &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, Device), 1295 "Error returned from cuDeviceGetAttribute\n"); 1296 printf(" Registers per Block: \t\t%d \n", TmpInt); 1297 checkResult( 1298 cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device), 
1299 "Error returned from cuDeviceGetAttribute\n"); 1300 printf(" Warp Size: \t\t\t\t%d Threads \n", TmpInt); 1301 checkResult(cuDeviceGetAttribute( 1302 &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, Device), 1303 "Error returned from cuDeviceGetAttribute\n"); 1304 printf(" Maximum Threads per Block: \t\t%d \n", TmpInt); 1305 checkResult(cuDeviceGetAttribute( 1306 &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device), 1307 "Error returned from cuDeviceGetAttribute\n"); 1308 checkResult(cuDeviceGetAttribute( 1309 &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, Device), 1310 "Error returned from cuDeviceGetAttribute\n"); 1311 checkResult(cuDeviceGetAttribute( 1312 &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, Device), 1313 "Error returned from cuDeviceGetAttribute\n"); 1314 printf(" Maximum Block Dimensions: \t\t%d, %d, %d \n", TmpInt, TmpInt2, 1315 TmpInt3); 1316 checkResult(cuDeviceGetAttribute( 1317 &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, Device), 1318 "Error returned from cuDeviceGetAttribute\n"); 1319 checkResult(cuDeviceGetAttribute( 1320 &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, Device), 1321 "Error returned from cuDeviceGetAttribute\n"); 1322 checkResult(cuDeviceGetAttribute( 1323 &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, Device), 1324 "Error returned from cuDeviceGetAttribute\n"); 1325 printf(" Maximum Grid Dimensions: \t\t%d x %d x %d \n", TmpInt, TmpInt2, 1326 TmpInt3); 1327 checkResult( 1328 cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_PITCH, Device), 1329 "Error returned from cuDeviceGetAttribute\n"); 1330 printf(" Maximum Memory Pitch: \t\t%d bytes \n", TmpInt); 1331 checkResult(cuDeviceGetAttribute( 1332 &TmpInt, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, Device), 1333 "Error returned from cuDeviceGetAttribute\n"); 1334 printf(" Texture Alignment: \t\t\t%d bytes \n", TmpInt); 1335 checkResult( 1336 cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, Device), 1337 "Error returned from cuDeviceGetAttribute\n"); 1338 
printf(" Clock Rate: \t\t\t%d kHz\n", TmpInt); 1339 checkResult(cuDeviceGetAttribute( 1340 &TmpInt, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, Device), 1341 "Error returned from cuDeviceGetAttribute\n"); 1342 printf(" Execution Timeout: \t\t\t%s \n", BOOL2TEXT(TmpInt)); 1343 checkResult( 1344 cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_INTEGRATED, Device), 1345 "Error returned from cuDeviceGetAttribute\n"); 1346 printf(" Integrated Device: \t\t\t%s \n", BOOL2TEXT(TmpInt)); 1347 checkResult(cuDeviceGetAttribute( 1348 &TmpInt, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, Device), 1349 "Error returned from cuDeviceGetAttribute\n"); 1350 printf(" Can Map Host Memory: \t\t%s \n", BOOL2TEXT(TmpInt)); 1351 checkResult( 1352 cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, Device), 1353 "Error returned from cuDeviceGetAttribute\n"); 1354 if (TmpInt == CU_COMPUTEMODE_DEFAULT) 1355 TmpStr = "DEFAULT"; 1356 else if (TmpInt == CU_COMPUTEMODE_PROHIBITED) 1357 TmpStr = "PROHIBITED"; 1358 else if (TmpInt == CU_COMPUTEMODE_EXCLUSIVE_PROCESS) 1359 TmpStr = "EXCLUSIVE PROCESS"; 1360 else 1361 TmpStr = "unknown"; 1362 printf(" Compute Mode: \t\t\t%s \n", TmpStr.c_str()); 1363 checkResult(cuDeviceGetAttribute( 1364 &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, Device), 1365 "Error returned from cuDeviceGetAttribute\n"); 1366 printf(" Concurrent Kernels: \t\t%s \n", BOOL2TEXT(TmpInt)); 1367 checkResult( 1368 cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, Device), 1369 "Error returned from cuDeviceGetAttribute\n"); 1370 printf(" ECC Enabled: \t\t\t%s \n", BOOL2TEXT(TmpInt)); 1371 checkResult(cuDeviceGetAttribute( 1372 &TmpInt, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, Device), 1373 "Error returned from cuDeviceGetAttribute\n"); 1374 printf(" Memory Clock Rate: \t\t\t%d kHz\n", TmpInt); 1375 checkResult( 1376 cuDeviceGetAttribute( 1377 &TmpInt, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, Device), 1378 "Error returned from cuDeviceGetAttribute\n"); 1379 
printf(" Memory Bus Width: \t\t\t%d bits\n", TmpInt); 1380 checkResult(cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, 1381 Device), 1382 "Error returned from cuDeviceGetAttribute\n"); 1383 printf(" L2 Cache Size: \t\t\t%d bytes \n", TmpInt); 1384 checkResult(cuDeviceGetAttribute( 1385 &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, 1386 Device), 1387 "Error returned from cuDeviceGetAttribute\n"); 1388 printf(" Max Threads Per SMP: \t\t%d \n", TmpInt); 1389 checkResult(cuDeviceGetAttribute( 1390 &TmpInt, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, Device), 1391 "Error returned from cuDeviceGetAttribute\n"); 1392 printf(" Async Engines: \t\t\t%s (%d) \n", BOOL2TEXT(TmpInt), TmpInt); 1393 checkResult(cuDeviceGetAttribute( 1394 &TmpInt, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, Device), 1395 "Error returned from cuDeviceGetAttribute\n"); 1396 printf(" Unified Addressing: \t\t%s \n", BOOL2TEXT(TmpInt)); 1397 checkResult(cuDeviceGetAttribute( 1398 &TmpInt, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, Device), 1399 "Error returned from cuDeviceGetAttribute\n"); 1400 printf(" Managed Memory: \t\t\t%s \n", BOOL2TEXT(TmpInt)); 1401 checkResult( 1402 cuDeviceGetAttribute( 1403 &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, Device), 1404 "Error returned from cuDeviceGetAttribute\n"); 1405 printf(" Concurrent Managed Memory: \t\t%s \n", BOOL2TEXT(TmpInt)); 1406 checkResult( 1407 cuDeviceGetAttribute( 1408 &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, Device), 1409 "Error returned from cuDeviceGetAttribute\n"); 1410 printf(" Preemption Supported: \t\t%s \n", BOOL2TEXT(TmpInt)); 1411 checkResult(cuDeviceGetAttribute( 1412 &TmpInt, CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, Device), 1413 "Error returned from cuDeviceGetAttribute\n"); 1414 printf(" Cooperative Launch: \t\t%s \n", BOOL2TEXT(TmpInt)); 1415 checkResult(cuDeviceGetAttribute( 1416 &TmpInt, CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD, Device), 1417 "Error returned from cuDeviceGetAttribute\n"); 
1418 printf(" Multi-Device Boars: \t\t%s \n", BOOL2TEXT(TmpInt)); 1419 checkResult( 1420 cuDeviceGetAttribute( 1421 &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, Device), 1422 "Error returned from cuDeviceGetAttribute\n"); 1423 checkResult( 1424 cuDeviceGetAttribute( 1425 &TmpInt2, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, Device), 1426 "Error returned from cuDeviceGetAttribute\n"); 1427 printf(" Compute Capabilities: \t\t%d%d \n", TmpInt, TmpInt2); 1428 } 1429 1430 int createEvent(int DeviceId, void **P) { 1431 CUevent Event = nullptr; 1432 if (EventPool[DeviceId]->acquire(Event) != OFFLOAD_SUCCESS) 1433 return OFFLOAD_FAIL; 1434 *P = Event; 1435 return OFFLOAD_SUCCESS; 1436 } 1437 1438 int destroyEvent(int DeviceId, void *EventPtr) { 1439 EventPool[DeviceId]->release(reinterpret_cast<CUevent>(EventPtr)); 1440 return OFFLOAD_SUCCESS; 1441 } 1442 1443 int waitEvent(const int DeviceId, __tgt_async_info *AsyncInfo, 1444 void *EventPtr) const { 1445 CUstream Stream = getStream(DeviceId, AsyncInfo); 1446 CUevent Event = reinterpret_cast<CUevent>(EventPtr); 1447 1448 // We don't use CU_EVENT_WAIT_DEFAULT here as it is only available from 1449 // specific CUDA version, and defined as 0x0. In previous version, per CUDA 1450 // API document, that argument has to be 0x0. 1451 CUresult Err = cuStreamWaitEvent(Stream, Event, 0); 1452 if (Err != CUDA_SUCCESS) { 1453 DP("Error when waiting event. 
stream = " DPxMOD ", event = " DPxMOD "\n", 1454 DPxPTR(Stream), DPxPTR(Event)); 1455 CUDA_ERR_STRING(Err); 1456 return OFFLOAD_FAIL; 1457 } 1458 1459 return OFFLOAD_SUCCESS; 1460 } 1461 1462 int releaseAsyncInfo(int DeviceId, __tgt_async_info *AsyncInfo) const { 1463 if (AsyncInfo->Queue) { 1464 StreamPool[DeviceId]->release( 1465 reinterpret_cast<CUstream>(AsyncInfo->Queue)); 1466 AsyncInfo->Queue = nullptr; 1467 } 1468 1469 return OFFLOAD_SUCCESS; 1470 } 1471 1472 int initAsyncInfo(int DeviceId, __tgt_async_info **AsyncInfo) const { 1473 CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); 1474 if (!checkResult(Err, "error returned from cuCtxSetCurrent")) 1475 return OFFLOAD_FAIL; 1476 1477 *AsyncInfo = new __tgt_async_info; 1478 getStream(DeviceId, *AsyncInfo); 1479 return OFFLOAD_SUCCESS; 1480 } 1481 1482 int initDeviceInfo(int DeviceId, __tgt_device_info *DeviceInfo, 1483 const char **ErrStr) const { 1484 assert(DeviceInfo && "DeviceInfo is nullptr"); 1485 1486 if (!DeviceInfo->Context) 1487 DeviceInfo->Context = DeviceData[DeviceId].Context; 1488 if (!DeviceInfo->Device) { 1489 CUdevice Dev; 1490 CUresult Err = cuDeviceGet(&Dev, DeviceId); 1491 if (Err == CUDA_SUCCESS) { 1492 DeviceInfo->Device = reinterpret_cast<void *>(Dev); 1493 } else { 1494 cuGetErrorString(Err, ErrStr); 1495 return OFFLOAD_FAIL; 1496 } 1497 } 1498 return OFFLOAD_SUCCESS; 1499 } 1500 }; 1501 1502 DeviceRTLTy DeviceRTL; 1503 } // namespace 1504 1505 // Exposed library API function 1506 #ifdef __cplusplus 1507 extern "C" { 1508 #endif 1509 1510 int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) { 1511 return elf_check_machine(image, /* EM_CUDA */ 190); 1512 } 1513 1514 int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); } 1515 1516 int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) { 1517 DP("Init requires flags to %" PRId64 "\n", RequiresFlags); 1518 DeviceRTL.setRequiresFlag(RequiresFlags); 1519 return RequiresFlags; 1520 } 1521 1522 
int32_t __tgt_rtl_is_data_exchangable(int32_t src_dev_id, int dst_dev_id) { 1523 if (DeviceRTL.isValidDeviceId(src_dev_id) && 1524 DeviceRTL.isValidDeviceId(dst_dev_id)) 1525 return 1; 1526 1527 return 0; 1528 } 1529 1530 int32_t __tgt_rtl_init_device(int32_t device_id) { 1531 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1532 1533 return DeviceRTL.initDevice(device_id); 1534 } 1535 1536 int32_t __tgt_rtl_deinit_device(int32_t device_id) { 1537 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1538 1539 return DeviceRTL.deinitDevice(device_id); 1540 } 1541 1542 __tgt_target_table *__tgt_rtl_load_binary(int32_t device_id, 1543 __tgt_device_image *image) { 1544 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1545 1546 return DeviceRTL.loadBinary(device_id, image); 1547 } 1548 1549 void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *, 1550 int32_t kind) { 1551 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1552 1553 return DeviceRTL.dataAlloc(device_id, size, (TargetAllocTy)kind); 1554 } 1555 1556 int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr, 1557 int64_t size) { 1558 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1559 1560 __tgt_async_info AsyncInfo; 1561 const int32_t rc = __tgt_rtl_data_submit_async(device_id, tgt_ptr, hst_ptr, 1562 size, &AsyncInfo); 1563 if (rc != OFFLOAD_SUCCESS) 1564 return OFFLOAD_FAIL; 1565 1566 return __tgt_rtl_synchronize(device_id, &AsyncInfo); 1567 } 1568 1569 int32_t __tgt_rtl_data_submit_async(int32_t device_id, void *tgt_ptr, 1570 void *hst_ptr, int64_t size, 1571 __tgt_async_info *async_info_ptr) { 1572 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1573 assert(async_info_ptr && "async_info_ptr is nullptr"); 1574 1575 return DeviceRTL.dataSubmit(device_id, tgt_ptr, hst_ptr, size, 1576 async_info_ptr); 1577 } 1578 1579 int32_t 
__tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr, 1580 int64_t size) { 1581 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1582 1583 __tgt_async_info AsyncInfo; 1584 const int32_t rc = __tgt_rtl_data_retrieve_async(device_id, hst_ptr, tgt_ptr, 1585 size, &AsyncInfo); 1586 if (rc != OFFLOAD_SUCCESS) 1587 return OFFLOAD_FAIL; 1588 1589 return __tgt_rtl_synchronize(device_id, &AsyncInfo); 1590 } 1591 1592 int32_t __tgt_rtl_data_retrieve_async(int32_t device_id, void *hst_ptr, 1593 void *tgt_ptr, int64_t size, 1594 __tgt_async_info *async_info_ptr) { 1595 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1596 assert(async_info_ptr && "async_info_ptr is nullptr"); 1597 1598 return DeviceRTL.dataRetrieve(device_id, hst_ptr, tgt_ptr, size, 1599 async_info_ptr); 1600 } 1601 1602 int32_t __tgt_rtl_data_exchange_async(int32_t src_dev_id, void *src_ptr, 1603 int dst_dev_id, void *dst_ptr, 1604 int64_t size, 1605 __tgt_async_info *AsyncInfo) { 1606 assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid"); 1607 assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid"); 1608 assert(AsyncInfo && "AsyncInfo is nullptr"); 1609 1610 return DeviceRTL.dataExchange(src_dev_id, src_ptr, dst_dev_id, dst_ptr, size, 1611 AsyncInfo); 1612 } 1613 1614 int32_t __tgt_rtl_data_exchange(int32_t src_dev_id, void *src_ptr, 1615 int32_t dst_dev_id, void *dst_ptr, 1616 int64_t size) { 1617 assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid"); 1618 assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid"); 1619 1620 __tgt_async_info AsyncInfo; 1621 const int32_t rc = __tgt_rtl_data_exchange_async( 1622 src_dev_id, src_ptr, dst_dev_id, dst_ptr, size, &AsyncInfo); 1623 if (rc != OFFLOAD_SUCCESS) 1624 return OFFLOAD_FAIL; 1625 1626 return __tgt_rtl_synchronize(src_dev_id, &AsyncInfo); 1627 } 1628 1629 int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) { 
1630 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1631 1632 return DeviceRTL.dataDelete(device_id, tgt_ptr); 1633 } 1634 1635 int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr, 1636 void **tgt_args, 1637 ptrdiff_t *tgt_offsets, 1638 int32_t arg_num, int32_t team_num, 1639 int32_t thread_limit, 1640 uint64_t loop_tripcount) { 1641 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1642 1643 __tgt_async_info AsyncInfo; 1644 const int32_t rc = __tgt_rtl_run_target_team_region_async( 1645 device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num, 1646 thread_limit, loop_tripcount, &AsyncInfo); 1647 if (rc != OFFLOAD_SUCCESS) 1648 return OFFLOAD_FAIL; 1649 1650 return __tgt_rtl_synchronize(device_id, &AsyncInfo); 1651 } 1652 1653 int32_t __tgt_rtl_run_target_team_region_async( 1654 int32_t device_id, void *tgt_entry_ptr, void **tgt_args, 1655 ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num, 1656 int32_t thread_limit, uint64_t loop_tripcount, 1657 __tgt_async_info *async_info_ptr) { 1658 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1659 1660 return DeviceRTL.runTargetTeamRegion( 1661 device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num, 1662 thread_limit, loop_tripcount, async_info_ptr); 1663 } 1664 1665 int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr, 1666 void **tgt_args, ptrdiff_t *tgt_offsets, 1667 int32_t arg_num) { 1668 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1669 1670 __tgt_async_info AsyncInfo; 1671 const int32_t rc = __tgt_rtl_run_target_region_async( 1672 device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, &AsyncInfo); 1673 if (rc != OFFLOAD_SUCCESS) 1674 return OFFLOAD_FAIL; 1675 1676 return __tgt_rtl_synchronize(device_id, &AsyncInfo); 1677 } 1678 1679 int32_t __tgt_rtl_run_target_region_async(int32_t device_id, 1680 void *tgt_entry_ptr, void **tgt_args, 1681 
ptrdiff_t *tgt_offsets, 1682 int32_t arg_num, 1683 __tgt_async_info *async_info_ptr) { 1684 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1685 1686 return __tgt_rtl_run_target_team_region_async( 1687 device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, 1688 /* team num*/ 1, /* thread_limit */ 1, /* loop_tripcount */ 0, 1689 async_info_ptr); 1690 } 1691 1692 int32_t __tgt_rtl_synchronize(int32_t device_id, 1693 __tgt_async_info *async_info_ptr) { 1694 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1695 assert(async_info_ptr && "async_info_ptr is nullptr"); 1696 assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr"); 1697 1698 return DeviceRTL.synchronize(device_id, async_info_ptr); 1699 } 1700 1701 void __tgt_rtl_set_info_flag(uint32_t NewInfoLevel) { 1702 std::atomic<uint32_t> &InfoLevel = getInfoLevelInternal(); 1703 InfoLevel.store(NewInfoLevel); 1704 } 1705 1706 void __tgt_rtl_print_device_info(int32_t device_id) { 1707 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1708 DeviceRTL.printDeviceInfo(device_id); 1709 } 1710 1711 int32_t __tgt_rtl_create_event(int32_t device_id, void **event) { 1712 assert(event && "event is nullptr"); 1713 return DeviceRTL.createEvent(device_id, event); 1714 } 1715 1716 int32_t __tgt_rtl_record_event(int32_t device_id, void *event_ptr, 1717 __tgt_async_info *async_info_ptr) { 1718 assert(async_info_ptr && "async_info_ptr is nullptr"); 1719 assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr"); 1720 assert(event_ptr && "event_ptr is nullptr"); 1721 1722 return recordEvent(event_ptr, async_info_ptr); 1723 } 1724 1725 int32_t __tgt_rtl_wait_event(int32_t device_id, void *event_ptr, 1726 __tgt_async_info *async_info_ptr) { 1727 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1728 assert(async_info_ptr && "async_info_ptr is nullptr"); 1729 assert(event_ptr && "event is nullptr"); 1730 1731 return 
DeviceRTL.waitEvent(device_id, async_info_ptr, event_ptr); 1732 } 1733 1734 int32_t __tgt_rtl_sync_event(int32_t device_id, void *event_ptr) { 1735 assert(event_ptr && "event is nullptr"); 1736 1737 return syncEvent(event_ptr); 1738 } 1739 1740 int32_t __tgt_rtl_destroy_event(int32_t device_id, void *event_ptr) { 1741 assert(event_ptr && "event is nullptr"); 1742 1743 return DeviceRTL.destroyEvent(device_id, event_ptr); 1744 } 1745 1746 int32_t __tgt_rtl_release_async_info(int32_t device_id, 1747 __tgt_async_info *async_info) { 1748 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1749 assert(async_info && "async_info is nullptr"); 1750 1751 return DeviceRTL.releaseAsyncInfo(device_id, async_info); 1752 } 1753 1754 int32_t __tgt_rtl_init_async_info(int32_t device_id, 1755 __tgt_async_info **async_info) { 1756 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1757 assert(async_info && "async_info is nullptr"); 1758 1759 return DeviceRTL.initAsyncInfo(device_id, async_info); 1760 } 1761 1762 int32_t __tgt_rtl_init_device_info(int32_t device_id, 1763 __tgt_device_info *device_info_ptr, 1764 const char **err_str) { 1765 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1766 assert(device_info_ptr && "device_info_ptr is nullptr"); 1767 1768 return DeviceRTL.initDeviceInfo(device_id, device_info_ptr, err_str); 1769 } 1770 1771 #ifdef __cplusplus 1772 } 1773 #endif 1774