1 //===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // RTL for CUDA machine 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "llvm/ADT/StringRef.h" 14 15 #include <algorithm> 16 #include <cassert> 17 #include <cstddef> 18 #include <cuda.h> 19 #include <list> 20 #include <memory> 21 #include <mutex> 22 #include <string> 23 #include <unordered_map> 24 #include <vector> 25 26 #include "Debug.h" 27 #include "DeviceEnvironment.h" 28 #include "omptarget.h" 29 #include "omptargetplugin.h" 30 31 #define TARGET_NAME CUDA 32 #define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL" 33 34 #include "MemoryManager.h" 35 36 #include "llvm/Frontend/OpenMP/OMPConstants.h" 37 38 using namespace llvm; 39 40 // Utility for retrieving and printing CUDA error string. 41 #ifdef OMPTARGET_DEBUG 42 #define CUDA_ERR_STRING(err) \ 43 do { \ 44 if (getDebugLevel() > 0) { \ 45 const char *errStr = nullptr; \ 46 CUresult errStr_status = cuGetErrorString(err, &errStr); \ 47 if (errStr_status == CUDA_ERROR_INVALID_VALUE) \ 48 REPORT("Unrecognized CUDA error code: %d\n", err); \ 49 else if (errStr_status == CUDA_SUCCESS) \ 50 REPORT("CUDA error is: %s\n", errStr); \ 51 else { \ 52 REPORT("Unresolved CUDA error code: %d\n", err); \ 53 REPORT("Unsuccessful cuGetErrorString return status: %d\n", \ 54 errStr_status); \ 55 } \ 56 } else { \ 57 const char *errStr = nullptr; \ 58 CUresult errStr_status = cuGetErrorString(err, &errStr); \ 59 if (errStr_status == CUDA_SUCCESS) \ 60 REPORT("%s \n", errStr); \ 61 } \ 62 } while (false) 63 #else // OMPTARGET_DEBUG 64 #define CUDA_ERR_STRING(err) \ 65 do { \ 66 const char *errStr = nullptr; \ 67 CUresult errStr_status = cuGetErrorString(err, &errStr); \ 68 if (errStr_status == CUDA_SUCCESS) \ 69 REPORT("%s \n", errStr); \ 70 } while (false) 71 #endif // OMPTARGET_DEBUG 72 73 #define BOOL2TEXT(b) ((b) ? "Yes" : "No") 74 75 #include "elf_common.h" 76 77 /// Keep entries table per device. 78 struct FuncOrGblEntryTy { 79 __tgt_target_table Table; 80 std::vector<__tgt_offload_entry> Entries; 81 }; 82 83 /// Use a single entity to encode a kernel and a set of flags. 84 struct KernelTy { 85 CUfunction Func; 86 87 // execution mode of kernel 88 llvm::omp::OMPTgtExecModeFlags ExecutionMode; 89 90 /// Maximal number of threads per block for this kernel. 91 int MaxThreadsPerBlock = 0; 92 93 KernelTy(CUfunction Func, llvm::omp::OMPTgtExecModeFlags ExecutionMode) 94 : Func(Func), ExecutionMode(ExecutionMode) {} 95 }; 96 97 namespace { 98 bool checkResult(CUresult Err, const char *ErrMsg) { 99 if (Err == CUDA_SUCCESS) 100 return true; 101 102 REPORT("%s", ErrMsg); 103 CUDA_ERR_STRING(Err); 104 return false; 105 } 106 107 int memcpyDtoD(const void *SrcPtr, void *DstPtr, int64_t Size, 108 CUstream Stream) { 109 CUresult Err = 110 cuMemcpyDtoDAsync((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, Size, Stream); 111 112 if (Err != CUDA_SUCCESS) { 113 DP("Error when copying data from device to device. 
Pointers: src " 114 "= " DPxMOD ", dst = " DPxMOD ", size = %" PRId64 "\n", 115 DPxPTR(SrcPtr), DPxPTR(DstPtr), Size); 116 CUDA_ERR_STRING(Err); 117 return OFFLOAD_FAIL; 118 } 119 120 return OFFLOAD_SUCCESS; 121 } 122 123 int recordEvent(void *EventPtr, __tgt_async_info *AsyncInfo) { 124 CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue); 125 CUevent Event = reinterpret_cast<CUevent>(EventPtr); 126 127 CUresult Err = cuEventRecord(Event, Stream); 128 if (Err != CUDA_SUCCESS) { 129 DP("Error when recording event. stream = " DPxMOD ", event = " DPxMOD "\n", 130 DPxPTR(Stream), DPxPTR(Event)); 131 CUDA_ERR_STRING(Err); 132 return OFFLOAD_FAIL; 133 } 134 135 return OFFLOAD_SUCCESS; 136 } 137 138 int syncEvent(void *EventPtr) { 139 CUevent Event = reinterpret_cast<CUevent>(EventPtr); 140 141 CUresult Err = cuEventSynchronize(Event); 142 if (Err != CUDA_SUCCESS) { 143 DP("Error when syncing event = " DPxMOD "\n", DPxPTR(Event)); 144 CUDA_ERR_STRING(Err); 145 return OFFLOAD_FAIL; 146 } 147 148 return OFFLOAD_SUCCESS; 149 } 150 151 namespace { 152 153 // Structure contains per-device data 154 struct DeviceDataTy { 155 /// List that contains all the kernels. 156 std::list<KernelTy> KernelsList; 157 158 std::list<FuncOrGblEntryTy> FuncGblEntries; 159 160 CUcontext Context = nullptr; 161 // Device properties 162 unsigned int ThreadsPerBlock = 0; 163 unsigned int BlocksPerGrid = 0; 164 unsigned int WarpSize = 0; 165 // OpenMP properties 166 int NumTeams = 0; 167 int NumThreads = 0; 168 }; 169 170 /// Resource allocator where \p T is the resource type. 171 /// Functions \p create and \p destroy return OFFLOAD_SUCCESS and OFFLOAD_FAIL 172 /// accordingly. The implementation should not raise any exception. 173 template <typename T> struct AllocatorTy { 174 using ElementTy = T; 175 virtual ~AllocatorTy() {} 176 177 /// Create a resource and assign to R. 178 virtual int create(T &R) noexcept = 0; 179 /// Destroy the resource. 180 virtual int destroy(T) noexcept = 0; 181 }; 182 183 /// Allocator for CUstream. 184 struct StreamAllocatorTy final : public AllocatorTy<CUstream> { 185 /// See AllocatorTy<T>::create. 186 int create(CUstream &Stream) noexcept override { 187 if (!checkResult(cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING), 188 "Error returned from cuStreamCreate\n")) 189 return OFFLOAD_FAIL; 190 191 return OFFLOAD_SUCCESS; 192 } 193 194 /// See AllocatorTy<T>::destroy. 195 int destroy(CUstream Stream) noexcept override { 196 if (!checkResult(cuStreamDestroy(Stream), 197 "Error returned from cuStreamDestroy\n")) 198 return OFFLOAD_FAIL; 199 200 return OFFLOAD_SUCCESS; 201 } 202 }; 203 204 /// Allocator for CUevent. 205 struct EventAllocatorTy final : public AllocatorTy<CUevent> { 206 /// See AllocatorTy<T>::create. 207 int create(CUevent &Event) noexcept override { 208 if (!checkResult(cuEventCreate(&Event, CU_EVENT_DEFAULT), 209 "Error returned from cuEventCreate\n")) 210 return OFFLOAD_FAIL; 211 212 return OFFLOAD_SUCCESS; 213 } 214 215 /// See AllocatorTy<T>::destroy. 216 int destroy(CUevent Event) noexcept override { 217 if (!checkResult(cuEventDestroy(Event), 218 "Error returned from cuEventDestroy\n")) 219 return OFFLOAD_FAIL; 220 221 return OFFLOAD_SUCCESS; 222 } 223 }; 224 225 /// A generic pool of resources where \p T is the resource type. 226 /// \p T should be copyable as the object is stored in \p std::vector . 227 template <typename AllocTy> class ResourcePoolTy { 228 using ElementTy = typename AllocTy::ElementTy; 229 /// Index of the next available resource. 
  size_t Next = 0;
  /// Mutex to guard the pool.
  std::mutex Mutex;
  /// Pool of resources. The difference between \p Resources and \p Pool is
  /// that acquire and release only operate on \p Resources; when a batch of
  /// new resources is needed, they are added to both. The reason for this
  /// split is that \p Resources may contain stale or duplicate entries,
  /// because acquired resources are not necessarily released, which could
  /// cause a double free. This way \p Pool always holds exactly one entry for
  /// every resource allocated from the device.
  std::vector<ElementTy> Resources;
  std::vector<ElementTy> Pool;
  /// The allocator used to create and destroy the resources.
  AllocTy Allocator;

  /// If \p Resources is exhausted, fill in more resources. The new size
  /// \p Size is assumed to be larger than the current size.
  bool resize(size_t Size) {
    assert(Resources.size() == Pool.size() && "size mismatch");
    auto CurSize = Resources.size();
    assert(Size > CurSize && "Unexpected smaller size");
    Pool.reserve(Size);
    Resources.reserve(Size);
    for (auto I = CurSize; I < Size; ++I) {
      ElementTy NewItem;
      int Ret = Allocator.create(NewItem);
      if (Ret != OFFLOAD_SUCCESS)
        return false;
      Pool.push_back(NewItem);
      Resources.push_back(NewItem);
    }
    return true;
  }

public:
  ResourcePoolTy(AllocTy &&A, size_t Size = 0) noexcept
      : Allocator(std::move(A)) {
    if (Size)
      (void)resize(Size);
  }

  ~ResourcePoolTy() noexcept { clear(); }

  /// Get a resource from the pool. \p Next always points to the next
  /// available resource, i.e. [0, Next-1] have been assigned and [Next, ...]
  /// are still available. If no resource is left, we ask the allocator for
  /// more. Each time a resource is assigned, \p Next is incremented by one.
  /// xxxxxs+++++++++
  ///      ^
  ///      Next
  /// After the assignment the pool looks as follows and s has been handed out.
  /// xxxxxs+++++++++
  ///       ^
  ///       Next
  int acquire(ElementTy &R) noexcept {
    std::lock_guard<std::mutex> LG(Mutex);
    if (Next == Resources.size()) {
      auto NewSize = Resources.size() ? Resources.size() * 2 : 1;
      if (!resize(NewSize))
        return OFFLOAD_FAIL;
    }

    assert(Next < Resources.size());

    R = Resources[Next++];

    return OFFLOAD_SUCCESS;
  }

  /// Return the resource back to the pool. We first decrease \p Next and then
  /// copy the resource back. Note that resources may be released in a
  /// different order than they were acquired, so at some point \p Resources
  /// may contain two identical entries.
  /// xxax+a+++++
  ///    ^
  ///    Next
  /// This does not matter because the duplicates always sit on the two sides
  /// of \p Next; the left one is eventually overwritten by another released
  /// resource. After several acquire/release cycles the order of the pool may
  /// therefore differ from its initial state.
  void release(ElementTy R) noexcept {
    std::lock_guard<std::mutex> LG(Mutex);
    Resources[--Next] = R;
  }

  /// Release all stored resources and clear the pool.
  /// Note: This function is not thread safe. Be sure to guard it if necessary.
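
  // Editorial aside, not part of the original source: a minimal sketch of how
  // this pool is used elsewhere in this file, e.g. by getStream() and
  // synchronize() for CUstream resources:
  //
  //   CUstream S;
  //   if (StreamPool[DeviceId]->acquire(S) != OFFLOAD_SUCCESS)
  //     return nullptr;                 // the pool grows on demand via resize()
  //   AsyncInfo->Queue = S;             // ... use the stream ...
  //   StreamPool[DeviceId]->release(S); // hand it back once synchronized
  //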
318 void clear() noexcept { 319 for (auto &R : Pool) 320 (void)Allocator.destroy(R); 321 Pool.clear(); 322 Resources.clear(); 323 } 324 }; 325 326 } // namespace 327 328 class DeviceRTLTy { 329 int NumberOfDevices; 330 // OpenMP environment properties 331 int EnvNumTeams; 332 int EnvTeamLimit; 333 int EnvTeamThreadLimit; 334 // OpenMP requires flags 335 int64_t RequiresFlags; 336 // Amount of dynamic shared memory to use at launch. 337 uint64_t DynamicMemorySize; 338 339 /// Number of initial streams for each device. 340 int NumInitialStreams = 32; 341 342 /// Number of initial events for each device. 343 int NumInitialEvents = 8; 344 345 static constexpr const int32_t HardThreadLimit = 1024; 346 static constexpr const int32_t DefaultNumTeams = 128; 347 static constexpr const int32_t DefaultNumThreads = 128; 348 349 using StreamPoolTy = ResourcePoolTy<StreamAllocatorTy>; 350 std::vector<std::unique_ptr<StreamPoolTy>> StreamPool; 351 352 using EventPoolTy = ResourcePoolTy<EventAllocatorTy>; 353 std::vector<std::unique_ptr<EventPoolTy>> EventPool; 354 355 std::vector<DeviceDataTy> DeviceData; 356 std::vector<std::vector<CUmodule>> Modules; 357 358 /// Vector of flags indicating the initalization status of all associated 359 /// devices. 360 std::vector<bool> InitializedFlags; 361 362 enum class PeerAccessState : uint8_t { Unkown, Yes, No }; 363 std::vector<std::vector<PeerAccessState>> PeerAccessMatrix; 364 std::mutex PeerAccessMatrixLock; 365 366 /// A class responsible for interacting with device native runtime library to 367 /// allocate and free memory. 368 class CUDADeviceAllocatorTy : public DeviceAllocatorTy { 369 std::unordered_map<void *, TargetAllocTy> HostPinnedAllocs; 370 371 public: 372 void *allocate(size_t Size, void *, TargetAllocTy Kind) override { 373 if (Size == 0) 374 return nullptr; 375 376 void *MemAlloc = nullptr; 377 CUresult Err; 378 switch (Kind) { 379 case TARGET_ALLOC_DEFAULT: 380 case TARGET_ALLOC_DEVICE: 381 CUdeviceptr DevicePtr; 382 Err = cuMemAlloc(&DevicePtr, Size); 383 MemAlloc = (void *)DevicePtr; 384 if (!checkResult(Err, "Error returned from cuMemAlloc\n")) 385 return nullptr; 386 break; 387 case TARGET_ALLOC_HOST: 388 void *HostPtr; 389 Err = cuMemAllocHost(&HostPtr, Size); 390 MemAlloc = HostPtr; 391 if (!checkResult(Err, "Error returned from cuMemAllocHost\n")) 392 return nullptr; 393 HostPinnedAllocs[MemAlloc] = Kind; 394 break; 395 case TARGET_ALLOC_SHARED: 396 CUdeviceptr SharedPtr; 397 Err = cuMemAllocManaged(&SharedPtr, Size, CU_MEM_ATTACH_GLOBAL); 398 MemAlloc = (void *)SharedPtr; 399 if (!checkResult(Err, "Error returned from cuMemAllocManaged\n")) 400 return nullptr; 401 break; 402 } 403 404 return MemAlloc; 405 } 406 407 int free(void *TgtPtr) override { 408 CUresult Err; 409 // Host pinned memory must be freed differently. 410 TargetAllocTy Kind = 411 (HostPinnedAllocs.find(TgtPtr) == HostPinnedAllocs.end()) 412 ? 
TARGET_ALLOC_DEFAULT 413 : TARGET_ALLOC_HOST; 414 switch (Kind) { 415 case TARGET_ALLOC_DEFAULT: 416 case TARGET_ALLOC_DEVICE: 417 case TARGET_ALLOC_SHARED: 418 Err = cuMemFree((CUdeviceptr)TgtPtr); 419 if (!checkResult(Err, "Error returned from cuMemFree\n")) 420 return OFFLOAD_FAIL; 421 break; 422 case TARGET_ALLOC_HOST: 423 Err = cuMemFreeHost(TgtPtr); 424 if (!checkResult(Err, "Error returned from cuMemFreeHost\n")) 425 return OFFLOAD_FAIL; 426 break; 427 } 428 429 return OFFLOAD_SUCCESS; 430 } 431 }; 432 433 /// A vector of device allocators 434 std::vector<CUDADeviceAllocatorTy> DeviceAllocators; 435 436 /// A vector of memory managers. Since the memory manager is non-copyable and 437 // non-removable, we wrap them into std::unique_ptr. 438 std::vector<std::unique_ptr<MemoryManagerTy>> MemoryManagers; 439 440 /// Whether use memory manager 441 bool UseMemoryManager = true; 442 443 // Record entry point associated with device 444 void addOffloadEntry(const int DeviceId, const __tgt_offload_entry Entry) { 445 FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back(); 446 E.Entries.push_back(Entry); 447 } 448 449 // Return a pointer to the entry associated with the pointer 450 const __tgt_offload_entry *getOffloadEntry(const int DeviceId, 451 const void *Addr) const { 452 for (const __tgt_offload_entry &Itr : 453 DeviceData[DeviceId].FuncGblEntries.back().Entries) 454 if (Itr.addr == Addr) 455 return &Itr; 456 457 return nullptr; 458 } 459 460 // Return the pointer to the target entries table 461 __tgt_target_table *getOffloadEntriesTable(const int DeviceId) { 462 FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back(); 463 464 if (E.Entries.empty()) 465 return nullptr; 466 467 // Update table info according to the entries and return the pointer 468 E.Table.EntriesBegin = E.Entries.data(); 469 E.Table.EntriesEnd = E.Entries.data() + E.Entries.size(); 470 471 return &E.Table; 472 } 473 474 // Clear entries table for a device 475 void clearOffloadEntriesTable(const int DeviceId) { 476 DeviceData[DeviceId].FuncGblEntries.emplace_back(); 477 FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back(); 478 E.Entries.clear(); 479 E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr; 480 } 481 482 public: 483 CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfo) const { 484 assert(AsyncInfo && "AsyncInfo is nullptr"); 485 486 if (!AsyncInfo->Queue) { 487 CUstream S; 488 if (StreamPool[DeviceId]->acquire(S) != OFFLOAD_SUCCESS) 489 return nullptr; 490 491 AsyncInfo->Queue = S; 492 } 493 494 return reinterpret_cast<CUstream>(AsyncInfo->Queue); 495 } 496 497 // This class should not be copied 498 DeviceRTLTy(const DeviceRTLTy &) = delete; 499 DeviceRTLTy(DeviceRTLTy &&) = delete; 500 501 DeviceRTLTy() 502 : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1), 503 EnvTeamThreadLimit(-1), RequiresFlags(OMP_REQ_UNDEFINED), 504 DynamicMemorySize(0) { 505 506 DP("Start initializing CUDA\n"); 507 508 CUresult Err = cuInit(0); 509 if (Err == CUDA_ERROR_INVALID_HANDLE) { 510 // Can't call cuGetErrorString if dlsym failed 511 DP("Failed to load CUDA shared library\n"); 512 return; 513 } 514 if (Err == CUDA_ERROR_NO_DEVICE) { 515 DP("There are no devices supporting CUDA.\n"); 516 return; 517 } 518 if (!checkResult(Err, "Error returned from cuInit\n")) { 519 return; 520 } 521 522 Err = cuDeviceGetCount(&NumberOfDevices); 523 if (!checkResult(Err, "Error returned from cuDeviceGetCount\n")) 524 return; 525 526 if (NumberOfDevices == 0) { 527 DP("There are no devices 
supporting CUDA.\n"); 528 return; 529 } 530 531 DeviceData.resize(NumberOfDevices); 532 Modules.resize(NumberOfDevices); 533 StreamPool.resize(NumberOfDevices); 534 EventPool.resize(NumberOfDevices); 535 PeerAccessMatrix.resize(NumberOfDevices); 536 for (auto &V : PeerAccessMatrix) 537 V.resize(NumberOfDevices, PeerAccessState::Unkown); 538 539 // Get environment variables regarding teams 540 if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) { 541 // OMP_TEAM_LIMIT has been set 542 EnvTeamLimit = std::stoi(EnvStr); 543 DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit); 544 } 545 if (const char *EnvStr = getenv("OMP_TEAMS_THREAD_LIMIT")) { 546 // OMP_TEAMS_THREAD_LIMIT has been set 547 EnvTeamThreadLimit = std::stoi(EnvStr); 548 DP("Parsed OMP_TEAMS_THREAD_LIMIT=%d\n", EnvTeamThreadLimit); 549 } 550 if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) { 551 // OMP_NUM_TEAMS has been set 552 EnvNumTeams = std::stoi(EnvStr); 553 DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams); 554 } 555 if (const char *EnvStr = getenv("LIBOMPTARGET_SHARED_MEMORY_SIZE")) { 556 // LIBOMPTARGET_SHARED_MEMORY_SIZE has been set 557 DynamicMemorySize = std::stoi(EnvStr); 558 DP("Parsed LIBOMPTARGET_SHARED_MEMORY_SIZE = %" PRIu64 "\n", 559 DynamicMemorySize); 560 } 561 if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS")) { 562 // LIBOMPTARGET_NUM_INITIAL_STREAMS has been set 563 NumInitialStreams = std::stoi(EnvStr); 564 DP("Parsed LIBOMPTARGET_NUM_INITIAL_STREAMS=%d\n", NumInitialStreams); 565 } 566 567 for (int I = 0; I < NumberOfDevices; ++I) 568 DeviceAllocators.emplace_back(); 569 570 // Get the size threshold from environment variable 571 std::pair<size_t, bool> Res = MemoryManagerTy::getSizeThresholdFromEnv(); 572 UseMemoryManager = Res.second; 573 size_t MemoryManagerThreshold = Res.first; 574 575 if (UseMemoryManager) 576 for (int I = 0; I < NumberOfDevices; ++I) 577 MemoryManagers.emplace_back(std::make_unique<MemoryManagerTy>( 578 DeviceAllocators[I], MemoryManagerThreshold)); 579 580 // We lazily initialize all devices later. 
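    // Editorial note: "lazily" means the constructor only discovers devices
    // and sets up the host-side bookkeeping above; the primary context, stream
    // pool and event pool of a device are created in initDevice(), which flips
    // the corresponding flag, and deinitDevice() only tears down devices whose
    // flag is still set.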
581 InitializedFlags.assign(NumberOfDevices, false); 582 } 583 584 ~DeviceRTLTy() { 585 for (int DeviceId = 0; DeviceId < NumberOfDevices; ++DeviceId) 586 deinitDevice(DeviceId); 587 } 588 589 // Check whether a given DeviceId is valid 590 bool isValidDeviceId(const int DeviceId) const { 591 return DeviceId >= 0 && DeviceId < NumberOfDevices; 592 } 593 594 int getNumOfDevices() const { return NumberOfDevices; } 595 596 void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; } 597 598 int initDevice(const int DeviceId) { 599 CUdevice Device; 600 601 DP("Getting device %d\n", DeviceId); 602 CUresult Err = cuDeviceGet(&Device, DeviceId); 603 if (!checkResult(Err, "Error returned from cuDeviceGet\n")) 604 return OFFLOAD_FAIL; 605 606 assert(InitializedFlags[DeviceId] == false && "Reinitializing device!"); 607 InitializedFlags[DeviceId] = true; 608 609 // Query the current flags of the primary context and set its flags if 610 // it is inactive 611 unsigned int FormerPrimaryCtxFlags = 0; 612 int FormerPrimaryCtxIsActive = 0; 613 Err = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags, 614 &FormerPrimaryCtxIsActive); 615 if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxGetState\n")) 616 return OFFLOAD_FAIL; 617 618 if (FormerPrimaryCtxIsActive) { 619 DP("The primary context is active, no change to its flags\n"); 620 if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) != 621 CU_CTX_SCHED_BLOCKING_SYNC) 622 DP("Warning the current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n"); 623 } else { 624 DP("The primary context is inactive, set its flags to " 625 "CU_CTX_SCHED_BLOCKING_SYNC\n"); 626 Err = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC); 627 if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxSetFlags\n")) 628 return OFFLOAD_FAIL; 629 } 630 631 // Retain the per device primary context and save it to use whenever this 632 // device is selected. 633 Err = cuDevicePrimaryCtxRetain(&DeviceData[DeviceId].Context, Device); 634 if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxRetain\n")) 635 return OFFLOAD_FAIL; 636 637 Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); 638 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) 639 return OFFLOAD_FAIL; 640 641 // Initialize the stream pool. 642 if (!StreamPool[DeviceId]) 643 StreamPool[DeviceId] = std::make_unique<StreamPoolTy>(StreamAllocatorTy(), 644 NumInitialStreams); 645 646 // Initialize the event pool. 647 if (!EventPool[DeviceId]) 648 EventPool[DeviceId] = 649 std::make_unique<EventPoolTy>(EventAllocatorTy(), NumInitialEvents); 650 651 // Query attributes to determine number of threads/block and blocks/grid. 652 int MaxGridDimX; 653 Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, 654 Device); 655 if (Err != CUDA_SUCCESS) { 656 DP("Error getting max grid dimension, use default value %d\n", 657 DeviceRTLTy::DefaultNumTeams); 658 DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::DefaultNumTeams; 659 } else { 660 DP("Using %d CUDA blocks per grid\n", MaxGridDimX); 661 DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX; 662 } 663 664 // We are only exploiting threads along the x axis. 
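    // Editorial note: kernels launched by this plugin always use a
    // one-dimensional launch geometry, i.e. gridDim = (CudaBlocksPerGrid, 1, 1)
    // and blockDim = (CudaThreadsPerBlock, 1, 1); see the cuLaunchKernel call
    // in runTargetTeamRegion(). Hence only the x-dimension limits are queried.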
665 int MaxBlockDimX; 666 Err = cuDeviceGetAttribute(&MaxBlockDimX, 667 CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device); 668 if (Err != CUDA_SUCCESS) { 669 DP("Error getting max block dimension, use default value %d\n", 670 DeviceRTLTy::DefaultNumThreads); 671 DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads; 672 } else { 673 DP("Using %d CUDA threads per block\n", MaxBlockDimX); 674 DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX; 675 676 if (EnvTeamThreadLimit > 0 && 677 DeviceData[DeviceId].ThreadsPerBlock > EnvTeamThreadLimit) { 678 DP("Max CUDA threads per block %d exceeds the thread limit %d set by " 679 "OMP_TEAMS_THREAD_LIMIT, capping at the limit\n", 680 DeviceData[DeviceId].ThreadsPerBlock, EnvTeamThreadLimit); 681 DeviceData[DeviceId].ThreadsPerBlock = EnvTeamThreadLimit; 682 } 683 if (DeviceData[DeviceId].ThreadsPerBlock > DeviceRTLTy::HardThreadLimit) { 684 DP("Max CUDA threads per block %d exceeds the hard thread limit %d, " 685 "capping at the hard limit\n", 686 DeviceData[DeviceId].ThreadsPerBlock, DeviceRTLTy::HardThreadLimit); 687 DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit; 688 } 689 } 690 691 // Get and set warp size 692 int WarpSize; 693 Err = 694 cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device); 695 if (Err != CUDA_SUCCESS) { 696 DP("Error getting warp size, assume default value 32\n"); 697 DeviceData[DeviceId].WarpSize = 32; 698 } else { 699 DP("Using warp size %d\n", WarpSize); 700 DeviceData[DeviceId].WarpSize = WarpSize; 701 } 702 703 // Adjust teams to the env variables 704 if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) { 705 DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n", 706 EnvTeamLimit); 707 DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit; 708 } 709 710 size_t StackLimit; 711 size_t HeapLimit; 712 if (const char *EnvStr = getenv("LIBOMPTARGET_STACK_SIZE")) { 713 StackLimit = std::stol(EnvStr); 714 if (cuCtxSetLimit(CU_LIMIT_STACK_SIZE, StackLimit) != CUDA_SUCCESS) 715 return OFFLOAD_FAIL; 716 } else { 717 if (cuCtxGetLimit(&StackLimit, CU_LIMIT_STACK_SIZE) != CUDA_SUCCESS) 718 return OFFLOAD_FAIL; 719 } 720 if (const char *EnvStr = getenv("LIBOMPTARGET_HEAP_SIZE")) { 721 HeapLimit = std::stol(EnvStr); 722 if (cuCtxSetLimit(CU_LIMIT_MALLOC_HEAP_SIZE, HeapLimit) != CUDA_SUCCESS) 723 return OFFLOAD_FAIL; 724 } else { 725 if (cuCtxGetLimit(&HeapLimit, CU_LIMIT_MALLOC_HEAP_SIZE) != CUDA_SUCCESS) 726 return OFFLOAD_FAIL; 727 } 728 729 INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, 730 "Device supports up to %d CUDA blocks and %d threads with a " 731 "warp size of %d\n", 732 DeviceData[DeviceId].BlocksPerGrid, 733 DeviceData[DeviceId].ThreadsPerBlock, DeviceData[DeviceId].WarpSize); 734 INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, 735 "Device heap size is %d Bytes, device stack size is %d Bytes per " 736 "thread\n", 737 (int)HeapLimit, (int)StackLimit); 738 739 // Set default number of teams 740 if (EnvNumTeams > 0) { 741 DP("Default number of teams set according to environment %d\n", 742 EnvNumTeams); 743 DeviceData[DeviceId].NumTeams = EnvNumTeams; 744 } else { 745 DeviceData[DeviceId].NumTeams = DeviceRTLTy::DefaultNumTeams; 746 DP("Default number of teams set according to library's default %d\n", 747 DeviceRTLTy::DefaultNumTeams); 748 } 749 750 if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) { 751 DP("Default number of teams exceeds device limit, capping at %d\n", 752 DeviceData[DeviceId].BlocksPerGrid); 753 
DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid; 754 } 755 756 // Set default number of threads 757 DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads; 758 DP("Default number of threads set according to library's default %d\n", 759 DeviceRTLTy::DefaultNumThreads); 760 if (DeviceData[DeviceId].NumThreads > 761 DeviceData[DeviceId].ThreadsPerBlock) { 762 DP("Default number of threads exceeds device limit, capping at %d\n", 763 DeviceData[DeviceId].ThreadsPerBlock); 764 DeviceData[DeviceId].NumThreads = DeviceData[DeviceId].ThreadsPerBlock; 765 } 766 767 return OFFLOAD_SUCCESS; 768 } 769 770 int deinitDevice(const int DeviceId) { 771 auto IsInitialized = InitializedFlags[DeviceId]; 772 if (!IsInitialized) 773 return OFFLOAD_SUCCESS; 774 InitializedFlags[DeviceId] = false; 775 776 if (UseMemoryManager) 777 MemoryManagers[DeviceId].release(); 778 779 StreamPool[DeviceId].reset(); 780 EventPool[DeviceId].reset(); 781 782 DeviceDataTy &D = DeviceData[DeviceId]; 783 if (!checkResult(cuCtxSetCurrent(D.Context), 784 "Error returned from cuCtxSetCurrent\n")) 785 return OFFLOAD_FAIL; 786 787 // Unload all modules. 788 for (auto &M : Modules[DeviceId]) 789 if (!checkResult(cuModuleUnload(M), 790 "Error returned from cuModuleUnload\n")) 791 return OFFLOAD_FAIL; 792 793 // Destroy context. 794 CUdevice Device; 795 if (!checkResult(cuCtxGetDevice(&Device), 796 "Error returned from cuCtxGetDevice\n")) 797 return OFFLOAD_FAIL; 798 799 if (!checkResult(cuDevicePrimaryCtxRelease(Device), 800 "Error returned from cuDevicePrimaryCtxRelease\n")) 801 return OFFLOAD_FAIL; 802 803 return OFFLOAD_SUCCESS; 804 } 805 806 __tgt_target_table *loadBinary(const int DeviceId, 807 const __tgt_device_image *Image) { 808 // Clear the offload table as we are going to create a new one. 809 clearOffloadEntriesTable(DeviceId); 810 811 // Create the module and extract the function pointers. 812 CUmodule Module; 813 DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart)); 814 CUresult Err = 815 cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr); 816 if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n")) 817 return nullptr; 818 819 DP("CUDA module successfully loaded!\n"); 820 821 Modules[DeviceId].push_back(Module); 822 823 // Find the symbols in the module by name. 824 const __tgt_offload_entry *HostBegin = Image->EntriesBegin; 825 const __tgt_offload_entry *HostEnd = Image->EntriesEnd; 826 827 std::list<KernelTy> &KernelsList = DeviceData[DeviceId].KernelsList; 828 for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) { 829 if (!E->addr) { 830 // We return nullptr when something like this happens, the host should 831 // have always something in the address to uniquely identify the target 832 // region. 
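      // Editorial note: a __tgt_offload_entry carries the host address, the
      // symbol name, a size (non-zero for declare-target globals, zero for
      // kernel entry points) and flags; without a host address there is no way
      // to match host and device entries, hence the bail-out below.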
833 DP("Invalid binary: host entry '<null>' (size = %zd)...\n", E->size); 834 return nullptr; 835 } 836 837 if (E->size) { 838 __tgt_offload_entry Entry = *E; 839 CUdeviceptr CUPtr; 840 size_t CUSize; 841 Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name); 842 // We keep this style here because we need the name 843 if (Err != CUDA_SUCCESS) { 844 REPORT("Loading global '%s' Failed\n", E->name); 845 CUDA_ERR_STRING(Err); 846 return nullptr; 847 } 848 849 if (CUSize != E->size) { 850 DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name, 851 CUSize, E->size); 852 return nullptr; 853 } 854 855 DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n", 856 DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr)); 857 858 Entry.addr = (void *)(CUPtr); 859 860 // Note: In the current implementation declare target variables 861 // can either be link or to. This means that once unified 862 // memory is activated via the requires directive, the variable 863 // can be used directly from the host in both cases. 864 // TODO: when variables types other than to or link are added, 865 // the below condition should be changed to explicitly 866 // check for to and link variables types: 867 // (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags & 868 // OMP_DECLARE_TARGET_LINK || e->flags == OMP_DECLARE_TARGET_TO)) 869 if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) { 870 // If unified memory is present any target link or to variables 871 // can access host addresses directly. There is no longer a 872 // need for device copies. 873 cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *)); 874 DP("Copy linked variable host address (" DPxMOD 875 ") to device address (" DPxMOD ")\n", 876 DPxPTR(*((void **)E->addr)), DPxPTR(CUPtr)); 877 } 878 879 addOffloadEntry(DeviceId, Entry); 880 881 continue; 882 } 883 884 CUfunction Func; 885 Err = cuModuleGetFunction(&Func, Module, E->name); 886 // We keep this style here because we need the name 887 if (Err != CUDA_SUCCESS) { 888 REPORT("Loading '%s' Failed\n", E->name); 889 CUDA_ERR_STRING(Err); 890 return nullptr; 891 } 892 893 DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n", 894 DPxPTR(E - HostBegin), E->name, DPxPTR(Func)); 895 896 // default value GENERIC (in case symbol is missing from cubin file) 897 llvm::omp::OMPTgtExecModeFlags ExecModeVal; 898 std::string ExecModeNameStr(E->name); 899 ExecModeNameStr += "_exec_mode"; 900 const char *ExecModeName = ExecModeNameStr.c_str(); 901 902 CUdeviceptr ExecModePtr; 903 size_t CUSize; 904 Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName); 905 if (Err == CUDA_SUCCESS) { 906 if (CUSize != sizeof(llvm::omp::OMPTgtExecModeFlags)) { 907 DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n", 908 ExecModeName, CUSize, sizeof(llvm::omp::OMPTgtExecModeFlags)); 909 return nullptr; 910 } 911 912 Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize); 913 if (Err != CUDA_SUCCESS) { 914 REPORT("Error when copying data from device to host. 
Pointers: " 915 "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n", 916 DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize); 917 CUDA_ERR_STRING(Err); 918 return nullptr; 919 } 920 } else { 921 DP("Loading global exec_mode '%s' - symbol missing, using default " 922 "value GENERIC (1)\n", 923 ExecModeName); 924 } 925 926 KernelsList.emplace_back(Func, ExecModeVal); 927 928 __tgt_offload_entry Entry = *E; 929 Entry.addr = &KernelsList.back(); 930 addOffloadEntry(DeviceId, Entry); 931 } 932 933 // send device environment data to the device 934 { 935 // TODO: The device ID used here is not the real device ID used by OpenMP. 936 DeviceEnvironmentTy DeviceEnv{0, static_cast<uint32_t>(NumberOfDevices), 937 static_cast<uint32_t>(DeviceId), 938 static_cast<uint32_t>(DynamicMemorySize)}; 939 940 if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) 941 DeviceEnv.DebugKind = std::stoi(EnvStr); 942 943 const char *DeviceEnvName = "omptarget_device_environment"; 944 CUdeviceptr DeviceEnvPtr; 945 size_t CUSize; 946 947 Err = cuModuleGetGlobal(&DeviceEnvPtr, &CUSize, Module, DeviceEnvName); 948 if (Err == CUDA_SUCCESS) { 949 if (CUSize != sizeof(DeviceEnv)) { 950 REPORT( 951 "Global device_environment '%s' - size mismatch (%zu != %zu)\n", 952 DeviceEnvName, CUSize, sizeof(int32_t)); 953 CUDA_ERR_STRING(Err); 954 return nullptr; 955 } 956 957 Err = cuMemcpyHtoD(DeviceEnvPtr, &DeviceEnv, CUSize); 958 if (Err != CUDA_SUCCESS) { 959 REPORT("Error when copying data from host to device. Pointers: " 960 "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n", 961 DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize); 962 CUDA_ERR_STRING(Err); 963 return nullptr; 964 } 965 966 DP("Sending global device environment data %zu bytes\n", CUSize); 967 } else { 968 DP("Finding global device environment '%s' - symbol missing.\n", 969 DeviceEnvName); 970 DP("Continue, considering this is a device RTL which does not accept " 971 "environment setting.\n"); 972 } 973 } 974 975 return getOffloadEntriesTable(DeviceId); 976 } 977 978 void *dataAlloc(const int DeviceId, const int64_t Size, 979 const TargetAllocTy Kind) { 980 switch (Kind) { 981 case TARGET_ALLOC_DEFAULT: 982 case TARGET_ALLOC_DEVICE: 983 if (UseMemoryManager) 984 return MemoryManagers[DeviceId]->allocate(Size, nullptr); 985 else 986 return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind); 987 case TARGET_ALLOC_HOST: 988 case TARGET_ALLOC_SHARED: 989 return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind); 990 } 991 992 REPORT("Invalid target data allocation kind or requested allocator not " 993 "implemented yet\n"); 994 995 return nullptr; 996 } 997 998 int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr, 999 const int64_t Size, __tgt_async_info *AsyncInfo) const { 1000 assert(AsyncInfo && "AsyncInfo is nullptr"); 1001 1002 CUstream Stream = getStream(DeviceId, AsyncInfo); 1003 CUresult Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream); 1004 if (Err != CUDA_SUCCESS) { 1005 DP("Error when copying data from host to device. 
Pointers: host " 1006 "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n", 1007 DPxPTR(HstPtr), DPxPTR(TgtPtr), Size); 1008 CUDA_ERR_STRING(Err); 1009 return OFFLOAD_FAIL; 1010 } 1011 1012 return OFFLOAD_SUCCESS; 1013 } 1014 1015 int dataRetrieve(const int DeviceId, void *HstPtr, const void *TgtPtr, 1016 const int64_t Size, __tgt_async_info *AsyncInfo) const { 1017 assert(AsyncInfo && "AsyncInfo is nullptr"); 1018 1019 CUstream Stream = getStream(DeviceId, AsyncInfo); 1020 CUresult Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream); 1021 if (Err != CUDA_SUCCESS) { 1022 DP("Error when copying data from device to host. Pointers: host " 1023 "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n", 1024 DPxPTR(HstPtr), DPxPTR(TgtPtr), Size); 1025 CUDA_ERR_STRING(Err); 1026 return OFFLOAD_FAIL; 1027 } 1028 1029 return OFFLOAD_SUCCESS; 1030 } 1031 1032 int dataExchange(int SrcDevId, const void *SrcPtr, int DstDevId, void *DstPtr, 1033 int64_t Size, __tgt_async_info *AsyncInfo) { 1034 assert(AsyncInfo && "AsyncInfo is nullptr"); 1035 1036 CUresult Err; 1037 CUstream Stream = getStream(SrcDevId, AsyncInfo); 1038 1039 // If they are two devices, we try peer to peer copy first 1040 if (SrcDevId != DstDevId) { 1041 std::lock_guard<std::mutex> LG(PeerAccessMatrixLock); 1042 1043 switch (PeerAccessMatrix[SrcDevId][DstDevId]) { 1044 case PeerAccessState::No: { 1045 REPORT("Peer access from %" PRId32 " to %" PRId32 1046 " is not supported. Fall back to D2D memcpy.\n", 1047 SrcDevId, DstDevId); 1048 return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); 1049 } 1050 case PeerAccessState::Unkown: { 1051 int CanAccessPeer = 0; 1052 Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId); 1053 if (Err != CUDA_SUCCESS) { 1054 REPORT("Error returned from cuDeviceCanAccessPeer. src = %" PRId32 1055 ", dst = %" PRId32 ". Fall back to D2D memcpy.\n", 1056 SrcDevId, DstDevId); 1057 CUDA_ERR_STRING(Err); 1058 PeerAccessMatrix[SrcDevId][DstDevId] = PeerAccessState::No; 1059 return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); 1060 } 1061 1062 if (!CanAccessPeer) { 1063 REPORT("P2P access from %d to %d is not supported. Fall back to D2D " 1064 "memcpy.\n", 1065 SrcDevId, DstDevId); 1066 PeerAccessMatrix[SrcDevId][DstDevId] = PeerAccessState::No; 1067 return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); 1068 } 1069 1070 Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0); 1071 if (Err != CUDA_SUCCESS) { 1072 REPORT("Error returned from cuCtxEnablePeerAccess. src = %" PRId32 1073 ", dst = %" PRId32 ". Fall back to D2D memcpy.\n", 1074 SrcDevId, DstDevId); 1075 CUDA_ERR_STRING(Err); 1076 PeerAccessMatrix[SrcDevId][DstDevId] = PeerAccessState::No; 1077 return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); 1078 } 1079 1080 PeerAccessMatrix[SrcDevId][DstDevId] = PeerAccessState::Yes; 1081 1082 LLVM_FALLTHROUGH; 1083 } 1084 case PeerAccessState::Yes: { 1085 Err = cuMemcpyPeerAsync( 1086 (CUdeviceptr)DstPtr, DeviceData[DstDevId].Context, 1087 (CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context, Size, Stream); 1088 if (Err == CUDA_SUCCESS) 1089 return OFFLOAD_SUCCESS; 1090 1091 DP("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD 1092 ", src_id =%" PRId32 ", dst_ptr = " DPxMOD ", dst_id =%" PRId32 1093 ". 
Fall back to D2D memcpy.\n", 1094 DPxPTR(SrcPtr), SrcDevId, DPxPTR(DstPtr), DstDevId); 1095 CUDA_ERR_STRING(Err); 1096 1097 return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); 1098 } 1099 } 1100 } 1101 1102 return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); 1103 } 1104 1105 int dataDelete(const int DeviceId, void *TgtPtr) { 1106 if (UseMemoryManager) 1107 return MemoryManagers[DeviceId]->free(TgtPtr); 1108 1109 return DeviceAllocators[DeviceId].free(TgtPtr); 1110 } 1111 1112 int runTargetTeamRegion(const int DeviceId, void *TgtEntryPtr, void **TgtArgs, 1113 ptrdiff_t *TgtOffsets, const int ArgNum, 1114 const int TeamNum, const int ThreadLimit, 1115 const unsigned int LoopTripCount, 1116 __tgt_async_info *AsyncInfo) const { 1117 // All args are references. 1118 std::vector<void *> Args(ArgNum); 1119 std::vector<void *> Ptrs(ArgNum); 1120 1121 for (int I = 0; I < ArgNum; ++I) { 1122 Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]); 1123 Args[I] = &Ptrs[I]; 1124 } 1125 1126 KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr); 1127 1128 const bool IsSPMDGenericMode = 1129 KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD; 1130 const bool IsSPMDMode = 1131 KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_SPMD; 1132 const bool IsGenericMode = 1133 KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC; 1134 1135 int CudaThreadsPerBlock; 1136 if (ThreadLimit > 0) { 1137 DP("Setting CUDA threads per block to requested %d\n", ThreadLimit); 1138 CudaThreadsPerBlock = ThreadLimit; 1139 // Add master warp if necessary 1140 if (IsGenericMode) { 1141 DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize); 1142 CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize; 1143 } 1144 } else { 1145 DP("Setting CUDA threads per block to default %d\n", 1146 DeviceData[DeviceId].NumThreads); 1147 CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads; 1148 } 1149 1150 if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) { 1151 DP("Threads per block capped at device limit %d\n", 1152 DeviceData[DeviceId].ThreadsPerBlock); 1153 CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock; 1154 } 1155 1156 CUresult Err; 1157 if (!KernelInfo->MaxThreadsPerBlock) { 1158 Err = cuFuncGetAttribute(&KernelInfo->MaxThreadsPerBlock, 1159 CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, 1160 KernelInfo->Func); 1161 if (!checkResult(Err, "Error returned from cuFuncGetAttribute\n")) 1162 return OFFLOAD_FAIL; 1163 } 1164 1165 if (KernelInfo->MaxThreadsPerBlock < CudaThreadsPerBlock) { 1166 DP("Threads per block capped at kernel limit %d\n", 1167 KernelInfo->MaxThreadsPerBlock); 1168 CudaThreadsPerBlock = KernelInfo->MaxThreadsPerBlock; 1169 } 1170 1171 unsigned int CudaBlocksPerGrid; 1172 if (TeamNum <= 0) { 1173 if (LoopTripCount > 0 && EnvNumTeams < 0) { 1174 if (IsSPMDGenericMode) { 1175 // If we reach this point, then we are executing a kernel that was 1176 // transformed from Generic-mode to SPMD-mode. This kernel has 1177 // SPMD-mode execution, but needs its blocks to be scheduled 1178 // differently because the current loop trip count only applies to the 1179 // `teams distribute` region and will create var too few blocks using 1180 // the regular SPMD-mode method. 1181 CudaBlocksPerGrid = LoopTripCount; 1182 } else if (IsSPMDMode) { 1183 // We have a combined construct, i.e. `target teams distribute 1184 // parallel for [simd]`. We launch so many teams so that each thread 1185 // will execute one iteration of the loop. 
round up to the nearest 1186 // integer 1187 CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1; 1188 } else if (IsGenericMode) { 1189 // If we reach this point, then we have a non-combined construct, i.e. 1190 // `teams distribute` with a nested `parallel for` and each team is 1191 // assigned one iteration of the `distribute` loop. E.g.: 1192 // 1193 // #pragma omp target teams distribute 1194 // for(...loop_tripcount...) { 1195 // #pragma omp parallel for 1196 // for(...) {} 1197 // } 1198 // 1199 // Threads within a team will execute the iterations of the `parallel` 1200 // loop. 1201 CudaBlocksPerGrid = LoopTripCount; 1202 } else { 1203 REPORT("Unknown execution mode: %d\n", 1204 static_cast<int8_t>(KernelInfo->ExecutionMode)); 1205 return OFFLOAD_FAIL; 1206 } 1207 DP("Using %d teams due to loop trip count %" PRIu32 1208 " and number of threads per block %d\n", 1209 CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock); 1210 } else { 1211 DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams); 1212 CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams; 1213 } 1214 } else { 1215 DP("Using requested number of teams %d\n", TeamNum); 1216 CudaBlocksPerGrid = TeamNum; 1217 } 1218 1219 if (CudaBlocksPerGrid > DeviceData[DeviceId].BlocksPerGrid) { 1220 DP("Capping number of teams to team limit %d\n", 1221 DeviceData[DeviceId].BlocksPerGrid); 1222 CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid; 1223 } 1224 1225 INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, 1226 "Launching kernel %s with %d blocks and %d threads in %s mode\n", 1227 (getOffloadEntry(DeviceId, TgtEntryPtr)) 1228 ? getOffloadEntry(DeviceId, TgtEntryPtr)->name 1229 : "(null)", 1230 CudaBlocksPerGrid, CudaThreadsPerBlock, 1231 (!IsSPMDMode ? (IsGenericMode ? "Generic" : "SPMD-Generic") : "SPMD")); 1232 1233 CUstream Stream = getStream(DeviceId, AsyncInfo); 1234 Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1, 1235 /* gridDimZ */ 1, CudaThreadsPerBlock, 1236 /* blockDimY */ 1, /* blockDimZ */ 1, 1237 DynamicMemorySize, Stream, &Args[0], nullptr); 1238 if (!checkResult(Err, "Error returned from cuLaunchKernel\n")) 1239 return OFFLOAD_FAIL; 1240 1241 DP("Launch of entry point at " DPxMOD " successful!\n", 1242 DPxPTR(TgtEntryPtr)); 1243 1244 return OFFLOAD_SUCCESS; 1245 } 1246 1247 int synchronize(const int DeviceId, __tgt_async_info *AsyncInfo) const { 1248 CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue); 1249 CUresult Err = cuStreamSynchronize(Stream); 1250 1251 // Once the stream is synchronized, return it to stream pool and reset 1252 // AsyncInfo. This is to make sure the synchronization only works for its 1253 // own tasks. 1254 StreamPool[DeviceId]->release(reinterpret_cast<CUstream>(AsyncInfo->Queue)); 1255 AsyncInfo->Queue = nullptr; 1256 1257 if (Err != CUDA_SUCCESS) { 1258 DP("Error when synchronizing stream. stream = " DPxMOD 1259 ", async info ptr = " DPxMOD "\n", 1260 DPxPTR(Stream), DPxPTR(AsyncInfo)); 1261 CUDA_ERR_STRING(Err); 1262 } 1263 return (Err == CUDA_SUCCESS) ? 
OFFLOAD_SUCCESS : OFFLOAD_FAIL; 1264 } 1265 1266 void printDeviceInfo(int32_t DeviceId) { 1267 char TmpChar[1000]; 1268 std::string TmpStr; 1269 size_t TmpSt; 1270 int TmpInt, TmpInt2, TmpInt3; 1271 1272 CUdevice Device; 1273 checkResult(cuDeviceGet(&Device, DeviceId), 1274 "Error returned from cuCtxGetDevice\n"); 1275 1276 cuDriverGetVersion(&TmpInt); 1277 printf(" CUDA Driver Version: \t\t%d \n", TmpInt); 1278 printf(" CUDA Device Number: \t\t%d \n", DeviceId); 1279 checkResult(cuDeviceGetName(TmpChar, 1000, Device), 1280 "Error returned from cuDeviceGetName\n"); 1281 printf(" Device Name: \t\t\t%s \n", TmpChar); 1282 checkResult(cuDeviceTotalMem(&TmpSt, Device), 1283 "Error returned from cuDeviceTotalMem\n"); 1284 printf(" Global Memory Size: \t\t%zu bytes \n", TmpSt); 1285 checkResult(cuDeviceGetAttribute( 1286 &TmpInt, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, Device), 1287 "Error returned from cuDeviceGetAttribute\n"); 1288 printf(" Number of Multiprocessors: \t\t%d \n", TmpInt); 1289 checkResult( 1290 cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, Device), 1291 "Error returned from cuDeviceGetAttribute\n"); 1292 printf(" Concurrent Copy and Execution: \t%s \n", BOOL2TEXT(TmpInt)); 1293 checkResult(cuDeviceGetAttribute( 1294 &TmpInt, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, Device), 1295 "Error returned from cuDeviceGetAttribute\n"); 1296 printf(" Total Constant Memory: \t\t%d bytes\n", TmpInt); 1297 checkResult( 1298 cuDeviceGetAttribute( 1299 &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, Device), 1300 "Error returned from cuDeviceGetAttribute\n"); 1301 printf(" Max Shared Memory per Block: \t%d bytes \n", TmpInt); 1302 checkResult( 1303 cuDeviceGetAttribute( 1304 &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, Device), 1305 "Error returned from cuDeviceGetAttribute\n"); 1306 printf(" Registers per Block: \t\t%d \n", TmpInt); 1307 checkResult( 1308 cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device), 1309 "Error returned from cuDeviceGetAttribute\n"); 1310 printf(" Warp Size: \t\t\t\t%d Threads \n", TmpInt); 1311 checkResult(cuDeviceGetAttribute( 1312 &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, Device), 1313 "Error returned from cuDeviceGetAttribute\n"); 1314 printf(" Maximum Threads per Block: \t\t%d \n", TmpInt); 1315 checkResult(cuDeviceGetAttribute( 1316 &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device), 1317 "Error returned from cuDeviceGetAttribute\n"); 1318 checkResult(cuDeviceGetAttribute( 1319 &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, Device), 1320 "Error returned from cuDeviceGetAttribute\n"); 1321 checkResult(cuDeviceGetAttribute( 1322 &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, Device), 1323 "Error returned from cuDeviceGetAttribute\n"); 1324 printf(" Maximum Block Dimensions: \t\t%d, %d, %d \n", TmpInt, TmpInt2, 1325 TmpInt3); 1326 checkResult(cuDeviceGetAttribute( 1327 &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, Device), 1328 "Error returned from cuDeviceGetAttribute\n"); 1329 checkResult(cuDeviceGetAttribute( 1330 &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, Device), 1331 "Error returned from cuDeviceGetAttribute\n"); 1332 checkResult(cuDeviceGetAttribute( 1333 &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, Device), 1334 "Error returned from cuDeviceGetAttribute\n"); 1335 printf(" Maximum Grid Dimensions: \t\t%d x %d x %d \n", TmpInt, TmpInt2, 1336 TmpInt3); 1337 checkResult( 1338 cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_PITCH, Device), 1339 "Error returned from 
cuDeviceGetAttribute\n"); 1340 printf(" Maximum Memory Pitch: \t\t%d bytes \n", TmpInt); 1341 checkResult(cuDeviceGetAttribute( 1342 &TmpInt, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, Device), 1343 "Error returned from cuDeviceGetAttribute\n"); 1344 printf(" Texture Alignment: \t\t\t%d bytes \n", TmpInt); 1345 checkResult( 1346 cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, Device), 1347 "Error returned from cuDeviceGetAttribute\n"); 1348 printf(" Clock Rate: \t\t\t%d kHz\n", TmpInt); 1349 checkResult(cuDeviceGetAttribute( 1350 &TmpInt, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, Device), 1351 "Error returned from cuDeviceGetAttribute\n"); 1352 printf(" Execution Timeout: \t\t\t%s \n", BOOL2TEXT(TmpInt)); 1353 checkResult( 1354 cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_INTEGRATED, Device), 1355 "Error returned from cuDeviceGetAttribute\n"); 1356 printf(" Integrated Device: \t\t\t%s \n", BOOL2TEXT(TmpInt)); 1357 checkResult(cuDeviceGetAttribute( 1358 &TmpInt, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, Device), 1359 "Error returned from cuDeviceGetAttribute\n"); 1360 printf(" Can Map Host Memory: \t\t%s \n", BOOL2TEXT(TmpInt)); 1361 checkResult( 1362 cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, Device), 1363 "Error returned from cuDeviceGetAttribute\n"); 1364 if (TmpInt == CU_COMPUTEMODE_DEFAULT) 1365 TmpStr = "DEFAULT"; 1366 else if (TmpInt == CU_COMPUTEMODE_PROHIBITED) 1367 TmpStr = "PROHIBITED"; 1368 else if (TmpInt == CU_COMPUTEMODE_EXCLUSIVE_PROCESS) 1369 TmpStr = "EXCLUSIVE PROCESS"; 1370 else 1371 TmpStr = "unknown"; 1372 printf(" Compute Mode: \t\t\t%s \n", TmpStr.c_str()); 1373 checkResult(cuDeviceGetAttribute( 1374 &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, Device), 1375 "Error returned from cuDeviceGetAttribute\n"); 1376 printf(" Concurrent Kernels: \t\t%s \n", BOOL2TEXT(TmpInt)); 1377 checkResult( 1378 cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, Device), 1379 "Error returned from cuDeviceGetAttribute\n"); 1380 printf(" ECC Enabled: \t\t\t%s \n", BOOL2TEXT(TmpInt)); 1381 checkResult(cuDeviceGetAttribute( 1382 &TmpInt, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, Device), 1383 "Error returned from cuDeviceGetAttribute\n"); 1384 printf(" Memory Clock Rate: \t\t\t%d kHz\n", TmpInt); 1385 checkResult( 1386 cuDeviceGetAttribute( 1387 &TmpInt, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, Device), 1388 "Error returned from cuDeviceGetAttribute\n"); 1389 printf(" Memory Bus Width: \t\t\t%d bits\n", TmpInt); 1390 checkResult(cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, 1391 Device), 1392 "Error returned from cuDeviceGetAttribute\n"); 1393 printf(" L2 Cache Size: \t\t\t%d bytes \n", TmpInt); 1394 checkResult(cuDeviceGetAttribute( 1395 &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, 1396 Device), 1397 "Error returned from cuDeviceGetAttribute\n"); 1398 printf(" Max Threads Per SMP: \t\t%d \n", TmpInt); 1399 checkResult(cuDeviceGetAttribute( 1400 &TmpInt, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, Device), 1401 "Error returned from cuDeviceGetAttribute\n"); 1402 printf(" Async Engines: \t\t\t%s (%d) \n", BOOL2TEXT(TmpInt), TmpInt); 1403 checkResult(cuDeviceGetAttribute( 1404 &TmpInt, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, Device), 1405 "Error returned from cuDeviceGetAttribute\n"); 1406 printf(" Unified Addressing: \t\t%s \n", BOOL2TEXT(TmpInt)); 1407 checkResult(cuDeviceGetAttribute( 1408 &TmpInt, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, Device), 1409 "Error returned from cuDeviceGetAttribute\n"); 1410 printf(" 
Managed Memory: \t\t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf(" Concurrent Managed Memory: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf(" Preemption Supported: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf(" Cooperative Launch: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf(" Multi-Device Boards: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, Device),
        "Error returned from cuDeviceGetAttribute\n");
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt2, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf(" Compute Capabilities: \t\t%d%d \n", TmpInt, TmpInt2);
  }

  int createEvent(int DeviceId, void **P) {
    CUevent Event = nullptr;
    if (EventPool[DeviceId]->acquire(Event) != OFFLOAD_SUCCESS)
      return OFFLOAD_FAIL;
    *P = Event;
    return OFFLOAD_SUCCESS;
  }

  int destroyEvent(int DeviceId, void *EventPtr) {
    EventPool[DeviceId]->release(reinterpret_cast<CUevent>(EventPtr));
    return OFFLOAD_SUCCESS;
  }

  int waitEvent(const int DeviceId, __tgt_async_info *AsyncInfo,
                void *EventPtr) const {
    CUstream Stream = getStream(DeviceId, AsyncInfo);
    CUevent Event = reinterpret_cast<CUevent>(EventPtr);

    // We don't use CU_EVENT_WAIT_DEFAULT here as it is only available from a
    // specific CUDA version on, and it is defined as 0x0. In previous
    // versions, per the CUDA API documentation, that argument has to be 0x0.
    CUresult Err = cuStreamWaitEvent(Stream, Event, 0);
    if (Err != CUDA_SUCCESS) {
      DP("Error when waiting event. 
stream = " DPxMOD ", event = " DPxMOD "\n", 1464 DPxPTR(Stream), DPxPTR(Event)); 1465 CUDA_ERR_STRING(Err); 1466 return OFFLOAD_FAIL; 1467 } 1468 1469 return OFFLOAD_SUCCESS; 1470 } 1471 1472 int releaseAsyncInfo(int DeviceId, __tgt_async_info *AsyncInfo) const { 1473 if (AsyncInfo->Queue) { 1474 StreamPool[DeviceId]->release( 1475 reinterpret_cast<CUstream>(AsyncInfo->Queue)); 1476 AsyncInfo->Queue = nullptr; 1477 } 1478 1479 return OFFLOAD_SUCCESS; 1480 } 1481 1482 int initAsyncInfo(int DeviceId, __tgt_async_info **AsyncInfo) const { 1483 *AsyncInfo = new __tgt_async_info; 1484 getStream(DeviceId, *AsyncInfo); 1485 return OFFLOAD_SUCCESS; 1486 } 1487 1488 int initDeviceInfo(int DeviceId, __tgt_device_info *DeviceInfo, 1489 const char **ErrStr) const { 1490 assert(DeviceInfo && "DeviceInfo is nullptr"); 1491 1492 if (!DeviceInfo->Context) 1493 DeviceInfo->Context = DeviceData[DeviceId].Context; 1494 if (!DeviceInfo->Device) { 1495 CUdevice Dev; 1496 CUresult Err = cuDeviceGet(&Dev, DeviceId); 1497 if (Err == CUDA_SUCCESS) { 1498 DeviceInfo->Device = reinterpret_cast<void *>(Dev); 1499 } else { 1500 cuGetErrorString(Err, ErrStr); 1501 return OFFLOAD_FAIL; 1502 } 1503 } 1504 return OFFLOAD_SUCCESS; 1505 } 1506 1507 int setContext(int DeviceId) { 1508 assert(InitializedFlags[DeviceId] && "Device is not initialized"); 1509 1510 CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); 1511 if (!checkResult(Err, "error returned from cuCtxSetCurrent")) 1512 return OFFLOAD_FAIL; 1513 1514 return OFFLOAD_SUCCESS; 1515 } 1516 }; 1517 1518 DeviceRTLTy DeviceRTL; 1519 } // namespace 1520 1521 // Exposed library API function 1522 #ifdef __cplusplus 1523 extern "C" { 1524 #endif 1525 1526 int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image) { 1527 return elf_check_machine(Image, /* EM_CUDA */ 190); 1528 } 1529 1530 int32_t __tgt_rtl_is_valid_binary_info(__tgt_device_image *image, 1531 __tgt_image_info *info) { 1532 if (!__tgt_rtl_is_valid_binary(image)) 1533 return false; 1534 1535 // A subarchitecture was not specified. Assume it is compatible. 1536 if (!info || !info->Arch) 1537 return true; 1538 1539 int32_t NumberOfDevices = 0; 1540 if (cuDeviceGetCount(&NumberOfDevices) != CUDA_SUCCESS) 1541 return false; 1542 1543 StringRef ArchStr = StringRef(info->Arch).drop_front(sizeof("sm_") - 1); 1544 for (int32_t DeviceId = 0; DeviceId < NumberOfDevices; ++DeviceId) { 1545 CUdevice Device; 1546 if (cuDeviceGet(&Device, DeviceId) != CUDA_SUCCESS) 1547 return false; 1548 1549 int32_t Major, Minor; 1550 if (cuDeviceGetAttribute(&Major, 1551 CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, 1552 Device) != CUDA_SUCCESS) 1553 return false; 1554 if (cuDeviceGetAttribute(&Minor, 1555 CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, 1556 Device) != CUDA_SUCCESS) 1557 return false; 1558 1559 // A cubin generated for a certain compute capability is supported to run on 1560 // any GPU with the same major revision and same or higher minor revision. 
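    // For example, an image built for sm_70 (major 7, minor 0) is accepted on
    // an sm_75 device but rejected on sm_61 or sm_80, whose major revisions
    // differ. Note that the digit-by-digit parse below assumes single-digit
    // major and minor revisions.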
1561 int32_t ImageMajor = ArchStr[0] - '0'; 1562 int32_t ImageMinor = ArchStr[1] - '0'; 1563 if (Major != ImageMajor || Minor < ImageMinor) 1564 return false; 1565 } 1566 1567 DP("Image has compatible compute capability: %s\n", info->Arch); 1568 return true; 1569 } 1570 1571 int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); } 1572 1573 int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) { 1574 DP("Init requires flags to %" PRId64 "\n", RequiresFlags); 1575 DeviceRTL.setRequiresFlag(RequiresFlags); 1576 return RequiresFlags; 1577 } 1578 1579 int32_t __tgt_rtl_is_data_exchangable(int32_t SrcDevId, int DstDevId) { 1580 if (DeviceRTL.isValidDeviceId(SrcDevId) && 1581 DeviceRTL.isValidDeviceId(DstDevId)) 1582 return 1; 1583 1584 return 0; 1585 } 1586 1587 int32_t __tgt_rtl_init_device(int32_t DeviceId) { 1588 assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid"); 1589 // Context is set when init the device. 1590 1591 return DeviceRTL.initDevice(DeviceId); 1592 } 1593 1594 int32_t __tgt_rtl_deinit_device(int32_t DeviceId) { 1595 assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid"); 1596 // Context is set when deinit the device. 1597 1598 return DeviceRTL.deinitDevice(DeviceId); 1599 } 1600 1601 __tgt_target_table *__tgt_rtl_load_binary(int32_t DeviceId, 1602 __tgt_device_image *Image) { 1603 assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid"); 1604 1605 if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS) 1606 return nullptr; 1607 1608 return DeviceRTL.loadBinary(DeviceId, Image); 1609 } 1610 1611 void *__tgt_rtl_data_alloc(int32_t DeviceId, int64_t Size, void *, 1612 int32_t Kind) { 1613 assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid"); 1614 1615 if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS) 1616 return nullptr; 1617 1618 return DeviceRTL.dataAlloc(DeviceId, Size, (TargetAllocTy)Kind); 1619 } 1620 1621 int32_t __tgt_rtl_data_submit(int32_t DeviceId, void *TgtPtr, void *HstPtr, 1622 int64_t Size) { 1623 assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid"); 1624 // Context is set in __tgt_rtl_data_submit_async. 1625 1626 __tgt_async_info AsyncInfo; 1627 const int32_t Rc = 1628 __tgt_rtl_data_submit_async(DeviceId, TgtPtr, HstPtr, Size, &AsyncInfo); 1629 if (Rc != OFFLOAD_SUCCESS) 1630 return OFFLOAD_FAIL; 1631 1632 return __tgt_rtl_synchronize(DeviceId, &AsyncInfo); 1633 } 1634 1635 int32_t __tgt_rtl_data_submit_async(int32_t DeviceId, void *TgtPtr, 1636 void *HstPtr, int64_t Size, 1637 __tgt_async_info *AsyncInfoPtr) { 1638 assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid"); 1639 assert(AsyncInfoPtr && "async_info_ptr is nullptr"); 1640 1641 if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS) 1642 return OFFLOAD_FAIL; 1643 1644 return DeviceRTL.dataSubmit(DeviceId, TgtPtr, HstPtr, Size, AsyncInfoPtr); 1645 } 1646 1647 int32_t __tgt_rtl_data_retrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, 1648 int64_t Size) { 1649 assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid"); 1650 // Context is set in __tgt_rtl_data_retrieve_async. 

  __tgt_async_info AsyncInfo;
  const int32_t Rc =
      __tgt_rtl_data_retrieve_async(DeviceId, HstPtr, TgtPtr, Size, &AsyncInfo);
  if (Rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(DeviceId, &AsyncInfo);
}

int32_t __tgt_rtl_data_retrieve_async(int32_t DeviceId, void *HstPtr,
                                      void *TgtPtr, int64_t Size,
                                      __tgt_async_info *AsyncInfoPtr) {
  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
  assert(AsyncInfoPtr && "async_info_ptr is nullptr");

  if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return DeviceRTL.dataRetrieve(DeviceId, HstPtr, TgtPtr, Size, AsyncInfoPtr);
}

int32_t __tgt_rtl_data_exchange_async(int32_t SrcDevId, void *SrcPtr,
                                      int DstDevId, void *DstPtr, int64_t Size,
                                      __tgt_async_info *AsyncInfo) {
  assert(DeviceRTL.isValidDeviceId(SrcDevId) && "src_dev_id is invalid");
  assert(DeviceRTL.isValidDeviceId(DstDevId) && "dst_dev_id is invalid");
  assert(AsyncInfo && "AsyncInfo is nullptr");

  if (DeviceRTL.setContext(SrcDevId) != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return DeviceRTL.dataExchange(SrcDevId, SrcPtr, DstDevId, DstPtr, Size,
                                AsyncInfo);
}

int32_t __tgt_rtl_data_exchange(int32_t SrcDevId, void *SrcPtr,
                                int32_t DstDevId, void *DstPtr, int64_t Size) {
  assert(DeviceRTL.isValidDeviceId(SrcDevId) && "src_dev_id is invalid");
  assert(DeviceRTL.isValidDeviceId(DstDevId) && "dst_dev_id is invalid");
  // Context is set in __tgt_rtl_data_exchange_async.

  __tgt_async_info AsyncInfo;
  const int32_t Rc = __tgt_rtl_data_exchange_async(SrcDevId, SrcPtr, DstDevId,
                                                   DstPtr, Size, &AsyncInfo);
  if (Rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(SrcDevId, &AsyncInfo);
}

int32_t __tgt_rtl_data_delete(int32_t DeviceId, void *TgtPtr) {
  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");

  if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return DeviceRTL.dataDelete(DeviceId, TgtPtr);
}

int32_t __tgt_rtl_run_target_team_region(int32_t DeviceId, void *TgtEntryPtr,
                                         void **TgtArgs, ptrdiff_t *TgtOffsets,
                                         int32_t ArgNum, int32_t TeamNum,
                                         int32_t ThreadLimit,
                                         uint64_t LoopTripcount) {
  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
  // Context is set in __tgt_rtl_run_target_team_region_async.

  __tgt_async_info AsyncInfo;
  const int32_t Rc = __tgt_rtl_run_target_team_region_async(
      DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets, ArgNum, TeamNum, ThreadLimit,
      LoopTripcount, &AsyncInfo);
  if (Rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(DeviceId, &AsyncInfo);
}

int32_t __tgt_rtl_run_target_team_region_async(
    int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs, ptrdiff_t *TgtOffsets,
    int32_t ArgNum, int32_t TeamNum, int32_t ThreadLimit,
    uint64_t LoopTripcount, __tgt_async_info *AsyncInfoPtr) {
  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");

  if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return DeviceRTL.runTargetTeamRegion(DeviceId, TgtEntryPtr, TgtArgs,
                                       TgtOffsets, ArgNum, TeamNum, ThreadLimit,
                                       LoopTripcount, AsyncInfoPtr);
}

int32_t __tgt_rtl_run_target_region(int32_t DeviceId, void *TgtEntryPtr,
                                    void **TgtArgs, ptrdiff_t *TgtOffsets,
                                    int32_t ArgNum) {
  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
  // Context is set in __tgt_rtl_run_target_region_async.

  __tgt_async_info AsyncInfo;
  const int32_t Rc = __tgt_rtl_run_target_region_async(
      DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets, ArgNum, &AsyncInfo);
  if (Rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(DeviceId, &AsyncInfo);
}

int32_t __tgt_rtl_run_target_region_async(int32_t DeviceId, void *TgtEntryPtr,
                                          void **TgtArgs, ptrdiff_t *TgtOffsets,
                                          int32_t ArgNum,
                                          __tgt_async_info *AsyncInfoPtr) {
  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
  // Context is set in __tgt_rtl_run_target_team_region_async.
  return __tgt_rtl_run_target_team_region_async(
      DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets, ArgNum,
      /* team num */ 1, /* thread_limit */ 1, /* loop_tripcount */ 0,
      AsyncInfoPtr);
}

int32_t __tgt_rtl_synchronize(int32_t DeviceId,
                              __tgt_async_info *AsyncInfoPtr) {
  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
  assert(AsyncInfoPtr && "async_info_ptr is nullptr");
  assert(AsyncInfoPtr->Queue && "async_info_ptr->Queue is nullptr");
  // NOTE: We don't need to set context for stream sync.
  return DeviceRTL.synchronize(DeviceId, AsyncInfoPtr);
}

void __tgt_rtl_set_info_flag(uint32_t NewInfoLevel) {
  std::atomic<uint32_t> &InfoLevel = getInfoLevelInternal();
  InfoLevel.store(NewInfoLevel);
}

void __tgt_rtl_print_device_info(int32_t DeviceId) {
  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
  // NOTE: We don't need to set context for print device info.
  DeviceRTL.printDeviceInfo(DeviceId);
}

int32_t __tgt_rtl_create_event(int32_t DeviceId, void **Event) {
  assert(Event && "event is nullptr");

  if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return DeviceRTL.createEvent(DeviceId, Event);
}

int32_t __tgt_rtl_record_event(int32_t DeviceId, void *EventPtr,
                               __tgt_async_info *AsyncInfoPtr) {
  assert(AsyncInfoPtr && "async_info_ptr is nullptr");
  assert(AsyncInfoPtr->Queue && "async_info_ptr->Queue is nullptr");
  assert(EventPtr && "event_ptr is nullptr");
  // NOTE: We might not need to set context for event record.
  return recordEvent(EventPtr, AsyncInfoPtr);
}

int32_t __tgt_rtl_wait_event(int32_t DeviceId, void *EventPtr,
                             __tgt_async_info *AsyncInfoPtr) {
  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
  assert(AsyncInfoPtr && "async_info_ptr is nullptr");
  assert(EventPtr && "event is nullptr");
  // If we don't have a queue we need to set the context.
  if (!AsyncInfoPtr->Queue && DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;
  return DeviceRTL.waitEvent(DeviceId, AsyncInfoPtr, EventPtr);
}

int32_t __tgt_rtl_sync_event(int32_t DeviceId, void *EventPtr) {
  assert(EventPtr && "event is nullptr");
  // NOTE: We might not need to set context for event sync.
  return syncEvent(EventPtr);
}

int32_t __tgt_rtl_destroy_event(int32_t DeviceId, void *EventPtr) {
  assert(EventPtr && "event is nullptr");

  if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return DeviceRTL.destroyEvent(DeviceId, EventPtr);
}

int32_t __tgt_rtl_release_async_info(int32_t DeviceId,
                                     __tgt_async_info *AsyncInfo) {
  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
  assert(AsyncInfo && "async_info is nullptr");

  if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return DeviceRTL.releaseAsyncInfo(DeviceId, AsyncInfo);
}

int32_t __tgt_rtl_init_async_info(int32_t DeviceId,
                                  __tgt_async_info **AsyncInfo) {
  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
  assert(AsyncInfo && "async_info is nullptr");

  if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return DeviceRTL.initAsyncInfo(DeviceId, AsyncInfo);
}

int32_t __tgt_rtl_init_device_info(int32_t DeviceId,
                                   __tgt_device_info *DeviceInfoPtr,
                                   const char **ErrStr) {
  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
  assert(DeviceInfoPtr && "device_info_ptr is nullptr");

  if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return DeviceRTL.initDeviceInfo(DeviceId, DeviceInfoPtr, ErrStr);
}

#ifdef __cplusplus
}
#endif