//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// RTL for CUDA machine
//
//===----------------------------------------------------------------------===//

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cuda.h>
#include <list>
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>
#include <vector>

#include "Debug.h"
#include "DeviceEnvironment.h"
#include "omptarget.h"
#include "omptargetplugin.h"

#define TARGET_NAME CUDA
#define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"

#include "MemoryManager.h"

#include "llvm/Frontend/OpenMP/OMPConstants.h"

// Utility for retrieving and printing CUDA error string.
#ifdef OMPTARGET_DEBUG
#define CUDA_ERR_STRING(err)                                                   \
  do {                                                                         \
    if (getDebugLevel() > 0) {                                                 \
      const char *errStr = nullptr;                                            \
      CUresult errStr_status = cuGetErrorString(err, &errStr);                 \
      if (errStr_status == CUDA_ERROR_INVALID_VALUE)                           \
        REPORT("Unrecognized CUDA error code: %d\n", err);                     \
      else if (errStr_status == CUDA_SUCCESS)                                  \
        REPORT("CUDA error is: %s\n", errStr);                                 \
      else {                                                                   \
        REPORT("Unresolved CUDA error code: %d\n", err);                       \
        REPORT("Unsuccessful cuGetErrorString return status: %d\n",            \
               errStr_status);                                                 \
      }                                                                        \
    } else {                                                                   \
      const char *errStr = nullptr;                                            \
      CUresult errStr_status = cuGetErrorString(err, &errStr);                 \
      if (errStr_status == CUDA_SUCCESS)                                       \
        REPORT("%s \n", errStr);                                               \
    }                                                                          \
  } while (false)
#else // OMPTARGET_DEBUG
#define CUDA_ERR_STRING(err)                                                   \
  do {                                                                         \
    const char *errStr = nullptr;                                              \
    CUresult errStr_status = cuGetErrorString(err, &errStr);                   \
    if (errStr_status == CUDA_SUCCESS)                                         \
      REPORT("%s \n", errStr);                                                 \
  } while (false)
#endif // OMPTARGET_DEBUG

#define BOOL2TEXT(b) ((b) ? "Yes" : "No")

#include "elf_common.h"

/// Keep entries table per device.
struct FuncOrGblEntryTy {
  __tgt_target_table Table;
  std::vector<__tgt_offload_entry> Entries;
};

/// Use a single entity to encode a kernel and a set of flags.
struct KernelTy {
  CUfunction Func;

  // execution mode of kernel
  llvm::omp::OMPTgtExecModeFlags ExecutionMode;

  /// Maximal number of threads per block for this kernel.
  int MaxThreadsPerBlock = 0;

  KernelTy(CUfunction Func, llvm::omp::OMPTgtExecModeFlags ExecutionMode)
      : Func(Func), ExecutionMode(ExecutionMode) {}
};

namespace {
bool checkResult(CUresult Err, const char *ErrMsg) {
  if (Err == CUDA_SUCCESS)
    return true;

  REPORT("%s", ErrMsg);
  CUDA_ERR_STRING(Err);
  return false;
}

int memcpyDtoD(const void *SrcPtr, void *DstPtr, int64_t Size,
               CUstream Stream) {
  CUresult Err =
      cuMemcpyDtoDAsync((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, Size, Stream);

  if (Err != CUDA_SUCCESS) {
    DP("Error when copying data from device to device. Pointers: src "
       "= " DPxMOD ", dst = " DPxMOD ", size = %" PRId64 "\n",
       DPxPTR(SrcPtr), DPxPTR(DstPtr), Size);
    CUDA_ERR_STRING(Err);
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

int recordEvent(void *EventPtr, __tgt_async_info *AsyncInfo) {
  CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue);
  CUevent Event = reinterpret_cast<CUevent>(EventPtr);

  CUresult Err = cuEventRecord(Event, Stream);
  if (Err != CUDA_SUCCESS) {
    DP("Error when recording event. stream = " DPxMOD ", event = " DPxMOD "\n",
       DPxPTR(Stream), DPxPTR(Event));
    CUDA_ERR_STRING(Err);
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

int syncEvent(void *EventPtr) {
  CUevent Event = reinterpret_cast<CUevent>(EventPtr);

  CUresult Err = cuEventSynchronize(Event);
  if (Err != CUDA_SUCCESS) {
    DP("Error when syncing event = " DPxMOD "\n", DPxPTR(Event));
    CUDA_ERR_STRING(Err);
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

namespace {

// Structure contains per-device data
struct DeviceDataTy {
  /// List that contains all the kernels.
  std::list<KernelTy> KernelsList;

  std::list<FuncOrGblEntryTy> FuncGblEntries;

  CUcontext Context = nullptr;
  // Device properties
  unsigned int ThreadsPerBlock = 0;
  unsigned int BlocksPerGrid = 0;
  unsigned int WarpSize = 0;
  // OpenMP properties
  int NumTeams = 0;
  int NumThreads = 0;
};

/// Resource allocator where \p T is the resource type.
/// Functions \p create and \p destroy return OFFLOAD_SUCCESS and OFFLOAD_FAIL
/// accordingly. The implementation should not raise any exception.
template <typename T> struct AllocatorTy {
  using ElementTy = T;
  virtual ~AllocatorTy() {}

  /// Create a resource and assign to R.
  virtual int create(T &R) noexcept = 0;
  /// Destroy the resource.
  virtual int destroy(T) noexcept = 0;
};

/// Allocator for CUstream.
struct StreamAllocatorTy final : public AllocatorTy<CUstream> {
  /// See AllocatorTy<T>::create.
  int create(CUstream &Stream) noexcept override {
    if (!checkResult(cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING),
                     "Error returned from cuStreamCreate\n"))
      return OFFLOAD_FAIL;

    return OFFLOAD_SUCCESS;
  }

  /// See AllocatorTy<T>::destroy.
  int destroy(CUstream Stream) noexcept override {
    if (!checkResult(cuStreamDestroy(Stream),
                     "Error returned from cuStreamDestroy\n"))
      return OFFLOAD_FAIL;

    return OFFLOAD_SUCCESS;
  }
};

/// Allocator for CUevent.
struct EventAllocatorTy final : public AllocatorTy<CUevent> {
  /// See AllocatorTy<T>::create.
  int create(CUevent &Event) noexcept override {
    if (!checkResult(cuEventCreate(&Event, CU_EVENT_DEFAULT),
                     "Error returned from cuEventCreate\n"))
      return OFFLOAD_FAIL;

    return OFFLOAD_SUCCESS;
  }

  /// See AllocatorTy<T>::destroy.
  int destroy(CUevent Event) noexcept override {
    if (!checkResult(cuEventDestroy(Event),
                     "Error returned from cuEventDestroy\n"))
      return OFFLOAD_FAIL;

    return OFFLOAD_SUCCESS;
  }
};

/// A generic pool of resources where \p T is the resource type.
/// \p T should be copyable as the object is stored in \p std::vector .
template <typename AllocTy> class ResourcePoolTy {
  using ElementTy = typename AllocTy::ElementTy;
  /// Index of the next available resource.
  size_t Next = 0;
  /// Mutex to guard the pool.
  std::mutex Mutex;
  /// Pool of resources. The difference between \p Resources and \p Pool is,
  /// when a resource is acquired and released, it is all on \p Resources. When
  /// a batch of new resources are needed, they are both added to \p Resources
  /// and \p Pool. The reason for this setting is, \p Resources could contain
  /// redundant elements because resources are not released, which can cause
  /// double free. This setting makes sure that \p Pool always has every
  /// resource allocated from the device.
  std::vector<ElementTy> Resources;
  std::vector<ElementTy> Pool;
  /// A reference to the corresponding allocator.
  AllocTy Allocator;

  /// If `Resources` is used up, we will fill in more resources. It assumes
  /// that the new size `Size` is always larger than the current size.
  bool resize(size_t Size) {
    assert(Resources.size() == Pool.size() && "size mismatch");
    auto CurSize = Resources.size();
    assert(Size > CurSize && "Unexpected smaller size");
    Pool.reserve(Size);
    Resources.reserve(Size);
    for (auto I = CurSize; I < Size; ++I) {
      ElementTy NewItem;
      int Ret = Allocator.create(NewItem);
      if (Ret != OFFLOAD_SUCCESS)
        return false;
      Pool.push_back(NewItem);
      Resources.push_back(NewItem);
    }
    return true;
  }

public:
  ResourcePoolTy(AllocTy &&A, size_t Size = 0) noexcept
      : Allocator(std::move(A)) {
    if (Size)
      (void)resize(Size);
  }

  ~ResourcePoolTy() noexcept { clear(); }

  /// Get a resource from the pool. `Next` always points to the next available
  /// resource. That means `[0, Next-1]` have been assigned and `[Next, ...)`
  /// are still available. If there is no resource left, we will ask for more.
  /// Each time a resource is assigned, `Next` is incremented by one.
  /// xxxxxs+++++++++
  ///      ^
  ///      Next
  /// After assignment, the pool becomes the following and s is assigned.
  /// xxxxxs+++++++++
  ///       ^
  ///       Next
  int acquire(ElementTy &R) noexcept {
    std::lock_guard<std::mutex> LG(Mutex);
    if (Next == Resources.size()) {
      auto NewSize = Resources.size() ? Resources.size() * 2 : 1;
      if (!resize(NewSize))
        return OFFLOAD_FAIL;
    }

    assert(Next < Resources.size());

    R = Resources[Next++];

    return OFFLOAD_SUCCESS;
  }

  /// Return the resource back to the pool. When we return a resource, we need
  /// to first decrease `Next`, and then copy the resource back. It is worth
  /// noting that the order in which resources are returned may differ from
  /// the order in which they were acquired; that is to say, at some point,
  /// there might be two identical resources.
  /// xxax+a+++++
  ///     ^
  ///     Next
  /// However, it doesn't matter, because they're always on the two sides of
  /// `Next`. The left one will in the end be overwritten by another resource.
  /// Therefore, after several acquire/release cycles, the order of the pool
  /// might differ from its initial state.
  void release(ElementTy R) noexcept {
    std::lock_guard<std::mutex> LG(Mutex);
    Resources[--Next] = R;
  }

  /// Release all stored resources and clear the pool.
  /// Note: This function is not thread safe. Be sure to guard it if necessary.
  void clear() noexcept {
    for (auto &R : Pool)
      (void)Allocator.destroy(R);
    Pool.clear();
    Resources.clear();
  }
};

} // namespace
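
// Illustrative sketch (not part of the plugin) of how ResourcePoolTy is meant
// to be used; the pool size below is arbitrary:
// \code
//   ResourcePoolTy<StreamAllocatorTy> Streams(StreamAllocatorTy(), /*Size=*/4);
//   CUstream S = nullptr;
//   if (Streams.acquire(S) == OFFLOAD_SUCCESS) {
//     // ... enqueue work on S ...
//     Streams.release(S);
//   }
// \endcode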

class DeviceRTLTy {
  int NumberOfDevices;
  // OpenMP environment properties
  int EnvNumTeams;
  int EnvTeamLimit;
  int EnvTeamThreadLimit;
  // OpenMP requires flags
  int64_t RequiresFlags;
  // Amount of dynamic shared memory to use at launch.
  uint64_t DynamicMemorySize;

  /// Number of initial streams for each device.
  int NumInitialStreams = 32;

  /// Number of initial events for each device.
  int NumInitialEvents = 8;

  static constexpr const int32_t HardThreadLimit = 1024;
  static constexpr const int32_t DefaultNumTeams = 128;
  static constexpr const int32_t DefaultNumThreads = 128;

  using StreamPoolTy = ResourcePoolTy<StreamAllocatorTy>;
  std::vector<std::unique_ptr<StreamPoolTy>> StreamPool;

  using EventPoolTy = ResourcePoolTy<EventAllocatorTy>;
  std::vector<std::unique_ptr<EventPoolTy>> EventPool;

  std::vector<DeviceDataTy> DeviceData;
  std::vector<std::vector<CUmodule>> Modules;

  /// Vector of flags indicating the initialization status of all associated
  /// devices.
  std::vector<bool> InitializedFlags;

  enum class PeerAccessState : uint8_t { Unknown, Yes, No };
  std::vector<std::vector<PeerAccessState>> PeerAccessMatrix;
  std::mutex PeerAccessMatrixLock;

  /// A class responsible for interacting with the device's native runtime
  /// library to allocate and free memory.
  class CUDADeviceAllocatorTy : public DeviceAllocatorTy {
    std::unordered_map<void *, TargetAllocTy> HostPinnedAllocs;

  public:
    void *allocate(size_t Size, void *, TargetAllocTy Kind) override {
      if (Size == 0)
        return nullptr;

      void *MemAlloc = nullptr;
      CUresult Err;
      switch (Kind) {
      case TARGET_ALLOC_DEFAULT:
      case TARGET_ALLOC_DEVICE:
        CUdeviceptr DevicePtr;
        Err = cuMemAlloc(&DevicePtr, Size);
        MemAlloc = (void *)DevicePtr;
        if (!checkResult(Err, "Error returned from cuMemAlloc\n"))
          return nullptr;
        break;
      case TARGET_ALLOC_HOST:
        void *HostPtr;
        Err = cuMemAllocHost(&HostPtr, Size);
        MemAlloc = HostPtr;
        if (!checkResult(Err, "Error returned from cuMemAllocHost\n"))
          return nullptr;
        HostPinnedAllocs[MemAlloc] = Kind;
        break;
      case TARGET_ALLOC_SHARED:
        CUdeviceptr SharedPtr;
        Err = cuMemAllocManaged(&SharedPtr, Size, CU_MEM_ATTACH_GLOBAL);
        MemAlloc = (void *)SharedPtr;
        if (!checkResult(Err, "Error returned from cuMemAllocManaged\n"))
          return nullptr;
        break;
      }

      return MemAlloc;
    }

    int free(void *TgtPtr) override {
      CUresult Err;
      // Host pinned memory must be freed differently.
      TargetAllocTy Kind =
          (HostPinnedAllocs.find(TgtPtr) == HostPinnedAllocs.end())
              ? TARGET_ALLOC_DEFAULT
              : TARGET_ALLOC_HOST;
      switch (Kind) {
      case TARGET_ALLOC_DEFAULT:
      case TARGET_ALLOC_DEVICE:
      case TARGET_ALLOC_SHARED:
        Err = cuMemFree((CUdeviceptr)TgtPtr);
        if (!checkResult(Err, "Error returned from cuMemFree\n"))
          return OFFLOAD_FAIL;
        break;
      case TARGET_ALLOC_HOST:
        Err = cuMemFreeHost(TgtPtr);
        if (!checkResult(Err, "Error returned from cuMemFreeHost\n"))
          return OFFLOAD_FAIL;
        break;
      }

      return OFFLOAD_SUCCESS;
    }
  };

  /// A vector of device allocators
  std::vector<CUDADeviceAllocatorTy> DeviceAllocators;

  /// A vector of memory managers. Since the memory manager is non-copyable and
  /// non-removable, we wrap them into std::unique_ptr.
  std::vector<std::unique_ptr<MemoryManagerTy>> MemoryManagers;

  /// Whether to use the memory manager.
  bool UseMemoryManager = true;

  // Record entry point associated with device
  void addOffloadEntry(const int DeviceId, const __tgt_offload_entry Entry) {
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
    E.Entries.push_back(Entry);
  }

  // Return a pointer to the entry associated with the pointer
  const __tgt_offload_entry *getOffloadEntry(const int DeviceId,
                                             const void *Addr) const {
    for (const __tgt_offload_entry &Itr :
         DeviceData[DeviceId].FuncGblEntries.back().Entries)
      if (Itr.addr == Addr)
        return &Itr;

    return nullptr;
  }

  // Return the pointer to the target entries table
  __tgt_target_table *getOffloadEntriesTable(const int DeviceId) {
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();

    if (E.Entries.empty())
      return nullptr;

    // Update table info according to the entries and return the pointer
    E.Table.EntriesBegin = E.Entries.data();
    E.Table.EntriesEnd = E.Entries.data() + E.Entries.size();

    return &E.Table;
  }

  // Clear entries table for a device
  void clearOffloadEntriesTable(const int DeviceId) {
    DeviceData[DeviceId].FuncGblEntries.emplace_back();
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
    E.Entries.clear();
    E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr;
  }

public:
  CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfo) const {
    assert(AsyncInfo && "AsyncInfo is nullptr");

    if (!AsyncInfo->Queue) {
      CUstream S;
      if (StreamPool[DeviceId]->acquire(S) != OFFLOAD_SUCCESS)
        return nullptr;

      AsyncInfo->Queue = S;
    }

    return reinterpret_cast<CUstream>(AsyncInfo->Queue);
  }

  // This class should not be copied
  DeviceRTLTy(const DeviceRTLTy &) = delete;
  DeviceRTLTy(DeviceRTLTy &&) = delete;

  DeviceRTLTy()
      : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1),
        EnvTeamThreadLimit(-1), RequiresFlags(OMP_REQ_UNDEFINED),
        DynamicMemorySize(0) {

    DP("Start initializing CUDA\n");

    CUresult Err = cuInit(0);
    if (Err == CUDA_ERROR_INVALID_HANDLE) {
      // Can't call cuGetErrorString if dlsym failed
      DP("Failed to load CUDA shared library\n");
      return;
    }
    if (Err == CUDA_ERROR_NO_DEVICE) {
      DP("There are no devices supporting CUDA.\n");
      return;
    }
    if (!checkResult(Err, "Error returned from cuInit\n")) {
      return;
    }

    Err = cuDeviceGetCount(&NumberOfDevices);
    if (!checkResult(Err, "Error returned from cuDeviceGetCount\n"))
      return;

    if (NumberOfDevices == 0) {
      DP("There are no devices supporting CUDA.\n");
      return;
    }

    DeviceData.resize(NumberOfDevices);
    Modules.resize(NumberOfDevices);
    StreamPool.resize(NumberOfDevices);
    EventPool.resize(NumberOfDevices);
    PeerAccessMatrix.resize(NumberOfDevices);
    for (auto &V : PeerAccessMatrix)
      V.resize(NumberOfDevices, PeerAccessState::Unknown);

    // Get environment variables regarding teams
    if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) {
      // OMP_TEAM_LIMIT has been set
      EnvTeamLimit = std::stoi(EnvStr);
      DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
    }
    if (const char *EnvStr = getenv("OMP_TEAMS_THREAD_LIMIT")) {
      // OMP_TEAMS_THREAD_LIMIT has been set
      EnvTeamThreadLimit = std::stoi(EnvStr);
      DP("Parsed OMP_TEAMS_THREAD_LIMIT=%d\n", EnvTeamThreadLimit);
    }
    if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) {
      // OMP_NUM_TEAMS has been set
      EnvNumTeams = std::stoi(EnvStr);
      DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
    }
    if (const char *EnvStr = getenv("LIBOMPTARGET_SHARED_MEMORY_SIZE")) {
      // LIBOMPTARGET_SHARED_MEMORY_SIZE has been set
      DynamicMemorySize = std::stoi(EnvStr);
      DP("Parsed LIBOMPTARGET_SHARED_MEMORY_SIZE = %" PRIu64 "\n",
         DynamicMemorySize);
    }
    if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS")) {
      // LIBOMPTARGET_NUM_INITIAL_STREAMS has been set
      NumInitialStreams = std::stoi(EnvStr);
      DP("Parsed LIBOMPTARGET_NUM_INITIAL_STREAMS=%d\n", NumInitialStreams);
    }

    for (int I = 0; I < NumberOfDevices; ++I)
      DeviceAllocators.emplace_back();

    // Get the size threshold from environment variable
    std::pair<size_t, bool> Res = MemoryManagerTy::getSizeThresholdFromEnv();
    UseMemoryManager = Res.second;
    size_t MemoryManagerThreshold = Res.first;

    if (UseMemoryManager)
      for (int I = 0; I < NumberOfDevices; ++I)
        MemoryManagers.emplace_back(std::make_unique<MemoryManagerTy>(
            DeviceAllocators[I], MemoryManagerThreshold));

    // We lazily initialize all devices later.
    InitializedFlags.assign(NumberOfDevices, false);
  }
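
  // Illustrative example (values are arbitrary) of the environment knobs the
  // constructor above honors when an OpenMP offload program is launched:
  //
  //   OMP_NUM_TEAMS=256 OMP_TEAMS_THREAD_LIMIT=512 \
  //   LIBOMPTARGET_SHARED_MEMORY_SIZE=4096 \
  //   LIBOMPTARGET_NUM_INITIAL_STREAMS=64 ./offloading_app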

  ~DeviceRTLTy() {
    for (int DeviceId = 0; DeviceId < NumberOfDevices; ++DeviceId)
      deinitDevice(DeviceId);
  }

  // Check whether a given DeviceId is valid
  bool isValidDeviceId(const int DeviceId) const {
    return DeviceId >= 0 && DeviceId < NumberOfDevices;
  }

  int getNumOfDevices() const { return NumberOfDevices; }

  void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; }

  int initDevice(const int DeviceId) {
    CUdevice Device;

    DP("Getting device %d\n", DeviceId);
    CUresult Err = cuDeviceGet(&Device, DeviceId);
    if (!checkResult(Err, "Error returned from cuDeviceGet\n"))
      return OFFLOAD_FAIL;

    assert(InitializedFlags[DeviceId] == false && "Reinitializing device!");
    InitializedFlags[DeviceId] = true;

    // Query the current flags of the primary context and set its flags if
    // it is inactive
    unsigned int FormerPrimaryCtxFlags = 0;
    int FormerPrimaryCtxIsActive = 0;
    Err = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags,
                                     &FormerPrimaryCtxIsActive);
    if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxGetState\n"))
      return OFFLOAD_FAIL;

    if (FormerPrimaryCtxIsActive) {
      DP("The primary context is active, no change to its flags\n");
      if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) !=
          CU_CTX_SCHED_BLOCKING_SYNC)
        DP("Warning: the current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n");
    } else {
      DP("The primary context is inactive, set its flags to "
         "CU_CTX_SCHED_BLOCKING_SYNC\n");
      Err = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC);
      if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxSetFlags\n"))
        return OFFLOAD_FAIL;
    }

    // Retain the per device primary context and save it to use whenever this
    // device is selected.
    Err = cuDevicePrimaryCtxRetain(&DeviceData[DeviceId].Context, Device);
    if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxRetain\n"))
      return OFFLOAD_FAIL;

    Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    // Initialize the stream pool.
    if (!StreamPool[DeviceId])
      StreamPool[DeviceId] = std::make_unique<StreamPoolTy>(StreamAllocatorTy(),
                                                            NumInitialStreams);

    // Initialize the event pool.
    if (!EventPool[DeviceId])
      EventPool[DeviceId] =
          std::make_unique<EventPoolTy>(EventAllocatorTy(), NumInitialEvents);

    // Query attributes to determine number of threads/block and blocks/grid.
    int MaxGridDimX;
    Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
                               Device);
    if (Err != CUDA_SUCCESS) {
      DP("Error getting max grid dimension, use default value %d\n",
         DeviceRTLTy::DefaultNumTeams);
      DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::DefaultNumTeams;
    } else {
      DP("Using %d CUDA blocks per grid\n", MaxGridDimX);
      DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX;
    }

    // We are only exploiting threads along the x axis.
    int MaxBlockDimX;
    Err = cuDeviceGetAttribute(&MaxBlockDimX,
                               CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device);
    if (Err != CUDA_SUCCESS) {
      DP("Error getting max block dimension, use default value %d\n",
         DeviceRTLTy::DefaultNumThreads);
      DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads;
    } else {
      DP("Using %d CUDA threads per block\n", MaxBlockDimX);
      DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX;

      if (EnvTeamThreadLimit > 0 &&
          DeviceData[DeviceId].ThreadsPerBlock > EnvTeamThreadLimit) {
        DP("Max CUDA threads per block %d exceeds the thread limit %d set by "
           "OMP_TEAMS_THREAD_LIMIT, capping at the limit\n",
           DeviceData[DeviceId].ThreadsPerBlock, EnvTeamThreadLimit);
        DeviceData[DeviceId].ThreadsPerBlock = EnvTeamThreadLimit;
      }
      if (DeviceData[DeviceId].ThreadsPerBlock > DeviceRTLTy::HardThreadLimit) {
        DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
           "capping at the hard limit\n",
           DeviceData[DeviceId].ThreadsPerBlock, DeviceRTLTy::HardThreadLimit);
        DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit;
      }
    }

    // Get and set warp size
    int WarpSize;
    Err =
        cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device);
    if (Err != CUDA_SUCCESS) {
      DP("Error getting warp size, assume default value 32\n");
      DeviceData[DeviceId].WarpSize = 32;
    } else {
      DP("Using warp size %d\n", WarpSize);
      DeviceData[DeviceId].WarpSize = WarpSize;
    }

    // Adjust teams to the env variables
    if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) {
      DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
         EnvTeamLimit);
      DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit;
    }

    size_t StackLimit;
    size_t HeapLimit;
    if (const char *EnvStr = getenv("LIBOMPTARGET_STACK_SIZE")) {
      StackLimit = std::stol(EnvStr);
      if (cuCtxSetLimit(CU_LIMIT_STACK_SIZE, StackLimit) != CUDA_SUCCESS)
        return OFFLOAD_FAIL;
    } else {
      if (cuCtxGetLimit(&StackLimit, CU_LIMIT_STACK_SIZE) != CUDA_SUCCESS)
        return OFFLOAD_FAIL;
    }
    if (const char *EnvStr = getenv("LIBOMPTARGET_HEAP_SIZE")) {
      HeapLimit = std::stol(EnvStr);
      if (cuCtxSetLimit(CU_LIMIT_MALLOC_HEAP_SIZE, HeapLimit) != CUDA_SUCCESS)
        return OFFLOAD_FAIL;
    } else {
      if (cuCtxGetLimit(&HeapLimit, CU_LIMIT_MALLOC_HEAP_SIZE) != CUDA_SUCCESS)
        return OFFLOAD_FAIL;
    }

    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
         "Device supports up to %d CUDA blocks and %d threads with a "
         "warp size of %d\n",
         DeviceData[DeviceId].BlocksPerGrid,
         DeviceData[DeviceId].ThreadsPerBlock, DeviceData[DeviceId].WarpSize);
    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
         "Device heap size is %d Bytes, device stack size is %d Bytes per "
         "thread\n",
         (int)HeapLimit, (int)StackLimit);

    // Set default number of teams
    if (EnvNumTeams > 0) {
      DP("Default number of teams set according to environment %d\n",
         EnvNumTeams);
      DeviceData[DeviceId].NumTeams = EnvNumTeams;
    } else {
      DeviceData[DeviceId].NumTeams = DeviceRTLTy::DefaultNumTeams;
      DP("Default number of teams set according to library's default %d\n",
         DeviceRTLTy::DefaultNumTeams);
    }

    if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) {
      DP("Default number of teams exceeds device limit, capping at %d\n",
         DeviceData[DeviceId].BlocksPerGrid);
      DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid;
    }

    // Set default number of threads
    DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads;
    DP("Default number of threads set according to library's default %d\n",
       DeviceRTLTy::DefaultNumThreads);
    if (DeviceData[DeviceId].NumThreads >
        DeviceData[DeviceId].ThreadsPerBlock) {
      DP("Default number of threads exceeds device limit, capping at %d\n",
         DeviceData[DeviceId].ThreadsPerBlock);
      DeviceData[DeviceId].NumThreads = DeviceData[DeviceId].ThreadsPerBlock;
    }

    return OFFLOAD_SUCCESS;
  }

  int deinitDevice(const int DeviceId) {
    auto IsInitialized = InitializedFlags[DeviceId];
    if (!IsInitialized)
      return OFFLOAD_SUCCESS;
    InitializedFlags[DeviceId] = false;

    if (UseMemoryManager)
      MemoryManagers[DeviceId].release();

    StreamPool[DeviceId].reset();
    EventPool[DeviceId].reset();

    DeviceDataTy &D = DeviceData[DeviceId];
    if (!checkResult(cuCtxSetCurrent(D.Context),
                     "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    // Unload all modules.
    for (auto &M : Modules[DeviceId])
      if (!checkResult(cuModuleUnload(M),
                       "Error returned from cuModuleUnload\n"))
        return OFFLOAD_FAIL;

    // Destroy context.
    CUdevice Device;
    if (!checkResult(cuCtxGetDevice(&Device),
                     "Error returned from cuCtxGetDevice\n"))
      return OFFLOAD_FAIL;

    if (!checkResult(cuDevicePrimaryCtxRelease(Device),
                     "Error returned from cuDevicePrimaryCtxRelease\n"))
      return OFFLOAD_FAIL;

    return OFFLOAD_SUCCESS;
  }

  __tgt_target_table *loadBinary(const int DeviceId,
                                 const __tgt_device_image *Image) {
    // Clear the offload table as we are going to create a new one.
    clearOffloadEntriesTable(DeviceId);

    // Create the module and extract the function pointers.
    CUmodule Module;
    DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart));
    CUresult Err =
        cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr);
    if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n"))
      return nullptr;

    DP("CUDA module successfully loaded!\n");

    Modules[DeviceId].push_back(Module);

    // Find the symbols in the module by name.
    const __tgt_offload_entry *HostBegin = Image->EntriesBegin;
    const __tgt_offload_entry *HostEnd = Image->EntriesEnd;

    std::list<KernelTy> &KernelsList = DeviceData[DeviceId].KernelsList;
    for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) {
      if (!E->addr) {
        // We return nullptr when something like this happens; the host should
        // always provide an address that uniquely identifies the target
        // region.
        DP("Invalid binary: host entry '<null>' (size = %zd)...\n", E->size);
        return nullptr;
      }

      if (E->size) {
        __tgt_offload_entry Entry = *E;
        CUdeviceptr CUPtr;
        size_t CUSize;
        Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name);
        // We keep this style here because we need the name
        if (Err != CUDA_SUCCESS) {
          REPORT("Loading global '%s' Failed\n", E->name);
          CUDA_ERR_STRING(Err);
          return nullptr;
        }

        if (CUSize != E->size) {
          DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name,
             CUSize, E->size);
          return nullptr;
        }

        DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
           DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr));

        Entry.addr = (void *)(CUPtr);

        // Note: In the current implementation declare target variables
        // can either be link or to. This means that once unified
        // memory is activated via the requires directive, the variable
        // can be used directly from the host in both cases.
        // TODO: when variables types other than to or link are added,
        // the below condition should be changed to explicitly
        // check for to and link variables types:
        // (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags &
        // OMP_DECLARE_TARGET_LINK || e->flags == OMP_DECLARE_TARGET_TO))
        if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
          // If unified memory is present any target link or to variables
          // can access host addresses directly. There is no longer a
          // need for device copies.
          cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *));
          DP("Copy linked variable host address (" DPxMOD
             ") to device address (" DPxMOD ")\n",
             DPxPTR(*((void **)E->addr)), DPxPTR(CUPtr));
        }

        addOffloadEntry(DeviceId, Entry);

        continue;
      }

      CUfunction Func;
      Err = cuModuleGetFunction(&Func, Module, E->name);
      // We keep this style here because we need the name
      if (Err != CUDA_SUCCESS) {
        REPORT("Loading '%s' Failed\n", E->name);
        CUDA_ERR_STRING(Err);
        return nullptr;
      }

      DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
         DPxPTR(E - HostBegin), E->name, DPxPTR(Func));

      // Default value GENERIC (in case the symbol is missing from the cubin).
      llvm::omp::OMPTgtExecModeFlags ExecModeVal =
          llvm::omp::OMP_TGT_EXEC_MODE_GENERIC;
      std::string ExecModeNameStr(E->name);
      ExecModeNameStr += "_exec_mode";
      const char *ExecModeName = ExecModeNameStr.c_str();

      CUdeviceptr ExecModePtr;
      size_t CUSize;
      Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName);
      if (Err == CUDA_SUCCESS) {
        if (CUSize != sizeof(llvm::omp::OMPTgtExecModeFlags)) {
          DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
             ExecModeName, CUSize, sizeof(llvm::omp::OMPTgtExecModeFlags));
          return nullptr;
        }

        Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize);
        if (Err != CUDA_SUCCESS) {
          REPORT("Error when copying data from device to host. Pointers: "
                 "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
                 DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize);
          CUDA_ERR_STRING(Err);
          return nullptr;
        }
      } else {
        DP("Loading global exec_mode '%s' - symbol missing, using default "
           "value GENERIC (1)\n",
           ExecModeName);
      }

      KernelsList.emplace_back(Func, ExecModeVal);

      __tgt_offload_entry Entry = *E;
      Entry.addr = &KernelsList.back();
      addOffloadEntry(DeviceId, Entry);
    }

    // Send the device environment data to the device.
    {
      // TODO: The device ID used here is not the real device ID used by OpenMP.
      DeviceEnvironmentTy DeviceEnv{0, static_cast<uint32_t>(NumberOfDevices),
                                    static_cast<uint32_t>(DeviceId),
                                    static_cast<uint32_t>(DynamicMemorySize)};

      if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG"))
        DeviceEnv.DebugKind = std::stoi(EnvStr);

      const char *DeviceEnvName = "omptarget_device_environment";
      CUdeviceptr DeviceEnvPtr;
      size_t CUSize;

      Err = cuModuleGetGlobal(&DeviceEnvPtr, &CUSize, Module, DeviceEnvName);
      if (Err == CUDA_SUCCESS) {
        if (CUSize != sizeof(DeviceEnv)) {
          REPORT(
              "Global device_environment '%s' - size mismatch (%zu != %zu)\n",
              DeviceEnvName, CUSize, sizeof(DeviceEnv));
          CUDA_ERR_STRING(Err);
          return nullptr;
        }

        Err = cuMemcpyHtoD(DeviceEnvPtr, &DeviceEnv, CUSize);
        if (Err != CUDA_SUCCESS) {
          REPORT("Error when copying data from host to device. Pointers: "
                 "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
                 DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize);
          CUDA_ERR_STRING(Err);
          return nullptr;
        }

        DP("Sending global device environment data %zu bytes\n", CUSize);
      } else {
        DP("Finding global device environment '%s' - symbol missing.\n",
           DeviceEnvName);
        DP("Continue, considering this is a device RTL which does not accept "
           "environment setting.\n");
      }
    }

    return getOffloadEntriesTable(DeviceId);
  }

  void *dataAlloc(const int DeviceId, const int64_t Size,
                  const TargetAllocTy Kind) {
    switch (Kind) {
    case TARGET_ALLOC_DEFAULT:
    case TARGET_ALLOC_DEVICE:
      if (UseMemoryManager)
        return MemoryManagers[DeviceId]->allocate(Size, nullptr);
      else
        return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
    case TARGET_ALLOC_HOST:
    case TARGET_ALLOC_SHARED:
      return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
    }

    REPORT("Invalid target data allocation kind or requested allocator not "
           "implemented yet\n");

    return nullptr;
  }
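
  // Illustrative sketch (not part of the plugin): host-side allocations that
  // reach dataAlloc() above with the different TARGET_ALLOC_* kinds. It
  // assumes the LLVM allocator extensions declared in omp.h are available.
  // \code
  //   int Dev = omp_get_default_device();
  //   void *D = omp_target_alloc(Size, Dev);             // TARGET_ALLOC_DEFAULT
  //   void *H = llvm_omp_target_alloc_host(Size, Dev);   // TARGET_ALLOC_HOST
  //   void *S = llvm_omp_target_alloc_shared(Size, Dev); // TARGET_ALLOC_SHARED
  //   // ... use the buffers ...
  //   omp_target_free(D, Dev);
  // \endcode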

  int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr,
                 const int64_t Size, __tgt_async_info *AsyncInfo) const {
    assert(AsyncInfo && "AsyncInfo is nullptr");

    CUstream Stream = getStream(DeviceId, AsyncInfo);
    CUresult Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream);
    if (Err != CUDA_SUCCESS) {
      DP("Error when copying data from host to device. Pointers: host "
         "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
         DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
      CUDA_ERR_STRING(Err);
      return OFFLOAD_FAIL;
    }

    return OFFLOAD_SUCCESS;
  }

  int dataRetrieve(const int DeviceId, void *HstPtr, const void *TgtPtr,
                   const int64_t Size, __tgt_async_info *AsyncInfo) const {
    assert(AsyncInfo && "AsyncInfo is nullptr");

    CUstream Stream = getStream(DeviceId, AsyncInfo);
    CUresult Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
    if (Err != CUDA_SUCCESS) {
      DP("Error when copying data from device to host. Pointers: host "
         "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
         DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
      CUDA_ERR_STRING(Err);
      return OFFLOAD_FAIL;
    }

    return OFFLOAD_SUCCESS;
  }

  int dataExchange(int SrcDevId, const void *SrcPtr, int DstDevId, void *DstPtr,
                   int64_t Size, __tgt_async_info *AsyncInfo) {
    assert(AsyncInfo && "AsyncInfo is nullptr");

    CUresult Err;
    CUstream Stream = getStream(SrcDevId, AsyncInfo);

    // If the source and destination are different devices, try a peer-to-peer
    // copy first.
    if (SrcDevId != DstDevId) {
      std::lock_guard<std::mutex> LG(PeerAccessMatrixLock);

      switch (PeerAccessMatrix[SrcDevId][DstDevId]) {
      case PeerAccessState::No: {
        REPORT("Peer access from %" PRId32 " to %" PRId32
               " is not supported. Fall back to D2D memcpy.\n",
               SrcDevId, DstDevId);
        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
      }
      case PeerAccessState::Unknown: {
        int CanAccessPeer = 0;
        Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId);
        if (Err != CUDA_SUCCESS) {
          REPORT("Error returned from cuDeviceCanAccessPeer. src = %" PRId32
                 ", dst = %" PRId32 ". Fall back to D2D memcpy.\n",
                 SrcDevId, DstDevId);
          CUDA_ERR_STRING(Err);
          PeerAccessMatrix[SrcDevId][DstDevId] = PeerAccessState::No;
          return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
        }

        if (!CanAccessPeer) {
          REPORT("P2P access from %d to %d is not supported. Fall back to D2D "
                 "memcpy.\n",
                 SrcDevId, DstDevId);
          PeerAccessMatrix[SrcDevId][DstDevId] = PeerAccessState::No;
          return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
        }

        Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0);
        if (Err != CUDA_SUCCESS) {
          REPORT("Error returned from cuCtxEnablePeerAccess. src = %" PRId32
                 ", dst = %" PRId32 ". Fall back to D2D memcpy.\n",
                 SrcDevId, DstDevId);
          CUDA_ERR_STRING(Err);
          PeerAccessMatrix[SrcDevId][DstDevId] = PeerAccessState::No;
          return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
        }

        PeerAccessMatrix[SrcDevId][DstDevId] = PeerAccessState::Yes;

        LLVM_FALLTHROUGH;
      }
      case PeerAccessState::Yes: {
        Err = cuMemcpyPeerAsync(
            (CUdeviceptr)DstPtr, DeviceData[DstDevId].Context,
            (CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context, Size, Stream);
        if (Err == CUDA_SUCCESS)
          return OFFLOAD_SUCCESS;

        DP("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD
           ", src_id =%" PRId32 ", dst_ptr = " DPxMOD ", dst_id =%" PRId32
           ". Fall back to D2D memcpy.\n",
           DPxPTR(SrcPtr), SrcDevId, DPxPTR(DstPtr), DstDevId);
        CUDA_ERR_STRING(Err);

        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
      }
      }
    }

    return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
  }

  int dataDelete(const int DeviceId, void *TgtPtr) {
    if (UseMemoryManager)
      return MemoryManagers[DeviceId]->free(TgtPtr);

    return DeviceAllocators[DeviceId].free(TgtPtr);
  }

  int runTargetTeamRegion(const int DeviceId, void *TgtEntryPtr, void **TgtArgs,
                          ptrdiff_t *TgtOffsets, const int ArgNum,
                          const int TeamNum, const int ThreadLimit,
                          const unsigned int LoopTripCount,
                          __tgt_async_info *AsyncInfo) const {
    // All args are references.
    std::vector<void *> Args(ArgNum);
    std::vector<void *> Ptrs(ArgNum);

    for (int I = 0; I < ArgNum; ++I) {
      Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
      Args[I] = &Ptrs[I];
    }

    KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr);

    const bool IsSPMDGenericMode =
        KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD;
    const bool IsSPMDMode =
        KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_SPMD;
    const bool IsGenericMode =
        KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC;

    int CudaThreadsPerBlock;
    if (ThreadLimit > 0) {
      DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
      CudaThreadsPerBlock = ThreadLimit;
      // Add master warp if necessary
      if (IsGenericMode) {
        DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize);
        CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
      }
    } else {
      DP("Setting CUDA threads per block to default %d\n",
         DeviceData[DeviceId].NumThreads);
      CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads;
    }

    if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) {
      DP("Threads per block capped at device limit %d\n",
         DeviceData[DeviceId].ThreadsPerBlock);
      CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock;
    }

    CUresult Err;
    if (!KernelInfo->MaxThreadsPerBlock) {
      Err = cuFuncGetAttribute(&KernelInfo->MaxThreadsPerBlock,
                               CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                               KernelInfo->Func);
      if (!checkResult(Err, "Error returned from cuFuncGetAttribute\n"))
        return OFFLOAD_FAIL;
    }

    if (KernelInfo->MaxThreadsPerBlock < CudaThreadsPerBlock) {
      DP("Threads per block capped at kernel limit %d\n",
         KernelInfo->MaxThreadsPerBlock);
      CudaThreadsPerBlock = KernelInfo->MaxThreadsPerBlock;
    }

    unsigned int CudaBlocksPerGrid;
    if (TeamNum <= 0) {
      if (LoopTripCount > 0 && EnvNumTeams < 0) {
        if (IsSPMDGenericMode) {
          // If we reach this point, then we are executing a kernel that was
          // transformed from Generic-mode to SPMD-mode. This kernel has
          // SPMD-mode execution, but needs its blocks to be scheduled
          // differently because the current loop trip count only applies to
          // the `teams distribute` region and will create too few blocks
          // using the regular SPMD-mode method.
          CudaBlocksPerGrid = LoopTripCount;
        } else if (IsSPMDMode) {
          // We have a combined construct, i.e. `target teams distribute
          // parallel for [simd]`. We launch so many teams so that each thread
          // will execute one iteration of the loop. round up to the nearest
          // integer
          CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1;
        } else if (IsGenericMode) {
          // If we reach this point, then we have a non-combined construct, i.e.
          // `teams distribute` with a nested `parallel for` and each team is
          // assigned one iteration of the `distribute` loop. E.g.:
          //
          // #pragma omp target teams distribute
          // for(...loop_tripcount...) {
          //   #pragma omp parallel for
          //   for(...) {}
          // }
          //
          // Threads within a team will execute the iterations of the `parallel`
          // loop.
          CudaBlocksPerGrid = LoopTripCount;
        } else {
          REPORT("Unknown execution mode: %d\n",
                 static_cast<int8_t>(KernelInfo->ExecutionMode));
          return OFFLOAD_FAIL;
        }
        DP("Using %d teams due to loop trip count %" PRIu32
           " and number of threads per block %d\n",
           CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock);
      } else {
        DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams);
        CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams;
      }
    } else {
      DP("Using requested number of teams %d\n", TeamNum);
      CudaBlocksPerGrid = TeamNum;
    }

    if (CudaBlocksPerGrid > DeviceData[DeviceId].BlocksPerGrid) {
      DP("Capping number of teams to team limit %d\n",
         DeviceData[DeviceId].BlocksPerGrid);
      CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid;
    }

    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
         "Launching kernel %s with %d blocks and %d threads in %s mode\n",
         (getOffloadEntry(DeviceId, TgtEntryPtr))
             ? getOffloadEntry(DeviceId, TgtEntryPtr)->name
             : "(null)",
         CudaBlocksPerGrid, CudaThreadsPerBlock,
         (!IsSPMDMode ? (IsGenericMode ? "Generic" : "SPMD-Generic") : "SPMD"));

    CUstream Stream = getStream(DeviceId, AsyncInfo);
    Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1,
                         /* gridDimZ */ 1, CudaThreadsPerBlock,
                         /* blockDimY */ 1, /* blockDimZ */ 1,
                         DynamicMemorySize, Stream, &Args[0], nullptr);
    if (!checkResult(Err, "Error returned from cuLaunchKernel\n"))
      return OFFLOAD_FAIL;

    DP("Launch of entry point at " DPxMOD " successful!\n",
       DPxPTR(TgtEntryPtr));

    return OFFLOAD_SUCCESS;
  }
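
  // Illustrative example (made-up values) of the launch geometry chosen by
  // runTargetTeamRegion() above when no num_teams clause and no OMP_NUM_TEAMS
  // are given:
  //  - SPMD kernel, LoopTripCount = 10000, CudaThreadsPerBlock = 128:
  //    CudaBlocksPerGrid = ((10000 - 1) / 128) + 1 = 79 blocks.
  //  - Generic kernel with thread_limit(128) and WarpSize = 32:
  //    CudaThreadsPerBlock = 128 + 32 (master warp), and one block per
  //    iteration of the `distribute` loop.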

  int synchronize(const int DeviceId, __tgt_async_info *AsyncInfo) const {
    CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue);
    CUresult Err = cuStreamSynchronize(Stream);

    // Once the stream is synchronized, return it to stream pool and reset
    // AsyncInfo. This is to make sure the synchronization only works for its
    // own tasks.
    StreamPool[DeviceId]->release(reinterpret_cast<CUstream>(AsyncInfo->Queue));
    AsyncInfo->Queue = nullptr;

    if (Err != CUDA_SUCCESS) {
      DP("Error when synchronizing stream. stream = " DPxMOD
         ", async info ptr = " DPxMOD "\n",
         DPxPTR(Stream), DPxPTR(AsyncInfo));
      CUDA_ERR_STRING(Err);
    }
    return (Err == CUDA_SUCCESS) ? OFFLOAD_SUCCESS : OFFLOAD_FAIL;
  }

  void printDeviceInfo(int32_t DeviceId) {
    char TmpChar[1000];
    std::string TmpStr;
    size_t TmpSt;
    int TmpInt, TmpInt2, TmpInt3;

    CUdevice Device;
    checkResult(cuDeviceGet(&Device, DeviceId),
                "Error returned from cuDeviceGet\n");

    cuDriverGetVersion(&TmpInt);
    printf(" CUDA Driver Version: \t\t%d \n", TmpInt);
    printf(" CUDA Device Number: \t\t%d \n", DeviceId);
    checkResult(cuDeviceGetName(TmpChar, 1000, Device),
                "Error returned from cuDeviceGetName\n");
    printf(" Device Name: \t\t\t%s \n", TmpChar);
    checkResult(cuDeviceTotalMem(&TmpSt, Device),
                "Error returned from cuDeviceTotalMem\n");
    printf(" Global Memory Size: \t\t%zu bytes \n", TmpSt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf(" Number of Multiprocessors: \t\t%d \n", TmpInt);
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf(" Concurrent Copy and Execution: \t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf(" Total Constant Memory: \t\t%d bytes\n", TmpInt);
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf(" Max Shared Memory per Block: \t%d bytes \n", TmpInt);
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf(" Registers per Block: \t\t%d \n", TmpInt);
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf(" Warp Size: \t\t\t\t%d Threads \n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf(" Maximum Threads per Block: \t\t%d \n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device),
                "Error returned from cuDeviceGetAttribute\n");
    checkResult(cuDeviceGetAttribute(
                    &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, Device),
                "Error returned from cuDeviceGetAttribute\n");
    checkResult(cuDeviceGetAttribute(
                    &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf(" Maximum Block Dimensions: \t\t%d, %d, %d \n", TmpInt, TmpInt2,
           TmpInt3);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, Device),
                "Error returned from cuDeviceGetAttribute\n");
    checkResult(cuDeviceGetAttribute(
                    &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, Device),
                "Error returned from cuDeviceGetAttribute\n");
    checkResult(cuDeviceGetAttribute(
                    &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf(" Maximum Grid Dimensions: \t\t%d x %d x %d \n", TmpInt, TmpInt2,
           TmpInt3);
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_PITCH, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf(" Maximum Memory Pitch: \t\t%d bytes \n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf(" Texture Alignment: \t\t\t%d bytes \n", TmpInt);
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf(" Clock Rate: \t\t\t%d kHz\n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf(" Execution Timeout: \t\t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_INTEGRATED, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf(" Integrated Device: \t\t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf(" Can Map Host Memory: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, Device),
        "Error returned from cuDeviceGetAttribute\n");
    if (TmpInt == CU_COMPUTEMODE_DEFAULT)
      TmpStr = "DEFAULT";
    else if (TmpInt == CU_COMPUTEMODE_PROHIBITED)
      TmpStr = "PROHIBITED";
    else if (TmpInt == CU_COMPUTEMODE_EXCLUSIVE_PROCESS)
      TmpStr = "EXCLUSIVE PROCESS";
    else
      TmpStr = "unknown";
    printf(" Compute Mode: \t\t\t%s \n", TmpStr.c_str());
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf(" Concurrent Kernels: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf(" ECC Enabled: \t\t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf(" Memory Clock Rate: \t\t\t%d kHz\n", TmpInt);
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf(" Memory Bus Width: \t\t\t%d bits\n", TmpInt);
    checkResult(cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE,
                                     Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf(" L2 Cache Size: \t\t\t%d bytes \n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
                    Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf(" Max Threads Per SMP: \t\t%d \n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf(" Async Engines: \t\t\t%s (%d) \n", BOOL2TEXT(TmpInt), TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf(" Unified Addressing: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf(" Managed Memory: \t\t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf(" Concurrent Managed Memory: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf(" Preemption Supported: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf(" Cooperative Launch: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf(" Multi-Device Boards: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, Device),
        "Error returned from cuDeviceGetAttribute\n");
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt2, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf(" Compute Capabilities: \t\t%d%d \n", TmpInt, TmpInt2);
  }

  int createEvent(int DeviceId, void **P) {
    CUevent Event = nullptr;
    if (EventPool[DeviceId]->acquire(Event) != OFFLOAD_SUCCESS)
      return OFFLOAD_FAIL;
    *P = Event;
    return OFFLOAD_SUCCESS;
  }

  int destroyEvent(int DeviceId, void *EventPtr) {
    EventPool[DeviceId]->release(reinterpret_cast<CUevent>(EventPtr));
    return OFFLOAD_SUCCESS;
  }

  int waitEvent(const int DeviceId, __tgt_async_info *AsyncInfo,
                void *EventPtr) const {
    CUstream Stream = getStream(DeviceId, AsyncInfo);
    CUevent Event = reinterpret_cast<CUevent>(EventPtr);

    // We don't use CU_EVENT_WAIT_DEFAULT here because it is only available in
    // newer CUDA versions and is defined as 0x0 anyway. In older versions, per
    // the CUDA API documentation, the flags argument must be 0x0.
    CUresult Err = cuStreamWaitEvent(Stream, Event, 0);
    if (Err != CUDA_SUCCESS) {
      DP("Error when waiting event. stream = " DPxMOD ", event = " DPxMOD "\n",
         DPxPTR(Stream), DPxPTR(Event));
      CUDA_ERR_STRING(Err);
      return OFFLOAD_FAIL;
    }

    return OFFLOAD_SUCCESS;
  }

  int releaseAsyncInfo(int DeviceId, __tgt_async_info *AsyncInfo) const {
    if (AsyncInfo->Queue) {
      StreamPool[DeviceId]->release(
          reinterpret_cast<CUstream>(AsyncInfo->Queue));
      AsyncInfo->Queue = nullptr;
    }

    return OFFLOAD_SUCCESS;
  }

  int initAsyncInfo(int DeviceId, __tgt_async_info **AsyncInfo) const {
    *AsyncInfo = new __tgt_async_info;
    getStream(DeviceId, *AsyncInfo);
    return OFFLOAD_SUCCESS;
  }

  int initDeviceInfo(int DeviceId, __tgt_device_info *DeviceInfo,
                     const char **ErrStr) const {
    assert(DeviceInfo && "DeviceInfo is nullptr");

    if (!DeviceInfo->Context)
      DeviceInfo->Context = DeviceData[DeviceId].Context;
    if (!DeviceInfo->Device) {
      CUdevice Dev;
      CUresult Err = cuDeviceGet(&Dev, DeviceId);
      if (Err == CUDA_SUCCESS) {
        DeviceInfo->Device = reinterpret_cast<void *>(Dev);
      } else {
        cuGetErrorString(Err, ErrStr);
        return OFFLOAD_FAIL;
      }
    }
    return OFFLOAD_SUCCESS;
  }

  int setContext(int DeviceId) {
    assert(InitializedFlags[DeviceId] && "Device is not initialized");

    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "error returned from cuCtxSetCurrent"))
      return OFFLOAD_FAIL;

    return OFFLOAD_SUCCESS;
  }
};

DeviceRTLTy DeviceRTL;
} // namespace

// Exposed library API function
#ifdef __cplusplus
extern "C" {
#endif

int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image) {
  return elf_check_machine(Image, /* EM_CUDA */ 190);
}
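
// Illustrative sketch (not part of the plugin) of the order in which
// libomptarget typically drives the entry points below for a single target
// region; names and values are placeholders:
// \code
//   __tgt_rtl_init_device(Dev);
//   __tgt_rtl_load_binary(Dev, Image);
//   void *Buf = __tgt_rtl_data_alloc(Dev, Size, nullptr, TARGET_ALLOC_DEFAULT);
//   __tgt_rtl_data_submit(Dev, Buf, HostBuf, Size);
//   __tgt_rtl_run_target_team_region(Dev, Entry, Args, Offsets, N, 0, 0, 0);
//   __tgt_rtl_data_retrieve(Dev, HostBuf, Buf, Size);
//   __tgt_rtl_data_delete(Dev, Buf);
//   __tgt_rtl_deinit_device(Dev);
// \endcode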

int32_t __tgt_rtl_is_valid_binary_info(__tgt_device_image *image,
                                       __tgt_image_info *info) {
  if (!__tgt_rtl_is_valid_binary(image))
    return false;

  // A subarchitecture was not specified. Assume it is compatible.
  if (!info->Arch)
    return true;

  int32_t NumberOfDevices = 0;
  if (cuDeviceGetCount(&NumberOfDevices) != CUDA_SUCCESS)
    return false;

  for (int32_t DeviceId = 0; DeviceId < NumberOfDevices; ++DeviceId) {
    CUdevice Device;
    if (cuDeviceGet(&Device, DeviceId) != CUDA_SUCCESS)
      return false;

    int32_t Major, Minor;
    if (cuDeviceGetAttribute(&Major,
                             CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
                             Device) != CUDA_SUCCESS)
      return false;
    if (cuDeviceGetAttribute(&Minor,
                             CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
                             Device) != CUDA_SUCCESS)
      return false;

    std::string ArchStr = "sm_" + std::to_string(Major) + std::to_string(Minor);
    if (ArchStr != info->Arch)
      return false;
  }

  DP("Image has compatible compute capability: %s\n", info->Arch);
  return true;
}

int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); }

int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
  DP("Init requires flags to %" PRId64 "\n", RequiresFlags);
  DeviceRTL.setRequiresFlag(RequiresFlags);
  return RequiresFlags;
}

int32_t __tgt_rtl_is_data_exchangable(int32_t SrcDevId, int32_t DstDevId) {
  if (DeviceRTL.isValidDeviceId(SrcDevId) &&
      DeviceRTL.isValidDeviceId(DstDevId))
    return 1;

  return 0;
}

int32_t __tgt_rtl_init_device(int32_t DeviceId) {
  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
  // Context is set when initializing the device.

  return DeviceRTL.initDevice(DeviceId);
}

int32_t __tgt_rtl_deinit_device(int32_t DeviceId) {
  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
  // Context is set when deinitializing the device.

  return DeviceRTL.deinitDevice(DeviceId);
}

__tgt_target_table *__tgt_rtl_load_binary(int32_t DeviceId,
                                          __tgt_device_image *Image) {
  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");

  if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
    return nullptr;

  return DeviceRTL.loadBinary(DeviceId, Image);
}

void *__tgt_rtl_data_alloc(int32_t DeviceId, int64_t Size, void *,
                           int32_t Kind) {
  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");

  if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
    return nullptr;

  return DeviceRTL.dataAlloc(DeviceId, Size, (TargetAllocTy)Kind);
}

int32_t __tgt_rtl_data_submit(int32_t DeviceId, void *TgtPtr, void *HstPtr,
                              int64_t Size) {
  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
  // Context is set in __tgt_rtl_data_submit_async.

  __tgt_async_info AsyncInfo;
  const int32_t Rc =
      __tgt_rtl_data_submit_async(DeviceId, TgtPtr, HstPtr, Size, &AsyncInfo);
  if (Rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(DeviceId, &AsyncInfo);
}

int32_t __tgt_rtl_data_submit_async(int32_t DeviceId, void *TgtPtr,
                                    void *HstPtr, int64_t Size,
                                    __tgt_async_info *AsyncInfoPtr) {
  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
  assert(AsyncInfoPtr && "async_info_ptr is nullptr");

  if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return DeviceRTL.dataSubmit(DeviceId, TgtPtr, HstPtr, Size, AsyncInfoPtr);
}

int32_t __tgt_rtl_data_retrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr,
                                int64_t Size) {
  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
  // Context is set in __tgt_rtl_data_retrieve_async.

  __tgt_async_info AsyncInfo;
  const int32_t Rc =
      __tgt_rtl_data_retrieve_async(DeviceId, HstPtr, TgtPtr, Size, &AsyncInfo);
  if (Rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(DeviceId, &AsyncInfo);
}

int32_t __tgt_rtl_data_retrieve_async(int32_t DeviceId, void *HstPtr,
                                      void *TgtPtr, int64_t Size,
                                      __tgt_async_info *AsyncInfoPtr) {
  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
  assert(AsyncInfoPtr && "async_info_ptr is nullptr");

  if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return DeviceRTL.dataRetrieve(DeviceId, HstPtr, TgtPtr, Size, AsyncInfoPtr);
}

int32_t __tgt_rtl_data_exchange_async(int32_t SrcDevId, void *SrcPtr,
                                      int DstDevId, void *DstPtr, int64_t Size,
                                      __tgt_async_info *AsyncInfo) {
  assert(DeviceRTL.isValidDeviceId(SrcDevId) && "src_dev_id is invalid");
  assert(DeviceRTL.isValidDeviceId(DstDevId) && "dst_dev_id is invalid");
  assert(AsyncInfo && "AsyncInfo is nullptr");

  if (DeviceRTL.setContext(SrcDevId) != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return DeviceRTL.dataExchange(SrcDevId, SrcPtr, DstDevId, DstPtr, Size,
                                AsyncInfo);
}

int32_t __tgt_rtl_data_exchange(int32_t SrcDevId, void *SrcPtr,
                                int32_t DstDevId, void *DstPtr, int64_t Size) {
  assert(DeviceRTL.isValidDeviceId(SrcDevId) && "src_dev_id is invalid");
  assert(DeviceRTL.isValidDeviceId(DstDevId) && "dst_dev_id is invalid");
  // Context is set in __tgt_rtl_data_exchange_async.
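  // The exchange is driven from the source device: the asynchronous variant
  // makes SrcDevId's context current, and synchronizing on SrcDevId below
  // ensures the device-to-device copy has completed before returning.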

  __tgt_async_info AsyncInfo;
  const int32_t Rc = __tgt_rtl_data_exchange_async(SrcDevId, SrcPtr, DstDevId,
                                                   DstPtr, Size, &AsyncInfo);
  if (Rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(SrcDevId, &AsyncInfo);
}

int32_t __tgt_rtl_data_delete(int32_t DeviceId, void *TgtPtr) {
  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");

  if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return DeviceRTL.dataDelete(DeviceId, TgtPtr);
}

int32_t __tgt_rtl_run_target_team_region(int32_t DeviceId, void *TgtEntryPtr,
                                         void **TgtArgs, ptrdiff_t *TgtOffsets,
                                         int32_t ArgNum, int32_t TeamNum,
                                         int32_t ThreadLimit,
                                         uint64_t LoopTripcount) {
  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
  // Context is set in __tgt_rtl_run_target_team_region_async.

  __tgt_async_info AsyncInfo;
  const int32_t Rc = __tgt_rtl_run_target_team_region_async(
      DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets, ArgNum, TeamNum, ThreadLimit,
      LoopTripcount, &AsyncInfo);
  if (Rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(DeviceId, &AsyncInfo);
}

int32_t __tgt_rtl_run_target_team_region_async(
    int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs, ptrdiff_t *TgtOffsets,
    int32_t ArgNum, int32_t TeamNum, int32_t ThreadLimit,
    uint64_t LoopTripcount, __tgt_async_info *AsyncInfoPtr) {
  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");

  if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return DeviceRTL.runTargetTeamRegion(DeviceId, TgtEntryPtr, TgtArgs,
                                       TgtOffsets, ArgNum, TeamNum, ThreadLimit,
                                       LoopTripcount, AsyncInfoPtr);
}

int32_t __tgt_rtl_run_target_region(int32_t DeviceId, void *TgtEntryPtr,
                                    void **TgtArgs, ptrdiff_t *TgtOffsets,
                                    int32_t ArgNum) {
  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
  // Context is set in __tgt_rtl_run_target_region_async.

  __tgt_async_info AsyncInfo;
  const int32_t Rc = __tgt_rtl_run_target_region_async(
      DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets, ArgNum, &AsyncInfo);
  if (Rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(DeviceId, &AsyncInfo);
}

int32_t __tgt_rtl_run_target_region_async(int32_t DeviceId, void *TgtEntryPtr,
                                          void **TgtArgs, ptrdiff_t *TgtOffsets,
                                          int32_t ArgNum,
                                          __tgt_async_info *AsyncInfoPtr) {
  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
  // Context is set in __tgt_rtl_run_target_team_region_async.
  return __tgt_rtl_run_target_team_region_async(
      DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets, ArgNum,
      /* team num */ 1, /* thread_limit */ 1, /* loop_tripcount */ 0,
      AsyncInfoPtr);
}

int32_t __tgt_rtl_synchronize(int32_t DeviceId,
                              __tgt_async_info *AsyncInfoPtr) {
  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
  assert(AsyncInfoPtr && "async_info_ptr is nullptr");
  assert(AsyncInfoPtr->Queue && "async_info_ptr->Queue is nullptr");
  // NOTE: We don't need to set context for stream sync.
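  // A CUDA stream is tied to the context it was created in, so the driver can
  // resolve it without that context being current here.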
  return DeviceRTL.synchronize(DeviceId, AsyncInfoPtr);
}

void __tgt_rtl_set_info_flag(uint32_t NewInfoLevel) {
  std::atomic<uint32_t> &InfoLevel = getInfoLevelInternal();
  InfoLevel.store(NewInfoLevel);
}

void __tgt_rtl_print_device_info(int32_t DeviceId) {
  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
  // NOTE: We don't need to set context for print device info.
  DeviceRTL.printDeviceInfo(DeviceId);
}

int32_t __tgt_rtl_create_event(int32_t DeviceId, void **Event) {
  assert(Event && "event is nullptr");

  if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return DeviceRTL.createEvent(DeviceId, Event);
}

int32_t __tgt_rtl_record_event(int32_t DeviceId, void *EventPtr,
                               __tgt_async_info *AsyncInfoPtr) {
  assert(AsyncInfoPtr && "async_info_ptr is nullptr");
  assert(AsyncInfoPtr->Queue && "async_info_ptr->Queue is nullptr");
  assert(EventPtr && "event_ptr is nullptr");
  // NOTE: We might not need to set context for event record.
  return recordEvent(EventPtr, AsyncInfoPtr);
}

int32_t __tgt_rtl_wait_event(int32_t DeviceId, void *EventPtr,
                             __tgt_async_info *AsyncInfoPtr) {
  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
  assert(AsyncInfoPtr && "async_info_ptr is nullptr");
  assert(EventPtr && "event is nullptr");
  // If we don't have a queue, we need to set the context.
  if (!AsyncInfoPtr->Queue && DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;
  return DeviceRTL.waitEvent(DeviceId, AsyncInfoPtr, EventPtr);
}

int32_t __tgt_rtl_sync_event(int32_t DeviceId, void *EventPtr) {
  assert(EventPtr && "event is nullptr");
  // NOTE: We might not need to set context for event sync.
  return syncEvent(EventPtr);
}

int32_t __tgt_rtl_destroy_event(int32_t DeviceId, void *EventPtr) {
  assert(EventPtr && "event is nullptr");

  if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return DeviceRTL.destroyEvent(DeviceId, EventPtr);
}

int32_t __tgt_rtl_release_async_info(int32_t DeviceId,
                                     __tgt_async_info *AsyncInfo) {
  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
  assert(AsyncInfo && "async_info is nullptr");

  if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return DeviceRTL.releaseAsyncInfo(DeviceId, AsyncInfo);
}

int32_t __tgt_rtl_init_async_info(int32_t DeviceId,
                                  __tgt_async_info **AsyncInfo) {
  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
  assert(AsyncInfo && "async_info is nullptr");

  if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return DeviceRTL.initAsyncInfo(DeviceId, AsyncInfo);
}

int32_t __tgt_rtl_init_device_info(int32_t DeviceId,
                                   __tgt_device_info *DeviceInfoPtr,
                                   const char **ErrStr) {
  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
  assert(DeviceInfoPtr && "device_info_ptr is nullptr");

  if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return DeviceRTL.initDeviceInfo(DeviceId, DeviceInfoPtr, ErrStr);
}

#ifdef __cplusplus
}
#endif
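
// ---------------------------------------------------------------------------
// Illustrative usage sketch (not compiled into the plugin). This is a minimal,
// hedged example of how a host runtime could drive the entry points above:
// initialize a device, allocate device memory, copy data in, launch a target
// region, copy data back, and free the buffer. The kernel entry pointer would
// normally come from the table returned by __tgt_rtl_load_binary; here it is a
// placeholder parameter. The guard macro OMPTARGET_CUDA_RTL_USAGE_EXAMPLE is
// hypothetical and only keeps this sketch out of the real build.
#ifdef OMPTARGET_CUDA_RTL_USAGE_EXAMPLE
static int32_t exampleOffload(void *TgtEntryPtr) {
  const int32_t DeviceId = 0;
  if (__tgt_rtl_init_device(DeviceId) != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  // Allocate a device buffer and copy one host integer into it.
  int HostVal = 42;
  void *TgtPtr = __tgt_rtl_data_alloc(DeviceId, sizeof(HostVal), nullptr,
                                      TARGET_ALLOC_DEFAULT);
  if (!TgtPtr)
    return OFFLOAD_FAIL;
  if (__tgt_rtl_data_submit(DeviceId, TgtPtr, &HostVal, sizeof(HostVal)) !=
      OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  // Launch the target region with the device buffer as its only argument.
  void *Args[] = {TgtPtr};
  ptrdiff_t Offsets[] = {0};
  if (__tgt_rtl_run_target_region(DeviceId, TgtEntryPtr, Args, Offsets,
                                  /*ArgNum=*/1) != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  // Copy the (possibly updated) value back and release the device buffer.
  if (__tgt_rtl_data_retrieve(DeviceId, &HostVal, TgtPtr, sizeof(HostVal)) !=
      OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;
  return __tgt_rtl_data_delete(DeviceId, TgtPtr);
}
#endif // OMPTARGET_CUDA_RTL_USAGE_EXAMPLE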