1 //===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // RTL for CUDA machine 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include <cassert> 14 #include <cstddef> 15 #include <cuda.h> 16 #include <list> 17 #include <memory> 18 #include <mutex> 19 #include <string> 20 #include <unordered_map> 21 #include <vector> 22 23 #include "Debug.h" 24 #include "omptargetplugin.h" 25 26 #define TARGET_NAME CUDA 27 #define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL" 28 29 #include "MemoryManager.h" 30 31 // Utility for retrieving and printing CUDA error string. 32 #ifdef OMPTARGET_DEBUG 33 #define CUDA_ERR_STRING(err) \ 34 do { \ 35 if (getDebugLevel() > 0) { \ 36 const char *errStr = nullptr; \ 37 CUresult errStr_status = cuGetErrorString(err, &errStr); \ 38 if (errStr_status == CUDA_ERROR_INVALID_VALUE) \ 39 REPORT("Unrecognized CUDA error code: %d\n", err); \ 40 else if (errStr_status == CUDA_SUCCESS) \ 41 REPORT("CUDA error is: %s\n", errStr); \ 42 else { \ 43 REPORT("Unresolved CUDA error code: %d\n", err); \ 44 REPORT("Unsuccessful cuGetErrorString return status: %d\n", \ 45 errStr_status); \ 46 } \ 47 } else { \ 48 const char *errStr = nullptr; \ 49 CUresult errStr_status = cuGetErrorString(err, &errStr); \ 50 if (errStr_status == CUDA_SUCCESS) \ 51 REPORT("%s \n", errStr); \ 52 } \ 53 } while (false) 54 #else // OMPTARGET_DEBUG 55 #define CUDA_ERR_STRING(err) \ 56 do { \ 57 const char *errStr = nullptr; \ 58 CUresult errStr_status = cuGetErrorString(err, &errStr); \ 59 if (errStr_status == CUDA_SUCCESS) \ 60 REPORT("%s \n", errStr); \ 61 } while (false) 62 #endif // OMPTARGET_DEBUG 63 64 #include "elf_common.h" 65 66 /// Keep entries table per device. 67 struct FuncOrGblEntryTy { 68 __tgt_target_table Table; 69 std::vector<__tgt_offload_entry> Entries; 70 }; 71 72 enum ExecutionModeType { 73 SPMD, // constructors, destructors, 74 // combined constructs (`teams distribute parallel for [simd]`) 75 GENERIC, // everything else 76 NONE 77 }; 78 79 /// Use a single entity to encode a kernel and a set of flags. 80 struct KernelTy { 81 CUfunction Func; 82 83 // execution mode of kernel 84 // 0 - SPMD mode (without master warp) 85 // 1 - Generic mode (with master warp) 86 int8_t ExecutionMode; 87 88 /// Maximal number of threads per block for this kernel. 89 int MaxThreadsPerBlock = 0; 90 91 KernelTy(CUfunction _Func, int8_t _ExecutionMode) 92 : Func(_Func), ExecutionMode(_ExecutionMode) {} 93 }; 94 95 /// Device environment data 96 /// Manually sync with the deviceRTL side for now, move to a dedicated header 97 /// file later. 98 struct omptarget_device_environmentTy { 99 int32_t debug_level; 100 }; 101 102 namespace { 103 bool checkResult(CUresult Err, const char *ErrMsg) { 104 if (Err == CUDA_SUCCESS) 105 return true; 106 107 REPORT("%s", ErrMsg); 108 CUDA_ERR_STRING(Err); 109 return false; 110 } 111 112 int memcpyDtoD(const void *SrcPtr, void *DstPtr, int64_t Size, 113 CUstream Stream) { 114 CUresult Err = 115 cuMemcpyDtoDAsync((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, Size, Stream); 116 117 if (Err != CUDA_SUCCESS) { 118 DP("Error when copying data from device to device. Pointers: src " 119 "= " DPxMOD ", dst = " DPxMOD ", size = %" PRId64 "\n", 120 DPxPTR(SrcPtr), DPxPTR(DstPtr), Size); 121 CUDA_ERR_STRING(Err); 122 return OFFLOAD_FAIL; 123 } 124 125 return OFFLOAD_SUCCESS; 126 } 127 128 // Structure contains per-device data 129 struct DeviceDataTy { 130 /// List that contains all the kernels. 131 std::list<KernelTy> KernelsList; 132 133 std::list<FuncOrGblEntryTy> FuncGblEntries; 134 135 CUcontext Context = nullptr; 136 // Device properties 137 int ThreadsPerBlock = 0; 138 int BlocksPerGrid = 0; 139 int WarpSize = 0; 140 // OpenMP properties 141 int NumTeams = 0; 142 int NumThreads = 0; 143 }; 144 145 class StreamManagerTy { 146 int NumberOfDevices; 147 // The initial size of stream pool 148 int EnvNumInitialStreams; 149 // Per-device stream mutex 150 std::vector<std::unique_ptr<std::mutex>> StreamMtx; 151 // Per-device stream Id indicates the next available stream in the pool 152 std::vector<int> NextStreamId; 153 // Per-device stream pool 154 std::vector<std::vector<CUstream>> StreamPool; 155 // Reference to per-device data 156 std::vector<DeviceDataTy> &DeviceData; 157 158 // If there is no CUstream left in the pool, we will resize the pool to 159 // allocate more CUstream. This function should be called with device mutex, 160 // and we do not resize to smaller one. 161 void resizeStreamPool(const int DeviceId, const size_t NewSize) { 162 std::vector<CUstream> &Pool = StreamPool[DeviceId]; 163 const size_t CurrentSize = Pool.size(); 164 assert(NewSize > CurrentSize && "new size is not larger than current size"); 165 166 CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); 167 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) { 168 // We will return if cannot switch to the right context in case of 169 // creating bunch of streams that are not corresponding to the right 170 // device. The offloading will fail later because selected CUstream is 171 // nullptr. 172 return; 173 } 174 175 Pool.resize(NewSize, nullptr); 176 177 for (size_t I = CurrentSize; I < NewSize; ++I) { 178 checkResult(cuStreamCreate(&Pool[I], CU_STREAM_NON_BLOCKING), 179 "Error returned from cuStreamCreate\n"); 180 } 181 } 182 183 public: 184 StreamManagerTy(const int NumberOfDevices, 185 std::vector<DeviceDataTy> &DeviceData) 186 : NumberOfDevices(NumberOfDevices), EnvNumInitialStreams(32), 187 DeviceData(DeviceData) { 188 StreamPool.resize(NumberOfDevices); 189 NextStreamId.resize(NumberOfDevices); 190 StreamMtx.resize(NumberOfDevices); 191 192 if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS")) 193 EnvNumInitialStreams = std::stoi(EnvStr); 194 195 // Initialize the next stream id 196 std::fill(NextStreamId.begin(), NextStreamId.end(), 0); 197 198 // Initialize stream mutex 199 for (std::unique_ptr<std::mutex> &Ptr : StreamMtx) 200 Ptr = std::make_unique<std::mutex>(); 201 } 202 203 ~StreamManagerTy() { 204 // Destroy streams 205 for (int I = 0; I < NumberOfDevices; ++I) { 206 checkResult(cuCtxSetCurrent(DeviceData[I].Context), 207 "Error returned from cuCtxSetCurrent\n"); 208 209 for (CUstream &S : StreamPool[I]) { 210 if (S) 211 checkResult(cuStreamDestroy(S), 212 "Error returned from cuStreamDestroy\n"); 213 } 214 } 215 } 216 217 // Get a CUstream from pool. Per-device next stream id always points to the 218 // next available CUstream. That means, CUstreams [0, id-1] have been 219 // assigned, and [id,] are still available. If there is no CUstream left, we 220 // will ask more CUstreams from CUDA RT. Each time a CUstream is assigned, 221 // the id will increase one. 222 // xxxxxs+++++++++ 223 // ^ 224 // id 225 // After assignment, the pool becomes the following and s is assigned. 226 // xxxxxs+++++++++ 227 // ^ 228 // id 229 CUstream getStream(const int DeviceId) { 230 const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]); 231 int &Id = NextStreamId[DeviceId]; 232 // No CUstream left in the pool, we need to request from CUDA RT 233 if (Id == static_cast<int>(StreamPool[DeviceId].size())) { 234 // By default we double the stream pool every time 235 resizeStreamPool(DeviceId, Id * 2); 236 } 237 return StreamPool[DeviceId][Id++]; 238 } 239 240 // Return a CUstream back to pool. As mentioned above, per-device next 241 // stream is always points to the next available CUstream, so when we return 242 // a CUstream, we need to first decrease the id, and then copy the CUstream 243 // back. 244 // It is worth noting that, the order of streams return might be different 245 // from that they're assigned, that saying, at some point, there might be 246 // two identical CUstreams. 247 // xxax+a+++++ 248 // ^ 249 // id 250 // However, it doesn't matter, because they're always on the two sides of 251 // id. The left one will in the end be overwritten by another CUstream. 252 // Therefore, after several execution, the order of pool might be different 253 // from its initial state. 254 void returnStream(const int DeviceId, CUstream Stream) { 255 const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]); 256 int &Id = NextStreamId[DeviceId]; 257 assert(Id > 0 && "Wrong stream ID"); 258 StreamPool[DeviceId][--Id] = Stream; 259 } 260 261 bool initializeDeviceStreamPool(const int DeviceId) { 262 assert(StreamPool[DeviceId].empty() && "stream pool has been initialized"); 263 264 resizeStreamPool(DeviceId, EnvNumInitialStreams); 265 266 // Check the size of stream pool 267 if (static_cast<int>(StreamPool[DeviceId].size()) != EnvNumInitialStreams) 268 return false; 269 270 // Check whether each stream is valid 271 for (CUstream &S : StreamPool[DeviceId]) 272 if (!S) 273 return false; 274 275 return true; 276 } 277 }; 278 279 class DeviceRTLTy { 280 int NumberOfDevices; 281 // OpenMP environment properties 282 int EnvNumTeams; 283 int EnvTeamLimit; 284 int EnvTeamThreadLimit; 285 // OpenMP requires flags 286 int64_t RequiresFlags; 287 288 static constexpr const int HardTeamLimit = 1U << 16U; // 64k 289 static constexpr const int HardThreadLimit = 1024; 290 static constexpr const int DefaultNumTeams = 128; 291 static constexpr const int DefaultNumThreads = 128; 292 293 std::unique_ptr<StreamManagerTy> StreamManager; 294 std::vector<DeviceDataTy> DeviceData; 295 std::vector<CUmodule> Modules; 296 297 /// A class responsible for interacting with device native runtime library to 298 /// allocate and free memory. 299 class CUDADeviceAllocatorTy : public DeviceAllocatorTy { 300 const int DeviceId; 301 const std::vector<DeviceDataTy> &DeviceData; 302 std::unordered_map<void *, TargetAllocTy> HostPinnedAllocs; 303 304 public: 305 CUDADeviceAllocatorTy(int DeviceId, std::vector<DeviceDataTy> &DeviceData) 306 : DeviceId(DeviceId), DeviceData(DeviceData) {} 307 308 void *allocate(size_t Size, void *, TargetAllocTy Kind) override { 309 if (Size == 0) 310 return nullptr; 311 312 CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); 313 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) 314 return nullptr; 315 316 void *MemAlloc = nullptr; 317 switch (Kind) { 318 case TARGET_ALLOC_DEFAULT: 319 case TARGET_ALLOC_DEVICE: 320 CUdeviceptr DevicePtr; 321 Err = cuMemAlloc(&DevicePtr, Size); 322 MemAlloc = (void *)DevicePtr; 323 if (!checkResult(Err, "Error returned from cuMemAlloc\n")) 324 return nullptr; 325 break; 326 case TARGET_ALLOC_HOST: 327 void *HostPtr; 328 Err = cuMemAllocHost(&HostPtr, Size); 329 MemAlloc = HostPtr; 330 if (!checkResult(Err, "Error returned from cuMemAllocHost\n")) 331 return nullptr; 332 HostPinnedAllocs[MemAlloc] = Kind; 333 break; 334 case TARGET_ALLOC_SHARED: 335 CUdeviceptr SharedPtr; 336 Err = cuMemAllocManaged(&SharedPtr, Size, CU_MEM_ATTACH_GLOBAL); 337 MemAlloc = (void *)SharedPtr; 338 if (!checkResult(Err, "Error returned from cuMemAllocManaged\n")) 339 return nullptr; 340 break; 341 } 342 343 return MemAlloc; 344 } 345 346 int free(void *TgtPtr) override { 347 CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); 348 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) 349 return OFFLOAD_FAIL; 350 351 // Host pinned memory must be freed differently. 352 TargetAllocTy Kind = 353 (HostPinnedAllocs.find(TgtPtr) == HostPinnedAllocs.end()) 354 ? TARGET_ALLOC_DEFAULT 355 : TARGET_ALLOC_HOST; 356 switch (Kind) { 357 case TARGET_ALLOC_DEFAULT: 358 case TARGET_ALLOC_DEVICE: 359 case TARGET_ALLOC_SHARED: 360 Err = cuMemFree((CUdeviceptr)TgtPtr); 361 if (!checkResult(Err, "Error returned from cuMemFree\n")) 362 return OFFLOAD_FAIL; 363 break; 364 case TARGET_ALLOC_HOST: 365 Err = cuMemFreeHost(TgtPtr); 366 if (!checkResult(Err, "Error returned from cuMemFreeHost\n")) 367 return OFFLOAD_FAIL; 368 break; 369 } 370 371 return OFFLOAD_SUCCESS; 372 } 373 }; 374 375 /// A vector of device allocators 376 std::vector<CUDADeviceAllocatorTy> DeviceAllocators; 377 378 /// A vector of memory managers. Since the memory manager is non-copyable and 379 // non-removable, we wrap them into std::unique_ptr. 380 std::vector<std::unique_ptr<MemoryManagerTy>> MemoryManagers; 381 382 /// Whether use memory manager 383 bool UseMemoryManager = true; 384 385 // Record entry point associated with device 386 void addOffloadEntry(const int DeviceId, const __tgt_offload_entry entry) { 387 FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back(); 388 E.Entries.push_back(entry); 389 } 390 391 // Return a pointer to the entry associated with the pointer 392 const __tgt_offload_entry *getOffloadEntry(const int DeviceId, 393 const void *Addr) const { 394 for (const __tgt_offload_entry &Itr : 395 DeviceData[DeviceId].FuncGblEntries.back().Entries) 396 if (Itr.addr == Addr) 397 return &Itr; 398 399 return nullptr; 400 } 401 402 // Return the pointer to the target entries table 403 __tgt_target_table *getOffloadEntriesTable(const int DeviceId) { 404 FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back(); 405 406 if (E.Entries.empty()) 407 return nullptr; 408 409 // Update table info according to the entries and return the pointer 410 E.Table.EntriesBegin = E.Entries.data(); 411 E.Table.EntriesEnd = E.Entries.data() + E.Entries.size(); 412 413 return &E.Table; 414 } 415 416 // Clear entries table for a device 417 void clearOffloadEntriesTable(const int DeviceId) { 418 DeviceData[DeviceId].FuncGblEntries.emplace_back(); 419 FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back(); 420 E.Entries.clear(); 421 E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr; 422 } 423 424 CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfo) const { 425 assert(AsyncInfo && "AsyncInfo is nullptr"); 426 427 if (!AsyncInfo->Queue) 428 AsyncInfo->Queue = StreamManager->getStream(DeviceId); 429 430 return reinterpret_cast<CUstream>(AsyncInfo->Queue); 431 } 432 433 public: 434 // This class should not be copied 435 DeviceRTLTy(const DeviceRTLTy &) = delete; 436 DeviceRTLTy(DeviceRTLTy &&) = delete; 437 438 DeviceRTLTy() 439 : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1), 440 EnvTeamThreadLimit(-1), RequiresFlags(OMP_REQ_UNDEFINED) { 441 442 DP("Start initializing CUDA\n"); 443 444 CUresult Err = cuInit(0); 445 if (Err == CUDA_ERROR_INVALID_HANDLE) { 446 // Can't call cuGetErrorString if dlsym failed 447 DP("Failed to load CUDA shared library\n"); 448 return; 449 } 450 if (!checkResult(Err, "Error returned from cuInit\n")) { 451 return; 452 } 453 454 Err = cuDeviceGetCount(&NumberOfDevices); 455 if (!checkResult(Err, "Error returned from cuDeviceGetCount\n")) 456 return; 457 458 if (NumberOfDevices == 0) { 459 DP("There are no devices supporting CUDA.\n"); 460 return; 461 } 462 463 DeviceData.resize(NumberOfDevices); 464 465 // Get environment variables regarding teams 466 if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) { 467 // OMP_TEAM_LIMIT has been set 468 EnvTeamLimit = std::stoi(EnvStr); 469 DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit); 470 } 471 if (const char *EnvStr = getenv("OMP_TEAMS_THREAD_LIMIT")) { 472 // OMP_TEAMS_THREAD_LIMIT has been set 473 EnvTeamThreadLimit = std::stoi(EnvStr); 474 DP("Parsed OMP_TEAMS_THREAD_LIMIT=%d\n", EnvTeamThreadLimit); 475 } 476 if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) { 477 // OMP_NUM_TEAMS has been set 478 EnvNumTeams = std::stoi(EnvStr); 479 DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams); 480 } 481 482 StreamManager = 483 std::make_unique<StreamManagerTy>(NumberOfDevices, DeviceData); 484 485 for (int I = 0; I < NumberOfDevices; ++I) 486 DeviceAllocators.emplace_back(I, DeviceData); 487 488 // Get the size threshold from environment variable 489 std::pair<size_t, bool> Res = MemoryManagerTy::getSizeThresholdFromEnv(); 490 UseMemoryManager = Res.second; 491 size_t MemoryManagerThreshold = Res.first; 492 493 if (UseMemoryManager) 494 for (int I = 0; I < NumberOfDevices; ++I) 495 MemoryManagers.emplace_back(std::make_unique<MemoryManagerTy>( 496 DeviceAllocators[I], MemoryManagerThreshold)); 497 } 498 499 ~DeviceRTLTy() { 500 // We first destruct memory managers in case that its dependent data are 501 // destroyed before it. 502 for (auto &M : MemoryManagers) 503 M.release(); 504 505 StreamManager = nullptr; 506 507 for (CUmodule &M : Modules) 508 // Close module 509 if (M) 510 checkResult(cuModuleUnload(M), "Error returned from cuModuleUnload\n"); 511 512 for (DeviceDataTy &D : DeviceData) { 513 // Destroy context 514 if (D.Context) { 515 checkResult(cuCtxSetCurrent(D.Context), 516 "Error returned from cuCtxSetCurrent\n"); 517 CUdevice Device; 518 checkResult(cuCtxGetDevice(&Device), 519 "Error returned from cuCtxGetDevice\n"); 520 checkResult(cuDevicePrimaryCtxRelease(Device), 521 "Error returned from cuDevicePrimaryCtxRelease\n"); 522 } 523 } 524 } 525 526 // Check whether a given DeviceId is valid 527 bool isValidDeviceId(const int DeviceId) const { 528 return DeviceId >= 0 && DeviceId < NumberOfDevices; 529 } 530 531 int getNumOfDevices() const { return NumberOfDevices; } 532 533 void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; } 534 535 int initDevice(const int DeviceId) { 536 CUdevice Device; 537 538 DP("Getting device %d\n", DeviceId); 539 CUresult Err = cuDeviceGet(&Device, DeviceId); 540 if (!checkResult(Err, "Error returned from cuDeviceGet\n")) 541 return OFFLOAD_FAIL; 542 543 // Query the current flags of the primary context and set its flags if 544 // it is inactive 545 unsigned int FormerPrimaryCtxFlags = 0; 546 int FormerPrimaryCtxIsActive = 0; 547 Err = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags, 548 &FormerPrimaryCtxIsActive); 549 if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxGetState\n")) 550 return OFFLOAD_FAIL; 551 552 if (FormerPrimaryCtxIsActive) { 553 DP("The primary context is active, no change to its flags\n"); 554 if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) != 555 CU_CTX_SCHED_BLOCKING_SYNC) 556 DP("Warning the current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n"); 557 } else { 558 DP("The primary context is inactive, set its flags to " 559 "CU_CTX_SCHED_BLOCKING_SYNC\n"); 560 Err = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC); 561 if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxSetFlags\n")) 562 return OFFLOAD_FAIL; 563 } 564 565 // Retain the per device primary context and save it to use whenever this 566 // device is selected. 567 Err = cuDevicePrimaryCtxRetain(&DeviceData[DeviceId].Context, Device); 568 if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxRetain\n")) 569 return OFFLOAD_FAIL; 570 571 Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); 572 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) 573 return OFFLOAD_FAIL; 574 575 // Initialize stream pool 576 if (!StreamManager->initializeDeviceStreamPool(DeviceId)) 577 return OFFLOAD_FAIL; 578 579 // Query attributes to determine number of threads/block and blocks/grid. 580 int MaxGridDimX; 581 Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, 582 Device); 583 if (Err != CUDA_SUCCESS) { 584 DP("Error getting max grid dimension, use default value %d\n", 585 DeviceRTLTy::DefaultNumTeams); 586 DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::DefaultNumTeams; 587 } else if (MaxGridDimX <= DeviceRTLTy::HardTeamLimit) { 588 DP("Using %d CUDA blocks per grid\n", MaxGridDimX); 589 DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX; 590 } else { 591 DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping " 592 "at the hard limit\n", 593 MaxGridDimX, DeviceRTLTy::HardTeamLimit); 594 DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::HardTeamLimit; 595 } 596 597 // We are only exploiting threads along the x axis. 598 int MaxBlockDimX; 599 Err = cuDeviceGetAttribute(&MaxBlockDimX, 600 CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device); 601 if (Err != CUDA_SUCCESS) { 602 DP("Error getting max block dimension, use default value %d\n", 603 DeviceRTLTy::DefaultNumThreads); 604 DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads; 605 } else { 606 DP("Using %d CUDA threads per block\n", MaxBlockDimX); 607 DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX; 608 609 if (EnvTeamThreadLimit > 0 && 610 DeviceData[DeviceId].ThreadsPerBlock > EnvTeamThreadLimit) { 611 DP("Max CUDA threads per block %d exceeds the thread limit %d set by " 612 "OMP_TEAMS_THREAD_LIMIT, capping at the limit\n", 613 DeviceData[DeviceId].ThreadsPerBlock, EnvTeamThreadLimit); 614 DeviceData[DeviceId].ThreadsPerBlock = EnvTeamThreadLimit; 615 } 616 if (DeviceData[DeviceId].ThreadsPerBlock > DeviceRTLTy::HardThreadLimit) { 617 DP("Max CUDA threads per block %d exceeds the hard thread limit %d, " 618 "capping at the hard limit\n", 619 DeviceData[DeviceId].ThreadsPerBlock, DeviceRTLTy::HardThreadLimit); 620 DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit; 621 } 622 } 623 624 // Get and set warp size 625 int WarpSize; 626 Err = 627 cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device); 628 if (Err != CUDA_SUCCESS) { 629 DP("Error getting warp size, assume default value 32\n"); 630 DeviceData[DeviceId].WarpSize = 32; 631 } else { 632 DP("Using warp size %d\n", WarpSize); 633 DeviceData[DeviceId].WarpSize = WarpSize; 634 } 635 636 // Adjust teams to the env variables 637 if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) { 638 DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n", 639 EnvTeamLimit); 640 DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit; 641 } 642 643 INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, 644 "Device supports up to %d CUDA blocks and %d threads with a " 645 "warp size of %d\n", 646 DeviceData[DeviceId].BlocksPerGrid, 647 DeviceData[DeviceId].ThreadsPerBlock, DeviceData[DeviceId].WarpSize); 648 649 // Set default number of teams 650 if (EnvNumTeams > 0) { 651 DP("Default number of teams set according to environment %d\n", 652 EnvNumTeams); 653 DeviceData[DeviceId].NumTeams = EnvNumTeams; 654 } else { 655 DeviceData[DeviceId].NumTeams = DeviceRTLTy::DefaultNumTeams; 656 DP("Default number of teams set according to library's default %d\n", 657 DeviceRTLTy::DefaultNumTeams); 658 } 659 660 if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) { 661 DP("Default number of teams exceeds device limit, capping at %d\n", 662 DeviceData[DeviceId].BlocksPerGrid); 663 DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid; 664 } 665 666 // Set default number of threads 667 DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads; 668 DP("Default number of threads set according to library's default %d\n", 669 DeviceRTLTy::DefaultNumThreads); 670 if (DeviceData[DeviceId].NumThreads > 671 DeviceData[DeviceId].ThreadsPerBlock) { 672 DP("Default number of threads exceeds device limit, capping at %d\n", 673 DeviceData[DeviceId].ThreadsPerBlock); 674 DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].ThreadsPerBlock; 675 } 676 677 return OFFLOAD_SUCCESS; 678 } 679 680 __tgt_target_table *loadBinary(const int DeviceId, 681 const __tgt_device_image *Image) { 682 // Set the context we are using 683 CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); 684 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) 685 return nullptr; 686 687 // Clear the offload table as we are going to create a new one. 688 clearOffloadEntriesTable(DeviceId); 689 690 // Create the module and extract the function pointers. 691 CUmodule Module; 692 DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart)); 693 Err = cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr); 694 if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n")) 695 return nullptr; 696 697 DP("CUDA module successfully loaded!\n"); 698 699 Modules.push_back(Module); 700 701 // Find the symbols in the module by name. 702 const __tgt_offload_entry *HostBegin = Image->EntriesBegin; 703 const __tgt_offload_entry *HostEnd = Image->EntriesEnd; 704 705 std::list<KernelTy> &KernelsList = DeviceData[DeviceId].KernelsList; 706 for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) { 707 if (!E->addr) { 708 // We return nullptr when something like this happens, the host should 709 // have always something in the address to uniquely identify the target 710 // region. 711 DP("Invalid binary: host entry '<null>' (size = %zd)...\n", E->size); 712 return nullptr; 713 } 714 715 if (E->size) { 716 __tgt_offload_entry Entry = *E; 717 CUdeviceptr CUPtr; 718 size_t CUSize; 719 Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name); 720 // We keep this style here because we need the name 721 if (Err != CUDA_SUCCESS) { 722 REPORT("Loading global '%s' Failed\n", E->name); 723 CUDA_ERR_STRING(Err); 724 return nullptr; 725 } 726 727 if (CUSize != E->size) { 728 DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name, 729 CUSize, E->size); 730 return nullptr; 731 } 732 733 DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n", 734 DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr)); 735 736 Entry.addr = (void *)(CUPtr); 737 738 // Note: In the current implementation declare target variables 739 // can either be link or to. This means that once unified 740 // memory is activated via the requires directive, the variable 741 // can be used directly from the host in both cases. 742 // TODO: when variables types other than to or link are added, 743 // the below condition should be changed to explicitly 744 // check for to and link variables types: 745 // (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags & 746 // OMP_DECLARE_TARGET_LINK || e->flags == OMP_DECLARE_TARGET_TO)) 747 if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) { 748 // If unified memory is present any target link or to variables 749 // can access host addresses directly. There is no longer a 750 // need for device copies. 751 cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *)); 752 DP("Copy linked variable host address (" DPxMOD 753 ") to device address (" DPxMOD ")\n", 754 DPxPTR(*((void **)E->addr)), DPxPTR(CUPtr)); 755 } 756 757 addOffloadEntry(DeviceId, Entry); 758 759 continue; 760 } 761 762 CUfunction Func; 763 Err = cuModuleGetFunction(&Func, Module, E->name); 764 // We keep this style here because we need the name 765 if (Err != CUDA_SUCCESS) { 766 REPORT("Loading '%s' Failed\n", E->name); 767 CUDA_ERR_STRING(Err); 768 return nullptr; 769 } 770 771 DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n", 772 DPxPTR(E - HostBegin), E->name, DPxPTR(Func)); 773 774 // default value GENERIC (in case symbol is missing from cubin file) 775 int8_t ExecModeVal = ExecutionModeType::GENERIC; 776 std::string ExecModeNameStr(E->name); 777 ExecModeNameStr += "_exec_mode"; 778 const char *ExecModeName = ExecModeNameStr.c_str(); 779 780 CUdeviceptr ExecModePtr; 781 size_t CUSize; 782 Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName); 783 if (Err == CUDA_SUCCESS) { 784 if (CUSize != sizeof(int8_t)) { 785 DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n", 786 ExecModeName, CUSize, sizeof(int8_t)); 787 return nullptr; 788 } 789 790 Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize); 791 if (Err != CUDA_SUCCESS) { 792 REPORT("Error when copying data from device to host. Pointers: " 793 "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n", 794 DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize); 795 CUDA_ERR_STRING(Err); 796 return nullptr; 797 } 798 799 if (ExecModeVal < 0 || ExecModeVal > 1) { 800 DP("Error wrong exec_mode value specified in cubin file: %d\n", 801 ExecModeVal); 802 return nullptr; 803 } 804 } else { 805 REPORT("Loading global exec_mode '%s' - symbol missing, using default " 806 "value GENERIC (1)\n", 807 ExecModeName); 808 CUDA_ERR_STRING(Err); 809 } 810 811 KernelsList.emplace_back(Func, ExecModeVal); 812 813 __tgt_offload_entry Entry = *E; 814 Entry.addr = &KernelsList.back(); 815 addOffloadEntry(DeviceId, Entry); 816 } 817 818 // send device environment data to the device 819 { 820 omptarget_device_environmentTy DeviceEnv{0}; 821 822 #ifdef OMPTARGET_DEBUG 823 if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) 824 DeviceEnv.debug_level = std::stoi(EnvStr); 825 #endif 826 827 const char *DeviceEnvName = "omptarget_device_environment"; 828 CUdeviceptr DeviceEnvPtr; 829 size_t CUSize; 830 831 Err = cuModuleGetGlobal(&DeviceEnvPtr, &CUSize, Module, DeviceEnvName); 832 if (Err == CUDA_SUCCESS) { 833 if (CUSize != sizeof(DeviceEnv)) { 834 REPORT( 835 "Global device_environment '%s' - size mismatch (%zu != %zu)\n", 836 DeviceEnvName, CUSize, sizeof(int32_t)); 837 CUDA_ERR_STRING(Err); 838 return nullptr; 839 } 840 841 Err = cuMemcpyHtoD(DeviceEnvPtr, &DeviceEnv, CUSize); 842 if (Err != CUDA_SUCCESS) { 843 REPORT("Error when copying data from host to device. Pointers: " 844 "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n", 845 DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize); 846 CUDA_ERR_STRING(Err); 847 return nullptr; 848 } 849 850 DP("Sending global device environment data %zu bytes\n", CUSize); 851 } else { 852 DP("Finding global device environment '%s' - symbol missing.\n", 853 DeviceEnvName); 854 DP("Continue, considering this is a device RTL which does not accept " 855 "environment setting.\n"); 856 } 857 } 858 859 return getOffloadEntriesTable(DeviceId); 860 } 861 862 void *dataAlloc(const int DeviceId, const int64_t Size, 863 const TargetAllocTy Kind) { 864 switch (Kind) { 865 case TARGET_ALLOC_DEFAULT: 866 case TARGET_ALLOC_DEVICE: 867 if (UseMemoryManager) 868 return MemoryManagers[DeviceId]->allocate(Size, nullptr); 869 else 870 return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind); 871 case TARGET_ALLOC_HOST: 872 case TARGET_ALLOC_SHARED: 873 return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind); 874 } 875 876 REPORT("Invalid target data allocation kind or requested allocator not " 877 "implemented yet\n"); 878 879 return nullptr; 880 } 881 882 int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr, 883 const int64_t Size, __tgt_async_info *AsyncInfo) const { 884 assert(AsyncInfo && "AsyncInfo is nullptr"); 885 886 CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); 887 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) 888 return OFFLOAD_FAIL; 889 890 CUstream Stream = getStream(DeviceId, AsyncInfo); 891 892 Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream); 893 if (Err != CUDA_SUCCESS) { 894 DP("Error when copying data from host to device. Pointers: host " 895 "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n", 896 DPxPTR(HstPtr), DPxPTR(TgtPtr), Size); 897 CUDA_ERR_STRING(Err); 898 return OFFLOAD_FAIL; 899 } 900 901 return OFFLOAD_SUCCESS; 902 } 903 904 int dataRetrieve(const int DeviceId, void *HstPtr, const void *TgtPtr, 905 const int64_t Size, __tgt_async_info *AsyncInfo) const { 906 assert(AsyncInfo && "AsyncInfo is nullptr"); 907 908 CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); 909 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) 910 return OFFLOAD_FAIL; 911 912 CUstream Stream = getStream(DeviceId, AsyncInfo); 913 914 Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream); 915 if (Err != CUDA_SUCCESS) { 916 DP("Error when copying data from device to host. Pointers: host " 917 "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n", 918 DPxPTR(HstPtr), DPxPTR(TgtPtr), Size); 919 CUDA_ERR_STRING(Err); 920 return OFFLOAD_FAIL; 921 } 922 923 return OFFLOAD_SUCCESS; 924 } 925 926 int dataExchange(int SrcDevId, const void *SrcPtr, int DstDevId, void *DstPtr, 927 int64_t Size, __tgt_async_info *AsyncInfo) const { 928 assert(AsyncInfo && "AsyncInfo is nullptr"); 929 930 CUresult Err = cuCtxSetCurrent(DeviceData[SrcDevId].Context); 931 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) 932 return OFFLOAD_FAIL; 933 934 CUstream Stream = getStream(SrcDevId, AsyncInfo); 935 936 // If they are two devices, we try peer to peer copy first 937 if (SrcDevId != DstDevId) { 938 int CanAccessPeer = 0; 939 Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId); 940 if (Err != CUDA_SUCCESS) { 941 REPORT("Error returned from cuDeviceCanAccessPeer. src = %" PRId32 942 ", dst = %" PRId32 "\n", 943 SrcDevId, DstDevId); 944 CUDA_ERR_STRING(Err); 945 return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); 946 } 947 948 if (!CanAccessPeer) { 949 DP("P2P memcpy not supported so fall back to D2D memcpy"); 950 return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); 951 } 952 953 Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0); 954 if (Err != CUDA_SUCCESS) { 955 REPORT("Error returned from cuCtxEnablePeerAccess. src = %" PRId32 956 ", dst = %" PRId32 "\n", 957 SrcDevId, DstDevId); 958 CUDA_ERR_STRING(Err); 959 return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); 960 } 961 962 Err = cuMemcpyPeerAsync((CUdeviceptr)DstPtr, DeviceData[DstDevId].Context, 963 (CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context, 964 Size, Stream); 965 if (Err == CUDA_SUCCESS) 966 return OFFLOAD_SUCCESS; 967 968 DP("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD 969 ", src_id =%" PRId32 ", dst_ptr = " DPxMOD ", dst_id =%" PRId32 "\n", 970 DPxPTR(SrcPtr), SrcDevId, DPxPTR(DstPtr), DstDevId); 971 CUDA_ERR_STRING(Err); 972 } 973 974 return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); 975 } 976 977 int dataDelete(const int DeviceId, void *TgtPtr) { 978 if (UseMemoryManager) 979 return MemoryManagers[DeviceId]->free(TgtPtr); 980 981 return DeviceAllocators[DeviceId].free(TgtPtr); 982 } 983 984 int runTargetTeamRegion(const int DeviceId, void *TgtEntryPtr, void **TgtArgs, 985 ptrdiff_t *TgtOffsets, const int ArgNum, 986 const int TeamNum, const int ThreadLimit, 987 const unsigned int LoopTripCount, 988 __tgt_async_info *AsyncInfo) const { 989 CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); 990 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) 991 return OFFLOAD_FAIL; 992 993 // All args are references. 994 std::vector<void *> Args(ArgNum); 995 std::vector<void *> Ptrs(ArgNum); 996 997 for (int I = 0; I < ArgNum; ++I) { 998 Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]); 999 Args[I] = &Ptrs[I]; 1000 } 1001 1002 KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr); 1003 1004 int CudaThreadsPerBlock; 1005 if (ThreadLimit > 0) { 1006 DP("Setting CUDA threads per block to requested %d\n", ThreadLimit); 1007 CudaThreadsPerBlock = ThreadLimit; 1008 // Add master warp if necessary 1009 if (KernelInfo->ExecutionMode == GENERIC) { 1010 DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize); 1011 CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize; 1012 } 1013 } else { 1014 DP("Setting CUDA threads per block to default %d\n", 1015 DeviceData[DeviceId].NumThreads); 1016 CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads; 1017 } 1018 1019 if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) { 1020 DP("Threads per block capped at device limit %d\n", 1021 DeviceData[DeviceId].ThreadsPerBlock); 1022 CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock; 1023 } 1024 1025 if (!KernelInfo->MaxThreadsPerBlock) { 1026 Err = cuFuncGetAttribute(&KernelInfo->MaxThreadsPerBlock, 1027 CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, 1028 KernelInfo->Func); 1029 if (!checkResult(Err, "Error returned from cuFuncGetAttribute\n")) 1030 return OFFLOAD_FAIL; 1031 } 1032 1033 if (KernelInfo->MaxThreadsPerBlock < CudaThreadsPerBlock) { 1034 DP("Threads per block capped at kernel limit %d\n", 1035 KernelInfo->MaxThreadsPerBlock); 1036 CudaThreadsPerBlock = KernelInfo->MaxThreadsPerBlock; 1037 } 1038 1039 unsigned int CudaBlocksPerGrid; 1040 if (TeamNum <= 0) { 1041 if (LoopTripCount > 0 && EnvNumTeams < 0) { 1042 if (KernelInfo->ExecutionMode == SPMD) { 1043 // We have a combined construct, i.e. `target teams distribute 1044 // parallel for [simd]`. We launch so many teams so that each thread 1045 // will execute one iteration of the loop. round up to the nearest 1046 // integer 1047 CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1; 1048 } else { 1049 // If we reach this point, then we have a non-combined construct, i.e. 1050 // `teams distribute` with a nested `parallel for` and each team is 1051 // assigned one iteration of the `distribute` loop. E.g.: 1052 // 1053 // #pragma omp target teams distribute 1054 // for(...loop_tripcount...) { 1055 // #pragma omp parallel for 1056 // for(...) {} 1057 // } 1058 // 1059 // Threads within a team will execute the iterations of the `parallel` 1060 // loop. 1061 CudaBlocksPerGrid = LoopTripCount; 1062 } 1063 DP("Using %d teams due to loop trip count %" PRIu32 1064 " and number of threads per block %d\n", 1065 CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock); 1066 } else { 1067 DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams); 1068 CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams; 1069 } 1070 } else if (TeamNum > DeviceData[DeviceId].BlocksPerGrid) { 1071 DP("Capping number of teams to team limit %d\n", 1072 DeviceData[DeviceId].BlocksPerGrid); 1073 CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid; 1074 } else { 1075 DP("Using requested number of teams %d\n", TeamNum); 1076 CudaBlocksPerGrid = TeamNum; 1077 } 1078 1079 INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, 1080 "Launching kernel %s with %d blocks and %d threads in %s " 1081 "mode\n", 1082 (getOffloadEntry(DeviceId, TgtEntryPtr)) 1083 ? getOffloadEntry(DeviceId, TgtEntryPtr)->name 1084 : "(null)", 1085 CudaBlocksPerGrid, CudaThreadsPerBlock, 1086 (KernelInfo->ExecutionMode == SPMD) ? "SPMD" : "Generic"); 1087 1088 CUstream Stream = getStream(DeviceId, AsyncInfo); 1089 Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1, 1090 /* gridDimZ */ 1, CudaThreadsPerBlock, 1091 /* blockDimY */ 1, /* blockDimZ */ 1, 1092 /* sharedMemBytes */ 0, Stream, &Args[0], nullptr); 1093 if (!checkResult(Err, "Error returned from cuLaunchKernel\n")) 1094 return OFFLOAD_FAIL; 1095 1096 DP("Launch of entry point at " DPxMOD " successful!\n", 1097 DPxPTR(TgtEntryPtr)); 1098 1099 return OFFLOAD_SUCCESS; 1100 } 1101 1102 int synchronize(const int DeviceId, __tgt_async_info *AsyncInfo) const { 1103 CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue); 1104 CUresult Err = cuStreamSynchronize(Stream); 1105 1106 // Once the stream is synchronized, return it to stream pool and reset 1107 // AsyncInfo. This is to make sure the synchronization only works for its 1108 // own tasks. 1109 StreamManager->returnStream(DeviceId, 1110 reinterpret_cast<CUstream>(AsyncInfo->Queue)); 1111 AsyncInfo->Queue = nullptr; 1112 1113 if (Err != CUDA_SUCCESS) { 1114 DP("Error when synchronizing stream. stream = " DPxMOD 1115 ", async info ptr = " DPxMOD "\n", 1116 DPxPTR(Stream), DPxPTR(AsyncInfo)); 1117 CUDA_ERR_STRING(Err); 1118 } 1119 return (Err == CUDA_SUCCESS) ? OFFLOAD_SUCCESS : OFFLOAD_FAIL; 1120 } 1121 }; 1122 1123 DeviceRTLTy DeviceRTL; 1124 } // namespace 1125 1126 // Exposed library API function 1127 #ifdef __cplusplus 1128 extern "C" { 1129 #endif 1130 1131 int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) { 1132 return elf_check_machine(image, /* EM_CUDA */ 190); 1133 } 1134 1135 int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); } 1136 1137 int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) { 1138 DP("Init requires flags to %" PRId64 "\n", RequiresFlags); 1139 DeviceRTL.setRequiresFlag(RequiresFlags); 1140 return RequiresFlags; 1141 } 1142 1143 int32_t __tgt_rtl_is_data_exchangable(int32_t src_dev_id, int dst_dev_id) { 1144 if (DeviceRTL.isValidDeviceId(src_dev_id) && 1145 DeviceRTL.isValidDeviceId(dst_dev_id)) 1146 return 1; 1147 1148 return 0; 1149 } 1150 1151 int32_t __tgt_rtl_init_device(int32_t device_id) { 1152 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1153 1154 return DeviceRTL.initDevice(device_id); 1155 } 1156 1157 __tgt_target_table *__tgt_rtl_load_binary(int32_t device_id, 1158 __tgt_device_image *image) { 1159 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1160 1161 return DeviceRTL.loadBinary(device_id, image); 1162 } 1163 1164 void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *, 1165 int32_t kind) { 1166 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1167 1168 return DeviceRTL.dataAlloc(device_id, size, (TargetAllocTy)kind); 1169 } 1170 1171 int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr, 1172 int64_t size) { 1173 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1174 1175 __tgt_async_info AsyncInfo; 1176 const int32_t rc = __tgt_rtl_data_submit_async(device_id, tgt_ptr, hst_ptr, 1177 size, &AsyncInfo); 1178 if (rc != OFFLOAD_SUCCESS) 1179 return OFFLOAD_FAIL; 1180 1181 return __tgt_rtl_synchronize(device_id, &AsyncInfo); 1182 } 1183 1184 int32_t __tgt_rtl_data_submit_async(int32_t device_id, void *tgt_ptr, 1185 void *hst_ptr, int64_t size, 1186 __tgt_async_info *async_info_ptr) { 1187 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1188 assert(async_info_ptr && "async_info_ptr is nullptr"); 1189 1190 return DeviceRTL.dataSubmit(device_id, tgt_ptr, hst_ptr, size, 1191 async_info_ptr); 1192 } 1193 1194 int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr, 1195 int64_t size) { 1196 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1197 1198 __tgt_async_info AsyncInfo; 1199 const int32_t rc = __tgt_rtl_data_retrieve_async(device_id, hst_ptr, tgt_ptr, 1200 size, &AsyncInfo); 1201 if (rc != OFFLOAD_SUCCESS) 1202 return OFFLOAD_FAIL; 1203 1204 return __tgt_rtl_synchronize(device_id, &AsyncInfo); 1205 } 1206 1207 int32_t __tgt_rtl_data_retrieve_async(int32_t device_id, void *hst_ptr, 1208 void *tgt_ptr, int64_t size, 1209 __tgt_async_info *async_info_ptr) { 1210 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1211 assert(async_info_ptr && "async_info_ptr is nullptr"); 1212 1213 return DeviceRTL.dataRetrieve(device_id, hst_ptr, tgt_ptr, size, 1214 async_info_ptr); 1215 } 1216 1217 int32_t __tgt_rtl_data_exchange_async(int32_t src_dev_id, void *src_ptr, 1218 int dst_dev_id, void *dst_ptr, 1219 int64_t size, 1220 __tgt_async_info *AsyncInfo) { 1221 assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid"); 1222 assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid"); 1223 assert(AsyncInfo && "AsyncInfo is nullptr"); 1224 1225 return DeviceRTL.dataExchange(src_dev_id, src_ptr, dst_dev_id, dst_ptr, size, 1226 AsyncInfo); 1227 } 1228 1229 int32_t __tgt_rtl_data_exchange(int32_t src_dev_id, void *src_ptr, 1230 int32_t dst_dev_id, void *dst_ptr, 1231 int64_t size) { 1232 assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid"); 1233 assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid"); 1234 1235 __tgt_async_info AsyncInfo; 1236 const int32_t rc = __tgt_rtl_data_exchange_async( 1237 src_dev_id, src_ptr, dst_dev_id, dst_ptr, size, &AsyncInfo); 1238 if (rc != OFFLOAD_SUCCESS) 1239 return OFFLOAD_FAIL; 1240 1241 return __tgt_rtl_synchronize(src_dev_id, &AsyncInfo); 1242 } 1243 1244 int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) { 1245 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1246 1247 return DeviceRTL.dataDelete(device_id, tgt_ptr); 1248 } 1249 1250 int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr, 1251 void **tgt_args, 1252 ptrdiff_t *tgt_offsets, 1253 int32_t arg_num, int32_t team_num, 1254 int32_t thread_limit, 1255 uint64_t loop_tripcount) { 1256 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1257 1258 __tgt_async_info AsyncInfo; 1259 const int32_t rc = __tgt_rtl_run_target_team_region_async( 1260 device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num, 1261 thread_limit, loop_tripcount, &AsyncInfo); 1262 if (rc != OFFLOAD_SUCCESS) 1263 return OFFLOAD_FAIL; 1264 1265 return __tgt_rtl_synchronize(device_id, &AsyncInfo); 1266 } 1267 1268 int32_t __tgt_rtl_run_target_team_region_async( 1269 int32_t device_id, void *tgt_entry_ptr, void **tgt_args, 1270 ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num, 1271 int32_t thread_limit, uint64_t loop_tripcount, 1272 __tgt_async_info *async_info_ptr) { 1273 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1274 1275 return DeviceRTL.runTargetTeamRegion( 1276 device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num, 1277 thread_limit, loop_tripcount, async_info_ptr); 1278 } 1279 1280 int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr, 1281 void **tgt_args, ptrdiff_t *tgt_offsets, 1282 int32_t arg_num) { 1283 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1284 1285 __tgt_async_info AsyncInfo; 1286 const int32_t rc = __tgt_rtl_run_target_region_async( 1287 device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, &AsyncInfo); 1288 if (rc != OFFLOAD_SUCCESS) 1289 return OFFLOAD_FAIL; 1290 1291 return __tgt_rtl_synchronize(device_id, &AsyncInfo); 1292 } 1293 1294 int32_t __tgt_rtl_run_target_region_async(int32_t device_id, 1295 void *tgt_entry_ptr, void **tgt_args, 1296 ptrdiff_t *tgt_offsets, 1297 int32_t arg_num, 1298 __tgt_async_info *async_info_ptr) { 1299 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1300 1301 return __tgt_rtl_run_target_team_region_async( 1302 device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, 1303 /* team num*/ 1, /* thread_limit */ 1, /* loop_tripcount */ 0, 1304 async_info_ptr); 1305 } 1306 1307 int32_t __tgt_rtl_synchronize(int32_t device_id, 1308 __tgt_async_info *async_info_ptr) { 1309 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1310 assert(async_info_ptr && "async_info_ptr is nullptr"); 1311 assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr"); 1312 1313 return DeviceRTL.synchronize(device_id, async_info_ptr); 1314 } 1315 1316 void __tgt_rtl_set_info_flag(uint32_t NewInfoLevel) { 1317 std::atomic<uint32_t> &InfoLevel = getInfoLevelInternal(); 1318 InfoLevel.store(NewInfoLevel); 1319 } 1320 1321 #ifdef __cplusplus 1322 } 1323 #endif 1324