//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// RTL for CUDA machine
//
//===----------------------------------------------------------------------===//

#include <cassert>
#include <cstddef>
#include <cuda.h>
#include <list>
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>
#include <vector>

#include "Debug.h"
#include "omptargetplugin.h"

#define TARGET_NAME CUDA
#define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"

#include "MemoryManager.h"

// Utility for retrieving and printing CUDA error string.
#ifdef OMPTARGET_DEBUG
// Debug builds: when the runtime debug level is non-zero, also report error
// codes that cuGetErrorString cannot resolve; otherwise fall back to the same
// quiet behavior as the release variant below.
#define CUDA_ERR_STRING(err)                                                   \
  do {                                                                         \
    if (getDebugLevel() > 0) {                                                 \
      const char *errStr = nullptr;                                            \
      CUresult errStr_status = cuGetErrorString(err, &errStr);                 \
      if (errStr_status == CUDA_ERROR_INVALID_VALUE)                           \
        REPORT("Unrecognized CUDA error code: %d\n", err);                     \
      else if (errStr_status == CUDA_SUCCESS)                                  \
        REPORT("CUDA error is: %s\n", errStr);                                 \
      else {                                                                   \
        REPORT("Unresolved CUDA error code: %d\n", err);                       \
        REPORT("Unsuccessful cuGetErrorString return status: %d\n",            \
               errStr_status);                                                 \
      }                                                                        \
    } else {                                                                   \
      const char *errStr = nullptr;                                            \
      CUresult errStr_status = cuGetErrorString(err, &errStr);                 \
      if (errStr_status == CUDA_SUCCESS)                                       \
        REPORT("%s \n", errStr);                                               \
    }                                                                          \
  } while (false)
#else // OMPTARGET_DEBUG
// Release builds: only print the message when CUDA can resolve the code.
#define CUDA_ERR_STRING(err)                                                   \
  do {                                                                         \
    const char *errStr = nullptr;                                              \
    CUresult errStr_status = cuGetErrorString(err, &errStr);                   \
    if (errStr_status == CUDA_SUCCESS)                                         \
      REPORT("%s \n", errStr);                                                 \
  } while (false)
#endif // OMPTARGET_DEBUG
#include "elf_common.h"

/// Keep entries table per device.
struct FuncOrGblEntryTy {
  __tgt_target_table Table;
  std::vector<__tgt_offload_entry> Entries;
};

enum ExecutionModeType {
  SPMD, // constructors, destructors,
  // combined constructs (`teams distribute parallel for [simd]`)
  GENERIC,      // everything else
  SPMD_GENERIC, // Generic kernel with SPMD execution
  NONE
};

/// Use a single entity to encode a kernel and a set of flags.
struct KernelTy {
  CUfunction Func;

  // execution mode of kernel
  // 0 - SPMD mode (without master warp)
  // 1 - Generic mode (with master warp)
  // 2 - SPMD mode execution with Generic mode semantics.
  int8_t ExecutionMode;

  /// Maximal number of threads per block for this kernel.
  /// Lazily filled from CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK; 0 means
  /// "not queried yet".
  int MaxThreadsPerBlock = 0;

  KernelTy(CUfunction _Func, int8_t _ExecutionMode)
      : Func(_Func), ExecutionMode(_ExecutionMode) {}
};

/// Device environment data
/// Manually sync with the deviceRTL side for now, move to a dedicated header
/// file later.
struct omptarget_device_environmentTy {
  int32_t debug_level;
};

namespace {
/// Return true iff \p Err is CUDA_SUCCESS; otherwise report \p ErrMsg
/// followed by the CUDA error string and return false.
bool checkResult(CUresult Err, const char *ErrMsg) {
  if (Err == CUDA_SUCCESS)
    return true;

  REPORT("%s", ErrMsg);
  CUDA_ERR_STRING(Err);
  return false;
}

/// Enqueue an asynchronous device-to-device copy of \p Size bytes on
/// \p Stream. Returns OFFLOAD_SUCCESS or OFFLOAD_FAIL.
int memcpyDtoD(const void *SrcPtr, void *DstPtr, int64_t Size,
               CUstream Stream) {
  CUresult Err =
      cuMemcpyDtoDAsync((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, Size, Stream);

  if (Err != CUDA_SUCCESS) {
    DP("Error when copying data from device to device. Pointers: src "
       "= " DPxMOD ", dst = " DPxMOD ", size = %" PRId64 "\n",
       DPxPTR(SrcPtr), DPxPTR(DstPtr), Size);
    CUDA_ERR_STRING(Err);
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

// Structure contains per-device data
struct DeviceDataTy {
  /// List that contains all the kernels.
  /// std::list is used (not vector) so KernelTy addresses stay stable; the
  /// offload entry table stores pointers into this list.
  std::list<KernelTy> KernelsList;

  std::list<FuncOrGblEntryTy> FuncGblEntries;

  CUcontext Context = nullptr;
  // Device properties
  int ThreadsPerBlock = 0;
  int BlocksPerGrid = 0;
  int WarpSize = 0;
  // OpenMP properties
  int NumTeams = 0;
  int NumThreads = 0;
};

/// Pool of CUDA streams, one pool per device, grown on demand.
class StreamManagerTy {
  int NumberOfDevices;
  // The initial size of stream pool
  int EnvNumInitialStreams;
  // Per-device stream mutex
  std::vector<std::unique_ptr<std::mutex>> StreamMtx;
  // Per-device stream Id indicates the next available stream in the pool
  std::vector<int> NextStreamId;
  // Per-device stream pool
  std::vector<std::vector<CUstream>> StreamPool;
  // Reference to per-device data
  std::vector<DeviceDataTy> &DeviceData;

  // If there is no CUstream left in the pool, we will resize the pool to
  // allocate more CUstream. This function should be called with device mutex,
  // and we do not resize to smaller one.
  void resizeStreamPool(const int DeviceId, const size_t NewSize) {
    std::vector<CUstream> &Pool = StreamPool[DeviceId];
    const size_t CurrentSize = Pool.size();
    assert(NewSize > CurrentSize && "new size is not larger than current size");

    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) {
      // We will return if cannot switch to the right context in case of
      // creating bunch of streams that are not corresponding to the right
      // device. The offloading will fail later because selected CUstream is
      // nullptr.
      return;
    }

    Pool.resize(NewSize, nullptr);

    for (size_t I = CurrentSize; I < NewSize; ++I) {
      checkResult(cuStreamCreate(&Pool[I], CU_STREAM_NON_BLOCKING),
                  "Error returned from cuStreamCreate\n");
    }
  }

public:
  StreamManagerTy(const int NumberOfDevices,
                  std::vector<DeviceDataTy> &DeviceData)
      : NumberOfDevices(NumberOfDevices), EnvNumInitialStreams(32),
        DeviceData(DeviceData) {
    StreamPool.resize(NumberOfDevices);
    NextStreamId.resize(NumberOfDevices);
    StreamMtx.resize(NumberOfDevices);

    if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS"))
      EnvNumInitialStreams = std::stoi(EnvStr);

    // Initialize the next stream id
    std::fill(NextStreamId.begin(), NextStreamId.end(), 0);

    // Initialize stream mutex
    for (std::unique_ptr<std::mutex> &Ptr : StreamMtx)
      Ptr = std::make_unique<std::mutex>();
  }

  ~StreamManagerTy() {
    // Destroy streams
    for (int I = 0; I < NumberOfDevices; ++I) {
      checkResult(cuCtxSetCurrent(DeviceData[I].Context),
                  "Error returned from cuCtxSetCurrent\n");

      for (CUstream &S : StreamPool[I]) {
        if (S)
          checkResult(cuStreamDestroy(S),
                      "Error returned from cuStreamDestroy\n");
      }
    }
  }

  // Get a CUstream from pool. Per-device next stream id always points to the
  // next available CUstream. That means, CUstreams [0, id-1] have been
  // assigned, and [id,] are still available. If there is no CUstream left, we
  // will ask more CUstreams from CUDA RT. Each time a CUstream is assigned,
  // the id will increase one.
  // xxxxxs+++++++++
  //      ^
  //      id
  // After assignment, the pool becomes the following and s is assigned.
  // xxxxxs+++++++++
  //       ^
  //       id
  CUstream getStream(const int DeviceId) {
    const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
    int &Id = NextStreamId[DeviceId];
    // No CUstream left in the pool, we need to request from CUDA RT
    if (Id == static_cast<int>(StreamPool[DeviceId].size())) {
      // By default we double the stream pool every time
      resizeStreamPool(DeviceId, Id * 2);
    }
    return StreamPool[DeviceId][Id++];
  }

  // Return a CUstream back to pool. As mentioned above, per-device next
  // stream is always points to the next available CUstream, so when we return
  // a CUstream, we need to first decrease the id, and then copy the CUstream
  // back.
  // It is worth noting that, the order of streams return might be different
  // from that they're assigned, that saying, at some point, there might be
  // two identical CUstreams.
  // xxax+a+++++
  //     ^
  //     id
  // However, it doesn't matter, because they're always on the two sides of
  // id. The left one will in the end be overwritten by another CUstream.
  // Therefore, after several execution, the order of pool might be different
  // from its initial state.
  void returnStream(const int DeviceId, CUstream Stream) {
    const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
    int &Id = NextStreamId[DeviceId];
    assert(Id > 0 && "Wrong stream ID");
    StreamPool[DeviceId][--Id] = Stream;
  }

  /// Create the initial pool of streams for \p DeviceId; returns false if the
  /// pool could not be fully populated with valid streams.
  bool initializeDeviceStreamPool(const int DeviceId) {
    assert(StreamPool[DeviceId].empty() && "stream pool has been initialized");

    resizeStreamPool(DeviceId, EnvNumInitialStreams);

    // Check the size of stream pool
    if (static_cast<int>(StreamPool[DeviceId].size()) != EnvNumInitialStreams)
      return false;

    // Check whether each stream is valid
    for (CUstream &S : StreamPool[DeviceId])
      if (!S)
        return false;

    return true;
  }
};

/// Top-level plugin state: all devices, their contexts, modules, allocators,
/// and the stream manager.
class DeviceRTLTy {
  int NumberOfDevices;
  // OpenMP environment properties
  int EnvNumTeams;
  int EnvTeamLimit;
  int EnvTeamThreadLimit;
  // OpenMP requires flags
  int64_t RequiresFlags;

  static constexpr const int HardTeamLimit = 1U << 16U; // 64k
  static constexpr const int HardThreadLimit = 1024;
  static constexpr const int DefaultNumTeams = 128;
  static constexpr const int DefaultNumThreads = 128;

  std::unique_ptr<StreamManagerTy> StreamManager;
  std::vector<DeviceDataTy> DeviceData;
  std::vector<CUmodule> Modules;

  /// A class responsible for interacting with device native runtime library to
  /// allocate and free memory.
  class CUDADeviceAllocatorTy : public DeviceAllocatorTy {
    const int DeviceId;
    const std::vector<DeviceDataTy> &DeviceData;
    // Tracks host-pinned allocations so free() can route them to
    // cuMemFreeHost instead of cuMemFree.
    std::unordered_map<void *, TargetAllocTy> HostPinnedAllocs;

  public:
    CUDADeviceAllocatorTy(int DeviceId, std::vector<DeviceDataTy> &DeviceData)
        : DeviceId(DeviceId), DeviceData(DeviceData) {}

    /// Allocate \p Size bytes of the given kind (device, pinned host, or
    /// managed/shared memory). Returns nullptr on failure or when Size == 0.
    void *allocate(size_t Size, void *, TargetAllocTy Kind) override {
      if (Size == 0)
        return nullptr;

      CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
      if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
        return nullptr;

      void *MemAlloc = nullptr;
      switch (Kind) {
      case TARGET_ALLOC_DEFAULT:
      case TARGET_ALLOC_DEVICE:
        CUdeviceptr DevicePtr;
        Err = cuMemAlloc(&DevicePtr, Size);
        MemAlloc = (void *)DevicePtr;
        if (!checkResult(Err, "Error returned from cuMemAlloc\n"))
          return nullptr;
        break;
      case TARGET_ALLOC_HOST:
        void *HostPtr;
        Err = cuMemAllocHost(&HostPtr, Size);
        MemAlloc = HostPtr;
        if (!checkResult(Err, "Error returned from cuMemAllocHost\n"))
          return nullptr;
        HostPinnedAllocs[MemAlloc] = Kind;
        break;
      case TARGET_ALLOC_SHARED:
        CUdeviceptr SharedPtr;
        Err = cuMemAllocManaged(&SharedPtr, Size, CU_MEM_ATTACH_GLOBAL);
        MemAlloc = (void *)SharedPtr;
        if (!checkResult(Err, "Error returned from cuMemAllocManaged\n"))
          return nullptr;
        break;
      }

      return MemAlloc;
    }

    /// Free \p TgtPtr, dispatching on how it was allocated.
    int free(void *TgtPtr) override {
      CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
      if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
        return OFFLOAD_FAIL;

      // Host pinned memory must be freed differently.
      TargetAllocTy Kind =
          (HostPinnedAllocs.find(TgtPtr) == HostPinnedAllocs.end())
              ? TARGET_ALLOC_DEFAULT
              : TARGET_ALLOC_HOST;
      switch (Kind) {
      case TARGET_ALLOC_DEFAULT:
      case TARGET_ALLOC_DEVICE:
      case TARGET_ALLOC_SHARED:
        Err = cuMemFree((CUdeviceptr)TgtPtr);
        if (!checkResult(Err, "Error returned from cuMemFree\n"))
          return OFFLOAD_FAIL;
        break;
      case TARGET_ALLOC_HOST:
        Err = cuMemFreeHost(TgtPtr);
        if (!checkResult(Err, "Error returned from cuMemFreeHost\n"))
          return OFFLOAD_FAIL;
        break;
      }

      return OFFLOAD_SUCCESS;
    }
  };

  /// A vector of device allocators
  std::vector<CUDADeviceAllocatorTy> DeviceAllocators;

  /// A vector of memory managers. Since the memory manager is non-copyable and
  // non-removable, we wrap them into std::unique_ptr.
  std::vector<std::unique_ptr<MemoryManagerTy>> MemoryManagers;

  /// Whether use memory manager
  bool UseMemoryManager = true;

  // Record entry point associated with device
  void addOffloadEntry(const int DeviceId, const __tgt_offload_entry entry) {
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
    E.Entries.push_back(entry);
  }

  // Return a pointer to the entry associated with the pointer
  const __tgt_offload_entry *getOffloadEntry(const int DeviceId,
                                             const void *Addr) const {
    for (const __tgt_offload_entry &Itr :
         DeviceData[DeviceId].FuncGblEntries.back().Entries)
      if (Itr.addr == Addr)
        return &Itr;

    return nullptr;
  }

  // Return the pointer to the target entries table
  __tgt_target_table *getOffloadEntriesTable(const int DeviceId) {
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();

    if (E.Entries.empty())
      return nullptr;

    // Update table info according to the entries and return the pointer
    E.Table.EntriesBegin = E.Entries.data();
    E.Table.EntriesEnd = E.Entries.data() + E.Entries.size();

    return &E.Table;
  }

  // Clear entries table for a device
  void clearOffloadEntriesTable(const int DeviceId) {
    // Start a fresh FuncOrGblEntryTy; older ones are kept alive because the
    // previous table may still be referenced.
    DeviceData[DeviceId].FuncGblEntries.emplace_back();
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
    E.Entries.clear();
    E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr;
  }

  /// Return the stream recorded in \p AsyncInfo, taking a fresh one from the
  /// pool on first use.
  CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfo) const {
    assert(AsyncInfo && "AsyncInfo is nullptr");

    if (!AsyncInfo->Queue)
      AsyncInfo->Queue = StreamManager->getStream(DeviceId);

    return reinterpret_cast<CUstream>(AsyncInfo->Queue);
  }

public:
  // This class should not be copied
  DeviceRTLTy(const DeviceRTLTy &) = delete;
  DeviceRTLTy(DeviceRTLTy &&) = delete;

  DeviceRTLTy()
      : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1),
        EnvTeamThreadLimit(-1), RequiresFlags(OMP_REQ_UNDEFINED) {

    DP("Start initializing CUDA\n");

    CUresult Err = cuInit(0);
    if (Err == CUDA_ERROR_INVALID_HANDLE) {
      // Can't call cuGetErrorString if dlsym failed
      DP("Failed to load CUDA shared library\n");
      return;
    }
    if (!checkResult(Err, "Error returned from cuInit\n")) {
      return;
    }

    Err = cuDeviceGetCount(&NumberOfDevices);
    if (!checkResult(Err, "Error returned from cuDeviceGetCount\n"))
      return;

    if (NumberOfDevices == 0) {
      DP("There are no devices supporting CUDA.\n");
      return;
    }

    DeviceData.resize(NumberOfDevices);

    // Get environment variables regarding teams
    if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) {
      // OMP_TEAM_LIMIT has been set
      EnvTeamLimit = std::stoi(EnvStr);
      DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
    }
    if (const char *EnvStr = getenv("OMP_TEAMS_THREAD_LIMIT")) {
      // OMP_TEAMS_THREAD_LIMIT has been set
      EnvTeamThreadLimit = std::stoi(EnvStr);
      DP("Parsed OMP_TEAMS_THREAD_LIMIT=%d\n", EnvTeamThreadLimit);
    }
    if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) {
      // OMP_NUM_TEAMS has been set
EnvNumTeams = std::stoi(EnvStr); 481 DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams); 482 } 483 484 StreamManager = 485 std::make_unique<StreamManagerTy>(NumberOfDevices, DeviceData); 486 487 for (int I = 0; I < NumberOfDevices; ++I) 488 DeviceAllocators.emplace_back(I, DeviceData); 489 490 // Get the size threshold from environment variable 491 std::pair<size_t, bool> Res = MemoryManagerTy::getSizeThresholdFromEnv(); 492 UseMemoryManager = Res.second; 493 size_t MemoryManagerThreshold = Res.first; 494 495 if (UseMemoryManager) 496 for (int I = 0; I < NumberOfDevices; ++I) 497 MemoryManagers.emplace_back(std::make_unique<MemoryManagerTy>( 498 DeviceAllocators[I], MemoryManagerThreshold)); 499 } 500 501 ~DeviceRTLTy() { 502 // We first destruct memory managers in case that its dependent data are 503 // destroyed before it. 504 for (auto &M : MemoryManagers) 505 M.release(); 506 507 StreamManager = nullptr; 508 509 for (CUmodule &M : Modules) 510 // Close module 511 if (M) 512 checkResult(cuModuleUnload(M), "Error returned from cuModuleUnload\n"); 513 514 for (DeviceDataTy &D : DeviceData) { 515 // Destroy context 516 if (D.Context) { 517 checkResult(cuCtxSetCurrent(D.Context), 518 "Error returned from cuCtxSetCurrent\n"); 519 CUdevice Device; 520 checkResult(cuCtxGetDevice(&Device), 521 "Error returned from cuCtxGetDevice\n"); 522 checkResult(cuDevicePrimaryCtxRelease(Device), 523 "Error returned from cuDevicePrimaryCtxRelease\n"); 524 } 525 } 526 } 527 528 // Check whether a given DeviceId is valid 529 bool isValidDeviceId(const int DeviceId) const { 530 return DeviceId >= 0 && DeviceId < NumberOfDevices; 531 } 532 533 int getNumOfDevices() const { return NumberOfDevices; } 534 535 void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; } 536 537 int initDevice(const int DeviceId) { 538 CUdevice Device; 539 540 DP("Getting device %d\n", DeviceId); 541 CUresult Err = cuDeviceGet(&Device, DeviceId); 542 if (!checkResult(Err, "Error returned from 
cuDeviceGet\n")) 543 return OFFLOAD_FAIL; 544 545 // Query the current flags of the primary context and set its flags if 546 // it is inactive 547 unsigned int FormerPrimaryCtxFlags = 0; 548 int FormerPrimaryCtxIsActive = 0; 549 Err = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags, 550 &FormerPrimaryCtxIsActive); 551 if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxGetState\n")) 552 return OFFLOAD_FAIL; 553 554 if (FormerPrimaryCtxIsActive) { 555 DP("The primary context is active, no change to its flags\n"); 556 if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) != 557 CU_CTX_SCHED_BLOCKING_SYNC) 558 DP("Warning the current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n"); 559 } else { 560 DP("The primary context is inactive, set its flags to " 561 "CU_CTX_SCHED_BLOCKING_SYNC\n"); 562 Err = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC); 563 if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxSetFlags\n")) 564 return OFFLOAD_FAIL; 565 } 566 567 // Retain the per device primary context and save it to use whenever this 568 // device is selected. 569 Err = cuDevicePrimaryCtxRetain(&DeviceData[DeviceId].Context, Device); 570 if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxRetain\n")) 571 return OFFLOAD_FAIL; 572 573 Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); 574 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) 575 return OFFLOAD_FAIL; 576 577 // Initialize stream pool 578 if (!StreamManager->initializeDeviceStreamPool(DeviceId)) 579 return OFFLOAD_FAIL; 580 581 // Query attributes to determine number of threads/block and blocks/grid. 
582 int MaxGridDimX; 583 Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, 584 Device); 585 if (Err != CUDA_SUCCESS) { 586 DP("Error getting max grid dimension, use default value %d\n", 587 DeviceRTLTy::DefaultNumTeams); 588 DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::DefaultNumTeams; 589 } else if (MaxGridDimX <= DeviceRTLTy::HardTeamLimit) { 590 DP("Using %d CUDA blocks per grid\n", MaxGridDimX); 591 DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX; 592 } else { 593 DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping " 594 "at the hard limit\n", 595 MaxGridDimX, DeviceRTLTy::HardTeamLimit); 596 DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::HardTeamLimit; 597 } 598 599 // We are only exploiting threads along the x axis. 600 int MaxBlockDimX; 601 Err = cuDeviceGetAttribute(&MaxBlockDimX, 602 CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device); 603 if (Err != CUDA_SUCCESS) { 604 DP("Error getting max block dimension, use default value %d\n", 605 DeviceRTLTy::DefaultNumThreads); 606 DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads; 607 } else { 608 DP("Using %d CUDA threads per block\n", MaxBlockDimX); 609 DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX; 610 611 if (EnvTeamThreadLimit > 0 && 612 DeviceData[DeviceId].ThreadsPerBlock > EnvTeamThreadLimit) { 613 DP("Max CUDA threads per block %d exceeds the thread limit %d set by " 614 "OMP_TEAMS_THREAD_LIMIT, capping at the limit\n", 615 DeviceData[DeviceId].ThreadsPerBlock, EnvTeamThreadLimit); 616 DeviceData[DeviceId].ThreadsPerBlock = EnvTeamThreadLimit; 617 } 618 if (DeviceData[DeviceId].ThreadsPerBlock > DeviceRTLTy::HardThreadLimit) { 619 DP("Max CUDA threads per block %d exceeds the hard thread limit %d, " 620 "capping at the hard limit\n", 621 DeviceData[DeviceId].ThreadsPerBlock, DeviceRTLTy::HardThreadLimit); 622 DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit; 623 } 624 } 625 626 // Get and set warp size 627 int 
WarpSize; 628 Err = 629 cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device); 630 if (Err != CUDA_SUCCESS) { 631 DP("Error getting warp size, assume default value 32\n"); 632 DeviceData[DeviceId].WarpSize = 32; 633 } else { 634 DP("Using warp size %d\n", WarpSize); 635 DeviceData[DeviceId].WarpSize = WarpSize; 636 } 637 638 // Adjust teams to the env variables 639 if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) { 640 DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n", 641 EnvTeamLimit); 642 DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit; 643 } 644 645 size_t StackLimit; 646 size_t HeapLimit; 647 if (const char *EnvStr = getenv("LIBOMPTARGET_STACK_SIZE")) { 648 StackLimit = std::stol(EnvStr); 649 if (cuCtxSetLimit(CU_LIMIT_STACK_SIZE, StackLimit) != CUDA_SUCCESS) 650 return OFFLOAD_FAIL; 651 } else { 652 if (cuCtxGetLimit(&StackLimit, CU_LIMIT_STACK_SIZE) != CUDA_SUCCESS) 653 return OFFLOAD_FAIL; 654 } 655 if (const char *EnvStr = getenv("LIBOMPTARGET_HEAP_SIZE")) { 656 HeapLimit = std::stol(EnvStr); 657 if (cuCtxSetLimit(CU_LIMIT_MALLOC_HEAP_SIZE, HeapLimit) != CUDA_SUCCESS) 658 return OFFLOAD_FAIL; 659 } else { 660 if (cuCtxGetLimit(&HeapLimit, CU_LIMIT_MALLOC_HEAP_SIZE) != CUDA_SUCCESS) 661 return OFFLOAD_FAIL; 662 } 663 664 INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, 665 "Device supports up to %d CUDA blocks and %d threads with a " 666 "warp size of %d\n", 667 DeviceData[DeviceId].BlocksPerGrid, 668 DeviceData[DeviceId].ThreadsPerBlock, DeviceData[DeviceId].WarpSize); 669 INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, 670 "Device heap size is %d Bytes, device stack size is %d Bytes per " 671 "thread\n", 672 (int)HeapLimit, (int)StackLimit); 673 674 // Set default number of teams 675 if (EnvNumTeams > 0) { 676 DP("Default number of teams set according to environment %d\n", 677 EnvNumTeams); 678 DeviceData[DeviceId].NumTeams = EnvNumTeams; 679 } else { 680 DeviceData[DeviceId].NumTeams = 
DeviceRTLTy::DefaultNumTeams; 681 DP("Default number of teams set according to library's default %d\n", 682 DeviceRTLTy::DefaultNumTeams); 683 } 684 685 if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) { 686 DP("Default number of teams exceeds device limit, capping at %d\n", 687 DeviceData[DeviceId].BlocksPerGrid); 688 DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid; 689 } 690 691 // Set default number of threads 692 DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads; 693 DP("Default number of threads set according to library's default %d\n", 694 DeviceRTLTy::DefaultNumThreads); 695 if (DeviceData[DeviceId].NumThreads > 696 DeviceData[DeviceId].ThreadsPerBlock) { 697 DP("Default number of threads exceeds device limit, capping at %d\n", 698 DeviceData[DeviceId].ThreadsPerBlock); 699 DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].ThreadsPerBlock; 700 } 701 702 return OFFLOAD_SUCCESS; 703 } 704 705 __tgt_target_table *loadBinary(const int DeviceId, 706 const __tgt_device_image *Image) { 707 // Set the context we are using 708 CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); 709 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) 710 return nullptr; 711 712 // Clear the offload table as we are going to create a new one. 713 clearOffloadEntriesTable(DeviceId); 714 715 // Create the module and extract the function pointers. 716 CUmodule Module; 717 DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart)); 718 Err = cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr); 719 if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n")) 720 return nullptr; 721 722 DP("CUDA module successfully loaded!\n"); 723 724 Modules.push_back(Module); 725 726 // Find the symbols in the module by name. 
    const __tgt_offload_entry *HostBegin = Image->EntriesBegin;
    const __tgt_offload_entry *HostEnd = Image->EntriesEnd;

    std::list<KernelTy> &KernelsList = DeviceData[DeviceId].KernelsList;
    for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) {
      if (!E->addr) {
        // We return nullptr when something like this happens, the host should
        // have always something in the address to uniquely identify the target
        // region.
        DP("Invalid binary: host entry '<null>' (size = %zd)...\n", E->size);
        return nullptr;
      }

      // A non-zero size marks a global variable entry; size == 0 marks a
      // kernel (function) entry, handled after this block.
      if (E->size) {
        __tgt_offload_entry Entry = *E;
        CUdeviceptr CUPtr;
        size_t CUSize;
        Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name);
        // We keep this style here because we need the name
        if (Err != CUDA_SUCCESS) {
          REPORT("Loading global '%s' Failed\n", E->name);
          CUDA_ERR_STRING(Err);
          return nullptr;
        }

        if (CUSize != E->size) {
          DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name,
             CUSize, E->size);
          return nullptr;
        }

        DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
           DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr));

        Entry.addr = (void *)(CUPtr);

        // Note: In the current implementation declare target variables
        // can either be link or to. This means that once unified
        // memory is activated via the requires directive, the variable
        // can be used directly from the host in both cases.
        // TODO: when variables types other than to or link are added,
        // the below condition should be changed to explicitly
        // check for to and link variables types:
        // (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags &
        // OMP_DECLARE_TARGET_LINK || e->flags == OMP_DECLARE_TARGET_TO))
        if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
          // If unified memory is present any target link or to variables
          // can access host addresses directly. There is no longer a
          // need for device copies.
          cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *));
          DP("Copy linked variable host address (" DPxMOD
             ") to device address (" DPxMOD ")\n",
             DPxPTR(*((void **)E->addr)), DPxPTR(CUPtr));
        }

        addOffloadEntry(DeviceId, Entry);

        continue;
      }

      CUfunction Func;
      Err = cuModuleGetFunction(&Func, Module, E->name);
      // We keep this style here because we need the name
      if (Err != CUDA_SUCCESS) {
        REPORT("Loading '%s' Failed\n", E->name);
        CUDA_ERR_STRING(Err);
        return nullptr;
      }

      DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
         DPxPTR(E - HostBegin), E->name, DPxPTR(Func));

      // default value GENERIC (in case symbol is missing from cubin file)
      int8_t ExecModeVal = ExecutionModeType::GENERIC;
      std::string ExecModeNameStr(E->name);
      ExecModeNameStr += "_exec_mode";
      const char *ExecModeName = ExecModeNameStr.c_str();

      CUdeviceptr ExecModePtr;
      size_t CUSize;
      Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName);
      if (Err == CUDA_SUCCESS) {
        if (CUSize != sizeof(int8_t)) {
          DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
             ExecModeName, CUSize, sizeof(int8_t));
          return nullptr;
        }

        Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize);
        if (Err != CUDA_SUCCESS) {
          REPORT("Error when copying data from device to host. Pointers: "
                 "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
                 DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize);
          CUDA_ERR_STRING(Err);
          return nullptr;
        }

        // Valid modes are SPMD (0), GENERIC (1), SPMD_GENERIC (2).
        if (ExecModeVal < 0 || ExecModeVal > 2) {
          DP("Error wrong exec_mode value specified in cubin file: %d\n",
             ExecModeVal);
          return nullptr;
        }
      } else {
        REPORT("Loading global exec_mode '%s' - symbol missing, using default "
               "value GENERIC (1)\n",
               ExecModeName);
        CUDA_ERR_STRING(Err);
      }

      KernelsList.emplace_back(Func, ExecModeVal);

      // The entry's addr is repurposed to point at the KernelTy record;
      // std::list keeps that pointer stable.
      __tgt_offload_entry Entry = *E;
      Entry.addr = &KernelsList.back();
      addOffloadEntry(DeviceId, Entry);
    }

    // send device environment data to the device
    {
      omptarget_device_environmentTy DeviceEnv{0};

#ifdef OMPTARGET_DEBUG
      if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG"))
        DeviceEnv.debug_level = std::stoi(EnvStr);
#endif

      const char *DeviceEnvName = "omptarget_device_environment";
      CUdeviceptr DeviceEnvPtr;
      size_t CUSize;

      Err = cuModuleGetGlobal(&DeviceEnvPtr, &CUSize, Module, DeviceEnvName);
      if (Err == CUDA_SUCCESS) {
        if (CUSize != sizeof(DeviceEnv)) {
          REPORT(
              "Global device_environment '%s' - size mismatch (%zu != %zu)\n",
              DeviceEnvName, CUSize, sizeof(int32_t));
          CUDA_ERR_STRING(Err);
          return nullptr;
        }

        Err = cuMemcpyHtoD(DeviceEnvPtr, &DeviceEnv, CUSize);
        if (Err != CUDA_SUCCESS) {
          REPORT("Error when copying data from host to device. Pointers: "
                 "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
                 DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize);
          CUDA_ERR_STRING(Err);
          return nullptr;
        }

        DP("Sending global device environment data %zu bytes\n", CUSize);
      } else {
        DP("Finding global device environment '%s' - symbol missing.\n",
           DeviceEnvName);
        DP("Continue, considering this is a device RTL which does not accept "
           "environment setting.\n");
      }
    }

    return getOffloadEntriesTable(DeviceId);
  }

  /// Allocate \p Size bytes on device \p DeviceId. Default/device allocations
  /// may be served by the memory manager; host-pinned and shared allocations
  /// always go straight to the device allocator.
  void *dataAlloc(const int DeviceId, const int64_t Size,
                  const TargetAllocTy Kind) {
    switch (Kind) {
    case TARGET_ALLOC_DEFAULT:
    case TARGET_ALLOC_DEVICE:
      if (UseMemoryManager)
        return MemoryManagers[DeviceId]->allocate(Size, nullptr);
      else
        return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
    case TARGET_ALLOC_HOST:
    case TARGET_ALLOC_SHARED:
      return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
    }

    REPORT("Invalid target data allocation kind or requested allocator not "
           "implemented yet\n");

    return nullptr;
  }

  /// Asynchronously copy \p Size bytes host-to-device on the stream held in
  /// \p AsyncInfo. Returns OFFLOAD_SUCCESS or OFFLOAD_FAIL.
  int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr,
                 const int64_t Size, __tgt_async_info *AsyncInfo) const {
    assert(AsyncInfo && "AsyncInfo is nullptr");

    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    CUstream Stream = getStream(DeviceId, AsyncInfo);

    Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream);
    if (Err != CUDA_SUCCESS) {
      DP("Error when copying data from host to device. Pointers: host "
         "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
         DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
      CUDA_ERR_STRING(Err);
      return OFFLOAD_FAIL;
    }

    return OFFLOAD_SUCCESS;
  }

  /// Asynchronously copy \p Size bytes device-to-host on the stream held in
  /// \p AsyncInfo. Returns OFFLOAD_SUCCESS or OFFLOAD_FAIL.
  int dataRetrieve(const int DeviceId, void *HstPtr, const void *TgtPtr,
                   const int64_t Size, __tgt_async_info *AsyncInfo) const {
    assert(AsyncInfo && "AsyncInfo is nullptr");

    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    CUstream Stream = getStream(DeviceId, AsyncInfo);

    Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
    if (Err != CUDA_SUCCESS) {
      DP("Error when copying data from device to host. Pointers: host "
         "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
         DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
      CUDA_ERR_STRING(Err);
      return OFFLOAD_FAIL;
    }

    return OFFLOAD_SUCCESS;
  }

  /// Copy \p Size bytes between two devices, preferring a peer-to-peer copy
  /// and falling back to a device-to-device copy (staged through the source
  /// context) whenever P2P is unavailable or fails.
  int dataExchange(int SrcDevId, const void *SrcPtr, int DstDevId, void *DstPtr,
                   int64_t Size, __tgt_async_info *AsyncInfo) const {
    assert(AsyncInfo && "AsyncInfo is nullptr");

    CUresult Err = cuCtxSetCurrent(DeviceData[SrcDevId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    CUstream Stream = getStream(SrcDevId, AsyncInfo);

    // If they are two devices, we try peer to peer copy first
    if (SrcDevId != DstDevId) {
      int CanAccessPeer = 0;
      Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId);
      if (Err != CUDA_SUCCESS) {
        REPORT("Error returned from cuDeviceCanAccessPeer. src = %" PRId32
               ", dst = %" PRId32 "\n",
               SrcDevId, DstDevId);
        CUDA_ERR_STRING(Err);
        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
      }

      if (!CanAccessPeer) {
        DP("P2P memcpy not supported so fall back to D2D memcpy");
        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
      }

      Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0);
      if (Err != CUDA_SUCCESS) {
        REPORT("Error returned from cuCtxEnablePeerAccess. src = %" PRId32
               ", dst = %" PRId32 "\n",
               SrcDevId, DstDevId);
        CUDA_ERR_STRING(Err);
        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
      }

      Err = cuMemcpyPeerAsync((CUdeviceptr)DstPtr, DeviceData[DstDevId].Context,
                              (CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context,
                              Size, Stream);
      if (Err == CUDA_SUCCESS)
        return OFFLOAD_SUCCESS;

      DP("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD
         ", src_id =%" PRId32 ", dst_ptr = " DPxMOD ", dst_id =%" PRId32 "\n",
         DPxPTR(SrcPtr), SrcDevId, DPxPTR(DstPtr), DstDevId);
      CUDA_ERR_STRING(Err);
    }

    return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
  }

  /// Free a target pointer through whichever allocator owns it.
  int dataDelete(const int DeviceId, void *TgtPtr) {
    if (UseMemoryManager)
      return MemoryManagers[DeviceId]->free(TgtPtr);

    return DeviceAllocators[DeviceId].free(TgtPtr);
  }

  /// Launch the kernel identified by \p TgtEntryPtr (a KernelTy*) with the
  /// given team/thread configuration on the stream in \p AsyncInfo.
  int runTargetTeamRegion(const int DeviceId, void *TgtEntryPtr, void **TgtArgs,
                          ptrdiff_t *TgtOffsets, const int ArgNum,
                          const int TeamNum, const int ThreadLimit,
                          const unsigned int LoopTripCount,
                          __tgt_async_info *AsyncInfo) const {
    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    // All args are references.
// CUDA expects an array of pointers to the kernel arguments: Ptrs holds the
// offset-adjusted argument values, Args the addresses of those slots.
    std::vector<void *> Args(ArgNum);
    std::vector<void *> Ptrs(ArgNum);

    for (int I = 0; I < ArgNum; ++I) {
      Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
      Args[I] = &Ptrs[I];
    }

    KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr);

    // Pick the threads-per-block count: honor an explicit thread limit if one
    // was requested, otherwise use the device default.
    int CudaThreadsPerBlock;
    if (ThreadLimit > 0) {
      DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
      CudaThreadsPerBlock = ThreadLimit;
      // Add master warp if necessary
      if (KernelInfo->ExecutionMode == GENERIC) {
        DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize);
        CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
      }
    } else {
      DP("Setting CUDA threads per block to default %d\n",
         DeviceData[DeviceId].NumThreads);
      CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads;
    }

    // Never exceed the device-wide limit.
    if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) {
      DP("Threads per block capped at device limit %d\n",
         DeviceData[DeviceId].ThreadsPerBlock);
      CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock;
    }

    // Lazily query and cache this kernel's own thread limit (0 means it has
    // not been queried yet).
    if (!KernelInfo->MaxThreadsPerBlock) {
      Err = cuFuncGetAttribute(&KernelInfo->MaxThreadsPerBlock,
                               CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                               KernelInfo->Func);
      if (!checkResult(Err, "Error returned from cuFuncGetAttribute\n"))
        return OFFLOAD_FAIL;
    }

    if (KernelInfo->MaxThreadsPerBlock < CudaThreadsPerBlock) {
      DP("Threads per block capped at kernel limit %d\n",
         KernelInfo->MaxThreadsPerBlock);
      CudaThreadsPerBlock = KernelInfo->MaxThreadsPerBlock;
    }

    // Pick the number of blocks (teams): an explicit team count wins,
    // otherwise derive it from the loop trip count per execution mode.
    unsigned int CudaBlocksPerGrid;
    if (TeamNum <= 0) {
      if (LoopTripCount > 0 && EnvNumTeams < 0) {
        if (KernelInfo->ExecutionMode == SPMD) {
          // We have a combined construct, i.e. `target teams distribute
          // parallel for [simd]`. We launch so many teams so that each thread
          // will execute one iteration of the loop. Round up to the nearest
          // integer.
          CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1;
        } else if (KernelInfo->ExecutionMode == GENERIC) {
          // If we reach this point, then we have a non-combined construct, i.e.
          // `teams distribute` with a nested `parallel for` and each team is
          // assigned one iteration of the `distribute` loop. E.g.:
          //
          // #pragma omp target teams distribute
          // for(...loop_tripcount...) {
          //   #pragma omp parallel for
          //   for(...) {}
          // }
          //
          // Threads within a team will execute the iterations of the `parallel`
          // loop.
          CudaBlocksPerGrid = LoopTripCount;
        } else if (KernelInfo->ExecutionMode == SPMD_GENERIC) {
          // If we reach this point, then we are executing a kernel that was
          // transformed from Generic-mode to SPMD-mode. This kernel has
          // SPMD-mode execution, but needs its blocks to be scheduled
          // differently because the current loop trip count only applies to the
          // `teams distribute` region and will create far too few blocks using
          // the regular SPMD-mode method.
          CudaBlocksPerGrid = LoopTripCount;
        } else {
          REPORT("Unknown execution mode: %d\n", KernelInfo->ExecutionMode);
          return OFFLOAD_FAIL;
        }
        DP("Using %d teams due to loop trip count %" PRIu32
           " and number of threads per block %d\n",
           CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock);
      } else {
        DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams);
        CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams;
      }
    } else if (TeamNum > DeviceData[DeviceId].BlocksPerGrid) {
      DP("Capping number of teams to team limit %d\n",
         DeviceData[DeviceId].BlocksPerGrid);
      CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid;
    } else {
      DP("Using requested number of teams %d\n", TeamNum);
      CudaBlocksPerGrid = TeamNum;
    }

    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
         "Launching kernel %s with %d blocks and %d threads in %s "
         "mode\n",
         (getOffloadEntry(DeviceId, TgtEntryPtr))
             ? getOffloadEntry(DeviceId, TgtEntryPtr)->name
             : "(null)",
         CudaBlocksPerGrid, CudaThreadsPerBlock,
         (KernelInfo->ExecutionMode != SPMD
              ? (KernelInfo->ExecutionMode == GENERIC ? "Generic"
                                                      : "SPMD-Generic")
              : "SPMD"));

    // 1D launch; the kernel arguments are passed by reference via Args.
    CUstream Stream = getStream(DeviceId, AsyncInfo);
    Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1,
                         /* gridDimZ */ 1, CudaThreadsPerBlock,
                         /* blockDimY */ 1, /* blockDimZ */ 1,
                         /* sharedMemBytes */ 0, Stream, &Args[0], nullptr);
    if (!checkResult(Err, "Error returned from cuLaunchKernel\n"))
      return OFFLOAD_FAIL;

    DP("Launch of entry point at " DPxMOD " successful!\n",
       DPxPTR(TgtEntryPtr));

    return OFFLOAD_SUCCESS;
  }

  /// Block until all work previously enqueued on the stream carried by
  /// \p AsyncInfo has finished, then recycle the stream.
  int synchronize(const int DeviceId, __tgt_async_info *AsyncInfo) const {
    CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue);
    CUresult Err = cuStreamSynchronize(Stream);

    // Once the stream is synchronized, return it to stream pool and reset
    // AsyncInfo. This is to make sure the synchronization only works for its
    // own tasks.
    StreamManager->returnStream(DeviceId,
                                reinterpret_cast<CUstream>(AsyncInfo->Queue));
    AsyncInfo->Queue = nullptr;

    if (Err != CUDA_SUCCESS) {
      DP("Error when synchronizing stream. stream = " DPxMOD
         ", async info ptr = " DPxMOD "\n",
         DPxPTR(Stream), DPxPTR(AsyncInfo));
      CUDA_ERR_STRING(Err);
    }
    return (Err == CUDA_SUCCESS) ? OFFLOAD_SUCCESS : OFFLOAD_FAIL;
  }
};

// The single plugin-wide device RTL instance used by the C entry points below.
DeviceRTLTy DeviceRTL;
} // namespace

// Exposed library API function
#ifdef __cplusplus
extern "C" {
#endif

int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
  return elf_check_machine(image, /* EM_CUDA */ 190);
}

int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); }

int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
  DP("Init requires flags to %" PRId64 "\n", RequiresFlags);
  DeviceRTL.setRequiresFlag(RequiresFlags);
  return RequiresFlags;
}

// Exchange is possible between any two valid devices managed by this RTL.
int32_t __tgt_rtl_is_data_exchangable(int32_t src_dev_id, int dst_dev_id) {
  if (DeviceRTL.isValidDeviceId(src_dev_id) &&
      DeviceRTL.isValidDeviceId(dst_dev_id))
    return 1;

  return 0;
}

int32_t __tgt_rtl_init_device(int32_t device_id) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.initDevice(device_id);
}

__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
                                          __tgt_device_image *image) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.loadBinary(device_id, image);
}

void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *,
                           int32_t kind) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.dataAlloc(device_id, size, (TargetAllocTy)kind);
}

// Blocking submit: enqueue the async copy, then synchronize its stream.
int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
                              int64_t size) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_data_submit_async(device_id, tgt_ptr, hst_ptr,
                                                 size, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &AsyncInfo);
}

int32_t __tgt_rtl_data_submit_async(int32_t device_id, void *tgt_ptr,
                                    void *hst_ptr, int64_t size,
                                    __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");

  return DeviceRTL.dataSubmit(device_id, tgt_ptr, hst_ptr, size,
                              async_info_ptr);
}

// Blocking retrieve: enqueue the async copy, then synchronize its stream.
int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
                                int64_t size) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_data_retrieve_async(device_id, hst_ptr, tgt_ptr,
                                                   size, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &AsyncInfo);
}

int32_t __tgt_rtl_data_retrieve_async(int32_t device_id, void *hst_ptr,
                                      void *tgt_ptr, int64_t size,
                                      __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");

  return DeviceRTL.dataRetrieve(device_id, hst_ptr, tgt_ptr, size,
                                async_info_ptr);
}

int32_t __tgt_rtl_data_exchange_async(int32_t src_dev_id, void *src_ptr,
                                      int dst_dev_id, void *dst_ptr,
                                      int64_t size,
                                      __tgt_async_info *AsyncInfo) {
  assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
  assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");
  assert(AsyncInfo && "AsyncInfo is nullptr");

  return DeviceRTL.dataExchange(src_dev_id, src_ptr, dst_dev_id, dst_ptr, size,
                                AsyncInfo);
}

// Blocking exchange: enqueue the async copy, then synchronize its stream.
int32_t __tgt_rtl_data_exchange(int32_t src_dev_id, void *src_ptr,
                                int32_t dst_dev_id, void *dst_ptr,
                                int64_t size) {
  assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid"); 1273 1274 __tgt_async_info AsyncInfo; 1275 const int32_t rc = __tgt_rtl_data_exchange_async( 1276 src_dev_id, src_ptr, dst_dev_id, dst_ptr, size, &AsyncInfo); 1277 if (rc != OFFLOAD_SUCCESS) 1278 return OFFLOAD_FAIL; 1279 1280 return __tgt_rtl_synchronize(src_dev_id, &AsyncInfo); 1281 } 1282 1283 int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) { 1284 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1285 1286 return DeviceRTL.dataDelete(device_id, tgt_ptr); 1287 } 1288 1289 int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr, 1290 void **tgt_args, 1291 ptrdiff_t *tgt_offsets, 1292 int32_t arg_num, int32_t team_num, 1293 int32_t thread_limit, 1294 uint64_t loop_tripcount) { 1295 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1296 1297 __tgt_async_info AsyncInfo; 1298 const int32_t rc = __tgt_rtl_run_target_team_region_async( 1299 device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num, 1300 thread_limit, loop_tripcount, &AsyncInfo); 1301 if (rc != OFFLOAD_SUCCESS) 1302 return OFFLOAD_FAIL; 1303 1304 return __tgt_rtl_synchronize(device_id, &AsyncInfo); 1305 } 1306 1307 int32_t __tgt_rtl_run_target_team_region_async( 1308 int32_t device_id, void *tgt_entry_ptr, void **tgt_args, 1309 ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num, 1310 int32_t thread_limit, uint64_t loop_tripcount, 1311 __tgt_async_info *async_info_ptr) { 1312 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1313 1314 return DeviceRTL.runTargetTeamRegion( 1315 device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num, 1316 thread_limit, loop_tripcount, async_info_ptr); 1317 } 1318 1319 int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr, 1320 void **tgt_args, ptrdiff_t *tgt_offsets, 1321 int32_t arg_num) { 1322 assert(DeviceRTL.isValidDeviceId(device_id) && 
"device_id is invalid"); 1323 1324 __tgt_async_info AsyncInfo; 1325 const int32_t rc = __tgt_rtl_run_target_region_async( 1326 device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, &AsyncInfo); 1327 if (rc != OFFLOAD_SUCCESS) 1328 return OFFLOAD_FAIL; 1329 1330 return __tgt_rtl_synchronize(device_id, &AsyncInfo); 1331 } 1332 1333 int32_t __tgt_rtl_run_target_region_async(int32_t device_id, 1334 void *tgt_entry_ptr, void **tgt_args, 1335 ptrdiff_t *tgt_offsets, 1336 int32_t arg_num, 1337 __tgt_async_info *async_info_ptr) { 1338 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1339 1340 return __tgt_rtl_run_target_team_region_async( 1341 device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, 1342 /* team num*/ 1, /* thread_limit */ 1, /* loop_tripcount */ 0, 1343 async_info_ptr); 1344 } 1345 1346 int32_t __tgt_rtl_synchronize(int32_t device_id, 1347 __tgt_async_info *async_info_ptr) { 1348 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1349 assert(async_info_ptr && "async_info_ptr is nullptr"); 1350 assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr"); 1351 1352 return DeviceRTL.synchronize(device_id, async_info_ptr); 1353 } 1354 1355 void __tgt_rtl_set_info_flag(uint32_t NewInfoLevel) { 1356 std::atomic<uint32_t> &InfoLevel = getInfoLevelInternal(); 1357 InfoLevel.store(NewInfoLevel); 1358 } 1359 1360 #ifdef __cplusplus 1361 } 1362 #endif 1363