//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// RTL for CUDA machine
//
//===----------------------------------------------------------------------===//

#include <cassert>
#include <cstddef>
#include <cuda.h>
#include <list>
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>
#include <vector>

#include "Debug.h"
#include "omptargetplugin.h"

#define TARGET_NAME CUDA
#define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"

#include "MemoryManager.h"

// Utility for retrieving and printing CUDA error string.
// In debug builds the message is gated on the runtime debug level and
// distinguishes between a known error code, an unknown code, and a failure of
// cuGetErrorString itself; in release builds only the plain error string is
// reported (and nothing if the code cannot be resolved).
#ifdef OMPTARGET_DEBUG
#define CUDA_ERR_STRING(err)                                                   \
  do {                                                                         \
    if (getDebugLevel() > 0) {                                                 \
      const char *errStr = nullptr;                                            \
      CUresult errStr_status = cuGetErrorString(err, &errStr);                 \
      if (errStr_status == CUDA_ERROR_INVALID_VALUE)                           \
        REPORT("Unrecognized CUDA error code: %d\n", err);                     \
      else if (errStr_status == CUDA_SUCCESS)                                  \
        REPORT("CUDA error is: %s\n", errStr);                                 \
      else {                                                                   \
        REPORT("Unresolved CUDA error code: %d\n", err);                       \
        REPORT("Unsuccessful cuGetErrorString return status: %d\n",            \
               errStr_status);                                                 \
      }                                                                        \
    } else {                                                                   \
      const char *errStr = nullptr;                                            \
      CUresult errStr_status = cuGetErrorString(err, &errStr);                 \
      if (errStr_status == CUDA_SUCCESS)                                       \
        REPORT("%s \n", errStr);                                               \
    }                                                                          \
  } while (false)
#else // OMPTARGET_DEBUG
#define CUDA_ERR_STRING(err)                                                   \
  do {                                                                         \
    const char *errStr = nullptr;                                              \
    CUresult errStr_status = cuGetErrorString(err, &errStr);                   \
    if (errStr_status == CUDA_SUCCESS)                                         \
      REPORT("%s \n", errStr);                                                 \
  } while (false)
#endif // OMPTARGET_DEBUG

#define BOOL2TEXT(b) ((b) ? "Yes" : "No")

#include "elf_common.h"

/// Keep entries table per device.
struct FuncOrGblEntryTy {
  __tgt_target_table Table;
  std::vector<__tgt_offload_entry> Entries;
};

enum ExecutionModeType {
  SPMD, // constructors, destructors,
  // combined constructs (`teams distribute parallel for [simd]`)
  GENERIC,      // everything else
  SPMD_GENERIC, // Generic kernel with SPMD execution
  NONE
};

/// Use a single entity to encode a kernel and a set of flags.
struct KernelTy {
  CUfunction Func;

  // execution mode of kernel
  // 0 - SPMD mode (without master warp)
  // 1 - Generic mode (with master warp)
  // 2 - SPMD mode execution with Generic mode semantics.
  int8_t ExecutionMode;

  /// Maximal number of threads per block for this kernel.
  /// Queried lazily from CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK; 0 means
  /// "not queried yet".
  int MaxThreadsPerBlock = 0;

  KernelTy(CUfunction _Func, int8_t _ExecutionMode)
      : Func(_Func), ExecutionMode(_ExecutionMode) {}
};

/// Device environment data
/// Manually sync with the deviceRTL side for now, move to a dedicated header
/// file later.
struct omptarget_device_environmentTy {
  int32_t debug_level;
};

namespace {
/// Return true iff \p Err is CUDA_SUCCESS; otherwise report \p ErrMsg plus
/// the CUDA error string and return false.
bool checkResult(CUresult Err, const char *ErrMsg) {
  if (Err == CUDA_SUCCESS)
    return true;

  REPORT("%s", ErrMsg);
  CUDA_ERR_STRING(Err);
  return false;
}

/// Enqueue an asynchronous device-to-device copy of \p Size bytes on
/// \p Stream. Returns OFFLOAD_SUCCESS/OFFLOAD_FAIL; note the copy has only
/// been enqueued, not completed, on success.
int memcpyDtoD(const void *SrcPtr, void *DstPtr, int64_t Size,
               CUstream Stream) {
  CUresult Err =
      cuMemcpyDtoDAsync((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, Size, Stream);

  if (Err != CUDA_SUCCESS) {
    DP("Error when copying data from device to device. Pointers: src "
       "= " DPxMOD ", dst = " DPxMOD ", size = %" PRId64 "\n",
       DPxPTR(SrcPtr), DPxPTR(DstPtr), Size);
    CUDA_ERR_STRING(Err);
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

// Structure contains per-device data
struct DeviceDataTy {
  /// List that contains all the kernels.
  std::list<KernelTy> KernelsList;

  std::list<FuncOrGblEntryTy> FuncGblEntries;

  CUcontext Context = nullptr;
  // Device properties
  int ThreadsPerBlock = 0;
  int BlocksPerGrid = 0;
  int WarpSize = 0;
  // OpenMP properties
  int NumTeams = 0;
  int NumThreads = 0;
};

/// Manages a growable per-device pool of CUstreams so that streams are
/// created once and recycled, instead of created/destroyed per offload
/// region. getStream/returnStream are thread-safe via a per-device mutex.
class StreamManagerTy {
  int NumberOfDevices;
  // The initial size of stream pool
  int EnvNumInitialStreams;
  // Per-device stream mutex
  std::vector<std::unique_ptr<std::mutex>> StreamMtx;
  // Per-device stream Id indicates the next available stream in the pool
  std::vector<int> NextStreamId;
  // Per-device stream pool
  std::vector<std::vector<CUstream>> StreamPool;
  // Reference to per-device data
  std::vector<DeviceDataTy> &DeviceData;

  // If there is no CUstream left in the pool, we will resize the pool to
  // allocate more CUstream. This function should be called with device mutex,
  // and we do not resize to smaller one.
  void resizeStreamPool(const int DeviceId, const size_t NewSize) {
    std::vector<CUstream> &Pool = StreamPool[DeviceId];
    const size_t CurrentSize = Pool.size();
    assert(NewSize > CurrentSize && "new size is not larger than current size");

    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) {
      // We will return if cannot switch to the right context in case of
      // creating bunch of streams that are not corresponding to the right
      // device. The offloading will fail later because selected CUstream is
      // nullptr.
      return;
    }

    Pool.resize(NewSize, nullptr);

    for (size_t I = CurrentSize; I < NewSize; ++I) {
      checkResult(cuStreamCreate(&Pool[I], CU_STREAM_NON_BLOCKING),
                  "Error returned from cuStreamCreate\n");
    }
  }

public:
  StreamManagerTy(const int NumberOfDevices,
                  std::vector<DeviceDataTy> &DeviceData)
      : NumberOfDevices(NumberOfDevices), EnvNumInitialStreams(32),
        DeviceData(DeviceData) {
    StreamPool.resize(NumberOfDevices);
    NextStreamId.resize(NumberOfDevices);
    StreamMtx.resize(NumberOfDevices);

    if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS"))
      EnvNumInitialStreams = std::stoi(EnvStr);

    // Initialize the next stream id
    std::fill(NextStreamId.begin(), NextStreamId.end(), 0);

    // Initialize stream mutex
    for (std::unique_ptr<std::mutex> &Ptr : StreamMtx)
      Ptr = std::make_unique<std::mutex>();
  }

  ~StreamManagerTy() {
    // Destroy streams
    for (int I = 0; I < NumberOfDevices; ++I) {
      checkResult(cuCtxSetCurrent(DeviceData[I].Context),
                  "Error returned from cuCtxSetCurrent\n");

      for (CUstream &S : StreamPool[I]) {
        if (S)
          checkResult(cuStreamDestroy(S),
                      "Error returned from cuStreamDestroy\n");
      }
    }
  }

  // Get a CUstream from pool. Per-device next stream id always points to the
  // next available CUstream. That means, CUstreams [0, id-1] have been
  // assigned, and [id,] are still available. If there is no CUstream left, we
  // will ask more CUstreams from CUDA RT. Each time a CUstream is assigned,
  // the id will increase one.
  // xxxxxs+++++++++
  //      ^
  //      id
  // After assignment, the pool becomes the following and s is assigned.
  // xxxxxs+++++++++
  //       ^
  //       id
  CUstream getStream(const int DeviceId) {
    const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
    int &Id = NextStreamId[DeviceId];
    // No CUstream left in the pool, we need to request from CUDA RT
    if (Id == static_cast<int>(StreamPool[DeviceId].size())) {
      // By default we double the stream pool every time
      resizeStreamPool(DeviceId, Id * 2);
    }
    return StreamPool[DeviceId][Id++];
  }

  // Return a CUstream back to pool. As mentioned above, per-device next
  // stream is always points to the next available CUstream, so when we return
  // a CUstream, we need to first decrease the id, and then copy the CUstream
  // back.
  // It is worth noting that, the order of streams return might be different
  // from that they're assigned, that saying, at some point, there might be
  // two identical CUstreams.
  // xxax+a+++++
  //     ^
  //     id
  // However, it doesn't matter, because they're always on the two sides of
  // id. The left one will in the end be overwritten by another CUstream.
  // Therefore, after several execution, the order of pool might be different
  // from its initial state.
  void returnStream(const int DeviceId, CUstream Stream) {
    const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
    int &Id = NextStreamId[DeviceId];
    // Id must be positive: a stream can only be returned after getStream
    // advanced the cursor past at least one slot.
    assert(Id > 0 && "Wrong stream ID");
    StreamPool[DeviceId][--Id] = Stream;
  }

  /// Create the initial pool of streams for \p DeviceId. Returns true only if
  /// the pool reached the requested size and every stream was created
  /// successfully (resizeStreamPool leaves nullptr slots on failure).
  bool initializeDeviceStreamPool(const int DeviceId) {
    assert(StreamPool[DeviceId].empty() && "stream pool has been initialized");

    resizeStreamPool(DeviceId, EnvNumInitialStreams);

    // Check the size of stream pool
    if (static_cast<int>(StreamPool[DeviceId].size()) != EnvNumInitialStreams)
      return false;

    // Check whether each stream is valid
    for (CUstream &S : StreamPool[DeviceId])
      if (!S)
        return false;

    return true;
  }
};

/// Top-level plugin state: one instance per process, holding per-device
/// contexts, modules, kernels, allocators and the stream manager.
class DeviceRTLTy {
  int NumberOfDevices;
  // OpenMP environment properties
  int EnvNumTeams;
  int EnvTeamLimit;
  int EnvTeamThreadLimit;
  // OpenMP requires flags
  int64_t RequiresFlags;

  static constexpr const int HardTeamLimit = 1U << 16U; // 64k
  static constexpr const int HardThreadLimit = 1024;
  static constexpr const int DefaultNumTeams = 128;
  static constexpr const int DefaultNumThreads = 128;

  std::unique_ptr<StreamManagerTy> StreamManager;
  std::vector<DeviceDataTy> DeviceData;
  std::vector<CUmodule> Modules;

  /// A class responsible for interacting with device native runtime library to
  /// allocate and free memory.
  class CUDADeviceAllocatorTy : public DeviceAllocatorTy {
    const int DeviceId;
    const std::vector<DeviceDataTy> &DeviceData;
    // Tracks host-pinned allocations so free() can route them to
    // cuMemFreeHost; device and managed pointers are not recorded because
    // both are released with cuMemFree.
    std::unordered_map<void *, TargetAllocTy> HostPinnedAllocs;

  public:
    CUDADeviceAllocatorTy(int DeviceId, std::vector<DeviceDataTy> &DeviceData)
        : DeviceId(DeviceId), DeviceData(DeviceData) {}

    /// Allocate \p Size bytes of kind \p Kind (device, host-pinned or
    /// managed). Returns nullptr on Size == 0 or on any CUDA failure.
    void *allocate(size_t Size, void *, TargetAllocTy Kind) override {
      if (Size == 0)
        return nullptr;

      CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
      if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
        return nullptr;

      void *MemAlloc = nullptr;
      switch (Kind) {
      case TARGET_ALLOC_DEFAULT:
      case TARGET_ALLOC_DEVICE:
        CUdeviceptr DevicePtr;
        Err = cuMemAlloc(&DevicePtr, Size);
        MemAlloc = (void *)DevicePtr;
        if (!checkResult(Err, "Error returned from cuMemAlloc\n"))
          return nullptr;
        break;
      case TARGET_ALLOC_HOST:
        void *HostPtr;
        Err = cuMemAllocHost(&HostPtr, Size);
        MemAlloc = HostPtr;
        if (!checkResult(Err, "Error returned from cuMemAllocHost\n"))
          return nullptr;
        HostPinnedAllocs[MemAlloc] = Kind;
        break;
      case TARGET_ALLOC_SHARED:
        CUdeviceptr SharedPtr;
        Err = cuMemAllocManaged(&SharedPtr, Size, CU_MEM_ATTACH_GLOBAL);
        MemAlloc = (void *)SharedPtr;
        if (!checkResult(Err, "Error returned from cuMemAllocManaged\n"))
          return nullptr;
        break;
      }

      return MemAlloc;
    }

    /// Free a pointer previously returned by allocate(). Host-pinned memory
    /// is released with cuMemFreeHost, everything else with cuMemFree.
    int free(void *TgtPtr) override {
      CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
      if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
        return OFFLOAD_FAIL;

      // Host pinned memory must be freed differently.
      TargetAllocTy Kind =
          (HostPinnedAllocs.find(TgtPtr) == HostPinnedAllocs.end())
              ? TARGET_ALLOC_DEFAULT
              : TARGET_ALLOC_HOST;
      switch (Kind) {
      case TARGET_ALLOC_DEFAULT:
      case TARGET_ALLOC_DEVICE:
      case TARGET_ALLOC_SHARED:
        Err = cuMemFree((CUdeviceptr)TgtPtr);
        if (!checkResult(Err, "Error returned from cuMemFree\n"))
          return OFFLOAD_FAIL;
        break;
      case TARGET_ALLOC_HOST:
        Err = cuMemFreeHost(TgtPtr);
        if (!checkResult(Err, "Error returned from cuMemFreeHost\n"))
          return OFFLOAD_FAIL;
        break;
      }

      return OFFLOAD_SUCCESS;
    }
  };

  /// A vector of device allocators
  std::vector<CUDADeviceAllocatorTy> DeviceAllocators;

  /// A vector of memory managers. Since the memory manager is non-copyable and
  // non-removable, we wrap them into std::unique_ptr.
  std::vector<std::unique_ptr<MemoryManagerTy>> MemoryManagers;

  /// Whether use memory manager
  bool UseMemoryManager = true;

  // Record entry point associated with device
  void addOffloadEntry(const int DeviceId, const __tgt_offload_entry entry) {
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
    E.Entries.push_back(entry);
  }

  // Return a pointer to the entry associated with the pointer
  const __tgt_offload_entry *getOffloadEntry(const int DeviceId,
                                             const void *Addr) const {
    for (const __tgt_offload_entry &Itr :
         DeviceData[DeviceId].FuncGblEntries.back().Entries)
      if (Itr.addr == Addr)
        return &Itr;

    return nullptr;
  }

  // Return the pointer to the target entries table
  __tgt_target_table *getOffloadEntriesTable(const int DeviceId) {
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();

    if (E.Entries.empty())
      return nullptr;

    // Update table info according to the entries and return the pointer
    E.Table.EntriesBegin = E.Entries.data();
    E.Table.EntriesEnd = E.Entries.data() + E.Entries.size();

    return &E.Table;
  }

  // Clear entries table for a device
  void clearOffloadEntriesTable(const int DeviceId) {
    DeviceData[DeviceId].FuncGblEntries.emplace_back();
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
    E.Entries.clear();
    E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr;
  }

  // Lazily bind a CUstream to \p AsyncInfo: the first call borrows one from
  // the stream manager, later calls reuse it.
  CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfo) const {
    assert(AsyncInfo && "AsyncInfo is nullptr");

    if (!AsyncInfo->Queue)
      AsyncInfo->Queue = StreamManager->getStream(DeviceId);

    return reinterpret_cast<CUstream>(AsyncInfo->Queue);
  }

public:
  // This class should not be copied
  DeviceRTLTy(const DeviceRTLTy &) = delete;
  DeviceRTLTy(DeviceRTLTy &&) = delete;

  DeviceRTLTy()
      : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1),
        EnvTeamThreadLimit(-1), RequiresFlags(OMP_REQ_UNDEFINED) {

    DP("Start initializing CUDA\n");

    CUresult Err = cuInit(0);
    if (Err == CUDA_ERROR_INVALID_HANDLE) {
      // Can't call cuGetErrorString if dlsym failed
      DP("Failed to load CUDA shared library\n");
      return;
    }
    if (!checkResult(Err, "Error returned from cuInit\n")) {
      return;
    }

    Err = cuDeviceGetCount(&NumberOfDevices);
    if (!checkResult(Err, "Error returned from cuDeviceGetCount\n"))
      return;

    if (NumberOfDevices == 0) {
      DP("There are no devices supporting CUDA.\n");
      return;
    }

    DeviceData.resize(NumberOfDevices);

    // Get environment variables regarding teams
    if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) {
      // OMP_TEAM_LIMIT has been set
      EnvTeamLimit = std::stoi(EnvStr);
      DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
    }
    if (const char *EnvStr = getenv("OMP_TEAMS_THREAD_LIMIT")) {
      // OMP_TEAMS_THREAD_LIMIT has been set
      EnvTeamThreadLimit = std::stoi(EnvStr);
      DP("Parsed OMP_TEAMS_THREAD_LIMIT=%d\n", EnvTeamThreadLimit);
    }
    if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) {
      // OMP_NUM_TEAMS has been set
EnvNumTeams = std::stoi(EnvStr); 483 DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams); 484 } 485 486 StreamManager = 487 std::make_unique<StreamManagerTy>(NumberOfDevices, DeviceData); 488 489 for (int I = 0; I < NumberOfDevices; ++I) 490 DeviceAllocators.emplace_back(I, DeviceData); 491 492 // Get the size threshold from environment variable 493 std::pair<size_t, bool> Res = MemoryManagerTy::getSizeThresholdFromEnv(); 494 UseMemoryManager = Res.second; 495 size_t MemoryManagerThreshold = Res.first; 496 497 if (UseMemoryManager) 498 for (int I = 0; I < NumberOfDevices; ++I) 499 MemoryManagers.emplace_back(std::make_unique<MemoryManagerTy>( 500 DeviceAllocators[I], MemoryManagerThreshold)); 501 } 502 503 ~DeviceRTLTy() { 504 // We first destruct memory managers in case that its dependent data are 505 // destroyed before it. 506 for (auto &M : MemoryManagers) 507 M.release(); 508 509 StreamManager = nullptr; 510 511 for (CUmodule &M : Modules) 512 // Close module 513 if (M) 514 checkResult(cuModuleUnload(M), "Error returned from cuModuleUnload\n"); 515 516 for (DeviceDataTy &D : DeviceData) { 517 // Destroy context 518 if (D.Context) { 519 checkResult(cuCtxSetCurrent(D.Context), 520 "Error returned from cuCtxSetCurrent\n"); 521 CUdevice Device; 522 checkResult(cuCtxGetDevice(&Device), 523 "Error returned from cuCtxGetDevice\n"); 524 checkResult(cuDevicePrimaryCtxRelease(Device), 525 "Error returned from cuDevicePrimaryCtxRelease\n"); 526 } 527 } 528 } 529 530 // Check whether a given DeviceId is valid 531 bool isValidDeviceId(const int DeviceId) const { 532 return DeviceId >= 0 && DeviceId < NumberOfDevices; 533 } 534 535 int getNumOfDevices() const { return NumberOfDevices; } 536 537 void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; } 538 539 int initDevice(const int DeviceId) { 540 CUdevice Device; 541 542 DP("Getting device %d\n", DeviceId); 543 CUresult Err = cuDeviceGet(&Device, DeviceId); 544 if (!checkResult(Err, "Error returned from 
cuDeviceGet\n")) 545 return OFFLOAD_FAIL; 546 547 // Query the current flags of the primary context and set its flags if 548 // it is inactive 549 unsigned int FormerPrimaryCtxFlags = 0; 550 int FormerPrimaryCtxIsActive = 0; 551 Err = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags, 552 &FormerPrimaryCtxIsActive); 553 if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxGetState\n")) 554 return OFFLOAD_FAIL; 555 556 if (FormerPrimaryCtxIsActive) { 557 DP("The primary context is active, no change to its flags\n"); 558 if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) != 559 CU_CTX_SCHED_BLOCKING_SYNC) 560 DP("Warning the current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n"); 561 } else { 562 DP("The primary context is inactive, set its flags to " 563 "CU_CTX_SCHED_BLOCKING_SYNC\n"); 564 Err = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC); 565 if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxSetFlags\n")) 566 return OFFLOAD_FAIL; 567 } 568 569 // Retain the per device primary context and save it to use whenever this 570 // device is selected. 571 Err = cuDevicePrimaryCtxRetain(&DeviceData[DeviceId].Context, Device); 572 if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxRetain\n")) 573 return OFFLOAD_FAIL; 574 575 Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); 576 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) 577 return OFFLOAD_FAIL; 578 579 // Initialize stream pool 580 if (!StreamManager->initializeDeviceStreamPool(DeviceId)) 581 return OFFLOAD_FAIL; 582 583 // Query attributes to determine number of threads/block and blocks/grid. 
584 int MaxGridDimX; 585 Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, 586 Device); 587 if (Err != CUDA_SUCCESS) { 588 DP("Error getting max grid dimension, use default value %d\n", 589 DeviceRTLTy::DefaultNumTeams); 590 DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::DefaultNumTeams; 591 } else if (MaxGridDimX <= DeviceRTLTy::HardTeamLimit) { 592 DP("Using %d CUDA blocks per grid\n", MaxGridDimX); 593 DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX; 594 } else { 595 DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping " 596 "at the hard limit\n", 597 MaxGridDimX, DeviceRTLTy::HardTeamLimit); 598 DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::HardTeamLimit; 599 } 600 601 // We are only exploiting threads along the x axis. 602 int MaxBlockDimX; 603 Err = cuDeviceGetAttribute(&MaxBlockDimX, 604 CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device); 605 if (Err != CUDA_SUCCESS) { 606 DP("Error getting max block dimension, use default value %d\n", 607 DeviceRTLTy::DefaultNumThreads); 608 DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads; 609 } else { 610 DP("Using %d CUDA threads per block\n", MaxBlockDimX); 611 DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX; 612 613 if (EnvTeamThreadLimit > 0 && 614 DeviceData[DeviceId].ThreadsPerBlock > EnvTeamThreadLimit) { 615 DP("Max CUDA threads per block %d exceeds the thread limit %d set by " 616 "OMP_TEAMS_THREAD_LIMIT, capping at the limit\n", 617 DeviceData[DeviceId].ThreadsPerBlock, EnvTeamThreadLimit); 618 DeviceData[DeviceId].ThreadsPerBlock = EnvTeamThreadLimit; 619 } 620 if (DeviceData[DeviceId].ThreadsPerBlock > DeviceRTLTy::HardThreadLimit) { 621 DP("Max CUDA threads per block %d exceeds the hard thread limit %d, " 622 "capping at the hard limit\n", 623 DeviceData[DeviceId].ThreadsPerBlock, DeviceRTLTy::HardThreadLimit); 624 DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit; 625 } 626 } 627 628 // Get and set warp size 629 int 
WarpSize; 630 Err = 631 cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device); 632 if (Err != CUDA_SUCCESS) { 633 DP("Error getting warp size, assume default value 32\n"); 634 DeviceData[DeviceId].WarpSize = 32; 635 } else { 636 DP("Using warp size %d\n", WarpSize); 637 DeviceData[DeviceId].WarpSize = WarpSize; 638 } 639 640 // Adjust teams to the env variables 641 if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) { 642 DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n", 643 EnvTeamLimit); 644 DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit; 645 } 646 647 size_t StackLimit; 648 size_t HeapLimit; 649 if (const char *EnvStr = getenv("LIBOMPTARGET_STACK_SIZE")) { 650 StackLimit = std::stol(EnvStr); 651 if (cuCtxSetLimit(CU_LIMIT_STACK_SIZE, StackLimit) != CUDA_SUCCESS) 652 return OFFLOAD_FAIL; 653 } else { 654 if (cuCtxGetLimit(&StackLimit, CU_LIMIT_STACK_SIZE) != CUDA_SUCCESS) 655 return OFFLOAD_FAIL; 656 } 657 if (const char *EnvStr = getenv("LIBOMPTARGET_HEAP_SIZE")) { 658 HeapLimit = std::stol(EnvStr); 659 if (cuCtxSetLimit(CU_LIMIT_MALLOC_HEAP_SIZE, HeapLimit) != CUDA_SUCCESS) 660 return OFFLOAD_FAIL; 661 } else { 662 if (cuCtxGetLimit(&HeapLimit, CU_LIMIT_MALLOC_HEAP_SIZE) != CUDA_SUCCESS) 663 return OFFLOAD_FAIL; 664 } 665 666 INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, 667 "Device supports up to %d CUDA blocks and %d threads with a " 668 "warp size of %d\n", 669 DeviceData[DeviceId].BlocksPerGrid, 670 DeviceData[DeviceId].ThreadsPerBlock, DeviceData[DeviceId].WarpSize); 671 INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, 672 "Device heap size is %d Bytes, device stack size is %d Bytes per " 673 "thread\n", 674 (int)HeapLimit, (int)StackLimit); 675 676 // Set default number of teams 677 if (EnvNumTeams > 0) { 678 DP("Default number of teams set according to environment %d\n", 679 EnvNumTeams); 680 DeviceData[DeviceId].NumTeams = EnvNumTeams; 681 } else { 682 DeviceData[DeviceId].NumTeams = 
DeviceRTLTy::DefaultNumTeams; 683 DP("Default number of teams set according to library's default %d\n", 684 DeviceRTLTy::DefaultNumTeams); 685 } 686 687 if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) { 688 DP("Default number of teams exceeds device limit, capping at %d\n", 689 DeviceData[DeviceId].BlocksPerGrid); 690 DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid; 691 } 692 693 // Set default number of threads 694 DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads; 695 DP("Default number of threads set according to library's default %d\n", 696 DeviceRTLTy::DefaultNumThreads); 697 if (DeviceData[DeviceId].NumThreads > 698 DeviceData[DeviceId].ThreadsPerBlock) { 699 DP("Default number of threads exceeds device limit, capping at %d\n", 700 DeviceData[DeviceId].ThreadsPerBlock); 701 DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].ThreadsPerBlock; 702 } 703 704 return OFFLOAD_SUCCESS; 705 } 706 707 __tgt_target_table *loadBinary(const int DeviceId, 708 const __tgt_device_image *Image) { 709 // Set the context we are using 710 CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); 711 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) 712 return nullptr; 713 714 // Clear the offload table as we are going to create a new one. 715 clearOffloadEntriesTable(DeviceId); 716 717 // Create the module and extract the function pointers. 718 CUmodule Module; 719 DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart)); 720 Err = cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr); 721 if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n")) 722 return nullptr; 723 724 DP("CUDA module successfully loaded!\n"); 725 726 Modules.push_back(Module); 727 728 // Find the symbols in the module by name. 
    const __tgt_offload_entry *HostBegin = Image->EntriesBegin;
    const __tgt_offload_entry *HostEnd = Image->EntriesEnd;

    std::list<KernelTy> &KernelsList = DeviceData[DeviceId].KernelsList;
    for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) {
      if (!E->addr) {
        // We return nullptr when something like this happens, the host should
        // have always something in the address to uniquely identify the target
        // region.
        DP("Invalid binary: host entry '<null>' (size = %zd)...\n", E->size);
        return nullptr;
      }

      // Entries with a non-zero size are global variables; zero-size entries
      // are kernel functions (handled below).
      if (E->size) {
        __tgt_offload_entry Entry = *E;
        CUdeviceptr CUPtr;
        size_t CUSize;
        Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name);
        // We keep this style here because we need the name
        if (Err != CUDA_SUCCESS) {
          REPORT("Loading global '%s' Failed\n", E->name);
          CUDA_ERR_STRING(Err);
          return nullptr;
        }

        if (CUSize != E->size) {
          DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name,
             CUSize, E->size);
          return nullptr;
        }

        DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
           DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr));

        Entry.addr = (void *)(CUPtr);

        // Note: In the current implementation declare target variables
        // can either be link or to. This means that once unified
        // memory is activated via the requires directive, the variable
        // can be used directly from the host in both cases.
        // TODO: when variables types other than to or link are added,
        // the below condition should be changed to explicitly
        // check for to and link variables types:
        // (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags &
        // OMP_DECLARE_TARGET_LINK || e->flags == OMP_DECLARE_TARGET_TO))
        if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
          // If unified memory is present any target link or to variables
          // can access host addresses directly. There is no longer a
          // need for device copies.
          cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *));
          DP("Copy linked variable host address (" DPxMOD
             ") to device address (" DPxMOD ")\n",
             DPxPTR(*((void **)E->addr)), DPxPTR(CUPtr));
        }

        addOffloadEntry(DeviceId, Entry);

        continue;
      }

      CUfunction Func;
      Err = cuModuleGetFunction(&Func, Module, E->name);
      // We keep this style here because we need the name
      if (Err != CUDA_SUCCESS) {
        REPORT("Loading '%s' Failed\n", E->name);
        CUDA_ERR_STRING(Err);
        return nullptr;
      }

      DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
         DPxPTR(E - HostBegin), E->name, DPxPTR(Func));

      // default value GENERIC (in case symbol is missing from cubin file)
      int8_t ExecModeVal = ExecutionModeType::GENERIC;
      std::string ExecModeNameStr(E->name);
      ExecModeNameStr += "_exec_mode";
      const char *ExecModeName = ExecModeNameStr.c_str();

      CUdeviceptr ExecModePtr;
      size_t CUSize;
      Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName);
      if (Err == CUDA_SUCCESS) {
        if (CUSize != sizeof(int8_t)) {
          DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
             ExecModeName, CUSize, sizeof(int8_t));
          return nullptr;
        }

        Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize);
        if (Err != CUDA_SUCCESS) {
          REPORT("Error when copying data from device to host. Pointers: "
                 "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
                 DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize);
          CUDA_ERR_STRING(Err);
          return nullptr;
        }

        // Valid modes are the ExecutionModeType values 0..2.
        if (ExecModeVal < 0 || ExecModeVal > 2) {
          DP("Error wrong exec_mode value specified in cubin file: %d\n",
             ExecModeVal);
          return nullptr;
        }
      } else {
        DP("Loading global exec_mode '%s' - symbol missing, using default "
           "value GENERIC (1)\n",
           ExecModeName);
      }

      KernelsList.emplace_back(Func, ExecModeVal);

      __tgt_offload_entry Entry = *E;
      Entry.addr = &KernelsList.back();
      addOffloadEntry(DeviceId, Entry);
    }

    // send device environment data to the device
    {
      omptarget_device_environmentTy DeviceEnv{0};

#ifdef OMPTARGET_DEBUG
      if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG"))
        DeviceEnv.debug_level = std::stoi(EnvStr);
#endif

      const char *DeviceEnvName = "omptarget_device_environment";
      CUdeviceptr DeviceEnvPtr;
      size_t CUSize;

      Err = cuModuleGetGlobal(&DeviceEnvPtr, &CUSize, Module, DeviceEnvName);
      if (Err == CUDA_SUCCESS) {
        if (CUSize != sizeof(DeviceEnv)) {
          // NOTE(review): the message prints sizeof(int32_t) although the
          // comparison uses sizeof(DeviceEnv) (same value today), and Err is
          // CUDA_SUCCESS on this path so CUDA_ERR_STRING reports nothing
          // useful — looks like a latent inconsistency; confirm upstream.
          REPORT(
              "Global device_environment '%s' - size mismatch (%zu != %zu)\n",
              DeviceEnvName, CUSize, sizeof(int32_t));
          CUDA_ERR_STRING(Err);
          return nullptr;
        }

        Err = cuMemcpyHtoD(DeviceEnvPtr, &DeviceEnv, CUSize);
        if (Err != CUDA_SUCCESS) {
          REPORT("Error when copying data from host to device. Pointers: "
                 "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
                 DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize);
          CUDA_ERR_STRING(Err);
          return nullptr;
        }

        DP("Sending global device environment data %zu bytes\n", CUSize);
      } else {
        DP("Finding global device environment '%s' - symbol missing.\n",
           DeviceEnvName);
        DP("Continue, considering this is a device RTL which does not accept "
           "environment setting.\n");
      }
    }

    return getOffloadEntriesTable(DeviceId);
  }

  /// Allocate \p Size bytes on device \p DeviceId. Default/device allocations
  /// go through the memory manager when enabled; host-pinned and managed
  /// allocations always go straight to the device allocator.
  void *dataAlloc(const int DeviceId, const int64_t Size,
                  const TargetAllocTy Kind) {
    switch (Kind) {
    case TARGET_ALLOC_DEFAULT:
    case TARGET_ALLOC_DEVICE:
      if (UseMemoryManager)
        return MemoryManagers[DeviceId]->allocate(Size, nullptr);
      else
        return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
    case TARGET_ALLOC_HOST:
    case TARGET_ALLOC_SHARED:
      return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
    }

    REPORT("Invalid target data allocation kind or requested allocator not "
           "implemented yet\n");

    return nullptr;
  }

  /// Enqueue an asynchronous host-to-device copy on the stream bound to
  /// \p AsyncInfo. Returns OFFLOAD_SUCCESS once enqueued (not completed).
  int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr,
                 const int64_t Size, __tgt_async_info *AsyncInfo) const {
    assert(AsyncInfo && "AsyncInfo is nullptr");

    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    CUstream Stream = getStream(DeviceId, AsyncInfo);

    Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream);
    if (Err != CUDA_SUCCESS) {
      DP("Error when copying data from host to device. Pointers: host "
         "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
         DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
      CUDA_ERR_STRING(Err);
      return OFFLOAD_FAIL;
    }

    return OFFLOAD_SUCCESS;
  }

  /// Enqueue an asynchronous device-to-host copy on the stream bound to
  /// \p AsyncInfo. Returns OFFLOAD_SUCCESS once enqueued (not completed).
  int dataRetrieve(const int DeviceId, void *HstPtr, const void *TgtPtr,
                   const int64_t Size, __tgt_async_info *AsyncInfo) const {
    assert(AsyncInfo && "AsyncInfo is nullptr");

    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    CUstream Stream = getStream(DeviceId, AsyncInfo);

    Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
    if (Err != CUDA_SUCCESS) {
      DP("Error when copying data from device to host. Pointers: host "
         "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
         DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
      CUDA_ERR_STRING(Err);
      return OFFLOAD_FAIL;
    }

    return OFFLOAD_SUCCESS;
  }

  /// Copy \p Size bytes between two devices. Tries peer-to-peer first and
  /// falls back to an ordinary device-to-device copy on any P2P failure.
  int dataExchange(int SrcDevId, const void *SrcPtr, int DstDevId, void *DstPtr,
                   int64_t Size, __tgt_async_info *AsyncInfo) const {
    assert(AsyncInfo && "AsyncInfo is nullptr");

    CUresult Err = cuCtxSetCurrent(DeviceData[SrcDevId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    CUstream Stream = getStream(SrcDevId, AsyncInfo);

    // If they are two devices, we try peer to peer copy first
    if (SrcDevId != DstDevId) {
      int CanAccessPeer = 0;
      Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId);
      if (Err != CUDA_SUCCESS) {
        REPORT("Error returned from cuDeviceCanAccessPeer. src = %" PRId32
               ", dst = %" PRId32 "\n",
               SrcDevId, DstDevId);
        CUDA_ERR_STRING(Err);
        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
      }

      if (!CanAccessPeer) {
        DP("P2P memcpy not supported so fall back to D2D memcpy");
        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
      }

      Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0);
      if (Err != CUDA_SUCCESS) {
        REPORT("Error returned from cuCtxEnablePeerAccess. src = %" PRId32
               ", dst = %" PRId32 "\n",
               SrcDevId, DstDevId);
        CUDA_ERR_STRING(Err);
        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
      }

      Err = cuMemcpyPeerAsync((CUdeviceptr)DstPtr, DeviceData[DstDevId].Context,
                              (CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context,
                              Size, Stream);
      if (Err == CUDA_SUCCESS)
        return OFFLOAD_SUCCESS;

      DP("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD
         ", src_id =%" PRId32 ", dst_ptr = " DPxMOD ", dst_id =%" PRId32 "\n",
         DPxPTR(SrcPtr), SrcDevId, DPxPTR(DstPtr), DstDevId);
      CUDA_ERR_STRING(Err);
    }

    return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
  }

  /// Free a target pointer, routing through the memory manager when enabled.
  int dataDelete(const int DeviceId, void *TgtPtr) {
    if (UseMemoryManager)
      return MemoryManagers[DeviceId]->free(TgtPtr);

    return DeviceAllocators[DeviceId].free(TgtPtr);
  }

  int runTargetTeamRegion(const int DeviceId, void *TgtEntryPtr, void **TgtArgs,
                          ptrdiff_t *TgtOffsets, const int ArgNum,
                          const int TeamNum, const int ThreadLimit,
                          const unsigned int LoopTripCount,
                          __tgt_async_info *AsyncInfo) const {
    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    // All args are references.
1020 std::vector<void *> Args(ArgNum); 1021 std::vector<void *> Ptrs(ArgNum); 1022 1023 for (int I = 0; I < ArgNum; ++I) { 1024 Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]); 1025 Args[I] = &Ptrs[I]; 1026 } 1027 1028 KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr); 1029 1030 int CudaThreadsPerBlock; 1031 if (ThreadLimit > 0) { 1032 DP("Setting CUDA threads per block to requested %d\n", ThreadLimit); 1033 CudaThreadsPerBlock = ThreadLimit; 1034 // Add master warp if necessary 1035 if (KernelInfo->ExecutionMode == GENERIC) { 1036 DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize); 1037 CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize; 1038 } 1039 } else { 1040 DP("Setting CUDA threads per block to default %d\n", 1041 DeviceData[DeviceId].NumThreads); 1042 CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads; 1043 } 1044 1045 if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) { 1046 DP("Threads per block capped at device limit %d\n", 1047 DeviceData[DeviceId].ThreadsPerBlock); 1048 CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock; 1049 } 1050 1051 if (!KernelInfo->MaxThreadsPerBlock) { 1052 Err = cuFuncGetAttribute(&KernelInfo->MaxThreadsPerBlock, 1053 CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, 1054 KernelInfo->Func); 1055 if (!checkResult(Err, "Error returned from cuFuncGetAttribute\n")) 1056 return OFFLOAD_FAIL; 1057 } 1058 1059 if (KernelInfo->MaxThreadsPerBlock < CudaThreadsPerBlock) { 1060 DP("Threads per block capped at kernel limit %d\n", 1061 KernelInfo->MaxThreadsPerBlock); 1062 CudaThreadsPerBlock = KernelInfo->MaxThreadsPerBlock; 1063 } 1064 1065 unsigned int CudaBlocksPerGrid; 1066 if (TeamNum <= 0) { 1067 if (LoopTripCount > 0 && EnvNumTeams < 0) { 1068 if (KernelInfo->ExecutionMode == SPMD) { 1069 // We have a combined construct, i.e. `target teams distribute 1070 // parallel for [simd]`. We launch so many teams so that each thread 1071 // will execute one iteration of the loop. 
round up to the nearest 1072 // integer 1073 CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1; 1074 } else if (KernelInfo->ExecutionMode == GENERIC) { 1075 // If we reach this point, then we have a non-combined construct, i.e. 1076 // `teams distribute` with a nested `parallel for` and each team is 1077 // assigned one iteration of the `distribute` loop. E.g.: 1078 // 1079 // #pragma omp target teams distribute 1080 // for(...loop_tripcount...) { 1081 // #pragma omp parallel for 1082 // for(...) {} 1083 // } 1084 // 1085 // Threads within a team will execute the iterations of the `parallel` 1086 // loop. 1087 CudaBlocksPerGrid = LoopTripCount; 1088 } else if (KernelInfo->ExecutionMode == SPMD_GENERIC) { 1089 // If we reach this point, then we are executing a kernel that was 1090 // transformed from Generic-mode to SPMD-mode. This kernel has 1091 // SPMD-mode execution, but needs its blocks to be scheduled 1092 // differently because the current loop trip count only applies to the 1093 // `teams distribute` region and will create var too few blocks using 1094 // the regular SPMD-mode method. 
1095 CudaBlocksPerGrid = LoopTripCount; 1096 } else { 1097 REPORT("Unknown execution mode: %d\n", KernelInfo->ExecutionMode); 1098 return OFFLOAD_FAIL; 1099 } 1100 DP("Using %d teams due to loop trip count %" PRIu32 1101 " and number of threads per block %d\n", 1102 CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock); 1103 } else { 1104 DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams); 1105 CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams; 1106 } 1107 } else if (TeamNum > DeviceData[DeviceId].BlocksPerGrid) { 1108 DP("Capping number of teams to team limit %d\n", 1109 DeviceData[DeviceId].BlocksPerGrid); 1110 CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid; 1111 } else { 1112 DP("Using requested number of teams %d\n", TeamNum); 1113 CudaBlocksPerGrid = TeamNum; 1114 } 1115 1116 INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, 1117 "Launching kernel %s with %d blocks and %d threads in %s " 1118 "mode\n", 1119 (getOffloadEntry(DeviceId, TgtEntryPtr)) 1120 ? getOffloadEntry(DeviceId, TgtEntryPtr)->name 1121 : "(null)", 1122 CudaBlocksPerGrid, CudaThreadsPerBlock, 1123 (KernelInfo->ExecutionMode != SPMD 1124 ? (KernelInfo->ExecutionMode == GENERIC ? 
"Generic" 1125 : "SPMD-Generic") 1126 : "SPMD")); 1127 1128 CUstream Stream = getStream(DeviceId, AsyncInfo); 1129 Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1, 1130 /* gridDimZ */ 1, CudaThreadsPerBlock, 1131 /* blockDimY */ 1, /* blockDimZ */ 1, 1132 /* sharedMemBytes */ 0, Stream, &Args[0], nullptr); 1133 if (!checkResult(Err, "Error returned from cuLaunchKernel\n")) 1134 return OFFLOAD_FAIL; 1135 1136 DP("Launch of entry point at " DPxMOD " successful!\n", 1137 DPxPTR(TgtEntryPtr)); 1138 1139 return OFFLOAD_SUCCESS; 1140 } 1141 1142 int synchronize(const int DeviceId, __tgt_async_info *AsyncInfo) const { 1143 CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue); 1144 CUresult Err = cuStreamSynchronize(Stream); 1145 1146 // Once the stream is synchronized, return it to stream pool and reset 1147 // AsyncInfo. This is to make sure the synchronization only works for its 1148 // own tasks. 1149 StreamManager->returnStream(DeviceId, 1150 reinterpret_cast<CUstream>(AsyncInfo->Queue)); 1151 AsyncInfo->Queue = nullptr; 1152 1153 if (Err != CUDA_SUCCESS) { 1154 DP("Error when synchronizing stream. stream = " DPxMOD 1155 ", async info ptr = " DPxMOD "\n", 1156 DPxPTR(Stream), DPxPTR(AsyncInfo)); 1157 CUDA_ERR_STRING(Err); 1158 } 1159 return (Err == CUDA_SUCCESS) ? 
OFFLOAD_SUCCESS : OFFLOAD_FAIL; 1160 } 1161 1162 void printDeviceInfo(int32_t device_id) { 1163 char TmpChar[1000]; 1164 std::string TmpStr; 1165 size_t TmpSt; 1166 int TmpInt, TmpInt2, TmpInt3; 1167 1168 CUdevice Device; 1169 checkResult(cuDeviceGet(&Device, device_id), 1170 "Error returned from cuCtxGetDevice\n"); 1171 1172 cuDriverGetVersion(&TmpInt); 1173 printf(" CUDA Driver Version: \t\t%d \n", TmpInt); 1174 printf(" CUDA Device Number: \t\t%d \n", device_id); 1175 checkResult(cuDeviceGetName(TmpChar, 1000, Device), 1176 "Error returned from cuDeviceGetName\n"); 1177 printf(" Device Name: \t\t\t%s \n", TmpChar); 1178 checkResult(cuDeviceTotalMem(&TmpSt, Device), 1179 "Error returned from cuDeviceTotalMem\n"); 1180 printf(" Global Memory Size: \t\t%zu bytes \n", TmpSt); 1181 checkResult(cuDeviceGetAttribute( 1182 &TmpInt, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, Device), 1183 "Error returned from cuDeviceGetAttribute\n"); 1184 printf(" Number of Multiprocessors: \t\t%d \n", TmpInt); 1185 checkResult( 1186 cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, Device), 1187 "Error returned from cuDeviceGetAttribute\n"); 1188 printf(" Concurrent Copy and Execution: \t%s \n", BOOL2TEXT(TmpInt)); 1189 checkResult(cuDeviceGetAttribute( 1190 &TmpInt, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, Device), 1191 "Error returned from cuDeviceGetAttribute\n"); 1192 printf(" Total Constant Memory: \t\t%d bytes\n", TmpInt); 1193 checkResult( 1194 cuDeviceGetAttribute( 1195 &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, Device), 1196 "Error returned from cuDeviceGetAttribute\n"); 1197 printf(" Max Shared Memory per Block: \t%d bytes \n", TmpInt); 1198 checkResult( 1199 cuDeviceGetAttribute( 1200 &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, Device), 1201 "Error returned from cuDeviceGetAttribute\n"); 1202 printf(" Registers per Block: \t\t%d \n", TmpInt); 1203 checkResult( 1204 cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device), 
1205 "Error returned from cuDeviceGetAttribute\n"); 1206 printf(" Warp Size: \t\t\t\t%d Threads \n", TmpInt); 1207 checkResult(cuDeviceGetAttribute( 1208 &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, Device), 1209 "Error returned from cuDeviceGetAttribute\n"); 1210 printf(" Maximum Threads per Block: \t\t%d \n", TmpInt); 1211 checkResult(cuDeviceGetAttribute( 1212 &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device), 1213 "Error returned from cuDeviceGetAttribute\n"); 1214 checkResult(cuDeviceGetAttribute( 1215 &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, Device), 1216 "Error returned from cuDeviceGetAttribute\n"); 1217 checkResult(cuDeviceGetAttribute( 1218 &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, Device), 1219 "Error returned from cuDeviceGetAttribute\n"); 1220 printf(" Maximum Block Dimensions: \t\t%d, %d, %d \n", TmpInt, TmpInt2, 1221 TmpInt3); 1222 checkResult(cuDeviceGetAttribute( 1223 &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, Device), 1224 "Error returned from cuDeviceGetAttribute\n"); 1225 checkResult(cuDeviceGetAttribute( 1226 &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, Device), 1227 "Error returned from cuDeviceGetAttribute\n"); 1228 checkResult(cuDeviceGetAttribute( 1229 &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, Device), 1230 "Error returned from cuDeviceGetAttribute\n"); 1231 printf(" Maximum Grid Dimensions: \t\t%d x %d x %d \n", TmpInt, TmpInt2, 1232 TmpInt3); 1233 checkResult( 1234 cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_PITCH, Device), 1235 "Error returned from cuDeviceGetAttribute\n"); 1236 printf(" Maximum Memory Pitch: \t\t%d bytes \n", TmpInt); 1237 checkResult(cuDeviceGetAttribute( 1238 &TmpInt, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, Device), 1239 "Error returned from cuDeviceGetAttribute\n"); 1240 printf(" Texture Alignment: \t\t\t%d bytes \n", TmpInt); 1241 checkResult( 1242 cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, Device), 1243 "Error returned from cuDeviceGetAttribute\n"); 1244 
printf(" Clock Rate: \t\t\t%d kHz\n", TmpInt); 1245 checkResult(cuDeviceGetAttribute( 1246 &TmpInt, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, Device), 1247 "Error returned from cuDeviceGetAttribute\n"); 1248 printf(" Execution Timeout: \t\t\t%s \n", BOOL2TEXT(TmpInt)); 1249 checkResult( 1250 cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_INTEGRATED, Device), 1251 "Error returned from cuDeviceGetAttribute\n"); 1252 printf(" Integrated Device: \t\t\t%s \n", BOOL2TEXT(TmpInt)); 1253 checkResult(cuDeviceGetAttribute( 1254 &TmpInt, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, Device), 1255 "Error returned from cuDeviceGetAttribute\n"); 1256 printf(" Can Map Host Memory: \t\t%s \n", BOOL2TEXT(TmpInt)); 1257 checkResult( 1258 cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, Device), 1259 "Error returned from cuDeviceGetAttribute\n"); 1260 if (TmpInt == CU_COMPUTEMODE_DEFAULT) 1261 TmpStr = "DEFAULT"; 1262 else if (TmpInt == CU_COMPUTEMODE_PROHIBITED) 1263 TmpStr = "PROHIBITED"; 1264 else if (TmpInt == CU_COMPUTEMODE_EXCLUSIVE_PROCESS) 1265 TmpStr = "EXCLUSIVE PROCESS"; 1266 else 1267 TmpStr = "unknown"; 1268 printf(" Compute Mode: \t\t\t%s \n", TmpStr.c_str()); 1269 checkResult(cuDeviceGetAttribute( 1270 &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, Device), 1271 "Error returned from cuDeviceGetAttribute\n"); 1272 printf(" Concurrent Kernels: \t\t%s \n", BOOL2TEXT(TmpInt)); 1273 checkResult( 1274 cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, Device), 1275 "Error returned from cuDeviceGetAttribute\n"); 1276 printf(" ECC Enabled: \t\t\t%s \n", BOOL2TEXT(TmpInt)); 1277 checkResult(cuDeviceGetAttribute( 1278 &TmpInt, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, Device), 1279 "Error returned from cuDeviceGetAttribute\n"); 1280 printf(" Memory Clock Rate: \t\t\t%d kHz\n", TmpInt); 1281 checkResult( 1282 cuDeviceGetAttribute( 1283 &TmpInt, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, Device), 1284 "Error returned from cuDeviceGetAttribute\n"); 1285 
printf(" Memory Bus Width: \t\t\t%d bits\n", TmpInt); 1286 checkResult(cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, 1287 Device), 1288 "Error returned from cuDeviceGetAttribute\n"); 1289 printf(" L2 Cache Size: \t\t\t%d bytes \n", TmpInt); 1290 checkResult(cuDeviceGetAttribute( 1291 &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, 1292 Device), 1293 "Error returned from cuDeviceGetAttribute\n"); 1294 printf(" Max Threads Per SMP: \t\t%d \n", TmpInt); 1295 checkResult(cuDeviceGetAttribute( 1296 &TmpInt, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, Device), 1297 "Error returned from cuDeviceGetAttribute\n"); 1298 printf(" Async Engines: \t\t\t%s (%d) \n", BOOL2TEXT(TmpInt), TmpInt); 1299 checkResult(cuDeviceGetAttribute( 1300 &TmpInt, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, Device), 1301 "Error returned from cuDeviceGetAttribute\n"); 1302 printf(" Unified Addressing: \t\t%s \n", BOOL2TEXT(TmpInt)); 1303 checkResult(cuDeviceGetAttribute( 1304 &TmpInt, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, Device), 1305 "Error returned from cuDeviceGetAttribute\n"); 1306 printf(" Managed Memory: \t\t\t%s \n", BOOL2TEXT(TmpInt)); 1307 checkResult( 1308 cuDeviceGetAttribute( 1309 &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, Device), 1310 "Error returned from cuDeviceGetAttribute\n"); 1311 printf(" Concurrent Managed Memory: \t\t%s \n", BOOL2TEXT(TmpInt)); 1312 checkResult( 1313 cuDeviceGetAttribute( 1314 &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, Device), 1315 "Error returned from cuDeviceGetAttribute\n"); 1316 printf(" Preemption Supported: \t\t%s \n", BOOL2TEXT(TmpInt)); 1317 checkResult(cuDeviceGetAttribute( 1318 &TmpInt, CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, Device), 1319 "Error returned from cuDeviceGetAttribute\n"); 1320 printf(" Cooperative Launch: \t\t%s \n", BOOL2TEXT(TmpInt)); 1321 checkResult(cuDeviceGetAttribute( 1322 &TmpInt, CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD, Device), 1323 "Error returned from cuDeviceGetAttribute\n"); 
1324 printf(" Multi-Device Boars: \t\t%s \n", BOOL2TEXT(TmpInt)); 1325 checkResult( 1326 cuDeviceGetAttribute( 1327 &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, Device), 1328 "Error returned from cuDeviceGetAttribute\n"); 1329 checkResult( 1330 cuDeviceGetAttribute( 1331 &TmpInt2, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, Device), 1332 "Error returned from cuDeviceGetAttribute\n"); 1333 printf(" Compute Capabilities: \t\t%d%d \n", TmpInt, TmpInt2); 1334 } 1335 }; 1336 1337 DeviceRTLTy DeviceRTL; 1338 } // namespace 1339 1340 // Exposed library API function 1341 #ifdef __cplusplus 1342 extern "C" { 1343 #endif 1344 1345 int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) { 1346 return elf_check_machine(image, /* EM_CUDA */ 190); 1347 } 1348 1349 int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); } 1350 1351 int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) { 1352 DP("Init requires flags to %" PRId64 "\n", RequiresFlags); 1353 DeviceRTL.setRequiresFlag(RequiresFlags); 1354 return RequiresFlags; 1355 } 1356 1357 int32_t __tgt_rtl_is_data_exchangable(int32_t src_dev_id, int dst_dev_id) { 1358 if (DeviceRTL.isValidDeviceId(src_dev_id) && 1359 DeviceRTL.isValidDeviceId(dst_dev_id)) 1360 return 1; 1361 1362 return 0; 1363 } 1364 1365 int32_t __tgt_rtl_init_device(int32_t device_id) { 1366 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1367 1368 return DeviceRTL.initDevice(device_id); 1369 } 1370 1371 __tgt_target_table *__tgt_rtl_load_binary(int32_t device_id, 1372 __tgt_device_image *image) { 1373 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1374 1375 return DeviceRTL.loadBinary(device_id, image); 1376 } 1377 1378 void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *, 1379 int32_t kind) { 1380 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1381 1382 return DeviceRTL.dataAlloc(device_id, size, (TargetAllocTy)kind); 1383 } 1384 1385 
int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr, 1386 int64_t size) { 1387 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1388 1389 __tgt_async_info AsyncInfo; 1390 const int32_t rc = __tgt_rtl_data_submit_async(device_id, tgt_ptr, hst_ptr, 1391 size, &AsyncInfo); 1392 if (rc != OFFLOAD_SUCCESS) 1393 return OFFLOAD_FAIL; 1394 1395 return __tgt_rtl_synchronize(device_id, &AsyncInfo); 1396 } 1397 1398 int32_t __tgt_rtl_data_submit_async(int32_t device_id, void *tgt_ptr, 1399 void *hst_ptr, int64_t size, 1400 __tgt_async_info *async_info_ptr) { 1401 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1402 assert(async_info_ptr && "async_info_ptr is nullptr"); 1403 1404 return DeviceRTL.dataSubmit(device_id, tgt_ptr, hst_ptr, size, 1405 async_info_ptr); 1406 } 1407 1408 int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr, 1409 int64_t size) { 1410 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1411 1412 __tgt_async_info AsyncInfo; 1413 const int32_t rc = __tgt_rtl_data_retrieve_async(device_id, hst_ptr, tgt_ptr, 1414 size, &AsyncInfo); 1415 if (rc != OFFLOAD_SUCCESS) 1416 return OFFLOAD_FAIL; 1417 1418 return __tgt_rtl_synchronize(device_id, &AsyncInfo); 1419 } 1420 1421 int32_t __tgt_rtl_data_retrieve_async(int32_t device_id, void *hst_ptr, 1422 void *tgt_ptr, int64_t size, 1423 __tgt_async_info *async_info_ptr) { 1424 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1425 assert(async_info_ptr && "async_info_ptr is nullptr"); 1426 1427 return DeviceRTL.dataRetrieve(device_id, hst_ptr, tgt_ptr, size, 1428 async_info_ptr); 1429 } 1430 1431 int32_t __tgt_rtl_data_exchange_async(int32_t src_dev_id, void *src_ptr, 1432 int dst_dev_id, void *dst_ptr, 1433 int64_t size, 1434 __tgt_async_info *AsyncInfo) { 1435 assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid"); 1436 
assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid"); 1437 assert(AsyncInfo && "AsyncInfo is nullptr"); 1438 1439 return DeviceRTL.dataExchange(src_dev_id, src_ptr, dst_dev_id, dst_ptr, size, 1440 AsyncInfo); 1441 } 1442 1443 int32_t __tgt_rtl_data_exchange(int32_t src_dev_id, void *src_ptr, 1444 int32_t dst_dev_id, void *dst_ptr, 1445 int64_t size) { 1446 assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid"); 1447 assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid"); 1448 1449 __tgt_async_info AsyncInfo; 1450 const int32_t rc = __tgt_rtl_data_exchange_async( 1451 src_dev_id, src_ptr, dst_dev_id, dst_ptr, size, &AsyncInfo); 1452 if (rc != OFFLOAD_SUCCESS) 1453 return OFFLOAD_FAIL; 1454 1455 return __tgt_rtl_synchronize(src_dev_id, &AsyncInfo); 1456 } 1457 1458 int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) { 1459 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1460 1461 return DeviceRTL.dataDelete(device_id, tgt_ptr); 1462 } 1463 1464 int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr, 1465 void **tgt_args, 1466 ptrdiff_t *tgt_offsets, 1467 int32_t arg_num, int32_t team_num, 1468 int32_t thread_limit, 1469 uint64_t loop_tripcount) { 1470 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1471 1472 __tgt_async_info AsyncInfo; 1473 const int32_t rc = __tgt_rtl_run_target_team_region_async( 1474 device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num, 1475 thread_limit, loop_tripcount, &AsyncInfo); 1476 if (rc != OFFLOAD_SUCCESS) 1477 return OFFLOAD_FAIL; 1478 1479 return __tgt_rtl_synchronize(device_id, &AsyncInfo); 1480 } 1481 1482 int32_t __tgt_rtl_run_target_team_region_async( 1483 int32_t device_id, void *tgt_entry_ptr, void **tgt_args, 1484 ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num, 1485 int32_t thread_limit, uint64_t loop_tripcount, 1486 __tgt_async_info *async_info_ptr) { 1487 
assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1488 1489 return DeviceRTL.runTargetTeamRegion( 1490 device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num, 1491 thread_limit, loop_tripcount, async_info_ptr); 1492 } 1493 1494 int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr, 1495 void **tgt_args, ptrdiff_t *tgt_offsets, 1496 int32_t arg_num) { 1497 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1498 1499 __tgt_async_info AsyncInfo; 1500 const int32_t rc = __tgt_rtl_run_target_region_async( 1501 device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, &AsyncInfo); 1502 if (rc != OFFLOAD_SUCCESS) 1503 return OFFLOAD_FAIL; 1504 1505 return __tgt_rtl_synchronize(device_id, &AsyncInfo); 1506 } 1507 1508 int32_t __tgt_rtl_run_target_region_async(int32_t device_id, 1509 void *tgt_entry_ptr, void **tgt_args, 1510 ptrdiff_t *tgt_offsets, 1511 int32_t arg_num, 1512 __tgt_async_info *async_info_ptr) { 1513 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1514 1515 return __tgt_rtl_run_target_team_region_async( 1516 device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, 1517 /* team num*/ 1, /* thread_limit */ 1, /* loop_tripcount */ 0, 1518 async_info_ptr); 1519 } 1520 1521 int32_t __tgt_rtl_synchronize(int32_t device_id, 1522 __tgt_async_info *async_info_ptr) { 1523 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1524 assert(async_info_ptr && "async_info_ptr is nullptr"); 1525 assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr"); 1526 1527 return DeviceRTL.synchronize(device_id, async_info_ptr); 1528 } 1529 1530 void __tgt_rtl_set_info_flag(uint32_t NewInfoLevel) { 1531 std::atomic<uint32_t> &InfoLevel = getInfoLevelInternal(); 1532 InfoLevel.store(NewInfoLevel); 1533 } 1534 1535 void __tgt_rtl_print_device_info(int32_t device_id) { 1536 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1537 
DeviceRTL.printDeviceInfo(device_id); 1538 } 1539 1540 #ifdef __cplusplus 1541 } 1542 #endif 1543