//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// RTL for CUDA machine
//
//===----------------------------------------------------------------------===//

#include <cassert>
#include <cstddef>
#include <cuda.h>
#include <list>
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>
#include <vector>

#include "Debug.h"
#include "omptargetplugin.h"

// TARGET_NAME/DEBUG_PREFIX must be defined before Debug.h's macros (DP/REPORT)
// are used so messages are tagged with this plugin's name.
#define TARGET_NAME CUDA
#define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"

#include "MemoryManager.h"

// Utility for retrieving and printing CUDA error string.
// Translates a CUresult into a human-readable message via cuGetErrorString.
// In debug builds with debugging enabled, unknown/unresolvable codes are
// reported explicitly; otherwise only a successfully resolved string is
// printed.
#ifdef OMPTARGET_DEBUG
#define CUDA_ERR_STRING(err)                                                   \
  do {                                                                         \
    if (getDebugLevel() > 0) {                                                 \
      const char *errStr = nullptr;                                           \
      CUresult errStr_status = cuGetErrorString(err, &errStr);                 \
      if (errStr_status == CUDA_ERROR_INVALID_VALUE)                           \
        REPORT("Unrecognized CUDA error code: %d\n", err);                     \
      else if (errStr_status == CUDA_SUCCESS)                                  \
        REPORT("CUDA error is: %s\n", errStr);                                 \
      else {                                                                   \
        REPORT("Unresolved CUDA error code: %d\n", err);                       \
        REPORT("Unsuccessful cuGetErrorString return status: %d\n",            \
               errStr_status);                                                 \
      }                                                                        \
    } else {                                                                   \
      /* Debug build but debugging disabled: fall back to the terse form. */   \
      const char *errStr = nullptr;                                            \
      CUresult errStr_status = cuGetErrorString(err, &errStr);                 \
      if (errStr_status == CUDA_SUCCESS)                                       \
        REPORT("%s \n", errStr);                                               \
    }                                                                          \
  } while (false)
#else // OMPTARGET_DEBUG
#define CUDA_ERR_STRING(err)                                                   \
  do {                                                                         \
    const char *errStr = nullptr;                                              \
    CUresult errStr_status = cuGetErrorString(err, &errStr);                   \
    if (errStr_status == CUDA_SUCCESS)                                         \
      REPORT("%s \n", errStr);                                                 \
  } while (false)
#endif // OMPTARGET_DEBUG
#define BOOL2TEXT(b) ((b) ? "Yes" : "No")

#include "elf_common.h"

/// Keep entries table per device.
struct FuncOrGblEntryTy {
  // Table exposed to libomptarget; its begin/end point into Entries.
  __tgt_target_table Table;
  std::vector<__tgt_offload_entry> Entries;
};

enum ExecutionModeType {
  SPMD,         // constructors, destructors,
                // combined constructs (`teams distribute parallel for [simd]`)
  GENERIC,      // everything else
  SPMD_GENERIC, // Generic kernel with SPMD execution
  NONE
};

/// Use a single entity to encode a kernel and a set of flags.
struct KernelTy {
  CUfunction Func;

  // execution mode of kernel
  // 0 - SPMD mode (without master warp)
  // 1 - Generic mode (with master warp)
  // 2 - SPMD mode execution with Generic mode semantics.
  int8_t ExecutionMode;

  /// Maximal number of threads per block for this kernel.
  /// Lazily filled in (0 means "not queried yet").
  int MaxThreadsPerBlock = 0;

  KernelTy(CUfunction _Func, int8_t _ExecutionMode)
      : Func(_Func), ExecutionMode(_ExecutionMode) {}
};

/// Device environment data
/// Manually sync with the deviceRTL side for now, move to a dedicated header
/// file later.
struct omptarget_device_environmentTy {
  int32_t debug_level;
};

namespace {
// Report ErrMsg and the CUDA error string when Err is a failure; returns
// true iff Err == CUDA_SUCCESS.
bool checkResult(CUresult Err, const char *ErrMsg) {
  if (Err == CUDA_SUCCESS)
    return true;

  REPORT("%s", ErrMsg);
  CUDA_ERR_STRING(Err);
  return false;
}

// Asynchronous device-to-device copy of Size bytes on Stream. Returns
// OFFLOAD_SUCCESS/OFFLOAD_FAIL.
int memcpyDtoD(const void *SrcPtr, void *DstPtr, int64_t Size,
               CUstream Stream) {
  CUresult Err =
      cuMemcpyDtoDAsync((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, Size, Stream);

  if (Err != CUDA_SUCCESS) {
    DP("Error when copying data from device to device. Pointers: src "
       "= " DPxMOD ", dst = " DPxMOD ", size = %" PRId64 "\n",
       DPxPTR(SrcPtr), DPxPTR(DstPtr), Size);
    CUDA_ERR_STRING(Err);
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

// Structure contains per-device data
struct DeviceDataTy {
  /// List that contains all the kernels.
  std::list<KernelTy> KernelsList;

  // One entry table per loaded image; the back() is the current one.
  std::list<FuncOrGblEntryTy> FuncGblEntries;

  CUcontext Context = nullptr;
  // Device properties
  int ThreadsPerBlock = 0;
  int BlocksPerGrid = 0;
  int WarpSize = 0;
  // OpenMP properties
  int NumTeams = 0;
  int NumThreads = 0;
};

// Manages a growable per-device pool of CUDA streams.
class StreamManagerTy {
  int NumberOfDevices;
  // The initial size of stream pool
  int EnvNumInitialStreams;
  // Per-device stream mutex
  std::vector<std::unique_ptr<std::mutex>> StreamMtx;
  // Per-device stream Id indicates the next available stream in the pool
  std::vector<int> NextStreamId;
  // Per-device stream pool
  std::vector<std::vector<CUstream>> StreamPool;
  // Reference to per-device data
  std::vector<DeviceDataTy> &DeviceData;

  // If there is no CUstream left in the pool, we will resize the pool to
  // allocate more CUstream. This function should be called with device mutex,
  // and we do not resize to smaller one.
  void resizeStreamPool(const int DeviceId, const size_t NewSize) {
    std::vector<CUstream> &Pool = StreamPool[DeviceId];
    const size_t CurrentSize = Pool.size();
    assert(NewSize > CurrentSize && "new size is not larger than current size");

    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) {
      // We will return if cannot switch to the right context in case of
      // creating bunch of streams that are not corresponding to the right
      // device. The offloading will fail later because selected CUstream is
      // nullptr.
      return;
    }

    Pool.resize(NewSize, nullptr);

    // Newly added slots get fresh non-blocking streams; failures leave the
    // slot nullptr (reported but not fatal here).
    for (size_t I = CurrentSize; I < NewSize; ++I) {
      checkResult(cuStreamCreate(&Pool[I], CU_STREAM_NON_BLOCKING),
                  "Error returned from cuStreamCreate\n");
    }
  }

public:
  StreamManagerTy(const int NumberOfDevices,
                  std::vector<DeviceDataTy> &DeviceData)
      : NumberOfDevices(NumberOfDevices), EnvNumInitialStreams(32),
        DeviceData(DeviceData) {
    StreamPool.resize(NumberOfDevices);
    NextStreamId.resize(NumberOfDevices);
    StreamMtx.resize(NumberOfDevices);

    // LIBOMPTARGET_NUM_INITIAL_STREAMS overrides the default pool size (32).
    if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS"))
      EnvNumInitialStreams = std::stoi(EnvStr);

    // Initialize the next stream id
    std::fill(NextStreamId.begin(), NextStreamId.end(), 0);

    // Initialize stream mutex
    for (std::unique_ptr<std::mutex> &Ptr : StreamMtx)
      Ptr = std::make_unique<std::mutex>();
  }

  ~StreamManagerTy() {
    // Destroy streams
    for (int I = 0; I < NumberOfDevices; ++I) {
      checkResult(cuCtxSetCurrent(DeviceData[I].Context),
                  "Error returned from cuCtxSetCurrent\n");

      for (CUstream &S : StreamPool[I]) {
        if (S)
          checkResult(cuStreamDestroy(S),
                      "Error returned from cuStreamDestroy\n");
      }
    }
  }

  // Get a CUstream from pool. Per-device next stream id always points to the
  // next available CUstream. That means, CUstreams [0, id-1] have been
  // assigned, and [id,] are still available. If there is no CUstream left, we
  // will ask more CUstreams from CUDA RT. Each time a CUstream is assigned,
  // the id will increase one.
  // xxxxxs+++++++++
  //      ^
  //      id
  // After assignment, the pool becomes the following and s is assigned.
  // xxxxxs+++++++++
  //       ^
  //       id
  CUstream getStream(const int DeviceId) {
    const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
    int &Id = NextStreamId[DeviceId];
    // No CUstream left in the pool, we need to request from CUDA RT
    if (Id == static_cast<int>(StreamPool[DeviceId].size())) {
      // By default we double the stream pool every time
      // NOTE(review): if the pool is empty (Id == 0) this requests size 0 and
      // trips the assert in resizeStreamPool — relies on
      // initializeDeviceStreamPool having populated the pool first; confirm.
      resizeStreamPool(DeviceId, Id * 2);
    }
    return StreamPool[DeviceId][Id++];
  }

  // Return a CUstream back to pool. As mentioned above, per-device next
  // stream is always points to the next available CUstream, so when we return
  // a CUstream, we need to first decrease the id, and then copy the CUstream
  // back.
  // It is worth noting that, the order of streams return might be different
  // from that they're assigned, that saying, at some point, there might be
  // two identical CUstreams.
  // xxax+a+++++
  //    ^
  //    id
  // However, it doesn't matter, because they're always on the two sides of
  // id. The left one will in the end be overwritten by another CUstream.
  // Therefore, after several execution, the order of pool might be different
  // from its initial state.
  void returnStream(const int DeviceId, CUstream Stream) {
    const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
    int &Id = NextStreamId[DeviceId];
    assert(Id > 0 && "Wrong stream ID");
    StreamPool[DeviceId][--Id] = Stream;
  }

  // Create the initial stream pool for DeviceId; returns false if any stream
  // could not be created.
  bool initializeDeviceStreamPool(const int DeviceId) {
    assert(StreamPool[DeviceId].empty() && "stream pool has been initialized");

    resizeStreamPool(DeviceId, EnvNumInitialStreams);

    // Check the size of stream pool
    if (static_cast<int>(StreamPool[DeviceId].size()) != EnvNumInitialStreams)
      return false;

    // Check whether each stream is valid
    for (CUstream &S : StreamPool[DeviceId])
      if (!S)
        return false;

    return true;
  }
};

class DeviceRTLTy {
  int NumberOfDevices;
  // OpenMP environment properties
  int EnvNumTeams;
  int EnvTeamLimit;
  int EnvTeamThreadLimit;
  // OpenMP requires flags
  int64_t RequiresFlags;

  static constexpr const int HardTeamLimit = 1U << 16U; // 64k
  static constexpr const int HardThreadLimit = 1024;
  static constexpr const int DefaultNumTeams = 128;
  static constexpr const int DefaultNumThreads = 128;

  std::unique_ptr<StreamManagerTy> StreamManager;
  std::vector<DeviceDataTy> DeviceData;
  std::vector<CUmodule> Modules;

  /// A class responsible for interacting with device native runtime library to
  /// allocate and free memory.
  class CUDADeviceAllocatorTy : public DeviceAllocatorTy {
    const int DeviceId;
    const std::vector<DeviceDataTy> &DeviceData;
    // Tracks host-pinned allocations so free() can route them to
    // cuMemFreeHost instead of cuMemFree.
    std::unordered_map<void *, TargetAllocTy> HostPinnedAllocs;

  public:
    CUDADeviceAllocatorTy(int DeviceId, std::vector<DeviceDataTy> &DeviceData)
        : DeviceId(DeviceId), DeviceData(DeviceData) {}

    // Allocate Size bytes of the requested Kind (device, pinned host, or
    // managed). Returns nullptr on failure or when Size == 0.
    void *allocate(size_t Size, void *, TargetAllocTy Kind) override {
      if (Size == 0)
        return nullptr;

      CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
      if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
        return nullptr;

      void *MemAlloc = nullptr;
      switch (Kind) {
      case TARGET_ALLOC_DEFAULT:
      case TARGET_ALLOC_DEVICE:
        CUdeviceptr DevicePtr;
        Err = cuMemAlloc(&DevicePtr, Size);
        MemAlloc = (void *)DevicePtr;
        if (!checkResult(Err, "Error returned from cuMemAlloc\n"))
          return nullptr;
        break;
      case TARGET_ALLOC_HOST:
        void *HostPtr;
        Err = cuMemAllocHost(&HostPtr, Size);
        MemAlloc = HostPtr;
        if (!checkResult(Err, "Error returned from cuMemAllocHost\n"))
          return nullptr;
        HostPinnedAllocs[MemAlloc] = Kind;
        break;
      case TARGET_ALLOC_SHARED:
        CUdeviceptr SharedPtr;
        Err = cuMemAllocManaged(&SharedPtr, Size, CU_MEM_ATTACH_GLOBAL);
        MemAlloc = (void *)SharedPtr;
        if (!checkResult(Err, "Error returned from cuMemAllocManaged\n"))
          return nullptr;
        break;
      }

      return MemAlloc;
    }

    // Free a pointer previously returned by allocate(). Only host-pinned
    // allocations are tracked; everything else (device and managed) goes
    // through cuMemFree.
    int free(void *TgtPtr) override {
      CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
      if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
        return OFFLOAD_FAIL;

      // Host pinned memory must be freed differently.
      TargetAllocTy Kind =
          (HostPinnedAllocs.find(TgtPtr) == HostPinnedAllocs.end())
              ? TARGET_ALLOC_DEFAULT
              : TARGET_ALLOC_HOST;
      switch (Kind) {
      case TARGET_ALLOC_DEFAULT:
      case TARGET_ALLOC_DEVICE:
      case TARGET_ALLOC_SHARED:
        Err = cuMemFree((CUdeviceptr)TgtPtr);
        if (!checkResult(Err, "Error returned from cuMemFree\n"))
          return OFFLOAD_FAIL;
        break;
      case TARGET_ALLOC_HOST:
        Err = cuMemFreeHost(TgtPtr);
        if (!checkResult(Err, "Error returned from cuMemFreeHost\n"))
          return OFFLOAD_FAIL;
        break;
      }

      return OFFLOAD_SUCCESS;
    }
  };

  /// A vector of device allocators
  std::vector<CUDADeviceAllocatorTy> DeviceAllocators;

  /// A vector of memory managers. Since the memory manager is non-copyable and
  // non-removable, we wrap them into std::unique_ptr.
  std::vector<std::unique_ptr<MemoryManagerTy>> MemoryManagers;

  /// Whether use memory manager
  bool UseMemoryManager = true;

  // Record entry point associated with device
  void addOffloadEntry(const int DeviceId, const __tgt_offload_entry entry) {
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
    E.Entries.push_back(entry);
  }

  // Return a pointer to the entry associated with the pointer
  const __tgt_offload_entry *getOffloadEntry(const int DeviceId,
                                             const void *Addr) const {
    for (const __tgt_offload_entry &Itr :
         DeviceData[DeviceId].FuncGblEntries.back().Entries)
      if (Itr.addr == Addr)
        return &Itr;

    return nullptr;
  }

  // Return the pointer to the target entries table
  __tgt_target_table *getOffloadEntriesTable(const int DeviceId) {
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();

    if (E.Entries.empty())
      return nullptr;

    // Update table info according to the entries and return the pointer
    E.Table.EntriesBegin = E.Entries.data();
    E.Table.EntriesEnd = E.Entries.data() + E.Entries.size();

    return &E.Table;
  }

  // Clear entries table for a device
  void clearOffloadEntriesTable(const int DeviceId) {
    // A fresh FuncOrGblEntryTy becomes the new back(); older tables remain
    // alive in the list so previously handed-out pointers stay valid.
    DeviceData[DeviceId].FuncGblEntries.emplace_back();
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
    E.Entries.clear();
    E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr;
  }

  // Return the stream bound to AsyncInfo, lazily acquiring one from the
  // stream manager on first use.
  CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfo) const {
    assert(AsyncInfo && "AsyncInfo is nullptr");

    if (!AsyncInfo->Queue)
      AsyncInfo->Queue = StreamManager->getStream(DeviceId);

    return reinterpret_cast<CUstream>(AsyncInfo->Queue);
  }

public:
  // This class should not be copied
  DeviceRTLTy(const DeviceRTLTy &) = delete;
  DeviceRTLTy(DeviceRTLTy &&) = delete;

  DeviceRTLTy()
      : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1),
        EnvTeamThreadLimit(-1), RequiresFlags(OMP_REQ_UNDEFINED) {

    DP("Start initializing CUDA\n");

    CUresult Err = cuInit(0);
    if (Err == CUDA_ERROR_INVALID_HANDLE) {
      // Can't call cuGetErrorString if dlsym failed
      DP("Failed to load CUDA shared library\n");
      return;
    }
    if (!checkResult(Err, "Error returned from cuInit\n")) {
      return;
    }

    Err = cuDeviceGetCount(&NumberOfDevices);
    if (!checkResult(Err, "Error returned from cuDeviceGetCount\n"))
      return;

    if (NumberOfDevices == 0) {
      DP("There are no devices supporting CUDA.\n");
      return;
    }

    DeviceData.resize(NumberOfDevices);

    // Get environment variables regarding teams
    if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) {
      // OMP_TEAM_LIMIT has been set
      EnvTeamLimit = std::stoi(EnvStr);
      DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
    }
    if (const char *EnvStr = getenv("OMP_TEAMS_THREAD_LIMIT")) {
      // OMP_TEAMS_THREAD_LIMIT has been set
      EnvTeamThreadLimit = std::stoi(EnvStr);
      DP("Parsed OMP_TEAMS_THREAD_LIMIT=%d\n", EnvTeamThreadLimit);
    }
    if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) {
      // OMP_NUM_TEAMS has been set
EnvNumTeams = std::stoi(EnvStr);
      DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
    }

    StreamManager =
        std::make_unique<StreamManagerTy>(NumberOfDevices, DeviceData);

    for (int I = 0; I < NumberOfDevices; ++I)
      DeviceAllocators.emplace_back(I, DeviceData);

    // Get the size threshold from environment variable
    std::pair<size_t, bool> Res = MemoryManagerTy::getSizeThresholdFromEnv();
    UseMemoryManager = Res.second;
    size_t MemoryManagerThreshold = Res.first;

    if (UseMemoryManager)
      for (int I = 0; I < NumberOfDevices; ++I)
        MemoryManagers.emplace_back(std::make_unique<MemoryManagerTy>(
            DeviceAllocators[I], MemoryManagerThreshold));
  }

  ~DeviceRTLTy() {
    // We first destruct memory managers in case that its dependent data are
    // destroyed before it.
    // NOTE(review): release() relinquishes ownership without deleting, so the
    // managers are leaked at shutdown — confirm this is intentional to dodge
    // destruction-order problems with the CUDA contexts below.
    for (auto &M : MemoryManagers)
      M.release();

    StreamManager = nullptr;

    for (CUmodule &M : Modules)
      // Close module
      if (M)
        checkResult(cuModuleUnload(M), "Error returned from cuModuleUnload\n");

    for (DeviceDataTy &D : DeviceData) {
      // Destroy context
      if (D.Context) {
        checkResult(cuCtxSetCurrent(D.Context),
                    "Error returned from cuCtxSetCurrent\n");
        CUdevice Device;
        checkResult(cuCtxGetDevice(&Device),
                    "Error returned from cuCtxGetDevice\n");
        checkResult(cuDevicePrimaryCtxRelease(Device),
                    "Error returned from cuDevicePrimaryCtxRelease\n");
      }
    }
  }

  // Check whether a given DeviceId is valid
  bool isValidDeviceId(const int DeviceId) const {
    return DeviceId >= 0 && DeviceId < NumberOfDevices;
  }

  int getNumOfDevices() const { return NumberOfDevices; }

  void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; }

  // Prepare DeviceId for offloading: set up the primary context, the stream
  // pool, and the per-device launch limits (blocks/grid, threads/block, warp
  // size, stack/heap limits). Returns OFFLOAD_SUCCESS/OFFLOAD_FAIL.
  int initDevice(const int DeviceId) {
    CUdevice Device;

    DP("Getting device %d\n", DeviceId);
    CUresult Err = cuDeviceGet(&Device, DeviceId);
    if (!checkResult(Err, "Error returned from cuDeviceGet\n"))
      return OFFLOAD_FAIL;

    // Query the current flags of the primary context and set its flags if
    // it is inactive
    unsigned int FormerPrimaryCtxFlags = 0;
    int FormerPrimaryCtxIsActive = 0;
    Err = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags,
                                     &FormerPrimaryCtxIsActive);
    if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxGetState\n"))
      return OFFLOAD_FAIL;

    if (FormerPrimaryCtxIsActive) {
      DP("The primary context is active, no change to its flags\n");
      if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) !=
          CU_CTX_SCHED_BLOCKING_SYNC)
        DP("Warning the current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n");
    } else {
      DP("The primary context is inactive, set its flags to "
         "CU_CTX_SCHED_BLOCKING_SYNC\n");
      Err = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC);
      if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxSetFlags\n"))
        return OFFLOAD_FAIL;
    }

    // Retain the per device primary context and save it to use whenever this
    // device is selected.
    Err = cuDevicePrimaryCtxRetain(&DeviceData[DeviceId].Context, Device);
    if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxRetain\n"))
      return OFFLOAD_FAIL;

    Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    // Initialize stream pool
    if (!StreamManager->initializeDeviceStreamPool(DeviceId))
      return OFFLOAD_FAIL;

    // Query attributes to determine number of threads/block and blocks/grid.
    int MaxGridDimX;
    Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
                               Device);
    if (Err != CUDA_SUCCESS) {
      DP("Error getting max grid dimension, use default value %d\n",
         DeviceRTLTy::DefaultNumTeams);
      DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::DefaultNumTeams;
    } else if (MaxGridDimX <= DeviceRTLTy::HardTeamLimit) {
      DP("Using %d CUDA blocks per grid\n", MaxGridDimX);
      DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX;
    } else {
      DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping "
         "at the hard limit\n",
         MaxGridDimX, DeviceRTLTy::HardTeamLimit);
      DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::HardTeamLimit;
    }

    // We are only exploiting threads along the x axis.
    int MaxBlockDimX;
    Err = cuDeviceGetAttribute(&MaxBlockDimX,
                               CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device);
    if (Err != CUDA_SUCCESS) {
      DP("Error getting max block dimension, use default value %d\n",
         DeviceRTLTy::DefaultNumThreads);
      DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads;
    } else {
      DP("Using %d CUDA threads per block\n", MaxBlockDimX);
      DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX;

      if (EnvTeamThreadLimit > 0 &&
          DeviceData[DeviceId].ThreadsPerBlock > EnvTeamThreadLimit) {
        DP("Max CUDA threads per block %d exceeds the thread limit %d set by "
           "OMP_TEAMS_THREAD_LIMIT, capping at the limit\n",
           DeviceData[DeviceId].ThreadsPerBlock, EnvTeamThreadLimit);
        DeviceData[DeviceId].ThreadsPerBlock = EnvTeamThreadLimit;
      }
      if (DeviceData[DeviceId].ThreadsPerBlock > DeviceRTLTy::HardThreadLimit) {
        DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
           "capping at the hard limit\n",
           DeviceData[DeviceId].ThreadsPerBlock, DeviceRTLTy::HardThreadLimit);
        DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit;
      }
    }

    // Get and set warp size
    int WarpSize;
    Err =
        cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device);
    if (Err != CUDA_SUCCESS) {
      DP("Error getting warp size, assume default value 32\n");
      DeviceData[DeviceId].WarpSize = 32;
    } else {
      DP("Using warp size %d\n", WarpSize);
      DeviceData[DeviceId].WarpSize = WarpSize;
    }

    // Adjust teams to the env variables
    if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) {
      DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
         EnvTeamLimit);
      DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit;
    }

    // Stack/heap limits: env vars override; otherwise just query the
    // context's current values for the INFO report below.
    size_t StackLimit;
    size_t HeapLimit;
    if (const char *EnvStr = getenv("LIBOMPTARGET_STACK_SIZE")) {
      StackLimit = std::stol(EnvStr);
      if (cuCtxSetLimit(CU_LIMIT_STACK_SIZE, StackLimit) != CUDA_SUCCESS)
        return OFFLOAD_FAIL;
    } else {
      if (cuCtxGetLimit(&StackLimit, CU_LIMIT_STACK_SIZE) != CUDA_SUCCESS)
        return OFFLOAD_FAIL;
    }
    if (const char *EnvStr = getenv("LIBOMPTARGET_HEAP_SIZE")) {
      HeapLimit = std::stol(EnvStr);
      if (cuCtxSetLimit(CU_LIMIT_MALLOC_HEAP_SIZE, HeapLimit) != CUDA_SUCCESS)
        return OFFLOAD_FAIL;
    } else {
      if (cuCtxGetLimit(&HeapLimit, CU_LIMIT_MALLOC_HEAP_SIZE) != CUDA_SUCCESS)
        return OFFLOAD_FAIL;
    }

    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
         "Device supports up to %d CUDA blocks and %d threads with a "
         "warp size of %d\n",
         DeviceData[DeviceId].BlocksPerGrid,
         DeviceData[DeviceId].ThreadsPerBlock, DeviceData[DeviceId].WarpSize);
    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
         "Device heap size is %d Bytes, device stack size is %d Bytes per "
         "thread\n",
         (int)HeapLimit, (int)StackLimit);

    // Set default number of teams
    if (EnvNumTeams > 0) {
      DP("Default number of teams set according to environment %d\n",
         EnvNumTeams);
      DeviceData[DeviceId].NumTeams = EnvNumTeams;
    } else {
      DeviceData[DeviceId].NumTeams =
DeviceRTLTy::DefaultNumTeams; 683 DP("Default number of teams set according to library's default %d\n", 684 DeviceRTLTy::DefaultNumTeams); 685 } 686 687 if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) { 688 DP("Default number of teams exceeds device limit, capping at %d\n", 689 DeviceData[DeviceId].BlocksPerGrid); 690 DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid; 691 } 692 693 // Set default number of threads 694 DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads; 695 DP("Default number of threads set according to library's default %d\n", 696 DeviceRTLTy::DefaultNumThreads); 697 if (DeviceData[DeviceId].NumThreads > 698 DeviceData[DeviceId].ThreadsPerBlock) { 699 DP("Default number of threads exceeds device limit, capping at %d\n", 700 DeviceData[DeviceId].ThreadsPerBlock); 701 DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].ThreadsPerBlock; 702 } 703 704 return OFFLOAD_SUCCESS; 705 } 706 707 __tgt_target_table *loadBinary(const int DeviceId, 708 const __tgt_device_image *Image) { 709 // Set the context we are using 710 CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); 711 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) 712 return nullptr; 713 714 // Clear the offload table as we are going to create a new one. 715 clearOffloadEntriesTable(DeviceId); 716 717 // Create the module and extract the function pointers. 718 CUmodule Module; 719 DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart)); 720 Err = cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr); 721 if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n")) 722 return nullptr; 723 724 DP("CUDA module successfully loaded!\n"); 725 726 Modules.push_back(Module); 727 728 // Find the symbols in the module by name. 
729 const __tgt_offload_entry *HostBegin = Image->EntriesBegin; 730 const __tgt_offload_entry *HostEnd = Image->EntriesEnd; 731 732 std::list<KernelTy> &KernelsList = DeviceData[DeviceId].KernelsList; 733 for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) { 734 if (!E->addr) { 735 // We return nullptr when something like this happens, the host should 736 // have always something in the address to uniquely identify the target 737 // region. 738 DP("Invalid binary: host entry '<null>' (size = %zd)...\n", E->size); 739 return nullptr; 740 } 741 742 if (E->size) { 743 __tgt_offload_entry Entry = *E; 744 CUdeviceptr CUPtr; 745 size_t CUSize; 746 Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name); 747 // We keep this style here because we need the name 748 if (Err != CUDA_SUCCESS) { 749 REPORT("Loading global '%s' Failed\n", E->name); 750 CUDA_ERR_STRING(Err); 751 return nullptr; 752 } 753 754 if (CUSize != E->size) { 755 DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name, 756 CUSize, E->size); 757 return nullptr; 758 } 759 760 DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n", 761 DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr)); 762 763 Entry.addr = (void *)(CUPtr); 764 765 // Note: In the current implementation declare target variables 766 // can either be link or to. This means that once unified 767 // memory is activated via the requires directive, the variable 768 // can be used directly from the host in both cases. 769 // TODO: when variables types other than to or link are added, 770 // the below condition should be changed to explicitly 771 // check for to and link variables types: 772 // (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags & 773 // OMP_DECLARE_TARGET_LINK || e->flags == OMP_DECLARE_TARGET_TO)) 774 if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) { 775 // If unified memory is present any target link or to variables 776 // can access host addresses directly. 
There is no longer a 777 // need for device copies. 778 cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *)); 779 DP("Copy linked variable host address (" DPxMOD 780 ") to device address (" DPxMOD ")\n", 781 DPxPTR(*((void **)E->addr)), DPxPTR(CUPtr)); 782 } 783 784 addOffloadEntry(DeviceId, Entry); 785 786 continue; 787 } 788 789 CUfunction Func; 790 Err = cuModuleGetFunction(&Func, Module, E->name); 791 // We keep this style here because we need the name 792 if (Err != CUDA_SUCCESS) { 793 REPORT("Loading '%s' Failed\n", E->name); 794 CUDA_ERR_STRING(Err); 795 return nullptr; 796 } 797 798 DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n", 799 DPxPTR(E - HostBegin), E->name, DPxPTR(Func)); 800 801 // default value GENERIC (in case symbol is missing from cubin file) 802 int8_t ExecModeVal = ExecutionModeType::GENERIC; 803 std::string ExecModeNameStr(E->name); 804 ExecModeNameStr += "_exec_mode"; 805 const char *ExecModeName = ExecModeNameStr.c_str(); 806 807 CUdeviceptr ExecModePtr; 808 size_t CUSize; 809 Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName); 810 if (Err == CUDA_SUCCESS) { 811 if (CUSize != sizeof(int8_t)) { 812 DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n", 813 ExecModeName, CUSize, sizeof(int8_t)); 814 return nullptr; 815 } 816 817 Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize); 818 if (Err != CUDA_SUCCESS) { 819 REPORT("Error when copying data from device to host. 
Pointers: " 820 "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n", 821 DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize); 822 CUDA_ERR_STRING(Err); 823 return nullptr; 824 } 825 826 if (ExecModeVal < 0 || ExecModeVal > 2) { 827 DP("Error wrong exec_mode value specified in cubin file: %d\n", 828 ExecModeVal); 829 return nullptr; 830 } 831 } else { 832 REPORT("Loading global exec_mode '%s' - symbol missing, using default " 833 "value GENERIC (1)\n", 834 ExecModeName); 835 CUDA_ERR_STRING(Err); 836 } 837 838 KernelsList.emplace_back(Func, ExecModeVal); 839 840 __tgt_offload_entry Entry = *E; 841 Entry.addr = &KernelsList.back(); 842 addOffloadEntry(DeviceId, Entry); 843 } 844 845 // send device environment data to the device 846 { 847 omptarget_device_environmentTy DeviceEnv{0}; 848 849 #ifdef OMPTARGET_DEBUG 850 if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) 851 DeviceEnv.debug_level = std::stoi(EnvStr); 852 #endif 853 854 const char *DeviceEnvName = "omptarget_device_environment"; 855 CUdeviceptr DeviceEnvPtr; 856 size_t CUSize; 857 858 Err = cuModuleGetGlobal(&DeviceEnvPtr, &CUSize, Module, DeviceEnvName); 859 if (Err == CUDA_SUCCESS) { 860 if (CUSize != sizeof(DeviceEnv)) { 861 REPORT( 862 "Global device_environment '%s' - size mismatch (%zu != %zu)\n", 863 DeviceEnvName, CUSize, sizeof(int32_t)); 864 CUDA_ERR_STRING(Err); 865 return nullptr; 866 } 867 868 Err = cuMemcpyHtoD(DeviceEnvPtr, &DeviceEnv, CUSize); 869 if (Err != CUDA_SUCCESS) { 870 REPORT("Error when copying data from host to device. 
Pointers: " 871 "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n", 872 DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize); 873 CUDA_ERR_STRING(Err); 874 return nullptr; 875 } 876 877 DP("Sending global device environment data %zu bytes\n", CUSize); 878 } else { 879 DP("Finding global device environment '%s' - symbol missing.\n", 880 DeviceEnvName); 881 DP("Continue, considering this is a device RTL which does not accept " 882 "environment setting.\n"); 883 } 884 } 885 886 return getOffloadEntriesTable(DeviceId); 887 } 888 889 void *dataAlloc(const int DeviceId, const int64_t Size, 890 const TargetAllocTy Kind) { 891 switch (Kind) { 892 case TARGET_ALLOC_DEFAULT: 893 case TARGET_ALLOC_DEVICE: 894 if (UseMemoryManager) 895 return MemoryManagers[DeviceId]->allocate(Size, nullptr); 896 else 897 return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind); 898 case TARGET_ALLOC_HOST: 899 case TARGET_ALLOC_SHARED: 900 return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind); 901 } 902 903 REPORT("Invalid target data allocation kind or requested allocator not " 904 "implemented yet\n"); 905 906 return nullptr; 907 } 908 909 int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr, 910 const int64_t Size, __tgt_async_info *AsyncInfo) const { 911 assert(AsyncInfo && "AsyncInfo is nullptr"); 912 913 CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); 914 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) 915 return OFFLOAD_FAIL; 916 917 CUstream Stream = getStream(DeviceId, AsyncInfo); 918 919 Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream); 920 if (Err != CUDA_SUCCESS) { 921 DP("Error when copying data from host to device. 
Pointers: host "
         "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
         DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
      CUDA_ERR_STRING(Err);
      return OFFLOAD_FAIL;
    }

    return OFFLOAD_SUCCESS;
  }

  // Asynchronously copy Size bytes device->host on the stream bound to
  // AsyncInfo. Returns OFFLOAD_SUCCESS/OFFLOAD_FAIL.
  int dataRetrieve(const int DeviceId, void *HstPtr, const void *TgtPtr,
                   const int64_t Size, __tgt_async_info *AsyncInfo) const {
    assert(AsyncInfo && "AsyncInfo is nullptr");

    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    CUstream Stream = getStream(DeviceId, AsyncInfo);

    Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
    if (Err != CUDA_SUCCESS) {
      DP("Error when copying data from device to host. Pointers: host "
         "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
         DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
      CUDA_ERR_STRING(Err);
      return OFFLOAD_FAIL;
    }

    return OFFLOAD_SUCCESS;
  }

  // Copy Size bytes between two devices, preferring peer-to-peer access and
  // falling back to a plain device-to-device copy when P2P is unavailable.
  int dataExchange(int SrcDevId, const void *SrcPtr, int DstDevId, void *DstPtr,
                   int64_t Size, __tgt_async_info *AsyncInfo) const {
    assert(AsyncInfo && "AsyncInfo is nullptr");

    CUresult Err = cuCtxSetCurrent(DeviceData[SrcDevId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    CUstream Stream = getStream(SrcDevId, AsyncInfo);

    // If they are two devices, we try peer to peer copy first
    if (SrcDevId != DstDevId) {
      int CanAccessPeer = 0;
      Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId);
      if (Err != CUDA_SUCCESS) {
        REPORT("Error returned from cuDeviceCanAccessPeer.
src = %" PRId32 969 ", dst = %" PRId32 "\n", 970 SrcDevId, DstDevId); 971 CUDA_ERR_STRING(Err); 972 return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); 973 } 974 975 if (!CanAccessPeer) { 976 DP("P2P memcpy not supported so fall back to D2D memcpy"); 977 return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); 978 } 979 980 Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0); 981 if (Err != CUDA_SUCCESS) { 982 REPORT("Error returned from cuCtxEnablePeerAccess. src = %" PRId32 983 ", dst = %" PRId32 "\n", 984 SrcDevId, DstDevId); 985 CUDA_ERR_STRING(Err); 986 return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); 987 } 988 989 Err = cuMemcpyPeerAsync((CUdeviceptr)DstPtr, DeviceData[DstDevId].Context, 990 (CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context, 991 Size, Stream); 992 if (Err == CUDA_SUCCESS) 993 return OFFLOAD_SUCCESS; 994 995 DP("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD 996 ", src_id =%" PRId32 ", dst_ptr = " DPxMOD ", dst_id =%" PRId32 "\n", 997 DPxPTR(SrcPtr), SrcDevId, DPxPTR(DstPtr), DstDevId); 998 CUDA_ERR_STRING(Err); 999 } 1000 1001 return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); 1002 } 1003 1004 int dataDelete(const int DeviceId, void *TgtPtr) { 1005 if (UseMemoryManager) 1006 return MemoryManagers[DeviceId]->free(TgtPtr); 1007 1008 return DeviceAllocators[DeviceId].free(TgtPtr); 1009 } 1010 1011 int runTargetTeamRegion(const int DeviceId, void *TgtEntryPtr, void **TgtArgs, 1012 ptrdiff_t *TgtOffsets, const int ArgNum, 1013 const int TeamNum, const int ThreadLimit, 1014 const unsigned int LoopTripCount, 1015 __tgt_async_info *AsyncInfo) const { 1016 CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); 1017 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) 1018 return OFFLOAD_FAIL; 1019 1020 // All args are references. 
1021 std::vector<void *> Args(ArgNum); 1022 std::vector<void *> Ptrs(ArgNum); 1023 1024 for (int I = 0; I < ArgNum; ++I) { 1025 Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]); 1026 Args[I] = &Ptrs[I]; 1027 } 1028 1029 KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr); 1030 1031 int CudaThreadsPerBlock; 1032 if (ThreadLimit > 0) { 1033 DP("Setting CUDA threads per block to requested %d\n", ThreadLimit); 1034 CudaThreadsPerBlock = ThreadLimit; 1035 // Add master warp if necessary 1036 if (KernelInfo->ExecutionMode == GENERIC) { 1037 DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize); 1038 CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize; 1039 } 1040 } else { 1041 DP("Setting CUDA threads per block to default %d\n", 1042 DeviceData[DeviceId].NumThreads); 1043 CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads; 1044 } 1045 1046 if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) { 1047 DP("Threads per block capped at device limit %d\n", 1048 DeviceData[DeviceId].ThreadsPerBlock); 1049 CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock; 1050 } 1051 1052 if (!KernelInfo->MaxThreadsPerBlock) { 1053 Err = cuFuncGetAttribute(&KernelInfo->MaxThreadsPerBlock, 1054 CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, 1055 KernelInfo->Func); 1056 if (!checkResult(Err, "Error returned from cuFuncGetAttribute\n")) 1057 return OFFLOAD_FAIL; 1058 } 1059 1060 if (KernelInfo->MaxThreadsPerBlock < CudaThreadsPerBlock) { 1061 DP("Threads per block capped at kernel limit %d\n", 1062 KernelInfo->MaxThreadsPerBlock); 1063 CudaThreadsPerBlock = KernelInfo->MaxThreadsPerBlock; 1064 } 1065 1066 unsigned int CudaBlocksPerGrid; 1067 if (TeamNum <= 0) { 1068 if (LoopTripCount > 0 && EnvNumTeams < 0) { 1069 if (KernelInfo->ExecutionMode == SPMD) { 1070 // We have a combined construct, i.e. `target teams distribute 1071 // parallel for [simd]`. We launch so many teams so that each thread 1072 // will execute one iteration of the loop. 
round up to the nearest 1073 // integer 1074 CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1; 1075 } else if (KernelInfo->ExecutionMode == GENERIC) { 1076 // If we reach this point, then we have a non-combined construct, i.e. 1077 // `teams distribute` with a nested `parallel for` and each team is 1078 // assigned one iteration of the `distribute` loop. E.g.: 1079 // 1080 // #pragma omp target teams distribute 1081 // for(...loop_tripcount...) { 1082 // #pragma omp parallel for 1083 // for(...) {} 1084 // } 1085 // 1086 // Threads within a team will execute the iterations of the `parallel` 1087 // loop. 1088 CudaBlocksPerGrid = LoopTripCount; 1089 } else if (KernelInfo->ExecutionMode == SPMD_GENERIC) { 1090 // If we reach this point, then we are executing a kernel that was 1091 // transformed from Generic-mode to SPMD-mode. This kernel has 1092 // SPMD-mode execution, but needs its blocks to be scheduled 1093 // differently because the current loop trip count only applies to the 1094 // `teams distribute` region and will create var too few blocks using 1095 // the regular SPMD-mode method. 
1096 CudaBlocksPerGrid = LoopTripCount; 1097 } else { 1098 REPORT("Unknown execution mode: %d\n", KernelInfo->ExecutionMode); 1099 return OFFLOAD_FAIL; 1100 } 1101 DP("Using %d teams due to loop trip count %" PRIu32 1102 " and number of threads per block %d\n", 1103 CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock); 1104 } else { 1105 DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams); 1106 CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams; 1107 } 1108 } else if (TeamNum > DeviceData[DeviceId].BlocksPerGrid) { 1109 DP("Capping number of teams to team limit %d\n", 1110 DeviceData[DeviceId].BlocksPerGrid); 1111 CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid; 1112 } else { 1113 DP("Using requested number of teams %d\n", TeamNum); 1114 CudaBlocksPerGrid = TeamNum; 1115 } 1116 1117 INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, 1118 "Launching kernel %s with %d blocks and %d threads in %s " 1119 "mode\n", 1120 (getOffloadEntry(DeviceId, TgtEntryPtr)) 1121 ? getOffloadEntry(DeviceId, TgtEntryPtr)->name 1122 : "(null)", 1123 CudaBlocksPerGrid, CudaThreadsPerBlock, 1124 (KernelInfo->ExecutionMode != SPMD 1125 ? (KernelInfo->ExecutionMode == GENERIC ? 
"Generic" 1126 : "SPMD-Generic") 1127 : "SPMD")); 1128 1129 CUstream Stream = getStream(DeviceId, AsyncInfo); 1130 Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1, 1131 /* gridDimZ */ 1, CudaThreadsPerBlock, 1132 /* blockDimY */ 1, /* blockDimZ */ 1, 1133 /* sharedMemBytes */ 0, Stream, &Args[0], nullptr); 1134 if (!checkResult(Err, "Error returned from cuLaunchKernel\n")) 1135 return OFFLOAD_FAIL; 1136 1137 DP("Launch of entry point at " DPxMOD " successful!\n", 1138 DPxPTR(TgtEntryPtr)); 1139 1140 return OFFLOAD_SUCCESS; 1141 } 1142 1143 int synchronize(const int DeviceId, __tgt_async_info *AsyncInfo) const { 1144 CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue); 1145 CUresult Err = cuStreamSynchronize(Stream); 1146 1147 // Once the stream is synchronized, return it to stream pool and reset 1148 // AsyncInfo. This is to make sure the synchronization only works for its 1149 // own tasks. 1150 StreamManager->returnStream(DeviceId, 1151 reinterpret_cast<CUstream>(AsyncInfo->Queue)); 1152 AsyncInfo->Queue = nullptr; 1153 1154 if (Err != CUDA_SUCCESS) { 1155 DP("Error when synchronizing stream. stream = " DPxMOD 1156 ", async info ptr = " DPxMOD "\n", 1157 DPxPTR(Stream), DPxPTR(AsyncInfo)); 1158 CUDA_ERR_STRING(Err); 1159 } 1160 return (Err == CUDA_SUCCESS) ? 
OFFLOAD_SUCCESS : OFFLOAD_FAIL; 1161 } 1162 1163 void printDeviceInfo(int32_t device_id) { 1164 char TmpChar[1000]; 1165 std::string TmpStr; 1166 size_t TmpSt; 1167 int TmpInt, TmpInt2, TmpInt3; 1168 1169 CUdevice Device; 1170 checkResult(cuDeviceGet(&Device, device_id), 1171 "Error returned from cuCtxGetDevice\n"); 1172 1173 cuDriverGetVersion(&TmpInt); 1174 printf(" CUDA Driver Version: \t\t%d \n", TmpInt); 1175 printf(" CUDA Device Number: \t\t%d \n", device_id); 1176 checkResult(cuDeviceGetName(TmpChar, 1000, Device), 1177 "Error returned from cuDeviceGetName\n"); 1178 printf(" Device Name: \t\t\t%s \n", TmpChar); 1179 checkResult(cuDeviceTotalMem(&TmpSt, Device), 1180 "Error returned from cuDeviceTotalMem\n"); 1181 printf(" Global Memory Size: \t\t%zu bytes \n", TmpSt); 1182 checkResult(cuDeviceGetAttribute( 1183 &TmpInt, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, Device), 1184 "Error returned from cuDeviceGetAttribute\n"); 1185 printf(" Number of Multiprocessors: \t\t%d \n", TmpInt); 1186 checkResult( 1187 cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, Device), 1188 "Error returned from cuDeviceGetAttribute\n"); 1189 printf(" Concurrent Copy and Execution: \t%s \n", BOOL2TEXT(TmpInt)); 1190 checkResult(cuDeviceGetAttribute( 1191 &TmpInt, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, Device), 1192 "Error returned from cuDeviceGetAttribute\n"); 1193 printf(" Total Constant Memory: \t\t%d bytes\n", TmpInt); 1194 checkResult( 1195 cuDeviceGetAttribute( 1196 &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, Device), 1197 "Error returned from cuDeviceGetAttribute\n"); 1198 printf(" Max Shared Memory per Block: \t%d bytes \n", TmpInt); 1199 checkResult( 1200 cuDeviceGetAttribute( 1201 &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, Device), 1202 "Error returned from cuDeviceGetAttribute\n"); 1203 printf(" Registers per Block: \t\t%d \n", TmpInt); 1204 checkResult( 1205 cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device), 
1206 "Error returned from cuDeviceGetAttribute\n"); 1207 printf(" Warp Size: \t\t\t\t%d Threads \n", TmpInt); 1208 checkResult(cuDeviceGetAttribute( 1209 &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, Device), 1210 "Error returned from cuDeviceGetAttribute\n"); 1211 printf(" Maximum Threads per Block: \t\t%d \n", TmpInt); 1212 checkResult(cuDeviceGetAttribute( 1213 &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device), 1214 "Error returned from cuDeviceGetAttribute\n"); 1215 checkResult(cuDeviceGetAttribute( 1216 &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, Device), 1217 "Error returned from cuDeviceGetAttribute\n"); 1218 checkResult(cuDeviceGetAttribute( 1219 &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, Device), 1220 "Error returned from cuDeviceGetAttribute\n"); 1221 printf(" Maximum Block Dimensions: \t\t%d, %d, %d \n", TmpInt, TmpInt2, 1222 TmpInt3); 1223 checkResult(cuDeviceGetAttribute( 1224 &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, Device), 1225 "Error returned from cuDeviceGetAttribute\n"); 1226 checkResult(cuDeviceGetAttribute( 1227 &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, Device), 1228 "Error returned from cuDeviceGetAttribute\n"); 1229 checkResult(cuDeviceGetAttribute( 1230 &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, Device), 1231 "Error returned from cuDeviceGetAttribute\n"); 1232 printf(" Maximum Grid Dimensions: \t\t%d x %d x %d \n", TmpInt, TmpInt2, 1233 TmpInt3); 1234 checkResult( 1235 cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_PITCH, Device), 1236 "Error returned from cuDeviceGetAttribute\n"); 1237 printf(" Maximum Memory Pitch: \t\t%d bytes \n", TmpInt); 1238 checkResult(cuDeviceGetAttribute( 1239 &TmpInt, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, Device), 1240 "Error returned from cuDeviceGetAttribute\n"); 1241 printf(" Texture Alignment: \t\t\t%d bytes \n", TmpInt); 1242 checkResult( 1243 cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, Device), 1244 "Error returned from cuDeviceGetAttribute\n"); 1245 
printf(" Clock Rate: \t\t\t%d kHz\n", TmpInt); 1246 checkResult(cuDeviceGetAttribute( 1247 &TmpInt, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, Device), 1248 "Error returned from cuDeviceGetAttribute\n"); 1249 printf(" Execution Timeout: \t\t\t%s \n", BOOL2TEXT(TmpInt)); 1250 checkResult( 1251 cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_INTEGRATED, Device), 1252 "Error returned from cuDeviceGetAttribute\n"); 1253 printf(" Integrated Device: \t\t\t%s \n", BOOL2TEXT(TmpInt)); 1254 checkResult(cuDeviceGetAttribute( 1255 &TmpInt, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, Device), 1256 "Error returned from cuDeviceGetAttribute\n"); 1257 printf(" Can Map Host Memory: \t\t%s \n", BOOL2TEXT(TmpInt)); 1258 checkResult( 1259 cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, Device), 1260 "Error returned from cuDeviceGetAttribute\n"); 1261 if (TmpInt == CU_COMPUTEMODE_DEFAULT) 1262 TmpStr = "DEFAULT"; 1263 else if (TmpInt == CU_COMPUTEMODE_PROHIBITED) 1264 TmpStr = "PROHIBITED"; 1265 else if (TmpInt == CU_COMPUTEMODE_EXCLUSIVE_PROCESS) 1266 TmpStr = "EXCLUSIVE PROCESS"; 1267 else 1268 TmpStr = "unknown"; 1269 printf(" Compute Mode: \t\t\t%s \n", TmpStr.c_str()); 1270 checkResult(cuDeviceGetAttribute( 1271 &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, Device), 1272 "Error returned from cuDeviceGetAttribute\n"); 1273 printf(" Concurrent Kernels: \t\t%s \n", BOOL2TEXT(TmpInt)); 1274 checkResult( 1275 cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, Device), 1276 "Error returned from cuDeviceGetAttribute\n"); 1277 printf(" ECC Enabled: \t\t\t%s \n", BOOL2TEXT(TmpInt)); 1278 checkResult(cuDeviceGetAttribute( 1279 &TmpInt, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, Device), 1280 "Error returned from cuDeviceGetAttribute\n"); 1281 printf(" Memory Clock Rate: \t\t\t%d kHz\n", TmpInt); 1282 checkResult( 1283 cuDeviceGetAttribute( 1284 &TmpInt, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, Device), 1285 "Error returned from cuDeviceGetAttribute\n"); 1286 
printf(" Memory Bus Width: \t\t\t%d bits\n", TmpInt); 1287 checkResult(cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, 1288 Device), 1289 "Error returned from cuDeviceGetAttribute\n"); 1290 printf(" L2 Cache Size: \t\t\t%d bytes \n", TmpInt); 1291 checkResult(cuDeviceGetAttribute( 1292 &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, 1293 Device), 1294 "Error returned from cuDeviceGetAttribute\n"); 1295 printf(" Max Threads Per SMP: \t\t%d \n", TmpInt); 1296 checkResult(cuDeviceGetAttribute( 1297 &TmpInt, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, Device), 1298 "Error returned from cuDeviceGetAttribute\n"); 1299 printf(" Async Engines: \t\t\t%s (%d) \n", BOOL2TEXT(TmpInt), TmpInt); 1300 checkResult(cuDeviceGetAttribute( 1301 &TmpInt, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, Device), 1302 "Error returned from cuDeviceGetAttribute\n"); 1303 printf(" Unified Addressing: \t\t%s \n", BOOL2TEXT(TmpInt)); 1304 checkResult(cuDeviceGetAttribute( 1305 &TmpInt, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, Device), 1306 "Error returned from cuDeviceGetAttribute\n"); 1307 printf(" Managed Memory: \t\t\t%s \n", BOOL2TEXT(TmpInt)); 1308 checkResult( 1309 cuDeviceGetAttribute( 1310 &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, Device), 1311 "Error returned from cuDeviceGetAttribute\n"); 1312 printf(" Concurrent Managed Memory: \t\t%s \n", BOOL2TEXT(TmpInt)); 1313 checkResult( 1314 cuDeviceGetAttribute( 1315 &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, Device), 1316 "Error returned from cuDeviceGetAttribute\n"); 1317 printf(" Preemption Supported: \t\t%s \n", BOOL2TEXT(TmpInt)); 1318 checkResult(cuDeviceGetAttribute( 1319 &TmpInt, CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, Device), 1320 "Error returned from cuDeviceGetAttribute\n"); 1321 printf(" Cooperative Launch: \t\t%s \n", BOOL2TEXT(TmpInt)); 1322 checkResult(cuDeviceGetAttribute( 1323 &TmpInt, CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD, Device), 1324 "Error returned from cuDeviceGetAttribute\n"); 
1325 printf(" Multi-Device Boars: \t\t%s \n", BOOL2TEXT(TmpInt)); 1326 checkResult( 1327 cuDeviceGetAttribute( 1328 &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, Device), 1329 "Error returned from cuDeviceGetAttribute\n"); 1330 checkResult( 1331 cuDeviceGetAttribute( 1332 &TmpInt2, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, Device), 1333 "Error returned from cuDeviceGetAttribute\n"); 1334 printf(" Compute Capabilities: \t\t%d%d \n", TmpInt, TmpInt2); 1335 } 1336 }; 1337 1338 DeviceRTLTy DeviceRTL; 1339 } // namespace 1340 1341 // Exposed library API function 1342 #ifdef __cplusplus 1343 extern "C" { 1344 #endif 1345 1346 int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) { 1347 return elf_check_machine(image, /* EM_CUDA */ 190); 1348 } 1349 1350 int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); } 1351 1352 int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) { 1353 DP("Init requires flags to %" PRId64 "\n", RequiresFlags); 1354 DeviceRTL.setRequiresFlag(RequiresFlags); 1355 return RequiresFlags; 1356 } 1357 1358 int32_t __tgt_rtl_is_data_exchangable(int32_t src_dev_id, int dst_dev_id) { 1359 if (DeviceRTL.isValidDeviceId(src_dev_id) && 1360 DeviceRTL.isValidDeviceId(dst_dev_id)) 1361 return 1; 1362 1363 return 0; 1364 } 1365 1366 int32_t __tgt_rtl_init_device(int32_t device_id) { 1367 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1368 1369 return DeviceRTL.initDevice(device_id); 1370 } 1371 1372 __tgt_target_table *__tgt_rtl_load_binary(int32_t device_id, 1373 __tgt_device_image *image) { 1374 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1375 1376 return DeviceRTL.loadBinary(device_id, image); 1377 } 1378 1379 void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *, 1380 int32_t kind) { 1381 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1382 1383 return DeviceRTL.dataAlloc(device_id, size, (TargetAllocTy)kind); 1384 } 1385 1386 
int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr, 1387 int64_t size) { 1388 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1389 1390 __tgt_async_info AsyncInfo; 1391 const int32_t rc = __tgt_rtl_data_submit_async(device_id, tgt_ptr, hst_ptr, 1392 size, &AsyncInfo); 1393 if (rc != OFFLOAD_SUCCESS) 1394 return OFFLOAD_FAIL; 1395 1396 return __tgt_rtl_synchronize(device_id, &AsyncInfo); 1397 } 1398 1399 int32_t __tgt_rtl_data_submit_async(int32_t device_id, void *tgt_ptr, 1400 void *hst_ptr, int64_t size, 1401 __tgt_async_info *async_info_ptr) { 1402 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1403 assert(async_info_ptr && "async_info_ptr is nullptr"); 1404 1405 return DeviceRTL.dataSubmit(device_id, tgt_ptr, hst_ptr, size, 1406 async_info_ptr); 1407 } 1408 1409 int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr, 1410 int64_t size) { 1411 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1412 1413 __tgt_async_info AsyncInfo; 1414 const int32_t rc = __tgt_rtl_data_retrieve_async(device_id, hst_ptr, tgt_ptr, 1415 size, &AsyncInfo); 1416 if (rc != OFFLOAD_SUCCESS) 1417 return OFFLOAD_FAIL; 1418 1419 return __tgt_rtl_synchronize(device_id, &AsyncInfo); 1420 } 1421 1422 int32_t __tgt_rtl_data_retrieve_async(int32_t device_id, void *hst_ptr, 1423 void *tgt_ptr, int64_t size, 1424 __tgt_async_info *async_info_ptr) { 1425 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1426 assert(async_info_ptr && "async_info_ptr is nullptr"); 1427 1428 return DeviceRTL.dataRetrieve(device_id, hst_ptr, tgt_ptr, size, 1429 async_info_ptr); 1430 } 1431 1432 int32_t __tgt_rtl_data_exchange_async(int32_t src_dev_id, void *src_ptr, 1433 int dst_dev_id, void *dst_ptr, 1434 int64_t size, 1435 __tgt_async_info *AsyncInfo) { 1436 assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid"); 1437 
assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid"); 1438 assert(AsyncInfo && "AsyncInfo is nullptr"); 1439 1440 return DeviceRTL.dataExchange(src_dev_id, src_ptr, dst_dev_id, dst_ptr, size, 1441 AsyncInfo); 1442 } 1443 1444 int32_t __tgt_rtl_data_exchange(int32_t src_dev_id, void *src_ptr, 1445 int32_t dst_dev_id, void *dst_ptr, 1446 int64_t size) { 1447 assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid"); 1448 assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid"); 1449 1450 __tgt_async_info AsyncInfo; 1451 const int32_t rc = __tgt_rtl_data_exchange_async( 1452 src_dev_id, src_ptr, dst_dev_id, dst_ptr, size, &AsyncInfo); 1453 if (rc != OFFLOAD_SUCCESS) 1454 return OFFLOAD_FAIL; 1455 1456 return __tgt_rtl_synchronize(src_dev_id, &AsyncInfo); 1457 } 1458 1459 int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) { 1460 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1461 1462 return DeviceRTL.dataDelete(device_id, tgt_ptr); 1463 } 1464 1465 int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr, 1466 void **tgt_args, 1467 ptrdiff_t *tgt_offsets, 1468 int32_t arg_num, int32_t team_num, 1469 int32_t thread_limit, 1470 uint64_t loop_tripcount) { 1471 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1472 1473 __tgt_async_info AsyncInfo; 1474 const int32_t rc = __tgt_rtl_run_target_team_region_async( 1475 device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num, 1476 thread_limit, loop_tripcount, &AsyncInfo); 1477 if (rc != OFFLOAD_SUCCESS) 1478 return OFFLOAD_FAIL; 1479 1480 return __tgt_rtl_synchronize(device_id, &AsyncInfo); 1481 } 1482 1483 int32_t __tgt_rtl_run_target_team_region_async( 1484 int32_t device_id, void *tgt_entry_ptr, void **tgt_args, 1485 ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num, 1486 int32_t thread_limit, uint64_t loop_tripcount, 1487 __tgt_async_info *async_info_ptr) { 1488 
assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1489 1490 return DeviceRTL.runTargetTeamRegion( 1491 device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num, 1492 thread_limit, loop_tripcount, async_info_ptr); 1493 } 1494 1495 int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr, 1496 void **tgt_args, ptrdiff_t *tgt_offsets, 1497 int32_t arg_num) { 1498 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1499 1500 __tgt_async_info AsyncInfo; 1501 const int32_t rc = __tgt_rtl_run_target_region_async( 1502 device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, &AsyncInfo); 1503 if (rc != OFFLOAD_SUCCESS) 1504 return OFFLOAD_FAIL; 1505 1506 return __tgt_rtl_synchronize(device_id, &AsyncInfo); 1507 } 1508 1509 int32_t __tgt_rtl_run_target_region_async(int32_t device_id, 1510 void *tgt_entry_ptr, void **tgt_args, 1511 ptrdiff_t *tgt_offsets, 1512 int32_t arg_num, 1513 __tgt_async_info *async_info_ptr) { 1514 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1515 1516 return __tgt_rtl_run_target_team_region_async( 1517 device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, 1518 /* team num*/ 1, /* thread_limit */ 1, /* loop_tripcount */ 0, 1519 async_info_ptr); 1520 } 1521 1522 int32_t __tgt_rtl_synchronize(int32_t device_id, 1523 __tgt_async_info *async_info_ptr) { 1524 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1525 assert(async_info_ptr && "async_info_ptr is nullptr"); 1526 assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr"); 1527 1528 return DeviceRTL.synchronize(device_id, async_info_ptr); 1529 } 1530 1531 void __tgt_rtl_set_info_flag(uint32_t NewInfoLevel) { 1532 std::atomic<uint32_t> &InfoLevel = getInfoLevelInternal(); 1533 InfoLevel.store(NewInfoLevel); 1534 } 1535 1536 void __tgt_rtl_print_device_info(int32_t device_id) { 1537 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1538 
DeviceRTL.printDeviceInfo(device_id); 1539 } 1540 1541 #ifdef __cplusplus 1542 } 1543 #endif 1544