1 //===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // RTL for CUDA machine 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include <cassert> 14 #include <cstddef> 15 #include <cuda.h> 16 #include <list> 17 #include <memory> 18 #include <mutex> 19 #include <string> 20 #include <unordered_map> 21 #include <vector> 22 23 #include "Debug.h" 24 #include "DeviceEnvironment.h" 25 #include "omptargetplugin.h" 26 27 #define TARGET_NAME CUDA 28 #define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL" 29 30 #include "MemoryManager.h" 31 32 #include "llvm/Frontend/OpenMP/OMPConstants.h" 33 34 // Utility for retrieving and printing CUDA error string. 35 #ifdef OMPTARGET_DEBUG 36 #define CUDA_ERR_STRING(err) \ 37 do { \ 38 if (getDebugLevel() > 0) { \ 39 const char *errStr = nullptr; \ 40 CUresult errStr_status = cuGetErrorString(err, &errStr); \ 41 if (errStr_status == CUDA_ERROR_INVALID_VALUE) \ 42 REPORT("Unrecognized CUDA error code: %d\n", err); \ 43 else if (errStr_status == CUDA_SUCCESS) \ 44 REPORT("CUDA error is: %s\n", errStr); \ 45 else { \ 46 REPORT("Unresolved CUDA error code: %d\n", err); \ 47 REPORT("Unsuccessful cuGetErrorString return status: %d\n", \ 48 errStr_status); \ 49 } \ 50 } else { \ 51 const char *errStr = nullptr; \ 52 CUresult errStr_status = cuGetErrorString(err, &errStr); \ 53 if (errStr_status == CUDA_SUCCESS) \ 54 REPORT("%s \n", errStr); \ 55 } \ 56 } while (false) 57 #else // OMPTARGET_DEBUG 58 #define CUDA_ERR_STRING(err) \ 59 do { \ 60 const char *errStr = nullptr; \ 61 CUresult errStr_status = cuGetErrorString(err, &errStr); \ 62 if (errStr_status == CUDA_SUCCESS) \ 63 REPORT("%s \n", errStr); \ 64 } while (false) 65 #endif // OMPTARGET_DEBUG 66 67 #define BOOL2TEXT(b) ((b) ? "Yes" : "No") 68 69 #include "elf_common.h" 70 71 /// Keep entries table per device. 72 struct FuncOrGblEntryTy { 73 __tgt_target_table Table; 74 std::vector<__tgt_offload_entry> Entries; 75 }; 76 77 /// Use a single entity to encode a kernel and a set of flags. 78 struct KernelTy { 79 CUfunction Func; 80 81 // execution mode of kernel 82 llvm::omp::OMPTgtExecModeFlags ExecutionMode; 83 84 /// Maximal number of threads per block for this kernel. 85 int MaxThreadsPerBlock = 0; 86 87 KernelTy(CUfunction _Func, llvm::omp::OMPTgtExecModeFlags _ExecutionMode) 88 : Func(_Func), ExecutionMode(_ExecutionMode) {} 89 }; 90 91 namespace { 92 bool checkResult(CUresult Err, const char *ErrMsg) { 93 if (Err == CUDA_SUCCESS) 94 return true; 95 96 REPORT("%s", ErrMsg); 97 CUDA_ERR_STRING(Err); 98 return false; 99 } 100 101 int memcpyDtoD(const void *SrcPtr, void *DstPtr, int64_t Size, 102 CUstream Stream) { 103 CUresult Err = 104 cuMemcpyDtoDAsync((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, Size, Stream); 105 106 if (Err != CUDA_SUCCESS) { 107 DP("Error when copying data from device to device. 
Pointers: src " 108 "= " DPxMOD ", dst = " DPxMOD ", size = %" PRId64 "\n", 109 DPxPTR(SrcPtr), DPxPTR(DstPtr), Size); 110 CUDA_ERR_STRING(Err); 111 return OFFLOAD_FAIL; 112 } 113 114 return OFFLOAD_SUCCESS; 115 } 116 117 int createEvent(void **P) { 118 CUevent Event = nullptr; 119 120 CUresult Err = cuEventCreate(&Event, CU_EVENT_DEFAULT); 121 if (Err != CUDA_SUCCESS) { 122 DP("Error when creating event event = " DPxMOD "\n", DPxPTR(Event)); 123 CUDA_ERR_STRING(Err); 124 return OFFLOAD_FAIL; 125 } 126 127 *P = Event; 128 129 return OFFLOAD_SUCCESS; 130 } 131 132 int recordEvent(void *EventPtr, __tgt_async_info *AsyncInfo) { 133 CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue); 134 CUevent Event = reinterpret_cast<CUevent>(EventPtr); 135 136 CUresult Err = cuEventRecord(Event, Stream); 137 if (Err != CUDA_SUCCESS) { 138 DP("Error when recording event. stream = " DPxMOD ", event = " DPxMOD "\n", 139 DPxPTR(Stream), DPxPTR(Event)); 140 CUDA_ERR_STRING(Err); 141 return OFFLOAD_FAIL; 142 } 143 144 return OFFLOAD_SUCCESS; 145 } 146 147 int syncEvent(void *EventPtr) { 148 CUevent Event = reinterpret_cast<CUevent>(EventPtr); 149 150 CUresult Err = cuEventSynchronize(Event); 151 if (Err != CUDA_SUCCESS) { 152 DP("Error when syncing event = " DPxMOD "\n", DPxPTR(Event)); 153 CUDA_ERR_STRING(Err); 154 return OFFLOAD_FAIL; 155 } 156 157 return OFFLOAD_SUCCESS; 158 } 159 160 int destroyEvent(void *EventPtr) { 161 CUevent Event = reinterpret_cast<CUevent>(EventPtr); 162 163 CUresult Err = cuEventDestroy(Event); 164 if (Err != CUDA_SUCCESS) { 165 DP("Error when destroying event = " DPxMOD "\n", DPxPTR(Event)); 166 CUDA_ERR_STRING(Err); 167 return OFFLOAD_FAIL; 168 } 169 170 return OFFLOAD_SUCCESS; 171 } 172 173 // Structure contains per-device data 174 struct DeviceDataTy { 175 /// List that contains all the kernels. 176 std::list<KernelTy> KernelsList; 177 178 std::list<FuncOrGblEntryTy> FuncGblEntries; 179 180 CUcontext Context = nullptr; 181 // Device properties 182 int ThreadsPerBlock = 0; 183 int BlocksPerGrid = 0; 184 int WarpSize = 0; 185 // OpenMP properties 186 int NumTeams = 0; 187 int NumThreads = 0; 188 }; 189 190 class StreamManagerTy { 191 int NumberOfDevices; 192 // The initial size of stream pool 193 int EnvNumInitialStreams; 194 // Per-device stream mutex 195 std::vector<std::unique_ptr<std::mutex>> StreamMtx; 196 // Per-device stream Id indicates the next available stream in the pool 197 std::vector<int> NextStreamId; 198 // Per-device stream pool 199 std::vector<std::vector<CUstream>> StreamPool; 200 // Reference to per-device data 201 std::vector<DeviceDataTy> &DeviceData; 202 203 // If there is no CUstream left in the pool, we will resize the pool to 204 // allocate more CUstream. This function should be called with device mutex, 205 // and we do not resize to smaller one. 206 void resizeStreamPool(const int DeviceId, const size_t NewSize) { 207 std::vector<CUstream> &Pool = StreamPool[DeviceId]; 208 const size_t CurrentSize = Pool.size(); 209 assert(NewSize > CurrentSize && "new size is not larger than current size"); 210 211 CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); 212 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) { 213 // We will return if cannot switch to the right context in case of 214 // creating bunch of streams that are not corresponding to the right 215 // device. The offloading will fail later because selected CUstream is 216 // nullptr. 
      return;
    }

    Pool.resize(NewSize, nullptr);

    for (size_t I = CurrentSize; I < NewSize; ++I) {
      checkResult(cuStreamCreate(&Pool[I], CU_STREAM_NON_BLOCKING),
                  "Error returned from cuStreamCreate\n");
    }
  }

public:
  StreamManagerTy(const int NumberOfDevices,
                  std::vector<DeviceDataTy> &DeviceData)
      : NumberOfDevices(NumberOfDevices), EnvNumInitialStreams(32),
        DeviceData(DeviceData) {
    StreamPool.resize(NumberOfDevices);
    NextStreamId.resize(NumberOfDevices);
    StreamMtx.resize(NumberOfDevices);

    if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS"))
      EnvNumInitialStreams = std::stoi(EnvStr);

    // Initialize the next stream id
    std::fill(NextStreamId.begin(), NextStreamId.end(), 0);

    // Initialize the stream mutexes
    for (std::unique_ptr<std::mutex> &Ptr : StreamMtx)
      Ptr = std::make_unique<std::mutex>();
  }

  ~StreamManagerTy() {
    // Destroy streams
    for (int I = 0; I < NumberOfDevices; ++I) {
      checkResult(cuCtxSetCurrent(DeviceData[I].Context),
                  "Error returned from cuCtxSetCurrent\n");

      for (CUstream &S : StreamPool[I]) {
        if (S)
          checkResult(cuStreamDestroy(S),
                      "Error returned from cuStreamDestroy\n");
      }
    }
  }

  // Get a CUstream from the pool. The per-device next stream id always points
  // to the next available CUstream; that is, CUstreams [0, id-1] have been
  // assigned and [id, ...) are still available. If there is no CUstream left,
  // we request more CUstreams from the CUDA RT. Each time a CUstream is
  // assigned, the id increases by one.
  // xxxxxs+++++++++
  //      ^
  //      id
  // After assignment, the pool becomes the following and s is assigned.
  // xxxxxs+++++++++
  //       ^
  //       id
  CUstream getStream(const int DeviceId) {
    const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
    int &Id = NextStreamId[DeviceId];
    // No CUstream is left in the pool, so we need to request more from the
    // CUDA RT.
    if (Id == static_cast<int>(StreamPool[DeviceId].size())) {
      // By default we double the stream pool every time
      resizeStreamPool(DeviceId, Id * 2);
    }
    return StreamPool[DeviceId][Id++];
  }

  // Return a CUstream back to the pool. As mentioned above, the per-device
  // next stream id always points to the next available CUstream, so when we
  // return a CUstream, we first decrease the id and then copy the CUstream
  // back.
  // It is worth noting that the order in which streams are returned may differ
  // from the order in which they were assigned; that is, at some point there
  // may be two identical CUstreams in the pool.
  // xxax+a+++++
  //     ^
  //     id
  // However, this does not matter because they are always on the two sides of
  // id: the left one will eventually be overwritten by another CUstream.
  // Therefore, after several executions, the order of the pool may differ from
  // its initial state.
299 void returnStream(const int DeviceId, CUstream Stream) { 300 const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]); 301 int &Id = NextStreamId[DeviceId]; 302 assert(Id > 0 && "Wrong stream ID"); 303 StreamPool[DeviceId][--Id] = Stream; 304 } 305 306 bool initializeDeviceStreamPool(const int DeviceId) { 307 assert(StreamPool[DeviceId].empty() && "stream pool has been initialized"); 308 309 resizeStreamPool(DeviceId, EnvNumInitialStreams); 310 311 // Check the size of stream pool 312 if (static_cast<int>(StreamPool[DeviceId].size()) != EnvNumInitialStreams) 313 return false; 314 315 // Check whether each stream is valid 316 for (CUstream &S : StreamPool[DeviceId]) 317 if (!S) 318 return false; 319 320 return true; 321 } 322 }; 323 324 class DeviceRTLTy { 325 int NumberOfDevices; 326 // OpenMP environment properties 327 int EnvNumTeams; 328 int EnvTeamLimit; 329 int EnvTeamThreadLimit; 330 // OpenMP requires flags 331 int64_t RequiresFlags; 332 // Amount of dynamic shared memory to use at launch. 333 uint64_t DynamicMemorySize; 334 335 static constexpr const int HardTeamLimit = 1U << 16U; // 64k 336 static constexpr const int HardThreadLimit = 1024; 337 static constexpr const int DefaultNumTeams = 128; 338 static constexpr const int DefaultNumThreads = 128; 339 340 std::unique_ptr<StreamManagerTy> StreamManager; 341 std::vector<DeviceDataTy> DeviceData; 342 std::vector<CUmodule> Modules; 343 344 /// A class responsible for interacting with device native runtime library to 345 /// allocate and free memory. 346 class CUDADeviceAllocatorTy : public DeviceAllocatorTy { 347 const int DeviceId; 348 const std::vector<DeviceDataTy> &DeviceData; 349 std::unordered_map<void *, TargetAllocTy> HostPinnedAllocs; 350 351 public: 352 CUDADeviceAllocatorTy(int DeviceId, std::vector<DeviceDataTy> &DeviceData) 353 : DeviceId(DeviceId), DeviceData(DeviceData) {} 354 355 void *allocate(size_t Size, void *, TargetAllocTy Kind) override { 356 if (Size == 0) 357 return nullptr; 358 359 CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); 360 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) 361 return nullptr; 362 363 void *MemAlloc = nullptr; 364 switch (Kind) { 365 case TARGET_ALLOC_DEFAULT: 366 case TARGET_ALLOC_DEVICE: 367 CUdeviceptr DevicePtr; 368 Err = cuMemAlloc(&DevicePtr, Size); 369 MemAlloc = (void *)DevicePtr; 370 if (!checkResult(Err, "Error returned from cuMemAlloc\n")) 371 return nullptr; 372 break; 373 case TARGET_ALLOC_HOST: 374 void *HostPtr; 375 Err = cuMemAllocHost(&HostPtr, Size); 376 MemAlloc = HostPtr; 377 if (!checkResult(Err, "Error returned from cuMemAllocHost\n")) 378 return nullptr; 379 HostPinnedAllocs[MemAlloc] = Kind; 380 break; 381 case TARGET_ALLOC_SHARED: 382 CUdeviceptr SharedPtr; 383 Err = cuMemAllocManaged(&SharedPtr, Size, CU_MEM_ATTACH_GLOBAL); 384 MemAlloc = (void *)SharedPtr; 385 if (!checkResult(Err, "Error returned from cuMemAllocManaged\n")) 386 return nullptr; 387 break; 388 } 389 390 return MemAlloc; 391 } 392 393 int free(void *TgtPtr) override { 394 CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); 395 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) 396 return OFFLOAD_FAIL; 397 398 // Host pinned memory must be freed differently. 399 TargetAllocTy Kind = 400 (HostPinnedAllocs.find(TgtPtr) == HostPinnedAllocs.end()) 401 ? 
TARGET_ALLOC_DEFAULT 402 : TARGET_ALLOC_HOST; 403 switch (Kind) { 404 case TARGET_ALLOC_DEFAULT: 405 case TARGET_ALLOC_DEVICE: 406 case TARGET_ALLOC_SHARED: 407 Err = cuMemFree((CUdeviceptr)TgtPtr); 408 if (!checkResult(Err, "Error returned from cuMemFree\n")) 409 return OFFLOAD_FAIL; 410 break; 411 case TARGET_ALLOC_HOST: 412 Err = cuMemFreeHost(TgtPtr); 413 if (!checkResult(Err, "Error returned from cuMemFreeHost\n")) 414 return OFFLOAD_FAIL; 415 break; 416 } 417 418 return OFFLOAD_SUCCESS; 419 } 420 }; 421 422 /// A vector of device allocators 423 std::vector<CUDADeviceAllocatorTy> DeviceAllocators; 424 425 /// A vector of memory managers. Since the memory manager is non-copyable and 426 // non-removable, we wrap them into std::unique_ptr. 427 std::vector<std::unique_ptr<MemoryManagerTy>> MemoryManagers; 428 429 /// Whether use memory manager 430 bool UseMemoryManager = true; 431 432 // Record entry point associated with device 433 void addOffloadEntry(const int DeviceId, const __tgt_offload_entry entry) { 434 FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back(); 435 E.Entries.push_back(entry); 436 } 437 438 // Return a pointer to the entry associated with the pointer 439 const __tgt_offload_entry *getOffloadEntry(const int DeviceId, 440 const void *Addr) const { 441 for (const __tgt_offload_entry &Itr : 442 DeviceData[DeviceId].FuncGblEntries.back().Entries) 443 if (Itr.addr == Addr) 444 return &Itr; 445 446 return nullptr; 447 } 448 449 // Return the pointer to the target entries table 450 __tgt_target_table *getOffloadEntriesTable(const int DeviceId) { 451 FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back(); 452 453 if (E.Entries.empty()) 454 return nullptr; 455 456 // Update table info according to the entries and return the pointer 457 E.Table.EntriesBegin = E.Entries.data(); 458 E.Table.EntriesEnd = E.Entries.data() + E.Entries.size(); 459 460 return &E.Table; 461 } 462 463 // Clear entries table for a device 464 void clearOffloadEntriesTable(const int DeviceId) { 465 DeviceData[DeviceId].FuncGblEntries.emplace_back(); 466 FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back(); 467 E.Entries.clear(); 468 E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr; 469 } 470 471 CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfo) const { 472 assert(AsyncInfo && "AsyncInfo is nullptr"); 473 474 if (!AsyncInfo->Queue) 475 AsyncInfo->Queue = StreamManager->getStream(DeviceId); 476 477 return reinterpret_cast<CUstream>(AsyncInfo->Queue); 478 } 479 480 public: 481 // This class should not be copied 482 DeviceRTLTy(const DeviceRTLTy &) = delete; 483 DeviceRTLTy(DeviceRTLTy &&) = delete; 484 485 DeviceRTLTy() 486 : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1), 487 EnvTeamThreadLimit(-1), RequiresFlags(OMP_REQ_UNDEFINED), 488 DynamicMemorySize(0) { 489 490 DP("Start initializing CUDA\n"); 491 492 CUresult Err = cuInit(0); 493 if (Err == CUDA_ERROR_INVALID_HANDLE) { 494 // Can't call cuGetErrorString if dlsym failed 495 DP("Failed to load CUDA shared library\n"); 496 return; 497 } 498 if (!checkResult(Err, "Error returned from cuInit\n")) { 499 return; 500 } 501 502 Err = cuDeviceGetCount(&NumberOfDevices); 503 if (!checkResult(Err, "Error returned from cuDeviceGetCount\n")) 504 return; 505 506 if (NumberOfDevices == 0) { 507 DP("There are no devices supporting CUDA.\n"); 508 return; 509 } 510 511 DeviceData.resize(NumberOfDevices); 512 513 // Get environment variables regarding teams 514 if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) 
{ 515 // OMP_TEAM_LIMIT has been set 516 EnvTeamLimit = std::stoi(EnvStr); 517 DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit); 518 } 519 if (const char *EnvStr = getenv("OMP_TEAMS_THREAD_LIMIT")) { 520 // OMP_TEAMS_THREAD_LIMIT has been set 521 EnvTeamThreadLimit = std::stoi(EnvStr); 522 DP("Parsed OMP_TEAMS_THREAD_LIMIT=%d\n", EnvTeamThreadLimit); 523 } 524 if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) { 525 // OMP_NUM_TEAMS has been set 526 EnvNumTeams = std::stoi(EnvStr); 527 DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams); 528 } 529 if (const char *EnvStr = getenv("LIBOMPTARGET_SHARED_MEMORY_SIZE")) { 530 // LIBOMPTARGET_SHARED_MEMORY_SIZE has been set 531 DynamicMemorySize = std::stoi(EnvStr); 532 DP("Parsed LIBOMPTARGET_SHARED_MEMORY_SIZE = %" PRIu64 "\n", 533 DynamicMemorySize); 534 } 535 536 StreamManager = 537 std::make_unique<StreamManagerTy>(NumberOfDevices, DeviceData); 538 539 for (int I = 0; I < NumberOfDevices; ++I) 540 DeviceAllocators.emplace_back(I, DeviceData); 541 542 // Get the size threshold from environment variable 543 std::pair<size_t, bool> Res = MemoryManagerTy::getSizeThresholdFromEnv(); 544 UseMemoryManager = Res.second; 545 size_t MemoryManagerThreshold = Res.first; 546 547 if (UseMemoryManager) 548 for (int I = 0; I < NumberOfDevices; ++I) 549 MemoryManagers.emplace_back(std::make_unique<MemoryManagerTy>( 550 DeviceAllocators[I], MemoryManagerThreshold)); 551 } 552 553 ~DeviceRTLTy() { 554 // We first destruct memory managers in case that its dependent data are 555 // destroyed before it. 556 for (auto &M : MemoryManagers) 557 M.release(); 558 559 StreamManager = nullptr; 560 561 for (CUmodule &M : Modules) 562 // Close module 563 if (M) 564 checkResult(cuModuleUnload(M), "Error returned from cuModuleUnload\n"); 565 566 for (DeviceDataTy &D : DeviceData) { 567 // Destroy context 568 if (D.Context) { 569 checkResult(cuCtxSetCurrent(D.Context), 570 "Error returned from cuCtxSetCurrent\n"); 571 CUdevice Device; 572 checkResult(cuCtxGetDevice(&Device), 573 "Error returned from cuCtxGetDevice\n"); 574 checkResult(cuDevicePrimaryCtxRelease(Device), 575 "Error returned from cuDevicePrimaryCtxRelease\n"); 576 } 577 } 578 } 579 580 // Check whether a given DeviceId is valid 581 bool isValidDeviceId(const int DeviceId) const { 582 return DeviceId >= 0 && DeviceId < NumberOfDevices; 583 } 584 585 int getNumOfDevices() const { return NumberOfDevices; } 586 587 void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; } 588 589 int initDevice(const int DeviceId) { 590 CUdevice Device; 591 592 DP("Getting device %d\n", DeviceId); 593 CUresult Err = cuDeviceGet(&Device, DeviceId); 594 if (!checkResult(Err, "Error returned from cuDeviceGet\n")) 595 return OFFLOAD_FAIL; 596 597 // Query the current flags of the primary context and set its flags if 598 // it is inactive 599 unsigned int FormerPrimaryCtxFlags = 0; 600 int FormerPrimaryCtxIsActive = 0; 601 Err = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags, 602 &FormerPrimaryCtxIsActive); 603 if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxGetState\n")) 604 return OFFLOAD_FAIL; 605 606 if (FormerPrimaryCtxIsActive) { 607 DP("The primary context is active, no change to its flags\n"); 608 if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) != 609 CU_CTX_SCHED_BLOCKING_SYNC) 610 DP("Warning the current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n"); 611 } else { 612 DP("The primary context is inactive, set its flags to " 613 "CU_CTX_SCHED_BLOCKING_SYNC\n"); 614 Err = 
cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC); 615 if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxSetFlags\n")) 616 return OFFLOAD_FAIL; 617 } 618 619 // Retain the per device primary context and save it to use whenever this 620 // device is selected. 621 Err = cuDevicePrimaryCtxRetain(&DeviceData[DeviceId].Context, Device); 622 if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxRetain\n")) 623 return OFFLOAD_FAIL; 624 625 Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); 626 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) 627 return OFFLOAD_FAIL; 628 629 // Initialize stream pool 630 if (!StreamManager->initializeDeviceStreamPool(DeviceId)) 631 return OFFLOAD_FAIL; 632 633 // Query attributes to determine number of threads/block and blocks/grid. 634 int MaxGridDimX; 635 Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, 636 Device); 637 if (Err != CUDA_SUCCESS) { 638 DP("Error getting max grid dimension, use default value %d\n", 639 DeviceRTLTy::DefaultNumTeams); 640 DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::DefaultNumTeams; 641 } else if (MaxGridDimX <= DeviceRTLTy::HardTeamLimit) { 642 DP("Using %d CUDA blocks per grid\n", MaxGridDimX); 643 DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX; 644 } else { 645 DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping " 646 "at the hard limit\n", 647 MaxGridDimX, DeviceRTLTy::HardTeamLimit); 648 DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::HardTeamLimit; 649 } 650 651 // We are only exploiting threads along the x axis. 652 int MaxBlockDimX; 653 Err = cuDeviceGetAttribute(&MaxBlockDimX, 654 CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device); 655 if (Err != CUDA_SUCCESS) { 656 DP("Error getting max block dimension, use default value %d\n", 657 DeviceRTLTy::DefaultNumThreads); 658 DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads; 659 } else { 660 DP("Using %d CUDA threads per block\n", MaxBlockDimX); 661 DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX; 662 663 if (EnvTeamThreadLimit > 0 && 664 DeviceData[DeviceId].ThreadsPerBlock > EnvTeamThreadLimit) { 665 DP("Max CUDA threads per block %d exceeds the thread limit %d set by " 666 "OMP_TEAMS_THREAD_LIMIT, capping at the limit\n", 667 DeviceData[DeviceId].ThreadsPerBlock, EnvTeamThreadLimit); 668 DeviceData[DeviceId].ThreadsPerBlock = EnvTeamThreadLimit; 669 } 670 if (DeviceData[DeviceId].ThreadsPerBlock > DeviceRTLTy::HardThreadLimit) { 671 DP("Max CUDA threads per block %d exceeds the hard thread limit %d, " 672 "capping at the hard limit\n", 673 DeviceData[DeviceId].ThreadsPerBlock, DeviceRTLTy::HardThreadLimit); 674 DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit; 675 } 676 } 677 678 // Get and set warp size 679 int WarpSize; 680 Err = 681 cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device); 682 if (Err != CUDA_SUCCESS) { 683 DP("Error getting warp size, assume default value 32\n"); 684 DeviceData[DeviceId].WarpSize = 32; 685 } else { 686 DP("Using warp size %d\n", WarpSize); 687 DeviceData[DeviceId].WarpSize = WarpSize; 688 } 689 690 // Adjust teams to the env variables 691 if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) { 692 DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n", 693 EnvTeamLimit); 694 DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit; 695 } 696 697 size_t StackLimit; 698 size_t HeapLimit; 699 if (const char *EnvStr = getenv("LIBOMPTARGET_STACK_SIZE")) { 700 StackLimit = 
std::stol(EnvStr); 701 if (cuCtxSetLimit(CU_LIMIT_STACK_SIZE, StackLimit) != CUDA_SUCCESS) 702 return OFFLOAD_FAIL; 703 } else { 704 if (cuCtxGetLimit(&StackLimit, CU_LIMIT_STACK_SIZE) != CUDA_SUCCESS) 705 return OFFLOAD_FAIL; 706 } 707 if (const char *EnvStr = getenv("LIBOMPTARGET_HEAP_SIZE")) { 708 HeapLimit = std::stol(EnvStr); 709 if (cuCtxSetLimit(CU_LIMIT_MALLOC_HEAP_SIZE, HeapLimit) != CUDA_SUCCESS) 710 return OFFLOAD_FAIL; 711 } else { 712 if (cuCtxGetLimit(&HeapLimit, CU_LIMIT_MALLOC_HEAP_SIZE) != CUDA_SUCCESS) 713 return OFFLOAD_FAIL; 714 } 715 716 INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, 717 "Device supports up to %d CUDA blocks and %d threads with a " 718 "warp size of %d\n", 719 DeviceData[DeviceId].BlocksPerGrid, 720 DeviceData[DeviceId].ThreadsPerBlock, DeviceData[DeviceId].WarpSize); 721 INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, 722 "Device heap size is %d Bytes, device stack size is %d Bytes per " 723 "thread\n", 724 (int)HeapLimit, (int)StackLimit); 725 726 // Set default number of teams 727 if (EnvNumTeams > 0) { 728 DP("Default number of teams set according to environment %d\n", 729 EnvNumTeams); 730 DeviceData[DeviceId].NumTeams = EnvNumTeams; 731 } else { 732 DeviceData[DeviceId].NumTeams = DeviceRTLTy::DefaultNumTeams; 733 DP("Default number of teams set according to library's default %d\n", 734 DeviceRTLTy::DefaultNumTeams); 735 } 736 737 if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) { 738 DP("Default number of teams exceeds device limit, capping at %d\n", 739 DeviceData[DeviceId].BlocksPerGrid); 740 DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid; 741 } 742 743 // Set default number of threads 744 DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads; 745 DP("Default number of threads set according to library's default %d\n", 746 DeviceRTLTy::DefaultNumThreads); 747 if (DeviceData[DeviceId].NumThreads > 748 DeviceData[DeviceId].ThreadsPerBlock) { 749 DP("Default number of threads exceeds device limit, capping at %d\n", 750 DeviceData[DeviceId].ThreadsPerBlock); 751 DeviceData[DeviceId].NumThreads = DeviceData[DeviceId].ThreadsPerBlock; 752 } 753 754 return OFFLOAD_SUCCESS; 755 } 756 757 __tgt_target_table *loadBinary(const int DeviceId, 758 const __tgt_device_image *Image) { 759 // Set the context we are using 760 CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); 761 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) 762 return nullptr; 763 764 // Clear the offload table as we are going to create a new one. 765 clearOffloadEntriesTable(DeviceId); 766 767 // Create the module and extract the function pointers. 768 CUmodule Module; 769 DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart)); 770 Err = cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr); 771 if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n")) 772 return nullptr; 773 774 DP("CUDA module successfully loaded!\n"); 775 776 Modules.push_back(Module); 777 778 // Find the symbols in the module by name. 779 const __tgt_offload_entry *HostBegin = Image->EntriesBegin; 780 const __tgt_offload_entry *HostEnd = Image->EntriesEnd; 781 782 std::list<KernelTy> &KernelsList = DeviceData[DeviceId].KernelsList; 783 for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) { 784 if (!E->addr) { 785 // We return nullptr when something like this happens, the host should 786 // have always something in the address to uniquely identify the target 787 // region. 
        DP("Invalid binary: host entry '<null>' (size = %zd)...\n", E->size);
        return nullptr;
      }

      if (E->size) {
        __tgt_offload_entry Entry = *E;
        CUdeviceptr CUPtr;
        size_t CUSize;
        Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name);
        // We keep this style here because we need the name
        if (Err != CUDA_SUCCESS) {
          REPORT("Loading global '%s' failed\n", E->name);
          CUDA_ERR_STRING(Err);
          return nullptr;
        }

        if (CUSize != E->size) {
          DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name,
             CUSize, E->size);
          return nullptr;
        }

        DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
           DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr));

        Entry.addr = (void *)(CUPtr);

        // Note: In the current implementation declare target variables
        // can either be link or to. This means that once unified
        // memory is activated via the requires directive, the variable
        // can be used directly from the host in both cases.
        // TODO: when variable types other than to or link are added,
        // the below condition should be changed to explicitly
        // check for to and link variable types:
        // (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags &
        // OMP_DECLARE_TARGET_LINK || e->flags == OMP_DECLARE_TARGET_TO))
        if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
          // If unified memory is present any target link or to variables
          // can access host addresses directly. There is no longer a
          // need for device copies.
          cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *));
          DP("Copy linked variable host address (" DPxMOD
             ") to device address (" DPxMOD ")\n",
             DPxPTR(*((void **)E->addr)), DPxPTR(CUPtr));
        }

        addOffloadEntry(DeviceId, Entry);

        continue;
      }

      CUfunction Func;
      Err = cuModuleGetFunction(&Func, Module, E->name);
      // We keep this style here because we need the name
      if (Err != CUDA_SUCCESS) {
        REPORT("Loading '%s' failed\n", E->name);
        CUDA_ERR_STRING(Err);
        return nullptr;
      }

      DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
         DPxPTR(E - HostBegin), E->name, DPxPTR(Func));

      // Default to GENERIC in case the exec_mode symbol is missing from the
      // cubin file.
      llvm::omp::OMPTgtExecModeFlags ExecModeVal =
          llvm::omp::OMP_TGT_EXEC_MODE_GENERIC;
      std::string ExecModeNameStr(E->name);
      ExecModeNameStr += "_exec_mode";
      const char *ExecModeName = ExecModeNameStr.c_str();

      CUdeviceptr ExecModePtr;
      size_t CUSize;
      Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName);
      if (Err == CUDA_SUCCESS) {
        if (CUSize != sizeof(llvm::omp::OMPTgtExecModeFlags)) {
          DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
             ExecModeName, CUSize, sizeof(llvm::omp::OMPTgtExecModeFlags));
          return nullptr;
        }

        Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize);
        if (Err != CUDA_SUCCESS) {
          REPORT("Error when copying data from device to host. Pointers: "
                 "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
                 DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize);
          CUDA_ERR_STRING(Err);
          return nullptr;
        }
      } else {
        DP("Loading global exec_mode '%s' - symbol missing, using default "
           "value GENERIC (1)\n",
           ExecModeName);
      }

      KernelsList.emplace_back(Func, ExecModeVal);

      __tgt_offload_entry Entry = *E;
      Entry.addr = &KernelsList.back();
      addOffloadEntry(DeviceId, Entry);
    }

    // Send device environment data to the device.
    {
      // TODO: The device ID used here is not the real device ID used by
      // OpenMP.
      DeviceEnvironmentTy DeviceEnv{0, static_cast<uint32_t>(NumberOfDevices),
                                    static_cast<uint32_t>(DeviceId),
                                    static_cast<uint32_t>(DynamicMemorySize)};

#ifdef OMPTARGET_DEBUG
      if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG"))
        DeviceEnv.DebugKind = std::stoi(EnvStr);
#endif

      const char *DeviceEnvName = "omptarget_device_environment";
      CUdeviceptr DeviceEnvPtr;
      size_t CUSize;

      Err = cuModuleGetGlobal(&DeviceEnvPtr, &CUSize, Module, DeviceEnvName);
      if (Err == CUDA_SUCCESS) {
        if (CUSize != sizeof(DeviceEnv)) {
          REPORT(
              "Global device_environment '%s' - size mismatch (%zu != %zu)\n",
              DeviceEnvName, CUSize, sizeof(DeviceEnv));
          CUDA_ERR_STRING(Err);
          return nullptr;
        }

        Err = cuMemcpyHtoD(DeviceEnvPtr, &DeviceEnv, CUSize);
        if (Err != CUDA_SUCCESS) {
          REPORT("Error when copying data from host to device. Pointers: "
                 "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
                 DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize);
          CUDA_ERR_STRING(Err);
          return nullptr;
        }

        DP("Sending global device environment data %zu bytes\n", CUSize);
      } else {
        DP("Finding global device environment '%s' - symbol missing.\n",
           DeviceEnvName);
        DP("Continue, considering this is a device RTL which does not accept "
           "environment setting.\n");
      }
    }

    return getOffloadEntriesTable(DeviceId);
  }

  void *dataAlloc(const int DeviceId, const int64_t Size,
                  const TargetAllocTy Kind) {
    switch (Kind) {
    case TARGET_ALLOC_DEFAULT:
    case TARGET_ALLOC_DEVICE:
      if (UseMemoryManager)
        return MemoryManagers[DeviceId]->allocate(Size, nullptr);
      else
        return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
    case TARGET_ALLOC_HOST:
    case TARGET_ALLOC_SHARED:
      return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
    }

    REPORT("Invalid target data allocation kind or requested allocator not "
           "implemented yet\n");

    return nullptr;
  }

  int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr,
                 const int64_t Size, __tgt_async_info *AsyncInfo) const {
    assert(AsyncInfo && "AsyncInfo is nullptr");

    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    CUstream Stream = getStream(DeviceId, AsyncInfo);

    Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream);
    if (Err != CUDA_SUCCESS) {
      DP("Error when copying data from host to device. 
Pointers: host " 968 "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n", 969 DPxPTR(HstPtr), DPxPTR(TgtPtr), Size); 970 CUDA_ERR_STRING(Err); 971 return OFFLOAD_FAIL; 972 } 973 974 return OFFLOAD_SUCCESS; 975 } 976 977 int dataRetrieve(const int DeviceId, void *HstPtr, const void *TgtPtr, 978 const int64_t Size, __tgt_async_info *AsyncInfo) const { 979 assert(AsyncInfo && "AsyncInfo is nullptr"); 980 981 CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); 982 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) 983 return OFFLOAD_FAIL; 984 985 CUstream Stream = getStream(DeviceId, AsyncInfo); 986 987 Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream); 988 if (Err != CUDA_SUCCESS) { 989 DP("Error when copying data from device to host. Pointers: host " 990 "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n", 991 DPxPTR(HstPtr), DPxPTR(TgtPtr), Size); 992 CUDA_ERR_STRING(Err); 993 return OFFLOAD_FAIL; 994 } 995 996 return OFFLOAD_SUCCESS; 997 } 998 999 int dataExchange(int SrcDevId, const void *SrcPtr, int DstDevId, void *DstPtr, 1000 int64_t Size, __tgt_async_info *AsyncInfo) const { 1001 assert(AsyncInfo && "AsyncInfo is nullptr"); 1002 1003 CUresult Err = cuCtxSetCurrent(DeviceData[SrcDevId].Context); 1004 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) 1005 return OFFLOAD_FAIL; 1006 1007 CUstream Stream = getStream(SrcDevId, AsyncInfo); 1008 1009 // If they are two devices, we try peer to peer copy first 1010 if (SrcDevId != DstDevId) { 1011 int CanAccessPeer = 0; 1012 Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId); 1013 if (Err != CUDA_SUCCESS) { 1014 REPORT("Error returned from cuDeviceCanAccessPeer. src = %" PRId32 1015 ", dst = %" PRId32 "\n", 1016 SrcDevId, DstDevId); 1017 CUDA_ERR_STRING(Err); 1018 return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); 1019 } 1020 1021 if (!CanAccessPeer) { 1022 DP("P2P memcpy not supported so fall back to D2D memcpy"); 1023 return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); 1024 } 1025 1026 Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0); 1027 if (Err != CUDA_SUCCESS) { 1028 REPORT("Error returned from cuCtxEnablePeerAccess. src = %" PRId32 1029 ", dst = %" PRId32 "\n", 1030 SrcDevId, DstDevId); 1031 CUDA_ERR_STRING(Err); 1032 return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); 1033 } 1034 1035 Err = cuMemcpyPeerAsync((CUdeviceptr)DstPtr, DeviceData[DstDevId].Context, 1036 (CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context, 1037 Size, Stream); 1038 if (Err == CUDA_SUCCESS) 1039 return OFFLOAD_SUCCESS; 1040 1041 DP("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD 1042 ", src_id =%" PRId32 ", dst_ptr = " DPxMOD ", dst_id =%" PRId32 "\n", 1043 DPxPTR(SrcPtr), SrcDevId, DPxPTR(DstPtr), DstDevId); 1044 CUDA_ERR_STRING(Err); 1045 } 1046 1047 return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); 1048 } 1049 1050 int dataDelete(const int DeviceId, void *TgtPtr) { 1051 if (UseMemoryManager) 1052 return MemoryManagers[DeviceId]->free(TgtPtr); 1053 1054 return DeviceAllocators[DeviceId].free(TgtPtr); 1055 } 1056 1057 int runTargetTeamRegion(const int DeviceId, void *TgtEntryPtr, void **TgtArgs, 1058 ptrdiff_t *TgtOffsets, const int ArgNum, 1059 const int TeamNum, const int ThreadLimit, 1060 const unsigned int LoopTripCount, 1061 __tgt_async_info *AsyncInfo) const { 1062 CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); 1063 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) 1064 return OFFLOAD_FAIL; 1065 1066 // All args are references. 
    std::vector<void *> Args(ArgNum);
    std::vector<void *> Ptrs(ArgNum);

    for (int I = 0; I < ArgNum; ++I) {
      Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
      Args[I] = &Ptrs[I];
    }

    KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr);

    const bool IsSPMDGenericMode =
        KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD;
    const bool IsSPMDMode =
        KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_SPMD;
    const bool IsGenericMode =
        KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC;

    int CudaThreadsPerBlock;
    if (ThreadLimit > 0) {
      DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
      CudaThreadsPerBlock = ThreadLimit;
      // Add master warp if necessary
      if (IsGenericMode) {
        DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize);
        CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
      }
    } else {
      DP("Setting CUDA threads per block to default %d\n",
         DeviceData[DeviceId].NumThreads);
      CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads;
    }

    if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) {
      DP("Threads per block capped at device limit %d\n",
         DeviceData[DeviceId].ThreadsPerBlock);
      CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock;
    }

    if (!KernelInfo->MaxThreadsPerBlock) {
      Err = cuFuncGetAttribute(&KernelInfo->MaxThreadsPerBlock,
                               CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                               KernelInfo->Func);
      if (!checkResult(Err, "Error returned from cuFuncGetAttribute\n"))
        return OFFLOAD_FAIL;
    }

    if (KernelInfo->MaxThreadsPerBlock < CudaThreadsPerBlock) {
      DP("Threads per block capped at kernel limit %d\n",
         KernelInfo->MaxThreadsPerBlock);
      CudaThreadsPerBlock = KernelInfo->MaxThreadsPerBlock;
    }

    unsigned int CudaBlocksPerGrid;
    if (TeamNum <= 0) {
      if (LoopTripCount > 0 && EnvNumTeams < 0) {
        if (IsSPMDGenericMode) {
          // If we reach this point, then we are executing a kernel that was
          // transformed from Generic-mode to SPMD-mode. This kernel has
          // SPMD-mode execution, but needs its blocks to be scheduled
          // differently because the current loop trip count only applies to
          // the `teams distribute` region and would create far too few blocks
          // using the regular SPMD-mode method.
          CudaBlocksPerGrid = LoopTripCount;
        } else if (IsSPMDMode) {
          // We have a combined construct, i.e. `target teams distribute
          // parallel for [simd]`. We launch enough teams so that each thread
          // will execute one iteration of the loop; round up to the nearest
          // integer.
          CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1;
        } else if (IsGenericMode) {
          // If we reach this point, then we have a non-combined construct,
          // i.e. `teams distribute` with a nested `parallel for`, and each
          // team is assigned one iteration of the `distribute` loop. E.g.:
          //
          // #pragma omp target teams distribute
          // for(...loop_tripcount...) {
          //   #pragma omp parallel for
          //   for(...) {}
          // }
          //
          // Threads within a team will execute the iterations of the
          // `parallel` loop.
1149 CudaBlocksPerGrid = LoopTripCount; 1150 } else { 1151 REPORT("Unknown execution mode: %d\n", 1152 static_cast<int8_t>(KernelInfo->ExecutionMode)); 1153 return OFFLOAD_FAIL; 1154 } 1155 DP("Using %d teams due to loop trip count %" PRIu32 1156 " and number of threads per block %d\n", 1157 CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock); 1158 } else { 1159 DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams); 1160 CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams; 1161 } 1162 } else if (TeamNum > DeviceData[DeviceId].BlocksPerGrid) { 1163 DP("Capping number of teams to team limit %d\n", 1164 DeviceData[DeviceId].BlocksPerGrid); 1165 CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid; 1166 } else { 1167 DP("Using requested number of teams %d\n", TeamNum); 1168 CudaBlocksPerGrid = TeamNum; 1169 } 1170 1171 INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, 1172 "Launching kernel %s with %d blocks and %d threads in %s mode\n", 1173 (getOffloadEntry(DeviceId, TgtEntryPtr)) 1174 ? getOffloadEntry(DeviceId, TgtEntryPtr)->name 1175 : "(null)", 1176 CudaBlocksPerGrid, CudaThreadsPerBlock, 1177 (!IsSPMDMode ? (IsGenericMode ? "Generic" : "SPMD-Generic") : "SPMD")); 1178 1179 CUstream Stream = getStream(DeviceId, AsyncInfo); 1180 Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1, 1181 /* gridDimZ */ 1, CudaThreadsPerBlock, 1182 /* blockDimY */ 1, /* blockDimZ */ 1, 1183 DynamicMemorySize, Stream, &Args[0], nullptr); 1184 if (!checkResult(Err, "Error returned from cuLaunchKernel\n")) 1185 return OFFLOAD_FAIL; 1186 1187 DP("Launch of entry point at " DPxMOD " successful!\n", 1188 DPxPTR(TgtEntryPtr)); 1189 1190 return OFFLOAD_SUCCESS; 1191 } 1192 1193 int synchronize(const int DeviceId, __tgt_async_info *AsyncInfo) const { 1194 CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue); 1195 CUresult Err = cuStreamSynchronize(Stream); 1196 1197 // Once the stream is synchronized, return it to stream pool and reset 1198 // AsyncInfo. This is to make sure the synchronization only works for its 1199 // own tasks. 1200 StreamManager->returnStream(DeviceId, 1201 reinterpret_cast<CUstream>(AsyncInfo->Queue)); 1202 AsyncInfo->Queue = nullptr; 1203 1204 if (Err != CUDA_SUCCESS) { 1205 DP("Error when synchronizing stream. stream = " DPxMOD 1206 ", async info ptr = " DPxMOD "\n", 1207 DPxPTR(Stream), DPxPTR(AsyncInfo)); 1208 CUDA_ERR_STRING(Err); 1209 } 1210 return (Err == CUDA_SUCCESS) ? 
OFFLOAD_SUCCESS : OFFLOAD_FAIL; 1211 } 1212 1213 void printDeviceInfo(int32_t device_id) { 1214 char TmpChar[1000]; 1215 std::string TmpStr; 1216 size_t TmpSt; 1217 int TmpInt, TmpInt2, TmpInt3; 1218 1219 CUdevice Device; 1220 checkResult(cuDeviceGet(&Device, device_id), 1221 "Error returned from cuCtxGetDevice\n"); 1222 1223 cuDriverGetVersion(&TmpInt); 1224 printf(" CUDA Driver Version: \t\t%d \n", TmpInt); 1225 printf(" CUDA Device Number: \t\t%d \n", device_id); 1226 checkResult(cuDeviceGetName(TmpChar, 1000, Device), 1227 "Error returned from cuDeviceGetName\n"); 1228 printf(" Device Name: \t\t\t%s \n", TmpChar); 1229 checkResult(cuDeviceTotalMem(&TmpSt, Device), 1230 "Error returned from cuDeviceTotalMem\n"); 1231 printf(" Global Memory Size: \t\t%zu bytes \n", TmpSt); 1232 checkResult(cuDeviceGetAttribute( 1233 &TmpInt, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, Device), 1234 "Error returned from cuDeviceGetAttribute\n"); 1235 printf(" Number of Multiprocessors: \t\t%d \n", TmpInt); 1236 checkResult( 1237 cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, Device), 1238 "Error returned from cuDeviceGetAttribute\n"); 1239 printf(" Concurrent Copy and Execution: \t%s \n", BOOL2TEXT(TmpInt)); 1240 checkResult(cuDeviceGetAttribute( 1241 &TmpInt, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, Device), 1242 "Error returned from cuDeviceGetAttribute\n"); 1243 printf(" Total Constant Memory: \t\t%d bytes\n", TmpInt); 1244 checkResult( 1245 cuDeviceGetAttribute( 1246 &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, Device), 1247 "Error returned from cuDeviceGetAttribute\n"); 1248 printf(" Max Shared Memory per Block: \t%d bytes \n", TmpInt); 1249 checkResult( 1250 cuDeviceGetAttribute( 1251 &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, Device), 1252 "Error returned from cuDeviceGetAttribute\n"); 1253 printf(" Registers per Block: \t\t%d \n", TmpInt); 1254 checkResult( 1255 cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device), 1256 "Error returned from cuDeviceGetAttribute\n"); 1257 printf(" Warp Size: \t\t\t\t%d Threads \n", TmpInt); 1258 checkResult(cuDeviceGetAttribute( 1259 &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, Device), 1260 "Error returned from cuDeviceGetAttribute\n"); 1261 printf(" Maximum Threads per Block: \t\t%d \n", TmpInt); 1262 checkResult(cuDeviceGetAttribute( 1263 &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device), 1264 "Error returned from cuDeviceGetAttribute\n"); 1265 checkResult(cuDeviceGetAttribute( 1266 &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, Device), 1267 "Error returned from cuDeviceGetAttribute\n"); 1268 checkResult(cuDeviceGetAttribute( 1269 &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, Device), 1270 "Error returned from cuDeviceGetAttribute\n"); 1271 printf(" Maximum Block Dimensions: \t\t%d, %d, %d \n", TmpInt, TmpInt2, 1272 TmpInt3); 1273 checkResult(cuDeviceGetAttribute( 1274 &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, Device), 1275 "Error returned from cuDeviceGetAttribute\n"); 1276 checkResult(cuDeviceGetAttribute( 1277 &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, Device), 1278 "Error returned from cuDeviceGetAttribute\n"); 1279 checkResult(cuDeviceGetAttribute( 1280 &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, Device), 1281 "Error returned from cuDeviceGetAttribute\n"); 1282 printf(" Maximum Grid Dimensions: \t\t%d x %d x %d \n", TmpInt, TmpInt2, 1283 TmpInt3); 1284 checkResult( 1285 cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_PITCH, Device), 1286 "Error returned from 
cuDeviceGetAttribute\n"); 1287 printf(" Maximum Memory Pitch: \t\t%d bytes \n", TmpInt); 1288 checkResult(cuDeviceGetAttribute( 1289 &TmpInt, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, Device), 1290 "Error returned from cuDeviceGetAttribute\n"); 1291 printf(" Texture Alignment: \t\t\t%d bytes \n", TmpInt); 1292 checkResult( 1293 cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, Device), 1294 "Error returned from cuDeviceGetAttribute\n"); 1295 printf(" Clock Rate: \t\t\t%d kHz\n", TmpInt); 1296 checkResult(cuDeviceGetAttribute( 1297 &TmpInt, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, Device), 1298 "Error returned from cuDeviceGetAttribute\n"); 1299 printf(" Execution Timeout: \t\t\t%s \n", BOOL2TEXT(TmpInt)); 1300 checkResult( 1301 cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_INTEGRATED, Device), 1302 "Error returned from cuDeviceGetAttribute\n"); 1303 printf(" Integrated Device: \t\t\t%s \n", BOOL2TEXT(TmpInt)); 1304 checkResult(cuDeviceGetAttribute( 1305 &TmpInt, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, Device), 1306 "Error returned from cuDeviceGetAttribute\n"); 1307 printf(" Can Map Host Memory: \t\t%s \n", BOOL2TEXT(TmpInt)); 1308 checkResult( 1309 cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, Device), 1310 "Error returned from cuDeviceGetAttribute\n"); 1311 if (TmpInt == CU_COMPUTEMODE_DEFAULT) 1312 TmpStr = "DEFAULT"; 1313 else if (TmpInt == CU_COMPUTEMODE_PROHIBITED) 1314 TmpStr = "PROHIBITED"; 1315 else if (TmpInt == CU_COMPUTEMODE_EXCLUSIVE_PROCESS) 1316 TmpStr = "EXCLUSIVE PROCESS"; 1317 else 1318 TmpStr = "unknown"; 1319 printf(" Compute Mode: \t\t\t%s \n", TmpStr.c_str()); 1320 checkResult(cuDeviceGetAttribute( 1321 &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, Device), 1322 "Error returned from cuDeviceGetAttribute\n"); 1323 printf(" Concurrent Kernels: \t\t%s \n", BOOL2TEXT(TmpInt)); 1324 checkResult( 1325 cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, Device), 1326 "Error returned from cuDeviceGetAttribute\n"); 1327 printf(" ECC Enabled: \t\t\t%s \n", BOOL2TEXT(TmpInt)); 1328 checkResult(cuDeviceGetAttribute( 1329 &TmpInt, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, Device), 1330 "Error returned from cuDeviceGetAttribute\n"); 1331 printf(" Memory Clock Rate: \t\t\t%d kHz\n", TmpInt); 1332 checkResult( 1333 cuDeviceGetAttribute( 1334 &TmpInt, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, Device), 1335 "Error returned from cuDeviceGetAttribute\n"); 1336 printf(" Memory Bus Width: \t\t\t%d bits\n", TmpInt); 1337 checkResult(cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, 1338 Device), 1339 "Error returned from cuDeviceGetAttribute\n"); 1340 printf(" L2 Cache Size: \t\t\t%d bytes \n", TmpInt); 1341 checkResult(cuDeviceGetAttribute( 1342 &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, 1343 Device), 1344 "Error returned from cuDeviceGetAttribute\n"); 1345 printf(" Max Threads Per SMP: \t\t%d \n", TmpInt); 1346 checkResult(cuDeviceGetAttribute( 1347 &TmpInt, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, Device), 1348 "Error returned from cuDeviceGetAttribute\n"); 1349 printf(" Async Engines: \t\t\t%s (%d) \n", BOOL2TEXT(TmpInt), TmpInt); 1350 checkResult(cuDeviceGetAttribute( 1351 &TmpInt, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, Device), 1352 "Error returned from cuDeviceGetAttribute\n"); 1353 printf(" Unified Addressing: \t\t%s \n", BOOL2TEXT(TmpInt)); 1354 checkResult(cuDeviceGetAttribute( 1355 &TmpInt, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, Device), 1356 "Error returned from cuDeviceGetAttribute\n"); 1357 printf(" 
Managed Memory: \t\t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Concurrent Managed Memory: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Preemption Supported: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Cooperative Launch: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Multi-Device Boards: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, Device),
        "Error returned from cuDeviceGetAttribute\n");
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt2, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Compute Capabilities: \t\t%d.%d \n", TmpInt, TmpInt2);
  }

  int waitEvent(const int DeviceId, __tgt_async_info *AsyncInfo,
                void *EventPtr) const {
    CUstream Stream = getStream(DeviceId, AsyncInfo);
    CUevent Event = reinterpret_cast<CUevent>(EventPtr);

    // We do not use CU_EVENT_WAIT_DEFAULT here because it is only available
    // from a specific CUDA version onwards and is defined as 0x0; in earlier
    // versions, the CUDA API documentation requires this argument to be 0x0.
    CUresult Err = cuStreamWaitEvent(Stream, Event, 0);
    if (Err != CUDA_SUCCESS) {
      DP("Error when waiting event. 
stream = " DPxMOD ", event = " DPxMOD "\n", 1398 DPxPTR(Stream), DPxPTR(Event)); 1399 CUDA_ERR_STRING(Err); 1400 return OFFLOAD_FAIL; 1401 } 1402 1403 return OFFLOAD_SUCCESS; 1404 } 1405 }; 1406 1407 DeviceRTLTy DeviceRTL; 1408 } // namespace 1409 1410 // Exposed library API function 1411 #ifdef __cplusplus 1412 extern "C" { 1413 #endif 1414 1415 int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) { 1416 return elf_check_machine(image, /* EM_CUDA */ 190); 1417 } 1418 1419 int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); } 1420 1421 int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) { 1422 DP("Init requires flags to %" PRId64 "\n", RequiresFlags); 1423 DeviceRTL.setRequiresFlag(RequiresFlags); 1424 return RequiresFlags; 1425 } 1426 1427 int32_t __tgt_rtl_is_data_exchangable(int32_t src_dev_id, int dst_dev_id) { 1428 if (DeviceRTL.isValidDeviceId(src_dev_id) && 1429 DeviceRTL.isValidDeviceId(dst_dev_id)) 1430 return 1; 1431 1432 return 0; 1433 } 1434 1435 int32_t __tgt_rtl_init_device(int32_t device_id) { 1436 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1437 1438 return DeviceRTL.initDevice(device_id); 1439 } 1440 1441 __tgt_target_table *__tgt_rtl_load_binary(int32_t device_id, 1442 __tgt_device_image *image) { 1443 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1444 1445 return DeviceRTL.loadBinary(device_id, image); 1446 } 1447 1448 void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *, 1449 int32_t kind) { 1450 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1451 1452 return DeviceRTL.dataAlloc(device_id, size, (TargetAllocTy)kind); 1453 } 1454 1455 int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr, 1456 int64_t size) { 1457 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1458 1459 __tgt_async_info AsyncInfo; 1460 const int32_t rc = __tgt_rtl_data_submit_async(device_id, tgt_ptr, hst_ptr, 1461 size, &AsyncInfo); 1462 if (rc != OFFLOAD_SUCCESS) 1463 return OFFLOAD_FAIL; 1464 1465 return __tgt_rtl_synchronize(device_id, &AsyncInfo); 1466 } 1467 1468 int32_t __tgt_rtl_data_submit_async(int32_t device_id, void *tgt_ptr, 1469 void *hst_ptr, int64_t size, 1470 __tgt_async_info *async_info_ptr) { 1471 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1472 assert(async_info_ptr && "async_info_ptr is nullptr"); 1473 1474 return DeviceRTL.dataSubmit(device_id, tgt_ptr, hst_ptr, size, 1475 async_info_ptr); 1476 } 1477 1478 int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr, 1479 int64_t size) { 1480 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1481 1482 __tgt_async_info AsyncInfo; 1483 const int32_t rc = __tgt_rtl_data_retrieve_async(device_id, hst_ptr, tgt_ptr, 1484 size, &AsyncInfo); 1485 if (rc != OFFLOAD_SUCCESS) 1486 return OFFLOAD_FAIL; 1487 1488 return __tgt_rtl_synchronize(device_id, &AsyncInfo); 1489 } 1490 1491 int32_t __tgt_rtl_data_retrieve_async(int32_t device_id, void *hst_ptr, 1492 void *tgt_ptr, int64_t size, 1493 __tgt_async_info *async_info_ptr) { 1494 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1495 assert(async_info_ptr && "async_info_ptr is nullptr"); 1496 1497 return DeviceRTL.dataRetrieve(device_id, hst_ptr, tgt_ptr, size, 1498 async_info_ptr); 1499 } 1500 1501 int32_t __tgt_rtl_data_exchange_async(int32_t src_dev_id, void *src_ptr, 1502 int dst_dev_id, void *dst_ptr, 1503 int64_t size, 1504 
__tgt_async_info *AsyncInfo) { 1505 assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid"); 1506 assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid"); 1507 assert(AsyncInfo && "AsyncInfo is nullptr"); 1508 1509 return DeviceRTL.dataExchange(src_dev_id, src_ptr, dst_dev_id, dst_ptr, size, 1510 AsyncInfo); 1511 } 1512 1513 int32_t __tgt_rtl_data_exchange(int32_t src_dev_id, void *src_ptr, 1514 int32_t dst_dev_id, void *dst_ptr, 1515 int64_t size) { 1516 assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid"); 1517 assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid"); 1518 1519 __tgt_async_info AsyncInfo; 1520 const int32_t rc = __tgt_rtl_data_exchange_async( 1521 src_dev_id, src_ptr, dst_dev_id, dst_ptr, size, &AsyncInfo); 1522 if (rc != OFFLOAD_SUCCESS) 1523 return OFFLOAD_FAIL; 1524 1525 return __tgt_rtl_synchronize(src_dev_id, &AsyncInfo); 1526 } 1527 1528 int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) { 1529 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1530 1531 return DeviceRTL.dataDelete(device_id, tgt_ptr); 1532 } 1533 1534 int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr, 1535 void **tgt_args, 1536 ptrdiff_t *tgt_offsets, 1537 int32_t arg_num, int32_t team_num, 1538 int32_t thread_limit, 1539 uint64_t loop_tripcount) { 1540 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1541 1542 __tgt_async_info AsyncInfo; 1543 const int32_t rc = __tgt_rtl_run_target_team_region_async( 1544 device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num, 1545 thread_limit, loop_tripcount, &AsyncInfo); 1546 if (rc != OFFLOAD_SUCCESS) 1547 return OFFLOAD_FAIL; 1548 1549 return __tgt_rtl_synchronize(device_id, &AsyncInfo); 1550 } 1551 1552 int32_t __tgt_rtl_run_target_team_region_async( 1553 int32_t device_id, void *tgt_entry_ptr, void **tgt_args, 1554 ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num, 1555 int32_t thread_limit, uint64_t loop_tripcount, 1556 __tgt_async_info *async_info_ptr) { 1557 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1558 1559 return DeviceRTL.runTargetTeamRegion( 1560 device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num, 1561 thread_limit, loop_tripcount, async_info_ptr); 1562 } 1563 1564 int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr, 1565 void **tgt_args, ptrdiff_t *tgt_offsets, 1566 int32_t arg_num) { 1567 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1568 1569 __tgt_async_info AsyncInfo; 1570 const int32_t rc = __tgt_rtl_run_target_region_async( 1571 device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, &AsyncInfo); 1572 if (rc != OFFLOAD_SUCCESS) 1573 return OFFLOAD_FAIL; 1574 1575 return __tgt_rtl_synchronize(device_id, &AsyncInfo); 1576 } 1577 1578 int32_t __tgt_rtl_run_target_region_async(int32_t device_id, 1579 void *tgt_entry_ptr, void **tgt_args, 1580 ptrdiff_t *tgt_offsets, 1581 int32_t arg_num, 1582 __tgt_async_info *async_info_ptr) { 1583 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1584 1585 return __tgt_rtl_run_target_team_region_async( 1586 device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, 1587 /* team num*/ 1, /* thread_limit */ 1, /* loop_tripcount */ 0, 1588 async_info_ptr); 1589 } 1590 1591 int32_t __tgt_rtl_synchronize(int32_t device_id, 1592 __tgt_async_info *async_info_ptr) { 1593 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id 
is invalid"); 1594 assert(async_info_ptr && "async_info_ptr is nullptr"); 1595 assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr"); 1596 1597 return DeviceRTL.synchronize(device_id, async_info_ptr); 1598 } 1599 1600 void __tgt_rtl_set_info_flag(uint32_t NewInfoLevel) { 1601 std::atomic<uint32_t> &InfoLevel = getInfoLevelInternal(); 1602 InfoLevel.store(NewInfoLevel); 1603 } 1604 1605 void __tgt_rtl_print_device_info(int32_t device_id) { 1606 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1607 DeviceRTL.printDeviceInfo(device_id); 1608 } 1609 1610 int32_t __tgt_rtl_create_event(int32_t device_id, void **event) { 1611 assert(event && "event is nullptr"); 1612 return createEvent(event); 1613 } 1614 1615 int32_t __tgt_rtl_record_event(int32_t device_id, void *event_ptr, 1616 __tgt_async_info *async_info_ptr) { 1617 assert(async_info_ptr && "async_info_ptr is nullptr"); 1618 assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr"); 1619 assert(event_ptr && "event_ptr is nullptr"); 1620 1621 return recordEvent(event_ptr, async_info_ptr); 1622 } 1623 1624 int32_t __tgt_rtl_wait_event(int32_t device_id, void *event_ptr, 1625 __tgt_async_info *async_info_ptr) { 1626 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1627 assert(async_info_ptr && "async_info_ptr is nullptr"); 1628 assert(event_ptr && "event is nullptr"); 1629 1630 return DeviceRTL.waitEvent(device_id, async_info_ptr, event_ptr); 1631 } 1632 1633 int32_t __tgt_rtl_sync_event(int32_t device_id, void *event_ptr) { 1634 assert(event_ptr && "event is nullptr"); 1635 1636 return syncEvent(event_ptr); 1637 } 1638 1639 int32_t __tgt_rtl_destroy_event(int32_t device_id, void *event_ptr) { 1640 assert(event_ptr && "event is nullptr"); 1641 1642 return destroyEvent(event_ptr); 1643 } 1644 1645 #ifdef __cplusplus 1646 } 1647 #endif 1648
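
//===----------------------------------------------------------------------===//
// Illustrative sketch (not part of the plugin): a simplified view of the call
// sequence the libomptarget host runtime typically drives against the entry
// points above when offloading a single target region. Names such as
// `HostBuf` and `N` are placeholders for this sketch only; device selection,
// error handling, and the async variants are omitted.
//
//   __tgt_device_image *Image = /* image registered by the host runtime */;
//   int32_t DeviceId = 0;
//
//   if (__tgt_rtl_is_valid_binary(Image) &&
//       __tgt_rtl_init_device(DeviceId) == OFFLOAD_SUCCESS) {
//     __tgt_target_table *Table = __tgt_rtl_load_binary(DeviceId, Image);
//
//     // Map host data, run the kernel behind a table entry, copy back.
//     int64_t Size = sizeof(int) * N;
//     void *TgtPtr = __tgt_rtl_data_alloc(DeviceId, Size, HostBuf,
//                                         TARGET_ALLOC_DEFAULT);
//     __tgt_rtl_data_submit(DeviceId, TgtPtr, HostBuf, Size);
//
//     void *Args[] = {TgtPtr};
//     ptrdiff_t Offsets[] = {0};
//     __tgt_rtl_run_target_team_region(DeviceId, Table->EntriesBegin[0].addr,
//                                      Args, Offsets, /*arg_num=*/1,
//                                      /*team_num=*/0, /*thread_limit=*/0,
//                                      /*loop_tripcount=*/0);
//
//     __tgt_rtl_data_retrieve(DeviceId, HostBuf, TgtPtr, Size);
//     __tgt_rtl_data_delete(DeviceId, TgtPtr);
//   }
//===----------------------------------------------------------------------===//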