//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// RTL for CUDA machine
//
//===----------------------------------------------------------------------===//

#include <cassert>
#include <cstddef>
#include <cuda.h>
#include <list>
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>
#include <vector>

#include "Debug.h"
#include "omptargetplugin.h"

#define TARGET_NAME CUDA
#define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"

#include "MemoryManager.h"

#include "llvm/Frontend/OpenMP/OMPConstants.h"

// Utility for retrieving and printing CUDA error string.
#ifdef OMPTARGET_DEBUG
#define CUDA_ERR_STRING(err)                                                   \
  do {                                                                         \
    if (getDebugLevel() > 0) {                                                 \
      const char *errStr = nullptr;                                            \
      CUresult errStr_status = cuGetErrorString(err, &errStr);                 \
      if (errStr_status == CUDA_ERROR_INVALID_VALUE)                           \
        REPORT("Unrecognized CUDA error code: %d\n", err);                     \
      else if (errStr_status == CUDA_SUCCESS)                                  \
        REPORT("CUDA error is: %s\n", errStr);                                 \
      else {                                                                   \
        REPORT("Unresolved CUDA error code: %d\n", err);                       \
        REPORT("Unsuccessful cuGetErrorString return status: %d\n",            \
               errStr_status);                                                 \
      }                                                                        \
    } else {                                                                   \
      const char *errStr = nullptr;                                            \
      CUresult errStr_status = cuGetErrorString(err, &errStr);                 \
      if (errStr_status == CUDA_SUCCESS)                                       \
        REPORT("%s \n", errStr);                                               \
    }                                                                          \
  } while (false)
#else // OMPTARGET_DEBUG
#define CUDA_ERR_STRING(err)                                                   \
  do {                                                                         \
    const char *errStr = nullptr;                                              \
    CUresult errStr_status = cuGetErrorString(err, &errStr);                   \
    if (errStr_status == CUDA_SUCCESS)                                         \
      REPORT("%s \n", errStr);                                                 \
  } while (false)
#endif // OMPTARGET_DEBUG

#define BOOL2TEXT(b) ((b) ? "Yes" : "No")

#include "elf_common.h"

/// Keep entries table per device.
struct FuncOrGblEntryTy {
  __tgt_target_table Table;
  std::vector<__tgt_offload_entry> Entries;
};

/// Use a single entity to encode a kernel and a set of flags.
struct KernelTy {
  CUfunction Func;

  // execution mode of kernel
  llvm::omp::OMPTgtExecModeFlags ExecutionMode;

  /// Maximal number of threads per block for this kernel.
  int MaxThreadsPerBlock = 0;

  KernelTy(CUfunction _Func, llvm::omp::OMPTgtExecModeFlags _ExecutionMode)
      : Func(_Func), ExecutionMode(_ExecutionMode) {}
};

/// Device environment data
/// Manually sync with the deviceRTL side for now, move to a dedicated header
/// file later.
struct omptarget_device_environmentTy {
  int32_t debug_level;
  uint32_t num_devices;
  uint32_t device_num;
  uint64_t dynamic_shared_size;
};

namespace {
bool checkResult(CUresult Err, const char *ErrMsg) {
  if (Err == CUDA_SUCCESS)
    return true;

  REPORT("%s", ErrMsg);
  CUDA_ERR_STRING(Err);
  return false;
}

int memcpyDtoD(const void *SrcPtr, void *DstPtr, int64_t Size,
               CUstream Stream) {
  CUresult Err =
      cuMemcpyDtoDAsync((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, Size, Stream);

  if (Err != CUDA_SUCCESS) {
    DP("Error when copying data from device to device. Pointers: src "
       "= " DPxMOD ", dst = " DPxMOD ", size = %" PRId64 "\n",
       DPxPTR(SrcPtr), DPxPTR(DstPtr), Size);
    CUDA_ERR_STRING(Err);
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

int createEvent(void **P) {
  CUevent Event = nullptr;

  CUresult Err = cuEventCreate(&Event, CU_EVENT_DEFAULT);
  if (Err != CUDA_SUCCESS) {
    DP("Error when creating event = " DPxMOD "\n", DPxPTR(Event));
    CUDA_ERR_STRING(Err);
    return OFFLOAD_FAIL;
  }

  *P = Event;

  return OFFLOAD_SUCCESS;
}

int recordEvent(void *EventPtr, __tgt_async_info *AsyncInfo) {
  CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue);
  CUevent Event = reinterpret_cast<CUevent>(EventPtr);

  CUresult Err = cuEventRecord(Event, Stream);
  if (Err != CUDA_SUCCESS) {
    DP("Error when recording event. stream = " DPxMOD ", event = " DPxMOD "\n",
       DPxPTR(Stream), DPxPTR(Event));
    CUDA_ERR_STRING(Err);
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

int syncEvent(void *EventPtr) {
  CUevent Event = reinterpret_cast<CUevent>(EventPtr);

  CUresult Err = cuEventSynchronize(Event);
  if (Err != CUDA_SUCCESS) {
    DP("Error when syncing event = " DPxMOD "\n", DPxPTR(Event));
    CUDA_ERR_STRING(Err);
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

int destroyEvent(void *EventPtr) {
  CUevent Event = reinterpret_cast<CUevent>(EventPtr);

  CUresult Err = cuEventDestroy(Event);
  if (Err != CUDA_SUCCESS) {
    DP("Error when destroying event = " DPxMOD "\n", DPxPTR(Event));
    CUDA_ERR_STRING(Err);
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

// Structure containing per-device data
struct DeviceDataTy {
  /// List that contains all the kernels.
  std::list<KernelTy> KernelsList;

  std::list<FuncOrGblEntryTy> FuncGblEntries;

  CUcontext Context = nullptr;
  // Device properties
  int ThreadsPerBlock = 0;
  int BlocksPerGrid = 0;
  int WarpSize = 0;
  // OpenMP properties
  int NumTeams = 0;
  int NumThreads = 0;
};

class StreamManagerTy {
  int NumberOfDevices;
  // The initial size of stream pool
  int EnvNumInitialStreams;
  // Per-device stream mutex
  std::vector<std::unique_ptr<std::mutex>> StreamMtx;
  // Per-device stream Id indicates the next available stream in the pool
  std::vector<int> NextStreamId;
  // Per-device stream pool
  std::vector<std::vector<CUstream>> StreamPool;
  // Reference to per-device data
  std::vector<DeviceDataTy> &DeviceData;

  // If there is no CUstream left in the pool, resize the pool to allocate more
  // CUstreams. This function must be called while holding the device mutex;
  // the pool is never shrunk.
  void resizeStreamPool(const int DeviceId, const size_t NewSize) {
    std::vector<CUstream> &Pool = StreamPool[DeviceId];
    const size_t CurrentSize = Pool.size();
    assert(NewSize > CurrentSize && "new size is not larger than current size");

    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) {
      // Return if we cannot switch to the right context, so that we do not
      // create a bunch of streams that do not belong to the right device. The
      // offloading will fail later because the selected CUstream is nullptr.
      return;
    }

    Pool.resize(NewSize, nullptr);

    for (size_t I = CurrentSize; I < NewSize; ++I) {
      checkResult(cuStreamCreate(&Pool[I], CU_STREAM_NON_BLOCKING),
                  "Error returned from cuStreamCreate\n");
    }
  }

public:
  StreamManagerTy(const int NumberOfDevices,
                  std::vector<DeviceDataTy> &DeviceData)
      : NumberOfDevices(NumberOfDevices), EnvNumInitialStreams(32),
        DeviceData(DeviceData) {
    StreamPool.resize(NumberOfDevices);
    NextStreamId.resize(NumberOfDevices);
    StreamMtx.resize(NumberOfDevices);

    if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS"))
      EnvNumInitialStreams = std::stoi(EnvStr);

    // Initialize the next stream id
    std::fill(NextStreamId.begin(), NextStreamId.end(), 0);

    // Initialize stream mutexes
    for (std::unique_ptr<std::mutex> &Ptr : StreamMtx)
      Ptr = std::make_unique<std::mutex>();
  }

  ~StreamManagerTy() {
    // Destroy streams
    for (int I = 0; I < NumberOfDevices; ++I) {
      checkResult(cuCtxSetCurrent(DeviceData[I].Context),
                  "Error returned from cuCtxSetCurrent\n");

      for (CUstream &S : StreamPool[I]) {
        if (S)
          checkResult(cuStreamDestroy(S),
                      "Error returned from cuStreamDestroy\n");
      }
    }
  }

  // Get a CUstream from the pool. The per-device next stream id always points
  // to the next available CUstream: CUstreams [0, id-1] have been handed out,
  // and [id, ...] are still available. If there is no CUstream left, we
  // request more CUstreams from the CUDA RT. Each time a CUstream is handed
  // out, the id is incremented by one.
  // xxxxxs+++++++++
  //      ^
  //      id
  // After the assignment, the pool becomes the following and s is handed out.
  // xxxxxs+++++++++
  //       ^
  //       id
  CUstream getStream(const int DeviceId) {
    const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
    int &Id = NextStreamId[DeviceId];
    // No CUstream left in the pool, we need to request more from the CUDA RT
    if (Id == static_cast<int>(StreamPool[DeviceId].size())) {
      // By default we double the stream pool every time
      resizeStreamPool(DeviceId, Id * 2);
    }
    return StreamPool[DeviceId][Id++];
  }

  // Return a CUstream back to the pool. As mentioned above, the per-device
  // next stream id always points to the next available CUstream, so when we
  // return a CUstream, we first decrement the id and then copy the CUstream
  // back into that slot.
  // It is worth noting that streams may be returned in a different order than
  // they were handed out, so at some point the pool may contain two identical
  // CUstreams:
  // xxax+a+++++
  //     ^
  //     id
  // However, this does not matter because the two copies always sit on
  // opposite sides of id; the one on the left will eventually be overwritten
  // by another returned CUstream. Therefore, after several executions, the
  // order of the pool may differ from its initial state.
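  // Illustrative walk-through (a minimal sketch, assuming an initial pool of
  // four streams s0..s3):
  //   getStream()      -> s0   pool: [s0 s1 s2 s3], id = 1
  //   getStream()      -> s1   pool: [s0 s1 s2 s3], id = 2
  //   returnStream(s0)         pool: [s0 s0 s2 s3], id = 1
  //   getStream()      -> s0   pool: [s0 s0 s2 s3], id = 2 (s1 still in use)
  // The stale duplicate left of id is never handed out again before a later
  // returnStream call overwrites it.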
  void returnStream(const int DeviceId, CUstream Stream) {
    const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
    int &Id = NextStreamId[DeviceId];
    assert(Id > 0 && "Wrong stream ID");
    StreamPool[DeviceId][--Id] = Stream;
  }

  bool initializeDeviceStreamPool(const int DeviceId) {
    assert(StreamPool[DeviceId].empty() && "stream pool has been initialized");

    resizeStreamPool(DeviceId, EnvNumInitialStreams);

    // Check the size of stream pool
    if (static_cast<int>(StreamPool[DeviceId].size()) != EnvNumInitialStreams)
      return false;

    // Check whether each stream is valid
    for (CUstream &S : StreamPool[DeviceId])
      if (!S)
        return false;

    return true;
  }
};

class DeviceRTLTy {
  int NumberOfDevices;
  // OpenMP environment properties
  int EnvNumTeams;
  int EnvTeamLimit;
  int EnvTeamThreadLimit;
  // OpenMP requires flags
  int64_t RequiresFlags;
  // Amount of dynamic shared memory to use at launch.
  uint64_t DynamicMemorySize;

  static constexpr const int HardTeamLimit = 1U << 16U; // 64k
  static constexpr const int HardThreadLimit = 1024;
  static constexpr const int DefaultNumTeams = 128;
  static constexpr const int DefaultNumThreads = 128;

  std::unique_ptr<StreamManagerTy> StreamManager;
  std::vector<DeviceDataTy> DeviceData;
  std::vector<CUmodule> Modules;

  /// A class responsible for interacting with device native runtime library to
  /// allocate and free memory.
  class CUDADeviceAllocatorTy : public DeviceAllocatorTy {
    const int DeviceId;
    const std::vector<DeviceDataTy> &DeviceData;
    std::unordered_map<void *, TargetAllocTy> HostPinnedAllocs;

  public:
    CUDADeviceAllocatorTy(int DeviceId, std::vector<DeviceDataTy> &DeviceData)
        : DeviceId(DeviceId), DeviceData(DeviceData) {}

    void *allocate(size_t Size, void *, TargetAllocTy Kind) override {
      if (Size == 0)
        return nullptr;

      CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
      if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
        return nullptr;

      void *MemAlloc = nullptr;
      switch (Kind) {
      case TARGET_ALLOC_DEFAULT:
      case TARGET_ALLOC_DEVICE:
        CUdeviceptr DevicePtr;
        Err = cuMemAlloc(&DevicePtr, Size);
        MemAlloc = (void *)DevicePtr;
        if (!checkResult(Err, "Error returned from cuMemAlloc\n"))
          return nullptr;
        break;
      case TARGET_ALLOC_HOST:
        void *HostPtr;
        Err = cuMemAllocHost(&HostPtr, Size);
        MemAlloc = HostPtr;
        if (!checkResult(Err, "Error returned from cuMemAllocHost\n"))
          return nullptr;
        HostPinnedAllocs[MemAlloc] = Kind;
        break;
      case TARGET_ALLOC_SHARED:
        CUdeviceptr SharedPtr;
        Err = cuMemAllocManaged(&SharedPtr, Size, CU_MEM_ATTACH_GLOBAL);
        MemAlloc = (void *)SharedPtr;
        if (!checkResult(Err, "Error returned from cuMemAllocManaged\n"))
          return nullptr;
        break;
      }

      return MemAlloc;
    }

    int free(void *TgtPtr) override {
      CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
      if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
        return OFFLOAD_FAIL;

      // Host pinned memory must be freed differently.
      TargetAllocTy Kind =
          (HostPinnedAllocs.find(TgtPtr) == HostPinnedAllocs.end())
              ? TARGET_ALLOC_DEFAULT
              : TARGET_ALLOC_HOST;
      switch (Kind) {
      case TARGET_ALLOC_DEFAULT:
      case TARGET_ALLOC_DEVICE:
      case TARGET_ALLOC_SHARED:
        Err = cuMemFree((CUdeviceptr)TgtPtr);
        if (!checkResult(Err, "Error returned from cuMemFree\n"))
          return OFFLOAD_FAIL;
        break;
      case TARGET_ALLOC_HOST:
        Err = cuMemFreeHost(TgtPtr);
        if (!checkResult(Err, "Error returned from cuMemFreeHost\n"))
          return OFFLOAD_FAIL;
        break;
      }

      return OFFLOAD_SUCCESS;
    }
  };

  /// A vector of device allocators
  std::vector<CUDADeviceAllocatorTy> DeviceAllocators;

  /// A vector of memory managers. Since the memory manager is non-copyable and
  /// non-removable, we wrap them into std::unique_ptr.
  std::vector<std::unique_ptr<MemoryManagerTy>> MemoryManagers;

  /// Whether to use the memory manager
  bool UseMemoryManager = true;

  // Record entry point associated with device
  void addOffloadEntry(const int DeviceId, const __tgt_offload_entry entry) {
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
    E.Entries.push_back(entry);
  }

  // Return a pointer to the entry associated with the pointer
  const __tgt_offload_entry *getOffloadEntry(const int DeviceId,
                                             const void *Addr) const {
    for (const __tgt_offload_entry &Itr :
         DeviceData[DeviceId].FuncGblEntries.back().Entries)
      if (Itr.addr == Addr)
        return &Itr;

    return nullptr;
  }

  // Return the pointer to the target entries table
  __tgt_target_table *getOffloadEntriesTable(const int DeviceId) {
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();

    if (E.Entries.empty())
      return nullptr;

    // Update table info according to the entries and return the pointer
    E.Table.EntriesBegin = E.Entries.data();
    E.Table.EntriesEnd = E.Entries.data() + E.Entries.size();

    return &E.Table;
  }

  // Clear entries table for a device
  void clearOffloadEntriesTable(const int DeviceId) {
    DeviceData[DeviceId].FuncGblEntries.emplace_back();
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
    E.Entries.clear();
    E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr;
  }

  CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfo) const {
    assert(AsyncInfo && "AsyncInfo is nullptr");

    if (!AsyncInfo->Queue)
      AsyncInfo->Queue = StreamManager->getStream(DeviceId);

    return reinterpret_cast<CUstream>(AsyncInfo->Queue);
  }

public:
  // This class should not be copied
  DeviceRTLTy(const DeviceRTLTy &) = delete;
  DeviceRTLTy(DeviceRTLTy &&) = delete;

  DeviceRTLTy()
      : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1),
        EnvTeamThreadLimit(-1), RequiresFlags(OMP_REQ_UNDEFINED),
        DynamicMemorySize(0) {

    DP("Start initializing CUDA\n");

    CUresult Err = cuInit(0);
    if (Err == CUDA_ERROR_INVALID_HANDLE) {
      // Can't call cuGetErrorString if dlsym failed
      DP("Failed to load CUDA shared library\n");
      return;
    }
    if (!checkResult(Err, "Error returned from cuInit\n")) {
      return;
    }

    Err = cuDeviceGetCount(&NumberOfDevices);
    if (!checkResult(Err, "Error returned from cuDeviceGetCount\n"))
      return;

    if (NumberOfDevices == 0) {
      DP("There are no devices supporting CUDA.\n");
      return;
    }

    DeviceData.resize(NumberOfDevices);

    // Get environment variables regarding teams
    if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) {
      // OMP_TEAM_LIMIT has been set
      EnvTeamLimit = std::stoi(EnvStr);
      DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
    }
    if (const char *EnvStr = getenv("OMP_TEAMS_THREAD_LIMIT")) {
      // OMP_TEAMS_THREAD_LIMIT has been set
      EnvTeamThreadLimit = std::stoi(EnvStr);
      DP("Parsed OMP_TEAMS_THREAD_LIMIT=%d\n", EnvTeamThreadLimit);
    }
    if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) {
      // OMP_NUM_TEAMS has been set
      EnvNumTeams = std::stoi(EnvStr);
      DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
    }
    if (const char *EnvStr = getenv("LIBOMPTARGET_SHARED_MEMORY_SIZE")) {
      // LIBOMPTARGET_SHARED_MEMORY_SIZE has been set
      DynamicMemorySize = std::stoi(EnvStr);
      DP("Parsed LIBOMPTARGET_SHARED_MEMORY_SIZE = %" PRIu64 "\n",
         DynamicMemorySize);
    }

    StreamManager =
        std::make_unique<StreamManagerTy>(NumberOfDevices, DeviceData);

    for (int I = 0; I < NumberOfDevices; ++I)
      DeviceAllocators.emplace_back(I, DeviceData);

    // Get the size threshold from environment variable
    std::pair<size_t, bool> Res = MemoryManagerTy::getSizeThresholdFromEnv();
    UseMemoryManager = Res.second;
    size_t MemoryManagerThreshold = Res.first;

    if (UseMemoryManager)
      for (int I = 0; I < NumberOfDevices; ++I)
        MemoryManagers.emplace_back(std::make_unique<MemoryManagerTy>(
            DeviceAllocators[I], MemoryManagerThreshold));
  }

  ~DeviceRTLTy() {
    // Destruct the memory managers first so that the data they depend on is
    // not destroyed before them.
    for (auto &M : MemoryManagers)
      M.release();

    StreamManager = nullptr;

    for (CUmodule &M : Modules)
      // Close module
      if (M)
        checkResult(cuModuleUnload(M), "Error returned from cuModuleUnload\n");

    for (DeviceDataTy &D : DeviceData) {
      // Destroy context
      if (D.Context) {
        checkResult(cuCtxSetCurrent(D.Context),
                    "Error returned from cuCtxSetCurrent\n");
        CUdevice Device;
        checkResult(cuCtxGetDevice(&Device),
                    "Error returned from cuCtxGetDevice\n");
        checkResult(cuDevicePrimaryCtxRelease(Device),
                    "Error returned from cuDevicePrimaryCtxRelease\n");
      }
    }
  }

  // Check whether a given DeviceId is valid
  bool isValidDeviceId(const int DeviceId) const {
    return DeviceId >= 0 && DeviceId < NumberOfDevices;
  }

  int getNumOfDevices() const { return NumberOfDevices; }

  void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; }

  int initDevice(const int DeviceId) {
    CUdevice Device;

    DP("Getting device %d\n", DeviceId);
    CUresult Err = cuDeviceGet(&Device, DeviceId);
    if (!checkResult(Err, "Error returned from cuDeviceGet\n"))
      return OFFLOAD_FAIL;

    // Query the current flags of the primary context and set its flags if
    // it is inactive
    unsigned int FormerPrimaryCtxFlags = 0;
    int FormerPrimaryCtxIsActive = 0;
    Err = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags,
                                     &FormerPrimaryCtxIsActive);
    if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxGetState\n"))
      return OFFLOAD_FAIL;

    if (FormerPrimaryCtxIsActive) {
      DP("The primary context is active, no change to its flags\n");
      if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) !=
          CU_CTX_SCHED_BLOCKING_SYNC)
        DP("Warning: the current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n");
    } else {
      DP("The primary context is inactive, set its flags to "
         "CU_CTX_SCHED_BLOCKING_SYNC\n");
      Err = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC);
      if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxSetFlags\n"))
        return OFFLOAD_FAIL;
    }

    // Retain the per device primary context and save it to use whenever this
    // device is selected.
    Err = cuDevicePrimaryCtxRetain(&DeviceData[DeviceId].Context, Device);
    if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxRetain\n"))
      return OFFLOAD_FAIL;

    Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    // Initialize stream pool
    if (!StreamManager->initializeDeviceStreamPool(DeviceId))
      return OFFLOAD_FAIL;

    // Query attributes to determine number of threads/block and blocks/grid.
    int MaxGridDimX;
    Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
                               Device);
    if (Err != CUDA_SUCCESS) {
      DP("Error getting max grid dimension, use default value %d\n",
         DeviceRTLTy::DefaultNumTeams);
      DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::DefaultNumTeams;
    } else if (MaxGridDimX <= DeviceRTLTy::HardTeamLimit) {
      DP("Using %d CUDA blocks per grid\n", MaxGridDimX);
      DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX;
    } else {
      DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping "
         "at the hard limit\n",
         MaxGridDimX, DeviceRTLTy::HardTeamLimit);
      DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::HardTeamLimit;
    }

    // We are only exploiting threads along the x axis.
    int MaxBlockDimX;
    Err = cuDeviceGetAttribute(&MaxBlockDimX,
                               CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device);
    if (Err != CUDA_SUCCESS) {
      DP("Error getting max block dimension, use default value %d\n",
         DeviceRTLTy::DefaultNumThreads);
      DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads;
    } else {
      DP("Using %d CUDA threads per block\n", MaxBlockDimX);
      DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX;

      if (EnvTeamThreadLimit > 0 &&
          DeviceData[DeviceId].ThreadsPerBlock > EnvTeamThreadLimit) {
        DP("Max CUDA threads per block %d exceeds the thread limit %d set by "
           "OMP_TEAMS_THREAD_LIMIT, capping at the limit\n",
           DeviceData[DeviceId].ThreadsPerBlock, EnvTeamThreadLimit);
        DeviceData[DeviceId].ThreadsPerBlock = EnvTeamThreadLimit;
      }
      if (DeviceData[DeviceId].ThreadsPerBlock > DeviceRTLTy::HardThreadLimit) {
        DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
           "capping at the hard limit\n",
           DeviceData[DeviceId].ThreadsPerBlock, DeviceRTLTy::HardThreadLimit);
        DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit;
      }
    }

    // Get and set warp size
    int WarpSize;
    Err =
        cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device);
    if (Err != CUDA_SUCCESS) {
      DP("Error getting warp size, assume default value 32\n");
      DeviceData[DeviceId].WarpSize = 32;
    } else {
      DP("Using warp size %d\n", WarpSize);
      DeviceData[DeviceId].WarpSize = WarpSize;
    }

    // Adjust teams to the env variables
    if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) {
      DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
         EnvTeamLimit);
      DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit;
    }

    size_t StackLimit;
    size_t HeapLimit;
    if (const char *EnvStr = getenv("LIBOMPTARGET_STACK_SIZE")) {
      StackLimit = std::stol(EnvStr);
      if (cuCtxSetLimit(CU_LIMIT_STACK_SIZE, StackLimit) != CUDA_SUCCESS)
        return OFFLOAD_FAIL;
    } else {
      if (cuCtxGetLimit(&StackLimit, CU_LIMIT_STACK_SIZE) != CUDA_SUCCESS)
        return OFFLOAD_FAIL;
    }
    if (const char *EnvStr = getenv("LIBOMPTARGET_HEAP_SIZE")) {
      HeapLimit = std::stol(EnvStr);
      if (cuCtxSetLimit(CU_LIMIT_MALLOC_HEAP_SIZE, HeapLimit) != CUDA_SUCCESS)
        return OFFLOAD_FAIL;
    } else {
      if (cuCtxGetLimit(&HeapLimit, CU_LIMIT_MALLOC_HEAP_SIZE) != CUDA_SUCCESS)
        return OFFLOAD_FAIL;
    }

    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
         "Device supports up to %d CUDA blocks and %d threads with a "
         "warp size of %d\n",
         DeviceData[DeviceId].BlocksPerGrid,
         DeviceData[DeviceId].ThreadsPerBlock, DeviceData[DeviceId].WarpSize);
    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
         "Device heap size is %d Bytes, device stack size is %d Bytes per "
         "thread\n",
         (int)HeapLimit, (int)StackLimit);

    // Set default number of teams
    if (EnvNumTeams > 0) {
      DP("Default number of teams set according to environment %d\n",
         EnvNumTeams);
      DeviceData[DeviceId].NumTeams = EnvNumTeams;
    } else {
      DeviceData[DeviceId].NumTeams = DeviceRTLTy::DefaultNumTeams;
      DP("Default number of teams set according to library's default %d\n",
         DeviceRTLTy::DefaultNumTeams);
    }

    if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) {
      DP("Default number of teams exceeds device limit, capping at %d\n",
         DeviceData[DeviceId].BlocksPerGrid);
      DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid;
    }

    // Set default number of threads
    DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads;
    DP("Default number of threads set according to library's default %d\n",
       DeviceRTLTy::DefaultNumThreads);
    if (DeviceData[DeviceId].NumThreads >
        DeviceData[DeviceId].ThreadsPerBlock) {
      DP("Default number of threads exceeds device limit, capping at %d\n",
         DeviceData[DeviceId].ThreadsPerBlock);
      DeviceData[DeviceId].NumThreads = DeviceData[DeviceId].ThreadsPerBlock;
    }

    return OFFLOAD_SUCCESS;
  }

  __tgt_target_table *loadBinary(const int DeviceId,
                                 const __tgt_device_image *Image) {
    // Set the context we are using
    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return nullptr;

    // Clear the offload table as we are going to create a new one.
    clearOffloadEntriesTable(DeviceId);

    // Create the module and extract the function pointers.
    CUmodule Module;
    DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart));
    Err = cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr);
    if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n"))
      return nullptr;

    DP("CUDA module successfully loaded!\n");

    Modules.push_back(Module);

    // Find the symbols in the module by name.
    const __tgt_offload_entry *HostBegin = Image->EntriesBegin;
    const __tgt_offload_entry *HostEnd = Image->EntriesEnd;

    std::list<KernelTy> &KernelsList = DeviceData[DeviceId].KernelsList;
    for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) {
      if (!E->addr) {
        // We return nullptr when something like this happens; the host should
        // always have something in the address to uniquely identify the
        // target region.
        DP("Invalid binary: host entry '<null>' (size = %zd)...\n", E->size);
        return nullptr;
      }

      if (E->size) {
        __tgt_offload_entry Entry = *E;
        CUdeviceptr CUPtr;
        size_t CUSize;
        Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name);
        // We keep this style here because we need the name
        if (Err != CUDA_SUCCESS) {
          REPORT("Loading global '%s' Failed\n", E->name);
          CUDA_ERR_STRING(Err);
          return nullptr;
        }

        if (CUSize != E->size) {
          DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name,
             CUSize, E->size);
          return nullptr;
        }

        DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
           DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr));

        Entry.addr = (void *)(CUPtr);

        // Note: In the current implementation declare target variables
        // can either be link or to. This means that once unified
        // memory is activated via the requires directive, the variable
        // can be used directly from the host in both cases.
        // TODO: when variable types other than to or link are added,
        // the below condition should be changed to explicitly
        // check for to and link variable types:
        // (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags &
        // OMP_DECLARE_TARGET_LINK || e->flags == OMP_DECLARE_TARGET_TO))
        if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
          // If unified memory is present any target link or to variables
          // can access host addresses directly. There is no longer a
          // need for device copies.
          cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *));
          DP("Copy linked variable host address (" DPxMOD
             ") to device address (" DPxMOD ")\n",
             DPxPTR(*((void **)E->addr)), DPxPTR(CUPtr));
        }

        addOffloadEntry(DeviceId, Entry);

        continue;
      }

      CUfunction Func;
      Err = cuModuleGetFunction(&Func, Module, E->name);
      // We keep this style here because we need the name
      if (Err != CUDA_SUCCESS) {
        REPORT("Loading '%s' Failed\n", E->name);
        CUDA_ERR_STRING(Err);
        return nullptr;
      }

      DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
         DPxPTR(E - HostBegin), E->name, DPxPTR(Func));

      // Default value GENERIC (in case the symbol is missing from the cubin
      // file).
      llvm::omp::OMPTgtExecModeFlags ExecModeVal =
          llvm::omp::OMP_TGT_EXEC_MODE_GENERIC;
      std::string ExecModeNameStr(E->name);
      ExecModeNameStr += "_exec_mode";
      const char *ExecModeName = ExecModeNameStr.c_str();

      CUdeviceptr ExecModePtr;
      size_t CUSize;
      Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName);
      if (Err == CUDA_SUCCESS) {
        if (CUSize != sizeof(llvm::omp::OMPTgtExecModeFlags)) {
          DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
             ExecModeName, CUSize, sizeof(llvm::omp::OMPTgtExecModeFlags));
          return nullptr;
        }

        Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize);
        if (Err != CUDA_SUCCESS) {
          REPORT("Error when copying data from device to host. Pointers: "
                 "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
                 DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize);
          CUDA_ERR_STRING(Err);
          return nullptr;
        }
      } else {
        DP("Loading global exec_mode '%s' - symbol missing, using default "
           "value GENERIC (1)\n",
           ExecModeName);
      }

      KernelsList.emplace_back(Func, ExecModeVal);

      __tgt_offload_entry Entry = *E;
      Entry.addr = &KernelsList.back();
      addOffloadEntry(DeviceId, Entry);
    }

    // send device environment data to the device
    {
      // TODO: The device ID used here is not the real device ID used by
      // OpenMP.
      omptarget_device_environmentTy DeviceEnv{
          0, static_cast<uint32_t>(NumberOfDevices),
          static_cast<uint32_t>(DeviceId), DynamicMemorySize};

#ifdef OMPTARGET_DEBUG
      if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG"))
        DeviceEnv.debug_level = std::stoi(EnvStr);
#endif

      const char *DeviceEnvName = "omptarget_device_environment";
      CUdeviceptr DeviceEnvPtr;
      size_t CUSize;

      Err = cuModuleGetGlobal(&DeviceEnvPtr, &CUSize, Module, DeviceEnvName);
      if (Err == CUDA_SUCCESS) {
        if (CUSize != sizeof(DeviceEnv)) {
          REPORT(
              "Global device_environment '%s' - size mismatch (%zu != %zu)\n",
              DeviceEnvName, CUSize, sizeof(DeviceEnv));
          CUDA_ERR_STRING(Err);
          return nullptr;
        }

        Err = cuMemcpyHtoD(DeviceEnvPtr, &DeviceEnv, CUSize);
        if (Err != CUDA_SUCCESS) {
          REPORT("Error when copying data from host to device. Pointers: "
                 "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
                 DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize);
          CUDA_ERR_STRING(Err);
          return nullptr;
        }

        DP("Sending global device environment data %zu bytes\n", CUSize);
      } else {
        DP("Finding global device environment '%s' - symbol missing.\n",
           DeviceEnvName);
        DP("Continue, considering this is a device RTL which does not accept "
           "environment setting.\n");
      }
    }

    return getOffloadEntriesTable(DeviceId);
  }

  void *dataAlloc(const int DeviceId, const int64_t Size,
                  const TargetAllocTy Kind) {
    switch (Kind) {
    case TARGET_ALLOC_DEFAULT:
    case TARGET_ALLOC_DEVICE:
      if (UseMemoryManager)
        return MemoryManagers[DeviceId]->allocate(Size, nullptr);
      else
        return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
    case TARGET_ALLOC_HOST:
    case TARGET_ALLOC_SHARED:
      return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
    }

    REPORT("Invalid target data allocation kind or requested allocator not "
           "implemented yet\n");

    return nullptr;
  }

  int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr,
                 const int64_t Size, __tgt_async_info *AsyncInfo) const {
    assert(AsyncInfo && "AsyncInfo is nullptr");

    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    CUstream Stream = getStream(DeviceId, AsyncInfo);

    Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream);
    if (Err != CUDA_SUCCESS) {
      DP("Error when copying data from host to device. Pointers: host "
         "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
         DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
      CUDA_ERR_STRING(Err);
      return OFFLOAD_FAIL;
    }

    return OFFLOAD_SUCCESS;
  }

  int dataRetrieve(const int DeviceId, void *HstPtr, const void *TgtPtr,
                   const int64_t Size, __tgt_async_info *AsyncInfo) const {
    assert(AsyncInfo && "AsyncInfo is nullptr");

    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    CUstream Stream = getStream(DeviceId, AsyncInfo);

    Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
    if (Err != CUDA_SUCCESS) {
      DP("Error when copying data from device to host. Pointers: host "
         "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
         DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
      CUDA_ERR_STRING(Err);
      return OFFLOAD_FAIL;
    }

    return OFFLOAD_SUCCESS;
  }

  int dataExchange(int SrcDevId, const void *SrcPtr, int DstDevId, void *DstPtr,
                   int64_t Size, __tgt_async_info *AsyncInfo) const {
    assert(AsyncInfo && "AsyncInfo is nullptr");

    CUresult Err = cuCtxSetCurrent(DeviceData[SrcDevId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    CUstream Stream = getStream(SrcDevId, AsyncInfo);

    // If they are two different devices, we try peer-to-peer copy first
    if (SrcDevId != DstDevId) {
      int CanAccessPeer = 0;
      Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId);
      if (Err != CUDA_SUCCESS) {
        REPORT("Error returned from cuDeviceCanAccessPeer. src = %" PRId32
               ", dst = %" PRId32 "\n",
               SrcDevId, DstDevId);
        CUDA_ERR_STRING(Err);
        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
      }

      if (!CanAccessPeer) {
        DP("P2P memcpy not supported so fall back to D2D memcpy\n");
        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
      }

      Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0);
      if (Err != CUDA_SUCCESS) {
        REPORT("Error returned from cuCtxEnablePeerAccess. src = %" PRId32
               ", dst = %" PRId32 "\n",
               SrcDevId, DstDevId);
        CUDA_ERR_STRING(Err);
        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
      }

      Err = cuMemcpyPeerAsync((CUdeviceptr)DstPtr, DeviceData[DstDevId].Context,
                              (CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context,
                              Size, Stream);
      if (Err == CUDA_SUCCESS)
        return OFFLOAD_SUCCESS;

      DP("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD
         ", src_id = %" PRId32 ", dst_ptr = " DPxMOD ", dst_id = %" PRId32 "\n",
         DPxPTR(SrcPtr), SrcDevId, DPxPTR(DstPtr), DstDevId);
      CUDA_ERR_STRING(Err);
    }

    return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
  }

  int dataDelete(const int DeviceId, void *TgtPtr) {
    if (UseMemoryManager)
      return MemoryManagers[DeviceId]->free(TgtPtr);

    return DeviceAllocators[DeviceId].free(TgtPtr);
  }

  int runTargetTeamRegion(const int DeviceId, void *TgtEntryPtr, void **TgtArgs,
                          ptrdiff_t *TgtOffsets, const int ArgNum,
                          const int TeamNum, const int ThreadLimit,
                          const unsigned int LoopTripCount,
                          __tgt_async_info *AsyncInfo) const {
    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    // All args are references.
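    // A minimal sketch of the layout built below (illustrative, for two
    // arguments):
    //   Ptrs[0] = TgtArgs[0] + TgtOffsets[0];   Args[0] = &Ptrs[0];
    //   Ptrs[1] = TgtArgs[1] + TgtOffsets[1];   Args[1] = &Ptrs[1];
    // cuLaunchKernel later receives Args as its kernelParams array and reads
    // each kernel parameter value through the corresponding Args[i] pointer.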
    std::vector<void *> Args(ArgNum);
    std::vector<void *> Ptrs(ArgNum);

    for (int I = 0; I < ArgNum; ++I) {
      Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
      Args[I] = &Ptrs[I];
    }

    KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr);

    const bool IsSPMDGenericMode =
        KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD;
    const bool IsSPMDMode =
        KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_SPMD;
    const bool IsGenericMode =
        KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC;

    int CudaThreadsPerBlock;
    if (ThreadLimit > 0) {
      DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
      CudaThreadsPerBlock = ThreadLimit;
      // Add master warp if necessary
      if (IsGenericMode) {
        DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize);
        CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
      }
    } else {
      DP("Setting CUDA threads per block to default %d\n",
         DeviceData[DeviceId].NumThreads);
      CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads;
    }

    if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) {
      DP("Threads per block capped at device limit %d\n",
         DeviceData[DeviceId].ThreadsPerBlock);
      CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock;
    }

    if (!KernelInfo->MaxThreadsPerBlock) {
      Err = cuFuncGetAttribute(&KernelInfo->MaxThreadsPerBlock,
                               CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                               KernelInfo->Func);
      if (!checkResult(Err, "Error returned from cuFuncGetAttribute\n"))
        return OFFLOAD_FAIL;
    }

    if (KernelInfo->MaxThreadsPerBlock < CudaThreadsPerBlock) {
      DP("Threads per block capped at kernel limit %d\n",
         KernelInfo->MaxThreadsPerBlock);
      CudaThreadsPerBlock = KernelInfo->MaxThreadsPerBlock;
    }

    unsigned int CudaBlocksPerGrid;
    if (TeamNum <= 0) {
      if (LoopTripCount > 0 && EnvNumTeams < 0) {
        if (IsSPMDGenericMode) {
          // If we reach this point, then we are executing a kernel that was
          // transformed from Generic-mode to SPMD-mode. This kernel has
          // SPMD-mode execution, but needs its blocks to be scheduled
          // differently because the current loop trip count only applies to
          // the `teams distribute` region and would create far too few blocks
          // using the regular SPMD-mode method.
          CudaBlocksPerGrid = LoopTripCount;
        } else if (IsSPMDMode) {
          // We have a combined construct, i.e. `target teams distribute
          // parallel for [simd]`. We launch enough teams so that each thread
          // will execute one iteration of the loop; round up to the nearest
          // integer.
          CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1;
        } else if (IsGenericMode) {
          // If we reach this point, then we have a non-combined construct,
          // i.e. `teams distribute` with a nested `parallel for` and each
          // team is assigned one iteration of the `distribute` loop. E.g.:
          //
          // #pragma omp target teams distribute
          // for(...loop_tripcount...) {
          //   #pragma omp parallel for
          //   for(...) {}
          // }
          //
          // Threads within a team will execute the iterations of the
          // `parallel` loop.
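          // Worked example with illustrative numbers: for a `distribute`
          // trip count of 2048, 2048 blocks are requested here, whereas the
          // SPMD branch above would request ((2048 - 1) / 128) + 1 = 16
          // blocks for 128 threads per block.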
          CudaBlocksPerGrid = LoopTripCount;
        } else {
          REPORT("Unknown execution mode: %d\n",
                 static_cast<int8_t>(KernelInfo->ExecutionMode));
          return OFFLOAD_FAIL;
        }
        DP("Using %d teams due to loop trip count %" PRIu32
           " and number of threads per block %d\n",
           CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock);
      } else {
        DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams);
        CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams;
      }
    } else if (TeamNum > DeviceData[DeviceId].BlocksPerGrid) {
      DP("Capping number of teams to team limit %d\n",
         DeviceData[DeviceId].BlocksPerGrid);
      CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid;
    } else {
      DP("Using requested number of teams %d\n", TeamNum);
      CudaBlocksPerGrid = TeamNum;
    }

    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
         "Launching kernel %s with %d blocks and %d threads in %s mode\n",
         (getOffloadEntry(DeviceId, TgtEntryPtr))
             ? getOffloadEntry(DeviceId, TgtEntryPtr)->name
             : "(null)",
         CudaBlocksPerGrid, CudaThreadsPerBlock,
         (!IsSPMDMode ? (IsGenericMode ? "Generic" : "SPMD-Generic") : "SPMD"));

    CUstream Stream = getStream(DeviceId, AsyncInfo);
    Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1,
                         /* gridDimZ */ 1, CudaThreadsPerBlock,
                         /* blockDimY */ 1, /* blockDimZ */ 1,
                         DynamicMemorySize, Stream, &Args[0], nullptr);
    if (!checkResult(Err, "Error returned from cuLaunchKernel\n"))
      return OFFLOAD_FAIL;

    DP("Launch of entry point at " DPxMOD " successful!\n",
       DPxPTR(TgtEntryPtr));

    return OFFLOAD_SUCCESS;
  }

  int synchronize(const int DeviceId, __tgt_async_info *AsyncInfo) const {
    CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue);
    CUresult Err = cuStreamSynchronize(Stream);

    // Once the stream is synchronized, return it to stream pool and reset
    // AsyncInfo. This is to make sure the synchronization only works for its
    // own tasks.
    StreamManager->returnStream(DeviceId,
                                reinterpret_cast<CUstream>(AsyncInfo->Queue));
    AsyncInfo->Queue = nullptr;

    if (Err != CUDA_SUCCESS) {
      DP("Error when synchronizing stream. stream = " DPxMOD
         ", async info ptr = " DPxMOD "\n",
         DPxPTR(Stream), DPxPTR(AsyncInfo));
      CUDA_ERR_STRING(Err);
    }
    return (Err == CUDA_SUCCESS) ? OFFLOAD_SUCCESS : OFFLOAD_FAIL;
  }

  void printDeviceInfo(int32_t device_id) {
    char TmpChar[1000];
    std::string TmpStr;
    size_t TmpSt;
    int TmpInt, TmpInt2, TmpInt3;

    CUdevice Device;
    checkResult(cuDeviceGet(&Device, device_id),
                "Error returned from cuDeviceGet\n");

    cuDriverGetVersion(&TmpInt);
    printf("    CUDA Driver Version: \t\t%d \n", TmpInt);
    printf("    CUDA Device Number: \t\t%d \n", device_id);
    checkResult(cuDeviceGetName(TmpChar, 1000, Device),
                "Error returned from cuDeviceGetName\n");
    printf("    Device Name: \t\t\t%s \n", TmpChar);
    checkResult(cuDeviceTotalMem(&TmpSt, Device),
                "Error returned from cuDeviceTotalMem\n");
    printf("    Global Memory Size: \t\t%zu bytes \n", TmpSt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Number of Multiprocessors: \t\t%d \n", TmpInt);
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Concurrent Copy and Execution: \t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Total Constant Memory: \t\t%d bytes\n", TmpInt);
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Max Shared Memory per Block: \t%d bytes \n", TmpInt);
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Registers per Block: \t\t%d \n", TmpInt);
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Warp Size: \t\t\t\t%d Threads \n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Maximum Threads per Block: \t\t%d \n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device),
                "Error returned from cuDeviceGetAttribute\n");
    checkResult(cuDeviceGetAttribute(
                    &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, Device),
                "Error returned from cuDeviceGetAttribute\n");
    checkResult(cuDeviceGetAttribute(
                    &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Maximum Block Dimensions: \t\t%d, %d, %d \n", TmpInt, TmpInt2,
           TmpInt3);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, Device),
                "Error returned from cuDeviceGetAttribute\n");
    checkResult(cuDeviceGetAttribute(
                    &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, Device),
                "Error returned from cuDeviceGetAttribute\n");
    checkResult(cuDeviceGetAttribute(
                    &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Maximum Grid Dimensions: \t\t%d x %d x %d \n", TmpInt, TmpInt2,
           TmpInt3);
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_PITCH, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Maximum Memory Pitch: \t\t%d bytes \n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Texture Alignment: \t\t\t%d bytes \n", TmpInt);
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Clock Rate: \t\t\t%d kHz\n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Execution Timeout: \t\t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_INTEGRATED, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Integrated Device: \t\t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Can Map Host Memory: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, Device),
        "Error returned from cuDeviceGetAttribute\n");
    if (TmpInt == CU_COMPUTEMODE_DEFAULT)
      TmpStr = "DEFAULT";
    else if (TmpInt == CU_COMPUTEMODE_PROHIBITED)
      TmpStr = "PROHIBITED";
    else if (TmpInt == CU_COMPUTEMODE_EXCLUSIVE_PROCESS)
      TmpStr = "EXCLUSIVE PROCESS";
    else
      TmpStr = "unknown";
    printf("    Compute Mode: \t\t\t%s \n", TmpStr.c_str());
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Concurrent Kernels: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    ECC Enabled: \t\t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Memory Clock Rate: \t\t\t%d kHz\n", TmpInt);
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Memory Bus Width: \t\t\t%d bits\n", TmpInt);
    checkResult(cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE,
                                     Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    L2 Cache Size: \t\t\t%d bytes \n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt,
                    CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Max Threads Per SMP: \t\t%d \n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Async Engines: \t\t\t%s (%d) \n", BOOL2TEXT(TmpInt), TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Unified Addressing: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Managed Memory: \t\t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Concurrent Managed Memory: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Preemption Supported: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Cooperative Launch: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Multi-Device Boards: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, Device),
        "Error returned from cuDeviceGetAttribute\n");
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt2, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Compute Capabilities: \t\t%d.%d \n", TmpInt, TmpInt2);
  }

  int waitEvent(const int DeviceId, __tgt_async_info *AsyncInfo,
                void *EventPtr) const {
    CUstream Stream = getStream(DeviceId, AsyncInfo);
    CUevent Event = reinterpret_cast<CUevent>(EventPtr);

    // We don't use CU_EVENT_WAIT_DEFAULT here as it is only available from a
    // specific CUDA version onwards, and is defined as 0x0. In earlier
    // versions, per the CUDA API documentation, that argument has to be 0x0.
    CUresult Err = cuStreamWaitEvent(Stream, Event, 0);
    if (Err != CUDA_SUCCESS) {
      DP("Error when waiting event. stream = " DPxMOD ", event = " DPxMOD "\n",
         DPxPTR(Stream), DPxPTR(Event));
      CUDA_ERR_STRING(Err);
      return OFFLOAD_FAIL;
    }

    return OFFLOAD_SUCCESS;
  }
};

DeviceRTLTy DeviceRTL;
} // namespace

// Exposed library API function
#ifdef __cplusplus
extern "C" {
#endif

int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
  return elf_check_machine(image, /* EM_CUDA */ 190);
}

int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); }

int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
  DP("Init requires flags to %" PRId64 "\n", RequiresFlags);
  DeviceRTL.setRequiresFlag(RequiresFlags);
  return RequiresFlags;
}

int32_t __tgt_rtl_is_data_exchangable(int32_t src_dev_id, int dst_dev_id) {
  if (DeviceRTL.isValidDeviceId(src_dev_id) &&
      DeviceRTL.isValidDeviceId(dst_dev_id))
    return 1;

  return 0;
}

int32_t __tgt_rtl_init_device(int32_t device_id) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.initDevice(device_id);
}

__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
                                          __tgt_device_image *image) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.loadBinary(device_id, image);
}

void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *,
                           int32_t kind) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.dataAlloc(device_id, size, (TargetAllocTy)kind);
}

int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
                              int64_t size) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_data_submit_async(device_id, tgt_ptr, hst_ptr,
                                                 size, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &AsyncInfo);
}

int32_t __tgt_rtl_data_submit_async(int32_t device_id, void *tgt_ptr,
                                    void *hst_ptr, int64_t size,
                                    __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");

  return DeviceRTL.dataSubmit(device_id, tgt_ptr, hst_ptr, size,
                              async_info_ptr);
}

int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
                                int64_t size) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_data_retrieve_async(device_id, hst_ptr, tgt_ptr,
                                                   size, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &AsyncInfo);
}

int32_t __tgt_rtl_data_retrieve_async(int32_t device_id, void *hst_ptr,
                                      void *tgt_ptr, int64_t size,
                                      __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");

  return DeviceRTL.dataRetrieve(device_id, hst_ptr, tgt_ptr, size,
                                async_info_ptr);
}

int32_t __tgt_rtl_data_exchange_async(int32_t src_dev_id, void *src_ptr,
                                      int dst_dev_id, void *dst_ptr,
                                      int64_t size,
                                      __tgt_async_info *AsyncInfo) {
  assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
  assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");
  assert(AsyncInfo && "AsyncInfo is nullptr");

  return DeviceRTL.dataExchange(src_dev_id, src_ptr, dst_dev_id, dst_ptr, size,
                                AsyncInfo);
}

int32_t __tgt_rtl_data_exchange(int32_t src_dev_id, void *src_ptr,
                                int32_t dst_dev_id, void *dst_ptr,
                                int64_t size) {
  assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
  assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_data_exchange_async(
      src_dev_id, src_ptr, dst_dev_id, dst_ptr, size, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(src_dev_id, &AsyncInfo);
}

int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.dataDelete(device_id, tgt_ptr);
}

int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
                                         void **tgt_args,
                                         ptrdiff_t *tgt_offsets,
                                         int32_t arg_num, int32_t team_num,
                                         int32_t thread_limit,
                                         uint64_t loop_tripcount) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_run_target_team_region_async(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
      thread_limit, loop_tripcount, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &AsyncInfo);
}

int32_t __tgt_rtl_run_target_team_region_async(
    int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
    ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
    int32_t thread_limit, uint64_t loop_tripcount,
    __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.runTargetTeamRegion(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
      thread_limit, loop_tripcount, async_info_ptr);
}

int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
                                    void **tgt_args, ptrdiff_t *tgt_offsets,
                                    int32_t arg_num) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_run_target_region_async(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &AsyncInfo);
}

int32_t __tgt_rtl_run_target_region_async(int32_t device_id,
                                          void *tgt_entry_ptr, void **tgt_args,
                                          ptrdiff_t *tgt_offsets,
                                          int32_t arg_num,
                                          __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return __tgt_rtl_run_target_team_region_async(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num,
      /* team num */ 1, /* thread_limit */ 1, /* loop_tripcount */ 0,
      async_info_ptr);
}

int32_t __tgt_rtl_synchronize(int32_t device_id,
                              __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");
  assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr");

  return DeviceRTL.synchronize(device_id, async_info_ptr);
}

void __tgt_rtl_set_info_flag(uint32_t NewInfoLevel) {
  std::atomic<uint32_t> &InfoLevel = getInfoLevelInternal();
  InfoLevel.store(NewInfoLevel);
}

void __tgt_rtl_print_device_info(int32_t device_id) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  DeviceRTL.printDeviceInfo(device_id);
}

int32_t __tgt_rtl_create_event(int32_t device_id, void **event) {
  assert(event && "event is nullptr");
  return createEvent(event);
}

int32_t __tgt_rtl_record_event(int32_t device_id, void *event_ptr,
                               __tgt_async_info *async_info_ptr) {
  assert(async_info_ptr && "async_info_ptr is nullptr");
  assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr");
  assert(event_ptr && "event_ptr is nullptr");

  return recordEvent(event_ptr, async_info_ptr);
}

int32_t __tgt_rtl_wait_event(int32_t device_id, void *event_ptr,
                             __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");
  assert(event_ptr && "event is nullptr");

  return DeviceRTL.waitEvent(device_id, async_info_ptr, event_ptr);
}

int32_t __tgt_rtl_sync_event(int32_t device_id, void *event_ptr) {
  assert(event_ptr && "event is nullptr");

  return syncEvent(event_ptr);
}

int32_t __tgt_rtl_destroy_event(int32_t device_id, void *event_ptr) {
  assert(event_ptr && "event is nullptr");

  return destroyEvent(event_ptr);
}

#ifdef __cplusplus
}
#endif
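
// Informal usage sketch (not part of the plugin interface; the exact calls
// and their ordering are driven by libomptarget): a single target region
// typically exercises the entry points above roughly as follows:
//
//   __tgt_rtl_init_device(Dev);
//   __tgt_rtl_load_binary(Dev, Image);   // returns the offload entries table
//   void *TgtPtr = __tgt_rtl_data_alloc(Dev, Size, nullptr,
//                                       TARGET_ALLOC_DEFAULT);
//   __tgt_rtl_data_submit(Dev, TgtPtr, HstPtr, Size);
//   __tgt_rtl_run_target_region(Dev, EntryAddr, TgtArgs, TgtOffsets, ArgNum);
//   __tgt_rtl_data_retrieve(Dev, HstPtr, TgtPtr, Size);
//   __tgt_rtl_data_delete(Dev, TgtPtr);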