//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// RTL for CUDA machine
//
//===----------------------------------------------------------------------===//

#include <cassert>
#include <cstddef>
#include <cuda.h>
#include <list>
#include <memory>
#include <mutex>
#include <string>
#include <vector>

#include "Debug.h"
#include "omptargetplugin.h"

#define TARGET_NAME CUDA
#define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"

#include "MemoryManager.h"

// Utility for retrieving and printing CUDA error string.
#ifdef OMPTARGET_DEBUG
#define CUDA_ERR_STRING(err)                                                   \
  do {                                                                         \
    if (getDebugLevel() > 0) {                                                 \
      const char *errStr = nullptr;                                            \
      CUresult errStr_status = cuGetErrorString(err, &errStr);                 \
      if (errStr_status == CUDA_ERROR_INVALID_VALUE)                           \
        REPORT("Unrecognized CUDA error code: %d\n", err);                     \
      else if (errStr_status == CUDA_SUCCESS)                                  \
        REPORT("CUDA error is: %s\n", errStr);                                 \
      else {                                                                   \
        REPORT("Unresolved CUDA error code: %d\n", err);                       \
        REPORT("Unsuccessful cuGetErrorString return status: %d\n",            \
               errStr_status);                                                 \
      }                                                                        \
    } else {                                                                   \
      const char *errStr = nullptr;                                            \
      CUresult errStr_status = cuGetErrorString(err, &errStr);                 \
      if (errStr_status == CUDA_SUCCESS)                                       \
        REPORT("%s \n", errStr);                                               \
    }                                                                          \
  } while (false)
#else // OMPTARGET_DEBUG
#define CUDA_ERR_STRING(err)                                                   \
  do {                                                                         \
    const char *errStr = nullptr;                                              \
    CUresult errStr_status = cuGetErrorString(err, &errStr);                   \
    if (errStr_status == CUDA_SUCCESS)                                         \
      REPORT("%s \n", errStr);                                                 \
  } while (false)
#endif // OMPTARGET_DEBUG

#include "elf_common.h"

/// Keep entries table per device.
struct FuncOrGblEntryTy {
  __tgt_target_table Table;
  std::vector<__tgt_offload_entry> Entries;
};

enum ExecutionModeType {
  SPMD,    // constructors, destructors,
           // combined constructs (`teams distribute parallel for [simd]`)
  GENERIC, // everything else
  NONE
};

/// Use a single entity to encode a kernel and a set of flags.
struct KernelTy {
  CUfunction Func;

  // execution mode of kernel
  // 0 - SPMD mode (without master warp)
  // 1 - Generic mode (with master warp)
  int8_t ExecutionMode;

  /// Maximal number of threads per block for this kernel.
  int MaxThreadsPerBlock = 0;

  KernelTy(CUfunction _Func, int8_t _ExecutionMode)
      : Func(_Func), ExecutionMode(_ExecutionMode) {}
};

/// Device environment data
/// Manually sync with the deviceRTL side for now, move to a dedicated header
/// file later.
struct omptarget_device_environmentTy {
  int32_t debug_level;
};

namespace {
bool checkResult(CUresult Err, const char *ErrMsg) {
  if (Err == CUDA_SUCCESS)
    return true;

  REPORT("%s", ErrMsg);
  CUDA_ERR_STRING(Err);
  return false;
}

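/// Asynchronously copy Size bytes from one device pointer to another on the
/// given stream. Returns OFFLOAD_SUCCESS or OFFLOAD_FAIL.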
int memcpyDtoD(const void *SrcPtr, void *DstPtr, int64_t Size,
               CUstream Stream) {
  CUresult Err =
      cuMemcpyDtoDAsync((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, Size, Stream);

  if (Err != CUDA_SUCCESS) {
    DP("Error when copying data from device to device. Pointers: src "
       "= " DPxMOD ", dst = " DPxMOD ", size = %" PRId64 "\n",
       DPxPTR(SrcPtr), DPxPTR(DstPtr), Size);
    CUDA_ERR_STRING(Err);
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

// Structure that contains per-device data
struct DeviceDataTy {
  /// List that contains all the kernels.
  std::list<KernelTy> KernelsList;

  std::list<FuncOrGblEntryTy> FuncGblEntries;

  CUcontext Context = nullptr;
  // Device properties
  int ThreadsPerBlock = 0;
  int BlocksPerGrid = 0;
  int WarpSize = 0;
  // OpenMP properties
  int NumTeams = 0;
  int NumThreads = 0;
};

class StreamManagerTy {
  int NumberOfDevices;
  // The initial size of stream pool
  int EnvNumInitialStreams;
  // Per-device stream mutex
  std::vector<std::unique_ptr<std::mutex>> StreamMtx;
  // Per-device stream Id indicates the next available stream in the pool
  std::vector<int> NextStreamId;
  // Per-device stream pool
  std::vector<std::vector<CUstream>> StreamPool;
  // Reference to per-device data
  std::vector<DeviceDataTy> &DeviceData;

  // If there is no CUstream left in the pool, we will resize the pool to
  // allocate more CUstreams. This function should be called with the device
  // mutex held, and it never shrinks the pool.
  void resizeStreamPool(const int DeviceId, const size_t NewSize) {
    std::vector<CUstream> &Pool = StreamPool[DeviceId];
    const size_t CurrentSize = Pool.size();
    assert(NewSize > CurrentSize && "new size is not larger than current size");

    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) {
      // Return if we cannot switch to the right context, to avoid creating a
      // bunch of streams that do not belong to the right device. Offloading
      // will fail later because the selected CUstream is nullptr.
      return;
    }

    Pool.resize(NewSize, nullptr);

    for (size_t I = CurrentSize; I < NewSize; ++I) {
      checkResult(cuStreamCreate(&Pool[I], CU_STREAM_NON_BLOCKING),
                  "Error returned from cuStreamCreate\n");
    }
  }

public:
  StreamManagerTy(const int NumberOfDevices,
                  std::vector<DeviceDataTy> &DeviceData)
      : NumberOfDevices(NumberOfDevices), EnvNumInitialStreams(32),
        DeviceData(DeviceData) {
    StreamPool.resize(NumberOfDevices);
    NextStreamId.resize(NumberOfDevices);
    StreamMtx.resize(NumberOfDevices);

    if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS"))
      EnvNumInitialStreams = std::stoi(EnvStr);

    // Initialize the next stream id
    std::fill(NextStreamId.begin(), NextStreamId.end(), 0);

    // Initialize stream mutex
    for (std::unique_ptr<std::mutex> &Ptr : StreamMtx)
      Ptr = std::make_unique<std::mutex>();
  }

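  // Destroy all remaining streams in each device's pool. The corresponding
  // device contexts must still be alive at this point.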
  ~StreamManagerTy() {
    // Destroy streams
    for (int I = 0; I < NumberOfDevices; ++I) {
      checkResult(cuCtxSetCurrent(DeviceData[I].Context),
                  "Error returned from cuCtxSetCurrent\n");

      for (CUstream &S : StreamPool[I]) {
        if (S)
          checkResult(cuStreamDestroy(S),
                      "Error returned from cuStreamDestroy\n");
      }
    }
  }

  // Get a CUstream from the pool. The per-device next stream id always points
  // to the next available CUstream; that is, CUstreams [0, id-1] have been
  // assigned and [id, ...] are still available. If there is no CUstream left,
  // we ask the CUDA RT for more. Each time a CUstream is assigned, the id
  // increases by one.
  // xxxxxs+++++++++
  //      ^
  //      id
  // After assignment, the pool becomes the following and s is assigned.
  // xxxxxs+++++++++
  //       ^
  //       id
  CUstream getStream(const int DeviceId) {
    const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
    int &Id = NextStreamId[DeviceId];
    // No CUstream left in the pool, we need to request from CUDA RT
    if (Id == static_cast<int>(StreamPool[DeviceId].size())) {
      // By default we double the stream pool every time
      resizeStreamPool(DeviceId, Id * 2);
    }
    return StreamPool[DeviceId][Id++];
  }

  // Return a CUstream to the pool. As mentioned above, the per-device next
  // stream id always points to the next available CUstream, so when we return
  // a CUstream we first decrease the id and then copy the CUstream back.
  // Note that streams may be returned in a different order than they were
  // assigned, so at some point the pool may contain two identical CUstreams:
  // xxax+a+++++
  //     ^
  //     id
  // This does not matter, because the duplicates are always on opposite sides
  // of the id: the one on the left will eventually be overwritten by another
  // CUstream. Therefore, after several executions, the order of the pool may
  // differ from its initial state.
  void returnStream(const int DeviceId, CUstream Stream) {
    const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
    int &Id = NextStreamId[DeviceId];
    assert(Id > 0 && "Wrong stream ID");
    StreamPool[DeviceId][--Id] = Stream;
  }

  bool initializeDeviceStreamPool(const int DeviceId) {
    assert(StreamPool[DeviceId].empty() && "stream pool has been initialized");

    resizeStreamPool(DeviceId, EnvNumInitialStreams);

    // Check the size of stream pool
    if (static_cast<int>(StreamPool[DeviceId].size()) != EnvNumInitialStreams)
      return false;

    // Check whether each stream is valid
    for (CUstream &S : StreamPool[DeviceId])
      if (!S)
        return false;

    return true;
  }
};

class DeviceRTLTy {
  int NumberOfDevices;
  // OpenMP environment properties
  int EnvNumTeams;
  int EnvTeamLimit;
  // OpenMP requires flags
  int64_t RequiresFlags;

  static constexpr const int HardTeamLimit = 1U << 16U; // 64k
  static constexpr const int HardThreadLimit = 1024;
  static constexpr const int DefaultNumTeams = 128;
  static constexpr const int DefaultNumThreads = 128;

  std::unique_ptr<StreamManagerTy> StreamManager;
  std::vector<DeviceDataTy> DeviceData;
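  /// All CUDA modules loaded via loadBinary; unloaded in the destructor.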
  std::vector<CUmodule> Modules;

  /// A class responsible for interacting with device native runtime library to
  /// allocate and free memory.
  class CUDADeviceAllocatorTy : public DeviceAllocatorTy {
    const int DeviceId;
    const std::vector<DeviceDataTy> &DeviceData;

  public:
    CUDADeviceAllocatorTy(int DeviceId, std::vector<DeviceDataTy> &DeviceData)
        : DeviceId(DeviceId), DeviceData(DeviceData) {}

    void *allocate(size_t Size, void *) override {
      if (Size == 0)
        return nullptr;

      CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
      if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
        return nullptr;

      CUdeviceptr DevicePtr;
      Err = cuMemAlloc(&DevicePtr, Size);
      if (!checkResult(Err, "Error returned from cuMemAlloc\n"))
        return nullptr;

      return (void *)DevicePtr;
    }

    int free(void *TgtPtr) override {
      CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
      if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
        return OFFLOAD_FAIL;

      Err = cuMemFree((CUdeviceptr)TgtPtr);
      if (!checkResult(Err, "Error returned from cuMemFree\n"))
        return OFFLOAD_FAIL;

      return OFFLOAD_SUCCESS;
    }
  };

  /// A vector of device allocators
  std::vector<CUDADeviceAllocatorTy> DeviceAllocators;

  /// A vector of memory managers. Since the memory managers are non-copyable
  /// and non-removable, we wrap them in std::unique_ptr.
  std::vector<std::unique_ptr<MemoryManagerTy>> MemoryManagers;

  /// Whether to use the memory manager
  bool UseMemoryManager = true;

  // Record entry point associated with device
  void addOffloadEntry(const int DeviceId, const __tgt_offload_entry entry) {
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
    E.Entries.push_back(entry);
  }

  // Return a pointer to the entry associated with the pointer
  const __tgt_offload_entry *getOffloadEntry(const int DeviceId,
                                             const void *Addr) const {
    for (const __tgt_offload_entry &Itr :
         DeviceData[DeviceId].FuncGblEntries.back().Entries)
      if (Itr.addr == Addr)
        return &Itr;

    return nullptr;
  }

  // Return the pointer to the target entries table
  __tgt_target_table *getOffloadEntriesTable(const int DeviceId) {
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();

    if (E.Entries.empty())
      return nullptr;

    // Update table info according to the entries and return the pointer
    E.Table.EntriesBegin = E.Entries.data();
    E.Table.EntriesEnd = E.Entries.data() + E.Entries.size();

    return &E.Table;
  }

  // Clear entries table for a device
  void clearOffloadEntriesTable(const int DeviceId) {
    DeviceData[DeviceId].FuncGblEntries.emplace_back();
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
    E.Entries.clear();
    E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr;
  }

  CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfo) const {
    assert(AsyncInfo && "AsyncInfo is nullptr");

    if (!AsyncInfo->Queue)
      AsyncInfo->Queue = StreamManager->getStream(DeviceId);

    return reinterpret_cast<CUstream>(AsyncInfo->Queue);
  }

public:
  // This class should not be copied
  DeviceRTLTy(const DeviceRTLTy &) = delete;
  DeviceRTLTy(DeviceRTLTy &&) = delete;

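  // Initialize the CUDA driver, query the number of devices, read the
  // OMP_TEAM_LIMIT and OMP_NUM_TEAMS environment variables, and set up the
  // stream manager, the per-device allocators, and (if enabled) the memory
  // managers.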
  DeviceRTLTy()
      : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1),
        RequiresFlags(OMP_REQ_UNDEFINED) {

    DP("Start initializing CUDA\n");

    CUresult Err = cuInit(0);
    if (Err == CUDA_ERROR_INVALID_HANDLE) {
      // Can't call cuGetErrorString if dlsym failed
      DP("Failed to load CUDA shared library\n");
      return;
    }
    if (!checkResult(Err, "Error returned from cuInit\n")) {
      return;
    }

    Err = cuDeviceGetCount(&NumberOfDevices);
    if (!checkResult(Err, "Error returned from cuDeviceGetCount\n"))
      return;

    if (NumberOfDevices == 0) {
      DP("There are no devices supporting CUDA.\n");
      return;
    }

    DeviceData.resize(NumberOfDevices);

    // Get environment variables regarding teams
    if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) {
      // OMP_TEAM_LIMIT has been set
      EnvTeamLimit = std::stoi(EnvStr);
      DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
    }
    if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) {
      // OMP_NUM_TEAMS has been set
      EnvNumTeams = std::stoi(EnvStr);
      DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
    }

    StreamManager =
        std::make_unique<StreamManagerTy>(NumberOfDevices, DeviceData);

    for (int I = 0; I < NumberOfDevices; ++I)
      DeviceAllocators.emplace_back(I, DeviceData);

    // Get the size threshold from environment variable
    std::pair<size_t, bool> Res = MemoryManagerTy::getSizeThresholdFromEnv();
    UseMemoryManager = Res.second;
    size_t MemoryManagerThreshold = Res.first;

    if (UseMemoryManager)
      for (int I = 0; I < NumberOfDevices; ++I)
        MemoryManagers.emplace_back(std::make_unique<MemoryManagerTy>(
            DeviceAllocators[I], MemoryManagerThreshold));
  }

  ~DeviceRTLTy() {
    // We first destruct memory managers in case their dependent data are
    // destroyed before them.
    for (auto &M : MemoryManagers)
      M.release();

    StreamManager = nullptr;

    for (CUmodule &M : Modules)
      // Close module
      if (M)
        checkResult(cuModuleUnload(M), "Error returned from cuModuleUnload\n");

    for (DeviceDataTy &D : DeviceData) {
      // Destroy context
      if (D.Context) {
        checkResult(cuCtxSetCurrent(D.Context),
                    "Error returned from cuCtxSetCurrent\n");
        CUdevice Device;
        checkResult(cuCtxGetDevice(&Device),
                    "Error returned from cuCtxGetDevice\n");
        checkResult(cuDevicePrimaryCtxRelease(Device),
                    "Error returned from cuDevicePrimaryCtxRelease\n");
      }
    }
  }

  // Check whether a given DeviceId is valid
  bool isValidDeviceId(const int DeviceId) const {
    return DeviceId >= 0 && DeviceId < NumberOfDevices;
  }

  int getNumOfDevices() const { return NumberOfDevices; }

  void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; }

  int initDevice(const int DeviceId) {
    CUdevice Device;

    DP("Getting device %d\n", DeviceId);
    CUresult Err = cuDeviceGet(&Device, DeviceId);
    if (!checkResult(Err, "Error returned from cuDeviceGet\n"))
      return OFFLOAD_FAIL;

    // Query the current flags of the primary context and set its flags if
    // it is inactive
    unsigned int FormerPrimaryCtxFlags = 0;
    int FormerPrimaryCtxIsActive = 0;
    Err = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags,
                                     &FormerPrimaryCtxIsActive);
    if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxGetState\n"))
      return OFFLOAD_FAIL;

    if (FormerPrimaryCtxIsActive) {
      DP("The primary context is active, no change to its flags\n");
      if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) !=
          CU_CTX_SCHED_BLOCKING_SYNC)
        DP("Warning: the current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n");
    } else {
      DP("The primary context is inactive, set its flags to "
         "CU_CTX_SCHED_BLOCKING_SYNC\n");
      Err = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC);
      if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxSetFlags\n"))
        return OFFLOAD_FAIL;
    }

    // Retain the per device primary context and save it to use whenever this
    // device is selected.
    Err = cuDevicePrimaryCtxRetain(&DeviceData[DeviceId].Context, Device);
    if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxRetain\n"))
      return OFFLOAD_FAIL;

    Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    // Initialize stream pool
    if (!StreamManager->initializeDeviceStreamPool(DeviceId))
      return OFFLOAD_FAIL;

    // Query attributes to determine number of threads/block and blocks/grid.
    int MaxGridDimX;
    Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
                               Device);
    if (Err != CUDA_SUCCESS) {
      DP("Error getting max grid dimension, using default value %d\n",
         DeviceRTLTy::DefaultNumTeams);
      DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::DefaultNumTeams;
    } else if (MaxGridDimX <= DeviceRTLTy::HardTeamLimit) {
      DP("Using %d CUDA blocks per grid\n", MaxGridDimX);
      DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX;
    } else {
      DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping "
         "at the hard limit\n",
         MaxGridDimX, DeviceRTLTy::HardTeamLimit);
      DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::HardTeamLimit;
    }

    // We are only exploiting threads along the x axis.
    int MaxBlockDimX;
    Err = cuDeviceGetAttribute(&MaxBlockDimX,
                               CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device);
    if (Err != CUDA_SUCCESS) {
      DP("Error getting max block dimension, using default value %d\n",
         DeviceRTLTy::DefaultNumThreads);
      DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads;
    } else if (MaxBlockDimX <= DeviceRTLTy::HardThreadLimit) {
      DP("Using %d CUDA threads per block\n", MaxBlockDimX);
      DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX;
    } else {
      DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
         "capping at the hard limit\n",
         MaxBlockDimX, DeviceRTLTy::HardThreadLimit);
      DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit;
    }

    // Get and set warp size
    int WarpSize;
    Err =
        cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device);
    if (Err != CUDA_SUCCESS) {
      DP("Error getting warp size, assuming default value 32\n");
      DeviceData[DeviceId].WarpSize = 32;
    } else {
      DP("Using warp size %d\n", WarpSize);
      DeviceData[DeviceId].WarpSize = WarpSize;
    }

    // Adjust teams to the env variables
    if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) {
      DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
         EnvTeamLimit);
      DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit;
    }

    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
         "Device supports up to %d CUDA blocks and %d threads with a "
         "warp size of %d\n",
         DeviceData[DeviceId].BlocksPerGrid,
         DeviceData[DeviceId].ThreadsPerBlock, DeviceData[DeviceId].WarpSize);

    // Set default number of teams
    if (EnvNumTeams > 0) {
      DP("Default number of teams set according to environment %d\n",
         EnvNumTeams);
      DeviceData[DeviceId].NumTeams = EnvNumTeams;
    } else {
      DeviceData[DeviceId].NumTeams = DeviceRTLTy::DefaultNumTeams;
      DP("Default number of teams set according to library's default %d\n",
         DeviceRTLTy::DefaultNumTeams);
    }

    if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) {
      DP("Default number of teams exceeds device limit, capping at %d\n",
         DeviceData[DeviceId].BlocksPerGrid);
      DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid;
    }

    // Set default number of threads
    DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads;
    DP("Default number of threads set according to library's default %d\n",
       DeviceRTLTy::DefaultNumThreads);
    if (DeviceData[DeviceId].NumThreads >
        DeviceData[DeviceId].ThreadsPerBlock) {
      DP("Default number of threads exceeds device limit, capping at %d\n",
         DeviceData[DeviceId].ThreadsPerBlock);
      DeviceData[DeviceId].NumThreads = DeviceData[DeviceId].ThreadsPerBlock;
    }

    return OFFLOAD_SUCCESS;
  }

  __tgt_target_table *loadBinary(const int DeviceId,
                                 const __tgt_device_image *Image) {
    // Set the context we are using
    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return nullptr;

    // Clear the offload table as we are going to create a new one.
    clearOffloadEntriesTable(DeviceId);

    // Create the module and extract the function pointers.
    CUmodule Module;
    DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart));
    Err = cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr);
    if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n"))
      return nullptr;

    DP("CUDA module successfully loaded!\n");

    Modules.push_back(Module);

    // Find the symbols in the module by name.
    const __tgt_offload_entry *HostBegin = Image->EntriesBegin;
    const __tgt_offload_entry *HostEnd = Image->EntriesEnd;

    std::list<KernelTy> &KernelsList = DeviceData[DeviceId].KernelsList;
    for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) {
      if (!E->addr) {
        // Return nullptr when this happens: the host entry must always carry
        // an address that uniquely identifies the target region.
        DP("Invalid binary: host entry '<null>' (size = %zd)...\n", E->size);
        return nullptr;
      }

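      // Entries with a nonzero size describe global variables; entries with
      // size zero describe kernel functions.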
      if (E->size) {
        __tgt_offload_entry Entry = *E;
        CUdeviceptr CUPtr;
        size_t CUSize;
        Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name);
        // We keep this style here because we need the name
        if (Err != CUDA_SUCCESS) {
          REPORT("Loading global '%s' failed\n", E->name);
          CUDA_ERR_STRING(Err);
          return nullptr;
        }

        if (CUSize != E->size) {
          DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name,
             CUSize, E->size);
          return nullptr;
        }

        DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
           DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr));

        Entry.addr = (void *)(CUPtr);

        // Note: In the current implementation declare target variables
        // can either be link or to. This means that once unified
        // memory is activated via the requires directive, the variable
        // can be used directly from the host in both cases.
        // TODO: when variable types other than 'to' or 'link' are added,
        // the below condition should be changed to explicitly
        // check for to and link variables types:
        // (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags &
        // OMP_DECLARE_TARGET_LINK || e->flags == OMP_DECLARE_TARGET_TO))
        if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
          // If unified memory is present any target link or to variables
          // can access host addresses directly. There is no longer a
          // need for device copies.
          cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *));
          DP("Copy linked variable host address (" DPxMOD
             ") to device address (" DPxMOD ")\n",
             DPxPTR(*((void **)E->addr)), DPxPTR(CUPtr));
        }

        addOffloadEntry(DeviceId, Entry);

        continue;
      }

      CUfunction Func;
      Err = cuModuleGetFunction(&Func, Module, E->name);
      // We keep this style here because we need the name
      if (Err != CUDA_SUCCESS) {
        REPORT("Loading '%s' failed\n", E->name);
        CUDA_ERR_STRING(Err);
        return nullptr;
      }

      DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
         DPxPTR(E - HostBegin), E->name, DPxPTR(Func));

      // default value GENERIC (in case symbol is missing from cubin file)
      int8_t ExecModeVal = ExecutionModeType::GENERIC;
      std::string ExecModeNameStr(E->name);
      ExecModeNameStr += "_exec_mode";
      const char *ExecModeName = ExecModeNameStr.c_str();

      CUdeviceptr ExecModePtr;
      size_t CUSize;
      Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName);
      if (Err == CUDA_SUCCESS) {
        if (CUSize != sizeof(int8_t)) {
          DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
             ExecModeName, CUSize, sizeof(int8_t));
          return nullptr;
        }

        Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize);
        if (Err != CUDA_SUCCESS) {
          REPORT("Error when copying data from device to host. Pointers: "
                 "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
                 DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize);
          CUDA_ERR_STRING(Err);
          return nullptr;
        }

        if (ExecModeVal < 0 || ExecModeVal > 1) {
          DP("Error: wrong exec_mode value specified in cubin file: %d\n",
             ExecModeVal);
          return nullptr;
        }
      } else {
        REPORT("Loading global exec_mode '%s' - symbol missing, using default "
               "value GENERIC (1)\n",
               ExecModeName);
        CUDA_ERR_STRING(Err);
      }

      KernelsList.emplace_back(Func, ExecModeVal);

      __tgt_offload_entry Entry = *E;
      Entry.addr = &KernelsList.back();
      addOffloadEntry(DeviceId, Entry);
    }

    // send device environment data to the device
    {
      omptarget_device_environmentTy DeviceEnv{0};

#ifdef OMPTARGET_DEBUG
      if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG"))
        DeviceEnv.debug_level = std::stoi(EnvStr);
#endif

      const char *DeviceEnvName = "omptarget_device_environment";
      CUdeviceptr DeviceEnvPtr;
      size_t CUSize;

      Err = cuModuleGetGlobal(&DeviceEnvPtr, &CUSize, Module, DeviceEnvName);
      if (Err == CUDA_SUCCESS) {
        if (CUSize != sizeof(DeviceEnv)) {
          REPORT(
              "Global device_environment '%s' - size mismatch (%zu != %zu)\n",
              DeviceEnvName, CUSize, sizeof(DeviceEnv));
          CUDA_ERR_STRING(Err);
          return nullptr;
        }

        Err = cuMemcpyHtoD(DeviceEnvPtr, &DeviceEnv, CUSize);
        if (Err != CUDA_SUCCESS) {
          REPORT("Error when copying data from host to device. Pointers: "
                 "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
                 DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize);
          CUDA_ERR_STRING(Err);
          return nullptr;
        }

        DP("Sending global device environment data %zu bytes\n", CUSize);
      } else {
        DP("Finding global device environment '%s' - symbol missing.\n",
           DeviceEnvName);
        DP("Continue, considering this is a device RTL which does not accept "
           "environment setting.\n");
      }
    }

    return getOffloadEntriesTable(DeviceId);
  }

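  /// Allocate Size bytes on the device, going through the memory manager when
  /// it is enabled.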
  void *dataAlloc(const int DeviceId, const int64_t Size) {
    if (UseMemoryManager)
      return MemoryManagers[DeviceId]->allocate(Size, nullptr);

    return DeviceAllocators[DeviceId].allocate(Size, nullptr);
  }

  int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr,
                 const int64_t Size, __tgt_async_info *AsyncInfo) const {
    assert(AsyncInfo && "AsyncInfo is nullptr");

    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    CUstream Stream = getStream(DeviceId, AsyncInfo);

    Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream);
    if (Err != CUDA_SUCCESS) {
      DP("Error when copying data from host to device. Pointers: host "
         "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
         DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
      CUDA_ERR_STRING(Err);
      return OFFLOAD_FAIL;
    }

    return OFFLOAD_SUCCESS;
  }

  int dataRetrieve(const int DeviceId, void *HstPtr, const void *TgtPtr,
                   const int64_t Size, __tgt_async_info *AsyncInfo) const {
    assert(AsyncInfo && "AsyncInfo is nullptr");

    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    CUstream Stream = getStream(DeviceId, AsyncInfo);

    Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
    if (Err != CUDA_SUCCESS) {
      DP("Error when copying data from device to host. Pointers: host "
         "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
         DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
      CUDA_ERR_STRING(Err);
      return OFFLOAD_FAIL;
    }

    return OFFLOAD_SUCCESS;
  }

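  /// Copy Size bytes from SrcPtr on device SrcDevId to DstPtr on device
  /// DstDevId, preferring a peer-to-peer copy and falling back to an ordinary
  /// device-to-device copy when peer access is unavailable.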
  int dataExchange(int SrcDevId, const void *SrcPtr, int DstDevId, void *DstPtr,
                   int64_t Size, __tgt_async_info *AsyncInfo) const {
    assert(AsyncInfo && "AsyncInfo is nullptr");

    CUresult Err = cuCtxSetCurrent(DeviceData[SrcDevId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    CUstream Stream = getStream(SrcDevId, AsyncInfo);

    // If these are two different devices, we try a peer-to-peer copy first
    if (SrcDevId != DstDevId) {
      int CanAccessPeer = 0;
      Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId);
      if (Err != CUDA_SUCCESS) {
        REPORT("Error returned from cuDeviceCanAccessPeer. src = %" PRId32
               ", dst = %" PRId32 "\n",
               SrcDevId, DstDevId);
        CUDA_ERR_STRING(Err);
        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
      }

      if (!CanAccessPeer) {
        DP("P2P memcpy not supported so fall back to D2D memcpy\n");
        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
      }

      Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0);
      if (Err != CUDA_SUCCESS) {
        REPORT("Error returned from cuCtxEnablePeerAccess. src = %" PRId32
               ", dst = %" PRId32 "\n",
               SrcDevId, DstDevId);
        CUDA_ERR_STRING(Err);
        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
      }

      Err = cuMemcpyPeerAsync((CUdeviceptr)DstPtr, DeviceData[DstDevId].Context,
                              (CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context,
                              Size, Stream);
      if (Err == CUDA_SUCCESS)
        return OFFLOAD_SUCCESS;

      DP("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD
         ", src_id =%" PRId32 ", dst_ptr = " DPxMOD ", dst_id =%" PRId32 "\n",
         DPxPTR(SrcPtr), SrcDevId, DPxPTR(DstPtr), DstDevId);
      CUDA_ERR_STRING(Err);
    }

    return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
  }

  int dataDelete(const int DeviceId, void *TgtPtr) {
    if (UseMemoryManager)
      return MemoryManagers[DeviceId]->free(TgtPtr);

    return DeviceAllocators[DeviceId].free(TgtPtr);
  }

  int runTargetTeamRegion(const int DeviceId, void *TgtEntryPtr, void **TgtArgs,
                          ptrdiff_t *TgtOffsets, const int ArgNum,
                          const int TeamNum, const int ThreadLimit,
                          const unsigned int LoopTripCount,
                          __tgt_async_info *AsyncInfo) const {
    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    // All args are references.
    std::vector<void *> Args(ArgNum);
    std::vector<void *> Ptrs(ArgNum);

    for (int I = 0; I < ArgNum; ++I) {
      Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
      Args[I] = &Ptrs[I];
    }

    KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr);

    int CudaThreadsPerBlock;
    if (ThreadLimit > 0) {
      DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
      CudaThreadsPerBlock = ThreadLimit;
      // Add master warp if necessary
      if (KernelInfo->ExecutionMode == GENERIC) {
        DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize);
        CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
      }
    } else {
      DP("Setting CUDA threads per block to default %d\n",
         DeviceData[DeviceId].NumThreads);
      CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads;
    }

    if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) {
      DP("Threads per block capped at device limit %d\n",
         DeviceData[DeviceId].ThreadsPerBlock);
      CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock;
    }

    if (!KernelInfo->MaxThreadsPerBlock) {
      Err = cuFuncGetAttribute(&KernelInfo->MaxThreadsPerBlock,
                               CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                               KernelInfo->Func);
      if (!checkResult(Err, "Error returned from cuFuncGetAttribute\n"))
        return OFFLOAD_FAIL;
    }

    if (KernelInfo->MaxThreadsPerBlock < CudaThreadsPerBlock) {
      DP("Threads per block capped at kernel limit %d\n",
         KernelInfo->MaxThreadsPerBlock);
      CudaThreadsPerBlock = KernelInfo->MaxThreadsPerBlock;
    }

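    // Determine the number of teams (CUDA blocks): an explicit TeamNum wins
    // (capped at the device limit); otherwise derive the count from the loop
    // trip count when it is known, or fall back to the default.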
    unsigned int CudaBlocksPerGrid;
    if (TeamNum <= 0) {
      if (LoopTripCount > 0 && EnvNumTeams < 0) {
        if (KernelInfo->ExecutionMode == SPMD) {
          // We have a combined construct, i.e. `target teams distribute
          // parallel for [simd]`. We launch enough teams so that each thread
          // executes one iteration of the loop; round up to the nearest
          // integer.
          CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1;
        } else {
          // If we reach this point, then we have a non-combined construct, i.e.
          // `teams distribute` with a nested `parallel for` and each team is
          // assigned one iteration of the `distribute` loop. E.g.:
          //
          // #pragma omp target teams distribute
          // for(...loop_tripcount...) {
          //   #pragma omp parallel for
          //   for(...) {}
          // }
          //
          // Threads within a team will execute the iterations of the `parallel`
          // loop.
          CudaBlocksPerGrid = LoopTripCount;
        }
        DP("Using %d teams due to loop trip count %" PRIu32
           " and number of threads per block %d\n",
           CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock);
      } else {
        DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams);
        CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams;
      }
    } else if (TeamNum > DeviceData[DeviceId].BlocksPerGrid) {
      DP("Capping number of teams to team limit %d\n",
         DeviceData[DeviceId].BlocksPerGrid);
      CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid;
    } else {
      DP("Using requested number of teams %d\n", TeamNum);
      CudaBlocksPerGrid = TeamNum;
    }

    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
         "Launching kernel %s with %d blocks and %d threads in %s "
         "mode\n",
         (getOffloadEntry(DeviceId, TgtEntryPtr))
             ? getOffloadEntry(DeviceId, TgtEntryPtr)->name
             : "(null)",
         CudaBlocksPerGrid, CudaThreadsPerBlock,
         (KernelInfo->ExecutionMode == SPMD) ? "SPMD" : "Generic");

    CUstream Stream = getStream(DeviceId, AsyncInfo);
    Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1,
                         /* gridDimZ */ 1, CudaThreadsPerBlock,
                         /* blockDimY */ 1, /* blockDimZ */ 1,
                         /* sharedMemBytes */ 0, Stream, &Args[0], nullptr);
    if (!checkResult(Err, "Error returned from cuLaunchKernel\n"))
      return OFFLOAD_FAIL;

    DP("Launch of entry point at " DPxMOD " successful!\n",
       DPxPTR(TgtEntryPtr));

    return OFFLOAD_SUCCESS;
  }

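  /// Wait for all operations on the stream attached to AsyncInfo to finish,
  /// then return the stream to the pool and reset AsyncInfo->Queue.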
  int synchronize(const int DeviceId, __tgt_async_info *AsyncInfo) const {
    CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue);
    CUresult Err = cuStreamSynchronize(Stream);

    // Once the stream is synchronized, return it to the stream pool and reset
    // AsyncInfo. This ensures the synchronization only applies to its own
    // tasks.
    StreamManager->returnStream(DeviceId,
                                reinterpret_cast<CUstream>(AsyncInfo->Queue));
    AsyncInfo->Queue = nullptr;

    if (Err != CUDA_SUCCESS) {
      DP("Error when synchronizing stream. stream = " DPxMOD
         ", async info ptr = " DPxMOD "\n",
         DPxPTR(Stream), DPxPTR(AsyncInfo));
      CUDA_ERR_STRING(Err);
    }
    return (Err == CUDA_SUCCESS) ? OFFLOAD_SUCCESS : OFFLOAD_FAIL;
  }
};

DeviceRTLTy DeviceRTL;
} // namespace

// Exposed library API function
#ifdef __cplusplus
extern "C" {
#endif

int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
  return elf_check_machine(image, /* EM_CUDA */ 190);
}

int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); }

int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
  DP("Init requires flags to %" PRId64 "\n", RequiresFlags);
  DeviceRTL.setRequiresFlag(RequiresFlags);
  return RequiresFlags;
}

int32_t __tgt_rtl_is_data_exchangable(int32_t src_dev_id, int32_t dst_dev_id) {
  if (DeviceRTL.isValidDeviceId(src_dev_id) &&
      DeviceRTL.isValidDeviceId(dst_dev_id))
    return 1;

  return 0;
}

int32_t __tgt_rtl_init_device(int32_t device_id) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.initDevice(device_id);
}

__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
                                          __tgt_device_image *image) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.loadBinary(device_id, image);
}

void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *,
                           int32_t kind) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  if (kind != TARGET_ALLOC_DEFAULT) {
    REPORT("Invalid target data allocation kind or requested allocator not "
           "implemented yet\n");
    return NULL;
  }

  return DeviceRTL.dataAlloc(device_id, size);
}

int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
                              int64_t size) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_data_submit_async(device_id, tgt_ptr, hst_ptr,
                                                 size, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &AsyncInfo);
}

int32_t __tgt_rtl_data_submit_async(int32_t device_id, void *tgt_ptr,
                                    void *hst_ptr, int64_t size,
                                    __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");

  return DeviceRTL.dataSubmit(device_id, tgt_ptr, hst_ptr, size,
                              async_info_ptr);
}

int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
                                int64_t size) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_data_retrieve_async(device_id, hst_ptr, tgt_ptr,
                                                   size, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &AsyncInfo);
}

int32_t __tgt_rtl_data_retrieve_async(int32_t device_id, void *hst_ptr,
                                      void *tgt_ptr, int64_t size,
                                      __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");

  return DeviceRTL.dataRetrieve(device_id, hst_ptr, tgt_ptr, size,
                                async_info_ptr);
}

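// Asynchronous device-to-device data exchange; the actual copy is performed by
// DeviceRTLTy::dataExchange, which prefers peer-to-peer access when available.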
int32_t __tgt_rtl_data_exchange_async(int32_t src_dev_id, void *src_ptr,
                                      int32_t dst_dev_id, void *dst_ptr,
                                      int64_t size,
                                      __tgt_async_info *AsyncInfo) {
  assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
  assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");
  assert(AsyncInfo && "AsyncInfo is nullptr");

  return DeviceRTL.dataExchange(src_dev_id, src_ptr, dst_dev_id, dst_ptr, size,
                                AsyncInfo);
}

int32_t __tgt_rtl_data_exchange(int32_t src_dev_id, void *src_ptr,
                                int32_t dst_dev_id, void *dst_ptr,
                                int64_t size) {
  assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
  assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_data_exchange_async(
      src_dev_id, src_ptr, dst_dev_id, dst_ptr, size, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(src_dev_id, &AsyncInfo);
}

int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.dataDelete(device_id, tgt_ptr);
}

int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
                                         void **tgt_args,
                                         ptrdiff_t *tgt_offsets,
                                         int32_t arg_num, int32_t team_num,
                                         int32_t thread_limit,
                                         uint64_t loop_tripcount) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_run_target_team_region_async(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
      thread_limit, loop_tripcount, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &AsyncInfo);
}

int32_t __tgt_rtl_run_target_team_region_async(
    int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
    ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
    int32_t thread_limit, uint64_t loop_tripcount,
    __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.runTargetTeamRegion(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
      thread_limit, loop_tripcount, async_info_ptr);
}

int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
                                    void **tgt_args, ptrdiff_t *tgt_offsets,
                                    int32_t arg_num) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_run_target_region_async(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &AsyncInfo);
}

int32_t __tgt_rtl_run_target_region_async(int32_t device_id,
                                          void *tgt_entry_ptr, void **tgt_args,
                                          ptrdiff_t *tgt_offsets,
                                          int32_t arg_num,
                                          __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return __tgt_rtl_run_target_team_region_async(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num,
      /* team num */ 1, /* thread_limit */ 1, /* loop_tripcount */ 0,
      async_info_ptr);
}

int32_t __tgt_rtl_synchronize(int32_t device_id,
                              __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");
  assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr");

  return DeviceRTL.synchronize(device_id, async_info_ptr);
}

#ifdef __cplusplus
}
#endif