//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// RTL for CUDA machine
//
//===----------------------------------------------------------------------===//

#include <cassert>
#include <cstddef>
#include <cuda.h>
#include <list>
#include <memory>
#include <mutex>
#include <string>
#include <vector>

#include "omptargetplugin.h"

#ifndef TARGET_NAME
#define TARGET_NAME CUDA
#endif

#ifdef OMPTARGET_DEBUG
// Debug verbosity, read from LIBOMPTARGET_DEBUG in DeviceRTLTy's constructor.
static int DebugLevel = 0;

#define GETNAME2(name) #name
#define GETNAME(name) GETNAME2(name)
// Debug print: only emits output when DebugLevel > 0 (debug builds only).
#define DP(...) \
  do { \
    if (DebugLevel > 0) { \
      DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", __VA_ARGS__); \
    } \
  } while (false)

// Utility for retrieving and printing CUDA error string.
#define CUDA_ERR_STRING(err) \
  do { \
    if (DebugLevel > 0) { \
      const char *errStr; \
      cuGetErrorString(err, &errStr); \
      DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", "CUDA error is: %s\n", errStr); \
    } \
  } while (false)
#else // OMPTARGET_DEBUG
// In release builds the debug macros compile away to nothing.
#define DP(...) {}
#define CUDA_ERR_STRING(err) {}
#endif // OMPTARGET_DEBUG

#include "../../common/elf_common.c"

/// Keep entries table per device.
struct FuncOrGblEntryTy {
  // Table handed back to libomptarget; its begin/end point into Entries.
  __tgt_target_table Table;
  std::vector<__tgt_offload_entry> Entries;
};

enum ExecutionModeType {
  SPMD,    // constructors, destructors,
           // combined constructs (`teams distribute parallel for [simd]`)
  GENERIC, // everything else
  NONE
};

/// Use a single entity to encode a kernel and a set of flags.
struct KernelTy {
  CUfunction Func;

  // execution mode of kernel
  // 0 - SPMD mode (without master warp)
  // 1 - Generic mode (with master warp)
  int8_t ExecutionMode;

  /// Maximal number of threads per block for this kernel.
  /// Lazily filled in (from CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK) on the
  /// first launch; 0 means "not queried yet".
  int MaxThreadsPerBlock = 0;

  KernelTy(CUfunction _Func, int8_t _ExecutionMode)
      : Func(_Func), ExecutionMode(_ExecutionMode) {}
};

/// Device environment data
/// Manually sync with the deviceRTL side for now, move to a dedicated header
/// file later.
struct omptarget_device_environmentTy {
  int32_t debug_level;
};

namespace {
/// Report ErrMsg and the CUDA error string if Err is a failure.
/// Returns true iff Err == CUDA_SUCCESS.
bool checkResult(CUresult Err, const char *ErrMsg) {
  if (Err == CUDA_SUCCESS)
    return true;

  DP("%s", ErrMsg);
  CUDA_ERR_STRING(Err);
  return false;
}

/// Enqueue an asynchronous device-to-device copy of Size bytes on Stream.
/// Returns OFFLOAD_SUCCESS or OFFLOAD_FAIL.
int memcpyDtoD(const void *SrcPtr, void *DstPtr, int64_t Size,
               CUstream Stream) {
  CUresult Err =
      cuMemcpyDtoDAsync((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, Size, Stream);

  if (Err != CUDA_SUCCESS) {
    DP("Error when copying data from device to device. Pointers: src "
       "= " DPxMOD ", dst = " DPxMOD ", size = %" PRId64 "\n",
       DPxPTR(SrcPtr), DPxPTR(DstPtr), Size);
    CUDA_ERR_STRING(Err);
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

// Structure contains per-device data
struct DeviceDataTy {
  /// List that contains all the kernels.
  /// std::list so addresses of elements stay stable (entry addresses point
  /// at these objects).
  std::list<KernelTy> KernelsList;

  // One FuncOrGblEntryTy per loaded image; the back() is the current one.
  std::list<FuncOrGblEntryTy> FuncGblEntries;

  CUcontext Context = nullptr;
  // Device properties
  int ThreadsPerBlock = 0;
  int BlocksPerGrid = 0;
  int WarpSize = 0;
  // OpenMP properties
  int NumTeams = 0;
  int NumThreads = 0;
};

/// Owns and hands out CUDA streams, one growable pool per device.
class StreamManagerTy {
  int NumberOfDevices;
  // The initial size of stream pool
  int EnvNumInitialStreams;
  // Per-device stream mutex
  std::vector<std::unique_ptr<std::mutex>> StreamMtx;
  // Per-device stream Id indicates the next available stream in the pool
  std::vector<int> NextStreamId;
  // Per-device stream pool
  std::vector<std::vector<CUstream>> StreamPool;
  // Reference to per-device data
  std::vector<DeviceDataTy> &DeviceData;

  // If there is no CUstream left in the pool, we will resize the pool to
  // allocate more CUstream. This function should be called with device mutex,
  // and we do not resize to smaller one.
  void resizeStreamPool(const int DeviceId, const size_t NewSize) {
    std::vector<CUstream> &Pool = StreamPool[DeviceId];
    const size_t CurrentSize = Pool.size();
    assert(NewSize > CurrentSize && "new size is not larger than current size");

    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) {
      // We will return if cannot switch to the right context in case of
      // creating bunch of streams that are not corresponding to the right
      // device. The offloading will fail later because selected CUstream is
      // nullptr.
      return;
    }

    // New slots default to nullptr so a failed cuStreamCreate is detectable.
    Pool.resize(NewSize, nullptr);

    for (size_t I = CurrentSize; I < NewSize; ++I) {
      checkResult(cuStreamCreate(&Pool[I], CU_STREAM_NON_BLOCKING),
                  "Error returned from cuStreamCreate\n");
    }
  }

public:
  StreamManagerTy(const int NumberOfDevices,
                  std::vector<DeviceDataTy> &DeviceData)
      : NumberOfDevices(NumberOfDevices), EnvNumInitialStreams(32),
        DeviceData(DeviceData) {
    StreamPool.resize(NumberOfDevices);
    NextStreamId.resize(NumberOfDevices);
    StreamMtx.resize(NumberOfDevices);

    // NOTE(review): std::stoi throws on a non-numeric value and a negative or
    // zero value would later trip resizeStreamPool's assert — input is
    // assumed well-formed.
    if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS"))
      EnvNumInitialStreams = std::stoi(EnvStr);

    // Initialize the next stream id
    std::fill(NextStreamId.begin(), NextStreamId.end(), 0);

    // Initialize stream mutex
    for (std::unique_ptr<std::mutex> &Ptr : StreamMtx)
      Ptr = std::make_unique<std::mutex>();
  }

  ~StreamManagerTy() {
    // Destroy streams
    for (int I = 0; I < NumberOfDevices; ++I) {
      checkResult(cuCtxSetCurrent(DeviceData[I].Context),
                  "Error returned from cuCtxSetCurrent\n");

      for (CUstream &S : StreamPool[I]) {
        if (S)
          checkResult(cuStreamDestroy(S),
                      "Error returned from cuStreamDestroy\n");
      }
    }
  }

  // Get a CUstream from pool. Per-device next stream id always points to the
  // next available CUstream. That means, CUstreams [0, id-1] have been
  // assigned, and [id,] are still available. If there is no CUstream left, we
  // will ask more CUstreams from CUDA RT. Each time a CUstream is assigned,
  // the id will increase one.
  // xxxxxs+++++++++
  //      ^
  //      id
  // After assignment, the pool becomes the following and s is assigned.
  // xxxxxs+++++++++
  //       ^
  //       id
  CUstream getStream(const int DeviceId) {
    const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
    int &Id = NextStreamId[DeviceId];
    // No CUstream left in the pool, we need to request from CUDA RT
    if (Id == StreamPool[DeviceId].size()) {
      // By default we double the stream pool every time
      resizeStreamPool(DeviceId, Id * 2);
    }
    return StreamPool[DeviceId][Id++];
  }

  // Return a CUstream back to pool. As mentioned above, per-device next
  // stream is always points to the next available CUstream, so when we return
  // a CUstream, we need to first decrease the id, and then copy the CUstream
  // back.
  // It is worth noting that, the order of streams return might be different
  // from that they're assigned, that saying, at some point, there might be
  // two identical CUstreams.
  // xxax+a+++++
  //    ^
  //    id
  // However, it doesn't matter, because they're always on the two sides of
  // id. The left one will in the end be overwritten by another CUstream.
  // Therefore, after several execution, the order of pool might be different
  // from its initial state.
  // Hand a stream back to the pool (see the invariant described above).
  void returnStream(const int DeviceId, CUstream Stream) {
    const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
    int &Id = NextStreamId[DeviceId];
    assert(Id > 0 && "Wrong stream ID");
    StreamPool[DeviceId][--Id] = Stream;
  }

  // Create the initial pool of EnvNumInitialStreams streams for DeviceId.
  // Returns false if the pool could not be fully created or any stream is
  // null (cuStreamCreate failure).
  bool initializeDeviceStreamPool(const int DeviceId) {
    assert(StreamPool[DeviceId].empty() && "stream pool has been initialized");

    resizeStreamPool(DeviceId, EnvNumInitialStreams);

    // Check the size of stream pool
    if (StreamPool[DeviceId].size() != EnvNumInitialStreams)
      return false;

    // Check whether each stream is valid
    for (CUstream &S : StreamPool[DeviceId])
      if (!S)
        return false;

    return true;
  }
};

/// Top-level plugin state: device discovery, per-device data, module loading,
/// data transfers, and kernel launches. A single global instance (DeviceRTL)
/// backs the exported __tgt_rtl_* entry points.
class DeviceRTLTy {
  int NumberOfDevices;
  // OpenMP environment properties
  int EnvNumTeams;
  int EnvTeamLimit;
  // OpenMP requires flags
  int64_t RequiresFlags;

  static constexpr const int HardTeamLimit = 1U << 16U; // 64k
  static constexpr const int HardThreadLimit = 1024;
  static constexpr const int DefaultNumTeams = 128;
  static constexpr const int DefaultNumThreads = 128;

  std::unique_ptr<StreamManagerTy> StreamManager;
  std::vector<DeviceDataTy> DeviceData;
  std::vector<CUmodule> Modules;

  // Record entry point associated with device
  void addOffloadEntry(const int DeviceId, const __tgt_offload_entry entry) {
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
    E.Entries.push_back(entry);
  }

  // Return true if the entry is associated with device
  bool findOffloadEntry(const int DeviceId, const void *Addr) const {
    for (const __tgt_offload_entry &Itr :
         DeviceData[DeviceId].FuncGblEntries.back().Entries)
      if (Itr.addr == Addr)
        return true;

    return false;
  }

  // Return the pointer to the target entries table
  __tgt_target_table *getOffloadEntriesTable(const int DeviceId) {
    FuncOrGblEntryTy &E =
        DeviceData[DeviceId].FuncGblEntries.back();

    if (E.Entries.empty())
      return nullptr;

    // Update table info according to the entries and return the pointer
    E.Table.EntriesBegin = E.Entries.data();
    E.Table.EntriesEnd = E.Entries.data() + E.Entries.size();

    return &E.Table;
  }

  // Clear entries table for a device
  void clearOffloadEntriesTable(const int DeviceId) {
    // A fresh FuncOrGblEntryTy becomes the current (back()) table.
    DeviceData[DeviceId].FuncGblEntries.emplace_back();
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
    E.Entries.clear();
    E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr;
  }

  // Return the stream attached to AsyncInfoPtr, lazily borrowing one from the
  // stream manager on first use.
  CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfoPtr) const {
    assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");

    if (!AsyncInfoPtr->Queue)
      AsyncInfoPtr->Queue = StreamManager->getStream(DeviceId);

    return reinterpret_cast<CUstream>(AsyncInfoPtr->Queue);
  }

public:
  // This class should not be copied
  DeviceRTLTy(const DeviceRTLTy &) = delete;
  DeviceRTLTy(DeviceRTLTy &&) = delete;

  // Initializes the CUDA driver, counts devices and reads the OMP_* team
  // environment variables. On any driver failure NumberOfDevices stays 0.
  DeviceRTLTy()
      : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1),
        RequiresFlags(OMP_REQ_UNDEFINED) {
#ifdef OMPTARGET_DEBUG
    if (const char *EnvStr = getenv("LIBOMPTARGET_DEBUG"))
      DebugLevel = std::stoi(EnvStr);
#endif // OMPTARGET_DEBUG

    DP("Start initializing CUDA\n");

    CUresult Err = cuInit(0);
    if (!checkResult(Err, "Error returned from cuInit\n")) {
      return;
    }

    Err = cuDeviceGetCount(&NumberOfDevices);
    if (!checkResult(Err, "Error returned from cuDeviceGetCount\n"))
      return;

    if (NumberOfDevices == 0) {
      DP("There are no devices supporting CUDA.\n");
      return;
    }

    DeviceData.resize(NumberOfDevices);

    // Get environment variables regarding teams
    if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) {
      // OMP_TEAM_LIMIT has been set
      EnvTeamLimit = std::stoi(EnvStr);
      DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
    }
    if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) {
      // OMP_NUM_TEAMS has been set
      EnvNumTeams = std::stoi(EnvStr);
      DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
    }

    StreamManager =
        std::make_unique<StreamManagerTy>(NumberOfDevices, DeviceData);
  }

  ~DeviceRTLTy() {
    // First destruct stream manager in case of Contexts is destructed before it
    StreamManager = nullptr;

    for (CUmodule &M : Modules)
      // Close module
      if (M)
        checkResult(cuModuleUnload(M), "Error returned from cuModuleUnload\n");

    for (DeviceDataTy &D : DeviceData) {
      // Destroy context
      if (D.Context) {
        checkResult(cuCtxSetCurrent(D.Context),
                    "Error returned from cuCtxSetCurrent\n");
        CUdevice Device;
        checkResult(cuCtxGetDevice(&Device),
                    "Error returned from cuCtxGetDevice\n");
        // Release the reference taken by cuDevicePrimaryCtxRetain in
        // initDevice.
        checkResult(cuDevicePrimaryCtxRelease(Device),
                    "Error returned from cuDevicePrimaryCtxRelease\n");
      }
    }
  }

  // Check whether a given DeviceId is valid
  bool isValidDeviceId(const int DeviceId) const {
    return DeviceId >= 0 && DeviceId < NumberOfDevices;
  }

  int getNumOfDevices() const { return NumberOfDevices; }

  void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; }

  // Set up the primary context, stream pool and launch limits for DeviceId.
  // Returns OFFLOAD_SUCCESS or OFFLOAD_FAIL.
  int initDevice(const int DeviceId) {
    CUdevice Device;

    DP("Getting device %d\n", DeviceId);
    CUresult Err = cuDeviceGet(&Device, DeviceId);
    if (!checkResult(Err, "Error returned from cuDeviceGet\n"))
      return OFFLOAD_FAIL;

    // Query the current flags of the primary context and set its flags if
    // it is inactive
    unsigned int FormerPrimaryCtxFlags = 0;
    int FormerPrimaryCtxIsActive = 0;
    Err = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags,
                                     &FormerPrimaryCtxIsActive);
    if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxGetState\n"))
      return OFFLOAD_FAIL;

    if (FormerPrimaryCtxIsActive) {
      DP("The primary context is active, no change to its flags\n");
      if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) !=
          CU_CTX_SCHED_BLOCKING_SYNC)
        DP("Warning the current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n");
    } else {
      DP("The primary context is inactive, set its flags to "
         "CU_CTX_SCHED_BLOCKING_SYNC\n");
      Err = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC);
      if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxSetFlags\n"))
        return OFFLOAD_FAIL;
    }

    // Retain the per device primary context and save it to use whenever this
    // device is selected.
    Err = cuDevicePrimaryCtxRetain(&DeviceData[DeviceId].Context, Device);
    if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxRetain\n"))
      return OFFLOAD_FAIL;

    Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    // Initialize stream pool
    if (!StreamManager->initializeDeviceStreamPool(DeviceId))
      return OFFLOAD_FAIL;

    // Query attributes to determine number of threads/block and blocks/grid.
    int MaxGridDimX;
    Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
                               Device);
    if (Err != CUDA_SUCCESS) {
      DP("Error getting max grid dimension, use default value %d\n",
         DeviceRTLTy::DefaultNumTeams);
      DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::DefaultNumTeams;
    } else if (MaxGridDimX <= DeviceRTLTy::HardTeamLimit) {
      DP("Using %d CUDA blocks per grid\n", MaxGridDimX);
      DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX;
    } else {
      DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping "
         "at the hard limit\n",
         MaxGridDimX, DeviceRTLTy::HardTeamLimit);
      DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::HardTeamLimit;
    }

    // We are only exploiting threads along the x axis.
    int MaxBlockDimX;
    Err = cuDeviceGetAttribute(&MaxBlockDimX,
                               CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device);
    if (Err != CUDA_SUCCESS) {
      DP("Error getting max block dimension, use default value %d\n",
         DeviceRTLTy::DefaultNumThreads);
      DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads;
    } else if (MaxBlockDimX <= DeviceRTLTy::HardThreadLimit) {
      DP("Using %d CUDA threads per block\n", MaxBlockDimX);
      DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX;
    } else {
      DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
         "capping at the hard limit\n",
         MaxBlockDimX, DeviceRTLTy::HardThreadLimit);
      DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit;
    }

    // Get and set warp size
    int WarpSize;
    Err =
        cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device);
    if (Err != CUDA_SUCCESS) {
      DP("Error getting warp size, assume default value 32\n");
      DeviceData[DeviceId].WarpSize = 32;
    } else {
      DP("Using warp size %d\n", WarpSize);
      DeviceData[DeviceId].WarpSize = WarpSize;
    }

    // Adjust teams to the env variables
    if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) {
      DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
         EnvTeamLimit);
      DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit;
    }

    DP("Max number of CUDA blocks %d, threads %d & warp size %d\n",
       DeviceData[DeviceId].BlocksPerGrid, DeviceData[DeviceId].ThreadsPerBlock,
       DeviceData[DeviceId].WarpSize);

    // Set default number of teams
    if (EnvNumTeams > 0) {
      DP("Default number of teams set according to environment %d\n",
         EnvNumTeams);
      DeviceData[DeviceId].NumTeams = EnvNumTeams;
    } else {
      DeviceData[DeviceId].NumTeams = DeviceRTLTy::DefaultNumTeams;
      DP("Default number of teams set according to library's default %d\n",
         DeviceRTLTy::DefaultNumTeams);
    }

    if
(DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) { 527 DP("Default number of teams exceeds device limit, capping at %d\n", 528 DeviceData[DeviceId].BlocksPerGrid); 529 DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid; 530 } 531 532 // Set default number of threads 533 DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads; 534 DP("Default number of threads set according to library's default %d\n", 535 DeviceRTLTy::DefaultNumThreads); 536 if (DeviceData[DeviceId].NumThreads > 537 DeviceData[DeviceId].ThreadsPerBlock) { 538 DP("Default number of threads exceeds device limit, capping at %d\n", 539 DeviceData[DeviceId].ThreadsPerBlock); 540 DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].ThreadsPerBlock; 541 } 542 543 return OFFLOAD_SUCCESS; 544 } 545 546 __tgt_target_table *loadBinary(const int DeviceId, 547 const __tgt_device_image *Image) { 548 // Set the context we are using 549 CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); 550 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) 551 return nullptr; 552 553 // Clear the offload table as we are going to create a new one. 554 clearOffloadEntriesTable(DeviceId); 555 556 // Create the module and extract the function pointers. 557 CUmodule Module; 558 DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart)); 559 Err = cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr); 560 if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n")) 561 return nullptr; 562 563 DP("CUDA module successfully loaded!\n"); 564 565 Modules.push_back(Module); 566 567 // Find the symbols in the module by name. 
    const __tgt_offload_entry *HostBegin = Image->EntriesBegin;
    const __tgt_offload_entry *HostEnd = Image->EntriesEnd;

    std::list<KernelTy> &KernelsList = DeviceData[DeviceId].KernelsList;
    for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) {
      if (!E->addr) {
        // We return nullptr when something like this happens, the host should
        // have always something in the address to uniquely identify the target
        // region.
        DP("Invalid binary: host entry '<null>' (size = %zd)...\n", E->size);
        return nullptr;
      }

      // A non-zero size marks a global variable entry; size 0 marks a kernel.
      if (E->size) {
        __tgt_offload_entry Entry = *E;
        CUdeviceptr CUPtr;
        size_t CUSize;
        Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name);
        // We keep this style here because we need the name
        if (Err != CUDA_SUCCESS) {
          DP("Loading global '%s' (Failed)\n", E->name);
          CUDA_ERR_STRING(Err);
          return nullptr;
        }

        if (CUSize != E->size) {
          DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name,
             CUSize, E->size);
          return nullptr;
        }

        DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
           DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr));

        Entry.addr = (void *)(CUPtr);

        // Note: In the current implementation declare target variables
        // can either be link or to. This means that once unified
        // memory is activated via the requires directive, the variable
        // can be used directly from the host in both cases.
        // TODO: when variables types other than to or link are added,
        // the below condition should be changed to explicitly
        // check for to and link variables types:
        // (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags &
        // OMP_DECLARE_TARGET_LINK || e->flags == OMP_DECLARE_TARGET_TO))
        if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
          // If unified memory is present any target link or to variables
          // can access host addresses directly. There is no longer a
          // need for device copies.
          cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *));
          DP("Copy linked variable host address (" DPxMOD
             ") to device address (" DPxMOD ")\n",
             DPxPTR(*((void **)E->addr)), DPxPTR(CUPtr));
        }

        addOffloadEntry(DeviceId, Entry);

        continue;
      }

      // Kernel entry: resolve the function and its execution mode.
      CUfunction Func;
      Err = cuModuleGetFunction(&Func, Module, E->name);
      // We keep this style here because we need the name
      if (Err != CUDA_SUCCESS) {
        DP("Loading '%s' (Failed)\n", E->name);
        CUDA_ERR_STRING(Err);
        return nullptr;
      }

      DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
         DPxPTR(E - HostBegin), E->name, DPxPTR(Func));

      // default value GENERIC (in case symbol is missing from cubin file)
      int8_t ExecModeVal = ExecutionModeType::GENERIC;
      std::string ExecModeNameStr(E->name);
      ExecModeNameStr += "_exec_mode";
      const char *ExecModeName = ExecModeNameStr.c_str();

      CUdeviceptr ExecModePtr;
      size_t CUSize;
      Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName);
      if (Err == CUDA_SUCCESS) {
        if (CUSize != sizeof(int8_t)) {
          DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
             ExecModeName, CUSize, sizeof(int8_t));
          return nullptr;
        }

        Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize);
        if (Err != CUDA_SUCCESS) {
          DP("Error when copying data from device to host. 
Pointers: " 659 "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n", 660 DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize); 661 CUDA_ERR_STRING(Err); 662 return nullptr; 663 } 664 665 if (ExecModeVal < 0 || ExecModeVal > 1) { 666 DP("Error wrong exec_mode value specified in cubin file: %d\n", 667 ExecModeVal); 668 return nullptr; 669 } 670 } else { 671 DP("Loading global exec_mode '%s' - symbol missing, using default " 672 "value GENERIC (1)\n", 673 ExecModeName); 674 CUDA_ERR_STRING(Err); 675 } 676 677 KernelsList.emplace_back(Func, ExecModeVal); 678 679 __tgt_offload_entry Entry = *E; 680 Entry.addr = &KernelsList.back(); 681 addOffloadEntry(DeviceId, Entry); 682 } 683 684 // send device environment data to the device 685 { 686 omptarget_device_environmentTy DeviceEnv{0}; 687 688 #ifdef OMPTARGET_DEBUG 689 if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) 690 DeviceEnv.debug_level = std::stoi(EnvStr); 691 #endif 692 693 const char *DeviceEnvName = "omptarget_device_environment"; 694 CUdeviceptr DeviceEnvPtr; 695 size_t CUSize; 696 697 Err = cuModuleGetGlobal(&DeviceEnvPtr, &CUSize, Module, DeviceEnvName); 698 if (Err == CUDA_SUCCESS) { 699 if (CUSize != sizeof(DeviceEnv)) { 700 DP("Global device_environment '%s' - size mismatch (%zu != %zu)\n", 701 DeviceEnvName, CUSize, sizeof(int32_t)); 702 CUDA_ERR_STRING(Err); 703 return nullptr; 704 } 705 706 Err = cuMemcpyHtoD(DeviceEnvPtr, &DeviceEnv, CUSize); 707 if (Err != CUDA_SUCCESS) { 708 DP("Error when copying data from host to device. 
Pointers: "
             "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
             DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize);
          CUDA_ERR_STRING(Err);
          return nullptr;
        }

        DP("Sending global device environment data %zu bytes\n", CUSize);
      } else {
        DP("Finding global device environment '%s' - symbol missing.\n",
           DeviceEnvName);
        DP("Continue, considering this is a device RTL which does not accept "
           "environment setting.\n");
      }
    }

    return getOffloadEntriesTable(DeviceId);
  }

  // Allocate Size bytes of device memory; returns nullptr on failure or when
  // Size == 0.
  void *dataAlloc(const int DeviceId, const int64_t Size) const {
    if (Size == 0)
      return nullptr;

    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return nullptr;

    CUdeviceptr DevicePtr;
    Err = cuMemAlloc(&DevicePtr, Size);
    if (!checkResult(Err, "Error returned from cuMemAlloc\n"))
      return nullptr;

    return (void *)DevicePtr;
  }

  // Asynchronous host-to-device copy on the stream attached to AsyncInfoPtr.
  int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr,
                 const int64_t Size, __tgt_async_info *AsyncInfoPtr) const {
    assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");

    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    CUstream Stream = getStream(DeviceId, AsyncInfoPtr);

    Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream);
    if (Err != CUDA_SUCCESS) {
      DP("Error when copying data from host to device. Pointers: host = " DPxMOD
         ", device = " DPxMOD ", size = %" PRId64 "\n",
         DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
      CUDA_ERR_STRING(Err);
      return OFFLOAD_FAIL;
    }

    return OFFLOAD_SUCCESS;
  }

  // Asynchronous device-to-host copy on the stream attached to AsyncInfoPtr.
  int dataRetrieve(const int DeviceId, void *HstPtr, const void *TgtPtr,
                   const int64_t Size, __tgt_async_info *AsyncInfoPtr) const {
    assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");

    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    CUstream Stream = getStream(DeviceId, AsyncInfoPtr);

    Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
    if (Err != CUDA_SUCCESS) {
      DP("Error when copying data from device to host. Pointers: host = " DPxMOD
         ", device = " DPxMOD ", size = %" PRId64 "\n",
         DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
      CUDA_ERR_STRING(Err);
      return OFFLOAD_FAIL;
    }

    return OFFLOAD_SUCCESS;
  }

  // Copy between two devices: tries peer-to-peer first, falling back to a
  // plain device-to-device copy whenever P2P is unsupported or fails.
  int dataExchange(int SrcDevId, const void *SrcPtr, int DstDevId, void *DstPtr,
                   int64_t Size, __tgt_async_info *AsyncInfoPtr) const {
    assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");

    CUresult Err = cuCtxSetCurrent(DeviceData[SrcDevId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    CUstream Stream = getStream(SrcDevId, AsyncInfoPtr);

    // If they are two devices, we try peer to peer copy first
    if (SrcDevId != DstDevId) {
      int CanAccessPeer = 0;
      Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId);
      if (Err != CUDA_SUCCESS) {
        DP("Error returned from cuDeviceCanAccessPeer. src = %" PRId32
           ", dst = %" PRId32 "\n",
           SrcDevId, DstDevId);
        CUDA_ERR_STRING(Err);
        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
      }

      if (!CanAccessPeer) {
        DP("P2P memcpy not supported so fall back to D2D memcpy");
        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
      }

      Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0);
      if (Err != CUDA_SUCCESS) {
        DP("Error returned from cuCtxEnablePeerAccess. src = %" PRId32
           ", dst = %" PRId32 "\n",
           SrcDevId, DstDevId);
        CUDA_ERR_STRING(Err);
        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
      }

      Err = cuMemcpyPeerAsync((CUdeviceptr)DstPtr, DeviceData[DstDevId].Context,
                              (CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context,
                              Size, Stream);
      if (Err == CUDA_SUCCESS)
        return OFFLOAD_SUCCESS;

      DP("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD
         ", src_id =%" PRId32 ", dst_ptr = " DPxMOD ", dst_id =%" PRId32 "\n",
         DPxPTR(SrcPtr), SrcDevId, DPxPTR(DstPtr), DstDevId);
      CUDA_ERR_STRING(Err);
    }

    return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
  }

  // Free device memory previously returned by dataAlloc.
  int dataDelete(const int DeviceId, void *TgtPtr) const {
    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    Err = cuMemFree((CUdeviceptr)TgtPtr);
    if (!checkResult(Err, "Error returned from cuMemFree\n"))
      return OFFLOAD_FAIL;

    return OFFLOAD_SUCCESS;
  }

  // Compute the launch configuration (threads per block, blocks per grid)
  // from the requested team/thread limits, kernel attributes and device
  // limits, then launch TgtEntryPtr's kernel asynchronously.
  int runTargetTeamRegion(const int DeviceId, void *TgtEntryPtr, void **TgtArgs,
                          ptrdiff_t *TgtOffsets, const int ArgNum,
                          const int TeamNum, const int ThreadLimit,
                          const unsigned int LoopTripCount,
                          __tgt_async_info *AsyncInfo) const {
    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    // All args are references.
    std::vector<void *> Args(ArgNum);
    std::vector<void *> Ptrs(ArgNum);

    for (int I = 0; I < ArgNum; ++I) {
      Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
      Args[I] = &Ptrs[I];
    }

    KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr);

    int CudaThreadsPerBlock;
    if (ThreadLimit > 0) {
      DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
      CudaThreadsPerBlock = ThreadLimit;
      // Add master warp if necessary
      if (KernelInfo->ExecutionMode == GENERIC) {
        DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize);
        CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
      }
    } else {
      DP("Setting CUDA threads per block to default %d\n",
         DeviceData[DeviceId].NumThreads);
      CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads;
    }

    if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) {
      DP("Threads per block capped at device limit %d\n",
         DeviceData[DeviceId].ThreadsPerBlock);
      CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock;
    }

    // Lazily query and cache the kernel's own thread limit.
    if (!KernelInfo->MaxThreadsPerBlock) {
      Err = cuFuncGetAttribute(&KernelInfo->MaxThreadsPerBlock,
                               CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                               KernelInfo->Func);
      if (!checkResult(Err, "Error returned from cuFuncGetAttribute\n"))
        return OFFLOAD_FAIL;
    }

    if (KernelInfo->MaxThreadsPerBlock < CudaThreadsPerBlock) {
      DP("Threads per block capped at kernel limit %d\n",
         KernelInfo->MaxThreadsPerBlock);
      CudaThreadsPerBlock = KernelInfo->MaxThreadsPerBlock;
    }

    unsigned int CudaBlocksPerGrid;
    if (TeamNum <= 0) {
      if (LoopTripCount > 0 && EnvNumTeams < 0) {
        if (KernelInfo->ExecutionMode == SPMD) {
          // We have a combined construct, i.e. `target teams distribute
          // parallel for [simd]`. We launch so many teams so that each thread
          // will execute one iteration of the loop. round up to the nearest
          // integer
          CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1;
        } else {
          // If we reach this point, then we have a non-combined construct, i.e.
          // `teams distribute` with a nested `parallel for` and each team is
          // assigned one iteration of the `distribute` loop. E.g.:
          //
          // #pragma omp target teams distribute
          // for(...loop_tripcount...) {
          //   #pragma omp parallel for
          //   for(...) {}
          // }
          //
          // Threads within a team will execute the iterations of the `parallel`
          // loop.
          CudaBlocksPerGrid = LoopTripCount;
        }
        DP("Using %d teams due to loop trip count %" PRIu32
           " and number of threads per block %d\n",
           CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock);
      } else {
        DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams);
        CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams;
      }
    } else if (TeamNum > DeviceData[DeviceId].BlocksPerGrid) {
      DP("Capping number of teams to team limit %d\n",
         DeviceData[DeviceId].BlocksPerGrid);
      CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid;
    } else {
      DP("Using requested number of teams %d\n", TeamNum);
      CudaBlocksPerGrid = TeamNum;
    }

    // Run on the device.
    DP("Launch kernel with %d blocks and %d threads\n", CudaBlocksPerGrid,
       CudaThreadsPerBlock);

    CUstream Stream = getStream(DeviceId, AsyncInfo);
    Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1,
                         /* gridDimZ */ 1, CudaThreadsPerBlock,
                         /* blockDimY */ 1, /* blockDimZ */ 1,
                         /* sharedMemBytes */ 0, Stream, &Args[0], nullptr);
    if (!checkResult(Err, "Error returned from cuLaunchKernel\n"))
      return OFFLOAD_FAIL;

    DP("Launch of entry point at " DPxMOD " successful!\n",
       DPxPTR(TgtEntryPtr));

    return OFFLOAD_SUCCESS;
  }

  // Block until all work queued on AsyncInfoPtr's stream has completed, then
  // recycle the stream.
  int synchronize(const int DeviceId, __tgt_async_info *AsyncInfoPtr) const {
    CUstream Stream = reinterpret_cast<CUstream>(AsyncInfoPtr->Queue);
    CUresult Err = cuStreamSynchronize(Stream);
    if (Err != CUDA_SUCCESS) {
      DP("Error when synchronizing stream. stream = " DPxMOD
         ", async info ptr = " DPxMOD "\n",
         DPxPTR(Stream), DPxPTR(AsyncInfoPtr));
      CUDA_ERR_STRING(Err);
      return OFFLOAD_FAIL;
    }

    // Once the stream is synchronized, return it to stream pool and reset
    // async_info. This is to make sure the synchronization only works for its
    // own tasks.
977 StreamManager->returnStream( 978 DeviceId, reinterpret_cast<CUstream>(AsyncInfoPtr->Queue)); 979 AsyncInfoPtr->Queue = nullptr; 980 981 return OFFLOAD_SUCCESS; 982 } 983 }; 984 985 DeviceRTLTy DeviceRTL; 986 } // namespace 987 988 // Exposed library API function 989 #ifdef __cplusplus 990 extern "C" { 991 #endif 992 993 int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) { 994 return elf_check_machine(image, /* EM_CUDA */ 190); 995 } 996 997 int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); } 998 999 int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) { 1000 DP("Init requires flags to %" PRId64 "\n", RequiresFlags); 1001 DeviceRTL.setRequiresFlag(RequiresFlags); 1002 return RequiresFlags; 1003 } 1004 1005 int32_t __tgt_rtl_is_data_exchangable(int32_t src_dev_id, int dst_dev_id) { 1006 if (DeviceRTL.isValidDeviceId(src_dev_id) && 1007 DeviceRTL.isValidDeviceId(dst_dev_id)) 1008 return 1; 1009 1010 return 0; 1011 } 1012 1013 int32_t __tgt_rtl_init_device(int32_t device_id) { 1014 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1015 1016 return DeviceRTL.initDevice(device_id); 1017 } 1018 1019 __tgt_target_table *__tgt_rtl_load_binary(int32_t device_id, 1020 __tgt_device_image *image) { 1021 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1022 1023 return DeviceRTL.loadBinary(device_id, image); 1024 } 1025 1026 void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *) { 1027 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1028 1029 return DeviceRTL.dataAlloc(device_id, size); 1030 } 1031 1032 int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr, 1033 int64_t size) { 1034 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1035 1036 __tgt_async_info async_info; 1037 const int32_t rc = __tgt_rtl_data_submit_async(device_id, tgt_ptr, hst_ptr, 1038 size, &async_info); 1039 if (rc != OFFLOAD_SUCCESS) 1040 
return OFFLOAD_FAIL; 1041 1042 return __tgt_rtl_synchronize(device_id, &async_info); 1043 } 1044 1045 int32_t __tgt_rtl_data_submit_async(int32_t device_id, void *tgt_ptr, 1046 void *hst_ptr, int64_t size, 1047 __tgt_async_info *async_info_ptr) { 1048 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1049 assert(async_info_ptr && "async_info_ptr is nullptr"); 1050 1051 return DeviceRTL.dataSubmit(device_id, tgt_ptr, hst_ptr, size, 1052 async_info_ptr); 1053 } 1054 1055 int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr, 1056 int64_t size) { 1057 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1058 1059 __tgt_async_info async_info; 1060 const int32_t rc = __tgt_rtl_data_retrieve_async(device_id, hst_ptr, tgt_ptr, 1061 size, &async_info); 1062 if (rc != OFFLOAD_SUCCESS) 1063 return OFFLOAD_FAIL; 1064 1065 return __tgt_rtl_synchronize(device_id, &async_info); 1066 } 1067 1068 int32_t __tgt_rtl_data_retrieve_async(int32_t device_id, void *hst_ptr, 1069 void *tgt_ptr, int64_t size, 1070 __tgt_async_info *async_info_ptr) { 1071 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1072 assert(async_info_ptr && "async_info_ptr is nullptr"); 1073 1074 return DeviceRTL.dataRetrieve(device_id, hst_ptr, tgt_ptr, size, 1075 async_info_ptr); 1076 } 1077 1078 int32_t __tgt_rtl_data_exchange_async(int32_t src_dev_id, void *src_ptr, 1079 int dst_dev_id, void *dst_ptr, 1080 int64_t size, 1081 __tgt_async_info *async_info_ptr) { 1082 assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid"); 1083 assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid"); 1084 assert(async_info_ptr && "async_info_ptr is nullptr"); 1085 1086 return DeviceRTL.dataExchange(src_dev_id, src_ptr, dst_dev_id, dst_ptr, size, 1087 async_info_ptr); 1088 } 1089 1090 int32_t __tgt_rtl_data_exchange(int32_t src_dev_id, void *src_ptr, 1091 int32_t dst_dev_id, void *dst_ptr, 1092 int64_t size) 
{ 1093 assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid"); 1094 assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid"); 1095 1096 __tgt_async_info async_info; 1097 const int32_t rc = __tgt_rtl_data_exchange_async( 1098 src_dev_id, src_ptr, dst_dev_id, dst_ptr, size, &async_info); 1099 if (rc != OFFLOAD_SUCCESS) 1100 return OFFLOAD_FAIL; 1101 1102 return __tgt_rtl_synchronize(src_dev_id, &async_info); 1103 } 1104 1105 int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) { 1106 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1107 1108 return DeviceRTL.dataDelete(device_id, tgt_ptr); 1109 } 1110 1111 int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr, 1112 void **tgt_args, 1113 ptrdiff_t *tgt_offsets, 1114 int32_t arg_num, int32_t team_num, 1115 int32_t thread_limit, 1116 uint64_t loop_tripcount) { 1117 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1118 1119 __tgt_async_info async_info; 1120 const int32_t rc = __tgt_rtl_run_target_team_region_async( 1121 device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num, 1122 thread_limit, loop_tripcount, &async_info); 1123 if (rc != OFFLOAD_SUCCESS) 1124 return OFFLOAD_FAIL; 1125 1126 return __tgt_rtl_synchronize(device_id, &async_info); 1127 } 1128 1129 int32_t __tgt_rtl_run_target_team_region_async( 1130 int32_t device_id, void *tgt_entry_ptr, void **tgt_args, 1131 ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num, 1132 int32_t thread_limit, uint64_t loop_tripcount, 1133 __tgt_async_info *async_info_ptr) { 1134 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1135 1136 return DeviceRTL.runTargetTeamRegion( 1137 device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num, 1138 thread_limit, loop_tripcount, async_info_ptr); 1139 } 1140 1141 int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr, 1142 void **tgt_args, ptrdiff_t 
*tgt_offsets, 1143 int32_t arg_num) { 1144 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1145 1146 __tgt_async_info async_info; 1147 const int32_t rc = __tgt_rtl_run_target_region_async( 1148 device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, &async_info); 1149 if (rc != OFFLOAD_SUCCESS) 1150 return OFFLOAD_FAIL; 1151 1152 return __tgt_rtl_synchronize(device_id, &async_info); 1153 } 1154 1155 int32_t __tgt_rtl_run_target_region_async(int32_t device_id, 1156 void *tgt_entry_ptr, void **tgt_args, 1157 ptrdiff_t *tgt_offsets, 1158 int32_t arg_num, 1159 __tgt_async_info *async_info_ptr) { 1160 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1161 1162 return __tgt_rtl_run_target_team_region_async( 1163 device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, 1164 /* team num*/ 1, /* thread_limit */ 1, /* loop_tripcount */ 0, 1165 async_info_ptr); 1166 } 1167 1168 int32_t __tgt_rtl_synchronize(int32_t device_id, 1169 __tgt_async_info *async_info_ptr) { 1170 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1171 assert(async_info_ptr && "async_info_ptr is nullptr"); 1172 assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr"); 1173 1174 return DeviceRTL.synchronize(device_id, async_info_ptr); 1175 } 1176 1177 #ifdef __cplusplus 1178 } 1179 #endif 1180