//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// RTL for CUDA machine
//
//===----------------------------------------------------------------------===//

#include <cassert>
#include <cstddef>
#include <cuda.h>
#include <list>
#include <memory>
#include <mutex>
#include <string>
#include <vector>

#include "omptargetplugin.h"

#ifndef TARGET_NAME
#define TARGET_NAME CUDA
#endif

#ifdef OMPTARGET_DEBUG
// Debug verbosity, set from LIBOMPTARGET_DEBUG in DeviceRTLTy's constructor.
static int DebugLevel = 0;

#define GETNAME2(name) #name
#define GETNAME(name) GETNAME2(name)
// Debug print; compiled out entirely when OMPTARGET_DEBUG is not defined.
#define DP(...) \
  do { \
    if (DebugLevel > 0) { \
      DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", __VA_ARGS__); \
    } \
  } while (false)

// Utility for retrieving and printing CUDA error string.
#define CUDA_ERR_STRING(err) \
  do { \
    if (DebugLevel > 0) { \
      const char *errStr; \
      cuGetErrorString(err, &errStr); \
      DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", "CUDA error is: %s\n", errStr); \
    } \
  } while (false)
#else // OMPTARGET_DEBUG
#define DP(...) {}
#define CUDA_ERR_STRING(err) {}
#endif // OMPTARGET_DEBUG

#include "../../common/elf_common.c"

/// Keep entries table per device.
struct FuncOrGblEntryTy {
  // Table handed back to libomptarget; its begin/end point into Entries.
  __tgt_target_table Table;
  std::vector<__tgt_offload_entry> Entries;
};

enum ExecutionModeType {
  SPMD, // constructors, destructors,
  // combined constructs (`teams distribute parallel for [simd]`)
  GENERIC, // everything else
  NONE
};

/// Use a single entity to encode a kernel and a set of flags.
struct KernelTy {
  CUfunction Func;

  // execution mode of kernel
  // 0 - SPMD mode (without master warp)
  // 1 - Generic mode (with master warp)
  int8_t ExecutionMode;

  KernelTy(CUfunction _Func, int8_t _ExecutionMode)
      : Func(_Func), ExecutionMode(_ExecutionMode) {}
};

/// Device environment data
/// Manually sync with the deviceRTL side for now, move to a dedicated header
/// file later.
struct omptarget_device_environmentTy {
  int32_t debug_level;
};

/// List that contains all the kernels.
/// FIXME: we may need this to be per device and per library.
/// NOTE: std::list is used (not vector) so that addresses of elements stay
/// stable; entry addresses handed to libomptarget point at these nodes.
std::list<KernelTy> KernelsList;

namespace {
// Return true on CUDA_SUCCESS; otherwise print ErrMsg plus the CUDA error
// string (debug builds only) and return false.
bool checkResult(CUresult Err, const char *ErrMsg) {
  if (Err == CUDA_SUCCESS)
    return true;

  DP(ErrMsg);
  CUDA_ERR_STRING(Err);
  return false;
}

// Asynchronous device-to-device copy on Stream; returns OFFLOAD_SUCCESS or
// OFFLOAD_FAIL.
int memcpyDtoD(const void *SrcPtr, void *DstPtr, int64_t Size,
               CUstream Stream) {
  CUresult Err =
      cuMemcpyDtoDAsync((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, Size, Stream);

  if (Err != CUDA_SUCCESS) {
    DP("Error when copying data from device to device. Pointers: src "
       "= " DPxMOD ", dst = " DPxMOD ", size = %" PRId64 "\n",
       DPxPTR(SrcPtr), DPxPTR(DstPtr), Size);
    CUDA_ERR_STRING(Err);
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

// Structure contains per-device data
struct DeviceDataTy {
  std::list<FuncOrGblEntryTy> FuncGblEntries;
  CUcontext Context = nullptr;
  // Device properties
  int ThreadsPerBlock = 0;
  int BlocksPerGrid = 0;
  int WarpSize = 0;
  // OpenMP properties
  int NumTeams = 0;
  int NumThreads = 0;
};

class StreamManagerTy {
  int NumberOfDevices;
  // The initial size of stream pool
  int EnvNumInitialStreams;
  // Per-device stream mutex
  std::vector<std::unique_ptr<std::mutex>> StreamMtx;
  // Per-device stream Id indicates the next available stream in the pool
  std::vector<int> NextStreamId;
  // Per-device stream pool
  std::vector<std::vector<CUstream>> StreamPool;
  // Reference to per-device data
  std::vector<DeviceDataTy> &DeviceData;

  // If there is no CUstream left in the pool, we will resize the pool to
  // allocate more CUstream. This function should be called with device mutex,
  // and we do not resize to smaller one.
  void resizeStreamPool(const int DeviceId, const size_t NewSize) {
    std::vector<CUstream> &Pool = StreamPool[DeviceId];
    const size_t CurrentSize = Pool.size();
    assert(NewSize > CurrentSize && "new size is not larger than current size");

    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) {
      // We will return if cannot switch to the right context in case of
      // creating bunch of streams that are not corresponding to the right
      // device. The offloading will fail later because selected CUstream is
      // nullptr.
      return;
    }

    Pool.resize(NewSize, nullptr);

    for (size_t I = CurrentSize; I < NewSize; ++I) {
      checkResult(cuStreamCreate(&Pool[I], CU_STREAM_NON_BLOCKING),
                  "Error returned from cuStreamCreate\n");
    }
  }

public:
  StreamManagerTy(const int NumberOfDevices,
                  std::vector<DeviceDataTy> &DeviceData)
      : NumberOfDevices(NumberOfDevices), EnvNumInitialStreams(32),
        DeviceData(DeviceData) {
    StreamPool.resize(NumberOfDevices);
    NextStreamId.resize(NumberOfDevices);
    StreamMtx.resize(NumberOfDevices);

    if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS"))
      EnvNumInitialStreams = std::stoi(EnvStr);

    // Initialize the next stream id
    std::fill(NextStreamId.begin(), NextStreamId.end(), 0);

    // Initialize stream mutex
    for (std::unique_ptr<std::mutex> &Ptr : StreamMtx)
      Ptr = std::make_unique<std::mutex>();
  }

  ~StreamManagerTy() {
    // Destroy streams
    for (int I = 0; I < NumberOfDevices; ++I) {
      checkResult(cuCtxSetCurrent(DeviceData[I].Context),
                  "Error returned from cuCtxSetCurrent\n");

      for (CUstream &S : StreamPool[I]) {
        if (S)
          checkResult(cuStreamDestroy(S),
                      "Error returned from cuStreamDestroy\n");
      }
    }
  }

  // Get a CUstream from pool. Per-device next stream id always points to the
  // next available CUstream. That means, CUstreams [0, id-1] have been
  // assigned, and [id, ...] are still available. If there is no CUstream left,
  // we will ask more CUstreams from CUDA RT. Each time a CUstream is assigned,
  // the id advances by one:
  //    xxxxxs+++++++++          xxxxxs+++++++++
  //         ^             =>           ^
  //         id (before)                id (after; 's' was handed out)
  CUstream getStream(const int DeviceId) {
    const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
    int &Id = NextStreamId[DeviceId];
    // No CUstream left in the pool, we need to request from CUDA RT
    if (Id == StreamPool[DeviceId].size()) {
      // By default we double the stream pool every time
      resizeStreamPool(DeviceId, Id * 2);
    }
    return StreamPool[DeviceId][Id++];
  }

  // Return a CUstream back to pool. As mentioned above, per-device next
  // stream is always points to the next available CUstream, so when we return
  // a CUstream, we need to first decrease the id, and then copy the CUstream
  // back.
  // It is worth noting that, the order of streams return might be different
  // from that they're assigned, that saying, at some point, there might be
  // two identical CUstreams.
  // xxax+a+++++
  //    ^
  //    id
  // However, it doesn't matter, because they're always on the two sides of
  // id. The left one will in the end be overwritten by another CUstream.
  // Therefore, after several execution, the order of pool might be different
  // from its initial state.
  void returnStream(const int DeviceId, CUstream Stream) {
    const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
    int &Id = NextStreamId[DeviceId];
    assert(Id > 0 && "Wrong stream ID");
    StreamPool[DeviceId][--Id] = Stream;
  }

  // Create the initial pool of streams for a device. Returns false if the
  // pool could not be fully populated with valid streams.
  bool initializeDeviceStreamPool(const int DeviceId) {
    assert(StreamPool[DeviceId].empty() && "stream pool has been initialized");

    resizeStreamPool(DeviceId, EnvNumInitialStreams);

    // Check the size of stream pool
    if (StreamPool[DeviceId].size() != EnvNumInitialStreams)
      return false;

    // Check whether each stream is valid
    for (CUstream &S : StreamPool[DeviceId])
      if (!S)
        return false;

    return true;
  }
};

class DeviceRTLTy {
  int NumberOfDevices;
  // OpenMP environment properties
  int EnvNumTeams;
  int EnvTeamLimit;
  // OpenMP requires flags
  int64_t RequiresFlags;

  static constexpr const int HardTeamLimit = 1U << 16U; // 64k
  static constexpr const int HardThreadLimit = 1024;
  static constexpr const int DefaultNumTeams = 128;
  static constexpr const int DefaultNumThreads = 128;

  std::unique_ptr<StreamManagerTy> StreamManager;
  std::vector<DeviceDataTy> DeviceData;
  std::vector<CUmodule> Modules;

  // Record entry point associated with device
  void addOffloadEntry(const int DeviceId, const __tgt_offload_entry entry) {
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
    E.Entries.push_back(entry);
  }

  // Return true if the entry is associated with device
  bool findOffloadEntry(const int DeviceId, const void *Addr) const {
    for (const __tgt_offload_entry &Itr :
         DeviceData[DeviceId].FuncGblEntries.back().Entries)
      if (Itr.addr == Addr)
        return true;

    return false;
  }

  // Return the pointer to the target entries table
  __tgt_target_table *getOffloadEntriesTable(const int DeviceId) {
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();

    if (E.Entries.empty())
      return nullptr;

    // Update table info according to the entries and return the pointer
    E.Table.EntriesBegin = E.Entries.data();
    E.Table.EntriesEnd = E.Entries.data() + E.Entries.size();

    return &E.Table;
  }

  // Clear entries table for a device: start a fresh FuncOrGblEntryTy so a
  // newly loaded image gets its own table.
  void clearOffloadEntriesTable(const int DeviceId) {
    DeviceData[DeviceId].FuncGblEntries.emplace_back();
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
    E.Entries.clear();
    E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr;
  }

  // Return the stream bound to AsyncInfoPtr, lazily borrowing one from the
  // stream manager on first use.
  CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfoPtr) const {
    assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");

    if (!AsyncInfoPtr->Queue)
      AsyncInfoPtr->Queue = StreamManager->getStream(DeviceId);

    return reinterpret_cast<CUstream>(AsyncInfoPtr->Queue);
  }

public:
  // This class should not be copied
  DeviceRTLTy(const DeviceRTLTy &) = delete;
  DeviceRTLTy(DeviceRTLTy &&) = delete;

  DeviceRTLTy()
      : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1),
        RequiresFlags(OMP_REQ_UNDEFINED) {
#ifdef OMPTARGET_DEBUG
    if (const char *EnvStr = getenv("LIBOMPTARGET_DEBUG"))
      DebugLevel = std::stoi(EnvStr);
#endif // OMPTARGET_DEBUG

    DP("Start initializing CUDA\n");

    CUresult Err = cuInit(0);
    if (!checkResult(Err, "Error returned from cuInit\n")) {
      return;
    }

    Err = cuDeviceGetCount(&NumberOfDevices);
    if (!checkResult(Err, "Error returned from cuDeviceGetCount\n"))
      return;

    if (NumberOfDevices == 0) {
      DP("There are no devices supporting CUDA.\n");
      return;
    }

    DeviceData.resize(NumberOfDevices);

    // Get environment variables regarding teams
    if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) {
      // OMP_TEAM_LIMIT has been set
      EnvTeamLimit = std::stoi(EnvStr);
      DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
    }
    if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) {
      // OMP_NUM_TEAMS has been set
      EnvNumTeams = std::stoi(EnvStr);
      DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
    }

    StreamManager =
        std::make_unique<StreamManagerTy>(NumberOfDevices, DeviceData);
  }

  ~DeviceRTLTy() {
    // First destruct stream manager in case of Contexts is destructed before it
    StreamManager = nullptr;

    for (CUmodule &M : Modules)
      // Close module
      if (M)
        checkResult(cuModuleUnload(M), "Error returned from cuModuleUnload\n");

    for (DeviceDataTy &D : DeviceData) {
      // Destroy context
      if (D.Context)
        checkResult(cuCtxDestroy(D.Context),
                    "Error returned from cuCtxDestroy\n");
    }
  }

  // Check whether a given DeviceId is valid
  bool isValidDeviceId(const int DeviceId) const {
    return DeviceId >= 0 && DeviceId < NumberOfDevices;
  }

  int getNumOfDevices() const { return NumberOfDevices; }

  void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; }

  // Create the context and stream pool for DeviceId and derive its launch
  // limits and OpenMP defaults from the device attributes.
  int initDevice(const int DeviceId) {
    CUdevice Device;

    DP("Getting device %d\n", DeviceId);
    CUresult Err = cuDeviceGet(&Device, DeviceId);
    if (!checkResult(Err, "Error returned from cuDeviceGet\n"))
      return OFFLOAD_FAIL;

    // Create the context and save it to use whenever this device is selected.
    Err = cuCtxCreate(&DeviceData[DeviceId].Context, CU_CTX_SCHED_BLOCKING_SYNC,
                      Device);
    if (!checkResult(Err, "Error returned from cuCtxCreate\n"))
      return OFFLOAD_FAIL;

    Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    // Initialize stream pool
    if (!StreamManager->initializeDeviceStreamPool(DeviceId))
      return OFFLOAD_FAIL;

    // Query attributes to determine number of threads/block and blocks/grid.
426 int MaxGridDimX; 427 Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, 428 Device); 429 if (Err != CUDA_SUCCESS) { 430 DP("Error getting max grid dimension, use default value %d\n", 431 DeviceRTLTy::DefaultNumTeams); 432 DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::DefaultNumTeams; 433 } else if (MaxGridDimX <= DeviceRTLTy::HardTeamLimit) { 434 DP("Using %d CUDA blocks per grid\n", MaxGridDimX); 435 DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX; 436 } else { 437 DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping " 438 "at the hard limit\n", 439 MaxGridDimX, DeviceRTLTy::HardTeamLimit); 440 DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::HardTeamLimit; 441 } 442 443 // We are only exploiting threads along the x axis. 444 int MaxBlockDimX; 445 Err = cuDeviceGetAttribute(&MaxBlockDimX, 446 CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device); 447 if (Err != CUDA_SUCCESS) { 448 DP("Error getting max block dimension, use default value %d\n", 449 DeviceRTLTy::DefaultNumThreads); 450 DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads; 451 } else if (MaxBlockDimX <= DeviceRTLTy::HardThreadLimit) { 452 DP("Using %d CUDA threads per block\n", MaxBlockDimX); 453 DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX; 454 } else { 455 DP("Max CUDA threads per block %d exceeds the hard thread limit %d, " 456 "capping at the hard limit\n", 457 MaxBlockDimX, DeviceRTLTy::HardThreadLimit); 458 DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit; 459 } 460 461 // Get and set warp size 462 int WarpSize; 463 Err = 464 cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device); 465 if (Err != CUDA_SUCCESS) { 466 DP("Error getting warp size, assume default value 32\n"); 467 DeviceData[DeviceId].WarpSize = 32; 468 } else { 469 DP("Using warp size %d\n", WarpSize); 470 DeviceData[DeviceId].WarpSize = WarpSize; 471 } 472 473 // Adjust teams to the env variables 474 if (EnvTeamLimit > 0 && 
DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) { 475 DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n", 476 EnvTeamLimit); 477 DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit; 478 } 479 480 DP("Max number of CUDA blocks %d, threads %d & warp size %d\n", 481 DeviceData[DeviceId].BlocksPerGrid, DeviceData[DeviceId].ThreadsPerBlock, 482 DeviceData[DeviceId].WarpSize); 483 484 // Set default number of teams 485 if (EnvNumTeams > 0) { 486 DP("Default number of teams set according to environment %d\n", 487 EnvNumTeams); 488 DeviceData[DeviceId].NumTeams = EnvNumTeams; 489 } else { 490 DeviceData[DeviceId].NumTeams = DeviceRTLTy::DefaultNumTeams; 491 DP("Default number of teams set according to library's default %d\n", 492 DeviceRTLTy::DefaultNumTeams); 493 } 494 495 if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) { 496 DP("Default number of teams exceeds device limit, capping at %d\n", 497 DeviceData[DeviceId].BlocksPerGrid); 498 DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid; 499 } 500 501 // Set default number of threads 502 DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads; 503 DP("Default number of threads set according to library's default %d\n", 504 DeviceRTLTy::DefaultNumThreads); 505 if (DeviceData[DeviceId].NumThreads > 506 DeviceData[DeviceId].ThreadsPerBlock) { 507 DP("Default number of threads exceeds device limit, capping at %d\n", 508 DeviceData[DeviceId].ThreadsPerBlock); 509 DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].ThreadsPerBlock; 510 } 511 512 return OFFLOAD_SUCCESS; 513 } 514 515 __tgt_target_table *loadBinary(const int DeviceId, 516 const __tgt_device_image *Image) { 517 // Set the context we are using 518 CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); 519 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) 520 return nullptr; 521 522 // Clear the offload table as we are going to create a new one. 
    clearOffloadEntriesTable(DeviceId);

    // Create the module and extract the function pointers.
    CUmodule Module;
    DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart));
    Err = cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr);
    if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n"))
      return nullptr;

    DP("CUDA module successfully loaded!\n");

    Modules.push_back(Module);

    // Find the symbols in the module by name.
    const __tgt_offload_entry *HostBegin = Image->EntriesBegin;
    const __tgt_offload_entry *HostEnd = Image->EntriesEnd;

    for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) {
      if (!E->addr) {
        // We return nullptr when something like this happens, the host should
        // have always something in the address to uniquely identify the target
        // region.
        DP("Invalid binary: host entry '<null>' (size = %zd)...\n", E->size);
        return nullptr;
      }

      // A non-zero size marks a global variable entry (kernels have size 0).
      if (E->size) {
        __tgt_offload_entry Entry = *E;
        CUdeviceptr CUPtr;
        size_t CUSize;
        Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name);
        // We keep this style here because we need the name
        if (Err != CUDA_SUCCESS) {
          DP("Loading global '%s' (Failed)\n", E->name);
          CUDA_ERR_STRING(Err);
          return nullptr;
        }

        if (CUSize != E->size) {
          DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name,
             CUSize, E->size);
          return nullptr;
        }

        DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
           DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr));

        // Expose the device-side address of the global to libomptarget.
        Entry.addr = (void *)(CUPtr);

        // Note: In the current implementation declare target variables
        // can either be link or to. This means that once unified
        // memory is activated via the requires directive, the variable
        // can be used directly from the host in both cases.
576 // TODO: when variables types other than to or link are added, 577 // the below condition should be changed to explicitly 578 // check for to and link variables types: 579 // (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags & 580 // OMP_DECLARE_TARGET_LINK || e->flags == OMP_DECLARE_TARGET_TO)) 581 if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) { 582 // If unified memory is present any target link or to variables 583 // can access host addresses directly. There is no longer a 584 // need for device copies. 585 cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *)); 586 DP("Copy linked variable host address (" DPxMOD 587 ") to device address (" DPxMOD ")\n", 588 DPxPTR(*((void **)E->addr)), DPxPTR(CUPtr)); 589 } 590 591 addOffloadEntry(DeviceId, Entry); 592 593 continue; 594 } 595 596 CUfunction Func; 597 Err = cuModuleGetFunction(&Func, Module, E->name); 598 // We keep this style here because we need the name 599 if (Err != CUDA_SUCCESS) { 600 DP("Loading '%s' (Failed)\n", E->name); 601 CUDA_ERR_STRING(Err); 602 return nullptr; 603 } 604 605 DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n", 606 DPxPTR(E - HostBegin), E->name, DPxPTR(Func)); 607 608 // default value GENERIC (in case symbol is missing from cubin file) 609 int8_t ExecModeVal = ExecutionModeType::GENERIC; 610 std::string ExecModeNameStr(E->name); 611 ExecModeNameStr += "_exec_mode"; 612 const char *ExecModeName = ExecModeNameStr.c_str(); 613 614 CUdeviceptr ExecModePtr; 615 size_t CUSize; 616 Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName); 617 if (Err == CUDA_SUCCESS) { 618 if (CUSize != sizeof(int8_t)) { 619 DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n", 620 ExecModeName, CUSize, sizeof(int8_t)); 621 return nullptr; 622 } 623 624 Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize); 625 if (Err != CUDA_SUCCESS) { 626 DP("Error when copying data from device to host. 
Pointers: " 627 "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n", 628 DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize); 629 CUDA_ERR_STRING(Err); 630 return nullptr; 631 } 632 633 if (ExecModeVal < 0 || ExecModeVal > 1) { 634 DP("Error wrong exec_mode value specified in cubin file: %d\n", 635 ExecModeVal); 636 return nullptr; 637 } 638 } else { 639 DP("Loading global exec_mode '%s' - symbol missing, using default " 640 "value GENERIC (1)\n", 641 ExecModeName); 642 CUDA_ERR_STRING(Err); 643 } 644 645 KernelsList.emplace_back(Func, ExecModeVal); 646 647 __tgt_offload_entry Entry = *E; 648 Entry.addr = &KernelsList.back(); 649 addOffloadEntry(DeviceId, Entry); 650 } 651 652 // send device environment data to the device 653 { 654 omptarget_device_environmentTy DeviceEnv{0}; 655 656 #ifdef OMPTARGET_DEBUG 657 if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) 658 DeviceEnv.debug_level = std::stoi(EnvStr); 659 #endif 660 661 const char *DeviceEnvName = "omptarget_device_environment"; 662 CUdeviceptr DeviceEnvPtr; 663 size_t CUSize; 664 665 Err = cuModuleGetGlobal(&DeviceEnvPtr, &CUSize, Module, DeviceEnvName); 666 if (Err == CUDA_SUCCESS) { 667 if (CUSize != sizeof(DeviceEnv)) { 668 DP("Global device_environment '%s' - size mismatch (%zu != %zu)\n", 669 DeviceEnvName, CUSize, sizeof(int32_t)); 670 CUDA_ERR_STRING(Err); 671 return nullptr; 672 } 673 674 Err = cuMemcpyHtoD(DeviceEnvPtr, &DeviceEnv, CUSize); 675 if (Err != CUDA_SUCCESS) { 676 DP("Error when copying data from host to device. 
Pointers: " 677 "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n", 678 DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize); 679 CUDA_ERR_STRING(Err); 680 return nullptr; 681 } 682 683 DP("Sending global device environment data %zu bytes\n", CUSize); 684 } else { 685 DP("Finding global device environment '%s' - symbol missing.\n", 686 DeviceEnvName); 687 DP("Continue, considering this is a device RTL which does not accept " 688 "environment setting.\n"); 689 } 690 } 691 692 return getOffloadEntriesTable(DeviceId); 693 } 694 695 void *dataAlloc(const int DeviceId, const int64_t Size) const { 696 if (Size == 0) 697 return nullptr; 698 699 CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); 700 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) 701 return nullptr; 702 703 CUdeviceptr DevicePtr; 704 Err = cuMemAlloc(&DevicePtr, Size); 705 if (!checkResult(Err, "Error returned from cuMemAlloc\n")) 706 return nullptr; 707 708 return (void *)DevicePtr; 709 } 710 711 int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr, 712 const int64_t Size, __tgt_async_info *AsyncInfoPtr) const { 713 assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr"); 714 715 CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); 716 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) 717 return OFFLOAD_FAIL; 718 719 CUstream Stream = getStream(DeviceId, AsyncInfoPtr); 720 721 Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream); 722 if (Err != CUDA_SUCCESS) { 723 DP("Error when copying data from host to device. 
Pointers: host = " DPxMOD 724 ", device = " DPxMOD ", size = %" PRId64 "\n", 725 DPxPTR(HstPtr), DPxPTR(TgtPtr), Size); 726 CUDA_ERR_STRING(Err); 727 return OFFLOAD_FAIL; 728 } 729 730 return OFFLOAD_SUCCESS; 731 } 732 733 int dataRetrieve(const int DeviceId, void *HstPtr, const void *TgtPtr, 734 const int64_t Size, __tgt_async_info *AsyncInfoPtr) const { 735 assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr"); 736 737 CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); 738 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) 739 return OFFLOAD_FAIL; 740 741 CUstream Stream = getStream(DeviceId, AsyncInfoPtr); 742 743 Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream); 744 if (Err != CUDA_SUCCESS) { 745 DP("Error when copying data from device to host. Pointers: host = " DPxMOD 746 ", device = " DPxMOD ", size = %" PRId64 "\n", 747 DPxPTR(HstPtr), DPxPTR(TgtPtr), Size); 748 CUDA_ERR_STRING(Err); 749 return OFFLOAD_FAIL; 750 } 751 752 return OFFLOAD_SUCCESS; 753 } 754 755 int dataExchange(int SrcDevId, const void *SrcPtr, int DstDevId, void *DstPtr, 756 int64_t Size, __tgt_async_info *AsyncInfoPtr) const { 757 assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr"); 758 759 CUresult Err = cuCtxSetCurrent(DeviceData[SrcDevId].Context); 760 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) 761 return OFFLOAD_FAIL; 762 763 CUstream Stream = getStream(SrcDevId, AsyncInfoPtr); 764 765 // If they are two devices, we try peer to peer copy first 766 if (SrcDevId != DstDevId) { 767 int CanAccessPeer = 0; 768 Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId); 769 if (Err != CUDA_SUCCESS) { 770 DP("Error returned from cuDeviceCanAccessPeer. 
src = %" PRId32 771 ", dst = %" PRId32 "\n", 772 SrcDevId, DstDevId); 773 CUDA_ERR_STRING(Err); 774 return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); 775 } 776 777 if (!CanAccessPeer) { 778 DP("P2P memcpy not supported so fall back to D2D memcpy"); 779 return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); 780 } 781 782 Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0); 783 if (Err != CUDA_SUCCESS) { 784 DP("Error returned from cuCtxEnablePeerAccess. src = %" PRId32 785 ", dst = %" PRId32 "\n", 786 SrcDevId, DstDevId); 787 CUDA_ERR_STRING(Err); 788 return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); 789 } 790 791 Err = cuMemcpyPeerAsync((CUdeviceptr)DstPtr, DeviceData[DstDevId].Context, 792 (CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context, 793 Size, Stream); 794 if (Err == CUDA_SUCCESS) 795 return OFFLOAD_SUCCESS; 796 797 DP("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD 798 ", src_id =%" PRId32 ", dst_ptr = %" DPxMOD ", dst_id =%" PRId32 "\n", 799 SrcPtr, SrcDevId, DstPtr, DstDevId); 800 CUDA_ERR_STRING(Err); 801 } 802 803 return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); 804 } 805 806 int dataDelete(const int DeviceId, void *TgtPtr) const { 807 CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); 808 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) 809 return OFFLOAD_FAIL; 810 811 Err = cuMemFree((CUdeviceptr)TgtPtr); 812 if (!checkResult(Err, "Error returned from cuMemFree\n")) 813 return OFFLOAD_FAIL; 814 815 return OFFLOAD_SUCCESS; 816 } 817 818 int runTargetTeamRegion(const int DeviceId, const void *TgtEntryPtr, 819 void **TgtArgs, ptrdiff_t *TgtOffsets, 820 const int ArgNum, const int TeamNum, 821 const int ThreadLimit, 822 const unsigned int LoopTripCount, 823 __tgt_async_info *AsyncInfo) const { 824 CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); 825 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) 826 return OFFLOAD_FAIL; 827 828 // All args are references. 
    // Kernel parameters are passed by reference: each Args[i] points at the
    // corresponding adjusted target pointer in Ptrs.
    std::vector<void *> Args(ArgNum);
    std::vector<void *> Ptrs(ArgNum);

    for (int I = 0; I < ArgNum; ++I) {
      Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
      Args[I] = &Ptrs[I];
    }

    const KernelTy *KernelInfo =
        reinterpret_cast<const KernelTy *>(TgtEntryPtr);

    unsigned int CudaThreadsPerBlock;
    if (ThreadLimit > 0) {
      DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
      CudaThreadsPerBlock = ThreadLimit;
      // Add master warp if necessary
      if (KernelInfo->ExecutionMode == GENERIC) {
        DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize);
        CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
      }
    } else {
      DP("Setting CUDA threads per block to default %d\n",
         DeviceData[DeviceId].NumThreads);
      CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads;
    }

    // Never exceed the device's per-block thread limit.
    if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) {
      DP("Threads per block capped at device limit %d\n",
         DeviceData[DeviceId].ThreadsPerBlock);
      CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock;
    }

    // The kernel itself may have a tighter limit (register pressure etc.).
    int KernelLimit;
    Err = cuFuncGetAttribute(&KernelLimit,
                             CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                             KernelInfo->Func);
    if (Err == CUDA_SUCCESS && KernelLimit < CudaThreadsPerBlock) {
      DP("Threads per block capped at kernel limit %d\n", KernelLimit);
      CudaThreadsPerBlock = KernelLimit;
    }

    unsigned int CudaBlocksPerGrid;
    if (TeamNum <= 0) {
      if (LoopTripCount > 0 && EnvNumTeams < 0) {
        if (KernelInfo->ExecutionMode == SPMD) {
          // We have a combined construct, i.e. `target teams distribute
          // parallel for [simd]`. We launch so many teams so that each thread
          // will execute one iteration of the loop. round up to the nearest
          // integer
          CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1;
        } else {
          // If we reach this point, then we have a non-combined construct, i.e.
881 // `teams distribute` with a nested `parallel for` and each team is 882 // assigned one iteration of the `distribute` loop. E.g.: 883 // 884 // #pragma omp target teams distribute 885 // for(...loop_tripcount...) { 886 // #pragma omp parallel for 887 // for(...) {} 888 // } 889 // 890 // Threads within a team will execute the iterations of the `parallel` 891 // loop. 892 CudaBlocksPerGrid = LoopTripCount; 893 } 894 DP("Using %d teams due to loop trip count %" PRIu64 895 " and number of threads per block %d\n", 896 CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock); 897 } else { 898 DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams); 899 CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams; 900 } 901 } else if (TeamNum > DeviceData[DeviceId].BlocksPerGrid) { 902 DP("Capping number of teams to team limit %d\n", 903 DeviceData[DeviceId].BlocksPerGrid); 904 CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid; 905 } else { 906 DP("Using requested number of teams %d\n", TeamNum); 907 CudaBlocksPerGrid = TeamNum; 908 } 909 910 // Run on the device. 911 DP("Launch kernel with %d blocks and %d threads\n", CudaBlocksPerGrid, 912 CudaThreadsPerBlock); 913 914 CUstream Stream = getStream(DeviceId, AsyncInfo); 915 Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1, 916 /* gridDimZ */ 1, CudaThreadsPerBlock, 917 /* blockDimY */ 1, /* blockDimZ */ 1, 918 /* sharedMemBytes */ 0, Stream, &Args[0], nullptr); 919 if (!checkResult(Err, "Error returned from cuLaunchKernel\n")) 920 return OFFLOAD_FAIL; 921 922 DP("Launch of entry point at " DPxMOD " successful!\n", 923 DPxPTR(TgtEntryPtr)); 924 925 return OFFLOAD_SUCCESS; 926 } 927 928 int synchronize(const int DeviceId, __tgt_async_info *AsyncInfoPtr) const { 929 CUstream Stream = reinterpret_cast<CUstream>(AsyncInfoPtr->Queue); 930 CUresult Err = cuStreamSynchronize(Stream); 931 if (Err != CUDA_SUCCESS) { 932 DP("Error when synchronizing stream. 
stream = " DPxMOD 933 ", async info ptr = " DPxMOD "\n", 934 DPxPTR(Stream), DPxPTR(AsyncInfoPtr)); 935 CUDA_ERR_STRING(Err); 936 return OFFLOAD_FAIL; 937 } 938 939 // Once the stream is synchronized, return it to stream pool and reset 940 // async_info. This is to make sure the synchronization only works for its 941 // own tasks. 942 StreamManager->returnStream( 943 DeviceId, reinterpret_cast<CUstream>(AsyncInfoPtr->Queue)); 944 AsyncInfoPtr->Queue = nullptr; 945 946 return OFFLOAD_SUCCESS; 947 } 948 }; 949 950 DeviceRTLTy DeviceRTL; 951 } // namespace 952 953 // Exposed library API function 954 #ifdef __cplusplus 955 extern "C" { 956 #endif 957 958 int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) { 959 return elf_check_machine(image, /* EM_CUDA */ 190); 960 } 961 962 int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); } 963 964 int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) { 965 DP("Init requires flags to %ld\n", RequiresFlags); 966 DeviceRTL.setRequiresFlag(RequiresFlags); 967 return RequiresFlags; 968 } 969 970 int32_t __tgt_rtl_is_data_exchangable(int32_t src_dev_id, int dst_dev_id) { 971 if (DeviceRTL.isValidDeviceId(src_dev_id) && 972 DeviceRTL.isValidDeviceId(dst_dev_id)) 973 return 1; 974 975 return 0; 976 } 977 978 int32_t __tgt_rtl_init_device(int32_t device_id) { 979 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 980 981 return DeviceRTL.initDevice(device_id); 982 } 983 984 __tgt_target_table *__tgt_rtl_load_binary(int32_t device_id, 985 __tgt_device_image *image) { 986 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 987 988 return DeviceRTL.loadBinary(device_id, image); 989 } 990 991 void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *) { 992 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 993 994 return DeviceRTL.dataAlloc(device_id, size); 995 } 996 997 int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void 
*hst_ptr, 998 int64_t size) { 999 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1000 1001 __tgt_async_info async_info; 1002 const int32_t rc = __tgt_rtl_data_submit_async(device_id, tgt_ptr, hst_ptr, 1003 size, &async_info); 1004 if (rc != OFFLOAD_SUCCESS) 1005 return OFFLOAD_FAIL; 1006 1007 return __tgt_rtl_synchronize(device_id, &async_info); 1008 } 1009 1010 int32_t __tgt_rtl_data_submit_async(int32_t device_id, void *tgt_ptr, 1011 void *hst_ptr, int64_t size, 1012 __tgt_async_info *async_info_ptr) { 1013 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1014 assert(async_info_ptr && "async_info_ptr is nullptr"); 1015 1016 return DeviceRTL.dataSubmit(device_id, tgt_ptr, hst_ptr, size, 1017 async_info_ptr); 1018 } 1019 1020 int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr, 1021 int64_t size) { 1022 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1023 1024 __tgt_async_info async_info; 1025 const int32_t rc = __tgt_rtl_data_retrieve_async(device_id, hst_ptr, tgt_ptr, 1026 size, &async_info); 1027 if (rc != OFFLOAD_SUCCESS) 1028 return OFFLOAD_FAIL; 1029 1030 return __tgt_rtl_synchronize(device_id, &async_info); 1031 } 1032 1033 int32_t __tgt_rtl_data_retrieve_async(int32_t device_id, void *hst_ptr, 1034 void *tgt_ptr, int64_t size, 1035 __tgt_async_info *async_info_ptr) { 1036 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1037 assert(async_info_ptr && "async_info_ptr is nullptr"); 1038 1039 return DeviceRTL.dataRetrieve(device_id, hst_ptr, tgt_ptr, size, 1040 async_info_ptr); 1041 } 1042 1043 int32_t __tgt_rtl_data_exchange_async(int32_t src_dev_id, void *src_ptr, 1044 int dst_dev_id, void *dst_ptr, 1045 int64_t size, 1046 __tgt_async_info *async_info_ptr) { 1047 assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid"); 1048 assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid"); 1049 
assert(async_info_ptr && "async_info_ptr is nullptr"); 1050 1051 return DeviceRTL.dataExchange(src_dev_id, src_ptr, dst_dev_id, dst_ptr, size, 1052 async_info_ptr); 1053 } 1054 1055 int32_t __tgt_rtl_data_exchange(int32_t src_dev_id, void *src_ptr, 1056 int32_t dst_dev_id, void *dst_ptr, 1057 int64_t size) { 1058 assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid"); 1059 assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid"); 1060 1061 __tgt_async_info async_info; 1062 const int32_t rc = __tgt_rtl_data_exchange_async( 1063 src_dev_id, src_ptr, dst_dev_id, dst_ptr, size, &async_info); 1064 if (rc != OFFLOAD_SUCCESS) 1065 return OFFLOAD_FAIL; 1066 1067 return __tgt_rtl_synchronize(src_dev_id, &async_info); 1068 } 1069 1070 int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) { 1071 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1072 1073 return DeviceRTL.dataDelete(device_id, tgt_ptr); 1074 } 1075 1076 int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr, 1077 void **tgt_args, 1078 ptrdiff_t *tgt_offsets, 1079 int32_t arg_num, int32_t team_num, 1080 int32_t thread_limit, 1081 uint64_t loop_tripcount) { 1082 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1083 1084 __tgt_async_info async_info; 1085 const int32_t rc = __tgt_rtl_run_target_team_region_async( 1086 device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num, 1087 thread_limit, loop_tripcount, &async_info); 1088 if (rc != OFFLOAD_SUCCESS) 1089 return OFFLOAD_FAIL; 1090 1091 return __tgt_rtl_synchronize(device_id, &async_info); 1092 } 1093 1094 int32_t __tgt_rtl_run_target_team_region_async( 1095 int32_t device_id, void *tgt_entry_ptr, void **tgt_args, 1096 ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num, 1097 int32_t thread_limit, uint64_t loop_tripcount, 1098 __tgt_async_info *async_info_ptr) { 1099 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is 
invalid"); 1100 1101 return DeviceRTL.runTargetTeamRegion( 1102 device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num, 1103 thread_limit, loop_tripcount, async_info_ptr); 1104 } 1105 1106 int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr, 1107 void **tgt_args, ptrdiff_t *tgt_offsets, 1108 int32_t arg_num) { 1109 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1110 1111 __tgt_async_info async_info; 1112 const int32_t rc = __tgt_rtl_run_target_region_async( 1113 device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, &async_info); 1114 if (rc != OFFLOAD_SUCCESS) 1115 return OFFLOAD_FAIL; 1116 1117 return __tgt_rtl_synchronize(device_id, &async_info); 1118 } 1119 1120 int32_t __tgt_rtl_run_target_region_async(int32_t device_id, 1121 void *tgt_entry_ptr, void **tgt_args, 1122 ptrdiff_t *tgt_offsets, 1123 int32_t arg_num, 1124 __tgt_async_info *async_info_ptr) { 1125 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1126 1127 return __tgt_rtl_run_target_team_region_async( 1128 device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, 1129 /* team num*/ 1, /* thread_limit */ 1, /* loop_tripcount */ 0, 1130 async_info_ptr); 1131 } 1132 1133 int32_t __tgt_rtl_synchronize(int32_t device_id, 1134 __tgt_async_info *async_info_ptr) { 1135 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1136 assert(async_info_ptr && "async_info_ptr is nullptr"); 1137 assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr"); 1138 1139 return DeviceRTL.synchronize(device_id, async_info_ptr); 1140 } 1141 1142 #ifdef __cplusplus 1143 } 1144 #endif 1145