//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// RTL for CUDA machine
//
//===----------------------------------------------------------------------===//

#include <cassert>
#include <cstddef>
#include <cuda.h>
#include <list>
#include <memory>
#include <mutex>
#include <string>
#include <vector>

#include "Debug.h"
#include "omptargetplugin.h"

#define TARGET_NAME CUDA
#define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"

// Utility for retrieving and printing CUDA error string.
// Only emits output in debug builds and when the debug level is enabled.
#ifdef OMPTARGET_DEBUG
#define CUDA_ERR_STRING(err)                                                   \
  do {                                                                         \
    if (getDebugLevel() > 0) {                                                 \
      const char *errStr;                                                      \
      cuGetErrorString(err, &errStr);                                          \
      DP("CUDA error is: %s\n", errStr);                                       \
    }                                                                          \
  } while (false)
#else // OMPTARGET_DEBUG
#define CUDA_ERR_STRING(err) {}
#endif // OMPTARGET_DEBUG

#include "../../common/elf_common.c"

/// Keep entries table per device.
struct FuncOrGblEntryTy {
  /// Table whose begin/end pointers are rebuilt from \p Entries by
  /// getOffloadEntriesTable() below.
  __tgt_target_table Table;
  std::vector<__tgt_offload_entry> Entries;
};

enum ExecutionModeType {
  SPMD,    // constructors, destructors,
           // combined constructs (`teams distribute parallel for [simd]`)
  GENERIC, // everything else
  NONE
};

/// Use a single entity to encode a kernel and a set of flags.
struct KernelTy {
  CUfunction Func;

  // execution mode of kernel
  // 0 - SPMD mode (without master warp)
  // 1 - Generic mode (with master warp)
  int8_t ExecutionMode;

  /// Maximal number of threads per block for this kernel.
  /// 0 means "not yet queried"; lazily filled via cuFuncGetAttribute at
  /// first launch.
  int MaxThreadsPerBlock = 0;

  KernelTy(CUfunction _Func, int8_t _ExecutionMode)
      : Func(_Func), ExecutionMode(_ExecutionMode) {}
};

/// Device environment data
/// Manually sync with the deviceRTL side for now, move to a dedicated header
/// file later.
struct omptarget_device_environmentTy {
  int32_t debug_level;
};

namespace {
/// Return true iff \p Err is CUDA_SUCCESS; otherwise print \p ErrMsg and the
/// CUDA error string (debug builds) and return false.
bool checkResult(CUresult Err, const char *ErrMsg) {
  if (Err == CUDA_SUCCESS)
    return true;

  DP("%s", ErrMsg);
  CUDA_ERR_STRING(Err);
  return false;
}

/// Asynchronous device-to-device copy on \p Stream.
/// Returns OFFLOAD_SUCCESS/OFFLOAD_FAIL; the copy is only enqueued, callers
/// must synchronize the stream before relying on the data.
int memcpyDtoD(const void *SrcPtr, void *DstPtr, int64_t Size,
               CUstream Stream) {
  CUresult Err =
      cuMemcpyDtoDAsync((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, Size, Stream);

  if (Err != CUDA_SUCCESS) {
    DP("Error when copying data from device to device. Pointers: src "
       "= " DPxMOD ", dst = " DPxMOD ", size = %" PRId64 "\n",
       DPxPTR(SrcPtr), DPxPTR(DstPtr), Size);
    CUDA_ERR_STRING(Err);
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

// Structure contains per-device data
struct DeviceDataTy {
  /// List that contains all the kernels.
  std::list<KernelTy> KernelsList;

  std::list<FuncOrGblEntryTy> FuncGblEntries;

  CUcontext Context = nullptr;
  // Device properties
  int ThreadsPerBlock = 0;
  int BlocksPerGrid = 0;
  int WarpSize = 0;
  // OpenMP properties
  int NumTeams = 0;
  int NumThreads = 0;
};

/// Pool of CUDA streams, one pool per device, grown on demand.
class StreamManagerTy {
  int NumberOfDevices;
  // The initial size of stream pool
  int EnvNumInitialStreams;
  // Per-device stream mutex
  std::vector<std::unique_ptr<std::mutex>> StreamMtx;
  // Per-device stream Id indicates the next available stream in the pool
  std::vector<int> NextStreamId;
  // Per-device stream pool
  std::vector<std::vector<CUstream>> StreamPool;
  // Reference to per-device data
  std::vector<DeviceDataTy> &DeviceData;

  // If there is no CUstream left in the pool, we will resize the pool to
  // allocate more CUstream. This function should be called with device mutex,
  // and we do not resize to smaller one.
  void resizeStreamPool(const int DeviceId, const size_t NewSize) {
    std::vector<CUstream> &Pool = StreamPool[DeviceId];
    const size_t CurrentSize = Pool.size();
    assert(NewSize > CurrentSize && "new size is not larger than current size");

    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) {
      // We will return if cannot switch to the right context in case of
      // creating bunch of streams that are not corresponding to the right
      // device. The offloading will fail later because selected CUstream is
      // nullptr.
      return;
    }

    Pool.resize(NewSize, nullptr);

    // New slots stay nullptr if cuStreamCreate fails; callers detect this in
    // initializeDeviceStreamPool().
    for (size_t I = CurrentSize; I < NewSize; ++I) {
      checkResult(cuStreamCreate(&Pool[I], CU_STREAM_NON_BLOCKING),
                  "Error returned from cuStreamCreate\n");
    }
  }

public:
  StreamManagerTy(const int NumberOfDevices,
                  std::vector<DeviceDataTy> &DeviceData)
      : NumberOfDevices(NumberOfDevices), EnvNumInitialStreams(32),
        DeviceData(DeviceData) {
    StreamPool.resize(NumberOfDevices);
    NextStreamId.resize(NumberOfDevices);
    StreamMtx.resize(NumberOfDevices);

    // NOTE(review): std::stoi throws on non-numeric input and the value is not
    // range-checked (0 or negative would break getStream's doubling) — TODO
    // confirm whether env validation is handled elsewhere.
    if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS"))
      EnvNumInitialStreams = std::stoi(EnvStr);

    // Initialize the next stream id
    std::fill(NextStreamId.begin(), NextStreamId.end(), 0);

    // Initialize stream mutex
    for (std::unique_ptr<std::mutex> &Ptr : StreamMtx)
      Ptr = std::make_unique<std::mutex>();
  }

  ~StreamManagerTy() {
    // Destroy streams
    for (int I = 0; I < NumberOfDevices; ++I) {
      checkResult(cuCtxSetCurrent(DeviceData[I].Context),
                  "Error returned from cuCtxSetCurrent\n");

      for (CUstream &S : StreamPool[I]) {
        if (S)
          checkResult(cuStreamDestroy(S),
                      "Error returned from cuStreamDestroy\n");
      }
    }
  }

  // Get a CUstream from pool. Per-device next stream id always points to the
  // next available CUstream. That means, CUstreams [0, id-1] have been
  // assigned, and [id,] are still available. If there is no CUstream left, we
  // will ask more CUstreams from CUDA RT. Each time a CUstream is assigned,
  // the id will increase one.
  // xxxxxs+++++++++
  //      ^
  //      id
  // After assignment, the pool becomes the following and s is assigned.
  // xxxxxs+++++++++
  //       ^
  //       id
  CUstream getStream(const int DeviceId) {
    const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
    int &Id = NextStreamId[DeviceId];
    // No CUstream left in the pool, we need to request from CUDA RT
    // NOTE(review): Id (int) compared against size_t; assumes pool size fits
    // in int, and that Id > 0 here so Id * 2 actually grows the pool.
    if (Id == StreamPool[DeviceId].size()) {
      // By default we double the stream pool every time
      resizeStreamPool(DeviceId, Id * 2);
    }
    return StreamPool[DeviceId][Id++];
  }

  // Return a CUstream back to pool. As mentioned above, per-device next
  // stream is always points to the next available CUstream, so when we return
  // a CUstream, we need to first decrease the id, and then copy the CUstream
  // back.
  // It is worth noting that, the order of streams return might be different
  // from that they're assigned, that saying, at some point, there might be
  // two identical CUstreams.
  // xxax+a+++++
  //    ^
  //    id
  // However, it doesn't matter, because they're always on the two sides of
  // id. The left one will in the end be overwritten by another CUstream.
  // Therefore, after several execution, the order of pool might be different
  // from its initial state.
  void returnStream(const int DeviceId, CUstream Stream) {
    const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
    int &Id = NextStreamId[DeviceId];
    assert(Id > 0 && "Wrong stream ID");
    StreamPool[DeviceId][--Id] = Stream;
  }

  /// Create the initial pool of streams for \p DeviceId. Returns false if the
  /// pool could not be grown to EnvNumInitialStreams or any stream creation
  /// failed (slot left nullptr by resizeStreamPool).
  bool initializeDeviceStreamPool(const int DeviceId) {
    assert(StreamPool[DeviceId].empty() && "stream pool has been initialized");

    resizeStreamPool(DeviceId, EnvNumInitialStreams);

    // Check the size of stream pool
    if (StreamPool[DeviceId].size() != EnvNumInitialStreams)
      return false;

    // Check whether each stream is valid
    for (CUstream &S : StreamPool[DeviceId])
      if (!S)
        return false;

    return true;
  }
};

/// Top-level plugin state: owns per-device data, loaded modules and the
/// stream manager. A single global instance (DeviceRTL) backs the exported
/// __tgt_rtl_* entry points.
class DeviceRTLTy {
  int NumberOfDevices;
  // OpenMP environment properties
  int EnvNumTeams;
  int EnvTeamLimit;
  // OpenMP requires flags
  int64_t RequiresFlags;

  static constexpr const int HardTeamLimit = 1U << 16U; // 64k
  static constexpr const int HardThreadLimit = 1024;
  static constexpr const int DefaultNumTeams = 128;
  static constexpr const int DefaultNumThreads = 128;

  std::unique_ptr<StreamManagerTy> StreamManager;
  std::vector<DeviceDataTy> DeviceData;
  std::vector<CUmodule> Modules;

  // Record entry point associated with device
  void addOffloadEntry(const int DeviceId, const __tgt_offload_entry entry) {
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
    E.Entries.push_back(entry);
  }

  // Return a pointer to the entry associated with the pointer
  const __tgt_offload_entry *getOffloadEntry(const int DeviceId,
                                             const void *Addr) const {
    for (const __tgt_offload_entry &Itr :
         DeviceData[DeviceId].FuncGblEntries.back().Entries)
      if (Itr.addr == Addr)
        return &Itr;

    return nullptr;
  }

  // Return the pointer to the target entries table
  __tgt_target_table *getOffloadEntriesTable(const int DeviceId) {
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();

    if (E.Entries.empty())
      return nullptr;

    // Update table info according to the entries and return the pointer
    E.Table.EntriesBegin = E.Entries.data();
    E.Table.EntriesEnd = E.Entries.data() + E.Entries.size();

    return &E.Table;
  }

  // Clear entries table for a device
  void clearOffloadEntriesTable(const int DeviceId) {
    DeviceData[DeviceId].FuncGblEntries.emplace_back();
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
    E.Entries.clear();
    E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr;
  }

  /// Return the stream recorded in \p AsyncInfoPtr, lazily borrowing one from
  /// the stream manager on first use. The stream is given back to the pool in
  /// synchronize().
  CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfoPtr) const {
    assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");

    if (!AsyncInfoPtr->Queue)
      AsyncInfoPtr->Queue = StreamManager->getStream(DeviceId);

    return reinterpret_cast<CUstream>(AsyncInfoPtr->Queue);
  }

public:
  // This class should not be copied
  DeviceRTLTy(const DeviceRTLTy &) = delete;
  DeviceRTLTy(DeviceRTLTy &&) = delete;

  DeviceRTLTy()
      : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1),
        RequiresFlags(OMP_REQ_UNDEFINED) {

    DP("Start initializing CUDA\n");

    CUresult Err = cuInit(0);
    if (!checkResult(Err, "Error returned from cuInit\n")) {
      return;
    }

    Err = cuDeviceGetCount(&NumberOfDevices);
    if (!checkResult(Err, "Error returned from cuDeviceGetCount\n"))
      return;

    if (NumberOfDevices == 0) {
      DP("There are no devices supporting CUDA.\n");
      return;
    }

    DeviceData.resize(NumberOfDevices);

    // Get environment variables regarding teams
    if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) {
      // OMP_TEAM_LIMIT has been set
      EnvTeamLimit = std::stoi(EnvStr);
      DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
    }
    if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) {
      // OMP_NUM_TEAMS has been set
      EnvNumTeams
= std::stoi(EnvStr); 358 DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams); 359 } 360 361 StreamManager = 362 std::make_unique<StreamManagerTy>(NumberOfDevices, DeviceData); 363 } 364 365 ~DeviceRTLTy() { 366 // First destruct stream manager in case of Contexts is destructed before it 367 StreamManager = nullptr; 368 369 for (CUmodule &M : Modules) 370 // Close module 371 if (M) 372 checkResult(cuModuleUnload(M), "Error returned from cuModuleUnload\n"); 373 374 for (DeviceDataTy &D : DeviceData) { 375 // Destroy context 376 if (D.Context) { 377 checkResult(cuCtxSetCurrent(D.Context), 378 "Error returned from cuCtxSetCurrent\n"); 379 CUdevice Device; 380 checkResult(cuCtxGetDevice(&Device), 381 "Error returned from cuCtxGetDevice\n"); 382 checkResult(cuDevicePrimaryCtxRelease(Device), 383 "Error returned from cuDevicePrimaryCtxRelease\n"); 384 } 385 } 386 } 387 388 // Check whether a given DeviceId is valid 389 bool isValidDeviceId(const int DeviceId) const { 390 return DeviceId >= 0 && DeviceId < NumberOfDevices; 391 } 392 393 int getNumOfDevices() const { return NumberOfDevices; } 394 395 void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; } 396 397 int initDevice(const int DeviceId) { 398 CUdevice Device; 399 400 DP("Getting device %d\n", DeviceId); 401 CUresult Err = cuDeviceGet(&Device, DeviceId); 402 if (!checkResult(Err, "Error returned from cuDeviceGet\n")) 403 return OFFLOAD_FAIL; 404 405 // Query the current flags of the primary context and set its flags if 406 // it is inactive 407 unsigned int FormerPrimaryCtxFlags = 0; 408 int FormerPrimaryCtxIsActive = 0; 409 Err = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags, 410 &FormerPrimaryCtxIsActive); 411 if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxGetState\n")) 412 return OFFLOAD_FAIL; 413 414 if (FormerPrimaryCtxIsActive) { 415 DP("The primary context is active, no change to its flags\n"); 416 if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) != 417 
CU_CTX_SCHED_BLOCKING_SYNC) 418 DP("Warning the current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n"); 419 } else { 420 DP("The primary context is inactive, set its flags to " 421 "CU_CTX_SCHED_BLOCKING_SYNC\n"); 422 Err = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC); 423 if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxSetFlags\n")) 424 return OFFLOAD_FAIL; 425 } 426 427 // Retain the per device primary context and save it to use whenever this 428 // device is selected. 429 Err = cuDevicePrimaryCtxRetain(&DeviceData[DeviceId].Context, Device); 430 if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxRetain\n")) 431 return OFFLOAD_FAIL; 432 433 Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); 434 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) 435 return OFFLOAD_FAIL; 436 437 // Initialize stream pool 438 if (!StreamManager->initializeDeviceStreamPool(DeviceId)) 439 return OFFLOAD_FAIL; 440 441 // Query attributes to determine number of threads/block and blocks/grid. 442 int MaxGridDimX; 443 Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, 444 Device); 445 if (Err != CUDA_SUCCESS) { 446 DP("Error getting max grid dimension, use default value %d\n", 447 DeviceRTLTy::DefaultNumTeams); 448 DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::DefaultNumTeams; 449 } else if (MaxGridDimX <= DeviceRTLTy::HardTeamLimit) { 450 DP("Using %d CUDA blocks per grid\n", MaxGridDimX); 451 DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX; 452 } else { 453 DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping " 454 "at the hard limit\n", 455 MaxGridDimX, DeviceRTLTy::HardTeamLimit); 456 DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::HardTeamLimit; 457 } 458 459 // We are only exploiting threads along the x axis. 
460 int MaxBlockDimX; 461 Err = cuDeviceGetAttribute(&MaxBlockDimX, 462 CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device); 463 if (Err != CUDA_SUCCESS) { 464 DP("Error getting max block dimension, use default value %d\n", 465 DeviceRTLTy::DefaultNumThreads); 466 DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads; 467 } else if (MaxBlockDimX <= DeviceRTLTy::HardThreadLimit) { 468 DP("Using %d CUDA threads per block\n", MaxBlockDimX); 469 DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX; 470 } else { 471 DP("Max CUDA threads per block %d exceeds the hard thread limit %d, " 472 "capping at the hard limit\n", 473 MaxBlockDimX, DeviceRTLTy::HardThreadLimit); 474 DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit; 475 } 476 477 // Get and set warp size 478 int WarpSize; 479 Err = 480 cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device); 481 if (Err != CUDA_SUCCESS) { 482 DP("Error getting warp size, assume default value 32\n"); 483 DeviceData[DeviceId].WarpSize = 32; 484 } else { 485 DP("Using warp size %d\n", WarpSize); 486 DeviceData[DeviceId].WarpSize = WarpSize; 487 } 488 489 // Adjust teams to the env variables 490 if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) { 491 DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n", 492 EnvTeamLimit); 493 DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit; 494 } 495 496 INFO(DeviceId, 497 "Device supports up to %d CUDA blocks and %d threads with a " 498 "warp size of %d\n", 499 DeviceData[DeviceId].BlocksPerGrid, 500 DeviceData[DeviceId].ThreadsPerBlock, DeviceData[DeviceId].WarpSize); 501 502 // Set default number of teams 503 if (EnvNumTeams > 0) { 504 DP("Default number of teams set according to environment %d\n", 505 EnvNumTeams); 506 DeviceData[DeviceId].NumTeams = EnvNumTeams; 507 } else { 508 DeviceData[DeviceId].NumTeams = DeviceRTLTy::DefaultNumTeams; 509 DP("Default number of teams set according to library's default %d\n", 510 
DeviceRTLTy::DefaultNumTeams); 511 } 512 513 if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) { 514 DP("Default number of teams exceeds device limit, capping at %d\n", 515 DeviceData[DeviceId].BlocksPerGrid); 516 DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid; 517 } 518 519 // Set default number of threads 520 DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads; 521 DP("Default number of threads set according to library's default %d\n", 522 DeviceRTLTy::DefaultNumThreads); 523 if (DeviceData[DeviceId].NumThreads > 524 DeviceData[DeviceId].ThreadsPerBlock) { 525 DP("Default number of threads exceeds device limit, capping at %d\n", 526 DeviceData[DeviceId].ThreadsPerBlock); 527 DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].ThreadsPerBlock; 528 } 529 530 return OFFLOAD_SUCCESS; 531 } 532 533 __tgt_target_table *loadBinary(const int DeviceId, 534 const __tgt_device_image *Image) { 535 // Set the context we are using 536 CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); 537 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) 538 return nullptr; 539 540 // Clear the offload table as we are going to create a new one. 541 clearOffloadEntriesTable(DeviceId); 542 543 // Create the module and extract the function pointers. 544 CUmodule Module; 545 DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart)); 546 Err = cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr); 547 if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n")) 548 return nullptr; 549 550 DP("CUDA module successfully loaded!\n"); 551 552 Modules.push_back(Module); 553 554 // Find the symbols in the module by name. 
555 const __tgt_offload_entry *HostBegin = Image->EntriesBegin; 556 const __tgt_offload_entry *HostEnd = Image->EntriesEnd; 557 558 std::list<KernelTy> &KernelsList = DeviceData[DeviceId].KernelsList; 559 for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) { 560 if (!E->addr) { 561 // We return nullptr when something like this happens, the host should 562 // have always something in the address to uniquely identify the target 563 // region. 564 DP("Invalid binary: host entry '<null>' (size = %zd)...\n", E->size); 565 return nullptr; 566 } 567 568 if (E->size) { 569 __tgt_offload_entry Entry = *E; 570 CUdeviceptr CUPtr; 571 size_t CUSize; 572 Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name); 573 // We keep this style here because we need the name 574 if (Err != CUDA_SUCCESS) { 575 DP("Loading global '%s' (Failed)\n", E->name); 576 CUDA_ERR_STRING(Err); 577 return nullptr; 578 } 579 580 if (CUSize != E->size) { 581 DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name, 582 CUSize, E->size); 583 return nullptr; 584 } 585 586 DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n", 587 DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr)); 588 589 Entry.addr = (void *)(CUPtr); 590 591 // Note: In the current implementation declare target variables 592 // can either be link or to. This means that once unified 593 // memory is activated via the requires directive, the variable 594 // can be used directly from the host in both cases. 595 // TODO: when variables types other than to or link are added, 596 // the below condition should be changed to explicitly 597 // check for to and link variables types: 598 // (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags & 599 // OMP_DECLARE_TARGET_LINK || e->flags == OMP_DECLARE_TARGET_TO)) 600 if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) { 601 // If unified memory is present any target link or to variables 602 // can access host addresses directly. 
There is no longer a 603 // need for device copies. 604 cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *)); 605 DP("Copy linked variable host address (" DPxMOD 606 ") to device address (" DPxMOD ")\n", 607 DPxPTR(*((void **)E->addr)), DPxPTR(CUPtr)); 608 } 609 610 addOffloadEntry(DeviceId, Entry); 611 612 continue; 613 } 614 615 CUfunction Func; 616 Err = cuModuleGetFunction(&Func, Module, E->name); 617 // We keep this style here because we need the name 618 if (Err != CUDA_SUCCESS) { 619 DP("Loading '%s' (Failed)\n", E->name); 620 CUDA_ERR_STRING(Err); 621 return nullptr; 622 } 623 624 DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n", 625 DPxPTR(E - HostBegin), E->name, DPxPTR(Func)); 626 627 // default value GENERIC (in case symbol is missing from cubin file) 628 int8_t ExecModeVal = ExecutionModeType::GENERIC; 629 std::string ExecModeNameStr(E->name); 630 ExecModeNameStr += "_exec_mode"; 631 const char *ExecModeName = ExecModeNameStr.c_str(); 632 633 CUdeviceptr ExecModePtr; 634 size_t CUSize; 635 Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName); 636 if (Err == CUDA_SUCCESS) { 637 if (CUSize != sizeof(int8_t)) { 638 DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n", 639 ExecModeName, CUSize, sizeof(int8_t)); 640 return nullptr; 641 } 642 643 Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize); 644 if (Err != CUDA_SUCCESS) { 645 DP("Error when copying data from device to host. 
Pointers: " 646 "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n", 647 DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize); 648 CUDA_ERR_STRING(Err); 649 return nullptr; 650 } 651 652 if (ExecModeVal < 0 || ExecModeVal > 1) { 653 DP("Error wrong exec_mode value specified in cubin file: %d\n", 654 ExecModeVal); 655 return nullptr; 656 } 657 } else { 658 DP("Loading global exec_mode '%s' - symbol missing, using default " 659 "value GENERIC (1)\n", 660 ExecModeName); 661 CUDA_ERR_STRING(Err); 662 } 663 664 KernelsList.emplace_back(Func, ExecModeVal); 665 666 __tgt_offload_entry Entry = *E; 667 Entry.addr = &KernelsList.back(); 668 addOffloadEntry(DeviceId, Entry); 669 } 670 671 // send device environment data to the device 672 { 673 omptarget_device_environmentTy DeviceEnv{0}; 674 675 #ifdef OMPTARGET_DEBUG 676 if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) 677 DeviceEnv.debug_level = std::stoi(EnvStr); 678 #endif 679 680 const char *DeviceEnvName = "omptarget_device_environment"; 681 CUdeviceptr DeviceEnvPtr; 682 size_t CUSize; 683 684 Err = cuModuleGetGlobal(&DeviceEnvPtr, &CUSize, Module, DeviceEnvName); 685 if (Err == CUDA_SUCCESS) { 686 if (CUSize != sizeof(DeviceEnv)) { 687 DP("Global device_environment '%s' - size mismatch (%zu != %zu)\n", 688 DeviceEnvName, CUSize, sizeof(int32_t)); 689 CUDA_ERR_STRING(Err); 690 return nullptr; 691 } 692 693 Err = cuMemcpyHtoD(DeviceEnvPtr, &DeviceEnv, CUSize); 694 if (Err != CUDA_SUCCESS) { 695 DP("Error when copying data from host to device. 
Pointers: " 696 "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n", 697 DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize); 698 CUDA_ERR_STRING(Err); 699 return nullptr; 700 } 701 702 DP("Sending global device environment data %zu bytes\n", CUSize); 703 } else { 704 DP("Finding global device environment '%s' - symbol missing.\n", 705 DeviceEnvName); 706 DP("Continue, considering this is a device RTL which does not accept " 707 "environment setting.\n"); 708 } 709 } 710 711 return getOffloadEntriesTable(DeviceId); 712 } 713 714 void *dataAlloc(const int DeviceId, const int64_t Size) const { 715 if (Size == 0) 716 return nullptr; 717 718 CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); 719 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) 720 return nullptr; 721 722 CUdeviceptr DevicePtr; 723 Err = cuMemAlloc(&DevicePtr, Size); 724 if (!checkResult(Err, "Error returned from cuMemAlloc\n")) 725 return nullptr; 726 727 return (void *)DevicePtr; 728 } 729 730 int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr, 731 const int64_t Size, __tgt_async_info *AsyncInfoPtr) const { 732 assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr"); 733 734 CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); 735 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) 736 return OFFLOAD_FAIL; 737 738 CUstream Stream = getStream(DeviceId, AsyncInfoPtr); 739 740 Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream); 741 if (Err != CUDA_SUCCESS) { 742 DP("Error when copying data from host to device. 
Pointers: host = " DPxMOD 743 ", device = " DPxMOD ", size = %" PRId64 "\n", 744 DPxPTR(HstPtr), DPxPTR(TgtPtr), Size); 745 CUDA_ERR_STRING(Err); 746 return OFFLOAD_FAIL; 747 } 748 749 return OFFLOAD_SUCCESS; 750 } 751 752 int dataRetrieve(const int DeviceId, void *HstPtr, const void *TgtPtr, 753 const int64_t Size, __tgt_async_info *AsyncInfoPtr) const { 754 assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr"); 755 756 CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); 757 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) 758 return OFFLOAD_FAIL; 759 760 CUstream Stream = getStream(DeviceId, AsyncInfoPtr); 761 762 Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream); 763 if (Err != CUDA_SUCCESS) { 764 DP("Error when copying data from device to host. Pointers: host = " DPxMOD 765 ", device = " DPxMOD ", size = %" PRId64 "\n", 766 DPxPTR(HstPtr), DPxPTR(TgtPtr), Size); 767 CUDA_ERR_STRING(Err); 768 return OFFLOAD_FAIL; 769 } 770 771 return OFFLOAD_SUCCESS; 772 } 773 774 int dataExchange(int SrcDevId, const void *SrcPtr, int DstDevId, void *DstPtr, 775 int64_t Size, __tgt_async_info *AsyncInfoPtr) const { 776 assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr"); 777 778 CUresult Err = cuCtxSetCurrent(DeviceData[SrcDevId].Context); 779 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) 780 return OFFLOAD_FAIL; 781 782 CUstream Stream = getStream(SrcDevId, AsyncInfoPtr); 783 784 // If they are two devices, we try peer to peer copy first 785 if (SrcDevId != DstDevId) { 786 int CanAccessPeer = 0; 787 Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId); 788 if (Err != CUDA_SUCCESS) { 789 DP("Error returned from cuDeviceCanAccessPeer. 
src = %" PRId32 790 ", dst = %" PRId32 "\n", 791 SrcDevId, DstDevId); 792 CUDA_ERR_STRING(Err); 793 return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); 794 } 795 796 if (!CanAccessPeer) { 797 DP("P2P memcpy not supported so fall back to D2D memcpy"); 798 return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); 799 } 800 801 Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0); 802 if (Err != CUDA_SUCCESS) { 803 DP("Error returned from cuCtxEnablePeerAccess. src = %" PRId32 804 ", dst = %" PRId32 "\n", 805 SrcDevId, DstDevId); 806 CUDA_ERR_STRING(Err); 807 return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); 808 } 809 810 Err = cuMemcpyPeerAsync((CUdeviceptr)DstPtr, DeviceData[DstDevId].Context, 811 (CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context, 812 Size, Stream); 813 if (Err == CUDA_SUCCESS) 814 return OFFLOAD_SUCCESS; 815 816 DP("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD 817 ", src_id =%" PRId32 ", dst_ptr = " DPxMOD ", dst_id =%" PRId32 "\n", 818 DPxPTR(SrcPtr), SrcDevId, DPxPTR(DstPtr), DstDevId); 819 CUDA_ERR_STRING(Err); 820 } 821 822 return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); 823 } 824 825 int dataDelete(const int DeviceId, void *TgtPtr) const { 826 CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); 827 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) 828 return OFFLOAD_FAIL; 829 830 Err = cuMemFree((CUdeviceptr)TgtPtr); 831 if (!checkResult(Err, "Error returned from cuMemFree\n")) 832 return OFFLOAD_FAIL; 833 834 return OFFLOAD_SUCCESS; 835 } 836 837 int runTargetTeamRegion(const int DeviceId, void *TgtEntryPtr, void **TgtArgs, 838 ptrdiff_t *TgtOffsets, const int ArgNum, 839 const int TeamNum, const int ThreadLimit, 840 const unsigned int LoopTripCount, 841 __tgt_async_info *AsyncInfo) const { 842 CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); 843 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) 844 return OFFLOAD_FAIL; 845 846 // All args are references. 
    std::vector<void *> Args(ArgNum);
    std::vector<void *> Ptrs(ArgNum);

    // Apply the per-argument offsets; cuLaunchKernel receives an array of
    // pointers to the argument values, hence the extra indirection.
    for (int I = 0; I < ArgNum; ++I) {
      Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
      Args[I] = &Ptrs[I];
    }

    KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr);

    int CudaThreadsPerBlock;
    if (ThreadLimit > 0) {
      DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
      CudaThreadsPerBlock = ThreadLimit;
      // Add master warp if necessary
      if (KernelInfo->ExecutionMode == GENERIC) {
        DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize);
        CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
      }
    } else {
      DP("Setting CUDA threads per block to default %d\n",
         DeviceData[DeviceId].NumThreads);
      CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads;
    }

    // Device-wide cap first, then the (lazily queried) per-kernel cap below.
    if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) {
      DP("Threads per block capped at device limit %d\n",
         DeviceData[DeviceId].ThreadsPerBlock);
      CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock;
    }

    if (!KernelInfo->MaxThreadsPerBlock) {
      Err = cuFuncGetAttribute(&KernelInfo->MaxThreadsPerBlock,
                               CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                               KernelInfo->Func);
      if (!checkResult(Err, "Error returned from cuFuncGetAttribute\n"))
        return OFFLOAD_FAIL;
    }

    if (KernelInfo->MaxThreadsPerBlock < CudaThreadsPerBlock) {
      DP("Threads per block capped at kernel limit %d\n",
         KernelInfo->MaxThreadsPerBlock);
      CudaThreadsPerBlock = KernelInfo->MaxThreadsPerBlock;
    }

    unsigned int CudaBlocksPerGrid;
    if (TeamNum <= 0) {
      if (LoopTripCount > 0 && EnvNumTeams < 0) {
        if (KernelInfo->ExecutionMode == SPMD) {
          // We have a combined construct, i.e. `target teams distribute
          // parallel for [simd]`. We launch so many teams so that each thread
          // will execute one iteration of the loop. round up to the nearest
          // integer
          CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1;
        } else {
          // If we reach this point, then we have a non-combined construct, i.e.
          // `teams distribute` with a nested `parallel for` and each team is
          // assigned one iteration of the `distribute` loop. E.g.:
          //
          // #pragma omp target teams distribute
          // for(...loop_tripcount...) {
          //   #pragma omp parallel for
          //   for(...) {}
          // }
          //
          // Threads within a team will execute the iterations of the `parallel`
          // loop.
          // NOTE(review): this value is not capped at BlocksPerGrid, unlike
          // the explicit TeamNum path below — confirm whether very large trip
          // counts can exceed the device grid limit here.
          CudaBlocksPerGrid = LoopTripCount;
        }
        DP("Using %d teams due to loop trip count %" PRIu32
           " and number of threads per block %d\n",
           CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock);
      } else {
        DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams);
        CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams;
      }
    } else if (TeamNum > DeviceData[DeviceId].BlocksPerGrid) {
      DP("Capping number of teams to team limit %d\n",
         DeviceData[DeviceId].BlocksPerGrid);
      CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid;
    } else {
      DP("Using requested number of teams %d\n", TeamNum);
      CudaBlocksPerGrid = TeamNum;
    }

    INFO(DeviceId,
         "Launching kernel %s with %d blocks and %d threads in %s "
         "mode\n",
         (getOffloadEntry(DeviceId, TgtEntryPtr))
             ? getOffloadEntry(DeviceId, TgtEntryPtr)->name
             : "(null)",
         CudaBlocksPerGrid, CudaThreadsPerBlock,
         (KernelInfo->ExecutionMode == SPMD) ? "SPMD" : "Generic");

    CUstream Stream = getStream(DeviceId, AsyncInfo);
    Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1,
                         /* gridDimZ */ 1, CudaThreadsPerBlock,
                         /* blockDimY */ 1, /* blockDimZ */ 1,
                         /* sharedMemBytes */ 0, Stream, &Args[0], nullptr);
    if (!checkResult(Err, "Error returned from cuLaunchKernel\n"))
      return OFFLOAD_FAIL;

    DP("Launch of entry point at " DPxMOD " successful!\n",
       DPxPTR(TgtEntryPtr));

    return OFFLOAD_SUCCESS;
  }

  /// Block until all work enqueued on the AsyncInfo stream has finished, then
  /// give the stream back to the pool.
  int synchronize(const int DeviceId, __tgt_async_info *AsyncInfoPtr) const {
    CUstream Stream = reinterpret_cast<CUstream>(AsyncInfoPtr->Queue);
    CUresult Err = cuStreamSynchronize(Stream);
    if (Err != CUDA_SUCCESS) {
      DP("Error when synchronizing stream. stream = " DPxMOD
         ", async info ptr = " DPxMOD "\n",
         DPxPTR(Stream), DPxPTR(AsyncInfoPtr));
      CUDA_ERR_STRING(Err);
      return OFFLOAD_FAIL;
    }

    // Once the stream is synchronized, return it to stream pool and reset
    // async_info. This is to make sure the synchronization only works for its
    // own tasks.
969 StreamManager->returnStream( 970 DeviceId, reinterpret_cast<CUstream>(AsyncInfoPtr->Queue)); 971 AsyncInfoPtr->Queue = nullptr; 972 973 return OFFLOAD_SUCCESS; 974 } 975 }; 976 977 DeviceRTLTy DeviceRTL; 978 } // namespace 979 980 // Exposed library API function 981 #ifdef __cplusplus 982 extern "C" { 983 #endif 984 985 int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) { 986 return elf_check_machine(image, /* EM_CUDA */ 190); 987 } 988 989 int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); } 990 991 int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) { 992 DP("Init requires flags to %" PRId64 "\n", RequiresFlags); 993 DeviceRTL.setRequiresFlag(RequiresFlags); 994 return RequiresFlags; 995 } 996 997 int32_t __tgt_rtl_is_data_exchangable(int32_t src_dev_id, int dst_dev_id) { 998 if (DeviceRTL.isValidDeviceId(src_dev_id) && 999 DeviceRTL.isValidDeviceId(dst_dev_id)) 1000 return 1; 1001 1002 return 0; 1003 } 1004 1005 int32_t __tgt_rtl_init_device(int32_t device_id) { 1006 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1007 1008 return DeviceRTL.initDevice(device_id); 1009 } 1010 1011 __tgt_target_table *__tgt_rtl_load_binary(int32_t device_id, 1012 __tgt_device_image *image) { 1013 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1014 1015 return DeviceRTL.loadBinary(device_id, image); 1016 } 1017 1018 void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *) { 1019 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1020 1021 return DeviceRTL.dataAlloc(device_id, size); 1022 } 1023 1024 int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr, 1025 int64_t size) { 1026 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1027 1028 __tgt_async_info async_info; 1029 const int32_t rc = __tgt_rtl_data_submit_async(device_id, tgt_ptr, hst_ptr, 1030 size, &async_info); 1031 if (rc != OFFLOAD_SUCCESS) 1032 return 
OFFLOAD_FAIL; 1033 1034 return __tgt_rtl_synchronize(device_id, &async_info); 1035 } 1036 1037 int32_t __tgt_rtl_data_submit_async(int32_t device_id, void *tgt_ptr, 1038 void *hst_ptr, int64_t size, 1039 __tgt_async_info *async_info_ptr) { 1040 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1041 assert(async_info_ptr && "async_info_ptr is nullptr"); 1042 1043 return DeviceRTL.dataSubmit(device_id, tgt_ptr, hst_ptr, size, 1044 async_info_ptr); 1045 } 1046 1047 int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr, 1048 int64_t size) { 1049 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1050 1051 __tgt_async_info async_info; 1052 const int32_t rc = __tgt_rtl_data_retrieve_async(device_id, hst_ptr, tgt_ptr, 1053 size, &async_info); 1054 if (rc != OFFLOAD_SUCCESS) 1055 return OFFLOAD_FAIL; 1056 1057 return __tgt_rtl_synchronize(device_id, &async_info); 1058 } 1059 1060 int32_t __tgt_rtl_data_retrieve_async(int32_t device_id, void *hst_ptr, 1061 void *tgt_ptr, int64_t size, 1062 __tgt_async_info *async_info_ptr) { 1063 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1064 assert(async_info_ptr && "async_info_ptr is nullptr"); 1065 1066 return DeviceRTL.dataRetrieve(device_id, hst_ptr, tgt_ptr, size, 1067 async_info_ptr); 1068 } 1069 1070 int32_t __tgt_rtl_data_exchange_async(int32_t src_dev_id, void *src_ptr, 1071 int dst_dev_id, void *dst_ptr, 1072 int64_t size, 1073 __tgt_async_info *async_info_ptr) { 1074 assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid"); 1075 assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid"); 1076 assert(async_info_ptr && "async_info_ptr is nullptr"); 1077 1078 return DeviceRTL.dataExchange(src_dev_id, src_ptr, dst_dev_id, dst_ptr, size, 1079 async_info_ptr); 1080 } 1081 1082 int32_t __tgt_rtl_data_exchange(int32_t src_dev_id, void *src_ptr, 1083 int32_t dst_dev_id, void *dst_ptr, 1084 int64_t size) { 1085 
assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid"); 1086 assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid"); 1087 1088 __tgt_async_info async_info; 1089 const int32_t rc = __tgt_rtl_data_exchange_async( 1090 src_dev_id, src_ptr, dst_dev_id, dst_ptr, size, &async_info); 1091 if (rc != OFFLOAD_SUCCESS) 1092 return OFFLOAD_FAIL; 1093 1094 return __tgt_rtl_synchronize(src_dev_id, &async_info); 1095 } 1096 1097 int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) { 1098 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1099 1100 return DeviceRTL.dataDelete(device_id, tgt_ptr); 1101 } 1102 1103 int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr, 1104 void **tgt_args, 1105 ptrdiff_t *tgt_offsets, 1106 int32_t arg_num, int32_t team_num, 1107 int32_t thread_limit, 1108 uint64_t loop_tripcount) { 1109 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1110 1111 __tgt_async_info async_info; 1112 const int32_t rc = __tgt_rtl_run_target_team_region_async( 1113 device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num, 1114 thread_limit, loop_tripcount, &async_info); 1115 if (rc != OFFLOAD_SUCCESS) 1116 return OFFLOAD_FAIL; 1117 1118 return __tgt_rtl_synchronize(device_id, &async_info); 1119 } 1120 1121 int32_t __tgt_rtl_run_target_team_region_async( 1122 int32_t device_id, void *tgt_entry_ptr, void **tgt_args, 1123 ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num, 1124 int32_t thread_limit, uint64_t loop_tripcount, 1125 __tgt_async_info *async_info_ptr) { 1126 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1127 1128 return DeviceRTL.runTargetTeamRegion( 1129 device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num, 1130 thread_limit, loop_tripcount, async_info_ptr); 1131 } 1132 1133 int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr, 1134 void **tgt_args, ptrdiff_t 
*tgt_offsets, 1135 int32_t arg_num) { 1136 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1137 1138 __tgt_async_info async_info; 1139 const int32_t rc = __tgt_rtl_run_target_region_async( 1140 device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, &async_info); 1141 if (rc != OFFLOAD_SUCCESS) 1142 return OFFLOAD_FAIL; 1143 1144 return __tgt_rtl_synchronize(device_id, &async_info); 1145 } 1146 1147 int32_t __tgt_rtl_run_target_region_async(int32_t device_id, 1148 void *tgt_entry_ptr, void **tgt_args, 1149 ptrdiff_t *tgt_offsets, 1150 int32_t arg_num, 1151 __tgt_async_info *async_info_ptr) { 1152 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1153 1154 return __tgt_rtl_run_target_team_region_async( 1155 device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, 1156 /* team num*/ 1, /* thread_limit */ 1, /* loop_tripcount */ 0, 1157 async_info_ptr); 1158 } 1159 1160 int32_t __tgt_rtl_synchronize(int32_t device_id, 1161 __tgt_async_info *async_info_ptr) { 1162 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1163 assert(async_info_ptr && "async_info_ptr is nullptr"); 1164 assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr"); 1165 1166 return DeviceRTL.synchronize(device_id, async_info_ptr); 1167 } 1168 1169 #ifdef __cplusplus 1170 } 1171 #endif 1172