//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// RTL for CUDA machine
//
//===----------------------------------------------------------------------===//

#include <cassert>
#include <cstddef>
#include <cuda.h>
#include <list>
#include <memory>
#include <mutex>
#include <string>
#include <vector>

#include "Debug.h"
#include "omptargetplugin.h"

#define TARGET_NAME CUDA
#define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"

// Utility for retrieving and printing CUDA error string.
// Only active in debug builds and when the debug level is positive; expands to
// an empty statement otherwise so call sites need no #ifdef guards.
#ifdef OMPTARGET_DEBUG
#define CUDA_ERR_STRING(err)                                                   \
  do {                                                                         \
    if (getDebugLevel() > 0) {                                                 \
      const char *errStr = nullptr;                                            \
      CUresult errStr_status = cuGetErrorString(err, &errStr);                 \
      if (errStr_status == CUDA_ERROR_INVALID_VALUE)                           \
        DP("Unrecognized CUDA error code: %d\n", err);                         \
      else if (errStr_status == CUDA_SUCCESS)                                  \
        DP("CUDA error is: %s\n", errStr);                                     \
      else {                                                                   \
        DP("Unresolved CUDA error code: %d\n", err);                           \
        DP("Unsuccessful cuGetErrorString return status: %d\n",                \
           errStr_status);                                                     \
      }                                                                        \
    }                                                                          \
  } while (false)
#else // OMPTARGET_DEBUG
#define CUDA_ERR_STRING(err)                                                   \
  {}
#endif // OMPTARGET_DEBUG

#include "../../common/elf_common.c"

/// Keep entries table per device.
struct FuncOrGblEntryTy {
  /// Table pointing into \p Entries; begin/end updated lazily by
  /// getOffloadEntriesTable().
  __tgt_target_table Table;
  /// Backing storage for the offload entries of one loaded image.
  std::vector<__tgt_offload_entry> Entries;
};

enum ExecutionModeType {
  SPMD,    // constructors, destructors,
           // combined constructs (`teams distribute parallel for [simd]`)
  GENERIC, // everything else
  NONE
};

/// Use a single entity to encode a kernel and a set of flags.
struct KernelTy {
  /// Driver handle of the device function for this target region.
  CUfunction Func;

  // execution mode of kernel
  // 0 - SPMD mode (without master warp)
  // 1 - Generic mode (with master warp)
  int8_t ExecutionMode;

  /// Maximal number of threads per block for this kernel.
  /// Queried lazily (0 means "not queried yet") in runTargetTeamRegion.
  int MaxThreadsPerBlock = 0;

  KernelTy(CUfunction _Func, int8_t _ExecutionMode)
      : Func(_Func), ExecutionMode(_ExecutionMode) {}
};

/// Device environment data
/// Manually sync with the deviceRTL side for now, move to a dedicated header
/// file later.
struct omptarget_device_environmentTy {
  int32_t debug_level;
};

namespace {
/// Return true if \p Err is CUDA_SUCCESS; otherwise print \p ErrMsg (and, in
/// debug builds, the CUDA error string) and return false.
bool checkResult(CUresult Err, const char *ErrMsg) {
  if (Err == CUDA_SUCCESS)
    return true;

  DP("%s", ErrMsg);
  CUDA_ERR_STRING(Err);
  return false;
}

/// Asynchronous device-to-device copy of \p Size bytes on \p Stream.
/// Returns OFFLOAD_SUCCESS/OFFLOAD_FAIL.
int memcpyDtoD(const void *SrcPtr, void *DstPtr, int64_t Size,
               CUstream Stream) {
  CUresult Err =
      cuMemcpyDtoDAsync((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, Size, Stream);

  if (Err != CUDA_SUCCESS) {
    DP("Error when copying data from device to device. Pointers: src "
       "= " DPxMOD ", dst = " DPxMOD ", size = %" PRId64 "\n",
       DPxPTR(SrcPtr), DPxPTR(DstPtr), Size);
    CUDA_ERR_STRING(Err);
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

// Structure contains per-device data
struct DeviceDataTy {
  /// List that contains all the kernels.
  /// std::list so that addresses of elements stay stable; offload entry
  /// addresses point at these elements (see loadBinary).
  std::list<KernelTy> KernelsList;

  /// One FuncOrGblEntryTy per loaded image; the back() is the current one.
  std::list<FuncOrGblEntryTy> FuncGblEntries;

  /// Retained primary context of the device; nullptr until initDevice.
  CUcontext Context = nullptr;
  // Device properties
  int ThreadsPerBlock = 0;
  int BlocksPerGrid = 0;
  int WarpSize = 0;
  // OpenMP properties
  int NumTeams = 0;
  int NumThreads = 0;
};

/// Owns a growable pool of CUDA streams per device and hands them out /
/// takes them back under a per-device mutex.
class StreamManagerTy {
  int NumberOfDevices;
  // The initial size of stream pool
  int EnvNumInitialStreams;
  // Per-device stream mutex
  std::vector<std::unique_ptr<std::mutex>> StreamMtx;
  // Per-device stream Id indicates the next available stream in the pool
  std::vector<int> NextStreamId;
  // Per-device stream pool
  std::vector<std::vector<CUstream>> StreamPool;
  // Reference to per-device data
  std::vector<DeviceDataTy> &DeviceData;

  // If there is no CUstream left in the pool, we will resize the pool to
  // allocate more CUstream. This function should be called with device mutex,
  // and we do not resize to smaller one.
  void resizeStreamPool(const int DeviceId, const size_t NewSize) {
    std::vector<CUstream> &Pool = StreamPool[DeviceId];
    const size_t CurrentSize = Pool.size();
    assert(NewSize > CurrentSize && "new size is not larger than current size");

    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) {
      // We will return if cannot switch to the right context in case of
      // creating bunch of streams that are not corresponding to the right
      // device. The offloading will fail later because selected CUstream is
      // nullptr.
      return;
    }

    Pool.resize(NewSize, nullptr);

    // Create only the newly added slots; failures leave nullptr entries which
    // initializeDeviceStreamPool detects.
    for (size_t I = CurrentSize; I < NewSize; ++I) {
      checkResult(cuStreamCreate(&Pool[I], CU_STREAM_NON_BLOCKING),
                  "Error returned from cuStreamCreate\n");
    }
  }

public:
  StreamManagerTy(const int NumberOfDevices,
                  std::vector<DeviceDataTy> &DeviceData)
      : NumberOfDevices(NumberOfDevices), EnvNumInitialStreams(32),
        DeviceData(DeviceData) {
    StreamPool.resize(NumberOfDevices);
    NextStreamId.resize(NumberOfDevices);
    StreamMtx.resize(NumberOfDevices);

    // NOTE(review): std::stoi throws on non-numeric input, and a value <= 0
    // would break the pool-doubling logic in getStream — confirm the env var
    // is validated elsewhere or assume well-formed positive input.
    if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS"))
      EnvNumInitialStreams = std::stoi(EnvStr);

    // Initialize the next stream id
    std::fill(NextStreamId.begin(), NextStreamId.end(), 0);

    // Initialize stream mutex
    for (std::unique_ptr<std::mutex> &Ptr : StreamMtx)
      Ptr = std::make_unique<std::mutex>();
  }

  ~StreamManagerTy() {
    // Destroy streams
    for (int I = 0; I < NumberOfDevices; ++I) {
      checkResult(cuCtxSetCurrent(DeviceData[I].Context),
                  "Error returned from cuCtxSetCurrent\n");

      for (CUstream &S : StreamPool[I]) {
        if (S)
          checkResult(cuStreamDestroy(S),
                      "Error returned from cuStreamDestroy\n");
      }
    }
  }

  // Get a CUstream from pool. Per-device next stream id always points to the
  // next available CUstream. That means, CUstreams [0, id-1] have been
  // assigned, and [id,] are still available. If there is no CUstream left, we
  // will ask more CUstreams from CUDA RT. Each time a CUstream is assigned,
  // the id will increase one.
  // xxxxxs+++++++++
  //      ^
  //      id
  // After assignment, the pool becomes the following and s is assigned.
  // xxxxxs+++++++++
  //       ^
  //       id
  CUstream getStream(const int DeviceId) {
    const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
    int &Id = NextStreamId[DeviceId];
    // No CUstream left in the pool, we need to request from CUDA RT
    if (Id == StreamPool[DeviceId].size()) {
      // By default we double the stream pool every time
      resizeStreamPool(DeviceId, Id * 2);
    }
    return StreamPool[DeviceId][Id++];
  }

  // Return a CUstream back to pool. As mentioned above, per-device next
  // stream is always points to the next available CUstream, so when we return
  // a CUstream, we need to first decrease the id, and then copy the CUstream
  // back.
  // It is worth noting that, the order of streams return might be different
  // from that they're assigned, that saying, at some point, there might be
  // two identical CUstreams.
  // xxax+a+++++
  //     ^
  //     id
  // However, it doesn't matter, because they're always on the two sides of
  // id. The left one will in the end be overwritten by another CUstream.
  // Therefore, after several execution, the order of pool might be different
  // from its initial state.
  void returnStream(const int DeviceId, CUstream Stream) {
    const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
    int &Id = NextStreamId[DeviceId];
    assert(Id > 0 && "Wrong stream ID");
    StreamPool[DeviceId][--Id] = Stream;
  }

  /// Create the initial pool of EnvNumInitialStreams streams for \p DeviceId.
  /// Returns false if the pool could not be fully populated with valid
  /// streams.
  bool initializeDeviceStreamPool(const int DeviceId) {
    assert(StreamPool[DeviceId].empty() && "stream pool has been initialized");

    resizeStreamPool(DeviceId, EnvNumInitialStreams);

    // Check the size of stream pool
    // NOTE(review): size_t vs. int comparison; EnvNumInitialStreams is
    // presumably positive here — see the constructor note.
    if (StreamPool[DeviceId].size() != EnvNumInitialStreams)
      return false;

    // Check whether each stream is valid
    for (CUstream &S : StreamPool[DeviceId])
      if (!S)
        return false;

    return true;
  }
};

/// Top-level plugin state: one instance (DeviceRTL below) holds per-device
/// data, loaded modules, and the stream manager.
class DeviceRTLTy {
  int NumberOfDevices;
  // OpenMP environment properties
  int EnvNumTeams;
  int EnvTeamLimit;
  // OpenMP requires flags
  int64_t RequiresFlags;

  static constexpr const int HardTeamLimit = 1U << 16U; // 64k
  static constexpr const int HardThreadLimit = 1024;
  static constexpr const int DefaultNumTeams = 128;
  static constexpr const int DefaultNumThreads = 128;

  std::unique_ptr<StreamManagerTy> StreamManager;
  std::vector<DeviceDataTy> DeviceData;
  std::vector<CUmodule> Modules;

  // Record entry point associated with device
  void addOffloadEntry(const int DeviceId, const __tgt_offload_entry entry) {
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
    E.Entries.push_back(entry);
  }

  // Return a pointer to the entry associated with the pointer
  const __tgt_offload_entry *getOffloadEntry(const int DeviceId,
                                             const void *Addr) const {
    for (const __tgt_offload_entry &Itr :
         DeviceData[DeviceId].FuncGblEntries.back().Entries)
      if (Itr.addr == Addr)
        return &Itr;

    return nullptr;
  }

  // Return the pointer to the target entries table
  __tgt_target_table *getOffloadEntriesTable(const int DeviceId) {
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();

    if (E.Entries.empty())
      return nullptr;

    // Update table info according to the entries and return the pointer
    E.Table.EntriesBegin = E.Entries.data();
    E.Table.EntriesEnd = E.Entries.data() + E.Entries.size();

    return &E.Table;
  }

  // Clear entries table for a device
  void clearOffloadEntriesTable(const int DeviceId) {
    DeviceData[DeviceId].FuncGblEntries.emplace_back();
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
    E.Entries.clear();
    E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr;
  }

  /// Return the stream bound to \p AsyncInfoPtr, lazily taking one from the
  /// stream manager on first use.
  CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfoPtr) const {
    assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");

    if (!AsyncInfoPtr->Queue)
      AsyncInfoPtr->Queue = StreamManager->getStream(DeviceId);

    return reinterpret_cast<CUstream>(AsyncInfoPtr->Queue);
  }

public:
  // This class should not be copied
  DeviceRTLTy(const DeviceRTLTy &) = delete;
  DeviceRTLTy(DeviceRTLTy &&) = delete;

  DeviceRTLTy()
      : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1),
        RequiresFlags(OMP_REQ_UNDEFINED) {

    DP("Start initializing CUDA\n");

    CUresult Err = cuInit(0);
    if (!checkResult(Err, "Error returned from cuInit\n")) {
      return;
    }

    Err = cuDeviceGetCount(&NumberOfDevices);
    if (!checkResult(Err, "Error returned from cuDeviceGetCount\n"))
      return;

    if (NumberOfDevices == 0) {
      DP("There are no devices supporting CUDA.\n");
      return;
    }

    DeviceData.resize(NumberOfDevices);

    // Get environment variables regarding teams
    if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) {
      // OMP_TEAM_LIMIT has been set
      EnvTeamLimit = std::stoi(EnvStr);
      DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
    }
    if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) {
      // OMP_NUM_TEAMS has been set
      EnvNumTeams = std::stoi(EnvStr);
      DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
    }

    StreamManager =
        std::make_unique<StreamManagerTy>(NumberOfDevices, DeviceData);
  }

  ~DeviceRTLTy() {
    // First destruct stream manager in case of Contexts is destructed before it
    StreamManager = nullptr;

    for (CUmodule &M : Modules)
      // Close module
      if (M)
        checkResult(cuModuleUnload(M), "Error returned from cuModuleUnload\n");

    for (DeviceDataTy &D : DeviceData) {
      // Destroy context
      if (D.Context) {
        checkResult(cuCtxSetCurrent(D.Context),
                    "Error returned from cuCtxSetCurrent\n");
        CUdevice Device;
        checkResult(cuCtxGetDevice(&Device),
                    "Error returned from cuCtxGetDevice\n");
        // Balances the cuDevicePrimaryCtxRetain done in initDevice.
        checkResult(cuDevicePrimaryCtxRelease(Device),
                    "Error returned from cuDevicePrimaryCtxRelease\n");
      }
    }
  }

  // Check whether a given DeviceId is valid
  bool isValidDeviceId(const int DeviceId) const {
    return DeviceId >= 0 && DeviceId < NumberOfDevices;
  }

  int getNumOfDevices() const { return NumberOfDevices; }

  void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; }

  /// Prepare \p DeviceId for offloading: retain its primary context, create
  /// the stream pool, and compute team/thread limits from device attributes
  /// and environment variables. Returns OFFLOAD_SUCCESS/OFFLOAD_FAIL.
  int initDevice(const int DeviceId) {
    CUdevice Device;

    DP("Getting device %d\n", DeviceId);
    CUresult Err = cuDeviceGet(&Device, DeviceId);
    if (!checkResult(Err, "Error returned from cuDeviceGet\n"))
      return OFFLOAD_FAIL;

    // Query the current flags of the primary context and set its flags if
    // it is inactive
    unsigned int FormerPrimaryCtxFlags = 0;
    int FormerPrimaryCtxIsActive = 0;
    Err = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags,
                                     &FormerPrimaryCtxIsActive);
    if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxGetState\n"))
      return OFFLOAD_FAIL;

    if (FormerPrimaryCtxIsActive) {
      DP("The primary context is active, no change to its flags\n");
      if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) !=
          CU_CTX_SCHED_BLOCKING_SYNC)
        DP("Warning the current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n");
    } else {
      DP("The primary context is inactive, set its flags to "
         "CU_CTX_SCHED_BLOCKING_SYNC\n");
      Err = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC);
      if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxSetFlags\n"))
        return OFFLOAD_FAIL;
    }

    // Retain the per device primary context and save it to use whenever this
    // device is selected.
    Err = cuDevicePrimaryCtxRetain(&DeviceData[DeviceId].Context, Device);
    if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxRetain\n"))
      return OFFLOAD_FAIL;

    Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    // Initialize stream pool
    if (!StreamManager->initializeDeviceStreamPool(DeviceId))
      return OFFLOAD_FAIL;

    // Query attributes to determine number of threads/block and blocks/grid.
    int MaxGridDimX;
    Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
                               Device);
    if (Err != CUDA_SUCCESS) {
      DP("Error getting max grid dimension, use default value %d\n",
         DeviceRTLTy::DefaultNumTeams);
      DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::DefaultNumTeams;
    } else if (MaxGridDimX <= DeviceRTLTy::HardTeamLimit) {
      DP("Using %d CUDA blocks per grid\n", MaxGridDimX);
      DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX;
    } else {
      DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping "
         "at the hard limit\n",
         MaxGridDimX, DeviceRTLTy::HardTeamLimit);
      DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::HardTeamLimit;
    }

    // We are only exploiting threads along the x axis.
    int MaxBlockDimX;
    Err = cuDeviceGetAttribute(&MaxBlockDimX,
                               CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device);
    if (Err != CUDA_SUCCESS) {
      DP("Error getting max block dimension, use default value %d\n",
         DeviceRTLTy::DefaultNumThreads);
      DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads;
    } else if (MaxBlockDimX <= DeviceRTLTy::HardThreadLimit) {
      DP("Using %d CUDA threads per block\n", MaxBlockDimX);
      DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX;
    } else {
      DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
         "capping at the hard limit\n",
         MaxBlockDimX, DeviceRTLTy::HardThreadLimit);
      DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit;
    }

    // Get and set warp size
    int WarpSize;
    Err =
        cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device);
    if (Err != CUDA_SUCCESS) {
      DP("Error getting warp size, assume default value 32\n");
      DeviceData[DeviceId].WarpSize = 32;
    } else {
      DP("Using warp size %d\n", WarpSize);
      DeviceData[DeviceId].WarpSize = WarpSize;
    }

    // Adjust teams to the env variables
    if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) {
      DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
         EnvTeamLimit);
      DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit;
    }

    if (getDebugLevel() || (getInfoLevel() & OMP_INFOTYPE_PLUGIN_KERNEL))
      INFO(DeviceId,
           "Device supports up to %d CUDA blocks and %d threads with a "
           "warp size of %d\n",
           DeviceData[DeviceId].BlocksPerGrid,
           DeviceData[DeviceId].ThreadsPerBlock, DeviceData[DeviceId].WarpSize);

    // Set default number of teams
    if (EnvNumTeams > 0) {
      DP("Default number of teams set according to environment %d\n",
         EnvNumTeams);
      DeviceData[DeviceId].NumTeams = EnvNumTeams;
    } else {
      DeviceData[DeviceId].NumTeams = DeviceRTLTy::DefaultNumTeams;
DP("Default number of teams set according to library's default %d\n", 519 DeviceRTLTy::DefaultNumTeams); 520 } 521 522 if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) { 523 DP("Default number of teams exceeds device limit, capping at %d\n", 524 DeviceData[DeviceId].BlocksPerGrid); 525 DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid; 526 } 527 528 // Set default number of threads 529 DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads; 530 DP("Default number of threads set according to library's default %d\n", 531 DeviceRTLTy::DefaultNumThreads); 532 if (DeviceData[DeviceId].NumThreads > 533 DeviceData[DeviceId].ThreadsPerBlock) { 534 DP("Default number of threads exceeds device limit, capping at %d\n", 535 DeviceData[DeviceId].ThreadsPerBlock); 536 DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].ThreadsPerBlock; 537 } 538 539 return OFFLOAD_SUCCESS; 540 } 541 542 __tgt_target_table *loadBinary(const int DeviceId, 543 const __tgt_device_image *Image) { 544 // Set the context we are using 545 CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); 546 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) 547 return nullptr; 548 549 // Clear the offload table as we are going to create a new one. 550 clearOffloadEntriesTable(DeviceId); 551 552 // Create the module and extract the function pointers. 553 CUmodule Module; 554 DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart)); 555 Err = cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr); 556 if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n")) 557 return nullptr; 558 559 DP("CUDA module successfully loaded!\n"); 560 561 Modules.push_back(Module); 562 563 // Find the symbols in the module by name. 
564 const __tgt_offload_entry *HostBegin = Image->EntriesBegin; 565 const __tgt_offload_entry *HostEnd = Image->EntriesEnd; 566 567 std::list<KernelTy> &KernelsList = DeviceData[DeviceId].KernelsList; 568 for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) { 569 if (!E->addr) { 570 // We return nullptr when something like this happens, the host should 571 // have always something in the address to uniquely identify the target 572 // region. 573 DP("Invalid binary: host entry '<null>' (size = %zd)...\n", E->size); 574 return nullptr; 575 } 576 577 if (E->size) { 578 __tgt_offload_entry Entry = *E; 579 CUdeviceptr CUPtr; 580 size_t CUSize; 581 Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name); 582 // We keep this style here because we need the name 583 if (Err != CUDA_SUCCESS) { 584 DP("Loading global '%s' (Failed)\n", E->name); 585 CUDA_ERR_STRING(Err); 586 return nullptr; 587 } 588 589 if (CUSize != E->size) { 590 DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name, 591 CUSize, E->size); 592 return nullptr; 593 } 594 595 DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n", 596 DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr)); 597 598 Entry.addr = (void *)(CUPtr); 599 600 // Note: In the current implementation declare target variables 601 // can either be link or to. This means that once unified 602 // memory is activated via the requires directive, the variable 603 // can be used directly from the host in both cases. 604 // TODO: when variables types other than to or link are added, 605 // the below condition should be changed to explicitly 606 // check for to and link variables types: 607 // (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags & 608 // OMP_DECLARE_TARGET_LINK || e->flags == OMP_DECLARE_TARGET_TO)) 609 if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) { 610 // If unified memory is present any target link or to variables 611 // can access host addresses directly. 
There is no longer a 612 // need for device copies. 613 cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *)); 614 DP("Copy linked variable host address (" DPxMOD 615 ") to device address (" DPxMOD ")\n", 616 DPxPTR(*((void **)E->addr)), DPxPTR(CUPtr)); 617 } 618 619 addOffloadEntry(DeviceId, Entry); 620 621 continue; 622 } 623 624 CUfunction Func; 625 Err = cuModuleGetFunction(&Func, Module, E->name); 626 // We keep this style here because we need the name 627 if (Err != CUDA_SUCCESS) { 628 DP("Loading '%s' (Failed)\n", E->name); 629 CUDA_ERR_STRING(Err); 630 return nullptr; 631 } 632 633 DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n", 634 DPxPTR(E - HostBegin), E->name, DPxPTR(Func)); 635 636 // default value GENERIC (in case symbol is missing from cubin file) 637 int8_t ExecModeVal = ExecutionModeType::GENERIC; 638 std::string ExecModeNameStr(E->name); 639 ExecModeNameStr += "_exec_mode"; 640 const char *ExecModeName = ExecModeNameStr.c_str(); 641 642 CUdeviceptr ExecModePtr; 643 size_t CUSize; 644 Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName); 645 if (Err == CUDA_SUCCESS) { 646 if (CUSize != sizeof(int8_t)) { 647 DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n", 648 ExecModeName, CUSize, sizeof(int8_t)); 649 return nullptr; 650 } 651 652 Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize); 653 if (Err != CUDA_SUCCESS) { 654 DP("Error when copying data from device to host. 
Pointers: " 655 "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n", 656 DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize); 657 CUDA_ERR_STRING(Err); 658 return nullptr; 659 } 660 661 if (ExecModeVal < 0 || ExecModeVal > 1) { 662 DP("Error wrong exec_mode value specified in cubin file: %d\n", 663 ExecModeVal); 664 return nullptr; 665 } 666 } else { 667 DP("Loading global exec_mode '%s' - symbol missing, using default " 668 "value GENERIC (1)\n", 669 ExecModeName); 670 CUDA_ERR_STRING(Err); 671 } 672 673 KernelsList.emplace_back(Func, ExecModeVal); 674 675 __tgt_offload_entry Entry = *E; 676 Entry.addr = &KernelsList.back(); 677 addOffloadEntry(DeviceId, Entry); 678 } 679 680 // send device environment data to the device 681 { 682 omptarget_device_environmentTy DeviceEnv{0}; 683 684 #ifdef OMPTARGET_DEBUG 685 if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) 686 DeviceEnv.debug_level = std::stoi(EnvStr); 687 #endif 688 689 const char *DeviceEnvName = "omptarget_device_environment"; 690 CUdeviceptr DeviceEnvPtr; 691 size_t CUSize; 692 693 Err = cuModuleGetGlobal(&DeviceEnvPtr, &CUSize, Module, DeviceEnvName); 694 if (Err == CUDA_SUCCESS) { 695 if (CUSize != sizeof(DeviceEnv)) { 696 DP("Global device_environment '%s' - size mismatch (%zu != %zu)\n", 697 DeviceEnvName, CUSize, sizeof(int32_t)); 698 CUDA_ERR_STRING(Err); 699 return nullptr; 700 } 701 702 Err = cuMemcpyHtoD(DeviceEnvPtr, &DeviceEnv, CUSize); 703 if (Err != CUDA_SUCCESS) { 704 DP("Error when copying data from host to device. 
Pointers: " 705 "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n", 706 DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize); 707 CUDA_ERR_STRING(Err); 708 return nullptr; 709 } 710 711 DP("Sending global device environment data %zu bytes\n", CUSize); 712 } else { 713 DP("Finding global device environment '%s' - symbol missing.\n", 714 DeviceEnvName); 715 DP("Continue, considering this is a device RTL which does not accept " 716 "environment setting.\n"); 717 } 718 } 719 720 return getOffloadEntriesTable(DeviceId); 721 } 722 723 void *dataAlloc(const int DeviceId, const int64_t Size) const { 724 if (Size == 0) 725 return nullptr; 726 727 CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); 728 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) 729 return nullptr; 730 731 CUdeviceptr DevicePtr; 732 Err = cuMemAlloc(&DevicePtr, Size); 733 if (!checkResult(Err, "Error returned from cuMemAlloc\n")) 734 return nullptr; 735 736 return (void *)DevicePtr; 737 } 738 739 int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr, 740 const int64_t Size, __tgt_async_info *AsyncInfoPtr) const { 741 assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr"); 742 743 CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); 744 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) 745 return OFFLOAD_FAIL; 746 747 CUstream Stream = getStream(DeviceId, AsyncInfoPtr); 748 749 Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream); 750 if (Err != CUDA_SUCCESS) { 751 DP("Error when copying data from host to device. 
Pointers: host = " DPxMOD 752 ", device = " DPxMOD ", size = %" PRId64 "\n", 753 DPxPTR(HstPtr), DPxPTR(TgtPtr), Size); 754 CUDA_ERR_STRING(Err); 755 return OFFLOAD_FAIL; 756 } 757 758 return OFFLOAD_SUCCESS; 759 } 760 761 int dataRetrieve(const int DeviceId, void *HstPtr, const void *TgtPtr, 762 const int64_t Size, __tgt_async_info *AsyncInfoPtr) const { 763 assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr"); 764 765 CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); 766 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) 767 return OFFLOAD_FAIL; 768 769 CUstream Stream = getStream(DeviceId, AsyncInfoPtr); 770 771 Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream); 772 if (Err != CUDA_SUCCESS) { 773 DP("Error when copying data from device to host. Pointers: host = " DPxMOD 774 ", device = " DPxMOD ", size = %" PRId64 "\n", 775 DPxPTR(HstPtr), DPxPTR(TgtPtr), Size); 776 CUDA_ERR_STRING(Err); 777 return OFFLOAD_FAIL; 778 } 779 780 return OFFLOAD_SUCCESS; 781 } 782 783 int dataExchange(int SrcDevId, const void *SrcPtr, int DstDevId, void *DstPtr, 784 int64_t Size, __tgt_async_info *AsyncInfoPtr) const { 785 assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr"); 786 787 CUresult Err = cuCtxSetCurrent(DeviceData[SrcDevId].Context); 788 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) 789 return OFFLOAD_FAIL; 790 791 CUstream Stream = getStream(SrcDevId, AsyncInfoPtr); 792 793 // If they are two devices, we try peer to peer copy first 794 if (SrcDevId != DstDevId) { 795 int CanAccessPeer = 0; 796 Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId); 797 if (Err != CUDA_SUCCESS) { 798 DP("Error returned from cuDeviceCanAccessPeer. 
src = %" PRId32 799 ", dst = %" PRId32 "\n", 800 SrcDevId, DstDevId); 801 CUDA_ERR_STRING(Err); 802 return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); 803 } 804 805 if (!CanAccessPeer) { 806 DP("P2P memcpy not supported so fall back to D2D memcpy"); 807 return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); 808 } 809 810 Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0); 811 if (Err != CUDA_SUCCESS) { 812 DP("Error returned from cuCtxEnablePeerAccess. src = %" PRId32 813 ", dst = %" PRId32 "\n", 814 SrcDevId, DstDevId); 815 CUDA_ERR_STRING(Err); 816 return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); 817 } 818 819 Err = cuMemcpyPeerAsync((CUdeviceptr)DstPtr, DeviceData[DstDevId].Context, 820 (CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context, 821 Size, Stream); 822 if (Err == CUDA_SUCCESS) 823 return OFFLOAD_SUCCESS; 824 825 DP("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD 826 ", src_id =%" PRId32 ", dst_ptr = " DPxMOD ", dst_id =%" PRId32 "\n", 827 DPxPTR(SrcPtr), SrcDevId, DPxPTR(DstPtr), DstDevId); 828 CUDA_ERR_STRING(Err); 829 } 830 831 return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); 832 } 833 834 int dataDelete(const int DeviceId, void *TgtPtr) const { 835 CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); 836 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) 837 return OFFLOAD_FAIL; 838 839 Err = cuMemFree((CUdeviceptr)TgtPtr); 840 if (!checkResult(Err, "Error returned from cuMemFree\n")) 841 return OFFLOAD_FAIL; 842 843 return OFFLOAD_SUCCESS; 844 } 845 846 int runTargetTeamRegion(const int DeviceId, void *TgtEntryPtr, void **TgtArgs, 847 ptrdiff_t *TgtOffsets, const int ArgNum, 848 const int TeamNum, const int ThreadLimit, 849 const unsigned int LoopTripCount, 850 __tgt_async_info *AsyncInfo) const { 851 CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); 852 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) 853 return OFFLOAD_FAIL; 854 855 // All args are references. 
    // Marshal the kernel arguments: the CUDA driver takes an array of
    // pointers, each pointing at the storage of one argument, so build Ptrs
    // (the offset-adjusted argument values) and Args (pointers into Ptrs)
    // side by side.
    std::vector<void *> Args(ArgNum);
    std::vector<void *> Ptrs(ArgNum);

    for (int I = 0; I < ArgNum; ++I) {
      Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
      Args[I] = &Ptrs[I];
    }

    KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr);

    // Select the number of threads per block: honor an explicit thread_limit
    // request when given, otherwise use the per-device default.
    int CudaThreadsPerBlock;
    if (ThreadLimit > 0) {
      DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
      CudaThreadsPerBlock = ThreadLimit;
      // Add master warp if necessary: in generic mode one extra warp serves
      // as the master and must not count against the requested limit.
      if (KernelInfo->ExecutionMode == GENERIC) {
        DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize);
        CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
      }
    } else {
      DP("Setting CUDA threads per block to default %d\n",
         DeviceData[DeviceId].NumThreads);
      CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads;
    }

    // Never exceed the device-wide limit on threads per block.
    if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) {
      DP("Threads per block capped at device limit %d\n",
         DeviceData[DeviceId].ThreadsPerBlock);
      CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock;
    }

    // Lazily query and cache the kernel-specific maximum (resource usage of
    // this particular kernel may force it below the device limit).
    if (!KernelInfo->MaxThreadsPerBlock) {
      Err = cuFuncGetAttribute(&KernelInfo->MaxThreadsPerBlock,
                               CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                               KernelInfo->Func);
      if (!checkResult(Err, "Error returned from cuFuncGetAttribute\n"))
        return OFFLOAD_FAIL;
    }

    if (KernelInfo->MaxThreadsPerBlock < CudaThreadsPerBlock) {
      DP("Threads per block capped at kernel limit %d\n",
         KernelInfo->MaxThreadsPerBlock);
      CudaThreadsPerBlock = KernelInfo->MaxThreadsPerBlock;
    }

    // Select the number of teams (CUDA blocks).  An explicit team_num wins;
    // otherwise derive it from the loop trip count when one is known and no
    // environment override is in effect (EnvNumTeams < 0).
    unsigned int CudaBlocksPerGrid;
    if (TeamNum <= 0) {
      if (LoopTripCount > 0 && EnvNumTeams < 0) {
        if (KernelInfo->ExecutionMode == SPMD) {
          // We have a combined construct, i.e. `target teams distribute
          // parallel for [simd]`. We launch so many teams so that each thread
          // will execute one iteration of the loop. Round up to the nearest
          // integer.
          CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1;
        } else {
          // If we reach this point, then we have a non-combined construct, i.e.
          // `teams distribute` with a nested `parallel for` and each team is
          // assigned one iteration of the `distribute` loop. E.g.:
          //
          // #pragma omp target teams distribute
          // for(...loop_tripcount...) {
          //   #pragma omp parallel for
          //   for(...) {}
          // }
          //
          // Threads within a team will execute the iterations of the `parallel`
          // loop.
          CudaBlocksPerGrid = LoopTripCount;
        }
        DP("Using %d teams due to loop trip count %" PRIu32
           " and number of threads per block %d\n",
           CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock);
      } else {
        DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams);
        CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams;
      }
    } else if (TeamNum > DeviceData[DeviceId].BlocksPerGrid) {
      DP("Capping number of teams to team limit %d\n",
         DeviceData[DeviceId].BlocksPerGrid);
      CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid;
    } else {
      DP("Using requested number of teams %d\n", TeamNum);
      CudaBlocksPerGrid = TeamNum;
    }

    if (getDebugLevel() || (getInfoLevel() & OMP_INFOTYPE_PLUGIN_KERNEL))
      INFO(DeviceId,
           "Launching kernel %s with %d blocks and %d threads in %s "
           "mode\n",
           (getOffloadEntry(DeviceId, TgtEntryPtr))
               ? getOffloadEntry(DeviceId, TgtEntryPtr)->name
               : "(null)",
           CudaBlocksPerGrid, CudaThreadsPerBlock,
           (KernelInfo->ExecutionMode == SPMD) ? "SPMD" : "Generic");

    // Launch asynchronously on the stream associated with AsyncInfo; the
    // caller synchronizes later.  Only a 1-D grid/block shape is used and no
    // dynamic shared memory is requested.
    CUstream Stream = getStream(DeviceId, AsyncInfo);
    Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1,
                         /* gridDimZ */ 1, CudaThreadsPerBlock,
                         /* blockDimY */ 1, /* blockDimZ */ 1,
                         /* sharedMemBytes */ 0, Stream, &Args[0], nullptr);
    if (!checkResult(Err, "Error returned from cuLaunchKernel\n"))
      return OFFLOAD_FAIL;

    DP("Launch of entry point at " DPxMOD " successful!\n",
       DPxPTR(TgtEntryPtr));

    return OFFLOAD_SUCCESS;
  }

  /// Block the host until all work previously enqueued on the stream carried
  /// by \p AsyncInfoPtr has completed.  Returns OFFLOAD_SUCCESS or
  /// OFFLOAD_FAIL; expects AsyncInfoPtr->Queue to hold a valid CUstream
  /// (asserted by the __tgt_rtl_synchronize wrapper).
  int synchronize(const int DeviceId, __tgt_async_info *AsyncInfoPtr) const {
    CUstream Stream = reinterpret_cast<CUstream>(AsyncInfoPtr->Queue);
    CUresult Err = cuStreamSynchronize(Stream);
    if (Err != CUDA_SUCCESS) {
      DP("Error when synchronizing stream. stream = " DPxMOD
         ", async info ptr = " DPxMOD "\n",
         DPxPTR(Stream), DPxPTR(AsyncInfoPtr));
      CUDA_ERR_STRING(Err);
      return OFFLOAD_FAIL;
    }

    // Once the stream is synchronized, return it to stream pool and reset
    // async_info. This is to make sure the synchronization only works for its
    // own tasks.
    StreamManager->returnStream(
        DeviceId, reinterpret_cast<CUstream>(AsyncInfoPtr->Queue));
    AsyncInfoPtr->Queue = nullptr;

    return OFFLOAD_SUCCESS;
  }
};

// Single plugin-wide instance backing all the exported entry points below.
DeviceRTLTy DeviceRTL;
} // namespace

// Exposed library API functions: the C entry points that libomptarget
// resolves from this plugin.  They validate arguments (asserts in debug
// builds) and forward to the DeviceRTL singleton.
#ifdef __cplusplus
extern "C" {
#endif

// Return non-zero iff \p image is an ELF image with machine type EM_CUDA.
int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
  return elf_check_machine(image, /* EM_CUDA */ 190);
}

int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); }

// Record the OpenMP `requires` flags for later use; echoes them back.
int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
  DP("Init requires flags to %" PRId64 "\n", RequiresFlags);
  DeviceRTL.setRequiresFlag(RequiresFlags);
  return RequiresFlags;
}

// Device-to-device exchange is reported possible between any two valid
// device ids.
int32_t __tgt_rtl_is_data_exchangable(int32_t src_dev_id, int dst_dev_id) {
  if (DeviceRTL.isValidDeviceId(src_dev_id) &&
      DeviceRTL.isValidDeviceId(dst_dev_id))
    return 1;

  return 0;
}

int32_t __tgt_rtl_init_device(int32_t device_id) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.initDevice(device_id);
}

__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
                                          __tgt_device_image *image) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.loadBinary(device_id, image);
}

// The unnamed third parameter is the host pointer hint from libomptarget;
// it is unused by this plugin.
void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.dataAlloc(device_id, size);
}

// Synchronous host-to-device copy: async submit followed by a synchronize.
int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
                              int64_t size) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info async_info;
  const int32_t rc = __tgt_rtl_data_submit_async(device_id, tgt_ptr, hst_ptr,
                                                 size, &async_info);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &async_info);
}

int32_t __tgt_rtl_data_submit_async(int32_t device_id, void *tgt_ptr,
                                    void *hst_ptr, int64_t size,
                                    __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");

  return DeviceRTL.dataSubmit(device_id, tgt_ptr, hst_ptr, size,
                              async_info_ptr);
}

// Synchronous device-to-host copy: async retrieve followed by a synchronize.
int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
                                int64_t size) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info async_info;
  const int32_t rc = __tgt_rtl_data_retrieve_async(device_id, hst_ptr, tgt_ptr,
                                                   size, &async_info);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &async_info);
}

int32_t __tgt_rtl_data_retrieve_async(int32_t device_id, void *hst_ptr,
                                      void *tgt_ptr, int64_t size,
                                      __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");

  return DeviceRTL.dataRetrieve(device_id, hst_ptr, tgt_ptr, size,
                                async_info_ptr);
}

int32_t __tgt_rtl_data_exchange_async(int32_t src_dev_id, void *src_ptr,
                                      int dst_dev_id, void *dst_ptr,
                                      int64_t size,
                                      __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
  assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");

  return DeviceRTL.dataExchange(src_dev_id, src_ptr, dst_dev_id, dst_ptr, size,
                                async_info_ptr);
}

// Synchronous device-to-device copy.  Note the synchronization happens on
// the source device's stream.
int32_t __tgt_rtl_data_exchange(int32_t src_dev_id, void *src_ptr,
                                int32_t dst_dev_id, void *dst_ptr,
                                int64_t size) {
  assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
  assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");

  __tgt_async_info async_info;
  const int32_t rc = __tgt_rtl_data_exchange_async(
      src_dev_id, src_ptr, dst_dev_id, dst_ptr, size, &async_info);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(src_dev_id, &async_info);
}

int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.dataDelete(device_id, tgt_ptr);
}

// Synchronous team-region launch: async launch followed by a synchronize.
int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
                                         void **tgt_args,
                                         ptrdiff_t *tgt_offsets,
                                         int32_t arg_num, int32_t team_num,
                                         int32_t thread_limit,
                                         uint64_t loop_tripcount) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info async_info;
  const int32_t rc = __tgt_rtl_run_target_team_region_async(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
      thread_limit, loop_tripcount, &async_info);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &async_info);
}

int32_t __tgt_rtl_run_target_team_region_async(
    int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
    ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
    int32_t thread_limit, uint64_t loop_tripcount,
    __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.runTargetTeamRegion(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
      thread_limit, loop_tripcount, async_info_ptr);
}

// Synchronous non-teams launch: async launch followed by a synchronize.
int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
                                    void **tgt_args, ptrdiff_t *tgt_offsets,
                                    int32_t arg_num) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info async_info;
  const int32_t rc = __tgt_rtl_run_target_region_async(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, &async_info);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &async_info);
}

// A non-teams target region is launched as a team region with one team, a
// thread limit of 1, and no loop trip count.
int32_t __tgt_rtl_run_target_region_async(int32_t device_id,
                                          void *tgt_entry_ptr, void **tgt_args,
                                          ptrdiff_t *tgt_offsets,
                                          int32_t arg_num,
                                          __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return __tgt_rtl_run_target_team_region_async(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num,
      /* team num*/ 1, /* thread_limit */ 1, /* loop_tripcount */ 0,
      async_info_ptr);
}

// Block until the queue in \p async_info_ptr drains; requires a non-null
// queue (asserted), i.e. some asynchronous work must have been enqueued.
int32_t __tgt_rtl_synchronize(int32_t device_id,
                              __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");
  assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr");

  return DeviceRTL.synchronize(device_id, async_info_ptr);
}

#ifdef __cplusplus
}
#endif