1 //===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // RTL for CUDA machine 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include <cassert> 14 #include <cstddef> 15 #include <cuda.h> 16 #include <list> 17 #include <memory> 18 #include <mutex> 19 #include <string> 20 #include <unordered_map> 21 #include <vector> 22 23 #include "Debug.h" 24 #include "omptargetplugin.h" 25 26 #define TARGET_NAME CUDA 27 #define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL" 28 29 #include "MemoryManager.h" 30 31 // Utility for retrieving and printing CUDA error string. 32 #ifdef OMPTARGET_DEBUG 33 #define CUDA_ERR_STRING(err) \ 34 do { \ 35 if (getDebugLevel() > 0) { \ 36 const char *errStr = nullptr; \ 37 CUresult errStr_status = cuGetErrorString(err, &errStr); \ 38 if (errStr_status == CUDA_ERROR_INVALID_VALUE) \ 39 REPORT("Unrecognized CUDA error code: %d\n", err); \ 40 else if (errStr_status == CUDA_SUCCESS) \ 41 REPORT("CUDA error is: %s\n", errStr); \ 42 else { \ 43 REPORT("Unresolved CUDA error code: %d\n", err); \ 44 REPORT("Unsuccessful cuGetErrorString return status: %d\n", \ 45 errStr_status); \ 46 } \ 47 } else { \ 48 const char *errStr = nullptr; \ 49 CUresult errStr_status = cuGetErrorString(err, &errStr); \ 50 if (errStr_status == CUDA_SUCCESS) \ 51 REPORT("%s \n", errStr); \ 52 } \ 53 } while (false) 54 #else // OMPTARGET_DEBUG 55 #define CUDA_ERR_STRING(err) \ 56 do { \ 57 const char *errStr = nullptr; \ 58 CUresult errStr_status = cuGetErrorString(err, &errStr); \ 59 if (errStr_status == CUDA_SUCCESS) \ 60 REPORT("%s \n", errStr); \ 61 } while (false) 62 #endif // OMPTARGET_DEBUG 63 64 #define BOOL2TEXT(b) ((b) ? "Yes" : "No") 65 66 #include "elf_common.h" 67 68 /// Keep entries table per device. 69 struct FuncOrGblEntryTy { 70 __tgt_target_table Table; 71 std::vector<__tgt_offload_entry> Entries; 72 }; 73 74 enum ExecutionModeType { 75 SPMD, // constructors, destructors, 76 // combined constructs (`teams distribute parallel for [simd]`) 77 GENERIC, // everything else 78 SPMD_GENERIC, // Generic kernel with SPMD execution 79 NONE 80 }; 81 82 /// Use a single entity to encode a kernel and a set of flags. 83 struct KernelTy { 84 CUfunction Func; 85 86 // execution mode of kernel 87 // 0 - SPMD mode (without master warp) 88 // 1 - Generic mode (with master warp) 89 // 2 - SPMD mode execution with Generic mode semantics. 90 int8_t ExecutionMode; 91 92 /// Maximal number of threads per block for this kernel. 93 int MaxThreadsPerBlock = 0; 94 95 KernelTy(CUfunction _Func, int8_t _ExecutionMode) 96 : Func(_Func), ExecutionMode(_ExecutionMode) {} 97 }; 98 99 /// Device environment data 100 /// Manually sync with the deviceRTL side for now, move to a dedicated header 101 /// file later. 
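// For orientation, a minimal sketch of what the deviceRTL-side counterpart is
// assumed to look like (illustrative only; the authoritative definition lives
// in the deviceRTL sources and may differ):
//
//   struct omptarget_device_environmentTy {
//     int32_t debug_level;
//   };
//   __device__ omptarget_device_environmentTy omptarget_device_environment;
//
// loadBinary() below resolves the symbol "omptarget_device_environment" with
// cuModuleGetGlobal and initializes it with cuMemcpyHtoD, so the host and
// device layouts must stay in sync.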
102 struct omptarget_device_environmentTy { 103 int32_t debug_level; 104 }; 105 106 namespace { 107 bool checkResult(CUresult Err, const char *ErrMsg) { 108 if (Err == CUDA_SUCCESS) 109 return true; 110 111 REPORT("%s", ErrMsg); 112 CUDA_ERR_STRING(Err); 113 return false; 114 } 115 116 int memcpyDtoD(const void *SrcPtr, void *DstPtr, int64_t Size, 117 CUstream Stream) { 118 CUresult Err = 119 cuMemcpyDtoDAsync((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, Size, Stream); 120 121 if (Err != CUDA_SUCCESS) { 122 DP("Error when copying data from device to device. Pointers: src " 123 "= " DPxMOD ", dst = " DPxMOD ", size = %" PRId64 "\n", 124 DPxPTR(SrcPtr), DPxPTR(DstPtr), Size); 125 CUDA_ERR_STRING(Err); 126 return OFFLOAD_FAIL; 127 } 128 129 return OFFLOAD_SUCCESS; 130 } 131 132 int createEvent(void **P) { 133 CUevent Event = nullptr; 134 135 CUresult Err = cuEventCreate(&Event, CU_EVENT_DEFAULT); 136 if (Err != CUDA_SUCCESS) { 137 DP("Error when creating event event = " DPxMOD "\n", DPxPTR(Event)); 138 CUDA_ERR_STRING(Err); 139 return OFFLOAD_FAIL; 140 } 141 142 *P = Event; 143 144 return OFFLOAD_SUCCESS; 145 } 146 147 int recordEvent(void *EventPtr, __tgt_async_info *AsyncInfo) { 148 CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue); 149 CUevent Event = reinterpret_cast<CUevent>(EventPtr); 150 151 CUresult Err = cuEventRecord(Event, Stream); 152 if (Err != CUDA_SUCCESS) { 153 DP("Error when recording event. stream = " DPxMOD ", event = " DPxMOD "\n", 154 DPxPTR(Stream), DPxPTR(Event)); 155 CUDA_ERR_STRING(Err); 156 return OFFLOAD_FAIL; 157 } 158 159 return OFFLOAD_SUCCESS; 160 } 161 162 int syncEvent(void *EventPtr) { 163 CUevent Event = reinterpret_cast<CUevent>(EventPtr); 164 165 CUresult Err = cuEventSynchronize(Event); 166 if (Err != CUDA_SUCCESS) { 167 DP("Error when syncing event = " DPxMOD "\n", DPxPTR(Event)); 168 CUDA_ERR_STRING(Err); 169 return OFFLOAD_FAIL; 170 } 171 172 return OFFLOAD_SUCCESS; 173 } 174 175 int destroyEvent(void *EventPtr) { 176 CUevent Event = reinterpret_cast<CUevent>(EventPtr); 177 178 CUresult Err = cuEventDestroy(Event); 179 if (Err != CUDA_SUCCESS) { 180 DP("Error when destroying event = " DPxMOD "\n", DPxPTR(Event)); 181 CUDA_ERR_STRING(Err); 182 return OFFLOAD_FAIL; 183 } 184 185 return OFFLOAD_SUCCESS; 186 } 187 188 // Structure contains per-device data 189 struct DeviceDataTy { 190 /// List that contains all the kernels. 191 std::list<KernelTy> KernelsList; 192 193 std::list<FuncOrGblEntryTy> FuncGblEntries; 194 195 CUcontext Context = nullptr; 196 // Device properties 197 int ThreadsPerBlock = 0; 198 int BlocksPerGrid = 0; 199 int WarpSize = 0; 200 // OpenMP properties 201 int NumTeams = 0; 202 int NumThreads = 0; 203 }; 204 205 class StreamManagerTy { 206 int NumberOfDevices; 207 // The initial size of stream pool 208 int EnvNumInitialStreams; 209 // Per-device stream mutex 210 std::vector<std::unique_ptr<std::mutex>> StreamMtx; 211 // Per-device stream Id indicates the next available stream in the pool 212 std::vector<int> NextStreamId; 213 // Per-device stream pool 214 std::vector<std::vector<CUstream>> StreamPool; 215 // Reference to per-device data 216 std::vector<DeviceDataTy> &DeviceData; 217 218 // If there is no CUstream left in the pool, we will resize the pool to 219 // allocate more CUstream. This function should be called with device mutex, 220 // and we do not resize to smaller one. 
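  // For example (illustrative only), with the default of 32 initial streams,
  // repeated getStream() calls on one device grow the pool as
  //
  //   32 streams -> getStream() x 32 -> resizeStreamPool(Dev, 64)
  //   64 streams -> getStream() x 32 -> resizeStreamPool(Dev, 128)
  //
  // i.e. the pool doubles each time it is exhausted and never shrinks.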
  void resizeStreamPool(const int DeviceId, const size_t NewSize) {
    std::vector<CUstream> &Pool = StreamPool[DeviceId];
    const size_t CurrentSize = Pool.size();
    assert(NewSize > CurrentSize && "new size is not larger than current size");

    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) {
      // If we cannot switch to the right context, return early so that we do
      // not create a batch of streams that belong to the wrong device.
      // Offloading will fail later because the selected CUstream is nullptr.
      return;
    }

    Pool.resize(NewSize, nullptr);

    for (size_t I = CurrentSize; I < NewSize; ++I) {
      checkResult(cuStreamCreate(&Pool[I], CU_STREAM_NON_BLOCKING),
                  "Error returned from cuStreamCreate\n");
    }
  }

public:
  StreamManagerTy(const int NumberOfDevices,
                  std::vector<DeviceDataTy> &DeviceData)
      : NumberOfDevices(NumberOfDevices), EnvNumInitialStreams(32),
        DeviceData(DeviceData) {
    StreamPool.resize(NumberOfDevices);
    NextStreamId.resize(NumberOfDevices);
    StreamMtx.resize(NumberOfDevices);

    if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS"))
      EnvNumInitialStreams = std::stoi(EnvStr);

    // Initialize the next stream id
    std::fill(NextStreamId.begin(), NextStreamId.end(), 0);

    // Initialize stream mutexes
    for (std::unique_ptr<std::mutex> &Ptr : StreamMtx)
      Ptr = std::make_unique<std::mutex>();
  }

  ~StreamManagerTy() {
    // Destroy streams
    for (int I = 0; I < NumberOfDevices; ++I) {
      checkResult(cuCtxSetCurrent(DeviceData[I].Context),
                  "Error returned from cuCtxSetCurrent\n");

      for (CUstream &S : StreamPool[I]) {
        if (S)
          checkResult(cuStreamDestroy(S),
                      "Error returned from cuStreamDestroy\n");
      }
    }
  }

  // Get a CUstream from the pool. The per-device next stream id always points
  // to the next available CUstream. That is, CUstreams [0, id-1] have been
  // assigned and [id, ...) are still available. If no CUstream is left, we
  // request more CUstreams from the CUDA RT. Each time a CUstream is assigned,
  // the id is incremented by one.
  // xxxxxs+++++++++
  //      ^
  //      id
  // After assignment, the pool becomes the following and s is assigned.
  // xxxxxs+++++++++
  //       ^
  //       id
  CUstream getStream(const int DeviceId) {
    const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
    int &Id = NextStreamId[DeviceId];
    // No CUstream left in the pool, we need to request from the CUDA RT
    if (Id == static_cast<int>(StreamPool[DeviceId].size())) {
      // By default we double the stream pool every time
      resizeStreamPool(DeviceId, Id * 2);
    }
    return StreamPool[DeviceId][Id++];
  }

  // Return a CUstream back to the pool. As mentioned above, the per-device
  // next stream id always points to the next available CUstream, so when we
  // return a CUstream, we first decrease the id and then copy the CUstream
  // back.
  // Note that streams may be returned in a different order than they were
  // assigned, so at some point the pool may contain two identical CUstreams.
  // xxax+a+++++
  //     ^
  //     id
  // However, this does not matter because the duplicates are always on the two
  // sides of id. The one on the left will eventually be overwritten by another
  // CUstream.
312 // Therefore, after several execution, the order of pool might be different 313 // from its initial state. 314 void returnStream(const int DeviceId, CUstream Stream) { 315 const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]); 316 int &Id = NextStreamId[DeviceId]; 317 assert(Id > 0 && "Wrong stream ID"); 318 StreamPool[DeviceId][--Id] = Stream; 319 } 320 321 bool initializeDeviceStreamPool(const int DeviceId) { 322 assert(StreamPool[DeviceId].empty() && "stream pool has been initialized"); 323 324 resizeStreamPool(DeviceId, EnvNumInitialStreams); 325 326 // Check the size of stream pool 327 if (static_cast<int>(StreamPool[DeviceId].size()) != EnvNumInitialStreams) 328 return false; 329 330 // Check whether each stream is valid 331 for (CUstream &S : StreamPool[DeviceId]) 332 if (!S) 333 return false; 334 335 return true; 336 } 337 }; 338 339 class DeviceRTLTy { 340 int NumberOfDevices; 341 // OpenMP environment properties 342 int EnvNumTeams; 343 int EnvTeamLimit; 344 int EnvTeamThreadLimit; 345 // OpenMP requires flags 346 int64_t RequiresFlags; 347 348 static constexpr const int HardTeamLimit = 1U << 16U; // 64k 349 static constexpr const int HardThreadLimit = 1024; 350 static constexpr const int DefaultNumTeams = 128; 351 static constexpr const int DefaultNumThreads = 128; 352 353 std::unique_ptr<StreamManagerTy> StreamManager; 354 std::vector<DeviceDataTy> DeviceData; 355 std::vector<CUmodule> Modules; 356 357 /// A class responsible for interacting with device native runtime library to 358 /// allocate and free memory. 359 class CUDADeviceAllocatorTy : public DeviceAllocatorTy { 360 const int DeviceId; 361 const std::vector<DeviceDataTy> &DeviceData; 362 std::unordered_map<void *, TargetAllocTy> HostPinnedAllocs; 363 364 public: 365 CUDADeviceAllocatorTy(int DeviceId, std::vector<DeviceDataTy> &DeviceData) 366 : DeviceId(DeviceId), DeviceData(DeviceData) {} 367 368 void *allocate(size_t Size, void *, TargetAllocTy Kind) override { 369 if (Size == 0) 370 return nullptr; 371 372 CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); 373 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) 374 return nullptr; 375 376 void *MemAlloc = nullptr; 377 switch (Kind) { 378 case TARGET_ALLOC_DEFAULT: 379 case TARGET_ALLOC_DEVICE: 380 CUdeviceptr DevicePtr; 381 Err = cuMemAlloc(&DevicePtr, Size); 382 MemAlloc = (void *)DevicePtr; 383 if (!checkResult(Err, "Error returned from cuMemAlloc\n")) 384 return nullptr; 385 break; 386 case TARGET_ALLOC_HOST: 387 void *HostPtr; 388 Err = cuMemAllocHost(&HostPtr, Size); 389 MemAlloc = HostPtr; 390 if (!checkResult(Err, "Error returned from cuMemAllocHost\n")) 391 return nullptr; 392 HostPinnedAllocs[MemAlloc] = Kind; 393 break; 394 case TARGET_ALLOC_SHARED: 395 CUdeviceptr SharedPtr; 396 Err = cuMemAllocManaged(&SharedPtr, Size, CU_MEM_ATTACH_GLOBAL); 397 MemAlloc = (void *)SharedPtr; 398 if (!checkResult(Err, "Error returned from cuMemAllocManaged\n")) 399 return nullptr; 400 break; 401 } 402 403 return MemAlloc; 404 } 405 406 int free(void *TgtPtr) override { 407 CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); 408 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) 409 return OFFLOAD_FAIL; 410 411 // Host pinned memory must be freed differently. 412 TargetAllocTy Kind = 413 (HostPinnedAllocs.find(TgtPtr) == HostPinnedAllocs.end()) 414 ? 
TARGET_ALLOC_DEFAULT 415 : TARGET_ALLOC_HOST; 416 switch (Kind) { 417 case TARGET_ALLOC_DEFAULT: 418 case TARGET_ALLOC_DEVICE: 419 case TARGET_ALLOC_SHARED: 420 Err = cuMemFree((CUdeviceptr)TgtPtr); 421 if (!checkResult(Err, "Error returned from cuMemFree\n")) 422 return OFFLOAD_FAIL; 423 break; 424 case TARGET_ALLOC_HOST: 425 Err = cuMemFreeHost(TgtPtr); 426 if (!checkResult(Err, "Error returned from cuMemFreeHost\n")) 427 return OFFLOAD_FAIL; 428 break; 429 } 430 431 return OFFLOAD_SUCCESS; 432 } 433 }; 434 435 /// A vector of device allocators 436 std::vector<CUDADeviceAllocatorTy> DeviceAllocators; 437 438 /// A vector of memory managers. Since the memory manager is non-copyable and 439 // non-removable, we wrap them into std::unique_ptr. 440 std::vector<std::unique_ptr<MemoryManagerTy>> MemoryManagers; 441 442 /// Whether use memory manager 443 bool UseMemoryManager = true; 444 445 // Record entry point associated with device 446 void addOffloadEntry(const int DeviceId, const __tgt_offload_entry entry) { 447 FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back(); 448 E.Entries.push_back(entry); 449 } 450 451 // Return a pointer to the entry associated with the pointer 452 const __tgt_offload_entry *getOffloadEntry(const int DeviceId, 453 const void *Addr) const { 454 for (const __tgt_offload_entry &Itr : 455 DeviceData[DeviceId].FuncGblEntries.back().Entries) 456 if (Itr.addr == Addr) 457 return &Itr; 458 459 return nullptr; 460 } 461 462 // Return the pointer to the target entries table 463 __tgt_target_table *getOffloadEntriesTable(const int DeviceId) { 464 FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back(); 465 466 if (E.Entries.empty()) 467 return nullptr; 468 469 // Update table info according to the entries and return the pointer 470 E.Table.EntriesBegin = E.Entries.data(); 471 E.Table.EntriesEnd = E.Entries.data() + E.Entries.size(); 472 473 return &E.Table; 474 } 475 476 // Clear entries table for a device 477 void clearOffloadEntriesTable(const int DeviceId) { 478 DeviceData[DeviceId].FuncGblEntries.emplace_back(); 479 FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back(); 480 E.Entries.clear(); 481 E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr; 482 } 483 484 CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfo) const { 485 assert(AsyncInfo && "AsyncInfo is nullptr"); 486 487 if (!AsyncInfo->Queue) 488 AsyncInfo->Queue = StreamManager->getStream(DeviceId); 489 490 return reinterpret_cast<CUstream>(AsyncInfo->Queue); 491 } 492 493 public: 494 // This class should not be copied 495 DeviceRTLTy(const DeviceRTLTy &) = delete; 496 DeviceRTLTy(DeviceRTLTy &&) = delete; 497 498 DeviceRTLTy() 499 : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1), 500 EnvTeamThreadLimit(-1), RequiresFlags(OMP_REQ_UNDEFINED) { 501 502 DP("Start initializing CUDA\n"); 503 504 CUresult Err = cuInit(0); 505 if (Err == CUDA_ERROR_INVALID_HANDLE) { 506 // Can't call cuGetErrorString if dlsym failed 507 DP("Failed to load CUDA shared library\n"); 508 return; 509 } 510 if (!checkResult(Err, "Error returned from cuInit\n")) { 511 return; 512 } 513 514 Err = cuDeviceGetCount(&NumberOfDevices); 515 if (!checkResult(Err, "Error returned from cuDeviceGetCount\n")) 516 return; 517 518 if (NumberOfDevices == 0) { 519 DP("There are no devices supporting CUDA.\n"); 520 return; 521 } 522 523 DeviceData.resize(NumberOfDevices); 524 525 // Get environment variables regarding teams 526 if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) { 527 // OMP_TEAM_LIMIT 
has been set 528 EnvTeamLimit = std::stoi(EnvStr); 529 DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit); 530 } 531 if (const char *EnvStr = getenv("OMP_TEAMS_THREAD_LIMIT")) { 532 // OMP_TEAMS_THREAD_LIMIT has been set 533 EnvTeamThreadLimit = std::stoi(EnvStr); 534 DP("Parsed OMP_TEAMS_THREAD_LIMIT=%d\n", EnvTeamThreadLimit); 535 } 536 if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) { 537 // OMP_NUM_TEAMS has been set 538 EnvNumTeams = std::stoi(EnvStr); 539 DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams); 540 } 541 542 StreamManager = 543 std::make_unique<StreamManagerTy>(NumberOfDevices, DeviceData); 544 545 for (int I = 0; I < NumberOfDevices; ++I) 546 DeviceAllocators.emplace_back(I, DeviceData); 547 548 // Get the size threshold from environment variable 549 std::pair<size_t, bool> Res = MemoryManagerTy::getSizeThresholdFromEnv(); 550 UseMemoryManager = Res.second; 551 size_t MemoryManagerThreshold = Res.first; 552 553 if (UseMemoryManager) 554 for (int I = 0; I < NumberOfDevices; ++I) 555 MemoryManagers.emplace_back(std::make_unique<MemoryManagerTy>( 556 DeviceAllocators[I], MemoryManagerThreshold)); 557 } 558 559 ~DeviceRTLTy() { 560 // We first destruct memory managers in case that its dependent data are 561 // destroyed before it. 562 for (auto &M : MemoryManagers) 563 M.release(); 564 565 StreamManager = nullptr; 566 567 for (CUmodule &M : Modules) 568 // Close module 569 if (M) 570 checkResult(cuModuleUnload(M), "Error returned from cuModuleUnload\n"); 571 572 for (DeviceDataTy &D : DeviceData) { 573 // Destroy context 574 if (D.Context) { 575 checkResult(cuCtxSetCurrent(D.Context), 576 "Error returned from cuCtxSetCurrent\n"); 577 CUdevice Device; 578 checkResult(cuCtxGetDevice(&Device), 579 "Error returned from cuCtxGetDevice\n"); 580 checkResult(cuDevicePrimaryCtxRelease(Device), 581 "Error returned from cuDevicePrimaryCtxRelease\n"); 582 } 583 } 584 } 585 586 // Check whether a given DeviceId is valid 587 bool isValidDeviceId(const int DeviceId) const { 588 return DeviceId >= 0 && DeviceId < NumberOfDevices; 589 } 590 591 int getNumOfDevices() const { return NumberOfDevices; } 592 593 void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; } 594 595 int initDevice(const int DeviceId) { 596 CUdevice Device; 597 598 DP("Getting device %d\n", DeviceId); 599 CUresult Err = cuDeviceGet(&Device, DeviceId); 600 if (!checkResult(Err, "Error returned from cuDeviceGet\n")) 601 return OFFLOAD_FAIL; 602 603 // Query the current flags of the primary context and set its flags if 604 // it is inactive 605 unsigned int FormerPrimaryCtxFlags = 0; 606 int FormerPrimaryCtxIsActive = 0; 607 Err = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags, 608 &FormerPrimaryCtxIsActive); 609 if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxGetState\n")) 610 return OFFLOAD_FAIL; 611 612 if (FormerPrimaryCtxIsActive) { 613 DP("The primary context is active, no change to its flags\n"); 614 if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) != 615 CU_CTX_SCHED_BLOCKING_SYNC) 616 DP("Warning the current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n"); 617 } else { 618 DP("The primary context is inactive, set its flags to " 619 "CU_CTX_SCHED_BLOCKING_SYNC\n"); 620 Err = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC); 621 if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxSetFlags\n")) 622 return OFFLOAD_FAIL; 623 } 624 625 // Retain the per device primary context and save it to use whenever this 626 // device is selected. 
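    // The retain below is paired with the cuDevicePrimaryCtxRelease call in
    // ~DeviceRTLTy. For reference, the bare driver-API pattern looks roughly
    // like this (illustrative sketch, error handling omitted):
    //
    //   CUcontext Ctx;
    //   cuDevicePrimaryCtxRetain(&Ctx, Device); // refcount +1, creates the
    //                                           // context if needed
    //   cuCtxSetCurrent(Ctx);                   // bind to the calling thread
    //   ...                                     // issue CUDA driver calls
    //   cuDevicePrimaryCtxRelease(Device);      // refcount -1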
627 Err = cuDevicePrimaryCtxRetain(&DeviceData[DeviceId].Context, Device); 628 if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxRetain\n")) 629 return OFFLOAD_FAIL; 630 631 Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); 632 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) 633 return OFFLOAD_FAIL; 634 635 // Initialize stream pool 636 if (!StreamManager->initializeDeviceStreamPool(DeviceId)) 637 return OFFLOAD_FAIL; 638 639 // Query attributes to determine number of threads/block and blocks/grid. 640 int MaxGridDimX; 641 Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, 642 Device); 643 if (Err != CUDA_SUCCESS) { 644 DP("Error getting max grid dimension, use default value %d\n", 645 DeviceRTLTy::DefaultNumTeams); 646 DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::DefaultNumTeams; 647 } else if (MaxGridDimX <= DeviceRTLTy::HardTeamLimit) { 648 DP("Using %d CUDA blocks per grid\n", MaxGridDimX); 649 DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX; 650 } else { 651 DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping " 652 "at the hard limit\n", 653 MaxGridDimX, DeviceRTLTy::HardTeamLimit); 654 DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::HardTeamLimit; 655 } 656 657 // We are only exploiting threads along the x axis. 658 int MaxBlockDimX; 659 Err = cuDeviceGetAttribute(&MaxBlockDimX, 660 CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device); 661 if (Err != CUDA_SUCCESS) { 662 DP("Error getting max block dimension, use default value %d\n", 663 DeviceRTLTy::DefaultNumThreads); 664 DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads; 665 } else { 666 DP("Using %d CUDA threads per block\n", MaxBlockDimX); 667 DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX; 668 669 if (EnvTeamThreadLimit > 0 && 670 DeviceData[DeviceId].ThreadsPerBlock > EnvTeamThreadLimit) { 671 DP("Max CUDA threads per block %d exceeds the thread limit %d set by " 672 "OMP_TEAMS_THREAD_LIMIT, capping at the limit\n", 673 DeviceData[DeviceId].ThreadsPerBlock, EnvTeamThreadLimit); 674 DeviceData[DeviceId].ThreadsPerBlock = EnvTeamThreadLimit; 675 } 676 if (DeviceData[DeviceId].ThreadsPerBlock > DeviceRTLTy::HardThreadLimit) { 677 DP("Max CUDA threads per block %d exceeds the hard thread limit %d, " 678 "capping at the hard limit\n", 679 DeviceData[DeviceId].ThreadsPerBlock, DeviceRTLTy::HardThreadLimit); 680 DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit; 681 } 682 } 683 684 // Get and set warp size 685 int WarpSize; 686 Err = 687 cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device); 688 if (Err != CUDA_SUCCESS) { 689 DP("Error getting warp size, assume default value 32\n"); 690 DeviceData[DeviceId].WarpSize = 32; 691 } else { 692 DP("Using warp size %d\n", WarpSize); 693 DeviceData[DeviceId].WarpSize = WarpSize; 694 } 695 696 // Adjust teams to the env variables 697 if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) { 698 DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n", 699 EnvTeamLimit); 700 DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit; 701 } 702 703 size_t StackLimit; 704 size_t HeapLimit; 705 if (const char *EnvStr = getenv("LIBOMPTARGET_STACK_SIZE")) { 706 StackLimit = std::stol(EnvStr); 707 if (cuCtxSetLimit(CU_LIMIT_STACK_SIZE, StackLimit) != CUDA_SUCCESS) 708 return OFFLOAD_FAIL; 709 } else { 710 if (cuCtxGetLimit(&StackLimit, CU_LIMIT_STACK_SIZE) != CUDA_SUCCESS) 711 return OFFLOAD_FAIL; 712 } 713 if (const char *EnvStr = 
getenv("LIBOMPTARGET_HEAP_SIZE")) { 714 HeapLimit = std::stol(EnvStr); 715 if (cuCtxSetLimit(CU_LIMIT_MALLOC_HEAP_SIZE, HeapLimit) != CUDA_SUCCESS) 716 return OFFLOAD_FAIL; 717 } else { 718 if (cuCtxGetLimit(&HeapLimit, CU_LIMIT_MALLOC_HEAP_SIZE) != CUDA_SUCCESS) 719 return OFFLOAD_FAIL; 720 } 721 722 INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, 723 "Device supports up to %d CUDA blocks and %d threads with a " 724 "warp size of %d\n", 725 DeviceData[DeviceId].BlocksPerGrid, 726 DeviceData[DeviceId].ThreadsPerBlock, DeviceData[DeviceId].WarpSize); 727 INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, 728 "Device heap size is %d Bytes, device stack size is %d Bytes per " 729 "thread\n", 730 (int)HeapLimit, (int)StackLimit); 731 732 // Set default number of teams 733 if (EnvNumTeams > 0) { 734 DP("Default number of teams set according to environment %d\n", 735 EnvNumTeams); 736 DeviceData[DeviceId].NumTeams = EnvNumTeams; 737 } else { 738 DeviceData[DeviceId].NumTeams = DeviceRTLTy::DefaultNumTeams; 739 DP("Default number of teams set according to library's default %d\n", 740 DeviceRTLTy::DefaultNumTeams); 741 } 742 743 if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) { 744 DP("Default number of teams exceeds device limit, capping at %d\n", 745 DeviceData[DeviceId].BlocksPerGrid); 746 DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid; 747 } 748 749 // Set default number of threads 750 DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads; 751 DP("Default number of threads set according to library's default %d\n", 752 DeviceRTLTy::DefaultNumThreads); 753 if (DeviceData[DeviceId].NumThreads > 754 DeviceData[DeviceId].ThreadsPerBlock) { 755 DP("Default number of threads exceeds device limit, capping at %d\n", 756 DeviceData[DeviceId].ThreadsPerBlock); 757 DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].ThreadsPerBlock; 758 } 759 760 return OFFLOAD_SUCCESS; 761 } 762 763 __tgt_target_table *loadBinary(const int DeviceId, 764 const __tgt_device_image *Image) { 765 // Set the context we are using 766 CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); 767 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) 768 return nullptr; 769 770 // Clear the offload table as we are going to create a new one. 771 clearOffloadEntriesTable(DeviceId); 772 773 // Create the module and extract the function pointers. 774 CUmodule Module; 775 DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart)); 776 Err = cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr); 777 if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n")) 778 return nullptr; 779 780 DP("CUDA module successfully loaded!\n"); 781 782 Modules.push_back(Module); 783 784 // Find the symbols in the module by name. 785 const __tgt_offload_entry *HostBegin = Image->EntriesBegin; 786 const __tgt_offload_entry *HostEnd = Image->EntriesEnd; 787 788 std::list<KernelTy> &KernelsList = DeviceData[DeviceId].KernelsList; 789 for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) { 790 if (!E->addr) { 791 // We return nullptr when something like this happens, the host should 792 // have always something in the address to uniquely identify the target 793 // region. 
794 DP("Invalid binary: host entry '<null>' (size = %zd)...\n", E->size); 795 return nullptr; 796 } 797 798 if (E->size) { 799 __tgt_offload_entry Entry = *E; 800 CUdeviceptr CUPtr; 801 size_t CUSize; 802 Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name); 803 // We keep this style here because we need the name 804 if (Err != CUDA_SUCCESS) { 805 REPORT("Loading global '%s' Failed\n", E->name); 806 CUDA_ERR_STRING(Err); 807 return nullptr; 808 } 809 810 if (CUSize != E->size) { 811 DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name, 812 CUSize, E->size); 813 return nullptr; 814 } 815 816 DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n", 817 DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr)); 818 819 Entry.addr = (void *)(CUPtr); 820 821 // Note: In the current implementation declare target variables 822 // can either be link or to. This means that once unified 823 // memory is activated via the requires directive, the variable 824 // can be used directly from the host in both cases. 825 // TODO: when variables types other than to or link are added, 826 // the below condition should be changed to explicitly 827 // check for to and link variables types: 828 // (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags & 829 // OMP_DECLARE_TARGET_LINK || e->flags == OMP_DECLARE_TARGET_TO)) 830 if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) { 831 // If unified memory is present any target link or to variables 832 // can access host addresses directly. There is no longer a 833 // need for device copies. 834 cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *)); 835 DP("Copy linked variable host address (" DPxMOD 836 ") to device address (" DPxMOD ")\n", 837 DPxPTR(*((void **)E->addr)), DPxPTR(CUPtr)); 838 } 839 840 addOffloadEntry(DeviceId, Entry); 841 842 continue; 843 } 844 845 CUfunction Func; 846 Err = cuModuleGetFunction(&Func, Module, E->name); 847 // We keep this style here because we need the name 848 if (Err != CUDA_SUCCESS) { 849 REPORT("Loading '%s' Failed\n", E->name); 850 CUDA_ERR_STRING(Err); 851 return nullptr; 852 } 853 854 DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n", 855 DPxPTR(E - HostBegin), E->name, DPxPTR(Func)); 856 857 // default value GENERIC (in case symbol is missing from cubin file) 858 int8_t ExecModeVal = ExecutionModeType::GENERIC; 859 std::string ExecModeNameStr(E->name); 860 ExecModeNameStr += "_exec_mode"; 861 const char *ExecModeName = ExecModeNameStr.c_str(); 862 863 CUdeviceptr ExecModePtr; 864 size_t CUSize; 865 Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName); 866 if (Err == CUDA_SUCCESS) { 867 if (CUSize != sizeof(int8_t)) { 868 DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n", 869 ExecModeName, CUSize, sizeof(int8_t)); 870 return nullptr; 871 } 872 873 Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize); 874 if (Err != CUDA_SUCCESS) { 875 REPORT("Error when copying data from device to host. 
Pointers: " 876 "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n", 877 DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize); 878 CUDA_ERR_STRING(Err); 879 return nullptr; 880 } 881 882 if (ExecModeVal < 0 || ExecModeVal > 2) { 883 DP("Error wrong exec_mode value specified in cubin file: %d\n", 884 ExecModeVal); 885 return nullptr; 886 } 887 } else { 888 DP("Loading global exec_mode '%s' - symbol missing, using default " 889 "value GENERIC (1)\n", 890 ExecModeName); 891 } 892 893 KernelsList.emplace_back(Func, ExecModeVal); 894 895 __tgt_offload_entry Entry = *E; 896 Entry.addr = &KernelsList.back(); 897 addOffloadEntry(DeviceId, Entry); 898 } 899 900 // send device environment data to the device 901 { 902 omptarget_device_environmentTy DeviceEnv{0}; 903 904 #ifdef OMPTARGET_DEBUG 905 if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) 906 DeviceEnv.debug_level = std::stoi(EnvStr); 907 #endif 908 909 const char *DeviceEnvName = "omptarget_device_environment"; 910 CUdeviceptr DeviceEnvPtr; 911 size_t CUSize; 912 913 Err = cuModuleGetGlobal(&DeviceEnvPtr, &CUSize, Module, DeviceEnvName); 914 if (Err == CUDA_SUCCESS) { 915 if (CUSize != sizeof(DeviceEnv)) { 916 REPORT( 917 "Global device_environment '%s' - size mismatch (%zu != %zu)\n", 918 DeviceEnvName, CUSize, sizeof(int32_t)); 919 CUDA_ERR_STRING(Err); 920 return nullptr; 921 } 922 923 Err = cuMemcpyHtoD(DeviceEnvPtr, &DeviceEnv, CUSize); 924 if (Err != CUDA_SUCCESS) { 925 REPORT("Error when copying data from host to device. Pointers: " 926 "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n", 927 DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize); 928 CUDA_ERR_STRING(Err); 929 return nullptr; 930 } 931 932 DP("Sending global device environment data %zu bytes\n", CUSize); 933 } else { 934 DP("Finding global device environment '%s' - symbol missing.\n", 935 DeviceEnvName); 936 DP("Continue, considering this is a device RTL which does not accept " 937 "environment setting.\n"); 938 } 939 } 940 941 return getOffloadEntriesTable(DeviceId); 942 } 943 944 void *dataAlloc(const int DeviceId, const int64_t Size, 945 const TargetAllocTy Kind) { 946 switch (Kind) { 947 case TARGET_ALLOC_DEFAULT: 948 case TARGET_ALLOC_DEVICE: 949 if (UseMemoryManager) 950 return MemoryManagers[DeviceId]->allocate(Size, nullptr); 951 else 952 return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind); 953 case TARGET_ALLOC_HOST: 954 case TARGET_ALLOC_SHARED: 955 return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind); 956 } 957 958 REPORT("Invalid target data allocation kind or requested allocator not " 959 "implemented yet\n"); 960 961 return nullptr; 962 } 963 964 int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr, 965 const int64_t Size, __tgt_async_info *AsyncInfo) const { 966 assert(AsyncInfo && "AsyncInfo is nullptr"); 967 968 CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); 969 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) 970 return OFFLOAD_FAIL; 971 972 CUstream Stream = getStream(DeviceId, AsyncInfo); 973 974 Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream); 975 if (Err != CUDA_SUCCESS) { 976 DP("Error when copying data from host to device. 
Pointers: host " 977 "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n", 978 DPxPTR(HstPtr), DPxPTR(TgtPtr), Size); 979 CUDA_ERR_STRING(Err); 980 return OFFLOAD_FAIL; 981 } 982 983 return OFFLOAD_SUCCESS; 984 } 985 986 int dataRetrieve(const int DeviceId, void *HstPtr, const void *TgtPtr, 987 const int64_t Size, __tgt_async_info *AsyncInfo) const { 988 assert(AsyncInfo && "AsyncInfo is nullptr"); 989 990 CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); 991 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) 992 return OFFLOAD_FAIL; 993 994 CUstream Stream = getStream(DeviceId, AsyncInfo); 995 996 Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream); 997 if (Err != CUDA_SUCCESS) { 998 DP("Error when copying data from device to host. Pointers: host " 999 "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n", 1000 DPxPTR(HstPtr), DPxPTR(TgtPtr), Size); 1001 CUDA_ERR_STRING(Err); 1002 return OFFLOAD_FAIL; 1003 } 1004 1005 return OFFLOAD_SUCCESS; 1006 } 1007 1008 int dataExchange(int SrcDevId, const void *SrcPtr, int DstDevId, void *DstPtr, 1009 int64_t Size, __tgt_async_info *AsyncInfo) const { 1010 assert(AsyncInfo && "AsyncInfo is nullptr"); 1011 1012 CUresult Err = cuCtxSetCurrent(DeviceData[SrcDevId].Context); 1013 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) 1014 return OFFLOAD_FAIL; 1015 1016 CUstream Stream = getStream(SrcDevId, AsyncInfo); 1017 1018 // If they are two devices, we try peer to peer copy first 1019 if (SrcDevId != DstDevId) { 1020 int CanAccessPeer = 0; 1021 Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId); 1022 if (Err != CUDA_SUCCESS) { 1023 REPORT("Error returned from cuDeviceCanAccessPeer. src = %" PRId32 1024 ", dst = %" PRId32 "\n", 1025 SrcDevId, DstDevId); 1026 CUDA_ERR_STRING(Err); 1027 return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); 1028 } 1029 1030 if (!CanAccessPeer) { 1031 DP("P2P memcpy not supported so fall back to D2D memcpy"); 1032 return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); 1033 } 1034 1035 Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0); 1036 if (Err != CUDA_SUCCESS) { 1037 REPORT("Error returned from cuCtxEnablePeerAccess. src = %" PRId32 1038 ", dst = %" PRId32 "\n", 1039 SrcDevId, DstDevId); 1040 CUDA_ERR_STRING(Err); 1041 return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); 1042 } 1043 1044 Err = cuMemcpyPeerAsync((CUdeviceptr)DstPtr, DeviceData[DstDevId].Context, 1045 (CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context, 1046 Size, Stream); 1047 if (Err == CUDA_SUCCESS) 1048 return OFFLOAD_SUCCESS; 1049 1050 DP("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD 1051 ", src_id =%" PRId32 ", dst_ptr = " DPxMOD ", dst_id =%" PRId32 "\n", 1052 DPxPTR(SrcPtr), SrcDevId, DPxPTR(DstPtr), DstDevId); 1053 CUDA_ERR_STRING(Err); 1054 } 1055 1056 return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); 1057 } 1058 1059 int dataDelete(const int DeviceId, void *TgtPtr) { 1060 if (UseMemoryManager) 1061 return MemoryManagers[DeviceId]->free(TgtPtr); 1062 1063 return DeviceAllocators[DeviceId].free(TgtPtr); 1064 } 1065 1066 int runTargetTeamRegion(const int DeviceId, void *TgtEntryPtr, void **TgtArgs, 1067 ptrdiff_t *TgtOffsets, const int ArgNum, 1068 const int TeamNum, const int ThreadLimit, 1069 const unsigned int LoopTripCount, 1070 __tgt_async_info *AsyncInfo) const { 1071 CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); 1072 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) 1073 return OFFLOAD_FAIL; 1074 1075 // All args are references. 
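    // cuLaunchKernel expects an array of pointers, one per kernel parameter,
    // each pointing at the storage holding that parameter's value. Sketch of
    // what the loop below builds for two arguments (illustrative only):
    //
    //   Ptrs[0] = TgtArgs[0] + TgtOffsets[0]; // adjusted device pointer
    //   Args[0] = &Ptrs[0];                   // address of that pointer
    //   Ptrs[1] = TgtArgs[1] + TgtOffsets[1];
    //   Args[1] = &Ptrs[1];
    //
    // &Args[0] is later passed to cuLaunchKernel as its kernelParams argument.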
    std::vector<void *> Args(ArgNum);
    std::vector<void *> Ptrs(ArgNum);

    for (int I = 0; I < ArgNum; ++I) {
      Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
      Args[I] = &Ptrs[I];
    }

    KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr);

    int CudaThreadsPerBlock;
    if (ThreadLimit > 0) {
      DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
      CudaThreadsPerBlock = ThreadLimit;
      // Add master warp if necessary
      if (KernelInfo->ExecutionMode == GENERIC) {
        DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize);
        CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
      }
    } else {
      DP("Setting CUDA threads per block to default %d\n",
         DeviceData[DeviceId].NumThreads);
      CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads;
    }

    if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) {
      DP("Threads per block capped at device limit %d\n",
         DeviceData[DeviceId].ThreadsPerBlock);
      CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock;
    }

    if (!KernelInfo->MaxThreadsPerBlock) {
      Err = cuFuncGetAttribute(&KernelInfo->MaxThreadsPerBlock,
                               CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                               KernelInfo->Func);
      if (!checkResult(Err, "Error returned from cuFuncGetAttribute\n"))
        return OFFLOAD_FAIL;
    }

    if (KernelInfo->MaxThreadsPerBlock < CudaThreadsPerBlock) {
      DP("Threads per block capped at kernel limit %d\n",
         KernelInfo->MaxThreadsPerBlock);
      CudaThreadsPerBlock = KernelInfo->MaxThreadsPerBlock;
    }

    unsigned int CudaBlocksPerGrid;
    if (TeamNum <= 0) {
      if (LoopTripCount > 0 && EnvNumTeams < 0) {
        if (KernelInfo->ExecutionMode == SPMD) {
          // We have a combined construct, i.e. `target teams distribute
          // parallel for [simd]`. We launch as many teams as needed so that
          // each thread executes one iteration of the loop, rounding up to
          // the nearest integer.
          CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1;
        } else if (KernelInfo->ExecutionMode == GENERIC) {
          // If we reach this point, then we have a non-combined construct, i.e.
          // `teams distribute` with a nested `parallel for` and each team is
          // assigned one iteration of the `distribute` loop. E.g.:
          //
          // #pragma omp target teams distribute
          // for(...loop_tripcount...) {
          //   #pragma omp parallel for
          //   for(...) {}
          // }
          //
          // Threads within a team will execute the iterations of the `parallel`
          // loop.
          CudaBlocksPerGrid = LoopTripCount;
        } else if (KernelInfo->ExecutionMode == SPMD_GENERIC) {
          // If we reach this point, then we are executing a kernel that was
          // transformed from Generic-mode to SPMD-mode. This kernel has
          // SPMD-mode execution, but needs its blocks to be scheduled
          // differently because the current loop trip count only applies to
          // the `teams distribute` region and would create far too few blocks
          // using the regular SPMD-mode method.
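          // Worked example (illustrative): with LoopTripCount = 10000 and
          // CudaThreadsPerBlock = 128, the SPMD path above would launch
          // ((10000 - 1) / 128) + 1 = 79 blocks, whereas the Generic and
          // SPMD-Generic paths launch one block per `distribute` iteration,
          // i.e. 10000 blocks.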
1151 CudaBlocksPerGrid = LoopTripCount; 1152 } else { 1153 REPORT("Unknown execution mode: %d\n", KernelInfo->ExecutionMode); 1154 return OFFLOAD_FAIL; 1155 } 1156 DP("Using %d teams due to loop trip count %" PRIu32 1157 " and number of threads per block %d\n", 1158 CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock); 1159 } else { 1160 DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams); 1161 CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams; 1162 } 1163 } else if (TeamNum > DeviceData[DeviceId].BlocksPerGrid) { 1164 DP("Capping number of teams to team limit %d\n", 1165 DeviceData[DeviceId].BlocksPerGrid); 1166 CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid; 1167 } else { 1168 DP("Using requested number of teams %d\n", TeamNum); 1169 CudaBlocksPerGrid = TeamNum; 1170 } 1171 1172 INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, 1173 "Launching kernel %s with %d blocks and %d threads in %s " 1174 "mode\n", 1175 (getOffloadEntry(DeviceId, TgtEntryPtr)) 1176 ? getOffloadEntry(DeviceId, TgtEntryPtr)->name 1177 : "(null)", 1178 CudaBlocksPerGrid, CudaThreadsPerBlock, 1179 (KernelInfo->ExecutionMode != SPMD 1180 ? (KernelInfo->ExecutionMode == GENERIC ? "Generic" 1181 : "SPMD-Generic") 1182 : "SPMD")); 1183 1184 CUstream Stream = getStream(DeviceId, AsyncInfo); 1185 Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1, 1186 /* gridDimZ */ 1, CudaThreadsPerBlock, 1187 /* blockDimY */ 1, /* blockDimZ */ 1, 1188 /* sharedMemBytes */ 0, Stream, &Args[0], nullptr); 1189 if (!checkResult(Err, "Error returned from cuLaunchKernel\n")) 1190 return OFFLOAD_FAIL; 1191 1192 DP("Launch of entry point at " DPxMOD " successful!\n", 1193 DPxPTR(TgtEntryPtr)); 1194 1195 return OFFLOAD_SUCCESS; 1196 } 1197 1198 int synchronize(const int DeviceId, __tgt_async_info *AsyncInfo) const { 1199 CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue); 1200 CUresult Err = cuStreamSynchronize(Stream); 1201 1202 // Once the stream is synchronized, return it to stream pool and reset 1203 // AsyncInfo. This is to make sure the synchronization only works for its 1204 // own tasks. 1205 StreamManager->returnStream(DeviceId, 1206 reinterpret_cast<CUstream>(AsyncInfo->Queue)); 1207 AsyncInfo->Queue = nullptr; 1208 1209 if (Err != CUDA_SUCCESS) { 1210 DP("Error when synchronizing stream. stream = " DPxMOD 1211 ", async info ptr = " DPxMOD "\n", 1212 DPxPTR(Stream), DPxPTR(AsyncInfo)); 1213 CUDA_ERR_STRING(Err); 1214 } 1215 return (Err == CUDA_SUCCESS) ? 
OFFLOAD_SUCCESS : OFFLOAD_FAIL; 1216 } 1217 1218 void printDeviceInfo(int32_t device_id) { 1219 char TmpChar[1000]; 1220 std::string TmpStr; 1221 size_t TmpSt; 1222 int TmpInt, TmpInt2, TmpInt3; 1223 1224 CUdevice Device; 1225 checkResult(cuDeviceGet(&Device, device_id), 1226 "Error returned from cuCtxGetDevice\n"); 1227 1228 cuDriverGetVersion(&TmpInt); 1229 printf(" CUDA Driver Version: \t\t%d \n", TmpInt); 1230 printf(" CUDA Device Number: \t\t%d \n", device_id); 1231 checkResult(cuDeviceGetName(TmpChar, 1000, Device), 1232 "Error returned from cuDeviceGetName\n"); 1233 printf(" Device Name: \t\t\t%s \n", TmpChar); 1234 checkResult(cuDeviceTotalMem(&TmpSt, Device), 1235 "Error returned from cuDeviceTotalMem\n"); 1236 printf(" Global Memory Size: \t\t%zu bytes \n", TmpSt); 1237 checkResult(cuDeviceGetAttribute( 1238 &TmpInt, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, Device), 1239 "Error returned from cuDeviceGetAttribute\n"); 1240 printf(" Number of Multiprocessors: \t\t%d \n", TmpInt); 1241 checkResult( 1242 cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, Device), 1243 "Error returned from cuDeviceGetAttribute\n"); 1244 printf(" Concurrent Copy and Execution: \t%s \n", BOOL2TEXT(TmpInt)); 1245 checkResult(cuDeviceGetAttribute( 1246 &TmpInt, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, Device), 1247 "Error returned from cuDeviceGetAttribute\n"); 1248 printf(" Total Constant Memory: \t\t%d bytes\n", TmpInt); 1249 checkResult( 1250 cuDeviceGetAttribute( 1251 &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, Device), 1252 "Error returned from cuDeviceGetAttribute\n"); 1253 printf(" Max Shared Memory per Block: \t%d bytes \n", TmpInt); 1254 checkResult( 1255 cuDeviceGetAttribute( 1256 &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, Device), 1257 "Error returned from cuDeviceGetAttribute\n"); 1258 printf(" Registers per Block: \t\t%d \n", TmpInt); 1259 checkResult( 1260 cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device), 1261 "Error returned from cuDeviceGetAttribute\n"); 1262 printf(" Warp Size: \t\t\t\t%d Threads \n", TmpInt); 1263 checkResult(cuDeviceGetAttribute( 1264 &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, Device), 1265 "Error returned from cuDeviceGetAttribute\n"); 1266 printf(" Maximum Threads per Block: \t\t%d \n", TmpInt); 1267 checkResult(cuDeviceGetAttribute( 1268 &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device), 1269 "Error returned from cuDeviceGetAttribute\n"); 1270 checkResult(cuDeviceGetAttribute( 1271 &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, Device), 1272 "Error returned from cuDeviceGetAttribute\n"); 1273 checkResult(cuDeviceGetAttribute( 1274 &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, Device), 1275 "Error returned from cuDeviceGetAttribute\n"); 1276 printf(" Maximum Block Dimensions: \t\t%d, %d, %d \n", TmpInt, TmpInt2, 1277 TmpInt3); 1278 checkResult(cuDeviceGetAttribute( 1279 &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, Device), 1280 "Error returned from cuDeviceGetAttribute\n"); 1281 checkResult(cuDeviceGetAttribute( 1282 &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, Device), 1283 "Error returned from cuDeviceGetAttribute\n"); 1284 checkResult(cuDeviceGetAttribute( 1285 &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, Device), 1286 "Error returned from cuDeviceGetAttribute\n"); 1287 printf(" Maximum Grid Dimensions: \t\t%d x %d x %d \n", TmpInt, TmpInt2, 1288 TmpInt3); 1289 checkResult( 1290 cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_PITCH, Device), 1291 "Error returned from 
cuDeviceGetAttribute\n"); 1292 printf(" Maximum Memory Pitch: \t\t%d bytes \n", TmpInt); 1293 checkResult(cuDeviceGetAttribute( 1294 &TmpInt, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, Device), 1295 "Error returned from cuDeviceGetAttribute\n"); 1296 printf(" Texture Alignment: \t\t\t%d bytes \n", TmpInt); 1297 checkResult( 1298 cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, Device), 1299 "Error returned from cuDeviceGetAttribute\n"); 1300 printf(" Clock Rate: \t\t\t%d kHz\n", TmpInt); 1301 checkResult(cuDeviceGetAttribute( 1302 &TmpInt, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, Device), 1303 "Error returned from cuDeviceGetAttribute\n"); 1304 printf(" Execution Timeout: \t\t\t%s \n", BOOL2TEXT(TmpInt)); 1305 checkResult( 1306 cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_INTEGRATED, Device), 1307 "Error returned from cuDeviceGetAttribute\n"); 1308 printf(" Integrated Device: \t\t\t%s \n", BOOL2TEXT(TmpInt)); 1309 checkResult(cuDeviceGetAttribute( 1310 &TmpInt, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, Device), 1311 "Error returned from cuDeviceGetAttribute\n"); 1312 printf(" Can Map Host Memory: \t\t%s \n", BOOL2TEXT(TmpInt)); 1313 checkResult( 1314 cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, Device), 1315 "Error returned from cuDeviceGetAttribute\n"); 1316 if (TmpInt == CU_COMPUTEMODE_DEFAULT) 1317 TmpStr = "DEFAULT"; 1318 else if (TmpInt == CU_COMPUTEMODE_PROHIBITED) 1319 TmpStr = "PROHIBITED"; 1320 else if (TmpInt == CU_COMPUTEMODE_EXCLUSIVE_PROCESS) 1321 TmpStr = "EXCLUSIVE PROCESS"; 1322 else 1323 TmpStr = "unknown"; 1324 printf(" Compute Mode: \t\t\t%s \n", TmpStr.c_str()); 1325 checkResult(cuDeviceGetAttribute( 1326 &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, Device), 1327 "Error returned from cuDeviceGetAttribute\n"); 1328 printf(" Concurrent Kernels: \t\t%s \n", BOOL2TEXT(TmpInt)); 1329 checkResult( 1330 cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, Device), 1331 "Error returned from cuDeviceGetAttribute\n"); 1332 printf(" ECC Enabled: \t\t\t%s \n", BOOL2TEXT(TmpInt)); 1333 checkResult(cuDeviceGetAttribute( 1334 &TmpInt, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, Device), 1335 "Error returned from cuDeviceGetAttribute\n"); 1336 printf(" Memory Clock Rate: \t\t\t%d kHz\n", TmpInt); 1337 checkResult( 1338 cuDeviceGetAttribute( 1339 &TmpInt, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, Device), 1340 "Error returned from cuDeviceGetAttribute\n"); 1341 printf(" Memory Bus Width: \t\t\t%d bits\n", TmpInt); 1342 checkResult(cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, 1343 Device), 1344 "Error returned from cuDeviceGetAttribute\n"); 1345 printf(" L2 Cache Size: \t\t\t%d bytes \n", TmpInt); 1346 checkResult(cuDeviceGetAttribute( 1347 &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, 1348 Device), 1349 "Error returned from cuDeviceGetAttribute\n"); 1350 printf(" Max Threads Per SMP: \t\t%d \n", TmpInt); 1351 checkResult(cuDeviceGetAttribute( 1352 &TmpInt, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, Device), 1353 "Error returned from cuDeviceGetAttribute\n"); 1354 printf(" Async Engines: \t\t\t%s (%d) \n", BOOL2TEXT(TmpInt), TmpInt); 1355 checkResult(cuDeviceGetAttribute( 1356 &TmpInt, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, Device), 1357 "Error returned from cuDeviceGetAttribute\n"); 1358 printf(" Unified Addressing: \t\t%s \n", BOOL2TEXT(TmpInt)); 1359 checkResult(cuDeviceGetAttribute( 1360 &TmpInt, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, Device), 1361 "Error returned from cuDeviceGetAttribute\n"); 1362 printf(" 
Managed Memory: \t\t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf(" Concurrent Managed Memory: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf(" Preemption Supported: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf(" Cooperative Launch: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf(" Multi-Device Boards: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, Device),
        "Error returned from cuDeviceGetAttribute\n");
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt2, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf(" Compute Capabilities: \t\t%d%d \n", TmpInt, TmpInt2);
  }

  int waitEvent(const int DeviceId, __tgt_async_info *AsyncInfo,
                void *EventPtr) const {
    CUstream Stream = getStream(DeviceId, AsyncInfo);
    CUevent Event = reinterpret_cast<CUevent>(EventPtr);

    // We do not use CU_EVENT_WAIT_DEFAULT here because it is only available
    // from a specific CUDA version onwards, and it is defined as 0x0. Per the
    // CUDA API documentation, earlier versions require this argument to be
    // 0x0 anyway.
    CUresult Err = cuStreamWaitEvent(Stream, Event, 0);
    if (Err != CUDA_SUCCESS) {
      DP("Error when waiting event.
stream = " DPxMOD ", event = " DPxMOD "\n", 1403 DPxPTR(Stream), DPxPTR(Event)); 1404 CUDA_ERR_STRING(Err); 1405 return OFFLOAD_FAIL; 1406 } 1407 1408 return OFFLOAD_SUCCESS; 1409 } 1410 }; 1411 1412 DeviceRTLTy DeviceRTL; 1413 } // namespace 1414 1415 // Exposed library API function 1416 #ifdef __cplusplus 1417 extern "C" { 1418 #endif 1419 1420 int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) { 1421 return elf_check_machine(image, /* EM_CUDA */ 190); 1422 } 1423 1424 int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); } 1425 1426 int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) { 1427 DP("Init requires flags to %" PRId64 "\n", RequiresFlags); 1428 DeviceRTL.setRequiresFlag(RequiresFlags); 1429 return RequiresFlags; 1430 } 1431 1432 int32_t __tgt_rtl_is_data_exchangable(int32_t src_dev_id, int dst_dev_id) { 1433 if (DeviceRTL.isValidDeviceId(src_dev_id) && 1434 DeviceRTL.isValidDeviceId(dst_dev_id)) 1435 return 1; 1436 1437 return 0; 1438 } 1439 1440 int32_t __tgt_rtl_init_device(int32_t device_id) { 1441 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1442 1443 return DeviceRTL.initDevice(device_id); 1444 } 1445 1446 __tgt_target_table *__tgt_rtl_load_binary(int32_t device_id, 1447 __tgt_device_image *image) { 1448 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1449 1450 return DeviceRTL.loadBinary(device_id, image); 1451 } 1452 1453 void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *, 1454 int32_t kind) { 1455 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1456 1457 return DeviceRTL.dataAlloc(device_id, size, (TargetAllocTy)kind); 1458 } 1459 1460 int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr, 1461 int64_t size) { 1462 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1463 1464 __tgt_async_info AsyncInfo; 1465 const int32_t rc = __tgt_rtl_data_submit_async(device_id, tgt_ptr, hst_ptr, 1466 size, &AsyncInfo); 1467 if (rc != OFFLOAD_SUCCESS) 1468 return OFFLOAD_FAIL; 1469 1470 return __tgt_rtl_synchronize(device_id, &AsyncInfo); 1471 } 1472 1473 int32_t __tgt_rtl_data_submit_async(int32_t device_id, void *tgt_ptr, 1474 void *hst_ptr, int64_t size, 1475 __tgt_async_info *async_info_ptr) { 1476 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1477 assert(async_info_ptr && "async_info_ptr is nullptr"); 1478 1479 return DeviceRTL.dataSubmit(device_id, tgt_ptr, hst_ptr, size, 1480 async_info_ptr); 1481 } 1482 1483 int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr, 1484 int64_t size) { 1485 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1486 1487 __tgt_async_info AsyncInfo; 1488 const int32_t rc = __tgt_rtl_data_retrieve_async(device_id, hst_ptr, tgt_ptr, 1489 size, &AsyncInfo); 1490 if (rc != OFFLOAD_SUCCESS) 1491 return OFFLOAD_FAIL; 1492 1493 return __tgt_rtl_synchronize(device_id, &AsyncInfo); 1494 } 1495 1496 int32_t __tgt_rtl_data_retrieve_async(int32_t device_id, void *hst_ptr, 1497 void *tgt_ptr, int64_t size, 1498 __tgt_async_info *async_info_ptr) { 1499 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1500 assert(async_info_ptr && "async_info_ptr is nullptr"); 1501 1502 return DeviceRTL.dataRetrieve(device_id, hst_ptr, tgt_ptr, size, 1503 async_info_ptr); 1504 } 1505 1506 int32_t __tgt_rtl_data_exchange_async(int32_t src_dev_id, void *src_ptr, 1507 int dst_dev_id, void *dst_ptr, 1508 int64_t size, 1509 
__tgt_async_info *AsyncInfo) { 1510 assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid"); 1511 assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid"); 1512 assert(AsyncInfo && "AsyncInfo is nullptr"); 1513 1514 return DeviceRTL.dataExchange(src_dev_id, src_ptr, dst_dev_id, dst_ptr, size, 1515 AsyncInfo); 1516 } 1517 1518 int32_t __tgt_rtl_data_exchange(int32_t src_dev_id, void *src_ptr, 1519 int32_t dst_dev_id, void *dst_ptr, 1520 int64_t size) { 1521 assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid"); 1522 assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid"); 1523 1524 __tgt_async_info AsyncInfo; 1525 const int32_t rc = __tgt_rtl_data_exchange_async( 1526 src_dev_id, src_ptr, dst_dev_id, dst_ptr, size, &AsyncInfo); 1527 if (rc != OFFLOAD_SUCCESS) 1528 return OFFLOAD_FAIL; 1529 1530 return __tgt_rtl_synchronize(src_dev_id, &AsyncInfo); 1531 } 1532 1533 int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) { 1534 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1535 1536 return DeviceRTL.dataDelete(device_id, tgt_ptr); 1537 } 1538 1539 int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr, 1540 void **tgt_args, 1541 ptrdiff_t *tgt_offsets, 1542 int32_t arg_num, int32_t team_num, 1543 int32_t thread_limit, 1544 uint64_t loop_tripcount) { 1545 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1546 1547 __tgt_async_info AsyncInfo; 1548 const int32_t rc = __tgt_rtl_run_target_team_region_async( 1549 device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num, 1550 thread_limit, loop_tripcount, &AsyncInfo); 1551 if (rc != OFFLOAD_SUCCESS) 1552 return OFFLOAD_FAIL; 1553 1554 return __tgt_rtl_synchronize(device_id, &AsyncInfo); 1555 } 1556 1557 int32_t __tgt_rtl_run_target_team_region_async( 1558 int32_t device_id, void *tgt_entry_ptr, void **tgt_args, 1559 ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num, 1560 int32_t thread_limit, uint64_t loop_tripcount, 1561 __tgt_async_info *async_info_ptr) { 1562 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1563 1564 return DeviceRTL.runTargetTeamRegion( 1565 device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num, 1566 thread_limit, loop_tripcount, async_info_ptr); 1567 } 1568 1569 int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr, 1570 void **tgt_args, ptrdiff_t *tgt_offsets, 1571 int32_t arg_num) { 1572 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1573 1574 __tgt_async_info AsyncInfo; 1575 const int32_t rc = __tgt_rtl_run_target_region_async( 1576 device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, &AsyncInfo); 1577 if (rc != OFFLOAD_SUCCESS) 1578 return OFFLOAD_FAIL; 1579 1580 return __tgt_rtl_synchronize(device_id, &AsyncInfo); 1581 } 1582 1583 int32_t __tgt_rtl_run_target_region_async(int32_t device_id, 1584 void *tgt_entry_ptr, void **tgt_args, 1585 ptrdiff_t *tgt_offsets, 1586 int32_t arg_num, 1587 __tgt_async_info *async_info_ptr) { 1588 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1589 1590 return __tgt_rtl_run_target_team_region_async( 1591 device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, 1592 /* team num*/ 1, /* thread_limit */ 1, /* loop_tripcount */ 0, 1593 async_info_ptr); 1594 } 1595 1596 int32_t __tgt_rtl_synchronize(int32_t device_id, 1597 __tgt_async_info *async_info_ptr) { 1598 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id 
is invalid"); 1599 assert(async_info_ptr && "async_info_ptr is nullptr"); 1600 assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr"); 1601 1602 return DeviceRTL.synchronize(device_id, async_info_ptr); 1603 } 1604 1605 void __tgt_rtl_set_info_flag(uint32_t NewInfoLevel) { 1606 std::atomic<uint32_t> &InfoLevel = getInfoLevelInternal(); 1607 InfoLevel.store(NewInfoLevel); 1608 } 1609 1610 void __tgt_rtl_print_device_info(int32_t device_id) { 1611 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1612 DeviceRTL.printDeviceInfo(device_id); 1613 } 1614 1615 int32_t __tgt_rtl_create_event(int32_t device_id, void **event) { 1616 assert(event && "event is nullptr"); 1617 return createEvent(event); 1618 } 1619 1620 int32_t __tgt_rtl_record_event(int32_t device_id, void *event_ptr, 1621 __tgt_async_info *async_info_ptr) { 1622 assert(async_info_ptr && "async_info_ptr is nullptr"); 1623 assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr"); 1624 assert(event_ptr && "event_ptr is nullptr"); 1625 1626 return recordEvent(event_ptr, async_info_ptr); 1627 } 1628 1629 int32_t __tgt_rtl_wait_event(int32_t device_id, void *event_ptr, 1630 __tgt_async_info *async_info_ptr) { 1631 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1632 assert(async_info_ptr && "async_info_ptr is nullptr"); 1633 assert(event_ptr && "event is nullptr"); 1634 1635 return DeviceRTL.waitEvent(device_id, async_info_ptr, event_ptr); 1636 } 1637 1638 int32_t __tgt_rtl_sync_event(int32_t device_id, void *event_ptr) { 1639 assert(event_ptr && "event is nullptr"); 1640 1641 return syncEvent(event_ptr); 1642 } 1643 1644 int32_t __tgt_rtl_destroy_event(int32_t device_id, void *event_ptr) { 1645 assert(event_ptr && "event is nullptr"); 1646 1647 return destroyEvent(event_ptr); 1648 } 1649 1650 #ifdef __cplusplus 1651 } 1652 #endif 1653