1 //===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // RTL for CUDA machine
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include <cassert>
14 #include <cstddef>
15 #include <cuda.h>
16 #include <list>
17 #include <memory>
18 #include <mutex>
19 #include <string>
20 #include <unordered_map>
21 #include <vector>
22 
23 #include "Debug.h"
24 #include "omptargetplugin.h"
25 
26 #define TARGET_NAME CUDA
27 #define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"
28 
29 #include "MemoryManager.h"
30 
// Utility for retrieving and printing the CUDA error string for \p err.
// In OMPTARGET_DEBUG builds with runtime debugging enabled
// (getDebugLevel() > 0) the raw error code is reported even when
// cuGetErrorString cannot resolve it; otherwise only a successfully
// resolved message is printed and unresolvable codes are silently dropped.
#ifdef OMPTARGET_DEBUG
#define CUDA_ERR_STRING(err)                                                   \
  do {                                                                         \
    if (getDebugLevel() > 0) {                                                 \
      const char *errStr = nullptr;                                            \
      CUresult errStr_status = cuGetErrorString(err, &errStr);                 \
      if (errStr_status == CUDA_ERROR_INVALID_VALUE)                           \
        REPORT("Unrecognized CUDA error code: %d\n", err);                     \
      else if (errStr_status == CUDA_SUCCESS)                                  \
        REPORT("CUDA error is: %s\n", errStr);                                 \
      else {                                                                   \
        REPORT("Unresolved CUDA error code: %d\n", err);                       \
        REPORT("Unsuccessful cuGetErrorString return status: %d\n",            \
               errStr_status);                                                 \
      }                                                                        \
    } else {                                                                   \
      const char *errStr = nullptr;                                            \
      CUresult errStr_status = cuGetErrorString(err, &errStr);                 \
      if (errStr_status == CUDA_SUCCESS)                                       \
        REPORT("%s \n", errStr);                                               \
    }                                                                          \
  } while (false)
#else // OMPTARGET_DEBUG
#define CUDA_ERR_STRING(err)                                                   \
  do {                                                                         \
    const char *errStr = nullptr;                                              \
    CUresult errStr_status = cuGetErrorString(err, &errStr);                   \
    if (errStr_status == CUDA_SUCCESS)                                         \
      REPORT("%s \n", errStr);                                                 \
  } while (false)
#endif // OMPTARGET_DEBUG
63 
64 #include "elf_common.h"
65 
/// Keep entries table per device. One instance is created per loaded image
/// (see clearOffloadEntriesTable); Table's begin/end pointers are backed by
/// the Entries vector, so Entries must outlive any use of Table.
struct FuncOrGblEntryTy {
  __tgt_target_table Table;
  std::vector<__tgt_offload_entry> Entries;
};
71 
/// Execution modes a target region may be compiled for. The enumerator
/// values (0, 1, 2) match the `<kernel>_exec_mode` global read back from the
/// cubin at load time (see the 0..2 range check there) and the encoding
/// documented on KernelTy::ExecutionMode.
enum ExecutionModeType {
  SPMD,         // constructors, destructors,
                // combined constructs (`teams distribute parallel for [simd]`)
  GENERIC,      // everything else
  SPMD_GENERIC, // Generic kernel with SPMD execution
  NONE
};
79 
/// Use a single entity to encode a kernel and a set of flags.
struct KernelTy {
  // Handle to the device function backing this target region.
  CUfunction Func;

  // execution mode of kernel
  // 0 - SPMD mode (without master warp)
  // 1 - Generic mode (with master warp)
  // 2 - SPMD mode execution with Generic mode semantics.
  int8_t ExecutionMode;

  /// Maximal number of threads per block for this kernel.
  /// 0 means "not yet queried"; filled in lazily elsewhere.
  int MaxThreadsPerBlock = 0;

  KernelTy(CUfunction _Func, int8_t _ExecutionMode)
      : Func(_Func), ExecutionMode(_ExecutionMode) {}
};
96 
/// Device environment data
/// Manually sync with the deviceRTL side for now, move to a dedicated header
/// file later. Any layout change here must be mirrored there, since this
/// struct is copied to the device verbatim.
struct omptarget_device_environmentTy {
  // Runtime debug level propagated to the device runtime.
  int32_t debug_level;
};
103 
104 namespace {
105 bool checkResult(CUresult Err, const char *ErrMsg) {
106   if (Err == CUDA_SUCCESS)
107     return true;
108 
109   REPORT("%s", ErrMsg);
110   CUDA_ERR_STRING(Err);
111   return false;
112 }
113 
114 int memcpyDtoD(const void *SrcPtr, void *DstPtr, int64_t Size,
115                CUstream Stream) {
116   CUresult Err =
117       cuMemcpyDtoDAsync((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, Size, Stream);
118 
119   if (Err != CUDA_SUCCESS) {
120     DP("Error when copying data from device to device. Pointers: src "
121        "= " DPxMOD ", dst = " DPxMOD ", size = %" PRId64 "\n",
122        DPxPTR(SrcPtr), DPxPTR(DstPtr), Size);
123     CUDA_ERR_STRING(Err);
124     return OFFLOAD_FAIL;
125   }
126 
127   return OFFLOAD_SUCCESS;
128 }
129 
// Structure contains per-device data
struct DeviceDataTy {
  /// List that contains all the kernels.
  std::list<KernelTy> KernelsList;

  // One FuncOrGblEntryTy per loaded image; the back() element is the table
  // currently being populated.
  std::list<FuncOrGblEntryTy> FuncGblEntries;

  // Retained primary context for this device; nullptr until initDevice runs.
  CUcontext Context = nullptr;
  // Device properties
  int ThreadsPerBlock = 0; // hardware max threads per block (possibly capped)
  int BlocksPerGrid = 0;   // hardware max blocks per grid (possibly capped)
  int WarpSize = 0;
  // OpenMP properties
  int NumTeams = 0;   // default number of teams to launch
  int NumThreads = 0; // default number of threads per team
};
146 
147 class StreamManagerTy {
148   int NumberOfDevices;
149   // The initial size of stream pool
150   int EnvNumInitialStreams;
151   // Per-device stream mutex
152   std::vector<std::unique_ptr<std::mutex>> StreamMtx;
153   // Per-device stream Id indicates the next available stream in the pool
154   std::vector<int> NextStreamId;
155   // Per-device stream pool
156   std::vector<std::vector<CUstream>> StreamPool;
157   // Reference to per-device data
158   std::vector<DeviceDataTy> &DeviceData;
159 
160   // If there is no CUstream left in the pool, we will resize the pool to
161   // allocate more CUstream. This function should be called with device mutex,
162   // and we do not resize to smaller one.
163   void resizeStreamPool(const int DeviceId, const size_t NewSize) {
164     std::vector<CUstream> &Pool = StreamPool[DeviceId];
165     const size_t CurrentSize = Pool.size();
166     assert(NewSize > CurrentSize && "new size is not larger than current size");
167 
168     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
169     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) {
170       // We will return if cannot switch to the right context in case of
171       // creating bunch of streams that are not corresponding to the right
172       // device. The offloading will fail later because selected CUstream is
173       // nullptr.
174       return;
175     }
176 
177     Pool.resize(NewSize, nullptr);
178 
179     for (size_t I = CurrentSize; I < NewSize; ++I) {
180       checkResult(cuStreamCreate(&Pool[I], CU_STREAM_NON_BLOCKING),
181                   "Error returned from cuStreamCreate\n");
182     }
183   }
184 
185 public:
186   StreamManagerTy(const int NumberOfDevices,
187                   std::vector<DeviceDataTy> &DeviceData)
188       : NumberOfDevices(NumberOfDevices), EnvNumInitialStreams(32),
189         DeviceData(DeviceData) {
190     StreamPool.resize(NumberOfDevices);
191     NextStreamId.resize(NumberOfDevices);
192     StreamMtx.resize(NumberOfDevices);
193 
194     if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS"))
195       EnvNumInitialStreams = std::stoi(EnvStr);
196 
197     // Initialize the next stream id
198     std::fill(NextStreamId.begin(), NextStreamId.end(), 0);
199 
200     // Initialize stream mutex
201     for (std::unique_ptr<std::mutex> &Ptr : StreamMtx)
202       Ptr = std::make_unique<std::mutex>();
203   }
204 
205   ~StreamManagerTy() {
206     // Destroy streams
207     for (int I = 0; I < NumberOfDevices; ++I) {
208       checkResult(cuCtxSetCurrent(DeviceData[I].Context),
209                   "Error returned from cuCtxSetCurrent\n");
210 
211       for (CUstream &S : StreamPool[I]) {
212         if (S)
213           checkResult(cuStreamDestroy(S),
214                       "Error returned from cuStreamDestroy\n");
215       }
216     }
217   }
218 
219   // Get a CUstream from pool. Per-device next stream id always points to the
220   // next available CUstream. That means, CUstreams [0, id-1] have been
221   // assigned, and [id,] are still available. If there is no CUstream left, we
222   // will ask more CUstreams from CUDA RT. Each time a CUstream is assigned,
223   // the id will increase one.
224   // xxxxxs+++++++++
225   //      ^
226   //      id
227   // After assignment, the pool becomes the following and s is assigned.
228   // xxxxxs+++++++++
229   //       ^
230   //       id
231   CUstream getStream(const int DeviceId) {
232     const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
233     int &Id = NextStreamId[DeviceId];
234     // No CUstream left in the pool, we need to request from CUDA RT
235     if (Id == static_cast<int>(StreamPool[DeviceId].size())) {
236       // By default we double the stream pool every time
237       resizeStreamPool(DeviceId, Id * 2);
238     }
239     return StreamPool[DeviceId][Id++];
240   }
241 
242   // Return a CUstream back to pool. As mentioned above, per-device next
243   // stream is always points to the next available CUstream, so when we return
244   // a CUstream, we need to first decrease the id, and then copy the CUstream
245   // back.
246   // It is worth noting that, the order of streams return might be different
247   // from that they're assigned, that saying, at some point, there might be
248   // two identical CUstreams.
249   // xxax+a+++++
250   //     ^
251   //     id
252   // However, it doesn't matter, because they're always on the two sides of
253   // id. The left one will in the end be overwritten by another CUstream.
254   // Therefore, after several execution, the order of pool might be different
255   // from its initial state.
256   void returnStream(const int DeviceId, CUstream Stream) {
257     const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
258     int &Id = NextStreamId[DeviceId];
259     assert(Id > 0 && "Wrong stream ID");
260     StreamPool[DeviceId][--Id] = Stream;
261   }
262 
263   bool initializeDeviceStreamPool(const int DeviceId) {
264     assert(StreamPool[DeviceId].empty() && "stream pool has been initialized");
265 
266     resizeStreamPool(DeviceId, EnvNumInitialStreams);
267 
268     // Check the size of stream pool
269     if (static_cast<int>(StreamPool[DeviceId].size()) != EnvNumInitialStreams)
270       return false;
271 
272     // Check whether each stream is valid
273     for (CUstream &S : StreamPool[DeviceId])
274       if (!S)
275         return false;
276 
277     return true;
278   }
279 };
280 
281 class DeviceRTLTy {
282   int NumberOfDevices;
283   // OpenMP environment properties
284   int EnvNumTeams;
285   int EnvTeamLimit;
286   int EnvTeamThreadLimit;
287   // OpenMP requires flags
288   int64_t RequiresFlags;
289 
290   static constexpr const int HardTeamLimit = 1U << 16U; // 64k
291   static constexpr const int HardThreadLimit = 1024;
292   static constexpr const int DefaultNumTeams = 128;
293   static constexpr const int DefaultNumThreads = 128;
294 
295   std::unique_ptr<StreamManagerTy> StreamManager;
296   std::vector<DeviceDataTy> DeviceData;
297   std::vector<CUmodule> Modules;
298 
299   /// A class responsible for interacting with device native runtime library to
300   /// allocate and free memory.
301   class CUDADeviceAllocatorTy : public DeviceAllocatorTy {
302     const int DeviceId;
303     const std::vector<DeviceDataTy> &DeviceData;
304     std::unordered_map<void *, TargetAllocTy> HostPinnedAllocs;
305 
306   public:
307     CUDADeviceAllocatorTy(int DeviceId, std::vector<DeviceDataTy> &DeviceData)
308         : DeviceId(DeviceId), DeviceData(DeviceData) {}
309 
310     void *allocate(size_t Size, void *, TargetAllocTy Kind) override {
311       if (Size == 0)
312         return nullptr;
313 
314       CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
315       if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
316         return nullptr;
317 
318       void *MemAlloc = nullptr;
319       switch (Kind) {
320       case TARGET_ALLOC_DEFAULT:
321       case TARGET_ALLOC_DEVICE:
322         CUdeviceptr DevicePtr;
323         Err = cuMemAlloc(&DevicePtr, Size);
324         MemAlloc = (void *)DevicePtr;
325         if (!checkResult(Err, "Error returned from cuMemAlloc\n"))
326           return nullptr;
327         break;
328       case TARGET_ALLOC_HOST:
329         void *HostPtr;
330         Err = cuMemAllocHost(&HostPtr, Size);
331         MemAlloc = HostPtr;
332         if (!checkResult(Err, "Error returned from cuMemAllocHost\n"))
333           return nullptr;
334         HostPinnedAllocs[MemAlloc] = Kind;
335         break;
336       case TARGET_ALLOC_SHARED:
337         CUdeviceptr SharedPtr;
338         Err = cuMemAllocManaged(&SharedPtr, Size, CU_MEM_ATTACH_GLOBAL);
339         MemAlloc = (void *)SharedPtr;
340         if (!checkResult(Err, "Error returned from cuMemAllocManaged\n"))
341           return nullptr;
342         break;
343       }
344 
345       return MemAlloc;
346     }
347 
348     int free(void *TgtPtr) override {
349       CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
350       if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
351         return OFFLOAD_FAIL;
352 
353       // Host pinned memory must be freed differently.
354       TargetAllocTy Kind =
355           (HostPinnedAllocs.find(TgtPtr) == HostPinnedAllocs.end())
356               ? TARGET_ALLOC_DEFAULT
357               : TARGET_ALLOC_HOST;
358       switch (Kind) {
359       case TARGET_ALLOC_DEFAULT:
360       case TARGET_ALLOC_DEVICE:
361       case TARGET_ALLOC_SHARED:
362         Err = cuMemFree((CUdeviceptr)TgtPtr);
363         if (!checkResult(Err, "Error returned from cuMemFree\n"))
364           return OFFLOAD_FAIL;
365         break;
366       case TARGET_ALLOC_HOST:
367         Err = cuMemFreeHost(TgtPtr);
368         if (!checkResult(Err, "Error returned from cuMemFreeHost\n"))
369           return OFFLOAD_FAIL;
370         break;
371       }
372 
373       return OFFLOAD_SUCCESS;
374     }
375   };
376 
377   /// A vector of device allocators
378   std::vector<CUDADeviceAllocatorTy> DeviceAllocators;
379 
380   /// A vector of memory managers. Since the memory manager is non-copyable and
381   // non-removable, we wrap them into std::unique_ptr.
382   std::vector<std::unique_ptr<MemoryManagerTy>> MemoryManagers;
383 
384   /// Whether use memory manager
385   bool UseMemoryManager = true;
386 
387   // Record entry point associated with device
388   void addOffloadEntry(const int DeviceId, const __tgt_offload_entry entry) {
389     FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
390     E.Entries.push_back(entry);
391   }
392 
393   // Return a pointer to the entry associated with the pointer
394   const __tgt_offload_entry *getOffloadEntry(const int DeviceId,
395                                              const void *Addr) const {
396     for (const __tgt_offload_entry &Itr :
397          DeviceData[DeviceId].FuncGblEntries.back().Entries)
398       if (Itr.addr == Addr)
399         return &Itr;
400 
401     return nullptr;
402   }
403 
404   // Return the pointer to the target entries table
405   __tgt_target_table *getOffloadEntriesTable(const int DeviceId) {
406     FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
407 
408     if (E.Entries.empty())
409       return nullptr;
410 
411     // Update table info according to the entries and return the pointer
412     E.Table.EntriesBegin = E.Entries.data();
413     E.Table.EntriesEnd = E.Entries.data() + E.Entries.size();
414 
415     return &E.Table;
416   }
417 
418   // Clear entries table for a device
419   void clearOffloadEntriesTable(const int DeviceId) {
420     DeviceData[DeviceId].FuncGblEntries.emplace_back();
421     FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
422     E.Entries.clear();
423     E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr;
424   }
425 
  // Return the CUstream associated with \p AsyncInfo, borrowing one from the
  // per-device pool on first use and caching it in AsyncInfo->Queue so every
  // subsequent operation on this AsyncInfo runs on the same stream.
  CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfo) const {
    assert(AsyncInfo && "AsyncInfo is nullptr");

    if (!AsyncInfo->Queue)
      AsyncInfo->Queue = StreamManager->getStream(DeviceId);

    return reinterpret_cast<CUstream>(AsyncInfo->Queue);
  }
434 
435 public:
  // This class should not be copied
  DeviceRTLTy(const DeviceRTLTy &) = delete;
  DeviceRTLTy(DeviceRTLTy &&) = delete;

  // Initialize the CUDA driver, count devices, and read OpenMP environment
  // configuration. On any failure NumberOfDevices stays 0 so the plugin
  // simply reports no devices instead of aborting the host program.
  DeviceRTLTy()
      : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1),
        EnvTeamThreadLimit(-1), RequiresFlags(OMP_REQ_UNDEFINED) {

    DP("Start initializing CUDA\n");

    CUresult Err = cuInit(0);
    if (Err == CUDA_ERROR_INVALID_HANDLE) {
      // Can't call cuGetErrorString if dlsym failed
      DP("Failed to load CUDA shared library\n");
      return;
    }
    if (!checkResult(Err, "Error returned from cuInit\n")) {
      return;
    }

    Err = cuDeviceGetCount(&NumberOfDevices);
    if (!checkResult(Err, "Error returned from cuDeviceGetCount\n"))
      return;

    if (NumberOfDevices == 0) {
      DP("There are no devices supporting CUDA.\n");
      return;
    }

    DeviceData.resize(NumberOfDevices);

    // Get environment variables regarding teams
    if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) {
      // OMP_TEAM_LIMIT has been set
      EnvTeamLimit = std::stoi(EnvStr);
      DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
    }
    if (const char *EnvStr = getenv("OMP_TEAMS_THREAD_LIMIT")) {
      // OMP_TEAMS_THREAD_LIMIT has been set
      EnvTeamThreadLimit = std::stoi(EnvStr);
      DP("Parsed OMP_TEAMS_THREAD_LIMIT=%d\n", EnvTeamThreadLimit);
    }
    if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) {
      // OMP_NUM_TEAMS has been set
      EnvNumTeams = std::stoi(EnvStr);
      DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
    }

    // The stream manager and allocators reference DeviceData, which has been
    // sized above and is never resized again.
    StreamManager =
        std::make_unique<StreamManagerTy>(NumberOfDevices, DeviceData);

    for (int I = 0; I < NumberOfDevices; ++I)
      DeviceAllocators.emplace_back(I, DeviceData);

    // Get the size threshold from environment variable
    std::pair<size_t, bool> Res = MemoryManagerTy::getSizeThresholdFromEnv();
    UseMemoryManager = Res.second;
    size_t MemoryManagerThreshold = Res.first;

    if (UseMemoryManager)
      for (int I = 0; I < NumberOfDevices; ++I)
        MemoryManagers.emplace_back(std::make_unique<MemoryManagerTy>(
            DeviceAllocators[I], MemoryManagerThreshold));
  }
500 
501   ~DeviceRTLTy() {
502     // We first destruct memory managers in case that its dependent data are
503     // destroyed before it.
504     for (auto &M : MemoryManagers)
505       M.release();
506 
507     StreamManager = nullptr;
508 
509     for (CUmodule &M : Modules)
510       // Close module
511       if (M)
512         checkResult(cuModuleUnload(M), "Error returned from cuModuleUnload\n");
513 
514     for (DeviceDataTy &D : DeviceData) {
515       // Destroy context
516       if (D.Context) {
517         checkResult(cuCtxSetCurrent(D.Context),
518                     "Error returned from cuCtxSetCurrent\n");
519         CUdevice Device;
520         checkResult(cuCtxGetDevice(&Device),
521                     "Error returned from cuCtxGetDevice\n");
522         checkResult(cuDevicePrimaryCtxRelease(Device),
523                     "Error returned from cuDevicePrimaryCtxRelease\n");
524       }
525     }
526   }
527 
  // Check whether a given DeviceId is valid
  bool isValidDeviceId(const int DeviceId) const {
    return DeviceId >= 0 && DeviceId < NumberOfDevices;
  }

  // Number of CUDA devices detected at construction (0 if cuInit failed).
  int getNumOfDevices() const { return NumberOfDevices; }

  // Record the OpenMP `requires` flags registered by the host runtime.
  void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; }
536 
537   int initDevice(const int DeviceId) {
538     CUdevice Device;
539 
540     DP("Getting device %d\n", DeviceId);
541     CUresult Err = cuDeviceGet(&Device, DeviceId);
542     if (!checkResult(Err, "Error returned from cuDeviceGet\n"))
543       return OFFLOAD_FAIL;
544 
545     // Query the current flags of the primary context and set its flags if
546     // it is inactive
547     unsigned int FormerPrimaryCtxFlags = 0;
548     int FormerPrimaryCtxIsActive = 0;
549     Err = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags,
550                                      &FormerPrimaryCtxIsActive);
551     if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxGetState\n"))
552       return OFFLOAD_FAIL;
553 
554     if (FormerPrimaryCtxIsActive) {
555       DP("The primary context is active, no change to its flags\n");
556       if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) !=
557           CU_CTX_SCHED_BLOCKING_SYNC)
558         DP("Warning the current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n");
559     } else {
560       DP("The primary context is inactive, set its flags to "
561          "CU_CTX_SCHED_BLOCKING_SYNC\n");
562       Err = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC);
563       if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxSetFlags\n"))
564         return OFFLOAD_FAIL;
565     }
566 
567     // Retain the per device primary context and save it to use whenever this
568     // device is selected.
569     Err = cuDevicePrimaryCtxRetain(&DeviceData[DeviceId].Context, Device);
570     if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxRetain\n"))
571       return OFFLOAD_FAIL;
572 
573     Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
574     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
575       return OFFLOAD_FAIL;
576 
577     // Initialize stream pool
578     if (!StreamManager->initializeDeviceStreamPool(DeviceId))
579       return OFFLOAD_FAIL;
580 
581     // Query attributes to determine number of threads/block and blocks/grid.
582     int MaxGridDimX;
583     Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
584                                Device);
585     if (Err != CUDA_SUCCESS) {
586       DP("Error getting max grid dimension, use default value %d\n",
587          DeviceRTLTy::DefaultNumTeams);
588       DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::DefaultNumTeams;
589     } else if (MaxGridDimX <= DeviceRTLTy::HardTeamLimit) {
590       DP("Using %d CUDA blocks per grid\n", MaxGridDimX);
591       DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX;
592     } else {
593       DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping "
594          "at the hard limit\n",
595          MaxGridDimX, DeviceRTLTy::HardTeamLimit);
596       DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::HardTeamLimit;
597     }
598 
599     // We are only exploiting threads along the x axis.
600     int MaxBlockDimX;
601     Err = cuDeviceGetAttribute(&MaxBlockDimX,
602                                CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device);
603     if (Err != CUDA_SUCCESS) {
604       DP("Error getting max block dimension, use default value %d\n",
605          DeviceRTLTy::DefaultNumThreads);
606       DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads;
607     } else {
608       DP("Using %d CUDA threads per block\n", MaxBlockDimX);
609       DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX;
610 
611       if (EnvTeamThreadLimit > 0 &&
612           DeviceData[DeviceId].ThreadsPerBlock > EnvTeamThreadLimit) {
613         DP("Max CUDA threads per block %d exceeds the thread limit %d set by "
614            "OMP_TEAMS_THREAD_LIMIT, capping at the limit\n",
615            DeviceData[DeviceId].ThreadsPerBlock, EnvTeamThreadLimit);
616         DeviceData[DeviceId].ThreadsPerBlock = EnvTeamThreadLimit;
617       }
618       if (DeviceData[DeviceId].ThreadsPerBlock > DeviceRTLTy::HardThreadLimit) {
619         DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
620            "capping at the hard limit\n",
621            DeviceData[DeviceId].ThreadsPerBlock, DeviceRTLTy::HardThreadLimit);
622         DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit;
623       }
624     }
625 
626     // Get and set warp size
627     int WarpSize;
628     Err =
629         cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device);
630     if (Err != CUDA_SUCCESS) {
631       DP("Error getting warp size, assume default value 32\n");
632       DeviceData[DeviceId].WarpSize = 32;
633     } else {
634       DP("Using warp size %d\n", WarpSize);
635       DeviceData[DeviceId].WarpSize = WarpSize;
636     }
637 
638     // Adjust teams to the env variables
639     if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) {
640       DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
641          EnvTeamLimit);
642       DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit;
643     }
644 
645     size_t StackLimit;
646     size_t HeapLimit;
647     if (const char *EnvStr = getenv("LIBOMPTARGET_STACK_SIZE")) {
648       StackLimit = std::stol(EnvStr);
649       if (cuCtxSetLimit(CU_LIMIT_STACK_SIZE, StackLimit) != CUDA_SUCCESS)
650         return OFFLOAD_FAIL;
651     } else {
652       if (cuCtxGetLimit(&StackLimit, CU_LIMIT_STACK_SIZE) != CUDA_SUCCESS)
653         return OFFLOAD_FAIL;
654     }
655     if (const char *EnvStr = getenv("LIBOMPTARGET_HEAP_SIZE")) {
656       HeapLimit = std::stol(EnvStr);
657       if (cuCtxSetLimit(CU_LIMIT_MALLOC_HEAP_SIZE, HeapLimit) != CUDA_SUCCESS)
658         return OFFLOAD_FAIL;
659     } else {
660       if (cuCtxGetLimit(&HeapLimit, CU_LIMIT_MALLOC_HEAP_SIZE) != CUDA_SUCCESS)
661         return OFFLOAD_FAIL;
662     }
663 
664     INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
665          "Device supports up to %d CUDA blocks and %d threads with a "
666          "warp size of %d\n",
667          DeviceData[DeviceId].BlocksPerGrid,
668          DeviceData[DeviceId].ThreadsPerBlock, DeviceData[DeviceId].WarpSize);
669     INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
670          "Device heap size is %d Bytes, device stack size is %d Bytes per "
671          "thread\n",
672          (int)HeapLimit, (int)StackLimit);
673 
674     // Set default number of teams
675     if (EnvNumTeams > 0) {
676       DP("Default number of teams set according to environment %d\n",
677          EnvNumTeams);
678       DeviceData[DeviceId].NumTeams = EnvNumTeams;
679     } else {
680       DeviceData[DeviceId].NumTeams = DeviceRTLTy::DefaultNumTeams;
681       DP("Default number of teams set according to library's default %d\n",
682          DeviceRTLTy::DefaultNumTeams);
683     }
684 
685     if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) {
686       DP("Default number of teams exceeds device limit, capping at %d\n",
687          DeviceData[DeviceId].BlocksPerGrid);
688       DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid;
689     }
690 
691     // Set default number of threads
692     DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads;
693     DP("Default number of threads set according to library's default %d\n",
694        DeviceRTLTy::DefaultNumThreads);
695     if (DeviceData[DeviceId].NumThreads >
696         DeviceData[DeviceId].ThreadsPerBlock) {
697       DP("Default number of threads exceeds device limit, capping at %d\n",
698          DeviceData[DeviceId].ThreadsPerBlock);
699       DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].ThreadsPerBlock;
700     }
701 
702     return OFFLOAD_SUCCESS;
703   }
704 
705   __tgt_target_table *loadBinary(const int DeviceId,
706                                  const __tgt_device_image *Image) {
707     // Set the context we are using
708     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
709     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
710       return nullptr;
711 
712     // Clear the offload table as we are going to create a new one.
713     clearOffloadEntriesTable(DeviceId);
714 
715     // Create the module and extract the function pointers.
716     CUmodule Module;
717     DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart));
718     Err = cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr);
719     if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n"))
720       return nullptr;
721 
722     DP("CUDA module successfully loaded!\n");
723 
724     Modules.push_back(Module);
725 
726     // Find the symbols in the module by name.
727     const __tgt_offload_entry *HostBegin = Image->EntriesBegin;
728     const __tgt_offload_entry *HostEnd = Image->EntriesEnd;
729 
730     std::list<KernelTy> &KernelsList = DeviceData[DeviceId].KernelsList;
731     for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) {
732       if (!E->addr) {
733         // We return nullptr when something like this happens, the host should
734         // have always something in the address to uniquely identify the target
735         // region.
736         DP("Invalid binary: host entry '<null>' (size = %zd)...\n", E->size);
737         return nullptr;
738       }
739 
740       if (E->size) {
741         __tgt_offload_entry Entry = *E;
742         CUdeviceptr CUPtr;
743         size_t CUSize;
744         Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name);
745         // We keep this style here because we need the name
746         if (Err != CUDA_SUCCESS) {
747           REPORT("Loading global '%s' Failed\n", E->name);
748           CUDA_ERR_STRING(Err);
749           return nullptr;
750         }
751 
752         if (CUSize != E->size) {
753           DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name,
754              CUSize, E->size);
755           return nullptr;
756         }
757 
758         DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
759            DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr));
760 
761         Entry.addr = (void *)(CUPtr);
762 
763         // Note: In the current implementation declare target variables
764         // can either be link or to. This means that once unified
765         // memory is activated via the requires directive, the variable
766         // can be used directly from the host in both cases.
767         // TODO: when variables types other than to or link are added,
768         // the below condition should be changed to explicitly
769         // check for to and link variables types:
770         // (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags &
771         // OMP_DECLARE_TARGET_LINK || e->flags == OMP_DECLARE_TARGET_TO))
772         if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
773           // If unified memory is present any target link or to variables
774           // can access host addresses directly. There is no longer a
775           // need for device copies.
776           cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *));
777           DP("Copy linked variable host address (" DPxMOD
778              ") to device address (" DPxMOD ")\n",
779              DPxPTR(*((void **)E->addr)), DPxPTR(CUPtr));
780         }
781 
782         addOffloadEntry(DeviceId, Entry);
783 
784         continue;
785       }
786 
787       CUfunction Func;
788       Err = cuModuleGetFunction(&Func, Module, E->name);
789       // We keep this style here because we need the name
790       if (Err != CUDA_SUCCESS) {
791         REPORT("Loading '%s' Failed\n", E->name);
792         CUDA_ERR_STRING(Err);
793         return nullptr;
794       }
795 
796       DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
797          DPxPTR(E - HostBegin), E->name, DPxPTR(Func));
798 
799       // default value GENERIC (in case symbol is missing from cubin file)
800       int8_t ExecModeVal = ExecutionModeType::GENERIC;
801       std::string ExecModeNameStr(E->name);
802       ExecModeNameStr += "_exec_mode";
803       const char *ExecModeName = ExecModeNameStr.c_str();
804 
805       CUdeviceptr ExecModePtr;
806       size_t CUSize;
807       Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName);
808       if (Err == CUDA_SUCCESS) {
809         if (CUSize != sizeof(int8_t)) {
810           DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
811              ExecModeName, CUSize, sizeof(int8_t));
812           return nullptr;
813         }
814 
815         Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize);
816         if (Err != CUDA_SUCCESS) {
817           REPORT("Error when copying data from device to host. Pointers: "
818                  "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
819                  DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize);
820           CUDA_ERR_STRING(Err);
821           return nullptr;
822         }
823 
824         if (ExecModeVal < 0 || ExecModeVal > 2) {
825           DP("Error wrong exec_mode value specified in cubin file: %d\n",
826              ExecModeVal);
827           return nullptr;
828         }
829       } else {
830         REPORT("Loading global exec_mode '%s' - symbol missing, using default "
831                "value GENERIC (1)\n",
832                ExecModeName);
833         CUDA_ERR_STRING(Err);
834       }
835 
836       KernelsList.emplace_back(Func, ExecModeVal);
837 
838       __tgt_offload_entry Entry = *E;
839       Entry.addr = &KernelsList.back();
840       addOffloadEntry(DeviceId, Entry);
841     }
842 
843     // send device environment data to the device
844     {
845       omptarget_device_environmentTy DeviceEnv{0};
846 
847 #ifdef OMPTARGET_DEBUG
848       if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG"))
849         DeviceEnv.debug_level = std::stoi(EnvStr);
850 #endif
851 
852       const char *DeviceEnvName = "omptarget_device_environment";
853       CUdeviceptr DeviceEnvPtr;
854       size_t CUSize;
855 
856       Err = cuModuleGetGlobal(&DeviceEnvPtr, &CUSize, Module, DeviceEnvName);
857       if (Err == CUDA_SUCCESS) {
858         if (CUSize != sizeof(DeviceEnv)) {
859           REPORT(
860               "Global device_environment '%s' - size mismatch (%zu != %zu)\n",
861               DeviceEnvName, CUSize, sizeof(int32_t));
862           CUDA_ERR_STRING(Err);
863           return nullptr;
864         }
865 
866         Err = cuMemcpyHtoD(DeviceEnvPtr, &DeviceEnv, CUSize);
867         if (Err != CUDA_SUCCESS) {
868           REPORT("Error when copying data from host to device. Pointers: "
869                  "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
870                  DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize);
871           CUDA_ERR_STRING(Err);
872           return nullptr;
873         }
874 
875         DP("Sending global device environment data %zu bytes\n", CUSize);
876       } else {
877         DP("Finding global device environment '%s' - symbol missing.\n",
878            DeviceEnvName);
879         DP("Continue, considering this is a device RTL which does not accept "
880            "environment setting.\n");
881       }
882     }
883 
884     return getOffloadEntriesTable(DeviceId);
885   }
886 
887   void *dataAlloc(const int DeviceId, const int64_t Size,
888                   const TargetAllocTy Kind) {
889     switch (Kind) {
890     case TARGET_ALLOC_DEFAULT:
891     case TARGET_ALLOC_DEVICE:
892       if (UseMemoryManager)
893         return MemoryManagers[DeviceId]->allocate(Size, nullptr);
894       else
895         return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
896     case TARGET_ALLOC_HOST:
897     case TARGET_ALLOC_SHARED:
898       return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
899     }
900 
901     REPORT("Invalid target data allocation kind or requested allocator not "
902            "implemented yet\n");
903 
904     return nullptr;
905   }
906 
907   int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr,
908                  const int64_t Size, __tgt_async_info *AsyncInfo) const {
909     assert(AsyncInfo && "AsyncInfo is nullptr");
910 
911     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
912     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
913       return OFFLOAD_FAIL;
914 
915     CUstream Stream = getStream(DeviceId, AsyncInfo);
916 
917     Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream);
918     if (Err != CUDA_SUCCESS) {
919       DP("Error when copying data from host to device. Pointers: host "
920          "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
921          DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
922       CUDA_ERR_STRING(Err);
923       return OFFLOAD_FAIL;
924     }
925 
926     return OFFLOAD_SUCCESS;
927   }
928 
929   int dataRetrieve(const int DeviceId, void *HstPtr, const void *TgtPtr,
930                    const int64_t Size, __tgt_async_info *AsyncInfo) const {
931     assert(AsyncInfo && "AsyncInfo is nullptr");
932 
933     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
934     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
935       return OFFLOAD_FAIL;
936 
937     CUstream Stream = getStream(DeviceId, AsyncInfo);
938 
939     Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
940     if (Err != CUDA_SUCCESS) {
941       DP("Error when copying data from device to host. Pointers: host "
942          "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
943          DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
944       CUDA_ERR_STRING(Err);
945       return OFFLOAD_FAIL;
946     }
947 
948     return OFFLOAD_SUCCESS;
949   }
950 
951   int dataExchange(int SrcDevId, const void *SrcPtr, int DstDevId, void *DstPtr,
952                    int64_t Size, __tgt_async_info *AsyncInfo) const {
953     assert(AsyncInfo && "AsyncInfo is nullptr");
954 
955     CUresult Err = cuCtxSetCurrent(DeviceData[SrcDevId].Context);
956     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
957       return OFFLOAD_FAIL;
958 
959     CUstream Stream = getStream(SrcDevId, AsyncInfo);
960 
961     // If they are two devices, we try peer to peer copy first
962     if (SrcDevId != DstDevId) {
963       int CanAccessPeer = 0;
964       Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId);
965       if (Err != CUDA_SUCCESS) {
966         REPORT("Error returned from cuDeviceCanAccessPeer. src = %" PRId32
967                ", dst = %" PRId32 "\n",
968                SrcDevId, DstDevId);
969         CUDA_ERR_STRING(Err);
970         return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
971       }
972 
973       if (!CanAccessPeer) {
974         DP("P2P memcpy not supported so fall back to D2D memcpy");
975         return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
976       }
977 
978       Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0);
979       if (Err != CUDA_SUCCESS) {
980         REPORT("Error returned from cuCtxEnablePeerAccess. src = %" PRId32
981                ", dst = %" PRId32 "\n",
982                SrcDevId, DstDevId);
983         CUDA_ERR_STRING(Err);
984         return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
985       }
986 
987       Err = cuMemcpyPeerAsync((CUdeviceptr)DstPtr, DeviceData[DstDevId].Context,
988                               (CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context,
989                               Size, Stream);
990       if (Err == CUDA_SUCCESS)
991         return OFFLOAD_SUCCESS;
992 
993       DP("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD
994          ", src_id =%" PRId32 ", dst_ptr = " DPxMOD ", dst_id =%" PRId32 "\n",
995          DPxPTR(SrcPtr), SrcDevId, DPxPTR(DstPtr), DstDevId);
996       CUDA_ERR_STRING(Err);
997     }
998 
999     return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
1000   }
1001 
1002   int dataDelete(const int DeviceId, void *TgtPtr) {
1003     if (UseMemoryManager)
1004       return MemoryManagers[DeviceId]->free(TgtPtr);
1005 
1006     return DeviceAllocators[DeviceId].free(TgtPtr);
1007   }
1008 
  // Launch the kernel recorded for TgtEntryPtr on DeviceId's stream.
  // TeamNum <= 0 / ThreadLimit <= 0 mean "no explicit user request"; the
  // plugin then derives the launch geometry from device defaults and, when
  // available, the loop trip count. The launch is asynchronous — callers
  // complete it by synchronizing on AsyncInfo.
  int runTargetTeamRegion(const int DeviceId, void *TgtEntryPtr, void **TgtArgs,
                          ptrdiff_t *TgtOffsets, const int ArgNum,
                          const int TeamNum, const int ThreadLimit,
                          const unsigned int LoopTripCount,
                          __tgt_async_info *AsyncInfo) const {
    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    // All args are references.
    std::vector<void *> Args(ArgNum);
    std::vector<void *> Ptrs(ArgNum);

    // Materialize each argument value as base pointer plus offset;
    // cuLaunchKernel receives the addresses of these materialized values.
    for (int I = 0; I < ArgNum; ++I) {
      Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
      Args[I] = &Ptrs[I];
    }

    KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr);

    // Choose threads per block: honor an explicit request, otherwise use the
    // device default.
    int CudaThreadsPerBlock;
    if (ThreadLimit > 0) {
      DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
      CudaThreadsPerBlock = ThreadLimit;
      // Add master warp if necessary
      if (KernelInfo->ExecutionMode == GENERIC) {
        DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize);
        CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
      }
    } else {
      DP("Setting CUDA threads per block to default %d\n",
         DeviceData[DeviceId].NumThreads);
      CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads;
    }

    // Never exceed the device-wide threads-per-block limit.
    if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) {
      DP("Threads per block capped at device limit %d\n",
         DeviceData[DeviceId].ThreadsPerBlock);
      CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock;
    }

    // Lazily query, and cache on KernelInfo, this kernel's own limit, which
    // can be lower than the device limit.
    if (!KernelInfo->MaxThreadsPerBlock) {
      Err = cuFuncGetAttribute(&KernelInfo->MaxThreadsPerBlock,
                               CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                               KernelInfo->Func);
      if (!checkResult(Err, "Error returned from cuFuncGetAttribute\n"))
        return OFFLOAD_FAIL;
    }

    if (KernelInfo->MaxThreadsPerBlock < CudaThreadsPerBlock) {
      DP("Threads per block capped at kernel limit %d\n",
         KernelInfo->MaxThreadsPerBlock);
      CudaThreadsPerBlock = KernelInfo->MaxThreadsPerBlock;
    }

    // Choose the number of blocks (teams). With no explicit team count, use
    // the trip-count heuristic unless the environment forced a team count
    // (EnvNumTeams >= 0 disables the heuristic).
    unsigned int CudaBlocksPerGrid;
    if (TeamNum <= 0) {
      if (LoopTripCount > 0 && EnvNumTeams < 0) {
        if (KernelInfo->ExecutionMode == SPMD) {
          // We have a combined construct, i.e. `target teams distribute
          // parallel for [simd]`. We launch so many teams so that each thread
          // will execute one iteration of the loop. round up to the nearest
          // integer
          CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1;
        } else if (KernelInfo->ExecutionMode == GENERIC) {
          // If we reach this point, then we have a non-combined construct, i.e.
          // `teams distribute` with a nested `parallel for` and each team is
          // assigned one iteration of the `distribute` loop. E.g.:
          //
          // #pragma omp target teams distribute
          // for(...loop_tripcount...) {
          //   #pragma omp parallel for
          //   for(...) {}
          // }
          //
          // Threads within a team will execute the iterations of the `parallel`
          // loop.
          CudaBlocksPerGrid = LoopTripCount;
        } else if (KernelInfo->ExecutionMode == SPMD_GENERIC) {
          // If we reach this point, then we are executing a kernel that was
          // transformed from Generic-mode to SPMD-mode. This kernel has
          // SPMD-mode execution, but needs its blocks to be scheduled
          // differently because the current loop trip count only applies to the
          // `teams distribute` region and will create far too few blocks using
          // the regular SPMD-mode method.
          CudaBlocksPerGrid = LoopTripCount;
        } else {
          REPORT("Unknown execution mode: %d\n", KernelInfo->ExecutionMode);
          return OFFLOAD_FAIL;
        }
        DP("Using %d teams due to loop trip count %" PRIu32
           " and number of threads per block %d\n",
           CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock);
      } else {
        DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams);
        CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams;
      }
    } else if (TeamNum > DeviceData[DeviceId].BlocksPerGrid) {
      DP("Capping number of teams to team limit %d\n",
         DeviceData[DeviceId].BlocksPerGrid);
      CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid;
    } else {
      DP("Using requested number of teams %d\n", TeamNum);
      CudaBlocksPerGrid = TeamNum;
    }

    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
         "Launching kernel %s with %d blocks and %d threads in %s "
         "mode\n",
         (getOffloadEntry(DeviceId, TgtEntryPtr))
             ? getOffloadEntry(DeviceId, TgtEntryPtr)->name
             : "(null)",
         CudaBlocksPerGrid, CudaThreadsPerBlock,
         (KernelInfo->ExecutionMode != SPMD
              ? (KernelInfo->ExecutionMode == GENERIC ? "Generic"
                                                      : "SPMD-Generic")
              : "SPMD"));

    // 1-D launch: only the x dimensions of the grid and block are used.
    CUstream Stream = getStream(DeviceId, AsyncInfo);
    Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1,
                         /* gridDimZ */ 1, CudaThreadsPerBlock,
                         /* blockDimY */ 1, /* blockDimZ */ 1,
                         /* sharedMemBytes */ 0, Stream, &Args[0], nullptr);
    if (!checkResult(Err, "Error returned from cuLaunchKernel\n"))
      return OFFLOAD_FAIL;

    DP("Launch of entry point at " DPxMOD " successful!\n",
       DPxPTR(TgtEntryPtr));

    return OFFLOAD_SUCCESS;
  }
1140 
1141   int synchronize(const int DeviceId, __tgt_async_info *AsyncInfo) const {
1142     CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue);
1143     CUresult Err = cuStreamSynchronize(Stream);
1144 
1145     // Once the stream is synchronized, return it to stream pool and reset
1146     // AsyncInfo. This is to make sure the synchronization only works for its
1147     // own tasks.
1148     StreamManager->returnStream(DeviceId,
1149                                 reinterpret_cast<CUstream>(AsyncInfo->Queue));
1150     AsyncInfo->Queue = nullptr;
1151 
1152     if (Err != CUDA_SUCCESS) {
1153       DP("Error when synchronizing stream. stream = " DPxMOD
1154          ", async info ptr = " DPxMOD "\n",
1155          DPxPTR(Stream), DPxPTR(AsyncInfo));
1156       CUDA_ERR_STRING(Err);
1157     }
1158     return (Err == CUDA_SUCCESS) ? OFFLOAD_SUCCESS : OFFLOAD_FAIL;
1159   }
1160 };
1161 
// Single plugin-wide RTL instance backing all the extern "C" entry points
// below; constructed when the plugin library is loaded.
DeviceRTLTy DeviceRTL;
1163 } // namespace
1164 
1165 // Exposed library API function
1166 #ifdef __cplusplus
1167 extern "C" {
1168 #endif
1169 
1170 int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
1171   return elf_check_machine(image, /* EM_CUDA */ 190);
1172 }
1173 
1174 int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); }
1175 
// Record the OpenMP `requires` flags (e.g. unified shared memory) so the
// plugin can honor them later (see the loadBinary handling of
// OMP_REQ_UNIFIED_SHARED_MEMORY). Returns the flags unchanged.
int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
  DP("Init requires flags to %" PRId64 "\n", RequiresFlags);
  DeviceRTL.setRequiresFlag(RequiresFlags);
  return RequiresFlags;
}
1181 
1182 int32_t __tgt_rtl_is_data_exchangable(int32_t src_dev_id, int dst_dev_id) {
1183   if (DeviceRTL.isValidDeviceId(src_dev_id) &&
1184       DeviceRTL.isValidDeviceId(dst_dev_id))
1185     return 1;
1186 
1187   return 0;
1188 }
1189 
// Initialize per-device state for device_id (delegates to the RTL instance).
int32_t __tgt_rtl_init_device(int32_t device_id) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.initDevice(device_id);
}
1195 
// Load a device image onto device_id and return its offload entries table,
// or nullptr on failure (delegates to the RTL instance).
__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
                                          __tgt_device_image *image) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.loadBinary(device_id, image);
}
1202 
1203 void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *,
1204                            int32_t kind) {
1205   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1206 
1207   return DeviceRTL.dataAlloc(device_id, size, (TargetAllocTy)kind);
1208 }
1209 
1210 int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
1211                               int64_t size) {
1212   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1213 
1214   __tgt_async_info AsyncInfo;
1215   const int32_t rc = __tgt_rtl_data_submit_async(device_id, tgt_ptr, hst_ptr,
1216                                                  size, &AsyncInfo);
1217   if (rc != OFFLOAD_SUCCESS)
1218     return OFFLOAD_FAIL;
1219 
1220   return __tgt_rtl_synchronize(device_id, &AsyncInfo);
1221 }
1222 
// Asynchronous host-to-device copy; completion is observed via a later
// __tgt_rtl_synchronize on the same async_info_ptr.
int32_t __tgt_rtl_data_submit_async(int32_t device_id, void *tgt_ptr,
                                    void *hst_ptr, int64_t size,
                                    __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");

  return DeviceRTL.dataSubmit(device_id, tgt_ptr, hst_ptr, size,
                              async_info_ptr);
}
1232 
1233 int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
1234                                 int64_t size) {
1235   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1236 
1237   __tgt_async_info AsyncInfo;
1238   const int32_t rc = __tgt_rtl_data_retrieve_async(device_id, hst_ptr, tgt_ptr,
1239                                                    size, &AsyncInfo);
1240   if (rc != OFFLOAD_SUCCESS)
1241     return OFFLOAD_FAIL;
1242 
1243   return __tgt_rtl_synchronize(device_id, &AsyncInfo);
1244 }
1245 
// Asynchronous device-to-host copy; completion is observed via a later
// __tgt_rtl_synchronize on the same async_info_ptr.
int32_t __tgt_rtl_data_retrieve_async(int32_t device_id, void *hst_ptr,
                                      void *tgt_ptr, int64_t size,
                                      __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");

  return DeviceRTL.dataRetrieve(device_id, hst_ptr, tgt_ptr, size,
                                async_info_ptr);
}
1255 
1256 int32_t __tgt_rtl_data_exchange_async(int32_t src_dev_id, void *src_ptr,
1257                                       int dst_dev_id, void *dst_ptr,
1258                                       int64_t size,
1259                                       __tgt_async_info *AsyncInfo) {
1260   assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
1261   assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");
1262   assert(AsyncInfo && "AsyncInfo is nullptr");
1263 
1264   return DeviceRTL.dataExchange(src_dev_id, src_ptr, dst_dev_id, dst_ptr, size,
1265                                 AsyncInfo);
1266 }
1267 
1268 int32_t __tgt_rtl_data_exchange(int32_t src_dev_id, void *src_ptr,
1269                                 int32_t dst_dev_id, void *dst_ptr,
1270                                 int64_t size) {
1271   assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
1272   assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");
1273 
1274   __tgt_async_info AsyncInfo;
1275   const int32_t rc = __tgt_rtl_data_exchange_async(
1276       src_dev_id, src_ptr, dst_dev_id, dst_ptr, size, &AsyncInfo);
1277   if (rc != OFFLOAD_SUCCESS)
1278     return OFFLOAD_FAIL;
1279 
1280   return __tgt_rtl_synchronize(src_dev_id, &AsyncInfo);
1281 }
1282 
// Free a target allocation previously returned by __tgt_rtl_data_alloc.
int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.dataDelete(device_id, tgt_ptr);
}
1288 
1289 int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
1290                                          void **tgt_args,
1291                                          ptrdiff_t *tgt_offsets,
1292                                          int32_t arg_num, int32_t team_num,
1293                                          int32_t thread_limit,
1294                                          uint64_t loop_tripcount) {
1295   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1296 
1297   __tgt_async_info AsyncInfo;
1298   const int32_t rc = __tgt_rtl_run_target_team_region_async(
1299       device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
1300       thread_limit, loop_tripcount, &AsyncInfo);
1301   if (rc != OFFLOAD_SUCCESS)
1302     return OFFLOAD_FAIL;
1303 
1304   return __tgt_rtl_synchronize(device_id, &AsyncInfo);
1305 }
1306 
// Asynchronously launch the kernel for tgt_entry_ptr with the given team and
// thread requests; geometry selection happens in runTargetTeamRegion.
int32_t __tgt_rtl_run_target_team_region_async(
    int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
    ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
    int32_t thread_limit, uint64_t loop_tripcount,
    __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.runTargetTeamRegion(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
      thread_limit, loop_tripcount, async_info_ptr);
}
1318 
1319 int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
1320                                     void **tgt_args, ptrdiff_t *tgt_offsets,
1321                                     int32_t arg_num) {
1322   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1323 
1324   __tgt_async_info AsyncInfo;
1325   const int32_t rc = __tgt_rtl_run_target_region_async(
1326       device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, &AsyncInfo);
1327   if (rc != OFFLOAD_SUCCESS)
1328     return OFFLOAD_FAIL;
1329 
1330   return __tgt_rtl_synchronize(device_id, &AsyncInfo);
1331 }
1332 
// Non-teams launch is implemented as a teams launch with one team, a thread
// limit of one, and no loop trip count.
int32_t __tgt_rtl_run_target_region_async(int32_t device_id,
                                          void *tgt_entry_ptr, void **tgt_args,
                                          ptrdiff_t *tgt_offsets,
                                          int32_t arg_num,
                                          __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return __tgt_rtl_run_target_team_region_async(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num,
      /* team num*/ 1, /* thread_limit */ 1, /* loop_tripcount */ 0,
      async_info_ptr);
}
1345 
// Block until all operations queued on async_info_ptr's stream complete; the
// device-side synchronize() also recycles the stream and clears Queue.
int32_t __tgt_rtl_synchronize(int32_t device_id,
                              __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");
  assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr");

  return DeviceRTL.synchronize(device_id, async_info_ptr);
}
1354 
1355 void __tgt_rtl_set_info_flag(uint32_t NewInfoLevel) {
1356   std::atomic<uint32_t> &InfoLevel = getInfoLevelInternal();
1357   InfoLevel.store(NewInfoLevel);
1358 }
1359 
1360 #ifdef __cplusplus
1361 }
1362 #endif
1363