//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// RTL for CUDA machine
//
//===----------------------------------------------------------------------===//

#include <cassert>
#include <cstddef>
#include <cuda.h>
#include <list>
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>
#include <vector>

#include "Debug.h"
#include "DeviceEnvironment.h"
#include "omptargetplugin.h"

#define TARGET_NAME CUDA
#define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"

#include "MemoryManager.h"

#include "llvm/Frontend/OpenMP/OMPConstants.h"

// Utility for retrieving and printing CUDA error string.
#ifdef OMPTARGET_DEBUG
#define CUDA_ERR_STRING(err)                                                   \
  do {                                                                         \
    if (getDebugLevel() > 0) {                                                 \
      const char *errStr = nullptr;                                            \
      CUresult errStr_status = cuGetErrorString(err, &errStr);                 \
      if (errStr_status == CUDA_ERROR_INVALID_VALUE)                           \
        REPORT("Unrecognized CUDA error code: %d\n", err);                     \
      else if (errStr_status == CUDA_SUCCESS)                                  \
        REPORT("CUDA error is: %s\n", errStr);                                 \
      else {                                                                   \
        REPORT("Unresolved CUDA error code: %d\n", err);                       \
        REPORT("Unsuccessful cuGetErrorString return status: %d\n",            \
               errStr_status);                                                 \
      }                                                                        \
    } else {                                                                   \
      const char *errStr = nullptr;                                            \
      CUresult errStr_status = cuGetErrorString(err, &errStr);                 \
      if (errStr_status == CUDA_SUCCESS)                                       \
        REPORT("%s\n", errStr);                                                \
    }                                                                          \
  } while (false)
#else // OMPTARGET_DEBUG
#define CUDA_ERR_STRING(err)                                                   \
  do {                                                                         \
    const char *errStr = nullptr;                                              \
    CUresult errStr_status = cuGetErrorString(err, &errStr);                   \
    if (errStr_status == CUDA_SUCCESS)                                         \
      REPORT("%s\n", errStr);                                                  \
  } while (false)
#endif // OMPTARGET_DEBUG

#define BOOL2TEXT(b) ((b) ? "Yes" : "No")

#include "elf_common.h"

/// Keep entries table per device.
struct FuncOrGblEntryTy {
  __tgt_target_table Table;
  std::vector<__tgt_offload_entry> Entries;
};

/// Use a single entity to encode a kernel and a set of flags.
struct KernelTy {
  CUfunction Func;

  /// Execution mode of the kernel.
  llvm::omp::OMPTgtExecModeFlags ExecutionMode;

  /// Maximal number of threads per block for this kernel.
  int MaxThreadsPerBlock = 0;

  KernelTy(CUfunction _Func, llvm::omp::OMPTgtExecModeFlags _ExecutionMode)
      : Func(_Func), ExecutionMode(_ExecutionMode) {}
};

namespace {
bool checkResult(CUresult Err, const char *ErrMsg) {
  if (Err == CUDA_SUCCESS)
    return true;

  REPORT("%s", ErrMsg);
  CUDA_ERR_STRING(Err);
  return false;
}
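
// A minimal usage sketch of the error-handling pattern above (the caller,
// stream, and message are illustrative, not taken from this file):
//
//   CUstream S;
//   CUresult Err = cuStreamCreate(&S, CU_STREAM_NON_BLOCKING);
//   if (!checkResult(Err, "Error returned from cuStreamCreate\n"))
//     return OFFLOAD_FAIL; // checkResult has already printed the CUDA error.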

int memcpyDtoD(const void *SrcPtr, void *DstPtr, int64_t Size,
               CUstream Stream) {
  CUresult Err =
      cuMemcpyDtoDAsync((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, Size, Stream);

  if (Err != CUDA_SUCCESS) {
    DP("Error when copying data from device to device. Pointers: src "
       "= " DPxMOD ", dst = " DPxMOD ", size = %" PRId64 "\n",
       DPxPTR(SrcPtr), DPxPTR(DstPtr), Size);
    CUDA_ERR_STRING(Err);
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

int recordEvent(void *EventPtr, __tgt_async_info *AsyncInfo) {
  CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue);
  CUevent Event = reinterpret_cast<CUevent>(EventPtr);

  CUresult Err = cuEventRecord(Event, Stream);
  if (Err != CUDA_SUCCESS) {
    DP("Error when recording event. stream = " DPxMOD ", event = " DPxMOD "\n",
       DPxPTR(Stream), DPxPTR(Event));
    CUDA_ERR_STRING(Err);
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

int syncEvent(void *EventPtr) {
  CUevent Event = reinterpret_cast<CUevent>(EventPtr);

  CUresult Err = cuEventSynchronize(Event);
  if (Err != CUDA_SUCCESS) {
    DP("Error when syncing event = " DPxMOD "\n", DPxPTR(Event));
    CUDA_ERR_STRING(Err);
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}
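
// A minimal sketch of how recordEvent and syncEvent pair up (the Event and
// AsyncInfo objects are hypothetical; the event is assumed to come from
// cuEventCreate or the event pool below):
//
//   if (recordEvent(Event, AsyncInfo) == OFFLOAD_SUCCESS)
//     (void)syncEvent(Event); // Block until all work recorded so far is done.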

/// Structure containing per-device data.
struct DeviceDataTy {
  /// List that contains all the kernels.
  std::list<KernelTy> KernelsList;

  std::list<FuncOrGblEntryTy> FuncGblEntries;

  CUcontext Context = nullptr;
  // Device properties
  int ThreadsPerBlock = 0;
  int BlocksPerGrid = 0;
  int WarpSize = 0;
  // OpenMP properties
  int NumTeams = 0;
  int NumThreads = 0;
};

/// Resource allocator where \p T is the resource type.
/// Functions \p create and \p destroy return OFFLOAD_SUCCESS or OFFLOAD_FAIL
/// accordingly. The implementation should not raise any exception.
template <typename T> class AllocatorTy {
public:
  /// Create a resource and assign it to \p R.
  int create(T &R) noexcept;
  /// Destroy the resource.
  int destroy(T) noexcept;
};

/// Allocator for CUstream.
template <> class AllocatorTy<CUstream> {
  CUcontext Context;

public:
  AllocatorTy(CUcontext C) noexcept : Context(C) {}

  /// See AllocatorTy<T>::create.
  int create(CUstream &Stream) noexcept {
    if (!checkResult(cuCtxSetCurrent(Context),
                     "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    if (!checkResult(cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING),
                     "Error returned from cuStreamCreate\n"))
      return OFFLOAD_FAIL;

    return OFFLOAD_SUCCESS;
  }

  /// See AllocatorTy<T>::destroy.
  int destroy(CUstream Stream) noexcept {
    if (!checkResult(cuCtxSetCurrent(Context),
                     "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;
    if (!checkResult(cuStreamDestroy(Stream),
                     "Error returned from cuStreamDestroy\n"))
      return OFFLOAD_FAIL;

    return OFFLOAD_SUCCESS;
  }
};

/// Allocator for CUevent.
template <> class AllocatorTy<CUevent> {
public:
  /// See AllocatorTy<T>::create.
  int create(CUevent &Event) noexcept {
    if (!checkResult(cuEventCreate(&Event, CU_EVENT_DEFAULT),
                     "Error returned from cuEventCreate\n"))
      return OFFLOAD_FAIL;

    return OFFLOAD_SUCCESS;
  }

  /// See AllocatorTy<T>::destroy.
  int destroy(CUevent Event) noexcept {
    if (!checkResult(cuEventDestroy(Event),
                     "Error returned from cuEventDestroy\n"))
      return OFFLOAD_FAIL;

    return OFFLOAD_SUCCESS;
  }
};
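
// A minimal sketch of the allocator contract defined above (hypothetical
// caller; error handling elided):
//
//   AllocatorTy<CUevent> EventAllocator;
//   CUevent E;
//   if (EventAllocator.create(E) == OFFLOAD_SUCCESS) {
//     // ... use E ...
//     (void)EventAllocator.destroy(E);
//   }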

/// A generic pool of resources where \p T is the resource type.
/// \p T should be copyable as the object is stored in \p std::vector .
template <typename T> class ResourcePoolTy {
  /// Index of the next available resource.
  size_t Next = 0;
  /// Mutex to guard the pool.
  std::mutex Mutex;
  /// Pool of resources.
  std::vector<T> Resources;
  /// A reference to the corresponding allocator.
  AllocatorTy<T> Allocator;

  /// If `Resources` is used up, we will fill in more resources. It assumes
  /// that the new size \p Size is always larger than the current size.
  bool resize(size_t Size) {
    auto CurSize = Resources.size();
    assert(Size > CurSize && "Unexpected smaller size");
    Resources.reserve(Size);
    for (auto I = CurSize; I < Size; ++I) {
      T NewItem;
      int Ret = Allocator.create(NewItem);
      if (Ret != OFFLOAD_SUCCESS)
        return false;
      Resources.push_back(NewItem);
    }
    return true;
  }

public:
  ResourcePoolTy(AllocatorTy<T> &&A, size_t Size = 0) noexcept
      : Allocator(std::move(A)) {
    if (Size)
      (void)resize(Size);
  }

  ~ResourcePoolTy() noexcept { clear(); }

  /// Get a resource from the pool. `Next` always points to the next available
  /// resource. That means, `[0, Next-1]` have been assigned, and `[Next,]` are
  /// still available. If there is no resource left, we will ask for more. Each
  /// time a resource is assigned, `Next` increases by one.
  /// xxxxxs+++++++++
  ///      ^
  ///      Next
  /// After assignment, the pool becomes the following and s is assigned.
  /// xxxxxs+++++++++
  ///       ^
  ///       Next
  int acquire(T &R) noexcept {
    std::lock_guard<std::mutex> LG(Mutex);
    if (Next == Resources.size()) {
      auto NewSize = Resources.size() ? Resources.size() * 2 : 1;
      if (!resize(NewSize))
        return OFFLOAD_FAIL;
    }

    assert(Next < Resources.size());

    R = Resources[Next++];

    return OFFLOAD_SUCCESS;
  }

  /// Return the resource back to the pool. When we return a resource, we need
  /// to first decrease `Next`, and then copy the resource back. It is worth
  /// noting that the order in which resources are returned might differ from
  /// the order in which they were assigned, so at some point there might be
  /// two identical resources:
  /// xxax+a+++++
  ///     ^
  ///     Next
  /// However, it doesn't matter, because they're always on the two sides of
  /// `Next`. The left one will in the end be overwritten by another resource.
  /// Therefore, after several executions, the order of the pool might differ
  /// from its initial state.
  void release(T R) noexcept {
    std::lock_guard<std::mutex> LG(Mutex);
    Resources[--Next] = R;
  }

  /// Release all stored resources and clear the pool.
  /// Note: This function is not thread safe. Be sure to guard it if necessary.
  void clear() noexcept {
    for (auto &R : Resources)
      (void)Allocator.destroy(R);
    Resources.clear();
  }
};
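
// A minimal sketch of the pool's acquire/release cycle (assumes a valid
// CUcontext named Ctx; the initial size is illustrative):
//
//   ResourcePoolTy<CUstream> Pool(AllocatorTy<CUstream>(Ctx), /*Size=*/4);
//   CUstream S;
//   if (Pool.acquire(S) == OFFLOAD_SUCCESS) {
//     // ... enqueue work on S ...
//     Pool.release(S);
//   }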

class DeviceRTLTy {
  int NumberOfDevices;
  // OpenMP environment properties
  int EnvNumTeams;
  int EnvTeamLimit;
  int EnvTeamThreadLimit;
  // OpenMP requires flags
  int64_t RequiresFlags;
  // Amount of dynamic shared memory to use at launch.
  uint64_t DynamicMemorySize;
  // Number of initial streams for each device.
  int NumInitialStreams = 32;

  static constexpr const int HardTeamLimit = 1U << 16U; // 64k
  static constexpr const int HardThreadLimit = 1024;
  static constexpr const int DefaultNumTeams = 128;
  static constexpr const int DefaultNumThreads = 128;

  using StreamPoolTy = ResourcePoolTy<CUstream>;
  std::vector<std::unique_ptr<StreamPoolTy>> StreamPool;

  ResourcePoolTy<CUevent> EventPool;

  std::vector<DeviceDataTy> DeviceData;
  std::vector<CUmodule> Modules;

  /// A class responsible for interacting with device native runtime library to
  /// allocate and free memory.
  class CUDADeviceAllocatorTy : public DeviceAllocatorTy {
    const int DeviceId;
    const std::vector<DeviceDataTy> &DeviceData;
    std::unordered_map<void *, TargetAllocTy> HostPinnedAllocs;

  public:
    CUDADeviceAllocatorTy(int DeviceId, std::vector<DeviceDataTy> &DeviceData)
        : DeviceId(DeviceId), DeviceData(DeviceData) {}

    void *allocate(size_t Size, void *, TargetAllocTy Kind) override {
      if (Size == 0)
        return nullptr;

      CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
      if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
        return nullptr;

      void *MemAlloc = nullptr;
      switch (Kind) {
      case TARGET_ALLOC_DEFAULT:
      case TARGET_ALLOC_DEVICE:
        CUdeviceptr DevicePtr;
        Err = cuMemAlloc(&DevicePtr, Size);
        MemAlloc = (void *)DevicePtr;
        if (!checkResult(Err, "Error returned from cuMemAlloc\n"))
          return nullptr;
        break;
      case TARGET_ALLOC_HOST:
        void *HostPtr;
        Err = cuMemAllocHost(&HostPtr, Size);
        MemAlloc = HostPtr;
        if (!checkResult(Err, "Error returned from cuMemAllocHost\n"))
          return nullptr;
        HostPinnedAllocs[MemAlloc] = Kind;
        break;
      case TARGET_ALLOC_SHARED:
        CUdeviceptr SharedPtr;
        Err = cuMemAllocManaged(&SharedPtr, Size, CU_MEM_ATTACH_GLOBAL);
        MemAlloc = (void *)SharedPtr;
        if (!checkResult(Err, "Error returned from cuMemAllocManaged\n"))
          return nullptr;
        break;
      }

      return MemAlloc;
    }

    int free(void *TgtPtr) override {
      CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
      if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
        return OFFLOAD_FAIL;

      // Host pinned memory must be freed differently.
      TargetAllocTy Kind =
          (HostPinnedAllocs.find(TgtPtr) == HostPinnedAllocs.end())
              ? TARGET_ALLOC_DEFAULT
              : TARGET_ALLOC_HOST;
      switch (Kind) {
      case TARGET_ALLOC_DEFAULT:
      case TARGET_ALLOC_DEVICE:
      case TARGET_ALLOC_SHARED:
        Err = cuMemFree((CUdeviceptr)TgtPtr);
        if (!checkResult(Err, "Error returned from cuMemFree\n"))
          return OFFLOAD_FAIL;
        break;
      case TARGET_ALLOC_HOST:
        Err = cuMemFreeHost(TgtPtr);
        if (!checkResult(Err, "Error returned from cuMemFreeHost\n"))
          return OFFLOAD_FAIL;
        break;
      }

      return OFFLOAD_SUCCESS;
    }
  };

  /// A vector of device allocators.
  std::vector<CUDADeviceAllocatorTy> DeviceAllocators;

  /// A vector of memory managers. Since the memory manager is non-copyable and
  /// non-removable, we wrap them into std::unique_ptr.
  std::vector<std::unique_ptr<MemoryManagerTy>> MemoryManagers;

  /// Whether to use the memory manager.
  bool UseMemoryManager = true;

  // Record entry point associated with device
  void addOffloadEntry(const int DeviceId, const __tgt_offload_entry entry) {
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
    E.Entries.push_back(entry);
  }

  // Return a pointer to the entry associated with the pointer
  const __tgt_offload_entry *getOffloadEntry(const int DeviceId,
                                             const void *Addr) const {
    for (const __tgt_offload_entry &Itr :
         DeviceData[DeviceId].FuncGblEntries.back().Entries)
      if (Itr.addr == Addr)
        return &Itr;

    return nullptr;
  }

  // Return the pointer to the target entries table
  __tgt_target_table *getOffloadEntriesTable(const int DeviceId) {
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();

    if (E.Entries.empty())
      return nullptr;

    // Update table info according to the entries and return the pointer
    E.Table.EntriesBegin = E.Entries.data();
    E.Table.EntriesEnd = E.Entries.data() + E.Entries.size();

    return &E.Table;
  }

  // Clear entries table for a device
  void clearOffloadEntriesTable(const int DeviceId) {
    DeviceData[DeviceId].FuncGblEntries.emplace_back();
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
    E.Entries.clear();
    E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr;
  }

public:
  CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfo) const {
    assert(AsyncInfo && "AsyncInfo is nullptr");

    if (!AsyncInfo->Queue) {
      CUstream S;
      if (StreamPool[DeviceId]->acquire(S) != OFFLOAD_SUCCESS)
        return nullptr;

      AsyncInfo->Queue = S;
    }

    return reinterpret_cast<CUstream>(AsyncInfo->Queue);
  }
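
  // A minimal sketch of how a stream travels through __tgt_async_info (the
  // DeviceRTL instance and device id are hypothetical):
  //
  //   __tgt_async_info AsyncInfo{};
  //   CUstream S = DeviceRTL.getStream(/*DeviceId=*/0, &AsyncInfo);
  //   // Subsequent async operations reuse AsyncInfo.Queue until
  //   // synchronize() returns the stream to the pool and resets it.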

  // This class should not be copied
  DeviceRTLTy(const DeviceRTLTy &) = delete;
  DeviceRTLTy(DeviceRTLTy &&) = delete;

  DeviceRTLTy()
      : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1),
        EnvTeamThreadLimit(-1), RequiresFlags(OMP_REQ_UNDEFINED),
        DynamicMemorySize(0), EventPool(AllocatorTy<CUevent>()) {

    DP("Start initializing CUDA\n");

    CUresult Err = cuInit(0);
    if (Err == CUDA_ERROR_INVALID_HANDLE) {
      // Can't call cuGetErrorString if dlsym failed
      DP("Failed to load CUDA shared library\n");
      return;
    }
    if (!checkResult(Err, "Error returned from cuInit\n")) {
      return;
    }

    Err = cuDeviceGetCount(&NumberOfDevices);
    if (!checkResult(Err, "Error returned from cuDeviceGetCount\n"))
      return;

    if (NumberOfDevices == 0) {
      DP("There are no devices supporting CUDA.\n");
      return;
    }

    DeviceData.resize(NumberOfDevices);
    StreamPool.resize(NumberOfDevices);

    // Get environment variables regarding teams
    if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) {
      // OMP_TEAM_LIMIT has been set
      EnvTeamLimit = std::stoi(EnvStr);
      DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
    }
    if (const char *EnvStr = getenv("OMP_TEAMS_THREAD_LIMIT")) {
      // OMP_TEAMS_THREAD_LIMIT has been set
      EnvTeamThreadLimit = std::stoi(EnvStr);
      DP("Parsed OMP_TEAMS_THREAD_LIMIT=%d\n", EnvTeamThreadLimit);
    }
    if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) {
      // OMP_NUM_TEAMS has been set
      EnvNumTeams = std::stoi(EnvStr);
      DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
    }
    if (const char *EnvStr = getenv("LIBOMPTARGET_SHARED_MEMORY_SIZE")) {
      // LIBOMPTARGET_SHARED_MEMORY_SIZE has been set
      DynamicMemorySize = std::stoi(EnvStr);
      DP("Parsed LIBOMPTARGET_SHARED_MEMORY_SIZE = %" PRIu64 "\n",
         DynamicMemorySize);
    }
    if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS")) {
      // LIBOMPTARGET_NUM_INITIAL_STREAMS has been set
      NumInitialStreams = std::stoi(EnvStr);
      DP("Parsed LIBOMPTARGET_NUM_INITIAL_STREAMS=%d\n", NumInitialStreams);
    }
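
    // Illustrative settings for the variables parsed above (example values
    // only, not defaults):
    //
    //   OMP_NUM_TEAMS=256
    //   OMP_TEAMS_THREAD_LIMIT=512
    //   LIBOMPTARGET_SHARED_MEMORY_SIZE=4096
    //   LIBOMPTARGET_NUM_INITIAL_STREAMS=64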

    for (int I = 0; I < NumberOfDevices; ++I)
      DeviceAllocators.emplace_back(I, DeviceData);

    // Get the size threshold from environment variable
    std::pair<size_t, bool> Res = MemoryManagerTy::getSizeThresholdFromEnv();
    UseMemoryManager = Res.second;
    size_t MemoryManagerThreshold = Res.first;

    if (UseMemoryManager)
      for (int I = 0; I < NumberOfDevices; ++I)
        MemoryManagers.emplace_back(std::make_unique<MemoryManagerTy>(
            DeviceAllocators[I], MemoryManagerThreshold));
  }

  ~DeviceRTLTy() {
    // Release the memory managers first, in case the data they depend on
    // would otherwise be destroyed before them.
    for (auto &M : MemoryManagers)
      M.release();

    for (CUmodule &M : Modules)
      // Close module
      if (M)
        checkResult(cuModuleUnload(M), "Error returned from cuModuleUnload\n");

    for (auto &S : StreamPool)
      S.reset();

    EventPool.clear();

    for (DeviceDataTy &D : DeviceData) {
      // Destroy context
      if (D.Context) {
        checkResult(cuCtxSetCurrent(D.Context),
                    "Error returned from cuCtxSetCurrent\n");
        CUdevice Device;
        checkResult(cuCtxGetDevice(&Device),
                    "Error returned from cuCtxGetDevice\n");
        checkResult(cuDevicePrimaryCtxRelease(Device),
                    "Error returned from cuDevicePrimaryCtxRelease\n");
      }
    }
  }

  // Check whether a given DeviceId is valid
  bool isValidDeviceId(const int DeviceId) const {
    return DeviceId >= 0 && DeviceId < NumberOfDevices;
  }

  int getNumOfDevices() const { return NumberOfDevices; }

  void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; }

  int initDevice(const int DeviceId) {
    CUdevice Device;

    DP("Getting device %d\n", DeviceId);
    CUresult Err = cuDeviceGet(&Device, DeviceId);
    if (!checkResult(Err, "Error returned from cuDeviceGet\n"))
      return OFFLOAD_FAIL;

    // Query the current flags of the primary context and set its flags if
    // it is inactive
    unsigned int FormerPrimaryCtxFlags = 0;
    int FormerPrimaryCtxIsActive = 0;
    Err = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags,
                                     &FormerPrimaryCtxIsActive);
    if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxGetState\n"))
      return OFFLOAD_FAIL;

    if (FormerPrimaryCtxIsActive) {
      DP("The primary context is active, no change to its flags\n");
      if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) !=
          CU_CTX_SCHED_BLOCKING_SYNC)
        DP("Warning: the current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n");
    } else {
      DP("The primary context is inactive, set its flags to "
         "CU_CTX_SCHED_BLOCKING_SYNC\n");
      Err = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC);
      if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxSetFlags\n"))
        return OFFLOAD_FAIL;
    }

    // Retain the per device primary context and save it to use whenever this
    // device is selected.
    Err = cuDevicePrimaryCtxRetain(&DeviceData[DeviceId].Context, Device);
    if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxRetain\n"))
      return OFFLOAD_FAIL;

    Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    // Initialize the stream pool.
    if (!StreamPool[DeviceId])
      StreamPool[DeviceId] = std::make_unique<StreamPoolTy>(
          AllocatorTy<CUstream>(DeviceData[DeviceId].Context),
          NumInitialStreams);

    // Query attributes to determine number of threads/block and blocks/grid.
    int MaxGridDimX;
    Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
                               Device);
    if (Err != CUDA_SUCCESS) {
      DP("Error getting max grid dimension, using default value %d\n",
         DeviceRTLTy::DefaultNumTeams);
      DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::DefaultNumTeams;
    } else if (MaxGridDimX <= DeviceRTLTy::HardTeamLimit) {
      DP("Using %d CUDA blocks per grid\n", MaxGridDimX);
      DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX;
    } else {
      DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping "
         "at the hard limit\n",
         MaxGridDimX, DeviceRTLTy::HardTeamLimit);
      DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::HardTeamLimit;
    }

    // We are only exploiting threads along the x axis.
    int MaxBlockDimX;
    Err = cuDeviceGetAttribute(&MaxBlockDimX,
                               CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device);
    if (Err != CUDA_SUCCESS) {
      DP("Error getting max block dimension, using default value %d\n",
         DeviceRTLTy::DefaultNumThreads);
      DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads;
    } else {
      DP("Using %d CUDA threads per block\n", MaxBlockDimX);
      DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX;

      if (EnvTeamThreadLimit > 0 &&
          DeviceData[DeviceId].ThreadsPerBlock > EnvTeamThreadLimit) {
        DP("Max CUDA threads per block %d exceeds the thread limit %d set by "
           "OMP_TEAMS_THREAD_LIMIT, capping at the limit\n",
           DeviceData[DeviceId].ThreadsPerBlock, EnvTeamThreadLimit);
        DeviceData[DeviceId].ThreadsPerBlock = EnvTeamThreadLimit;
      }
      if (DeviceData[DeviceId].ThreadsPerBlock > DeviceRTLTy::HardThreadLimit) {
        DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
           "capping at the hard limit\n",
           DeviceData[DeviceId].ThreadsPerBlock, DeviceRTLTy::HardThreadLimit);
        DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit;
      }
    }

    // Get and set warp size
    int WarpSize;
    Err =
        cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device);
    if (Err != CUDA_SUCCESS) {
      DP("Error getting warp size, assuming default value 32\n");
      DeviceData[DeviceId].WarpSize = 32;
    } else {
      DP("Using warp size %d\n", WarpSize);
      DeviceData[DeviceId].WarpSize = WarpSize;
    }

    // Adjust teams to the env variables
    if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) {
      DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
         EnvTeamLimit);
      DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit;
    }

    size_t StackLimit;
    size_t HeapLimit;
    if (const char *EnvStr = getenv("LIBOMPTARGET_STACK_SIZE")) {
      StackLimit = std::stol(EnvStr);
      if (cuCtxSetLimit(CU_LIMIT_STACK_SIZE, StackLimit) != CUDA_SUCCESS)
        return OFFLOAD_FAIL;
    } else {
      if (cuCtxGetLimit(&StackLimit, CU_LIMIT_STACK_SIZE) != CUDA_SUCCESS)
        return OFFLOAD_FAIL;
    }
    if (const char *EnvStr = getenv("LIBOMPTARGET_HEAP_SIZE")) {
      HeapLimit = std::stol(EnvStr);
      if (cuCtxSetLimit(CU_LIMIT_MALLOC_HEAP_SIZE, HeapLimit) != CUDA_SUCCESS)
        return OFFLOAD_FAIL;
    } else {
      if (cuCtxGetLimit(&HeapLimit, CU_LIMIT_MALLOC_HEAP_SIZE) != CUDA_SUCCESS)
        return OFFLOAD_FAIL;
    }

    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
         "Device supports up to %d CUDA blocks and %d threads with a "
         "warp size of %d\n",
         DeviceData[DeviceId].BlocksPerGrid,
         DeviceData[DeviceId].ThreadsPerBlock, DeviceData[DeviceId].WarpSize);
    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
         "Device heap size is %d bytes, device stack size is %d bytes per "
         "thread\n",
         (int)HeapLimit, (int)StackLimit);

    // Set default number of teams
    if (EnvNumTeams > 0) {
      DP("Default number of teams set according to environment %d\n",
         EnvNumTeams);
      DeviceData[DeviceId].NumTeams = EnvNumTeams;
    } else {
      DeviceData[DeviceId].NumTeams = DeviceRTLTy::DefaultNumTeams;
      DP("Default number of teams set according to library's default %d\n",
         DeviceRTLTy::DefaultNumTeams);
    }

    if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) {
      DP("Default number of teams exceeds device limit, capping at %d\n",
         DeviceData[DeviceId].BlocksPerGrid);
      DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid;
    }

    // Set default number of threads
    DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads;
    DP("Default number of threads set according to library's default %d\n",
       DeviceRTLTy::DefaultNumThreads);
    if (DeviceData[DeviceId].NumThreads >
        DeviceData[DeviceId].ThreadsPerBlock) {
      DP("Default number of threads exceeds device limit, capping at %d\n",
         DeviceData[DeviceId].ThreadsPerBlock);
      DeviceData[DeviceId].NumThreads = DeviceData[DeviceId].ThreadsPerBlock;
    }

    return OFFLOAD_SUCCESS;
  }

  __tgt_target_table *loadBinary(const int DeviceId,
                                 const __tgt_device_image *Image) {
    // Set the context we are using
    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return nullptr;

    // Clear the offload table as we are going to create a new one.
    clearOffloadEntriesTable(DeviceId);

    // Create the module and extract the function pointers.
    CUmodule Module;
    DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart));
    Err = cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr);
    if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n"))
      return nullptr;

    DP("CUDA module successfully loaded!\n");

    Modules.push_back(Module);

    // Find the symbols in the module by name.
    const __tgt_offload_entry *HostBegin = Image->EntriesBegin;
    const __tgt_offload_entry *HostEnd = Image->EntriesEnd;

    std::list<KernelTy> &KernelsList = DeviceData[DeviceId].KernelsList;
    for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) {
      if (!E->addr) {
        // We return nullptr when something like this happens; the host should
        // always have something in the address to uniquely identify the
        // target region.
        DP("Invalid binary: host entry '<null>' (size = %zd)...\n", E->size);
        return nullptr;
      }

      if (E->size) {
        __tgt_offload_entry Entry = *E;
        CUdeviceptr CUPtr;
        size_t CUSize;
        Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name);
        // We keep this style here because we need the name
        if (Err != CUDA_SUCCESS) {
          REPORT("Loading global '%s' failed\n", E->name);
          CUDA_ERR_STRING(Err);
          return nullptr;
        }

        if (CUSize != E->size) {
          DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name,
             CUSize, E->size);
          return nullptr;
        }

        DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
           DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr));

        Entry.addr = (void *)(CUPtr);

        // Note: In the current implementation declare target variables
        // can either be link or to. This means that once unified
        // memory is activated via the requires directive, the variable
        // can be used directly from the host in both cases.
        // TODO: when variables types other than to or link are added,
        // the below condition should be changed to explicitly
        // check for to and link variables types:
        // (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags &
        // OMP_DECLARE_TARGET_LINK || e->flags == OMP_DECLARE_TARGET_TO))
        if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
          // If unified memory is present any target link or to variables
          // can access host addresses directly. There is no longer a
          // need for device copies.
          cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *));
          DP("Copy linked variable host address (" DPxMOD
             ") to device address (" DPxMOD ")\n",
             DPxPTR(*((void **)E->addr)), DPxPTR(CUPtr));
        }

        addOffloadEntry(DeviceId, Entry);

        continue;
      }

      CUfunction Func;
      Err = cuModuleGetFunction(&Func, Module, E->name);
      // We keep this style here because we need the name
      if (Err != CUDA_SUCCESS) {
        REPORT("Loading '%s' failed\n", E->name);
        CUDA_ERR_STRING(Err);
        return nullptr;
      }

      DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
         DPxPTR(E - HostBegin), E->name, DPxPTR(Func));

      // Default to GENERIC in case the symbol is missing from the cubin file,
      // so the value is never read uninitialized.
      llvm::omp::OMPTgtExecModeFlags ExecModeVal =
          llvm::omp::OMP_TGT_EXEC_MODE_GENERIC;
      std::string ExecModeNameStr(E->name);
      ExecModeNameStr += "_exec_mode";
      const char *ExecModeName = ExecModeNameStr.c_str();

      CUdeviceptr ExecModePtr;
      size_t CUSize;
      Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName);
      if (Err == CUDA_SUCCESS) {
        if (CUSize != sizeof(llvm::omp::OMPTgtExecModeFlags)) {
          DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
             ExecModeName, CUSize, sizeof(llvm::omp::OMPTgtExecModeFlags));
          return nullptr;
        }

        Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize);
        if (Err != CUDA_SUCCESS) {
          REPORT("Error when copying data from device to host. Pointers: "
                 "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
                 DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize);
          CUDA_ERR_STRING(Err);
          return nullptr;
        }
      } else {
        DP("Loading global exec_mode '%s' - symbol missing, using default "
           "value GENERIC (1)\n",
           ExecModeName);
      }

      KernelsList.emplace_back(Func, ExecModeVal);

      __tgt_offload_entry Entry = *E;
      Entry.addr = &KernelsList.back();
      addOffloadEntry(DeviceId, Entry);
    }

    // Send the device environment data to the device.
    {
      // TODO: The device ID used here is not the real device ID used by OpenMP.
      DeviceEnvironmentTy DeviceEnv{0, static_cast<uint32_t>(NumberOfDevices),
                                    static_cast<uint32_t>(DeviceId),
                                    static_cast<uint32_t>(DynamicMemorySize)};

      if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG"))
        DeviceEnv.DebugKind = std::stoi(EnvStr);

      const char *DeviceEnvName = "omptarget_device_environment";
      CUdeviceptr DeviceEnvPtr;
      size_t CUSize;

      Err = cuModuleGetGlobal(&DeviceEnvPtr, &CUSize, Module, DeviceEnvName);
      if (Err == CUDA_SUCCESS) {
        if (CUSize != sizeof(DeviceEnv)) {
          REPORT(
              "Global device_environment '%s' - size mismatch (%zu != %zu)\n",
              DeviceEnvName, CUSize, sizeof(DeviceEnv));
          return nullptr;
        }

        Err = cuMemcpyHtoD(DeviceEnvPtr, &DeviceEnv, CUSize);
        if (Err != CUDA_SUCCESS) {
          REPORT("Error when copying data from host to device. Pointers: "
                 "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
                 DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize);
          CUDA_ERR_STRING(Err);
          return nullptr;
        }

        DP("Sending global device environment data %zu bytes\n", CUSize);
      } else {
        DP("Finding global device environment '%s' - symbol missing.\n",
           DeviceEnvName);
        DP("Continue, considering this is a device RTL which does not accept "
           "environment setting.\n");
      }
    }

    return getOffloadEntriesTable(DeviceId);
  }

  void *dataAlloc(const int DeviceId, const int64_t Size,
                  const TargetAllocTy Kind) {
    switch (Kind) {
    case TARGET_ALLOC_DEFAULT:
    case TARGET_ALLOC_DEVICE:
      if (UseMemoryManager)
        return MemoryManagers[DeviceId]->allocate(Size, nullptr);
      else
        return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
    case TARGET_ALLOC_HOST:
    case TARGET_ALLOC_SHARED:
      return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
    }

    REPORT("Invalid target data allocation kind or requested allocator not "
           "implemented yet\n");

    return nullptr;
  }

  int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr,
                 const int64_t Size, __tgt_async_info *AsyncInfo) const {
    assert(AsyncInfo && "AsyncInfo is nullptr");

    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    CUstream Stream = getStream(DeviceId, AsyncInfo);

    Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream);
    if (Err != CUDA_SUCCESS) {
      DP("Error when copying data from host to device. Pointers: host "
         "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
         DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
      CUDA_ERR_STRING(Err);
      return OFFLOAD_FAIL;
    }

    return OFFLOAD_SUCCESS;
  }

  int dataRetrieve(const int DeviceId, void *HstPtr, const void *TgtPtr,
                   const int64_t Size, __tgt_async_info *AsyncInfo) const {
    assert(AsyncInfo && "AsyncInfo is nullptr");

    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    CUstream Stream = getStream(DeviceId, AsyncInfo);

    Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
    if (Err != CUDA_SUCCESS) {
      DP("Error when copying data from device to host. Pointers: host "
         "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
         DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
      CUDA_ERR_STRING(Err);
      return OFFLOAD_FAIL;
    }

    return OFFLOAD_SUCCESS;
  }

  int dataExchange(int SrcDevId, const void *SrcPtr, int DstDevId, void *DstPtr,
                   int64_t Size, __tgt_async_info *AsyncInfo) const {
    assert(AsyncInfo && "AsyncInfo is nullptr");

    CUresult Err = cuCtxSetCurrent(DeviceData[SrcDevId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    CUstream Stream = getStream(SrcDevId, AsyncInfo);

    // If they are different devices, we try peer-to-peer copy first.
    if (SrcDevId != DstDevId) {
      int CanAccessPeer = 0;
      Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId);
      if (Err != CUDA_SUCCESS) {
        REPORT("Error returned from cuDeviceCanAccessPeer. src = %" PRId32
               ", dst = %" PRId32 "\n",
               SrcDevId, DstDevId);
        CUDA_ERR_STRING(Err);
        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
      }

      if (!CanAccessPeer) {
        DP("P2P memcpy not supported so fall back to D2D memcpy\n");
        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
      }

      Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0);
      if (Err != CUDA_SUCCESS) {
        REPORT("Error returned from cuCtxEnablePeerAccess. src = %" PRId32
               ", dst = %" PRId32 "\n",
               SrcDevId, DstDevId);
        CUDA_ERR_STRING(Err);
        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
      }

      Err = cuMemcpyPeerAsync((CUdeviceptr)DstPtr, DeviceData[DstDevId].Context,
                              (CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context,
                              Size, Stream);
      if (Err == CUDA_SUCCESS)
        return OFFLOAD_SUCCESS;

      DP("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD
         ", src_id = %" PRId32 ", dst_ptr = " DPxMOD ", dst_id = %" PRId32 "\n",
         DPxPTR(SrcPtr), SrcDevId, DPxPTR(DstPtr), DstDevId);
      CUDA_ERR_STRING(Err);
    }

    return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
  }

  int dataDelete(const int DeviceId, void *TgtPtr) {
    if (UseMemoryManager)
      return MemoryManagers[DeviceId]->free(TgtPtr);

    return DeviceAllocators[DeviceId].free(TgtPtr);
  }

  int runTargetTeamRegion(const int DeviceId, void *TgtEntryPtr, void **TgtArgs,
                          ptrdiff_t *TgtOffsets, const int ArgNum,
                          const int TeamNum, const int ThreadLimit,
                          const unsigned int LoopTripCount,
                          __tgt_async_info *AsyncInfo) const {
    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    // All args are references.
    std::vector<void *> Args(ArgNum);
    std::vector<void *> Ptrs(ArgNum);

    for (int I = 0; I < ArgNum; ++I) {
      Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
      Args[I] = &Ptrs[I];
    }

    KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr);

    const bool IsSPMDGenericMode =
        KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD;
    const bool IsSPMDMode =
        KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_SPMD;
    const bool IsGenericMode =
        KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC;

    int CudaThreadsPerBlock;
    if (ThreadLimit > 0) {
      DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
      CudaThreadsPerBlock = ThreadLimit;
      // Add master warp if necessary
      if (IsGenericMode) {
        DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize);
        CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
      }
    } else {
      DP("Setting CUDA threads per block to default %d\n",
         DeviceData[DeviceId].NumThreads);
      CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads;
    }

    if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) {
      DP("Threads per block capped at device limit %d\n",
         DeviceData[DeviceId].ThreadsPerBlock);
      CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock;
    }

    if (!KernelInfo->MaxThreadsPerBlock) {
      Err = cuFuncGetAttribute(&KernelInfo->MaxThreadsPerBlock,
                               CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                               KernelInfo->Func);
      if (!checkResult(Err, "Error returned from cuFuncGetAttribute\n"))
        return OFFLOAD_FAIL;
    }

    if (KernelInfo->MaxThreadsPerBlock < CudaThreadsPerBlock) {
      DP("Threads per block capped at kernel limit %d\n",
         KernelInfo->MaxThreadsPerBlock);
      CudaThreadsPerBlock = KernelInfo->MaxThreadsPerBlock;
    }

    unsigned int CudaBlocksPerGrid;
    if (TeamNum <= 0) {
      if (LoopTripCount > 0 && EnvNumTeams < 0) {
        if (IsSPMDGenericMode) {
          // If we reach this point, then we are executing a kernel that was
          // transformed from Generic-mode to SPMD-mode. This kernel has
          // SPMD-mode execution, but needs its blocks to be scheduled
          // differently because the current loop trip count only applies to
          // the `teams distribute` region and will create far too few blocks
          // using the regular SPMD-mode method.
          CudaBlocksPerGrid = LoopTripCount;
        } else if (IsSPMDMode) {
          // We have a combined construct, i.e. `target teams distribute
          // parallel for [simd]`. We launch enough teams so that each thread
          // will execute one iteration of the loop; round up to the nearest
          // integer.
          CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1;
        } else if (IsGenericMode) {
          // If we reach this point, then we have a non-combined construct,
          // i.e. `teams distribute` with a nested `parallel for` and each
          // team is assigned one iteration of the `distribute` loop. E.g.:
          //
          // #pragma omp target teams distribute
          // for(...loop_tripcount...) {
          //   #pragma omp parallel for
          //   for(...) {}
          // }
          //
          // Threads within a team will execute the iterations of the
          // `parallel` loop.
          CudaBlocksPerGrid = LoopTripCount;
        } else {
          REPORT("Unknown execution mode: %d\n",
                 static_cast<int8_t>(KernelInfo->ExecutionMode));
          return OFFLOAD_FAIL;
        }
        DP("Using %d teams due to loop trip count %" PRIu32
           " and number of threads per block %d\n",
           CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock);
      } else {
        DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams);
        CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams;
      }
    } else if (TeamNum > DeviceData[DeviceId].BlocksPerGrid) {
      DP("Capping number of teams to team limit %d\n",
         DeviceData[DeviceId].BlocksPerGrid);
      CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid;
    } else {
      DP("Using requested number of teams %d\n", TeamNum);
      CudaBlocksPerGrid = TeamNum;
    }
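
    // Worked example of the SPMD rounding above (illustrative numbers): with
    // LoopTripCount = 10000 and CudaThreadsPerBlock = 128,
    // CudaBlocksPerGrid = ((10000 - 1) / 128) + 1 = 79; 79 * 128 = 10112
    // threads cover all 10000 iterations.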

    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
         "Launching kernel %s with %d blocks and %d threads in %s mode\n",
         (getOffloadEntry(DeviceId, TgtEntryPtr))
             ? getOffloadEntry(DeviceId, TgtEntryPtr)->name
             : "(null)",
         CudaBlocksPerGrid, CudaThreadsPerBlock,
         (!IsSPMDMode ? (IsGenericMode ? "Generic" : "SPMD-Generic") : "SPMD"));

    CUstream Stream = getStream(DeviceId, AsyncInfo);
    Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1,
                         /* gridDimZ */ 1, CudaThreadsPerBlock,
                         /* blockDimY */ 1, /* blockDimZ */ 1,
                         DynamicMemorySize, Stream, &Args[0], nullptr);
    if (!checkResult(Err, "Error returned from cuLaunchKernel\n"))
      return OFFLOAD_FAIL;

    DP("Launch of entry point at " DPxMOD " successful!\n",
       DPxPTR(TgtEntryPtr));

    return OFFLOAD_SUCCESS;
  }

  int synchronize(const int DeviceId, __tgt_async_info *AsyncInfo) const {
    CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue);
    CUresult Err = cuStreamSynchronize(Stream);

    // Once the stream is synchronized, return it to the stream pool and reset
    // AsyncInfo. This is to make sure the synchronization only works for its
    // own tasks.
    StreamPool[DeviceId]->release(reinterpret_cast<CUstream>(AsyncInfo->Queue));
    AsyncInfo->Queue = nullptr;

    if (Err != CUDA_SUCCESS) {
      DP("Error when synchronizing stream. stream = " DPxMOD
         ", async info ptr = " DPxMOD "\n",
         DPxPTR(Stream), DPxPTR(AsyncInfo));
      CUDA_ERR_STRING(Err);
    }
    return (Err == CUDA_SUCCESS) ? OFFLOAD_SUCCESS : OFFLOAD_FAIL;
  }
1222 
1223   void printDeviceInfo(int32_t device_id) {
1224     char TmpChar[1000];
1225     std::string TmpStr;
1226     size_t TmpSt;
1227     int TmpInt, TmpInt2, TmpInt3;
1228 
1229     CUdevice Device;
1230     checkResult(cuDeviceGet(&Device, device_id),
1231                 "Error returned from cuCtxGetDevice\n");
1232 
1233     cuDriverGetVersion(&TmpInt);
1234     printf("    CUDA Driver Version: \t\t%d \n", TmpInt);
1235     printf("    CUDA Device Number: \t\t%d \n", device_id);
1236     checkResult(cuDeviceGetName(TmpChar, 1000, Device),
1237                 "Error returned from cuDeviceGetName\n");
1238     printf("    Device Name: \t\t\t%s \n", TmpChar);
1239     checkResult(cuDeviceTotalMem(&TmpSt, Device),
1240                 "Error returned from cuDeviceTotalMem\n");
1241     printf("    Global Memory Size: \t\t%zu bytes \n", TmpSt);
1242     checkResult(cuDeviceGetAttribute(
1243                     &TmpInt, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, Device),
1244                 "Error returned from cuDeviceGetAttribute\n");
1245     printf("    Number of Multiprocessors: \t\t%d \n", TmpInt);
1246     checkResult(
1247         cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, Device),
1248         "Error returned from cuDeviceGetAttribute\n");
1249     printf("    Concurrent Copy and Execution: \t%s \n", BOOL2TEXT(TmpInt));
1250     checkResult(cuDeviceGetAttribute(
1251                     &TmpInt, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, Device),
1252                 "Error returned from cuDeviceGetAttribute\n");
1253     printf("    Total Constant Memory: \t\t%d bytes\n", TmpInt);
1254     checkResult(
1255         cuDeviceGetAttribute(
1256             &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, Device),
1257         "Error returned from cuDeviceGetAttribute\n");
1258     printf("    Max Shared Memory per Block: \t%d bytes \n", TmpInt);
1259     checkResult(
1260         cuDeviceGetAttribute(
1261             &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, Device),
1262         "Error returned from cuDeviceGetAttribute\n");
1263     printf("    Registers per Block: \t\t%d \n", TmpInt);
1264     checkResult(
1265         cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device),
1266         "Error returned from cuDeviceGetAttribute\n");
1267     printf("    Warp Size: \t\t\t\t%d Threads \n", TmpInt);
1268     checkResult(cuDeviceGetAttribute(
1269                     &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, Device),
1270                 "Error returned from cuDeviceGetAttribute\n");
1271     printf("    Maximum Threads per Block: \t\t%d \n", TmpInt);
1272     checkResult(cuDeviceGetAttribute(
1273                     &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device),
1274                 "Error returned from cuDeviceGetAttribute\n");
1275     checkResult(cuDeviceGetAttribute(
1276                     &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, Device),
1277                 "Error returned from cuDeviceGetAttribute\n");
1278     checkResult(cuDeviceGetAttribute(
1279                     &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, Device),
1280                 "Error returned from cuDeviceGetAttribute\n");
1281     printf("    Maximum Block Dimensions: \t\t%d, %d, %d \n", TmpInt, TmpInt2,
1282            TmpInt3);
1283     checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, Device),
                "Error returned from cuDeviceGetAttribute\n");
    checkResult(cuDeviceGetAttribute(
                    &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, Device),
                "Error returned from cuDeviceGetAttribute\n");
    checkResult(cuDeviceGetAttribute(
                    &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Maximum Grid Dimensions: \t\t%d x %d x %d \n", TmpInt, TmpInt2,
           TmpInt3);
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_PITCH, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Maximum Memory Pitch: \t\t%d bytes \n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Texture Alignment: \t\t\t%d bytes \n", TmpInt);
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Clock Rate: \t\t\t%d kHz\n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Execution Timeout: \t\t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_INTEGRATED, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Integrated Device: \t\t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Can Map Host Memory: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, Device),
        "Error returned from cuDeviceGetAttribute\n");
    if (TmpInt == CU_COMPUTEMODE_DEFAULT)
      TmpStr = "DEFAULT";
    else if (TmpInt == CU_COMPUTEMODE_PROHIBITED)
      TmpStr = "PROHIBITED";
    else if (TmpInt == CU_COMPUTEMODE_EXCLUSIVE_PROCESS)
      TmpStr = "EXCLUSIVE PROCESS";
    else
      TmpStr = "unknown";
    printf("    Compute Mode: \t\t\t%s \n", TmpStr.c_str());
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Concurrent Kernels: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    ECC Enabled: \t\t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Memory Clock Rate: \t\t\t%d kHz\n", TmpInt);
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Memory Bus Width: \t\t\t%d bits\n", TmpInt);
    checkResult(cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE,
                                     Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    L2 Cache Size: \t\t\t%d bytes \n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
                    Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Max Threads Per SMP: \t\t%d \n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Async Engines: \t\t\t%s (%d) \n", BOOL2TEXT(TmpInt), TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Unified Addressing: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Managed Memory: \t\t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Concurrent Managed Memory: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Preemption Supported: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Cooperative Launch: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Multi-Device Boards: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, Device),
        "Error returned from cuDeviceGetAttribute\n");
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt2, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Compute Capabilities: \t\t%d%d \n", TmpInt, TmpInt2);
  }

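  /// Acquire an event from the event pool and hand it back through \p P as an
  /// opaque handle. Pooling amortizes the cost of cuEventCreate/cuEventDestroy
  /// across repeated use.
  ///
  /// Illustrative lifecycle only (a sketch, not code taken from this file;
  /// recordEvent and syncEvent are the free helpers used by the __tgt_rtl_*
  /// wrappers further down):
  ///
  ///   void *Ev = nullptr;
  ///   createEvent(&Ev);                        // acquire from the pool
  ///   recordEvent(Ev, AsyncInfo);              // snapshot the stream's work
  ///   waitEvent(DeviceId, OtherAsyncInfo, Ev); // another stream waits on it
  ///   destroyEvent(Ev);                        // return event to the pool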
  int createEvent(void **P) {
    CUevent Event = nullptr;
    if (EventPool.acquire(Event) != OFFLOAD_SUCCESS)
      return OFFLOAD_FAIL;
    *P = Event;
    return OFFLOAD_SUCCESS;
  }

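  /// Hand the event back to the event pool. The underlying CUevent stays
  /// alive for reuse; it is not destroyed eagerly.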
  int destroyEvent(void *EventPtr) {
    EventPool.release(reinterpret_cast<CUevent>(EventPtr));
    return OFFLOAD_SUCCESS;
  }

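  /// Enqueue a wait for \p EventPtr on the stream attached to \p AsyncInfo:
  /// work submitted to that stream afterwards will not start until the event
  /// has completed. The call itself does not block the host.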
  int waitEvent(const int DeviceId, __tgt_async_info *AsyncInfo,
                void *EventPtr) const {
    CUstream Stream = getStream(DeviceId, AsyncInfo);
    CUevent Event = reinterpret_cast<CUevent>(EventPtr);

    // We don't use CU_EVENT_WAIT_DEFAULT here because it is only available
    // from a certain CUDA version onwards, where it is defined as 0x0. In
    // earlier versions, per the CUDA API documentation, the argument must be
    // 0x0 anyway.
    CUresult Err = cuStreamWaitEvent(Stream, Event, 0);
    if (Err != CUDA_SUCCESS) {
      DP("Error when waiting on event. stream = " DPxMOD ", event = " DPxMOD
         "\n",
         DPxPTR(Stream), DPxPTR(Event));
      CUDA_ERR_STRING(Err);
      return OFFLOAD_FAIL;
    }

    return OFFLOAD_SUCCESS;
  }

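  /// Return the stream held by \p AsyncInfo to the per-device stream pool and
  /// clear the queue pointer so the stale handle cannot be reused.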
  int releaseAsyncInfo(int DeviceId, __tgt_async_info *AsyncInfo) const {
    if (AsyncInfo->Queue) {
      StreamPool[DeviceId]->release(
          reinterpret_cast<CUstream>(AsyncInfo->Queue));
      AsyncInfo->Queue = nullptr;
    }

    return OFFLOAD_SUCCESS;
  }

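  /// Allocate a fresh __tgt_async_info for \p DeviceId and eagerly bind a
  /// stream to it. The device's context is made current first so the stream
  /// is associated with the right context.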
  int initAsyncInfo(int DeviceId, __tgt_async_info **AsyncInfo) const {
    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent"))
      return OFFLOAD_FAIL;

    *AsyncInfo = new __tgt_async_info;
    getStream(DeviceId, *AsyncInfo);
    return OFFLOAD_SUCCESS;
  }

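  /// Fill in any unset fields of \p DeviceInfo with the context and device
  /// handle owned by this plugin for \p DeviceId. On failure, \p ErrStr is
  /// pointed at the corresponding CUDA error string.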
  int initDeviceInfo(int DeviceId, __tgt_device_info *DeviceInfo,
                     const char **ErrStr) const {
    assert(DeviceInfo && "DeviceInfo is nullptr");

    if (!DeviceInfo->Context)
      DeviceInfo->Context = DeviceData[DeviceId].Context;
    if (!DeviceInfo->Device) {
      CUdevice Dev;
      CUresult Err = cuDeviceGet(&Dev, DeviceId);
      if (Err == CUDA_SUCCESS) {
        DeviceInfo->Device = reinterpret_cast<void *>(Dev);
      } else {
        cuGetErrorString(Err, ErrStr);
        return OFFLOAD_FAIL;
      }
    }
    return OFFLOAD_SUCCESS;
  }
};

DeviceRTLTy DeviceRTL;
} // namespace

// Exposed library API functions
#ifdef __cplusplus
extern "C" {
#endif

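// These are the entry points the libomptarget runtime resolves from this
// plugin's shared object; each validates its arguments (via assertions in
// debug builds) and forwards to the DeviceRTL singleton above.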
int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
  return elf_check_machine(image, /* EM_CUDA */ 190);
}

int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); }

int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
  DP("Init requires flags to %" PRId64 "\n", RequiresFlags);
  DeviceRTL.setRequiresFlag(RequiresFlags);
  return RequiresFlags;
}

int32_t __tgt_rtl_is_data_exchangable(int32_t src_dev_id, int32_t dst_dev_id) {
  if (DeviceRTL.isValidDeviceId(src_dev_id) &&
      DeviceRTL.isValidDeviceId(dst_dev_id))
    return 1;

  return 0;
}

int32_t __tgt_rtl_init_device(int32_t device_id) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.initDevice(device_id);
}

__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
                                          __tgt_device_image *image) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.loadBinary(device_id, image);
}

void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *,
                           int32_t kind) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.dataAlloc(device_id, size, (TargetAllocTy)kind);
}

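// The synchronous entry points below all follow one shape: stack-allocate a
// __tgt_async_info, run the corresponding *_async variant on it, then block
// in __tgt_rtl_synchronize. Sketched schematically (illustrative only, not a
// helper that exists in this file):
//
//   __tgt_async_info AsyncInfo;
//   if (__tgt_rtl_..._async(..., &AsyncInfo) != OFFLOAD_SUCCESS)
//     return OFFLOAD_FAIL;
//   return __tgt_rtl_synchronize(device_id, &AsyncInfo);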
int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
                              int64_t size) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_data_submit_async(device_id, tgt_ptr, hst_ptr,
                                                 size, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &AsyncInfo);
}

int32_t __tgt_rtl_data_submit_async(int32_t device_id, void *tgt_ptr,
                                    void *hst_ptr, int64_t size,
                                    __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");

  return DeviceRTL.dataSubmit(device_id, tgt_ptr, hst_ptr, size,
                              async_info_ptr);
}

int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
                                int64_t size) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_data_retrieve_async(device_id, hst_ptr, tgt_ptr,
                                                   size, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &AsyncInfo);
}

int32_t __tgt_rtl_data_retrieve_async(int32_t device_id, void *hst_ptr,
                                      void *tgt_ptr, int64_t size,
                                      __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");

  return DeviceRTL.dataRetrieve(device_id, hst_ptr, tgt_ptr, size,
                                async_info_ptr);
}

int32_t __tgt_rtl_data_exchange_async(int32_t src_dev_id, void *src_ptr,
                                      int32_t dst_dev_id, void *dst_ptr,
                                      int64_t size,
                                      __tgt_async_info *AsyncInfo) {
  assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
  assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");
  assert(AsyncInfo && "AsyncInfo is nullptr");

  return DeviceRTL.dataExchange(src_dev_id, src_ptr, dst_dev_id, dst_ptr, size,
                                AsyncInfo);
}

int32_t __tgt_rtl_data_exchange(int32_t src_dev_id, void *src_ptr,
                                int32_t dst_dev_id, void *dst_ptr,
                                int64_t size) {
  assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
  assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_data_exchange_async(
      src_dev_id, src_ptr, dst_dev_id, dst_ptr, size, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(src_dev_id, &AsyncInfo);
}

int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.dataDelete(device_id, tgt_ptr);
}

int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
                                         void **tgt_args,
                                         ptrdiff_t *tgt_offsets,
                                         int32_t arg_num, int32_t team_num,
                                         int32_t thread_limit,
                                         uint64_t loop_tripcount) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_run_target_team_region_async(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
      thread_limit, loop_tripcount, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &AsyncInfo);
}

int32_t __tgt_rtl_run_target_team_region_async(
    int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
    ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
    int32_t thread_limit, uint64_t loop_tripcount,
    __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.runTargetTeamRegion(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
      thread_limit, loop_tripcount, async_info_ptr);
}

int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
                                    void **tgt_args, ptrdiff_t *tgt_offsets,
                                    int32_t arg_num) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_run_target_region_async(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &AsyncInfo);
}

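// A plain target region is executed as a team region with exactly one team;
// the constants passed below pin team_num and thread_limit accordingly.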
int32_t __tgt_rtl_run_target_region_async(int32_t device_id,
                                          void *tgt_entry_ptr, void **tgt_args,
                                          ptrdiff_t *tgt_offsets,
                                          int32_t arg_num,
                                          __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return __tgt_rtl_run_target_team_region_async(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num,
      /* team_num */ 1, /* thread_limit */ 1, /* loop_tripcount */ 0,
      async_info_ptr);
}

int32_t __tgt_rtl_synchronize(int32_t device_id,
                              __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");
  assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr");

  return DeviceRTL.synchronize(device_id, async_info_ptr);
}

void __tgt_rtl_set_info_flag(uint32_t NewInfoLevel) {
  std::atomic<uint32_t> &InfoLevel = getInfoLevelInternal();
  InfoLevel.store(NewInfoLevel);
}

void __tgt_rtl_print_device_info(int32_t device_id) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  DeviceRTL.printDeviceInfo(device_id);
}

int32_t __tgt_rtl_create_event(int32_t device_id, void **event) {
  assert(event && "event is nullptr");
  return DeviceRTL.createEvent(event);
}

int32_t __tgt_rtl_record_event(int32_t device_id, void *event_ptr,
                               __tgt_async_info *async_info_ptr) {
  assert(async_info_ptr && "async_info_ptr is nullptr");
  assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr");
  assert(event_ptr && "event_ptr is nullptr");

  return recordEvent(event_ptr, async_info_ptr);
}

int32_t __tgt_rtl_wait_event(int32_t device_id, void *event_ptr,
                             __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");
  assert(event_ptr && "event is nullptr");

  return DeviceRTL.waitEvent(device_id, async_info_ptr, event_ptr);
}

int32_t __tgt_rtl_sync_event(int32_t device_id, void *event_ptr) {
  assert(event_ptr && "event is nullptr");

  return syncEvent(event_ptr);
}

int32_t __tgt_rtl_destroy_event(int32_t device_id, void *event_ptr) {
  assert(event_ptr && "event is nullptr");

  return DeviceRTL.destroyEvent(event_ptr);
}

int32_t __tgt_rtl_release_async_info(int32_t device_id,
                                     __tgt_async_info *async_info) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info && "async_info is nullptr");

  return DeviceRTL.releaseAsyncInfo(device_id, async_info);
}

int32_t __tgt_rtl_init_async_info(int32_t device_id,
                                  __tgt_async_info **async_info) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info && "async_info is nullptr");

  return DeviceRTL.initAsyncInfo(device_id, async_info);
}

int32_t __tgt_rtl_init_device_info(int32_t device_id,
                                   __tgt_device_info *device_info_ptr,
                                   const char **err_str) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(device_info_ptr && "device_info_ptr is nullptr");

  return DeviceRTL.initDeviceInfo(device_id, device_info_ptr, err_str);
}

#ifdef __cplusplus
}
#endif