//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// RTL for CUDA machine
//
//===----------------------------------------------------------------------===//

#include <cassert>
#include <cstddef>
#include <cuda.h>
#include <list>
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>
#include <vector>

#include "Debug.h"
#include "DeviceEnvironment.h"
#include "omptargetplugin.h"

#define TARGET_NAME CUDA
#define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"

#include "MemoryManager.h"

#include "llvm/Frontend/OpenMP/OMPConstants.h"

// Utility for retrieving and printing CUDA error string.
#ifdef OMPTARGET_DEBUG
#define CUDA_ERR_STRING(err)                                                   \
  do {                                                                         \
    if (getDebugLevel() > 0) {                                                 \
      const char *errStr = nullptr;                                            \
      CUresult errStr_status = cuGetErrorString(err, &errStr);                 \
      if (errStr_status == CUDA_ERROR_INVALID_VALUE)                           \
        REPORT("Unrecognized CUDA error code: %d\n", err);                     \
      else if (errStr_status == CUDA_SUCCESS)                                  \
        REPORT("CUDA error is: %s\n", errStr);                                 \
      else {                                                                   \
        REPORT("Unresolved CUDA error code: %d\n", err);                       \
        REPORT("Unsuccessful cuGetErrorString return status: %d\n",            \
               errStr_status);                                                 \
      }                                                                        \
    } else {                                                                   \
      const char *errStr = nullptr;                                            \
      CUresult errStr_status = cuGetErrorString(err, &errStr);                 \
      if (errStr_status == CUDA_SUCCESS)                                       \
        REPORT("%s \n", errStr);                                               \
    }                                                                          \
  } while (false)
#else // OMPTARGET_DEBUG
#define CUDA_ERR_STRING(err)                                                   \
  do {                                                                         \
    const char *errStr = nullptr;                                              \
    CUresult errStr_status = cuGetErrorString(err, &errStr);                   \
    if (errStr_status == CUDA_SUCCESS)                                         \
      REPORT("%s \n", errStr);                                                 \
  } while (false)
#endif // OMPTARGET_DEBUG

#define BOOL2TEXT(b) ((b) ? "Yes" : "No")

#include "elf_common.h"

/// Keep entries table per device.
struct FuncOrGblEntryTy {
  __tgt_target_table Table;
  std::vector<__tgt_offload_entry> Entries;
};

/// Use a single entity to encode a kernel and a set of flags.
struct KernelTy {
  CUfunction Func;

  // Execution mode of the kernel.
  llvm::omp::OMPTgtExecModeFlags ExecutionMode;

  /// Maximal number of threads per block for this kernel.
  int MaxThreadsPerBlock = 0;

  KernelTy(CUfunction Func, llvm::omp::OMPTgtExecModeFlags ExecutionMode)
      : Func(Func), ExecutionMode(ExecutionMode) {}
};

namespace {
bool checkResult(CUresult Err, const char *ErrMsg) {
  if (Err == CUDA_SUCCESS)
    return true;

  REPORT("%s", ErrMsg);
  CUDA_ERR_STRING(Err);
  return false;
}

int memcpyDtoD(const void *SrcPtr, void *DstPtr, int64_t Size,
               CUstream Stream) {
  CUresult Err =
      cuMemcpyDtoDAsync((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, Size, Stream);

  if (Err != CUDA_SUCCESS) {
    DP("Error when copying data from device to device. Pointers: src "
       "= " DPxMOD ", dst = " DPxMOD ", size = %" PRId64 "\n",
       DPxPTR(SrcPtr), DPxPTR(DstPtr), Size);
    CUDA_ERR_STRING(Err);
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

int recordEvent(void *EventPtr, __tgt_async_info *AsyncInfo) {
  CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue);
  CUevent Event = reinterpret_cast<CUevent>(EventPtr);

  CUresult Err = cuEventRecord(Event, Stream);
  if (Err != CUDA_SUCCESS) {
    DP("Error when recording event. stream = " DPxMOD ", event = " DPxMOD "\n",
       DPxPTR(Stream), DPxPTR(Event));
    CUDA_ERR_STRING(Err);
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

int syncEvent(void *EventPtr) {
  CUevent Event = reinterpret_cast<CUevent>(EventPtr);

  CUresult Err = cuEventSynchronize(Event);
  if (Err != CUDA_SUCCESS) {
    DP("Error when syncing event = " DPxMOD "\n", DPxPTR(Event));
    CUDA_ERR_STRING(Err);
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

// Structure containing per-device data.
struct DeviceDataTy {
  /// List that contains all the kernels.
  std::list<KernelTy> KernelsList;

  std::list<FuncOrGblEntryTy> FuncGblEntries;

  CUcontext Context = nullptr;
  // Device properties
  int ThreadsPerBlock = 0;
  int BlocksPerGrid = 0;
  int WarpSize = 0;
  // OpenMP properties
  int NumTeams = 0;
  int NumThreads = 0;
};

/// Resource allocator where \p T is the resource type.
/// Functions \p create and \p destroy return OFFLOAD_SUCCESS or OFFLOAD_FAIL
/// accordingly. The implementation should not raise any exception.
template <typename T> class AllocatorTy {
public:
  /// Create a resource and assign to R.
  int create(T &R) noexcept;
  /// Destroy the resource.
  int destroy(T) noexcept;
};

/// Allocator for CUstream.
template <> class AllocatorTy<CUstream> {
  CUcontext Context;

public:
  AllocatorTy(CUcontext C) noexcept : Context(C) {}

  /// See AllocatorTy<T>::create.
  int create(CUstream &Stream) noexcept {
    if (!checkResult(cuCtxSetCurrent(Context),
                     "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    if (!checkResult(cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING),
                     "Error returned from cuStreamCreate\n"))
      return OFFLOAD_FAIL;

    return OFFLOAD_SUCCESS;
  }

  /// See AllocatorTy<T>::destroy.
  int destroy(CUstream Stream) noexcept {
    if (!checkResult(cuCtxSetCurrent(Context),
                     "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;
    if (!checkResult(cuStreamDestroy(Stream),
                     "Error returned from cuStreamDestroy\n"))
      return OFFLOAD_FAIL;

    return OFFLOAD_SUCCESS;
  }
};

/// Allocator for CUevent.
template <> class AllocatorTy<CUevent> {
public:
  /// See AllocatorTy<T>::create.
  int create(CUevent &Event) noexcept {
    if (!checkResult(cuEventCreate(&Event, CU_EVENT_DEFAULT),
                     "Error returned from cuEventCreate\n"))
      return OFFLOAD_FAIL;

    return OFFLOAD_SUCCESS;
  }

  /// See AllocatorTy<T>::destroy.
  int destroy(CUevent Event) noexcept {
    if (!checkResult(cuEventDestroy(Event),
                     "Error returned from cuEventDestroy\n"))
      return OFFLOAD_FAIL;

    return OFFLOAD_SUCCESS;
  }
};

/// A generic pool of resources where \p T is the resource type.
/// \p T should be copyable as the object is stored in \p std::vector .
template <typename T> class ResourcePoolTy {
  /// Index of the next available resource.
  size_t Next = 0;
  /// Mutex to guard the pool.
  std::mutex Mutex;
  /// Pool of resources.
  std::vector<T> Resources;
  /// The allocator used to create and destroy the resources.
  AllocatorTy<T> Allocator;

  /// If `Resources` is used up, fill in more resources. The new size `Size` is
  /// assumed to be strictly larger than the current size.
  bool resize(size_t Size) {
    auto CurSize = Resources.size();
    assert(Size > CurSize && "Unexpected smaller size");
    Resources.reserve(Size);
    for (auto I = CurSize; I < Size; ++I) {
      T NewItem;
      int Ret = Allocator.create(NewItem);
      if (Ret != OFFLOAD_SUCCESS)
        return false;
      Resources.push_back(NewItem);
    }
    return true;
  }

public:
  ResourcePoolTy(AllocatorTy<T> &&A, size_t Size = 0) noexcept
      : Allocator(std::move(A)) {
    if (Size)
      (void)resize(Size);
  }

  ~ResourcePoolTy() noexcept { clear(); }

  /// Get a resource from the pool. `Next` always points to the next available
  /// resource. That is, `[0, Next-1]` have been assigned, and `[Next, ...)`
  /// are still available. If there is no resource left, we will ask for more.
  /// Each time a resource is assigned, `Next` increases by one.
  /// xxxxxs+++++++++
  ///      ^
  ///      Next
  /// After assignment, the pool becomes the following and s is assigned.
  /// xxxxxs+++++++++
  ///       ^
  ///       Next
  int acquire(T &R) noexcept {
    std::lock_guard<std::mutex> LG(Mutex);
    if (Next == Resources.size()) {
      auto NewSize = Resources.size() ? Resources.size() * 2 : 1;
      if (!resize(NewSize))
        return OFFLOAD_FAIL;
    }

    assert(Next < Resources.size());

    R = Resources[Next++];

    return OFFLOAD_SUCCESS;
  }

  /// Return the resource back to the pool. When we return a resource, we first
  /// decrease `Next`, and then copy the resource back. Note that resources may
  /// be returned in a different order than they were assigned, so at some
  /// point the vector may hold two identical resources.
  /// xxax+a+++++
  ///     ^
  ///     Next
  /// However, this doesn't matter, because they're always on the two sides of
  /// `Next`. The left one will eventually be overwritten by another returned
  /// resource. Therefore, after several acquire/release cycles, the order of
  /// the pool might differ from its initial state.
  void release(T R) noexcept {
    std::lock_guard<std::mutex> LG(Mutex);
    Resources[--Next] = R;
  }

  /// Release all stored resources and clear the pool.
  /// Note: This function is not thread safe. Be sure to guard it if necessary.
  void clear() noexcept {
    for (auto &R : Resources)
      (void)Allocator.destroy(R);
    Resources.clear();
  }
};
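
// Illustrative usage sketch (comment only, not part of the plugin API):
// resources cycle through a per-device pool via acquire/release. Given some
// valid CUcontext `Context`:
//   ResourcePoolTy<CUstream> Pool(AllocatorTy<CUstream>(Context), /*Size=*/32);
//   CUstream S;
//   if (Pool.acquire(S) == OFFLOAD_SUCCESS) {
//     // ... enqueue asynchronous work on S ...
//     Pool.release(S);
//   }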

class DeviceRTLTy {
  int NumberOfDevices;
  // OpenMP environment properties
  int EnvNumTeams;
  int EnvTeamLimit;
  int EnvTeamThreadLimit;
  // OpenMP requires flags
  int64_t RequiresFlags;
  // Amount of dynamic shared memory to use at launch.
  uint64_t DynamicMemorySize;
  // Number of initial streams for each device.
  int NumInitialStreams = 32;

  static constexpr const int HardTeamLimit = 1U << 16U; // 64k
  static constexpr const int HardThreadLimit = 1024;
  static constexpr const int DefaultNumTeams = 128;
  static constexpr const int DefaultNumThreads = 128;

  using StreamPoolTy = ResourcePoolTy<CUstream>;
  std::vector<std::unique_ptr<StreamPoolTy>> StreamPool;

  ResourcePoolTy<CUevent> EventPool;

  std::vector<DeviceDataTy> DeviceData;
  std::vector<CUmodule> Modules;

  /// A class responsible for interacting with the device's native runtime
  /// library to allocate and free memory.
  class CUDADeviceAllocatorTy : public DeviceAllocatorTy {
    const int DeviceId;
    const std::vector<DeviceDataTy> &DeviceData;
    std::unordered_map<void *, TargetAllocTy> HostPinnedAllocs;

  public:
    CUDADeviceAllocatorTy(int DeviceId, std::vector<DeviceDataTy> &DeviceData)
        : DeviceId(DeviceId), DeviceData(DeviceData) {}

    void *allocate(size_t Size, void *, TargetAllocTy Kind) override {
      if (Size == 0)
        return nullptr;

      CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
      if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
        return nullptr;

      void *MemAlloc = nullptr;
      switch (Kind) {
      case TARGET_ALLOC_DEFAULT:
      case TARGET_ALLOC_DEVICE:
        CUdeviceptr DevicePtr;
        Err = cuMemAlloc(&DevicePtr, Size);
        MemAlloc = (void *)DevicePtr;
        if (!checkResult(Err, "Error returned from cuMemAlloc\n"))
          return nullptr;
        break;
      case TARGET_ALLOC_HOST:
        void *HostPtr;
        Err = cuMemAllocHost(&HostPtr, Size);
        MemAlloc = HostPtr;
        if (!checkResult(Err, "Error returned from cuMemAllocHost\n"))
          return nullptr;
        HostPinnedAllocs[MemAlloc] = Kind;
        break;
      case TARGET_ALLOC_SHARED:
        CUdeviceptr SharedPtr;
        Err = cuMemAllocManaged(&SharedPtr, Size, CU_MEM_ATTACH_GLOBAL);
        MemAlloc = (void *)SharedPtr;
        if (!checkResult(Err, "Error returned from cuMemAllocManaged\n"))
          return nullptr;
        break;
      }

      return MemAlloc;
    }

    int free(void *TgtPtr) override {
      CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
      if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
        return OFFLOAD_FAIL;

      // Host pinned memory must be freed differently.
      TargetAllocTy Kind =
          (HostPinnedAllocs.find(TgtPtr) == HostPinnedAllocs.end())
              ? TARGET_ALLOC_DEFAULT
              : TARGET_ALLOC_HOST;
      switch (Kind) {
      case TARGET_ALLOC_DEFAULT:
      case TARGET_ALLOC_DEVICE:
      case TARGET_ALLOC_SHARED:
        Err = cuMemFree((CUdeviceptr)TgtPtr);
        if (!checkResult(Err, "Error returned from cuMemFree\n"))
          return OFFLOAD_FAIL;
        break;
      case TARGET_ALLOC_HOST:
        Err = cuMemFreeHost(TgtPtr);
        if (!checkResult(Err, "Error returned from cuMemFreeHost\n"))
          return OFFLOAD_FAIL;
        break;
      }

      return OFFLOAD_SUCCESS;
    }
  };

  /// A vector of device allocators
  std::vector<CUDADeviceAllocatorTy> DeviceAllocators;

  /// A vector of memory managers. Since the memory manager is non-copyable and
  /// non-removable, we wrap each one in a std::unique_ptr.
  std::vector<std::unique_ptr<MemoryManagerTy>> MemoryManagers;

  /// Whether to use the memory manager.
  bool UseMemoryManager = true;

  // Record entry point associated with device
  void addOffloadEntry(const int DeviceId, const __tgt_offload_entry Entry) {
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
    E.Entries.push_back(Entry);
  }

  // Return a pointer to the entry associated with the pointer
  const __tgt_offload_entry *getOffloadEntry(const int DeviceId,
                                             const void *Addr) const {
    for (const __tgt_offload_entry &Itr :
         DeviceData[DeviceId].FuncGblEntries.back().Entries)
      if (Itr.addr == Addr)
        return &Itr;

    return nullptr;
  }

  // Return the pointer to the target entries table
  __tgt_target_table *getOffloadEntriesTable(const int DeviceId) {
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();

    if (E.Entries.empty())
      return nullptr;

    // Update table info according to the entries and return the pointer
    E.Table.EntriesBegin = E.Entries.data();
    E.Table.EntriesEnd = E.Entries.data() + E.Entries.size();

    return &E.Table;
  }

  // Clear entries table for a device
  void clearOffloadEntriesTable(const int DeviceId) {
    DeviceData[DeviceId].FuncGblEntries.emplace_back();
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
    E.Entries.clear();
    E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr;
  }

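  // Return the CUDA stream associated with AsyncInfo, lazily acquiring one
  // from the device's stream pool on first use. Returns nullptr if no stream
  // can be acquired.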
  CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfo) const {
    assert(AsyncInfo && "AsyncInfo is nullptr");

    if (!AsyncInfo->Queue) {
      CUstream S;
      if (StreamPool[DeviceId]->acquire(S) != OFFLOAD_SUCCESS)
        return nullptr;

      AsyncInfo->Queue = S;
    }

    return reinterpret_cast<CUstream>(AsyncInfo->Queue);
  }

public:
  // This class should not be copied
  DeviceRTLTy(const DeviceRTLTy &) = delete;
  DeviceRTLTy(DeviceRTLTy &&) = delete;

  DeviceRTLTy()
      : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1),
        EnvTeamThreadLimit(-1), RequiresFlags(OMP_REQ_UNDEFINED),
        DynamicMemorySize(0), EventPool(AllocatorTy<CUevent>()) {

    DP("Start initializing CUDA\n");

    CUresult Err = cuInit(0);
    if (Err == CUDA_ERROR_INVALID_HANDLE) {
      // Can't call cuGetErrorString if dlsym failed
      DP("Failed to load CUDA shared library\n");
      return;
    }
    if (!checkResult(Err, "Error returned from cuInit\n")) {
      return;
    }

    Err = cuDeviceGetCount(&NumberOfDevices);
    if (!checkResult(Err, "Error returned from cuDeviceGetCount\n"))
      return;

    if (NumberOfDevices == 0) {
      DP("There are no devices supporting CUDA.\n");
      return;
    }

    DeviceData.resize(NumberOfDevices);
    StreamPool.resize(NumberOfDevices);

    // Get environment variables regarding teams
    if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) {
      // OMP_TEAM_LIMIT has been set
      EnvTeamLimit = std::stoi(EnvStr);
      DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
    }
    if (const char *EnvStr = getenv("OMP_TEAMS_THREAD_LIMIT")) {
      // OMP_TEAMS_THREAD_LIMIT has been set
      EnvTeamThreadLimit = std::stoi(EnvStr);
      DP("Parsed OMP_TEAMS_THREAD_LIMIT=%d\n", EnvTeamThreadLimit);
    }
    if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) {
      // OMP_NUM_TEAMS has been set
      EnvNumTeams = std::stoi(EnvStr);
      DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
    }
    if (const char *EnvStr = getenv("LIBOMPTARGET_SHARED_MEMORY_SIZE")) {
      // LIBOMPTARGET_SHARED_MEMORY_SIZE has been set
      DynamicMemorySize = std::stoi(EnvStr);
      DP("Parsed LIBOMPTARGET_SHARED_MEMORY_SIZE = %" PRIu64 "\n",
         DynamicMemorySize);
    }
    if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS")) {
      // LIBOMPTARGET_NUM_INITIAL_STREAMS has been set
      NumInitialStreams = std::stoi(EnvStr);
      DP("Parsed LIBOMPTARGET_NUM_INITIAL_STREAMS=%d\n", NumInitialStreams);
    }

    for (int I = 0; I < NumberOfDevices; ++I)
      DeviceAllocators.emplace_back(I, DeviceData);

    // Get the size threshold from environment variable
    std::pair<size_t, bool> Res = MemoryManagerTy::getSizeThresholdFromEnv();
    UseMemoryManager = Res.second;
    size_t MemoryManagerThreshold = Res.first;

    if (UseMemoryManager)
      for (int I = 0; I < NumberOfDevices; ++I)
        MemoryManagers.emplace_back(std::make_unique<MemoryManagerTy>(
            DeviceAllocators[I], MemoryManagerThreshold));
  }

  ~DeviceRTLTy() {
    // We destroy the memory managers first so that data they depend on is not
    // destroyed before them.
    for (auto &M : MemoryManagers)
      M.reset();

    for (CUmodule &M : Modules)
      // Close module
      if (M)
        checkResult(cuModuleUnload(M), "Error returned from cuModuleUnload\n");

    for (auto &S : StreamPool)
      S.reset();

    EventPool.clear();

    for (DeviceDataTy &D : DeviceData) {
      // Destroy context
      if (D.Context) {
        checkResult(cuCtxSetCurrent(D.Context),
                    "Error returned from cuCtxSetCurrent\n");
        CUdevice Device;
        checkResult(cuCtxGetDevice(&Device),
                    "Error returned from cuCtxGetDevice\n");
        checkResult(cuDevicePrimaryCtxRelease(Device),
                    "Error returned from cuDevicePrimaryCtxRelease\n");
      }
    }
  }

  // Check whether a given DeviceId is valid
  bool isValidDeviceId(const int DeviceId) const {
    return DeviceId >= 0 && DeviceId < NumberOfDevices;
  }

  int getNumOfDevices() const { return NumberOfDevices; }

  void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; }

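  // Initialize the device: retain its primary context, create its stream pool,
  // and query the launch limits (blocks per grid, threads per block, warp
  // size) that kernel launches are later clamped against.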
  int initDevice(const int DeviceId) {
    CUdevice Device;

    DP("Getting device %d\n", DeviceId);
    CUresult Err = cuDeviceGet(&Device, DeviceId);
    if (!checkResult(Err, "Error returned from cuDeviceGet\n"))
      return OFFLOAD_FAIL;

    // Query the current flags of the primary context and set its flags if
    // it is inactive
    unsigned int FormerPrimaryCtxFlags = 0;
    int FormerPrimaryCtxIsActive = 0;
    Err = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags,
                                     &FormerPrimaryCtxIsActive);
    if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxGetState\n"))
      return OFFLOAD_FAIL;

    if (FormerPrimaryCtxIsActive) {
      DP("The primary context is active, no change to its flags\n");
      if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) !=
          CU_CTX_SCHED_BLOCKING_SYNC)
        DP("Warning: the current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n");
    } else {
      DP("The primary context is inactive, set its flags to "
         "CU_CTX_SCHED_BLOCKING_SYNC\n");
      Err = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC);
      if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxSetFlags\n"))
        return OFFLOAD_FAIL;
    }

    // Retain the per device primary context and save it to use whenever this
    // device is selected.
    Err = cuDevicePrimaryCtxRetain(&DeviceData[DeviceId].Context, Device);
    if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxRetain\n"))
      return OFFLOAD_FAIL;

    Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    // Initialize stream pool
    if (!StreamPool[DeviceId])
      StreamPool[DeviceId] = std::make_unique<StreamPoolTy>(
          AllocatorTy<CUstream>(DeviceData[DeviceId].Context),
          NumInitialStreams);

    // Query attributes to determine number of threads/block and blocks/grid.
    int MaxGridDimX;
    Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
                               Device);
    if (Err != CUDA_SUCCESS) {
      DP("Error getting max grid dimension, using default value %d\n",
         DeviceRTLTy::DefaultNumTeams);
      DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::DefaultNumTeams;
    } else if (MaxGridDimX <= DeviceRTLTy::HardTeamLimit) {
      DP("Using %d CUDA blocks per grid\n", MaxGridDimX);
      DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX;
    } else {
      DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping "
         "at the hard limit\n",
         MaxGridDimX, DeviceRTLTy::HardTeamLimit);
      DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::HardTeamLimit;
    }

    // We are only exploiting threads along the x axis.
    int MaxBlockDimX;
    Err = cuDeviceGetAttribute(&MaxBlockDimX,
                               CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device);
    if (Err != CUDA_SUCCESS) {
      DP("Error getting max block dimension, using default value %d\n",
         DeviceRTLTy::DefaultNumThreads);
      DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads;
    } else {
      DP("Using %d CUDA threads per block\n", MaxBlockDimX);
      DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX;

      if (EnvTeamThreadLimit > 0 &&
          DeviceData[DeviceId].ThreadsPerBlock > EnvTeamThreadLimit) {
        DP("Max CUDA threads per block %d exceeds the thread limit %d set by "
           "OMP_TEAMS_THREAD_LIMIT, capping at the limit\n",
           DeviceData[DeviceId].ThreadsPerBlock, EnvTeamThreadLimit);
        DeviceData[DeviceId].ThreadsPerBlock = EnvTeamThreadLimit;
      }
      if (DeviceData[DeviceId].ThreadsPerBlock > DeviceRTLTy::HardThreadLimit) {
        DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
           "capping at the hard limit\n",
           DeviceData[DeviceId].ThreadsPerBlock, DeviceRTLTy::HardThreadLimit);
        DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit;
      }
    }

    // Get and set warp size
    int WarpSize;
    Err =
        cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device);
    if (Err != CUDA_SUCCESS) {
      DP("Error getting warp size, assuming default value 32\n");
      DeviceData[DeviceId].WarpSize = 32;
    } else {
      DP("Using warp size %d\n", WarpSize);
      DeviceData[DeviceId].WarpSize = WarpSize;
    }

    // Adjust teams to the env variables
    if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) {
      DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
         EnvTeamLimit);
      DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit;
    }

    size_t StackLimit;
    size_t HeapLimit;
    if (const char *EnvStr = getenv("LIBOMPTARGET_STACK_SIZE")) {
      StackLimit = std::stol(EnvStr);
      if (cuCtxSetLimit(CU_LIMIT_STACK_SIZE, StackLimit) != CUDA_SUCCESS)
        return OFFLOAD_FAIL;
    } else {
      if (cuCtxGetLimit(&StackLimit, CU_LIMIT_STACK_SIZE) != CUDA_SUCCESS)
        return OFFLOAD_FAIL;
    }
    if (const char *EnvStr = getenv("LIBOMPTARGET_HEAP_SIZE")) {
      HeapLimit = std::stol(EnvStr);
      if (cuCtxSetLimit(CU_LIMIT_MALLOC_HEAP_SIZE, HeapLimit) != CUDA_SUCCESS)
        return OFFLOAD_FAIL;
    } else {
      if (cuCtxGetLimit(&HeapLimit, CU_LIMIT_MALLOC_HEAP_SIZE) != CUDA_SUCCESS)
        return OFFLOAD_FAIL;
    }

    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
         "Device supports up to %d CUDA blocks and %d threads with a "
         "warp size of %d\n",
         DeviceData[DeviceId].BlocksPerGrid,
         DeviceData[DeviceId].ThreadsPerBlock, DeviceData[DeviceId].WarpSize);
    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
         "Device heap size is %d bytes, device stack size is %d bytes per "
         "thread\n",
         (int)HeapLimit, (int)StackLimit);

    // Set default number of teams
    if (EnvNumTeams > 0) {
      DP("Default number of teams set according to environment %d\n",
         EnvNumTeams);
      DeviceData[DeviceId].NumTeams = EnvNumTeams;
    } else {
      DeviceData[DeviceId].NumTeams = DeviceRTLTy::DefaultNumTeams;
      DP("Default number of teams set according to library's default %d\n",
         DeviceRTLTy::DefaultNumTeams);
    }

    if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) {
      DP("Default number of teams exceeds device limit, capping at %d\n",
         DeviceData[DeviceId].BlocksPerGrid);
      DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid;
    }

    // Set default number of threads
    DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads;
    DP("Default number of threads set according to library's default %d\n",
       DeviceRTLTy::DefaultNumThreads);
    if (DeviceData[DeviceId].NumThreads >
        DeviceData[DeviceId].ThreadsPerBlock) {
      DP("Default number of threads exceeds device limit, capping at %d\n",
         DeviceData[DeviceId].ThreadsPerBlock);
      DeviceData[DeviceId].NumThreads = DeviceData[DeviceId].ThreadsPerBlock;
    }

    return OFFLOAD_SUCCESS;
  }

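  // Load a device image: create a CUDA module from it, resolve every offload
  // entry (globals and kernel functions), read each kernel's execution mode,
  // and send the device environment to the module if it accepts one.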
  __tgt_target_table *loadBinary(const int DeviceId,
                                 const __tgt_device_image *Image) {
    // Set the context we are using
    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return nullptr;

    // Clear the offload table as we are going to create a new one.
    clearOffloadEntriesTable(DeviceId);

    // Create the module and extract the function pointers.
    CUmodule Module;
    DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart));
    Err = cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr);
    if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n"))
      return nullptr;

    DP("CUDA module successfully loaded!\n");

    Modules.push_back(Module);

    // Find the symbols in the module by name.
    const __tgt_offload_entry *HostBegin = Image->EntriesBegin;
    const __tgt_offload_entry *HostEnd = Image->EntriesEnd;

    std::list<KernelTy> &KernelsList = DeviceData[DeviceId].KernelsList;
    for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) {
      if (!E->addr) {
        // We return nullptr when something like this happens, the host should
        // have always something in the address to uniquely identify the target
        // region.
        DP("Invalid binary: host entry '<null>' (size = %zd)...\n", E->size);
        return nullptr;
      }

      if (E->size) {
        __tgt_offload_entry Entry = *E;
        CUdeviceptr CUPtr;
        size_t CUSize;
        Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name);
        // We keep this style here because we need the name
        if (Err != CUDA_SUCCESS) {
          REPORT("Loading global '%s' failed\n", E->name);
          CUDA_ERR_STRING(Err);
          return nullptr;
        }

        if (CUSize != E->size) {
          DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name,
             CUSize, E->size);
          return nullptr;
        }

        DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
           DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr));

        Entry.addr = (void *)(CUPtr);

        // Note: In the current implementation declare target variables
        // can either be link or to. This means that once unified
        // memory is activated via the requires directive, the variable
        // can be used directly from the host in both cases.
        // TODO: when variables types other than to or link are added,
        // the below condition should be changed to explicitly
        // check for to and link variables types:
        // (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags &
        // OMP_DECLARE_TARGET_LINK || e->flags == OMP_DECLARE_TARGET_TO))
        if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
          // If unified memory is present any target link or to variables
          // can access host addresses directly. There is no longer a
          // need for device copies.
          Err = cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *));
          if (!checkResult(Err, "Error returned from cuMemcpyHtoD\n"))
            return nullptr;
          DP("Copy linked variable host address (" DPxMOD
             ") to device address (" DPxMOD ")\n",
             DPxPTR(*((void **)E->addr)), DPxPTR(CUPtr));
        }

        addOffloadEntry(DeviceId, Entry);

        continue;
      }

      CUfunction Func;
      Err = cuModuleGetFunction(&Func, Module, E->name);
      // We keep this style here because we need the name
      if (Err != CUDA_SUCCESS) {
        REPORT("Loading '%s' failed\n", E->name);
        CUDA_ERR_STRING(Err);
        return nullptr;
      }

      DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
         DPxPTR(E - HostBegin), E->name, DPxPTR(Func));

      // Default value GENERIC (in case the symbol is missing from the cubin
      // file).
      llvm::omp::OMPTgtExecModeFlags ExecModeVal =
          llvm::omp::OMP_TGT_EXEC_MODE_GENERIC;
      std::string ExecModeNameStr(E->name);
      ExecModeNameStr += "_exec_mode";
      const char *ExecModeName = ExecModeNameStr.c_str();

      CUdeviceptr ExecModePtr;
      size_t CUSize;
      Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName);
      if (Err == CUDA_SUCCESS) {
        if (CUSize != sizeof(llvm::omp::OMPTgtExecModeFlags)) {
          DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
             ExecModeName, CUSize, sizeof(llvm::omp::OMPTgtExecModeFlags));
          return nullptr;
        }

        Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize);
        if (Err != CUDA_SUCCESS) {
          REPORT("Error when copying data from device to host. Pointers: "
                 "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
                 DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize);
          CUDA_ERR_STRING(Err);
          return nullptr;
        }
      } else {
        DP("Loading global exec_mode '%s' - symbol missing, using default "
           "value GENERIC (1)\n",
           ExecModeName);
      }

      KernelsList.emplace_back(Func, ExecModeVal);

      __tgt_offload_entry Entry = *E;
      Entry.addr = &KernelsList.back();
      addOffloadEntry(DeviceId, Entry);
    }

    // Send device environment data to the device
    {
      // TODO: The device ID used here is not the real device ID used by OpenMP.
      DeviceEnvironmentTy DeviceEnv{0, static_cast<uint32_t>(NumberOfDevices),
                                    static_cast<uint32_t>(DeviceId),
                                    static_cast<uint32_t>(DynamicMemorySize)};

      if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG"))
        DeviceEnv.DebugKind = std::stoi(EnvStr);

      const char *DeviceEnvName = "omptarget_device_environment";
      CUdeviceptr DeviceEnvPtr;
      size_t CUSize;

      Err = cuModuleGetGlobal(&DeviceEnvPtr, &CUSize, Module, DeviceEnvName);
      if (Err == CUDA_SUCCESS) {
        if (CUSize != sizeof(DeviceEnv)) {
          REPORT(
              "Global device_environment '%s' - size mismatch (%zu != %zu)\n",
              DeviceEnvName, CUSize, sizeof(DeviceEnv));
          return nullptr;
        }

        Err = cuMemcpyHtoD(DeviceEnvPtr, &DeviceEnv, CUSize);
        if (Err != CUDA_SUCCESS) {
          REPORT("Error when copying data from host to device. Pointers: "
                 "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
                 DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize);
          CUDA_ERR_STRING(Err);
          return nullptr;
        }

        DP("Sending global device environment data %zu bytes\n", CUSize);
      } else {
        DP("Finding global device environment '%s' - symbol missing.\n",
           DeviceEnvName);
        DP("Continue, considering this is a device RTL which does not accept "
           "environment setting.\n");
      }
    }

    return getOffloadEntriesTable(DeviceId);
  }

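  // Allocate memory of the given kind. Default and device allocations go
  // through the memory manager when it is enabled; host-pinned and shared
  // allocations always go straight to the device allocator.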
  void *dataAlloc(const int DeviceId, const int64_t Size,
                  const TargetAllocTy Kind) {
    switch (Kind) {
    case TARGET_ALLOC_DEFAULT:
    case TARGET_ALLOC_DEVICE:
      if (UseMemoryManager)
        return MemoryManagers[DeviceId]->allocate(Size, nullptr);
      else
        return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
    case TARGET_ALLOC_HOST:
    case TARGET_ALLOC_SHARED:
      return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
    }

    REPORT("Invalid target data allocation kind or requested allocator not "
           "implemented yet\n");

    return nullptr;
  }

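  // Asynchronously copy Size bytes from host to device on the stream
  // associated with AsyncInfo.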
  int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr,
                 const int64_t Size, __tgt_async_info *AsyncInfo) const {
    assert(AsyncInfo && "AsyncInfo is nullptr");

    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    CUstream Stream = getStream(DeviceId, AsyncInfo);

    Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream);
    if (Err != CUDA_SUCCESS) {
      DP("Error when copying data from host to device. Pointers: host "
         "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
         DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
      CUDA_ERR_STRING(Err);
      return OFFLOAD_FAIL;
    }

    return OFFLOAD_SUCCESS;
  }

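  // Asynchronously copy Size bytes from device to host on the stream
  // associated with AsyncInfo.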
  int dataRetrieve(const int DeviceId, void *HstPtr, const void *TgtPtr,
                   const int64_t Size, __tgt_async_info *AsyncInfo) const {
    assert(AsyncInfo && "AsyncInfo is nullptr");

    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    CUstream Stream = getStream(DeviceId, AsyncInfo);

    Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
    if (Err != CUDA_SUCCESS) {
      DP("Error when copying data from device to host. Pointers: host "
         "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
         DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
      CUDA_ERR_STRING(Err);
      return OFFLOAD_FAIL;
    }

    return OFFLOAD_SUCCESS;
  }

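  // Copy data between two devices. For distinct devices a peer-to-peer copy is
  // attempted first; on any failure the code falls back to a plain
  // device-to-device copy on the source device's stream.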
  int dataExchange(int SrcDevId, const void *SrcPtr, int DstDevId, void *DstPtr,
                   int64_t Size, __tgt_async_info *AsyncInfo) const {
    assert(AsyncInfo && "AsyncInfo is nullptr");

    CUresult Err = cuCtxSetCurrent(DeviceData[SrcDevId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    CUstream Stream = getStream(SrcDevId, AsyncInfo);

    // If these are two different devices, we try peer-to-peer copy first
    if (SrcDevId != DstDevId) {
      int CanAccessPeer = 0;
      Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId);
      if (Err != CUDA_SUCCESS) {
        REPORT("Error returned from cuDeviceCanAccessPeer. src = %" PRId32
               ", dst = %" PRId32 "\n",
               SrcDevId, DstDevId);
        CUDA_ERR_STRING(Err);
        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
      }

      if (!CanAccessPeer) {
        DP("P2P memcpy not supported so fall back to D2D memcpy\n");
        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
      }

      Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0);
      if (Err != CUDA_SUCCESS) {
        REPORT("Error returned from cuCtxEnablePeerAccess. src = %" PRId32
               ", dst = %" PRId32 "\n",
               SrcDevId, DstDevId);
        CUDA_ERR_STRING(Err);
        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
      }

      Err = cuMemcpyPeerAsync((CUdeviceptr)DstPtr, DeviceData[DstDevId].Context,
                              (CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context,
                              Size, Stream);
      if (Err == CUDA_SUCCESS)
        return OFFLOAD_SUCCESS;

      DP("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD
         ", src_id =%" PRId32 ", dst_ptr = " DPxMOD ", dst_id =%" PRId32 "\n",
         DPxPTR(SrcPtr), SrcDevId, DPxPTR(DstPtr), DstDevId);
      CUDA_ERR_STRING(Err);
    }

    return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
  }

  int dataDelete(const int DeviceId, void *TgtPtr) {
    if (UseMemoryManager)
      return MemoryManagers[DeviceId]->free(TgtPtr);

    return DeviceAllocators[DeviceId].free(TgtPtr);
  }

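  // Launch a target region: derive the block and grid geometry from the
  // kernel's execution mode, the device limits, and the loop trip count, then
  // enqueue the kernel on the stream associated with AsyncInfo.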
  int runTargetTeamRegion(const int DeviceId, void *TgtEntryPtr, void **TgtArgs,
                          ptrdiff_t *TgtOffsets, const int ArgNum,
                          const int TeamNum, const int ThreadLimit,
                          const unsigned int LoopTripCount,
                          __tgt_async_info *AsyncInfo) const {
    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    // All args are references.
    std::vector<void *> Args(ArgNum);
    std::vector<void *> Ptrs(ArgNum);

    for (int I = 0; I < ArgNum; ++I) {
      Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
      Args[I] = &Ptrs[I];
    }

    KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr);

    const bool IsSPMDGenericMode =
        KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD;
    const bool IsSPMDMode =
        KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_SPMD;
    const bool IsGenericMode =
        KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC;

    int CudaThreadsPerBlock;
    if (ThreadLimit > 0) {
      DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
      CudaThreadsPerBlock = ThreadLimit;
      // Add master warp if necessary
      if (IsGenericMode) {
        DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize);
        CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
      }
    } else {
      DP("Setting CUDA threads per block to default %d\n",
         DeviceData[DeviceId].NumThreads);
      CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads;
    }

    if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) {
      DP("Threads per block capped at device limit %d\n",
         DeviceData[DeviceId].ThreadsPerBlock);
      CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock;
    }

    if (!KernelInfo->MaxThreadsPerBlock) {
      Err = cuFuncGetAttribute(&KernelInfo->MaxThreadsPerBlock,
                               CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                               KernelInfo->Func);
      if (!checkResult(Err, "Error returned from cuFuncGetAttribute\n"))
        return OFFLOAD_FAIL;
    }

    if (KernelInfo->MaxThreadsPerBlock < CudaThreadsPerBlock) {
      DP("Threads per block capped at kernel limit %d\n",
         KernelInfo->MaxThreadsPerBlock);
      CudaThreadsPerBlock = KernelInfo->MaxThreadsPerBlock;
    }

    unsigned int CudaBlocksPerGrid;
    if (TeamNum <= 0) {
      if (LoopTripCount > 0 && EnvNumTeams < 0) {
        if (IsSPMDGenericMode) {
          // If we reach this point, then we are executing a kernel that was
          // transformed from Generic-mode to SPMD-mode. This kernel has
          // SPMD-mode execution, but needs its blocks to be scheduled
          // differently because the current loop trip count only applies to
          // the `teams distribute` region and would create too few blocks
          // using the regular SPMD-mode method.
          CudaBlocksPerGrid = LoopTripCount;
        } else if (IsSPMDMode) {
          // We have a combined construct, i.e. `target teams distribute
          // parallel for [simd]`. We launch enough teams that each thread
          // executes one iteration of the loop, rounding up to the nearest
          // integer.
          CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1;
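          // E.g., with LoopTripCount = 1000 and CudaThreadsPerBlock = 128 the
          // formula above gives ((1000 - 1) / 128) + 1 = 8 blocks; 7 blocks
          // would only cover 896 of the 1000 iterations.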
        } else if (IsGenericMode) {
          // If we reach this point, then we have a non-combined construct, i.e.
          // `teams distribute` with a nested `parallel for` and each team is
          // assigned one iteration of the `distribute` loop. E.g.:
          //
          // #pragma omp target teams distribute
          // for(...loop_tripcount...) {
          //   #pragma omp parallel for
          //   for(...) {}
          // }
          //
          // Threads within a team will execute the iterations of the `parallel`
          // loop.
          CudaBlocksPerGrid = LoopTripCount;
        } else {
          REPORT("Unknown execution mode: %d\n",
                 static_cast<int8_t>(KernelInfo->ExecutionMode));
          return OFFLOAD_FAIL;
        }
        DP("Using %d teams due to loop trip count %" PRIu32
           " and number of threads per block %d\n",
           CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock);
      } else {
        DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams);
        CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams;
      }
    } else if (TeamNum > DeviceData[DeviceId].BlocksPerGrid) {
      DP("Capping number of teams to team limit %d\n",
         DeviceData[DeviceId].BlocksPerGrid);
      CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid;
    } else {
      DP("Using requested number of teams %d\n", TeamNum);
      CudaBlocksPerGrid = TeamNum;
    }

    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
         "Launching kernel %s with %d blocks and %d threads in %s mode\n",
         (getOffloadEntry(DeviceId, TgtEntryPtr))
             ? getOffloadEntry(DeviceId, TgtEntryPtr)->name
             : "(null)",
         CudaBlocksPerGrid, CudaThreadsPerBlock,
         (!IsSPMDMode ? (IsGenericMode ? "Generic" : "SPMD-Generic") : "SPMD"));

    CUstream Stream = getStream(DeviceId, AsyncInfo);
    Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1,
                         /* gridDimZ */ 1, CudaThreadsPerBlock,
                         /* blockDimY */ 1, /* blockDimZ */ 1,
                         DynamicMemorySize, Stream, &Args[0], nullptr);
    if (!checkResult(Err, "Error returned from cuLaunchKernel\n"))
      return OFFLOAD_FAIL;

    DP("Launch of entry point at " DPxMOD " successful!\n",
       DPxPTR(TgtEntryPtr));

    return OFFLOAD_SUCCESS;
  }

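  // Block until all operations enqueued on the stream in AsyncInfo have
  // completed, then return the stream to the pool and reset the queue.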
  int synchronize(const int DeviceId, __tgt_async_info *AsyncInfo) const {
    CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue);
    CUresult Err = cuStreamSynchronize(Stream);

    // Once the stream is synchronized, return it to stream pool and reset
    // AsyncInfo. This is to make sure the synchronization only works for its
    // own tasks.
    StreamPool[DeviceId]->release(reinterpret_cast<CUstream>(AsyncInfo->Queue));
    AsyncInfo->Queue = nullptr;

    if (Err != CUDA_SUCCESS) {
      DP("Error when synchronizing stream. stream = " DPxMOD
         ", async info ptr = " DPxMOD "\n",
         DPxPTR(Stream), DPxPTR(AsyncInfo));
      CUDA_ERR_STRING(Err);
    }
    return (Err == CUDA_SUCCESS) ? OFFLOAD_SUCCESS : OFFLOAD_FAIL;
  }

  void printDeviceInfo(int32_t device_id) {
    char TmpChar[1000];
    std::string TmpStr;
    size_t TmpSt;
    int TmpInt, TmpInt2, TmpInt3;

    CUdevice Device;
    checkResult(cuDeviceGet(&Device, device_id),
                "Error returned from cuDeviceGet\n");
1231 
1232     cuDriverGetVersion(&TmpInt);
1233     printf("    CUDA Driver Version: \t\t%d \n", TmpInt);
1234     printf("    CUDA Device Number: \t\t%d \n", device_id);
1235     checkResult(cuDeviceGetName(TmpChar, 1000, Device),
1236                 "Error returned from cuDeviceGetName\n");
1237     printf("    Device Name: \t\t\t%s \n", TmpChar);
1238     checkResult(cuDeviceTotalMem(&TmpSt, Device),
1239                 "Error returned from cuDeviceTotalMem\n");
1240     printf("    Global Memory Size: \t\t%zu bytes \n", TmpSt);
1241     checkResult(cuDeviceGetAttribute(
1242                     &TmpInt, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, Device),
1243                 "Error returned from cuDeviceGetAttribute\n");
1244     printf("    Number of Multiprocessors: \t\t%d \n", TmpInt);
1245     checkResult(
1246         cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, Device),
1247         "Error returned from cuDeviceGetAttribute\n");
1248     printf("    Concurrent Copy and Execution: \t%s \n", BOOL2TEXT(TmpInt));
1249     checkResult(cuDeviceGetAttribute(
1250                     &TmpInt, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, Device),
1251                 "Error returned from cuDeviceGetAttribute\n");
1252     printf("    Total Constant Memory: \t\t%d bytes\n", TmpInt);
1253     checkResult(
1254         cuDeviceGetAttribute(
1255             &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, Device),
1256         "Error returned from cuDeviceGetAttribute\n");
1257     printf("    Max Shared Memory per Block: \t%d bytes \n", TmpInt);
1258     checkResult(
1259         cuDeviceGetAttribute(
1260             &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, Device),
1261         "Error returned from cuDeviceGetAttribute\n");
1262     printf("    Registers per Block: \t\t%d \n", TmpInt);
1263     checkResult(
1264         cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device),
1265         "Error returned from cuDeviceGetAttribute\n");
1266     printf("    Warp Size: \t\t\t\t%d Threads \n", TmpInt);
1267     checkResult(cuDeviceGetAttribute(
1268                     &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, Device),
1269                 "Error returned from cuDeviceGetAttribute\n");
1270     printf("    Maximum Threads per Block: \t\t%d \n", TmpInt);
1271     checkResult(cuDeviceGetAttribute(
1272                     &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device),
1273                 "Error returned from cuDeviceGetAttribute\n");
1274     checkResult(cuDeviceGetAttribute(
1275                     &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, Device),
1276                 "Error returned from cuDeviceGetAttribute\n");
1277     checkResult(cuDeviceGetAttribute(
1278                     &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, Device),
1279                 "Error returned from cuDeviceGetAttribute\n");
1280     printf("    Maximum Block Dimensions: \t\t%d, %d, %d \n", TmpInt, TmpInt2,
1281            TmpInt3);
1282     checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, Device),
                "Error returned from cuDeviceGetAttribute\n");
    checkResult(cuDeviceGetAttribute(
                    &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, Device),
                "Error returned from cuDeviceGetAttribute\n");
    checkResult(cuDeviceGetAttribute(
                    &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Maximum Grid Dimensions: \t\t%d x %d x %d \n", TmpInt, TmpInt2,
           TmpInt3);
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_PITCH, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Maximum Memory Pitch: \t\t%d bytes \n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Texture Alignment: \t\t\t%d bytes \n", TmpInt);
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Clock Rate: \t\t\t%d kHz\n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Execution Timeout: \t\t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_INTEGRATED, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Integrated Device: \t\t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Can Map Host Memory: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, Device),
        "Error returned from cuDeviceGetAttribute\n");
    if (TmpInt == CU_COMPUTEMODE_DEFAULT)
      TmpStr = "DEFAULT";
    else if (TmpInt == CU_COMPUTEMODE_PROHIBITED)
      TmpStr = "PROHIBITED";
    else if (TmpInt == CU_COMPUTEMODE_EXCLUSIVE_PROCESS)
      TmpStr = "EXCLUSIVE PROCESS";
    else
      TmpStr = "unknown";
    printf("    Compute Mode: \t\t\t%s \n", TmpStr.c_str());
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Concurrent Kernels: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    ECC Enabled: \t\t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Memory Clock Rate: \t\t\t%d kHz\n", TmpInt);
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Memory Bus Width: \t\t\t%d bits\n", TmpInt);
    checkResult(cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE,
                                     Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    L2 Cache Size: \t\t\t%d bytes \n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
                    Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Max Threads Per SMP: \t\t%d \n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Async Engines: \t\t\t%s (%d) \n", BOOL2TEXT(TmpInt), TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Unified Addressing: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Managed Memory: \t\t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Concurrent Managed Memory: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Preemption Supported: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Cooperative Launch: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Multi-Device Boards: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, Device),
        "Error returned from cuDeviceGetAttribute\n");
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt2, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Compute Capabilities: \t\t%d.%d \n", TmpInt, TmpInt2);
  }

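  /// Hand back an event through \p P as an opaque handle. Acquiring from
  /// EventPool recycles CUevent objects, which is expected to be cheaper than
  /// asking the driver for a fresh event on every call.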
  int createEvent(void **P) {
    CUevent Event = nullptr;
    if (EventPool.acquire(Event) != OFFLOAD_SUCCESS)
      return OFFLOAD_FAIL;
    *P = Event;
    return OFFLOAD_SUCCESS;
  }

  int destroyEvent(void *EventPtr) {
    EventPool.release(reinterpret_cast<CUevent>(EventPtr));
    return OFFLOAD_SUCCESS;
  }

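  /// Make the queue attached to \p AsyncInfo wait for \p EventPtr. This only
  /// orders subsequent work on the stream behind the event; it does not block
  /// the calling host thread.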
  int waitEvent(const int DeviceId, __tgt_async_info *AsyncInfo,
                void *EventPtr) const {
    CUstream Stream = getStream(DeviceId, AsyncInfo);
    CUevent Event = reinterpret_cast<CUevent>(EventPtr);

    // We don't use CU_EVENT_WAIT_DEFAULT here because it is only available
    // starting from a specific CUDA version, where it is defined as 0x0. In
    // earlier versions, per the CUDA API documentation, this argument must be
    // 0x0.
    CUresult Err = cuStreamWaitEvent(Stream, Event, 0);
    if (Err != CUDA_SUCCESS) {
      DP("Error when waiting on event. stream = " DPxMOD ", event = " DPxMOD
         "\n",
         DPxPTR(Stream), DPxPTR(Event));
      CUDA_ERR_STRING(Err);
      return OFFLOAD_FAIL;
    }

    return OFFLOAD_SUCCESS;
  }
};

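// Single global instance of the plugin. It is constructed when this shared
// library is loaded, and the C entry points below simply forward to it.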
DeviceRTLTy DeviceRTL;
} // namespace

// Exposed library API functions
#ifdef __cplusplus
extern "C" {
#endif

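// The C symbols below implement the plugin interface declared in
// omptargetplugin.h. libomptarget dlopen()s this plugin and resolves each
// entry point with dlsym(); a minimal sketch of that handshake (identifiers
// are illustrative, not the host runtime's actual code):
//
//   void *Handle = dlopen("libomptarget.rtl.cuda.so", RTLD_NOW);
//   auto *IsValid = reinterpret_cast<int32_t (*)(__tgt_device_image *)>(
//       dlsym(Handle, "__tgt_rtl_is_valid_binary"));
//   if (IsValid && IsValid(Image)) { /* claim the image for this plugin */ }
//
// __tgt_rtl_is_valid_binary accepts an image iff its ELF e_machine field is
// EM_CUDA (190), the value used for NVIDIA cubins.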
int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
  return elf_check_machine(image, /* EM_CUDA */ 190);
}

int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); }

int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
  DP("Init requires flags to %" PRId64 "\n", RequiresFlags);
  DeviceRTL.setRequiresFlag(RequiresFlags);
  return RequiresFlags;
}

int32_t __tgt_rtl_is_data_exchangable(int32_t src_dev_id, int32_t dst_dev_id) {
  if (DeviceRTL.isValidDeviceId(src_dev_id) &&
      DeviceRTL.isValidDeviceId(dst_dev_id))
    return 1;

  return 0;
}

int32_t __tgt_rtl_init_device(int32_t device_id) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.initDevice(device_id);
}

__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
                                          __tgt_device_image *image) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.loadBinary(device_id, image);
}

void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *,
                           int32_t kind) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.dataAlloc(device_id, size, (TargetAllocTy)kind);
}

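// Each synchronous data-movement entry point below is a thin wrapper: it
// creates a stack-local __tgt_async_info, forwards to its *_async sibling,
// and then blocks in __tgt_rtl_synchronize on the queue the async call
// acquired. Only the async variants talk to the device layer directly.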
int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
                              int64_t size) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_data_submit_async(device_id, tgt_ptr, hst_ptr,
                                                 size, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &AsyncInfo);
}

int32_t __tgt_rtl_data_submit_async(int32_t device_id, void *tgt_ptr,
                                    void *hst_ptr, int64_t size,
                                    __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");

  return DeviceRTL.dataSubmit(device_id, tgt_ptr, hst_ptr, size,
                              async_info_ptr);
}

int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
                                int64_t size) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_data_retrieve_async(device_id, hst_ptr, tgt_ptr,
                                                   size, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &AsyncInfo);
}

int32_t __tgt_rtl_data_retrieve_async(int32_t device_id, void *hst_ptr,
                                      void *tgt_ptr, int64_t size,
                                      __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");

  return DeviceRTL.dataRetrieve(device_id, hst_ptr, tgt_ptr, size,
                                async_info_ptr);
}

int32_t __tgt_rtl_data_exchange_async(int32_t src_dev_id, void *src_ptr,
                                      int32_t dst_dev_id, void *dst_ptr,
                                      int64_t size,
                                      __tgt_async_info *AsyncInfo) {
  assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
  assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");
  assert(AsyncInfo && "AsyncInfo is nullptr");

  return DeviceRTL.dataExchange(src_dev_id, src_ptr, dst_dev_id, dst_ptr, size,
                                AsyncInfo);
}

int32_t __tgt_rtl_data_exchange(int32_t src_dev_id, void *src_ptr,
                                int32_t dst_dev_id, void *dst_ptr,
                                int64_t size) {
  assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
  assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_data_exchange_async(
      src_dev_id, src_ptr, dst_dev_id, dst_ptr, size, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(src_dev_id, &AsyncInfo);
}
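// Note that the synchronous exchange above synchronizes on src_dev_id's
// queue, which is where dataExchange is expected to have enqueued the
// device-to-device copy.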

int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.dataDelete(device_id, tgt_ptr);
}

int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
                                         void **tgt_args,
                                         ptrdiff_t *tgt_offsets,
                                         int32_t arg_num, int32_t team_num,
                                         int32_t thread_limit,
                                         uint64_t loop_tripcount) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_run_target_team_region_async(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
      thread_limit, loop_tripcount, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &AsyncInfo);
}

int32_t __tgt_rtl_run_target_team_region_async(
    int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
    ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
    int32_t thread_limit, uint64_t loop_tripcount,
    __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.runTargetTeamRegion(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
      thread_limit, loop_tripcount, async_info_ptr);
}

int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
                                    void **tgt_args, ptrdiff_t *tgt_offsets,
                                    int32_t arg_num) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_run_target_region_async(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &AsyncInfo);
}

int32_t __tgt_rtl_run_target_region_async(int32_t device_id,
                                          void *tgt_entry_ptr, void **tgt_args,
                                          ptrdiff_t *tgt_offsets,
                                          int32_t arg_num,
                                          __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return __tgt_rtl_run_target_team_region_async(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num,
      /* team_num */ 1, /* thread_limit */ 1, /* loop_tripcount */ 0,
      async_info_ptr);
}
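// A non-team target region is treated as a team region with a single team, a
// thread limit of one, and a zero trip count, so the entry point above just
// forwards those constants.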

int32_t __tgt_rtl_synchronize(int32_t device_id,
                              __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");
  assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr");

  return DeviceRTL.synchronize(device_id, async_info_ptr);
}
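// Note: the call above blocks the host until every operation previously
// enqueued on async_info_ptr->Queue has finished; on success the underlying
// stream is expected to be returned to the stream pool for reuse.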

void __tgt_rtl_set_info_flag(uint32_t NewInfoLevel) {
  std::atomic<uint32_t> &InfoLevel = getInfoLevelInternal();
  InfoLevel.store(NewInfoLevel);
}

void __tgt_rtl_print_device_info(int32_t device_id) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  DeviceRTL.printDeviceInfo(device_id);
}

int32_t __tgt_rtl_create_event(int32_t device_id, void **event) {
  assert(event && "event is nullptr");
  return DeviceRTL.createEvent(event);
}

int32_t __tgt_rtl_record_event(int32_t device_id, void *event_ptr,
                               __tgt_async_info *async_info_ptr) {
  assert(async_info_ptr && "async_info_ptr is nullptr");
  assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr");
  assert(event_ptr && "event_ptr is nullptr");

  return recordEvent(event_ptr, async_info_ptr);
}
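// record/wait pair up to order work across queues without blocking the host.
// A hedged sketch of the intended call sequence (the device id and async-info
// objects are illustrative):
//
//   void *Ev = nullptr;
//   __tgt_rtl_create_event(DevId, &Ev);
//   __tgt_rtl_record_event(DevId, Ev, &ProducerInfo); // mark producer point
//   __tgt_rtl_wait_event(DevId, Ev, &ConsumerInfo);   // consumer waits on it
//   __tgt_rtl_sync_event(DevId, Ev);                  // optional host wait
//   __tgt_rtl_destroy_event(DevId, Ev);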

int32_t __tgt_rtl_wait_event(int32_t device_id, void *event_ptr,
                             __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");
  assert(event_ptr && "event is nullptr");

  return DeviceRTL.waitEvent(device_id, async_info_ptr, event_ptr);
}

int32_t __tgt_rtl_sync_event(int32_t device_id, void *event_ptr) {
  assert(event_ptr && "event is nullptr");

  return syncEvent(event_ptr);
}

int32_t __tgt_rtl_destroy_event(int32_t device_id, void *event_ptr) {
  assert(event_ptr && "event is nullptr");

  return DeviceRTL.destroyEvent(event_ptr);
}

#ifdef __cplusplus
}
#endif