1 //===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // RTL for CUDA machine
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include <algorithm>
14 #include <cassert>
15 #include <cstddef>
16 #include <cuda.h>
17 #include <list>
18 #include <memory>
19 #include <mutex>
20 #include <string>
21 #include <unordered_map>
22 #include <vector>
23 
24 #include "Debug.h"
25 #include "DeviceEnvironment.h"
26 #include "omptarget.h"
27 #include "omptargetplugin.h"
28 
29 #define TARGET_NAME CUDA
30 #define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"
31 
32 #include "MemoryManager.h"
33 
34 #include "llvm/Frontend/OpenMP/OMPConstants.h"
35 
36 // Utility for retrieving and printing CUDA error string.
37 #ifdef OMPTARGET_DEBUG
38 #define CUDA_ERR_STRING(err)                                                   \
39   do {                                                                         \
40     if (getDebugLevel() > 0) {                                                 \
41       const char *errStr = nullptr;                                            \
42       CUresult errStr_status = cuGetErrorString(err, &errStr);                 \
43       if (errStr_status == CUDA_ERROR_INVALID_VALUE)                           \
44         REPORT("Unrecognized CUDA error code: %d\n", err);                     \
45       else if (errStr_status == CUDA_SUCCESS)                                  \
46         REPORT("CUDA error is: %s\n", errStr);                                 \
47       else {                                                                   \
48         REPORT("Unresolved CUDA error code: %d\n", err);                       \
49         REPORT("Unsuccessful cuGetErrorString return status: %d\n",            \
50                errStr_status);                                                 \
51       }                                                                        \
52     } else {                                                                   \
53       const char *errStr = nullptr;                                            \
54       CUresult errStr_status = cuGetErrorString(err, &errStr);                 \
55       if (errStr_status == CUDA_SUCCESS)                                       \
56         REPORT("%s \n", errStr);                                               \
57     }                                                                          \
58   } while (false)
59 #else // OMPTARGET_DEBUG
60 #define CUDA_ERR_STRING(err)                                                   \
61   do {                                                                         \
62     const char *errStr = nullptr;                                              \
63     CUresult errStr_status = cuGetErrorString(err, &errStr);                   \
64     if (errStr_status == CUDA_SUCCESS)                                         \
65       REPORT("%s \n", errStr);                                                 \
66   } while (false)
67 #endif // OMPTARGET_DEBUG
68 
69 #define BOOL2TEXT(b) ((b) ? "Yes" : "No")
70 
71 #include "elf_common.h"
72 
73 /// Keep entries table per device.
74 struct FuncOrGblEntryTy {
75   __tgt_target_table Table;
76   std::vector<__tgt_offload_entry> Entries;
77 };
78 
79 /// Use a single entity to encode a kernel and a set of flags.
80 struct KernelTy {
81   CUfunction Func;
82 
83   // execution mode of kernel
84   llvm::omp::OMPTgtExecModeFlags ExecutionMode;
85 
86   /// Maximal number of threads per block for this kernel.
87   int MaxThreadsPerBlock = 0;
88 
89   KernelTy(CUfunction _Func, llvm::omp::OMPTgtExecModeFlags _ExecutionMode)
90       : Func(_Func), ExecutionMode(_ExecutionMode) {}
91 };
92 
93 namespace {
94 bool checkResult(CUresult Err, const char *ErrMsg) {
95   if (Err == CUDA_SUCCESS)
96     return true;
97 
98   REPORT("%s", ErrMsg);
99   CUDA_ERR_STRING(Err);
100   return false;
101 }
102 
103 int memcpyDtoD(const void *SrcPtr, void *DstPtr, int64_t Size,
104                CUstream Stream) {
105   CUresult Err =
106       cuMemcpyDtoDAsync((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, Size, Stream);
107 
108   if (Err != CUDA_SUCCESS) {
109     DP("Error when copying data from device to device. Pointers: src "
110        "= " DPxMOD ", dst = " DPxMOD ", size = %" PRId64 "\n",
111        DPxPTR(SrcPtr), DPxPTR(DstPtr), Size);
112     CUDA_ERR_STRING(Err);
113     return OFFLOAD_FAIL;
114   }
115 
116   return OFFLOAD_SUCCESS;
117 }
118 
119 int recordEvent(void *EventPtr, __tgt_async_info *AsyncInfo) {
120   CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue);
121   CUevent Event = reinterpret_cast<CUevent>(EventPtr);
122 
123   CUresult Err = cuEventRecord(Event, Stream);
124   if (Err != CUDA_SUCCESS) {
125     DP("Error when recording event. stream = " DPxMOD ", event = " DPxMOD "\n",
126        DPxPTR(Stream), DPxPTR(Event));
127     CUDA_ERR_STRING(Err);
128     return OFFLOAD_FAIL;
129   }
130 
131   return OFFLOAD_SUCCESS;
132 }
133 
134 int syncEvent(void *EventPtr) {
135   CUevent Event = reinterpret_cast<CUevent>(EventPtr);
136 
137   CUresult Err = cuEventSynchronize(Event);
138   if (Err != CUDA_SUCCESS) {
139     DP("Error when syncing event = " DPxMOD "\n", DPxPTR(Event));
140     CUDA_ERR_STRING(Err);
141     return OFFLOAD_FAIL;
142   }
143 
144   return OFFLOAD_SUCCESS;
145 }
146 
147 namespace {
148 
149 // Structure contains per-device data
150 struct DeviceDataTy {
151   /// List that contains all the kernels.
152   std::list<KernelTy> KernelsList;
153 
154   std::list<FuncOrGblEntryTy> FuncGblEntries;
155 
156   CUcontext Context = nullptr;
157   // Device properties
158   int ThreadsPerBlock = 0;
159   int BlocksPerGrid = 0;
160   int WarpSize = 0;
161   // OpenMP properties
162   int NumTeams = 0;
163   int NumThreads = 0;
164 };
165 
166 /// Resource allocator where \p T is the resource type.
167 /// Functions \p create and \p destroy return OFFLOAD_SUCCESS and OFFLOAD_FAIL
168 /// accordingly. The implementation should not raise any exception.
169 template <typename T> struct AllocatorTy {
170   using ElementTy = T;
171   virtual ~AllocatorTy() {}
172 
173   /// Create a resource and assign to R.
174   virtual int create(T &R) noexcept = 0;
175   /// Destroy the resource.
176   virtual int destroy(T) noexcept = 0;
177 };
178 
179 /// Allocator for CUstream.
180 struct StreamAllocatorTy final : public AllocatorTy<CUstream> {
181   /// See AllocatorTy<T>::create.
182   int create(CUstream &Stream) noexcept override {
183     if (!checkResult(cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING),
184                      "Error returned from cuStreamCreate\n"))
185       return OFFLOAD_FAIL;
186 
187     return OFFLOAD_SUCCESS;
188   }
189 
190   /// See AllocatorTy<T>::destroy.
191   int destroy(CUstream Stream) noexcept override {
192     if (!checkResult(cuStreamDestroy(Stream),
193                      "Error returned from cuStreamDestroy\n"))
194       return OFFLOAD_FAIL;
195 
196     return OFFLOAD_SUCCESS;
197   }
198 };
199 
200 /// Allocator for CUevent.
201 struct EventAllocatorTy final : public AllocatorTy<CUevent> {
202   /// See AllocatorTy<T>::create.
203   int create(CUevent &Event) noexcept override {
204     if (!checkResult(cuEventCreate(&Event, CU_EVENT_DEFAULT),
205                      "Error returned from cuEventCreate\n"))
206       return OFFLOAD_FAIL;
207 
208     return OFFLOAD_SUCCESS;
209   }
210 
211   /// See AllocatorTy<T>::destroy.
212   int destroy(CUevent Event) noexcept override {
213     if (!checkResult(cuEventDestroy(Event),
214                      "Error returned from cuEventDestroy\n"))
215       return OFFLOAD_FAIL;
216 
217     return OFFLOAD_SUCCESS;
218   }
219 };
220 
221 /// A generic pool of resources where \p T is the resource type.
222 /// \p T should be copyable as the object is stored in \p std::vector .
223 template <typename AllocTy> class ResourcePoolTy {
224   using ElementTy = typename AllocTy::ElementTy;
225   /// Index of the next available resource.
226   size_t Next = 0;
227   /// Mutex to guard the pool.
228   std::mutex Mutex;
229   /// Pool of resources. The difference between \p Resources and \p Pool is,
230   /// when a resource is acquired and released, it is all on \p Resources. When
231   /// a batch of new resources are needed, they are both added to \p Resources
232   /// and \p Pool. The reason for this setting is, \p Resources could contain
233   /// redundant elements because resources are not released, which can cause
234   /// double free. This setting makes sure that \p Pool always has every
235   /// resource allocated from the device.
236   std::vector<ElementTy> Resources;
237   std::vector<ElementTy> Pool;
238   /// A reference to the corresponding allocator.
239   AllocTy Allocator;
240 
241   /// If `Resources` is used up, we will fill in more resources. It assumes that
242   /// the new size `Size` should be always larger than the current size.
243   bool resize(size_t Size) {
244     assert(Resources.size() == Pool.size() && "size mismatch");
245     auto CurSize = Resources.size();
246     assert(Size > CurSize && "Unexpected smaller size");
247     Pool.reserve(Size);
248     Resources.reserve(Size);
249     for (auto I = CurSize; I < Size; ++I) {
250       ElementTy NewItem;
251       int Ret = Allocator.create(NewItem);
252       if (Ret != OFFLOAD_SUCCESS)
253         return false;
254       Pool.push_back(NewItem);
255       Resources.push_back(NewItem);
256     }
257     return true;
258   }
259 
260 public:
261   ResourcePoolTy(AllocTy &&A, size_t Size = 0) noexcept
262       : Allocator(std::move(A)) {
263     if (Size)
264       (void)resize(Size);
265   }
266 
267   ~ResourcePoolTy() noexcept { clear(); }
268 
269   /// Get a resource from pool. `Next` always points to the next available
270   /// resource. That means, `[0, next-1]` have been assigned, and `[id,]` are
271   /// still available. If there is no resource left, we will ask for more. Each
272   /// time a resource is assigned, the id will increase one.
273   /// xxxxxs+++++++++
274   ///      ^
275   ///      Next
276   /// After assignment, the pool becomes the following and s is assigned.
277   /// xxxxxs+++++++++
278   ///       ^
279   ///       Next
280   int acquire(ElementTy &R) noexcept {
281     std::lock_guard<std::mutex> LG(Mutex);
282     if (Next == Resources.size()) {
283       auto NewSize = Resources.size() ? Resources.size() * 2 : 1;
284       if (!resize(NewSize))
285         return OFFLOAD_FAIL;
286     }
287 
288     assert(Next < Resources.size());
289 
290     R = Resources[Next++];
291 
292     return OFFLOAD_SUCCESS;
293   }
294 
295   /// Return the resource back to the pool. When we return a resource, we need
296   /// to first decrease `Next`, and then copy the resource back. It is worth
297   /// noting that, the order of resources return might be different from that
298   /// they're assigned, that saying, at some point, there might be two identical
299   /// resources.
300   /// xxax+a+++++
301   ///     ^
302   ///     Next
303   /// However, it doesn't matter, because they're always on the two sides of
304   /// `Next`. The left one will in the end be overwritten by another resource.
305   /// Therefore, after several execution, the order of pool might be different
306   /// from its initial state.
307   void release(ElementTy R) noexcept {
308     std::lock_guard<std::mutex> LG(Mutex);
309     Resources[--Next] = R;
310   }
311 
312   /// Released all stored resources and clear the pool.
313   /// Note: This function is not thread safe. Be sure to guard it if necessary.
314   void clear() noexcept {
315     for (auto &R : Pool)
316       (void)Allocator.destroy(R);
317     Pool.clear();
318     Resources.clear();
319   }
320 };
321 
322 } // namespace
323 
324 class DeviceRTLTy {
325   int NumberOfDevices;
326   // OpenMP environment properties
327   int EnvNumTeams;
328   int EnvTeamLimit;
329   int EnvTeamThreadLimit;
330   // OpenMP requires flags
331   int64_t RequiresFlags;
332   // Amount of dynamic shared memory to use at launch.
333   uint64_t DynamicMemorySize;
334 
335   /// Number of initial streams for each device.
336   int NumInitialStreams = 32;
337 
338   /// Number of initial events for each device.
339   int NumInitialEvents = 8;
340 
341   static constexpr const int32_t HardThreadLimit = 1024;
342   static constexpr const int32_t DefaultNumTeams = 128;
343   static constexpr const int32_t DefaultNumThreads = 128;
344 
345   using StreamPoolTy = ResourcePoolTy<StreamAllocatorTy>;
346   std::vector<std::unique_ptr<StreamPoolTy>> StreamPool;
347 
348   using EventPoolTy = ResourcePoolTy<EventAllocatorTy>;
349   std::vector<std::unique_ptr<EventPoolTy>> EventPool;
350 
351   std::vector<DeviceDataTy> DeviceData;
352   std::vector<std::vector<CUmodule>> Modules;
353 
354   /// Vector of flags indicating the initalization status of all associated
355   /// devices.
356   std::vector<bool> InitializedFlags;
357 
358   /// A class responsible for interacting with device native runtime library to
359   /// allocate and free memory.
360   class CUDADeviceAllocatorTy : public DeviceAllocatorTy {
361     std::unordered_map<void *, TargetAllocTy> HostPinnedAllocs;
362 
363   public:
364     void *allocate(size_t Size, void *, TargetAllocTy Kind) override {
365       if (Size == 0)
366         return nullptr;
367 
368       void *MemAlloc = nullptr;
369       CUresult Err;
370       switch (Kind) {
371       case TARGET_ALLOC_DEFAULT:
372       case TARGET_ALLOC_DEVICE:
373         CUdeviceptr DevicePtr;
374         Err = cuMemAlloc(&DevicePtr, Size);
375         MemAlloc = (void *)DevicePtr;
376         if (!checkResult(Err, "Error returned from cuMemAlloc\n"))
377           return nullptr;
378         break;
379       case TARGET_ALLOC_HOST:
380         void *HostPtr;
381         Err = cuMemAllocHost(&HostPtr, Size);
382         MemAlloc = HostPtr;
383         if (!checkResult(Err, "Error returned from cuMemAllocHost\n"))
384           return nullptr;
385         HostPinnedAllocs[MemAlloc] = Kind;
386         break;
387       case TARGET_ALLOC_SHARED:
388         CUdeviceptr SharedPtr;
389         Err = cuMemAllocManaged(&SharedPtr, Size, CU_MEM_ATTACH_GLOBAL);
390         MemAlloc = (void *)SharedPtr;
391         if (!checkResult(Err, "Error returned from cuMemAllocManaged\n"))
392           return nullptr;
393         break;
394       }
395 
396       return MemAlloc;
397     }
398 
399     int free(void *TgtPtr) override {
400       CUresult Err;
401       // Host pinned memory must be freed differently.
402       TargetAllocTy Kind =
403           (HostPinnedAllocs.find(TgtPtr) == HostPinnedAllocs.end())
404               ? TARGET_ALLOC_DEFAULT
405               : TARGET_ALLOC_HOST;
406       switch (Kind) {
407       case TARGET_ALLOC_DEFAULT:
408       case TARGET_ALLOC_DEVICE:
409       case TARGET_ALLOC_SHARED:
410         Err = cuMemFree((CUdeviceptr)TgtPtr);
411         if (!checkResult(Err, "Error returned from cuMemFree\n"))
412           return OFFLOAD_FAIL;
413         break;
414       case TARGET_ALLOC_HOST:
415         Err = cuMemFreeHost(TgtPtr);
416         if (!checkResult(Err, "Error returned from cuMemFreeHost\n"))
417           return OFFLOAD_FAIL;
418         break;
419       }
420 
421       return OFFLOAD_SUCCESS;
422     }
423   };
424 
425   /// A vector of device allocators
426   std::vector<CUDADeviceAllocatorTy> DeviceAllocators;
427 
428   /// A vector of memory managers. Since the memory manager is non-copyable and
429   // non-removable, we wrap them into std::unique_ptr.
430   std::vector<std::unique_ptr<MemoryManagerTy>> MemoryManagers;
431 
432   /// Whether use memory manager
433   bool UseMemoryManager = true;
434 
435   // Record entry point associated with device
436   void addOffloadEntry(const int DeviceId, const __tgt_offload_entry entry) {
437     FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
438     E.Entries.push_back(entry);
439   }
440 
441   // Return a pointer to the entry associated with the pointer
442   const __tgt_offload_entry *getOffloadEntry(const int DeviceId,
443                                              const void *Addr) const {
444     for (const __tgt_offload_entry &Itr :
445          DeviceData[DeviceId].FuncGblEntries.back().Entries)
446       if (Itr.addr == Addr)
447         return &Itr;
448 
449     return nullptr;
450   }
451 
452   // Return the pointer to the target entries table
453   __tgt_target_table *getOffloadEntriesTable(const int DeviceId) {
454     FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
455 
456     if (E.Entries.empty())
457       return nullptr;
458 
459     // Update table info according to the entries and return the pointer
460     E.Table.EntriesBegin = E.Entries.data();
461     E.Table.EntriesEnd = E.Entries.data() + E.Entries.size();
462 
463     return &E.Table;
464   }
465 
466   // Clear entries table for a device
467   void clearOffloadEntriesTable(const int DeviceId) {
468     DeviceData[DeviceId].FuncGblEntries.emplace_back();
469     FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
470     E.Entries.clear();
471     E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr;
472   }
473 
474 public:
475   CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfo) const {
476     assert(AsyncInfo && "AsyncInfo is nullptr");
477 
478     if (!AsyncInfo->Queue) {
479       CUstream S;
480       if (StreamPool[DeviceId]->acquire(S) != OFFLOAD_SUCCESS)
481         return nullptr;
482 
483       AsyncInfo->Queue = S;
484     }
485 
486     return reinterpret_cast<CUstream>(AsyncInfo->Queue);
487   }
488 
489   // This class should not be copied
490   DeviceRTLTy(const DeviceRTLTy &) = delete;
491   DeviceRTLTy(DeviceRTLTy &&) = delete;
492 
493   DeviceRTLTy()
494       : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1),
495         EnvTeamThreadLimit(-1), RequiresFlags(OMP_REQ_UNDEFINED),
496         DynamicMemorySize(0) {
497 
498     DP("Start initializing CUDA\n");
499 
500     CUresult Err = cuInit(0);
501     if (Err == CUDA_ERROR_INVALID_HANDLE) {
502       // Can't call cuGetErrorString if dlsym failed
503       DP("Failed to load CUDA shared library\n");
504       return;
505     }
506     if (!checkResult(Err, "Error returned from cuInit\n")) {
507       return;
508     }
509 
510     Err = cuDeviceGetCount(&NumberOfDevices);
511     if (!checkResult(Err, "Error returned from cuDeviceGetCount\n"))
512       return;
513 
514     if (NumberOfDevices == 0) {
515       DP("There are no devices supporting CUDA.\n");
516       return;
517     }
518 
519     DeviceData.resize(NumberOfDevices);
520     Modules.resize(NumberOfDevices);
521     StreamPool.resize(NumberOfDevices);
522     EventPool.resize(NumberOfDevices);
523 
524     // Get environment variables regarding teams
525     if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) {
526       // OMP_TEAM_LIMIT has been set
527       EnvTeamLimit = std::stoi(EnvStr);
528       DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
529     }
530     if (const char *EnvStr = getenv("OMP_TEAMS_THREAD_LIMIT")) {
531       // OMP_TEAMS_THREAD_LIMIT has been set
532       EnvTeamThreadLimit = std::stoi(EnvStr);
533       DP("Parsed OMP_TEAMS_THREAD_LIMIT=%d\n", EnvTeamThreadLimit);
534     }
535     if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) {
536       // OMP_NUM_TEAMS has been set
537       EnvNumTeams = std::stoi(EnvStr);
538       DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
539     }
540     if (const char *EnvStr = getenv("LIBOMPTARGET_SHARED_MEMORY_SIZE")) {
541       // LIBOMPTARGET_SHARED_MEMORY_SIZE has been set
542       DynamicMemorySize = std::stoi(EnvStr);
543       DP("Parsed LIBOMPTARGET_SHARED_MEMORY_SIZE = %" PRIu64 "\n",
544          DynamicMemorySize);
545     }
546     if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS")) {
547       // LIBOMPTARGET_NUM_INITIAL_STREAMS has been set
548       NumInitialStreams = std::stoi(EnvStr);
549       DP("Parsed LIBOMPTARGET_NUM_INITIAL_STREAMS=%d\n", NumInitialStreams);
550     }
551 
552     for (int I = 0; I < NumberOfDevices; ++I)
553       DeviceAllocators.emplace_back();
554 
555     // Get the size threshold from environment variable
556     std::pair<size_t, bool> Res = MemoryManagerTy::getSizeThresholdFromEnv();
557     UseMemoryManager = Res.second;
558     size_t MemoryManagerThreshold = Res.first;
559 
560     if (UseMemoryManager)
561       for (int I = 0; I < NumberOfDevices; ++I)
562         MemoryManagers.emplace_back(std::make_unique<MemoryManagerTy>(
563             DeviceAllocators[I], MemoryManagerThreshold));
564 
565     // We lazily initialize all devices later.
566     InitializedFlags.assign(NumberOfDevices, false);
567   }
568 
569   ~DeviceRTLTy() {
570     for (int DeviceId = 0; DeviceId < NumberOfDevices; ++DeviceId)
571       deinitDevice(DeviceId);
572   }
573 
574   // Check whether a given DeviceId is valid
575   bool isValidDeviceId(const int DeviceId) const {
576     return DeviceId >= 0 && DeviceId < NumberOfDevices;
577   }
578 
579   int getNumOfDevices() const { return NumberOfDevices; }
580 
581   void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; }
582 
583   int initDevice(const int DeviceId) {
584     CUdevice Device;
585 
586     DP("Getting device %d\n", DeviceId);
587     CUresult Err = cuDeviceGet(&Device, DeviceId);
588     if (!checkResult(Err, "Error returned from cuDeviceGet\n"))
589       return OFFLOAD_FAIL;
590 
591     assert(InitializedFlags[DeviceId] == false && "Reinitializing device!");
592     InitializedFlags[DeviceId] = true;
593 
594     // Query the current flags of the primary context and set its flags if
595     // it is inactive
596     unsigned int FormerPrimaryCtxFlags = 0;
597     int FormerPrimaryCtxIsActive = 0;
598     Err = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags,
599                                      &FormerPrimaryCtxIsActive);
600     if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxGetState\n"))
601       return OFFLOAD_FAIL;
602 
603     if (FormerPrimaryCtxIsActive) {
604       DP("The primary context is active, no change to its flags\n");
605       if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) !=
606           CU_CTX_SCHED_BLOCKING_SYNC)
607         DP("Warning the current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n");
608     } else {
609       DP("The primary context is inactive, set its flags to "
610          "CU_CTX_SCHED_BLOCKING_SYNC\n");
611       Err = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC);
612       if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxSetFlags\n"))
613         return OFFLOAD_FAIL;
614     }
615 
616     // Retain the per device primary context and save it to use whenever this
617     // device is selected.
618     Err = cuDevicePrimaryCtxRetain(&DeviceData[DeviceId].Context, Device);
619     if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxRetain\n"))
620       return OFFLOAD_FAIL;
621 
622     Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
623     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
624       return OFFLOAD_FAIL;
625 
626     // Initialize the stream pool.
627     if (!StreamPool[DeviceId])
628       StreamPool[DeviceId] = std::make_unique<StreamPoolTy>(StreamAllocatorTy(),
629                                                             NumInitialStreams);
630 
631     // Initialize the event pool.
632     if (!EventPool[DeviceId])
633       EventPool[DeviceId] =
634           std::make_unique<EventPoolTy>(EventAllocatorTy(), NumInitialEvents);
635 
636     // Query attributes to determine number of threads/block and blocks/grid.
637     int MaxGridDimX;
638     Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
639                                Device);
640     if (Err != CUDA_SUCCESS) {
641       DP("Error getting max grid dimension, use default value %d\n",
642          DeviceRTLTy::DefaultNumTeams);
643       DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::DefaultNumTeams;
644     } else {
645       DP("Using %d CUDA blocks per grid\n", MaxGridDimX);
646       DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX;
647     }
648 
649     // We are only exploiting threads along the x axis.
650     int MaxBlockDimX;
651     Err = cuDeviceGetAttribute(&MaxBlockDimX,
652                                CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device);
653     if (Err != CUDA_SUCCESS) {
654       DP("Error getting max block dimension, use default value %d\n",
655          DeviceRTLTy::DefaultNumThreads);
656       DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads;
657     } else {
658       DP("Using %d CUDA threads per block\n", MaxBlockDimX);
659       DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX;
660 
661       if (EnvTeamThreadLimit > 0 &&
662           DeviceData[DeviceId].ThreadsPerBlock > EnvTeamThreadLimit) {
663         DP("Max CUDA threads per block %d exceeds the thread limit %d set by "
664            "OMP_TEAMS_THREAD_LIMIT, capping at the limit\n",
665            DeviceData[DeviceId].ThreadsPerBlock, EnvTeamThreadLimit);
666         DeviceData[DeviceId].ThreadsPerBlock = EnvTeamThreadLimit;
667       }
668       if (DeviceData[DeviceId].ThreadsPerBlock > DeviceRTLTy::HardThreadLimit) {
669         DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
670            "capping at the hard limit\n",
671            DeviceData[DeviceId].ThreadsPerBlock, DeviceRTLTy::HardThreadLimit);
672         DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit;
673       }
674     }
675 
676     // Get and set warp size
677     int WarpSize;
678     Err =
679         cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device);
680     if (Err != CUDA_SUCCESS) {
681       DP("Error getting warp size, assume default value 32\n");
682       DeviceData[DeviceId].WarpSize = 32;
683     } else {
684       DP("Using warp size %d\n", WarpSize);
685       DeviceData[DeviceId].WarpSize = WarpSize;
686     }
687 
688     // Adjust teams to the env variables
689     if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) {
690       DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
691          EnvTeamLimit);
692       DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit;
693     }
694 
695     size_t StackLimit;
696     size_t HeapLimit;
697     if (const char *EnvStr = getenv("LIBOMPTARGET_STACK_SIZE")) {
698       StackLimit = std::stol(EnvStr);
699       if (cuCtxSetLimit(CU_LIMIT_STACK_SIZE, StackLimit) != CUDA_SUCCESS)
700         return OFFLOAD_FAIL;
701     } else {
702       if (cuCtxGetLimit(&StackLimit, CU_LIMIT_STACK_SIZE) != CUDA_SUCCESS)
703         return OFFLOAD_FAIL;
704     }
705     if (const char *EnvStr = getenv("LIBOMPTARGET_HEAP_SIZE")) {
706       HeapLimit = std::stol(EnvStr);
707       if (cuCtxSetLimit(CU_LIMIT_MALLOC_HEAP_SIZE, HeapLimit) != CUDA_SUCCESS)
708         return OFFLOAD_FAIL;
709     } else {
710       if (cuCtxGetLimit(&HeapLimit, CU_LIMIT_MALLOC_HEAP_SIZE) != CUDA_SUCCESS)
711         return OFFLOAD_FAIL;
712     }
713 
714     INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
715          "Device supports up to %d CUDA blocks and %d threads with a "
716          "warp size of %d\n",
717          DeviceData[DeviceId].BlocksPerGrid,
718          DeviceData[DeviceId].ThreadsPerBlock, DeviceData[DeviceId].WarpSize);
719     INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
720          "Device heap size is %d Bytes, device stack size is %d Bytes per "
721          "thread\n",
722          (int)HeapLimit, (int)StackLimit);
723 
724     // Set default number of teams
725     if (EnvNumTeams > 0) {
726       DP("Default number of teams set according to environment %d\n",
727          EnvNumTeams);
728       DeviceData[DeviceId].NumTeams = EnvNumTeams;
729     } else {
730       DeviceData[DeviceId].NumTeams = DeviceRTLTy::DefaultNumTeams;
731       DP("Default number of teams set according to library's default %d\n",
732          DeviceRTLTy::DefaultNumTeams);
733     }
734 
735     if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) {
736       DP("Default number of teams exceeds device limit, capping at %d\n",
737          DeviceData[DeviceId].BlocksPerGrid);
738       DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid;
739     }
740 
741     // Set default number of threads
742     DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads;
743     DP("Default number of threads set according to library's default %d\n",
744        DeviceRTLTy::DefaultNumThreads);
745     if (DeviceData[DeviceId].NumThreads >
746         DeviceData[DeviceId].ThreadsPerBlock) {
747       DP("Default number of threads exceeds device limit, capping at %d\n",
748          DeviceData[DeviceId].ThreadsPerBlock);
749       DeviceData[DeviceId].NumThreads = DeviceData[DeviceId].ThreadsPerBlock;
750     }
751 
752     return OFFLOAD_SUCCESS;
753   }
754 
755   int deinitDevice(const int DeviceId) {
756     auto IsInitialized = InitializedFlags[DeviceId];
757     if (!IsInitialized)
758       return OFFLOAD_SUCCESS;
759     InitializedFlags[DeviceId] = false;
760 
761     if (UseMemoryManager)
762       MemoryManagers[DeviceId].release();
763 
764     StreamPool[DeviceId].reset();
765     EventPool[DeviceId].reset();
766 
767     DeviceDataTy &D = DeviceData[DeviceId];
768     if (!checkResult(cuCtxSetCurrent(D.Context),
769                      "Error returned from cuCtxSetCurrent\n"))
770       return OFFLOAD_FAIL;
771 
772     // Unload all modules.
773     for (auto &M : Modules[DeviceId])
774       if (!checkResult(cuModuleUnload(M),
775                        "Error returned from cuModuleUnload\n"))
776         return OFFLOAD_FAIL;
777 
778     // Destroy context.
779     CUdevice Device;
780     if (!checkResult(cuCtxGetDevice(&Device),
781                      "Error returned from cuCtxGetDevice\n"))
782       return OFFLOAD_FAIL;
783 
784     if (!checkResult(cuDevicePrimaryCtxRelease(Device),
785                      "Error returned from cuDevicePrimaryCtxRelease\n"))
786       return OFFLOAD_FAIL;
787 
788     return OFFLOAD_SUCCESS;
789   }
790 
791   __tgt_target_table *loadBinary(const int DeviceId,
792                                  const __tgt_device_image *Image) {
793     // Clear the offload table as we are going to create a new one.
794     clearOffloadEntriesTable(DeviceId);
795 
796     // Create the module and extract the function pointers.
797     CUmodule Module;
798     DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart));
799     CUresult Err =
800         cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr);
801     if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n"))
802       return nullptr;
803 
804     DP("CUDA module successfully loaded!\n");
805 
806     Modules[DeviceId].push_back(Module);
807 
808     // Find the symbols in the module by name.
809     const __tgt_offload_entry *HostBegin = Image->EntriesBegin;
810     const __tgt_offload_entry *HostEnd = Image->EntriesEnd;
811 
812     std::list<KernelTy> &KernelsList = DeviceData[DeviceId].KernelsList;
813     for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) {
814       if (!E->addr) {
815         // We return nullptr when something like this happens, the host should
816         // have always something in the address to uniquely identify the target
817         // region.
818         DP("Invalid binary: host entry '<null>' (size = %zd)...\n", E->size);
819         return nullptr;
820       }
821 
822       if (E->size) {
823         __tgt_offload_entry Entry = *E;
824         CUdeviceptr CUPtr;
825         size_t CUSize;
826         Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name);
827         // We keep this style here because we need the name
828         if (Err != CUDA_SUCCESS) {
829           REPORT("Loading global '%s' Failed\n", E->name);
830           CUDA_ERR_STRING(Err);
831           return nullptr;
832         }
833 
834         if (CUSize != E->size) {
835           DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name,
836              CUSize, E->size);
837           return nullptr;
838         }
839 
840         DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
841            DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr));
842 
843         Entry.addr = (void *)(CUPtr);
844 
845         // Note: In the current implementation declare target variables
846         // can either be link or to. This means that once unified
847         // memory is activated via the requires directive, the variable
848         // can be used directly from the host in both cases.
849         // TODO: when variables types other than to or link are added,
850         // the below condition should be changed to explicitly
851         // check for to and link variables types:
852         // (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags &
853         // OMP_DECLARE_TARGET_LINK || e->flags == OMP_DECLARE_TARGET_TO))
854         if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
855           // If unified memory is present any target link or to variables
856           // can access host addresses directly. There is no longer a
857           // need for device copies.
858           cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *));
859           DP("Copy linked variable host address (" DPxMOD
860              ") to device address (" DPxMOD ")\n",
861              DPxPTR(*((void **)E->addr)), DPxPTR(CUPtr));
862         }
863 
864         addOffloadEntry(DeviceId, Entry);
865 
866         continue;
867       }
868 
869       CUfunction Func;
870       Err = cuModuleGetFunction(&Func, Module, E->name);
871       // We keep this style here because we need the name
872       if (Err != CUDA_SUCCESS) {
873         REPORT("Loading '%s' Failed\n", E->name);
874         CUDA_ERR_STRING(Err);
875         return nullptr;
876       }
877 
878       DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
879          DPxPTR(E - HostBegin), E->name, DPxPTR(Func));
880 
881       // default value GENERIC (in case symbol is missing from cubin file)
882       llvm::omp::OMPTgtExecModeFlags ExecModeVal;
883       std::string ExecModeNameStr(E->name);
884       ExecModeNameStr += "_exec_mode";
885       const char *ExecModeName = ExecModeNameStr.c_str();
886 
887       CUdeviceptr ExecModePtr;
888       size_t CUSize;
889       Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName);
890       if (Err == CUDA_SUCCESS) {
891         if (CUSize != sizeof(llvm::omp::OMPTgtExecModeFlags)) {
892           DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
893              ExecModeName, CUSize, sizeof(llvm::omp::OMPTgtExecModeFlags));
894           return nullptr;
895         }
896 
897         Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize);
898         if (Err != CUDA_SUCCESS) {
899           REPORT("Error when copying data from device to host. Pointers: "
900                  "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
901                  DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize);
902           CUDA_ERR_STRING(Err);
903           return nullptr;
904         }
905       } else {
906         DP("Loading global exec_mode '%s' - symbol missing, using default "
907            "value GENERIC (1)\n",
908            ExecModeName);
909       }
910 
911       KernelsList.emplace_back(Func, ExecModeVal);
912 
913       __tgt_offload_entry Entry = *E;
914       Entry.addr = &KernelsList.back();
915       addOffloadEntry(DeviceId, Entry);
916     }
917 
918     // send device environment data to the device
919     {
920       // TODO: The device ID used here is not the real device ID used by OpenMP.
921       DeviceEnvironmentTy DeviceEnv{0, static_cast<uint32_t>(NumberOfDevices),
922                                     static_cast<uint32_t>(DeviceId),
923                                     static_cast<uint32_t>(DynamicMemorySize)};
924 
925       if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG"))
926         DeviceEnv.DebugKind = std::stoi(EnvStr);
927 
928       const char *DeviceEnvName = "omptarget_device_environment";
929       CUdeviceptr DeviceEnvPtr;
930       size_t CUSize;
931 
932       Err = cuModuleGetGlobal(&DeviceEnvPtr, &CUSize, Module, DeviceEnvName);
933       if (Err == CUDA_SUCCESS) {
934         if (CUSize != sizeof(DeviceEnv)) {
935           REPORT(
936               "Global device_environment '%s' - size mismatch (%zu != %zu)\n",
937               DeviceEnvName, CUSize, sizeof(int32_t));
938           CUDA_ERR_STRING(Err);
939           return nullptr;
940         }
941 
942         Err = cuMemcpyHtoD(DeviceEnvPtr, &DeviceEnv, CUSize);
943         if (Err != CUDA_SUCCESS) {
944           REPORT("Error when copying data from host to device. Pointers: "
945                  "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
946                  DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize);
947           CUDA_ERR_STRING(Err);
948           return nullptr;
949         }
950 
951         DP("Sending global device environment data %zu bytes\n", CUSize);
952       } else {
953         DP("Finding global device environment '%s' - symbol missing.\n",
954            DeviceEnvName);
955         DP("Continue, considering this is a device RTL which does not accept "
956            "environment setting.\n");
957       }
958     }
959 
960     return getOffloadEntriesTable(DeviceId);
961   }
962 
963   void *dataAlloc(const int DeviceId, const int64_t Size,
964                   const TargetAllocTy Kind) {
965     switch (Kind) {
966     case TARGET_ALLOC_DEFAULT:
967     case TARGET_ALLOC_DEVICE:
968       if (UseMemoryManager)
969         return MemoryManagers[DeviceId]->allocate(Size, nullptr);
970       else
971         return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
972     case TARGET_ALLOC_HOST:
973     case TARGET_ALLOC_SHARED:
974       return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
975     }
976 
977     REPORT("Invalid target data allocation kind or requested allocator not "
978            "implemented yet\n");
979 
980     return nullptr;
981   }
982 
983   int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr,
984                  const int64_t Size, __tgt_async_info *AsyncInfo) const {
985     assert(AsyncInfo && "AsyncInfo is nullptr");
986 
987     CUstream Stream = getStream(DeviceId, AsyncInfo);
988     CUresult Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream);
989     if (Err != CUDA_SUCCESS) {
990       DP("Error when copying data from host to device. Pointers: host "
991          "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
992          DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
993       CUDA_ERR_STRING(Err);
994       return OFFLOAD_FAIL;
995     }
996 
997     return OFFLOAD_SUCCESS;
998   }
999 
1000   int dataRetrieve(const int DeviceId, void *HstPtr, const void *TgtPtr,
1001                    const int64_t Size, __tgt_async_info *AsyncInfo) const {
1002     assert(AsyncInfo && "AsyncInfo is nullptr");
1003 
1004     CUstream Stream = getStream(DeviceId, AsyncInfo);
1005     CUresult Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
1006     if (Err != CUDA_SUCCESS) {
1007       DP("Error when copying data from device to host. Pointers: host "
1008          "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
1009          DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
1010       CUDA_ERR_STRING(Err);
1011       return OFFLOAD_FAIL;
1012     }
1013 
1014     return OFFLOAD_SUCCESS;
1015   }
1016 
1017   int dataExchange(int SrcDevId, const void *SrcPtr, int DstDevId, void *DstPtr,
1018                    int64_t Size, __tgt_async_info *AsyncInfo) const {
1019     assert(AsyncInfo && "AsyncInfo is nullptr");
1020 
1021     CUresult Err;
1022     CUstream Stream = getStream(SrcDevId, AsyncInfo);
1023 
1024     // If they are two devices, we try peer to peer copy first
1025     if (SrcDevId != DstDevId) {
1026       int CanAccessPeer = 0;
1027       Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId);
1028       if (Err != CUDA_SUCCESS) {
1029         REPORT("Error returned from cuDeviceCanAccessPeer. src = %" PRId32
1030                ", dst = %" PRId32 "\n",
1031                SrcDevId, DstDevId);
1032         CUDA_ERR_STRING(Err);
1033         return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
1034       }
1035 
1036       if (!CanAccessPeer) {
1037         DP("P2P memcpy not supported so fall back to D2D memcpy");
1038         return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
1039       }
1040 
1041       Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0);
1042       if (Err != CUDA_SUCCESS) {
1043         REPORT("Error returned from cuCtxEnablePeerAccess. src = %" PRId32
1044                ", dst = %" PRId32 "\n",
1045                SrcDevId, DstDevId);
1046         CUDA_ERR_STRING(Err);
1047         return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
1048       }
1049 
1050       Err = cuMemcpyPeerAsync((CUdeviceptr)DstPtr, DeviceData[DstDevId].Context,
1051                               (CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context,
1052                               Size, Stream);
1053       if (Err == CUDA_SUCCESS)
1054         return OFFLOAD_SUCCESS;
1055 
1056       DP("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD
1057          ", src_id =%" PRId32 ", dst_ptr = " DPxMOD ", dst_id =%" PRId32 "\n",
1058          DPxPTR(SrcPtr), SrcDevId, DPxPTR(DstPtr), DstDevId);
1059       CUDA_ERR_STRING(Err);
1060     }
1061 
1062     return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
1063   }
1064 
1065   int dataDelete(const int DeviceId, void *TgtPtr) {
1066     if (UseMemoryManager)
1067       return MemoryManagers[DeviceId]->free(TgtPtr);
1068 
1069     return DeviceAllocators[DeviceId].free(TgtPtr);
1070   }
1071 
1072   int runTargetTeamRegion(const int DeviceId, void *TgtEntryPtr, void **TgtArgs,
1073                           ptrdiff_t *TgtOffsets, const int ArgNum,
1074                           const int TeamNum, const int ThreadLimit,
1075                           const unsigned int LoopTripCount,
1076                           __tgt_async_info *AsyncInfo) const {
1077     // All args are references.
1078     std::vector<void *> Args(ArgNum);
1079     std::vector<void *> Ptrs(ArgNum);
1080 
1081     for (int I = 0; I < ArgNum; ++I) {
1082       Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
1083       Args[I] = &Ptrs[I];
1084     }
1085 
1086     KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr);
1087 
1088     const bool IsSPMDGenericMode =
1089         KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD;
1090     const bool IsSPMDMode =
1091         KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_SPMD;
1092     const bool IsGenericMode =
1093         KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC;
1094 
1095     int CudaThreadsPerBlock;
1096     if (ThreadLimit > 0) {
1097       DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
1098       CudaThreadsPerBlock = ThreadLimit;
1099       // Add master warp if necessary
1100       if (IsGenericMode) {
1101         DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize);
1102         CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
1103       }
1104     } else {
1105       DP("Setting CUDA threads per block to default %d\n",
1106          DeviceData[DeviceId].NumThreads);
1107       CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads;
1108     }
1109 
1110     if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) {
1111       DP("Threads per block capped at device limit %d\n",
1112          DeviceData[DeviceId].ThreadsPerBlock);
1113       CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock;
1114     }
1115 
1116     CUresult Err;
1117     if (!KernelInfo->MaxThreadsPerBlock) {
1118       Err = cuFuncGetAttribute(&KernelInfo->MaxThreadsPerBlock,
1119                                CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
1120                                KernelInfo->Func);
1121       if (!checkResult(Err, "Error returned from cuFuncGetAttribute\n"))
1122         return OFFLOAD_FAIL;
1123     }
1124 
1125     if (KernelInfo->MaxThreadsPerBlock < CudaThreadsPerBlock) {
1126       DP("Threads per block capped at kernel limit %d\n",
1127          KernelInfo->MaxThreadsPerBlock);
1128       CudaThreadsPerBlock = KernelInfo->MaxThreadsPerBlock;
1129     }
1130 
1131     unsigned int CudaBlocksPerGrid;
1132     if (TeamNum <= 0) {
1133       if (LoopTripCount > 0 && EnvNumTeams < 0) {
1134         if (IsSPMDGenericMode) {
1135           // If we reach this point, then we are executing a kernel that was
1136           // transformed from Generic-mode to SPMD-mode. This kernel has
1137           // SPMD-mode execution, but needs its blocks to be scheduled
1138           // differently because the current loop trip count only applies to the
1139           // `teams distribute` region and will create var too few blocks using
1140           // the regular SPMD-mode method.
1141           CudaBlocksPerGrid = LoopTripCount;
1142         } else if (IsSPMDMode) {
1143           // We have a combined construct, i.e. `target teams distribute
1144           // parallel for [simd]`. We launch so many teams so that each thread
1145           // will execute one iteration of the loop. round up to the nearest
1146           // integer
1147           CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1;
1148         } else if (IsGenericMode) {
1149           // If we reach this point, then we have a non-combined construct, i.e.
1150           // `teams distribute` with a nested `parallel for` and each team is
1151           // assigned one iteration of the `distribute` loop. E.g.:
1152           //
1153           // #pragma omp target teams distribute
1154           // for(...loop_tripcount...) {
1155           //   #pragma omp parallel for
1156           //   for(...) {}
1157           // }
1158           //
1159           // Threads within a team will execute the iterations of the `parallel`
1160           // loop.
1161           CudaBlocksPerGrid = LoopTripCount;
1162         } else {
1163           REPORT("Unknown execution mode: %d\n",
1164                  static_cast<int8_t>(KernelInfo->ExecutionMode));
1165           return OFFLOAD_FAIL;
1166         }
1167         DP("Using %d teams due to loop trip count %" PRIu32
1168            " and number of threads per block %d\n",
1169            CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock);
1170       } else {
1171         DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams);
1172         CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams;
1173       }
1174     } else {
1175       DP("Using requested number of teams %d\n", TeamNum);
1176       CudaBlocksPerGrid = TeamNum;
1177     }
1178 
1179     if (CudaBlocksPerGrid > DeviceData[DeviceId].BlocksPerGrid) {
1180       DP("Capping number of teams to team limit %d\n",
1181          DeviceData[DeviceId].BlocksPerGrid);
1182       CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid;
1183     }
1184 
1185     INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
1186          "Launching kernel %s with %d blocks and %d threads in %s mode\n",
1187          (getOffloadEntry(DeviceId, TgtEntryPtr))
1188              ? getOffloadEntry(DeviceId, TgtEntryPtr)->name
1189              : "(null)",
1190          CudaBlocksPerGrid, CudaThreadsPerBlock,
1191          (!IsSPMDMode ? (IsGenericMode ? "Generic" : "SPMD-Generic") : "SPMD"));
1192 
1193     CUstream Stream = getStream(DeviceId, AsyncInfo);
1194     Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1,
1195                          /* gridDimZ */ 1, CudaThreadsPerBlock,
1196                          /* blockDimY */ 1, /* blockDimZ */ 1,
1197                          DynamicMemorySize, Stream, &Args[0], nullptr);
1198     if (!checkResult(Err, "Error returned from cuLaunchKernel\n"))
1199       return OFFLOAD_FAIL;
1200 
1201     DP("Launch of entry point at " DPxMOD " successful!\n",
1202        DPxPTR(TgtEntryPtr));
1203 
1204     return OFFLOAD_SUCCESS;
1205   }
1206 
1207   int synchronize(const int DeviceId, __tgt_async_info *AsyncInfo) const {
1208     CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue);
1209     CUresult Err = cuStreamSynchronize(Stream);
1210 
1211     // Once the stream is synchronized, return it to stream pool and reset
1212     // AsyncInfo. This is to make sure the synchronization only works for its
1213     // own tasks.
1214     StreamPool[DeviceId]->release(reinterpret_cast<CUstream>(AsyncInfo->Queue));
1215     AsyncInfo->Queue = nullptr;
1216 
1217     if (Err != CUDA_SUCCESS) {
1218       DP("Error when synchronizing stream. stream = " DPxMOD
1219          ", async info ptr = " DPxMOD "\n",
1220          DPxPTR(Stream), DPxPTR(AsyncInfo));
1221       CUDA_ERR_STRING(Err);
1222     }
1223     return (Err == CUDA_SUCCESS) ? OFFLOAD_SUCCESS : OFFLOAD_FAIL;
1224   }
1225 
1226   void printDeviceInfo(int32_t device_id) {
1227     char TmpChar[1000];
1228     std::string TmpStr;
1229     size_t TmpSt;
1230     int TmpInt, TmpInt2, TmpInt3;
1231 
1232     CUdevice Device;
1233     checkResult(cuDeviceGet(&Device, device_id),
1234                 "Error returned from cuCtxGetDevice\n");
1235 
1236     cuDriverGetVersion(&TmpInt);
1237     printf("    CUDA Driver Version: \t\t%d \n", TmpInt);
1238     printf("    CUDA Device Number: \t\t%d \n", device_id);
1239     checkResult(cuDeviceGetName(TmpChar, 1000, Device),
1240                 "Error returned from cuDeviceGetName\n");
1241     printf("    Device Name: \t\t\t%s \n", TmpChar);
1242     checkResult(cuDeviceTotalMem(&TmpSt, Device),
1243                 "Error returned from cuDeviceTotalMem\n");
1244     printf("    Global Memory Size: \t\t%zu bytes \n", TmpSt);
1245     checkResult(cuDeviceGetAttribute(
1246                     &TmpInt, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, Device),
1247                 "Error returned from cuDeviceGetAttribute\n");
1248     printf("    Number of Multiprocessors: \t\t%d \n", TmpInt);
1249     checkResult(
1250         cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, Device),
1251         "Error returned from cuDeviceGetAttribute\n");
1252     printf("    Concurrent Copy and Execution: \t%s \n", BOOL2TEXT(TmpInt));
1253     checkResult(cuDeviceGetAttribute(
1254                     &TmpInt, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, Device),
1255                 "Error returned from cuDeviceGetAttribute\n");
1256     printf("    Total Constant Memory: \t\t%d bytes\n", TmpInt);
1257     checkResult(
1258         cuDeviceGetAttribute(
1259             &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, Device),
1260         "Error returned from cuDeviceGetAttribute\n");
1261     printf("    Max Shared Memory per Block: \t%d bytes \n", TmpInt);
1262     checkResult(
1263         cuDeviceGetAttribute(
1264             &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, Device),
1265         "Error returned from cuDeviceGetAttribute\n");
1266     printf("    Registers per Block: \t\t%d \n", TmpInt);
1267     checkResult(
1268         cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device),
1269         "Error returned from cuDeviceGetAttribute\n");
1270     printf("    Warp Size: \t\t\t\t%d Threads \n", TmpInt);
1271     checkResult(cuDeviceGetAttribute(
1272                     &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, Device),
1273                 "Error returned from cuDeviceGetAttribute\n");
1274     printf("    Maximum Threads per Block: \t\t%d \n", TmpInt);
1275     checkResult(cuDeviceGetAttribute(
1276                     &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device),
1277                 "Error returned from cuDeviceGetAttribute\n");
1278     checkResult(cuDeviceGetAttribute(
1279                     &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, Device),
1280                 "Error returned from cuDeviceGetAttribute\n");
1281     checkResult(cuDeviceGetAttribute(
1282                     &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, Device),
1283                 "Error returned from cuDeviceGetAttribute\n");
1284     printf("    Maximum Block Dimensions: \t\t%d, %d, %d \n", TmpInt, TmpInt2,
1285            TmpInt3);
1286     checkResult(cuDeviceGetAttribute(
1287                     &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, Device),
1288                 "Error returned from cuDeviceGetAttribute\n");
1289     checkResult(cuDeviceGetAttribute(
1290                     &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, Device),
1291                 "Error returned from cuDeviceGetAttribute\n");
1292     checkResult(cuDeviceGetAttribute(
1293                     &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, Device),
1294                 "Error returned from cuDeviceGetAttribute\n");
1295     printf("    Maximum Grid Dimensions: \t\t%d x %d x %d \n", TmpInt, TmpInt2,
1296            TmpInt3);
1297     checkResult(
1298         cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_PITCH, Device),
1299         "Error returned from cuDeviceGetAttribute\n");
1300     printf("    Maximum Memory Pitch: \t\t%d bytes \n", TmpInt);
1301     checkResult(cuDeviceGetAttribute(
1302                     &TmpInt, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, Device),
1303                 "Error returned from cuDeviceGetAttribute\n");
1304     printf("    Texture Alignment: \t\t\t%d bytes \n", TmpInt);
1305     checkResult(
1306         cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, Device),
1307         "Error returned from cuDeviceGetAttribute\n");
1308     printf("    Clock Rate: \t\t\t%d kHz\n", TmpInt);
1309     checkResult(cuDeviceGetAttribute(
1310                     &TmpInt, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, Device),
1311                 "Error returned from cuDeviceGetAttribute\n");
1312     printf("    Execution Timeout: \t\t\t%s \n", BOOL2TEXT(TmpInt));
1313     checkResult(
1314         cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_INTEGRATED, Device),
1315         "Error returned from cuDeviceGetAttribute\n");
1316     printf("    Integrated Device: \t\t\t%s \n", BOOL2TEXT(TmpInt));
1317     checkResult(cuDeviceGetAttribute(
1318                     &TmpInt, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, Device),
1319                 "Error returned from cuDeviceGetAttribute\n");
1320     printf("    Can Map Host Memory: \t\t%s \n", BOOL2TEXT(TmpInt));
1321     checkResult(
1322         cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, Device),
1323         "Error returned from cuDeviceGetAttribute\n");
1324     if (TmpInt == CU_COMPUTEMODE_DEFAULT)
1325       TmpStr = "DEFAULT";
1326     else if (TmpInt == CU_COMPUTEMODE_PROHIBITED)
1327       TmpStr = "PROHIBITED";
1328     else if (TmpInt == CU_COMPUTEMODE_EXCLUSIVE_PROCESS)
1329       TmpStr = "EXCLUSIVE PROCESS";
1330     else
1331       TmpStr = "unknown";
1332     printf("    Compute Mode: \t\t\t%s \n", TmpStr.c_str());
1333     checkResult(cuDeviceGetAttribute(
1334                     &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, Device),
1335                 "Error returned from cuDeviceGetAttribute\n");
1336     printf("    Concurrent Kernels: \t\t%s \n", BOOL2TEXT(TmpInt));
1337     checkResult(
1338         cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, Device),
1339         "Error returned from cuDeviceGetAttribute\n");
1340     printf("    ECC Enabled: \t\t\t%s \n", BOOL2TEXT(TmpInt));
1341     checkResult(cuDeviceGetAttribute(
1342                     &TmpInt, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, Device),
1343                 "Error returned from cuDeviceGetAttribute\n");
1344     printf("    Memory Clock Rate: \t\t\t%d kHz\n", TmpInt);
1345     checkResult(
1346         cuDeviceGetAttribute(
1347             &TmpInt, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, Device),
1348         "Error returned from cuDeviceGetAttribute\n");
1349     printf("    Memory Bus Width: \t\t\t%d bits\n", TmpInt);
1350     checkResult(cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE,
1351                                      Device),
1352                 "Error returned from cuDeviceGetAttribute\n");
1353     printf("    L2 Cache Size: \t\t\t%d bytes \n", TmpInt);
1354     checkResult(cuDeviceGetAttribute(
1355                     &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
1356                     Device),
1357                 "Error returned from cuDeviceGetAttribute\n");
1358     printf("    Max Threads Per SMP: \t\t%d \n", TmpInt);
1359     checkResult(cuDeviceGetAttribute(
1360                     &TmpInt, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, Device),
1361                 "Error returned from cuDeviceGetAttribute\n");
1362     printf("    Async Engines: \t\t\t%s (%d) \n", BOOL2TEXT(TmpInt), TmpInt);
1363     checkResult(cuDeviceGetAttribute(
1364                     &TmpInt, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, Device),
1365                 "Error returned from cuDeviceGetAttribute\n");
1366     printf("    Unified Addressing: \t\t%s \n", BOOL2TEXT(TmpInt));
1367     checkResult(cuDeviceGetAttribute(
1368                     &TmpInt, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, Device),
1369                 "Error returned from cuDeviceGetAttribute\n");
1370     printf("    Managed Memory: \t\t\t%s \n", BOOL2TEXT(TmpInt));
1371     checkResult(
1372         cuDeviceGetAttribute(
1373             &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, Device),
1374         "Error returned from cuDeviceGetAttribute\n");
1375     printf("    Concurrent Managed Memory: \t\t%s \n", BOOL2TEXT(TmpInt));
1376     checkResult(
1377         cuDeviceGetAttribute(
1378             &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, Device),
1379         "Error returned from cuDeviceGetAttribute\n");
1380     printf("    Preemption Supported: \t\t%s \n", BOOL2TEXT(TmpInt));
1381     checkResult(cuDeviceGetAttribute(
1382                     &TmpInt, CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, Device),
1383                 "Error returned from cuDeviceGetAttribute\n");
1384     printf("    Cooperative Launch: \t\t%s \n", BOOL2TEXT(TmpInt));
1385     checkResult(cuDeviceGetAttribute(
1386                     &TmpInt, CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD, Device),
1387                 "Error returned from cuDeviceGetAttribute\n");
1388     printf("    Multi-Device Boars: \t\t%s \n", BOOL2TEXT(TmpInt));
1389     checkResult(
1390         cuDeviceGetAttribute(
1391             &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, Device),
1392         "Error returned from cuDeviceGetAttribute\n");
1393     checkResult(
1394         cuDeviceGetAttribute(
1395             &TmpInt2, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, Device),
1396         "Error returned from cuDeviceGetAttribute\n");
1397     printf("    Compute Capabilities: \t\t%d%d \n", TmpInt, TmpInt2);
1398   }
1399 
1400   int createEvent(int DeviceId, void **P) {
1401     CUevent Event = nullptr;
1402     if (EventPool[DeviceId]->acquire(Event) != OFFLOAD_SUCCESS)
1403       return OFFLOAD_FAIL;
1404     *P = Event;
1405     return OFFLOAD_SUCCESS;
1406   }
1407 
1408   int destroyEvent(int DeviceId, void *EventPtr) {
1409     EventPool[DeviceId]->release(reinterpret_cast<CUevent>(EventPtr));
1410     return OFFLOAD_SUCCESS;
1411   }
1412 
1413   int waitEvent(const int DeviceId, __tgt_async_info *AsyncInfo,
1414                 void *EventPtr) const {
1415     CUstream Stream = getStream(DeviceId, AsyncInfo);
1416     CUevent Event = reinterpret_cast<CUevent>(EventPtr);
1417 
1418     // We don't use CU_EVENT_WAIT_DEFAULT here as it is only available from
1419     // specific CUDA version, and defined as 0x0. In previous version, per CUDA
1420     // API document, that argument has to be 0x0.
1421     CUresult Err = cuStreamWaitEvent(Stream, Event, 0);
1422     if (Err != CUDA_SUCCESS) {
1423       DP("Error when waiting event. stream = " DPxMOD ", event = " DPxMOD "\n",
1424          DPxPTR(Stream), DPxPTR(Event));
1425       CUDA_ERR_STRING(Err);
1426       return OFFLOAD_FAIL;
1427     }
1428 
1429     return OFFLOAD_SUCCESS;
1430   }
1431 
1432   int releaseAsyncInfo(int DeviceId, __tgt_async_info *AsyncInfo) const {
1433     if (AsyncInfo->Queue) {
1434       StreamPool[DeviceId]->release(
1435           reinterpret_cast<CUstream>(AsyncInfo->Queue));
1436       AsyncInfo->Queue = nullptr;
1437     }
1438 
1439     return OFFLOAD_SUCCESS;
1440   }
1441 
1442   int initAsyncInfo(int DeviceId, __tgt_async_info **AsyncInfo) const {
1443     *AsyncInfo = new __tgt_async_info;
1444     getStream(DeviceId, *AsyncInfo);
1445     return OFFLOAD_SUCCESS;
1446   }
1447 
1448   int initDeviceInfo(int DeviceId, __tgt_device_info *DeviceInfo,
1449                      const char **ErrStr) const {
1450     assert(DeviceInfo && "DeviceInfo is nullptr");
1451 
1452     if (!DeviceInfo->Context)
1453       DeviceInfo->Context = DeviceData[DeviceId].Context;
1454     if (!DeviceInfo->Device) {
1455       CUdevice Dev;
1456       CUresult Err = cuDeviceGet(&Dev, DeviceId);
1457       if (Err == CUDA_SUCCESS) {
1458         DeviceInfo->Device = reinterpret_cast<void *>(Dev);
1459       } else {
1460         cuGetErrorString(Err, ErrStr);
1461         return OFFLOAD_FAIL;
1462       }
1463     }
1464     return OFFLOAD_SUCCESS;
1465   }
1466 
1467   int setContext(int DeviceId) {
1468     assert(InitializedFlags[DeviceId] && "Device is not initialized");
1469 
1470     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
1471     if (!checkResult(Err, "error returned from cuCtxSetCurrent"))
1472       return OFFLOAD_FAIL;
1473 
1474     return OFFLOAD_SUCCESS;
1475   }
1476 };
1477 
1478 DeviceRTLTy DeviceRTL;
1479 } // namespace
1480 
1481 // Exposed library API function
1482 #ifdef __cplusplus
1483 extern "C" {
1484 #endif
1485 
1486 int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
1487   return elf_check_machine(image, /* EM_CUDA */ 190);
1488 }
1489 
1490 int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); }
1491 
1492 int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
1493   DP("Init requires flags to %" PRId64 "\n", RequiresFlags);
1494   DeviceRTL.setRequiresFlag(RequiresFlags);
1495   return RequiresFlags;
1496 }
1497 
1498 int32_t __tgt_rtl_is_data_exchangable(int32_t src_dev_id, int dst_dev_id) {
1499   if (DeviceRTL.isValidDeviceId(src_dev_id) &&
1500       DeviceRTL.isValidDeviceId(dst_dev_id))
1501     return 1;
1502 
1503   return 0;
1504 }
1505 
1506 int32_t __tgt_rtl_init_device(int32_t device_id) {
1507   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1508   // Context is set when init the device.
1509 
1510   return DeviceRTL.initDevice(device_id);
1511 }
1512 
1513 int32_t __tgt_rtl_deinit_device(int32_t device_id) {
1514   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1515   // Context is set when deinit the device.
1516 
1517   return DeviceRTL.deinitDevice(device_id);
1518 }
1519 
1520 __tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
1521                                           __tgt_device_image *image) {
1522   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1523 
1524   if (DeviceRTL.setContext(device_id) != OFFLOAD_SUCCESS)
1525     return nullptr;
1526 
1527   return DeviceRTL.loadBinary(device_id, image);
1528 }
1529 
1530 void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *,
1531                            int32_t kind) {
1532   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1533 
1534   if (DeviceRTL.setContext(device_id) != OFFLOAD_SUCCESS)
1535     return nullptr;
1536 
1537   return DeviceRTL.dataAlloc(device_id, size, (TargetAllocTy)kind);
1538 }
1539 
1540 int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
1541                               int64_t size) {
1542   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1543   // Context is set in __tgt_rtl_data_submit_async.
1544 
1545   __tgt_async_info AsyncInfo;
1546   const int32_t rc = __tgt_rtl_data_submit_async(device_id, tgt_ptr, hst_ptr,
1547                                                  size, &AsyncInfo);
1548   if (rc != OFFLOAD_SUCCESS)
1549     return OFFLOAD_FAIL;
1550 
1551   return __tgt_rtl_synchronize(device_id, &AsyncInfo);
1552 }
1553 
1554 int32_t __tgt_rtl_data_submit_async(int32_t device_id, void *tgt_ptr,
1555                                     void *hst_ptr, int64_t size,
1556                                     __tgt_async_info *async_info_ptr) {
1557   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1558   assert(async_info_ptr && "async_info_ptr is nullptr");
1559 
1560   if (DeviceRTL.setContext(device_id) != OFFLOAD_SUCCESS)
1561     return OFFLOAD_FAIL;
1562 
1563   return DeviceRTL.dataSubmit(device_id, tgt_ptr, hst_ptr, size,
1564                               async_info_ptr);
1565 }
1566 
1567 int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
1568                                 int64_t size) {
1569   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1570   // Context is set in __tgt_rtl_data_retrieve_async.
1571 
1572   __tgt_async_info AsyncInfo;
1573   const int32_t rc = __tgt_rtl_data_retrieve_async(device_id, hst_ptr, tgt_ptr,
1574                                                    size, &AsyncInfo);
1575   if (rc != OFFLOAD_SUCCESS)
1576     return OFFLOAD_FAIL;
1577 
1578   return __tgt_rtl_synchronize(device_id, &AsyncInfo);
1579 }
1580 
1581 int32_t __tgt_rtl_data_retrieve_async(int32_t device_id, void *hst_ptr,
1582                                       void *tgt_ptr, int64_t size,
1583                                       __tgt_async_info *async_info_ptr) {
1584   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1585   assert(async_info_ptr && "async_info_ptr is nullptr");
1586 
1587   if (DeviceRTL.setContext(device_id) != OFFLOAD_SUCCESS)
1588     return OFFLOAD_FAIL;
1589 
1590   return DeviceRTL.dataRetrieve(device_id, hst_ptr, tgt_ptr, size,
1591                                 async_info_ptr);
1592 }
1593 
1594 int32_t __tgt_rtl_data_exchange_async(int32_t src_dev_id, void *src_ptr,
1595                                       int dst_dev_id, void *dst_ptr,
1596                                       int64_t size,
1597                                       __tgt_async_info *AsyncInfo) {
1598   assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
1599   assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");
1600   assert(AsyncInfo && "AsyncInfo is nullptr");
1601   // NOTE: We don't need to set context for data exchange as the device contexts
1602   // are passed to CUDA function directly.
1603   return DeviceRTL.dataExchange(src_dev_id, src_ptr, dst_dev_id, dst_ptr, size,
1604                                 AsyncInfo);
1605 }
1606 
1607 int32_t __tgt_rtl_data_exchange(int32_t src_dev_id, void *src_ptr,
1608                                 int32_t dst_dev_id, void *dst_ptr,
1609                                 int64_t size) {
1610   assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
1611   assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");
1612   // Context is set in __tgt_rtl_data_exchange_async.
1613 
1614   __tgt_async_info AsyncInfo;
1615   const int32_t rc = __tgt_rtl_data_exchange_async(
1616       src_dev_id, src_ptr, dst_dev_id, dst_ptr, size, &AsyncInfo);
1617   if (rc != OFFLOAD_SUCCESS)
1618     return OFFLOAD_FAIL;
1619 
1620   return __tgt_rtl_synchronize(src_dev_id, &AsyncInfo);
1621 }
1622 
1623 int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
1624   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1625 
1626   if (DeviceRTL.setContext(device_id) != OFFLOAD_SUCCESS)
1627     return OFFLOAD_FAIL;
1628 
1629   return DeviceRTL.dataDelete(device_id, tgt_ptr);
1630 }
1631 
1632 int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
1633                                          void **tgt_args,
1634                                          ptrdiff_t *tgt_offsets,
1635                                          int32_t arg_num, int32_t team_num,
1636                                          int32_t thread_limit,
1637                                          uint64_t loop_tripcount) {
1638   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1639   // Context is set in __tgt_rtl_run_target_team_region_async.
1640 
1641   __tgt_async_info AsyncInfo;
1642   const int32_t rc = __tgt_rtl_run_target_team_region_async(
1643       device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
1644       thread_limit, loop_tripcount, &AsyncInfo);
1645   if (rc != OFFLOAD_SUCCESS)
1646     return OFFLOAD_FAIL;
1647 
1648   return __tgt_rtl_synchronize(device_id, &AsyncInfo);
1649 }
1650 
1651 int32_t __tgt_rtl_run_target_team_region_async(
1652     int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
1653     ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
1654     int32_t thread_limit, uint64_t loop_tripcount,
1655     __tgt_async_info *async_info_ptr) {
1656   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1657 
1658   if (DeviceRTL.setContext(device_id) != OFFLOAD_SUCCESS)
1659     return OFFLOAD_FAIL;
1660 
1661   return DeviceRTL.runTargetTeamRegion(
1662       device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
1663       thread_limit, loop_tripcount, async_info_ptr);
1664 }
1665 
1666 int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
1667                                     void **tgt_args, ptrdiff_t *tgt_offsets,
1668                                     int32_t arg_num) {
1669   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1670   // Context is set in __tgt_rtl_run_target_region_async.
1671 
1672   __tgt_async_info AsyncInfo;
1673   const int32_t rc = __tgt_rtl_run_target_region_async(
1674       device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, &AsyncInfo);
1675   if (rc != OFFLOAD_SUCCESS)
1676     return OFFLOAD_FAIL;
1677 
1678   return __tgt_rtl_synchronize(device_id, &AsyncInfo);
1679 }
1680 
1681 int32_t __tgt_rtl_run_target_region_async(int32_t device_id,
1682                                           void *tgt_entry_ptr, void **tgt_args,
1683                                           ptrdiff_t *tgt_offsets,
1684                                           int32_t arg_num,
1685                                           __tgt_async_info *async_info_ptr) {
1686   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1687   // Context is set in __tgt_rtl_run_target_team_region_async.
1688   return __tgt_rtl_run_target_team_region_async(
1689       device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num,
1690       /* team num*/ 1, /* thread_limit */ 1, /* loop_tripcount */ 0,
1691       async_info_ptr);
1692 }
1693 
1694 int32_t __tgt_rtl_synchronize(int32_t device_id,
1695                               __tgt_async_info *async_info_ptr) {
1696   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1697   assert(async_info_ptr && "async_info_ptr is nullptr");
1698   assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr");
1699   // NOTE: We don't need to set context for stream sync.
1700   return DeviceRTL.synchronize(device_id, async_info_ptr);
1701 }
1702 
1703 void __tgt_rtl_set_info_flag(uint32_t NewInfoLevel) {
1704   std::atomic<uint32_t> &InfoLevel = getInfoLevelInternal();
1705   InfoLevel.store(NewInfoLevel);
1706 }
1707 
1708 void __tgt_rtl_print_device_info(int32_t device_id) {
1709   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1710   // NOTE: We don't need to set context for print device info.
1711   DeviceRTL.printDeviceInfo(device_id);
1712 }
1713 
1714 int32_t __tgt_rtl_create_event(int32_t device_id, void **event) {
1715   assert(event && "event is nullptr");
1716 
1717   if (DeviceRTL.setContext(device_id) != OFFLOAD_SUCCESS)
1718     return OFFLOAD_FAIL;
1719 
1720   return DeviceRTL.createEvent(device_id, event);
1721 }
1722 
1723 int32_t __tgt_rtl_record_event(int32_t device_id, void *event_ptr,
1724                                __tgt_async_info *async_info_ptr) {
1725   assert(async_info_ptr && "async_info_ptr is nullptr");
1726   assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr");
1727   assert(event_ptr && "event_ptr is nullptr");
1728   // NOTE: We might not need to set context for event record.
1729   return recordEvent(event_ptr, async_info_ptr);
1730 }
1731 
1732 int32_t __tgt_rtl_wait_event(int32_t device_id, void *event_ptr,
1733                              __tgt_async_info *async_info_ptr) {
1734   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1735   assert(async_info_ptr && "async_info_ptr is nullptr");
1736   assert(event_ptr && "event is nullptr");
1737   // If we don't have a queue we need to set the context.
1738   if (!async_info_ptr->Queue &&
1739       DeviceRTL.setContext(device_id) != OFFLOAD_SUCCESS)
1740     return OFFLOAD_FAIL;
1741   return DeviceRTL.waitEvent(device_id, async_info_ptr, event_ptr);
1742 }
1743 
1744 int32_t __tgt_rtl_sync_event(int32_t device_id, void *event_ptr) {
1745   assert(event_ptr && "event is nullptr");
1746   // NOTE: We might not need to set context for event sync.
1747   return syncEvent(event_ptr);
1748 }
1749 
1750 int32_t __tgt_rtl_destroy_event(int32_t device_id, void *event_ptr) {
1751   assert(event_ptr && "event is nullptr");
1752 
1753   if (DeviceRTL.setContext(device_id) != OFFLOAD_SUCCESS)
1754     return OFFLOAD_FAIL;
1755 
1756   return DeviceRTL.destroyEvent(device_id, event_ptr);
1757 }
1758 
1759 int32_t __tgt_rtl_release_async_info(int32_t device_id,
1760                                      __tgt_async_info *async_info) {
1761   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1762   assert(async_info && "async_info is nullptr");
1763 
1764   if (DeviceRTL.setContext(device_id) != OFFLOAD_SUCCESS)
1765     return OFFLOAD_FAIL;
1766 
1767   return DeviceRTL.releaseAsyncInfo(device_id, async_info);
1768 }
1769 
1770 int32_t __tgt_rtl_init_async_info(int32_t device_id,
1771                                   __tgt_async_info **async_info) {
1772   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1773   assert(async_info && "async_info is nullptr");
1774 
1775   if (DeviceRTL.setContext(device_id) != OFFLOAD_SUCCESS)
1776     return OFFLOAD_FAIL;
1777 
1778   return DeviceRTL.initAsyncInfo(device_id, async_info);
1779 }
1780 
1781 int32_t __tgt_rtl_init_device_info(int32_t device_id,
1782                                    __tgt_device_info *device_info_ptr,
1783                                    const char **err_str) {
1784   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1785   assert(device_info_ptr && "device_info_ptr is nullptr");
1786 
1787   if (DeviceRTL.setContext(device_id) != OFFLOAD_SUCCESS)
1788     return OFFLOAD_FAIL;
1789 
1790   return DeviceRTL.initDeviceInfo(device_id, device_info_ptr, err_str);
1791 }
1792 
1793 #ifdef __cplusplus
1794 }
1795 #endif
1796