//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// RTL for CUDA machine
//
//===----------------------------------------------------------------------===//

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cuda.h>
#include <list>
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>
#include <vector>

#include "Debug.h"
#include "DeviceEnvironment.h"
#include "omptarget.h"
#include "omptargetplugin.h"

#define TARGET_NAME CUDA
#define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"

#include "MemoryManager.h"

#include "llvm/Frontend/OpenMP/OMPConstants.h"

// Utility for retrieving and printing CUDA error string.
#ifdef OMPTARGET_DEBUG
#define CUDA_ERR_STRING(err)                                                   \
  do {                                                                         \
    if (getDebugLevel() > 0) {                                                 \
      const char *errStr = nullptr;                                            \
      CUresult errStr_status = cuGetErrorString(err, &errStr);                 \
      if (errStr_status == CUDA_ERROR_INVALID_VALUE)                           \
        REPORT("Unrecognized CUDA error code: %d\n", err);                     \
      else if (errStr_status == CUDA_SUCCESS)                                  \
        REPORT("CUDA error is: %s\n", errStr);                                 \
      else {                                                                   \
        REPORT("Unresolved CUDA error code: %d\n", err);                       \
        REPORT("Unsuccessful cuGetErrorString return status: %d\n",            \
               errStr_status);                                                 \
      }                                                                        \
    } else {                                                                   \
      const char *errStr = nullptr;                                            \
      CUresult errStr_status = cuGetErrorString(err, &errStr);                 \
      if (errStr_status == CUDA_SUCCESS)                                       \
        REPORT("%s \n", errStr);                                               \
    }                                                                          \
  } while (false)
#else // OMPTARGET_DEBUG
#define CUDA_ERR_STRING(err)                                                   \
  do {                                                                         \
    const char *errStr = nullptr;                                              \
    CUresult errStr_status = cuGetErrorString(err, &errStr);                   \
    if (errStr_status == CUDA_SUCCESS)                                         \
      REPORT("%s \n", errStr);                                                 \
  } while (false)
#endif // OMPTARGET_DEBUG
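
// Typical use of the macro above (illustrative sketch, mirroring how the rest
// of this file reports driver errors):
//
//   CUresult Err = cuInit(0);
//   if (Err != CUDA_SUCCESS) {
//     REPORT("Error returned from cuInit\n");
//     CUDA_ERR_STRING(Err);
//   }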

#define BOOL2TEXT(b) ((b) ? "Yes" : "No")

#include "elf_common.h"

/// Keep entries table per device.
struct FuncOrGblEntryTy {
  __tgt_target_table Table;
  std::vector<__tgt_offload_entry> Entries;
};

/// Use a single entity to encode a kernel and a set of flags.
struct KernelTy {
  CUfunction Func;

  // execution mode of kernel
  llvm::omp::OMPTgtExecModeFlags ExecutionMode;

  /// Maximal number of threads per block for this kernel.
  int MaxThreadsPerBlock = 0;

  KernelTy(CUfunction _Func, llvm::omp::OMPTgtExecModeFlags _ExecutionMode)
      : Func(_Func), ExecutionMode(_ExecutionMode) {}
};

namespace {
bool checkResult(CUresult Err, const char *ErrMsg) {
  if (Err == CUDA_SUCCESS)
    return true;

  REPORT("%s", ErrMsg);
  CUDA_ERR_STRING(Err);
  return false;
}

int memcpyDtoD(const void *SrcPtr, void *DstPtr, int64_t Size,
               CUstream Stream) {
  CUresult Err =
      cuMemcpyDtoDAsync((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, Size, Stream);

  if (Err != CUDA_SUCCESS) {
    DP("Error when copying data from device to device. Pointers: src "
       "= " DPxMOD ", dst = " DPxMOD ", size = %" PRId64 "\n",
       DPxPTR(SrcPtr), DPxPTR(DstPtr), Size);
    CUDA_ERR_STRING(Err);
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

int recordEvent(void *EventPtr, __tgt_async_info *AsyncInfo) {
  CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue);
  CUevent Event = reinterpret_cast<CUevent>(EventPtr);

  CUresult Err = cuEventRecord(Event, Stream);
  if (Err != CUDA_SUCCESS) {
    DP("Error when recording event. stream = " DPxMOD ", event = " DPxMOD "\n",
       DPxPTR(Stream), DPxPTR(Event));
    CUDA_ERR_STRING(Err);
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

int syncEvent(void *EventPtr) {
  CUevent Event = reinterpret_cast<CUevent>(EventPtr);

  CUresult Err = cuEventSynchronize(Event);
  if (Err != CUDA_SUCCESS) {
    DP("Error when syncing event = " DPxMOD "\n", DPxPTR(Event));
    CUDA_ERR_STRING(Err);
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}
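
// Illustrative sketch (not part of the plugin): using the helpers above to
// record an event on the stream of an __tgt_async_info and wait for it. The
// CUevent is assumed to come from an event pool such as the one defined below.
//
//   CUevent Event = /* acquired from an event pool */ nullptr;
//   if (recordEvent(Event, AsyncInfo) == OFFLOAD_SUCCESS)
//     (void)syncEvent(Event);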

namespace {

// Structure contains per-device data
struct DeviceDataTy {
  /// List that contains all the kernels.
  std::list<KernelTy> KernelsList;

  std::list<FuncOrGblEntryTy> FuncGblEntries;

  CUcontext Context = nullptr;
  // Device properties
  int ThreadsPerBlock = 0;
  int BlocksPerGrid = 0;
  int WarpSize = 0;
  // OpenMP properties
  int NumTeams = 0;
  int NumThreads = 0;
};

/// Resource allocator where \p T is the resource type.
/// Functions \p create and \p destroy return OFFLOAD_SUCCESS and OFFLOAD_FAIL
/// accordingly. The implementation should not raise any exception.
template <typename T> struct AllocatorTy {
  using ElementTy = T;
  virtual ~AllocatorTy() {}

  /// Create a resource and assign to R.
  virtual int create(T &R) noexcept = 0;
  /// Destroy the resource.
  virtual int destroy(T) noexcept = 0;
};

/// Allocator for CUstream.
struct StreamAllocatorTy final : public AllocatorTy<CUstream> {
  /// See AllocatorTy<T>::create.
  int create(CUstream &Stream) noexcept override {
    if (!checkResult(cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING),
                     "Error returned from cuStreamCreate\n"))
      return OFFLOAD_FAIL;

    return OFFLOAD_SUCCESS;
  }

  /// See AllocatorTy<T>::destroy.
  int destroy(CUstream Stream) noexcept override {
    if (!checkResult(cuStreamDestroy(Stream),
                     "Error returned from cuStreamDestroy\n"))
      return OFFLOAD_FAIL;

    return OFFLOAD_SUCCESS;
  }
};

/// Allocator for CUevent.
struct EventAllocatorTy final : public AllocatorTy<CUevent> {
  /// See AllocatorTy<T>::create.
  int create(CUevent &Event) noexcept override {
    if (!checkResult(cuEventCreate(&Event, CU_EVENT_DEFAULT),
                     "Error returned from cuEventCreate\n"))
      return OFFLOAD_FAIL;

    return OFFLOAD_SUCCESS;
  }

  /// See AllocatorTy<T>::destroy.
  int destroy(CUevent Event) noexcept override {
    if (!checkResult(cuEventDestroy(Event),
                     "Error returned from cuEventDestroy\n"))
      return OFFLOAD_FAIL;

    return OFFLOAD_SUCCESS;
  }
};

/// A generic pool of resources, where \p AllocTy is the allocator type and the
/// resource type is \p AllocTy::ElementTy. The resource type should be
/// copyable, as resources are stored in a \p std::vector.
template <typename AllocTy> class ResourcePoolTy {
  using ElementTy = typename AllocTy::ElementTy;
  /// Index of the next available resource.
  size_t Next = 0;
  /// Mutex to guard the pool.
  std::mutex Mutex;
  /// Pool of resources. The difference between \p Resources and \p Pool is
  /// that acquiring and releasing a resource only touches \p Resources, while
  /// a batch of newly created resources is added to both \p Resources and
  /// \p Pool. The reason for this split is that \p Resources may contain
  /// duplicate elements if resources are never released, which could cause a
  /// double free. Keeping \p Pool separate ensures it always holds exactly one
  /// copy of every resource allocated from the device.
  std::vector<ElementTy> Resources;
  std::vector<ElementTy> Pool;
  /// The allocator used to create and destroy resources.
  AllocTy Allocator;

  /// If `Resources` is used up, we will fill in more resources. It assumes that
  /// the new size `Size` is always larger than the current size.
  bool resize(size_t Size) {
    assert(Resources.size() == Pool.size() && "size mismatch");
    auto CurSize = Resources.size();
    assert(Size > CurSize && "Unexpected smaller size");
    Pool.reserve(Size);
    Resources.reserve(Size);
    for (auto I = CurSize; I < Size; ++I) {
      ElementTy NewItem;
      int Ret = Allocator.create(NewItem);
      if (Ret != OFFLOAD_SUCCESS)
        return false;
      Pool.push_back(NewItem);
      Resources.push_back(NewItem);
    }
    return true;
  }

public:
  ResourcePoolTy(AllocTy &&A, size_t Size = 0) noexcept
      : Allocator(std::move(A)) {
    if (Size)
      (void)resize(Size);
  }

  ~ResourcePoolTy() noexcept { clear(); }

  /// Get a resource from the pool. `Next` always points to the next available
  /// resource. That means `[0, Next-1]` have been assigned, and `[Next, ...)`
  /// are still available. If there is no resource left, we will ask for more.
  /// Each time a resource is assigned, `Next` is increased by one.
  /// xxxxxs+++++++++
  ///      ^
  ///      Next
  /// After assignment, the pool becomes the following and s is assigned.
  /// xxxxxs+++++++++
  ///       ^
  ///       Next
  int acquire(ElementTy &R) noexcept {
    std::lock_guard<std::mutex> LG(Mutex);
    if (Next == Resources.size()) {
      auto NewSize = Resources.size() ? Resources.size() * 2 : 1;
      if (!resize(NewSize))
        return OFFLOAD_FAIL;
    }

    assert(Next < Resources.size());

    R = Resources[Next++];

    return OFFLOAD_SUCCESS;
  }

  /// Return the resource back to the pool. When we return a resource, we first
  /// decrease `Next` and then copy the resource back. It is worth noting that
  /// the order in which resources are returned might differ from the order in
  /// which they were assigned; that is, at some point there might be two
  /// identical resources.
  /// xxax+a+++++
  ///     ^
  ///     Next
  /// However, this doesn't matter, because they are always on the two sides of
  /// `Next`. The left one will eventually be overwritten by another resource.
  /// Therefore, after several executions, the order of the pool might differ
  /// from its initial state.
  void release(ElementTy R) noexcept {
    std::lock_guard<std::mutex> LG(Mutex);
    Resources[--Next] = R;
  }

  /// Release all stored resources and clear the pool.
  /// Note: This function is not thread-safe. Be sure to guard it if necessary.
  void clear() noexcept {
    for (auto &R : Pool)
      (void)Allocator.destroy(R);
    Pool.clear();
    Resources.clear();
  }
};
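
// A minimal usage sketch (illustrative only): acquire a CUstream from a pool
// backed by the StreamAllocatorTy above and return it when done.
//
//   ResourcePoolTy<StreamAllocatorTy> Streams(StreamAllocatorTy(), /*Size=*/4);
//   CUstream S;
//   if (Streams.acquire(S) == OFFLOAD_SUCCESS) {
//     // ... enqueue asynchronous work on S ...
//     Streams.release(S);
//   }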

} // namespace

class DeviceRTLTy {
  int NumberOfDevices;
  // OpenMP environment properties
  int EnvNumTeams;
  int EnvTeamLimit;
  int EnvTeamThreadLimit;
  // OpenMP requires flags
  int64_t RequiresFlags;
  // Amount of dynamic shared memory to use at launch.
  uint64_t DynamicMemorySize;

  /// Number of initial streams for each device.
  int NumInitialStreams = 32;

  /// Number of initial events for each device.
  int NumInitialEvents = 8;

  static constexpr const int32_t HardThreadLimit = 1024;
  static constexpr const int32_t DefaultNumTeams = 128;
  static constexpr const int32_t DefaultNumThreads = 128;

  using StreamPoolTy = ResourcePoolTy<StreamAllocatorTy>;
  std::vector<std::unique_ptr<StreamPoolTy>> StreamPool;

  using EventPoolTy = ResourcePoolTy<EventAllocatorTy>;
  std::vector<std::unique_ptr<EventPoolTy>> EventPool;

  std::vector<DeviceDataTy> DeviceData;
  std::vector<std::vector<CUmodule>> Modules;

  /// Vector of flags indicating the initialization status of all associated
  /// devices.
  std::vector<bool> InitializedFlags;

  enum class PeerAccessState : uint8_t { Unknown, Yes, No };
  std::vector<std::vector<PeerAccessState>> PeerAccessMatrix;
  std::mutex PeerAccessMatrixLock;

  /// A class responsible for interacting with device native runtime library to
  /// allocate and free memory.
  class CUDADeviceAllocatorTy : public DeviceAllocatorTy {
    std::unordered_map<void *, TargetAllocTy> HostPinnedAllocs;

  public:
    void *allocate(size_t Size, void *, TargetAllocTy Kind) override {
      if (Size == 0)
        return nullptr;

      void *MemAlloc = nullptr;
      CUresult Err;
      switch (Kind) {
      case TARGET_ALLOC_DEFAULT:
      case TARGET_ALLOC_DEVICE:
        CUdeviceptr DevicePtr;
        Err = cuMemAlloc(&DevicePtr, Size);
        MemAlloc = (void *)DevicePtr;
        if (!checkResult(Err, "Error returned from cuMemAlloc\n"))
          return nullptr;
        break;
      case TARGET_ALLOC_HOST:
        void *HostPtr;
        Err = cuMemAllocHost(&HostPtr, Size);
        MemAlloc = HostPtr;
        if (!checkResult(Err, "Error returned from cuMemAllocHost\n"))
          return nullptr;
        HostPinnedAllocs[MemAlloc] = Kind;
        break;
      case TARGET_ALLOC_SHARED:
        CUdeviceptr SharedPtr;
        Err = cuMemAllocManaged(&SharedPtr, Size, CU_MEM_ATTACH_GLOBAL);
        MemAlloc = (void *)SharedPtr;
        if (!checkResult(Err, "Error returned from cuMemAllocManaged\n"))
          return nullptr;
        break;
      }

      return MemAlloc;
    }

    int free(void *TgtPtr) override {
      CUresult Err;
      // Host pinned memory must be freed differently.
      TargetAllocTy Kind =
          (HostPinnedAllocs.find(TgtPtr) == HostPinnedAllocs.end())
              ? TARGET_ALLOC_DEFAULT
              : TARGET_ALLOC_HOST;
      switch (Kind) {
      case TARGET_ALLOC_DEFAULT:
      case TARGET_ALLOC_DEVICE:
      case TARGET_ALLOC_SHARED:
        Err = cuMemFree((CUdeviceptr)TgtPtr);
        if (!checkResult(Err, "Error returned from cuMemFree\n"))
          return OFFLOAD_FAIL;
        break;
      case TARGET_ALLOC_HOST:
        Err = cuMemFreeHost(TgtPtr);
        if (!checkResult(Err, "Error returned from cuMemFreeHost\n"))
          return OFFLOAD_FAIL;
        break;
      }

      return OFFLOAD_SUCCESS;
    }
  };

  /// A vector of device allocators
  std::vector<CUDADeviceAllocatorTy> DeviceAllocators;

  /// A vector of memory managers. Since the memory manager is non-copyable and
  /// non-movable, we wrap them into std::unique_ptr.
  std::vector<std::unique_ptr<MemoryManagerTy>> MemoryManagers;

  /// Whether to use the memory manager.
  bool UseMemoryManager = true;

  // Record entry point associated with device
  void addOffloadEntry(const int DeviceId, const __tgt_offload_entry entry) {
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
    E.Entries.push_back(entry);
  }

  // Return a pointer to the entry associated with the pointer
  const __tgt_offload_entry *getOffloadEntry(const int DeviceId,
                                             const void *Addr) const {
    for (const __tgt_offload_entry &Itr :
         DeviceData[DeviceId].FuncGblEntries.back().Entries)
      if (Itr.addr == Addr)
        return &Itr;

    return nullptr;
  }

  // Return the pointer to the target entries table
  __tgt_target_table *getOffloadEntriesTable(const int DeviceId) {
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();

    if (E.Entries.empty())
      return nullptr;

    // Update table info according to the entries and return the pointer
    E.Table.EntriesBegin = E.Entries.data();
    E.Table.EntriesEnd = E.Entries.data() + E.Entries.size();

    return &E.Table;
  }

  // Clear entries table for a device
  void clearOffloadEntriesTable(const int DeviceId) {
    DeviceData[DeviceId].FuncGblEntries.emplace_back();
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
    E.Entries.clear();
    E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr;
  }

public:
  CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfo) const {
    assert(AsyncInfo && "AsyncInfo is nullptr");

    if (!AsyncInfo->Queue) {
      CUstream S;
      if (StreamPool[DeviceId]->acquire(S) != OFFLOAD_SUCCESS)
        return nullptr;

      AsyncInfo->Queue = S;
    }

    return reinterpret_cast<CUstream>(AsyncInfo->Queue);
  }
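
  // Illustrative sketch (not part of the interface): asynchronous driver calls
  // made on behalf of one __tgt_async_info reuse the stream cached in its
  // Queue field, which is the pattern dataSubmit/dataRetrieve below follow:
  //
  //   CUstream Stream = getStream(DeviceId, AsyncInfo);
  //   CUresult Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size,
  //                                    Stream);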

  // This class should not be copied
  DeviceRTLTy(const DeviceRTLTy &) = delete;
  DeviceRTLTy(DeviceRTLTy &&) = delete;

  DeviceRTLTy()
      : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1),
        EnvTeamThreadLimit(-1), RequiresFlags(OMP_REQ_UNDEFINED),
        DynamicMemorySize(0) {

    DP("Start initializing CUDA\n");

    CUresult Err = cuInit(0);
    if (Err == CUDA_ERROR_INVALID_HANDLE) {
      // Can't call cuGetErrorString if dlsym failed
      DP("Failed to load CUDA shared library\n");
      return;
    }
    if (!checkResult(Err, "Error returned from cuInit\n")) {
      return;
    }

    Err = cuDeviceGetCount(&NumberOfDevices);
    if (!checkResult(Err, "Error returned from cuDeviceGetCount\n"))
      return;

    if (NumberOfDevices == 0) {
      DP("There are no devices supporting CUDA.\n");
      return;
    }

    DeviceData.resize(NumberOfDevices);
    Modules.resize(NumberOfDevices);
    StreamPool.resize(NumberOfDevices);
    EventPool.resize(NumberOfDevices);
    PeerAccessMatrix.resize(NumberOfDevices);
    for (auto &V : PeerAccessMatrix)
      V.resize(NumberOfDevices, PeerAccessState::Unknown);

    // Get environment variables regarding teams
    if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) {
      // OMP_TEAM_LIMIT has been set
      EnvTeamLimit = std::stoi(EnvStr);
      DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
    }
    if (const char *EnvStr = getenv("OMP_TEAMS_THREAD_LIMIT")) {
      // OMP_TEAMS_THREAD_LIMIT has been set
      EnvTeamThreadLimit = std::stoi(EnvStr);
      DP("Parsed OMP_TEAMS_THREAD_LIMIT=%d\n", EnvTeamThreadLimit);
    }
    if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) {
      // OMP_NUM_TEAMS has been set
      EnvNumTeams = std::stoi(EnvStr);
      DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
    }
    if (const char *EnvStr = getenv("LIBOMPTARGET_SHARED_MEMORY_SIZE")) {
      // LIBOMPTARGET_SHARED_MEMORY_SIZE has been set
      DynamicMemorySize = std::stoi(EnvStr);
      DP("Parsed LIBOMPTARGET_SHARED_MEMORY_SIZE = %" PRIu64 "\n",
         DynamicMemorySize);
    }
    if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS")) {
      // LIBOMPTARGET_NUM_INITIAL_STREAMS has been set
      NumInitialStreams = std::stoi(EnvStr);
      DP("Parsed LIBOMPTARGET_NUM_INITIAL_STREAMS=%d\n", NumInitialStreams);
    }

    for (int I = 0; I < NumberOfDevices; ++I)
      DeviceAllocators.emplace_back();

    // Get the size threshold from environment variable
    std::pair<size_t, bool> Res = MemoryManagerTy::getSizeThresholdFromEnv();
    UseMemoryManager = Res.second;
    size_t MemoryManagerThreshold = Res.first;

    if (UseMemoryManager)
      for (int I = 0; I < NumberOfDevices; ++I)
        MemoryManagers.emplace_back(std::make_unique<MemoryManagerTy>(
            DeviceAllocators[I], MemoryManagerThreshold));

    // We lazily initialize all devices later.
    InitializedFlags.assign(NumberOfDevices, false);
  }

  ~DeviceRTLTy() {
    for (int DeviceId = 0; DeviceId < NumberOfDevices; ++DeviceId)
      deinitDevice(DeviceId);
  }

  // Check whether a given DeviceId is valid
  bool isValidDeviceId(const int DeviceId) const {
    return DeviceId >= 0 && DeviceId < NumberOfDevices;
  }

  int getNumOfDevices() const { return NumberOfDevices; }

  void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; }

  int initDevice(const int DeviceId) {
    CUdevice Device;

    DP("Getting device %d\n", DeviceId);
    CUresult Err = cuDeviceGet(&Device, DeviceId);
    if (!checkResult(Err, "Error returned from cuDeviceGet\n"))
      return OFFLOAD_FAIL;

    assert(InitializedFlags[DeviceId] == false && "Reinitializing device!");
    InitializedFlags[DeviceId] = true;

    // Query the current flags of the primary context and set its flags if
    // it is inactive
    unsigned int FormerPrimaryCtxFlags = 0;
    int FormerPrimaryCtxIsActive = 0;
    Err = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags,
                                     &FormerPrimaryCtxIsActive);
    if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxGetState\n"))
      return OFFLOAD_FAIL;

    if (FormerPrimaryCtxIsActive) {
      DP("The primary context is active, no change to its flags\n");
      if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) !=
          CU_CTX_SCHED_BLOCKING_SYNC)
        DP("Warning: the current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n");
    } else {
      DP("The primary context is inactive, set its flags to "
         "CU_CTX_SCHED_BLOCKING_SYNC\n");
      Err = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC);
      if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxSetFlags\n"))
        return OFFLOAD_FAIL;
    }

    // Retain the per device primary context and save it to use whenever this
    // device is selected.
    Err = cuDevicePrimaryCtxRetain(&DeviceData[DeviceId].Context, Device);
    if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxRetain\n"))
      return OFFLOAD_FAIL;

    Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    // Initialize the stream pool.
    if (!StreamPool[DeviceId])
      StreamPool[DeviceId] = std::make_unique<StreamPoolTy>(StreamAllocatorTy(),
                                                            NumInitialStreams);

    // Initialize the event pool.
    if (!EventPool[DeviceId])
      EventPool[DeviceId] =
          std::make_unique<EventPoolTy>(EventAllocatorTy(), NumInitialEvents);

    // Query attributes to determine number of threads/block and blocks/grid.
    int MaxGridDimX;
    Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
                               Device);
    if (Err != CUDA_SUCCESS) {
      DP("Error getting max grid dimension, use default value %d\n",
         DeviceRTLTy::DefaultNumTeams);
      DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::DefaultNumTeams;
    } else {
      DP("Using %d CUDA blocks per grid\n", MaxGridDimX);
      DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX;
    }

    // We are only exploiting threads along the x axis.
    int MaxBlockDimX;
    Err = cuDeviceGetAttribute(&MaxBlockDimX,
                               CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device);
    if (Err != CUDA_SUCCESS) {
      DP("Error getting max block dimension, use default value %d\n",
         DeviceRTLTy::DefaultNumThreads);
      DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads;
    } else {
      DP("Using %d CUDA threads per block\n", MaxBlockDimX);
      DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX;

      if (EnvTeamThreadLimit > 0 &&
          DeviceData[DeviceId].ThreadsPerBlock > EnvTeamThreadLimit) {
        DP("Max CUDA threads per block %d exceeds the thread limit %d set by "
           "OMP_TEAMS_THREAD_LIMIT, capping at the limit\n",
           DeviceData[DeviceId].ThreadsPerBlock, EnvTeamThreadLimit);
        DeviceData[DeviceId].ThreadsPerBlock = EnvTeamThreadLimit;
      }
      if (DeviceData[DeviceId].ThreadsPerBlock > DeviceRTLTy::HardThreadLimit) {
        DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
           "capping at the hard limit\n",
           DeviceData[DeviceId].ThreadsPerBlock, DeviceRTLTy::HardThreadLimit);
        DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit;
      }
    }

    // Get and set warp size
    int WarpSize;
    Err =
        cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device);
    if (Err != CUDA_SUCCESS) {
      DP("Error getting warp size, assume default value 32\n");
      DeviceData[DeviceId].WarpSize = 32;
    } else {
      DP("Using warp size %d\n", WarpSize);
      DeviceData[DeviceId].WarpSize = WarpSize;
    }

    // Adjust teams to the env variables
    if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) {
      DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
         EnvTeamLimit);
      DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit;
    }

    size_t StackLimit;
    size_t HeapLimit;
    if (const char *EnvStr = getenv("LIBOMPTARGET_STACK_SIZE")) {
      StackLimit = std::stol(EnvStr);
      if (cuCtxSetLimit(CU_LIMIT_STACK_SIZE, StackLimit) != CUDA_SUCCESS)
        return OFFLOAD_FAIL;
    } else {
      if (cuCtxGetLimit(&StackLimit, CU_LIMIT_STACK_SIZE) != CUDA_SUCCESS)
        return OFFLOAD_FAIL;
    }
    if (const char *EnvStr = getenv("LIBOMPTARGET_HEAP_SIZE")) {
      HeapLimit = std::stol(EnvStr);
      if (cuCtxSetLimit(CU_LIMIT_MALLOC_HEAP_SIZE, HeapLimit) != CUDA_SUCCESS)
        return OFFLOAD_FAIL;
    } else {
      if (cuCtxGetLimit(&HeapLimit, CU_LIMIT_MALLOC_HEAP_SIZE) != CUDA_SUCCESS)
        return OFFLOAD_FAIL;
    }

    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
         "Device supports up to %d CUDA blocks and %d threads with a "
         "warp size of %d\n",
         DeviceData[DeviceId].BlocksPerGrid,
         DeviceData[DeviceId].ThreadsPerBlock, DeviceData[DeviceId].WarpSize);
    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
         "Device heap size is %d Bytes, device stack size is %d Bytes per "
         "thread\n",
         (int)HeapLimit, (int)StackLimit);

    // Set default number of teams
    if (EnvNumTeams > 0) {
      DP("Default number of teams set according to environment %d\n",
         EnvNumTeams);
      DeviceData[DeviceId].NumTeams = EnvNumTeams;
    } else {
      DeviceData[DeviceId].NumTeams = DeviceRTLTy::DefaultNumTeams;
      DP("Default number of teams set according to library's default %d\n",
         DeviceRTLTy::DefaultNumTeams);
    }

    if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) {
      DP("Default number of teams exceeds device limit, capping at %d\n",
         DeviceData[DeviceId].BlocksPerGrid);
      DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid;
    }

    // Set default number of threads
    DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads;
    DP("Default number of threads set according to library's default %d\n",
       DeviceRTLTy::DefaultNumThreads);
    if (DeviceData[DeviceId].NumThreads >
        DeviceData[DeviceId].ThreadsPerBlock) {
      DP("Default number of threads exceeds device limit, capping at %d\n",
         DeviceData[DeviceId].ThreadsPerBlock);
      DeviceData[DeviceId].NumThreads = DeviceData[DeviceId].ThreadsPerBlock;
    }

    return OFFLOAD_SUCCESS;
  }

  int deinitDevice(const int DeviceId) {
    auto IsInitialized = InitializedFlags[DeviceId];
    if (!IsInitialized)
      return OFFLOAD_SUCCESS;
    InitializedFlags[DeviceId] = false;

    if (UseMemoryManager)
      MemoryManagers[DeviceId].release();

    StreamPool[DeviceId].reset();
    EventPool[DeviceId].reset();

    DeviceDataTy &D = DeviceData[DeviceId];
    if (!checkResult(cuCtxSetCurrent(D.Context),
                     "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    // Unload all modules.
    for (auto &M : Modules[DeviceId])
      if (!checkResult(cuModuleUnload(M),
                       "Error returned from cuModuleUnload\n"))
        return OFFLOAD_FAIL;

    // Destroy context.
    CUdevice Device;
    if (!checkResult(cuCtxGetDevice(&Device),
                     "Error returned from cuCtxGetDevice\n"))
      return OFFLOAD_FAIL;

    if (!checkResult(cuDevicePrimaryCtxRelease(Device),
                     "Error returned from cuDevicePrimaryCtxRelease\n"))
      return OFFLOAD_FAIL;

    return OFFLOAD_SUCCESS;
  }

  __tgt_target_table *loadBinary(const int DeviceId,
                                 const __tgt_device_image *Image) {
    // Clear the offload table as we are going to create a new one.
    clearOffloadEntriesTable(DeviceId);

    // Create the module and extract the function pointers.
    CUmodule Module;
    DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart));
    CUresult Err =
        cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr);
    if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n"))
      return nullptr;

    DP("CUDA module successfully loaded!\n");

    Modules[DeviceId].push_back(Module);

    // Find the symbols in the module by name.
    const __tgt_offload_entry *HostBegin = Image->EntriesBegin;
    const __tgt_offload_entry *HostEnd = Image->EntriesEnd;

    std::list<KernelTy> &KernelsList = DeviceData[DeviceId].KernelsList;
    for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) {
      if (!E->addr) {
        // We return nullptr when something like this happens; the host should
        // always have something in the address to uniquely identify the target
        // region.
        DP("Invalid binary: host entry '<null>' (size = %zd)...\n", E->size);
        return nullptr;
      }

      if (E->size) {
        __tgt_offload_entry Entry = *E;
        CUdeviceptr CUPtr;
        size_t CUSize;
        Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name);
        // We keep this style here because we need the name
        if (Err != CUDA_SUCCESS) {
          REPORT("Loading global '%s' Failed\n", E->name);
          CUDA_ERR_STRING(Err);
          return nullptr;
        }

        if (CUSize != E->size) {
          DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name,
             CUSize, E->size);
          return nullptr;
        }

        DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
           DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr));

        Entry.addr = (void *)(CUPtr);

        // Note: In the current implementation declare target variables
        // can either be link or to. This means that once unified
        // memory is activated via the requires directive, the variable
        // can be used directly from the host in both cases.
        // TODO: when variables types other than to or link are added,
        // the below condition should be changed to explicitly
        // check for to and link variables types:
        // (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags &
        // OMP_DECLARE_TARGET_LINK || e->flags == OMP_DECLARE_TARGET_TO))
        if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
          // If unified memory is present any target link or to variables
          // can access host addresses directly. There is no longer a
          // need for device copies.
          cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *));
          DP("Copy linked variable host address (" DPxMOD
             ") to device address (" DPxMOD ")\n",
             DPxPTR(*((void **)E->addr)), DPxPTR(CUPtr));
        }

        addOffloadEntry(DeviceId, Entry);

        continue;
      }

      CUfunction Func;
      Err = cuModuleGetFunction(&Func, Module, E->name);
      // We keep this style here because we need the name
      if (Err != CUDA_SUCCESS) {
        REPORT("Loading '%s' Failed\n", E->name);
        CUDA_ERR_STRING(Err);
        return nullptr;
      }

      DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
         DPxPTR(E - HostBegin), E->name, DPxPTR(Func));

      // default value GENERIC (in case symbol is missing from cubin file)
      llvm::omp::OMPTgtExecModeFlags ExecModeVal =
          llvm::omp::OMP_TGT_EXEC_MODE_GENERIC;
      std::string ExecModeNameStr(E->name);
      ExecModeNameStr += "_exec_mode";
      const char *ExecModeName = ExecModeNameStr.c_str();

      CUdeviceptr ExecModePtr;
      size_t CUSize;
      Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName);
      if (Err == CUDA_SUCCESS) {
        if (CUSize != sizeof(llvm::omp::OMPTgtExecModeFlags)) {
          DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
             ExecModeName, CUSize, sizeof(llvm::omp::OMPTgtExecModeFlags));
          return nullptr;
        }

        Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize);
        if (Err != CUDA_SUCCESS) {
          REPORT("Error when copying data from device to host. Pointers: "
                 "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
                 DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize);
          CUDA_ERR_STRING(Err);
          return nullptr;
        }
      } else {
        DP("Loading global exec_mode '%s' - symbol missing, using default "
           "value GENERIC (1)\n",
           ExecModeName);
      }

      KernelsList.emplace_back(Func, ExecModeVal);

      __tgt_offload_entry Entry = *E;
      Entry.addr = &KernelsList.back();
      addOffloadEntry(DeviceId, Entry);
    }

    // send device environment data to the device
    {
      // TODO: The device ID used here is not the real device ID used by OpenMP.
      DeviceEnvironmentTy DeviceEnv{0, static_cast<uint32_t>(NumberOfDevices),
                                    static_cast<uint32_t>(DeviceId),
                                    static_cast<uint32_t>(DynamicMemorySize)};

      if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG"))
        DeviceEnv.DebugKind = std::stoi(EnvStr);

      const char *DeviceEnvName = "omptarget_device_environment";
      CUdeviceptr DeviceEnvPtr;
      size_t CUSize;

      Err = cuModuleGetGlobal(&DeviceEnvPtr, &CUSize, Module, DeviceEnvName);
      if (Err == CUDA_SUCCESS) {
        if (CUSize != sizeof(DeviceEnv)) {
          REPORT(
              "Global device_environment '%s' - size mismatch (%zu != %zu)\n",
              DeviceEnvName, CUSize, sizeof(DeviceEnv));
          CUDA_ERR_STRING(Err);
          return nullptr;
        }

        Err = cuMemcpyHtoD(DeviceEnvPtr, &DeviceEnv, CUSize);
        if (Err != CUDA_SUCCESS) {
          REPORT("Error when copying data from host to device. Pointers: "
                 "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
                 DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize);
          CUDA_ERR_STRING(Err);
          return nullptr;
        }

        DP("Sending global device environment data %zu bytes\n", CUSize);
      } else {
        DP("Finding global device environment '%s' - symbol missing.\n",
           DeviceEnvName);
        DP("Continuing, assuming this is a device RTL which does not accept "
           "an environment setting.\n");
      }
    }

    return getOffloadEntriesTable(DeviceId);
  }

  void *dataAlloc(const int DeviceId, const int64_t Size,
                  const TargetAllocTy Kind) {
    switch (Kind) {
    case TARGET_ALLOC_DEFAULT:
    case TARGET_ALLOC_DEVICE:
      if (UseMemoryManager)
        return MemoryManagers[DeviceId]->allocate(Size, nullptr);
      else
        return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
    case TARGET_ALLOC_HOST:
    case TARGET_ALLOC_SHARED:
      return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
    }

    REPORT("Invalid target data allocation kind or requested allocator not "
           "implemented yet\n");

    return nullptr;
  }

  int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr,
                 const int64_t Size, __tgt_async_info *AsyncInfo) const {
    assert(AsyncInfo && "AsyncInfo is nullptr");

    CUstream Stream = getStream(DeviceId, AsyncInfo);
    CUresult Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream);
    if (Err != CUDA_SUCCESS) {
      DP("Error when copying data from host to device. Pointers: host "
         "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
         DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
      CUDA_ERR_STRING(Err);
      return OFFLOAD_FAIL;
    }

    return OFFLOAD_SUCCESS;
  }

  int dataRetrieve(const int DeviceId, void *HstPtr, const void *TgtPtr,
                   const int64_t Size, __tgt_async_info *AsyncInfo) const {
    assert(AsyncInfo && "AsyncInfo is nullptr");

    CUstream Stream = getStream(DeviceId, AsyncInfo);
    CUresult Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
    if (Err != CUDA_SUCCESS) {
      DP("Error when copying data from device to host. Pointers: host "
         "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
         DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
      CUDA_ERR_STRING(Err);
      return OFFLOAD_FAIL;
    }

    return OFFLOAD_SUCCESS;
  }

  int dataExchange(int SrcDevId, const void *SrcPtr, int DstDevId, void *DstPtr,
                   int64_t Size, __tgt_async_info *AsyncInfo) {
    assert(AsyncInfo && "AsyncInfo is nullptr");

    CUresult Err;
    CUstream Stream = getStream(SrcDevId, AsyncInfo);

    // If these are two different devices, we try a peer-to-peer copy first
    if (SrcDevId != DstDevId) {
      std::lock_guard<std::mutex> LG(PeerAccessMatrixLock);

      switch (PeerAccessMatrix[SrcDevId][DstDevId]) {
      case PeerAccessState::No: {
        REPORT("Peer access from %" PRId32 " to %" PRId32
               " is not supported. Fall back to D2D memcpy.\n",
               SrcDevId, DstDevId);
        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
      }
      case PeerAccessState::Unknown: {
        int CanAccessPeer = 0;
        Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId);
        if (Err != CUDA_SUCCESS) {
          REPORT("Error returned from cuDeviceCanAccessPeer. src = %" PRId32
                 ", dst = %" PRId32 ". Fall back to D2D memcpy.\n",
                 SrcDevId, DstDevId);
          CUDA_ERR_STRING(Err);
          PeerAccessMatrix[SrcDevId][DstDevId] = PeerAccessState::No;
          return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
        }

        if (!CanAccessPeer) {
          REPORT("P2P access from %d to %d is not supported. Fall back to D2D "
                 "memcpy.\n",
                 SrcDevId, DstDevId);
          PeerAccessMatrix[SrcDevId][DstDevId] = PeerAccessState::No;
          return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
        }

        Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0);
        if (Err != CUDA_SUCCESS) {
          REPORT("Error returned from cuCtxEnablePeerAccess. src = %" PRId32
                 ", dst = %" PRId32 ". Fall back to D2D memcpy.\n",
                 SrcDevId, DstDevId);
          CUDA_ERR_STRING(Err);
          PeerAccessMatrix[SrcDevId][DstDevId] = PeerAccessState::No;
          return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
        }

        PeerAccessMatrix[SrcDevId][DstDevId] = PeerAccessState::Yes;

        LLVM_FALLTHROUGH;
      }
      case PeerAccessState::Yes: {
        Err = cuMemcpyPeerAsync(
            (CUdeviceptr)DstPtr, DeviceData[DstDevId].Context,
            (CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context, Size, Stream);
        if (Err == CUDA_SUCCESS)
          return OFFLOAD_SUCCESS;

        DP("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD
           ", src_id =%" PRId32 ", dst_ptr = " DPxMOD ", dst_id =%" PRId32
           ". Fall back to D2D memcpy.\n",
           DPxPTR(SrcPtr), SrcDevId, DPxPTR(DstPtr), DstDevId);
        CUDA_ERR_STRING(Err);

        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
      }
      default:
        REPORT("Unknown PeerAccessState %d.\n",
               int(PeerAccessMatrix[SrcDevId][DstDevId]));
        return OFFLOAD_FAIL;
      }
    }

    return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
  }

  int dataDelete(const int DeviceId, void *TgtPtr) {
    if (UseMemoryManager)
      return MemoryManagers[DeviceId]->free(TgtPtr);

    return DeviceAllocators[DeviceId].free(TgtPtr);
  }

  int runTargetTeamRegion(const int DeviceId, void *TgtEntryPtr, void **TgtArgs,
                          ptrdiff_t *TgtOffsets, const int ArgNum,
                          const int TeamNum, const int ThreadLimit,
                          const unsigned int LoopTripCount,
                          __tgt_async_info *AsyncInfo) const {
    // All args are references.
    std::vector<void *> Args(ArgNum);
    std::vector<void *> Ptrs(ArgNum);

    for (int I = 0; I < ArgNum; ++I) {
      Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
      Args[I] = &Ptrs[I];
    }

    KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr);

    const bool IsSPMDGenericMode =
        KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD;
    const bool IsSPMDMode =
        KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_SPMD;
    const bool IsGenericMode =
        KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC;

    int CudaThreadsPerBlock;
    if (ThreadLimit > 0) {
      DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
      CudaThreadsPerBlock = ThreadLimit;
      // Add master warp if necessary
      if (IsGenericMode) {
        DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize);
        CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
      }
    } else {
      DP("Setting CUDA threads per block to default %d\n",
         DeviceData[DeviceId].NumThreads);
      CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads;
    }

    if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) {
      DP("Threads per block capped at device limit %d\n",
         DeviceData[DeviceId].ThreadsPerBlock);
      CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock;
    }

    CUresult Err;
    if (!KernelInfo->MaxThreadsPerBlock) {
      Err = cuFuncGetAttribute(&KernelInfo->MaxThreadsPerBlock,
                               CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                               KernelInfo->Func);
      if (!checkResult(Err, "Error returned from cuFuncGetAttribute\n"))
        return OFFLOAD_FAIL;
    }

    if (KernelInfo->MaxThreadsPerBlock < CudaThreadsPerBlock) {
      DP("Threads per block capped at kernel limit %d\n",
         KernelInfo->MaxThreadsPerBlock);
      CudaThreadsPerBlock = KernelInfo->MaxThreadsPerBlock;
    }

    unsigned int CudaBlocksPerGrid;
    if (TeamNum <= 0) {
      if (LoopTripCount > 0 && EnvNumTeams < 0) {
        if (IsSPMDGenericMode) {
          // If we reach this point, then we are executing a kernel that was
          // transformed from Generic-mode to SPMD-mode. This kernel has
          // SPMD-mode execution, but needs its blocks to be scheduled
          // differently because the current loop trip count only applies to the
          // `teams distribute` region and will create far too few blocks using
          // the regular SPMD-mode method.
          CudaBlocksPerGrid = LoopTripCount;
        } else if (IsSPMDMode) {
          // We have a combined construct, i.e. `target teams distribute
          // parallel for [simd]`. We launch enough teams so that each thread
          // will execute one iteration of the loop; round up to the nearest
          // integer.
          CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1;
        } else if (IsGenericMode) {
          // If we reach this point, then we have a non-combined construct, i.e.
          // `teams distribute` with a nested `parallel for` and each team is
          // assigned one iteration of the `distribute` loop. E.g.:
          //
          // #pragma omp target teams distribute
          // for(...loop_tripcount...) {
          //   #pragma omp parallel for
          //   for(...) {}
          // }
          //
          // Threads within a team will execute the iterations of the `parallel`
          // loop.
          CudaBlocksPerGrid = LoopTripCount;
        } else {
          REPORT("Unknown execution mode: %d\n",
                 static_cast<int8_t>(KernelInfo->ExecutionMode));
          return OFFLOAD_FAIL;
        }
        DP("Using %d teams due to loop trip count %" PRIu32
           " and number of threads per block %d\n",
           CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock);
      } else {
        DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams);
        CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams;
      }
    } else {
      DP("Using requested number of teams %d\n", TeamNum);
      CudaBlocksPerGrid = TeamNum;
    }

    if (CudaBlocksPerGrid > DeviceData[DeviceId].BlocksPerGrid) {
      DP("Capping number of teams to team limit %d\n",
         DeviceData[DeviceId].BlocksPerGrid);
      CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid;
    }

    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
         "Launching kernel %s with %d blocks and %d threads in %s mode\n",
         (getOffloadEntry(DeviceId, TgtEntryPtr))
             ? getOffloadEntry(DeviceId, TgtEntryPtr)->name
             : "(null)",
         CudaBlocksPerGrid, CudaThreadsPerBlock,
         (!IsSPMDMode ? (IsGenericMode ? "Generic" : "SPMD-Generic") : "SPMD"));

    CUstream Stream = getStream(DeviceId, AsyncInfo);
    Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1,
                         /* gridDimZ */ 1, CudaThreadsPerBlock,
                         /* blockDimY */ 1, /* blockDimZ */ 1,
                         DynamicMemorySize, Stream, &Args[0], nullptr);
    if (!checkResult(Err, "Error returned from cuLaunchKernel\n"))
      return OFFLOAD_FAIL;

    DP("Launch of entry point at " DPxMOD " successful!\n",
       DPxPTR(TgtEntryPtr));

    return OFFLOAD_SUCCESS;
  }

  int synchronize(const int DeviceId, __tgt_async_info *AsyncInfo) const {
    CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue);
    CUresult Err = cuStreamSynchronize(Stream);

    // Once the stream is synchronized, return it to stream pool and reset
    // AsyncInfo. This is to make sure the synchronization only works for its
    // own tasks.
    StreamPool[DeviceId]->release(reinterpret_cast<CUstream>(AsyncInfo->Queue));
    AsyncInfo->Queue = nullptr;

    if (Err != CUDA_SUCCESS) {
      DP("Error when synchronizing stream. stream = " DPxMOD
         ", async info ptr = " DPxMOD "\n",
         DPxPTR(Stream), DPxPTR(AsyncInfo));
      CUDA_ERR_STRING(Err);
    }
    return (Err == CUDA_SUCCESS) ? OFFLOAD_SUCCESS : OFFLOAD_FAIL;
  }

  void printDeviceInfo(int32_t device_id) {
    char TmpChar[1000];
    std::string TmpStr;
    size_t TmpSt;
    int TmpInt, TmpInt2, TmpInt3;

    CUdevice Device;
    checkResult(cuDeviceGet(&Device, device_id),
                "Error returned from cuDeviceGet\n");

    cuDriverGetVersion(&TmpInt);
    printf("    CUDA Driver Version: \t\t%d \n", TmpInt);
    printf("    CUDA Device Number: \t\t%d \n", device_id);
    checkResult(cuDeviceGetName(TmpChar, 1000, Device),
                "Error returned from cuDeviceGetName\n");
    printf("    Device Name: \t\t\t%s \n", TmpChar);
    checkResult(cuDeviceTotalMem(&TmpSt, Device),
                "Error returned from cuDeviceTotalMem\n");
    printf("    Global Memory Size: \t\t%zu bytes \n", TmpSt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Number of Multiprocessors: \t\t%d \n", TmpInt);
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Concurrent Copy and Execution: \t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Total Constant Memory: \t\t%d bytes\n", TmpInt);
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Max Shared Memory per Block: \t%d bytes \n", TmpInt);
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Registers per Block: \t\t%d \n", TmpInt);
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Warp Size: \t\t\t\t%d Threads \n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Maximum Threads per Block: \t\t%d \n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device),
                "Error returned from cuDeviceGetAttribute\n");
    checkResult(cuDeviceGetAttribute(
                    &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, Device),
                "Error returned from cuDeviceGetAttribute\n");
    checkResult(cuDeviceGetAttribute(
                    &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Maximum Block Dimensions: \t\t%d, %d, %d \n", TmpInt, TmpInt2,
           TmpInt3);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, Device),
                "Error returned from cuDeviceGetAttribute\n");
    checkResult(cuDeviceGetAttribute(
                    &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, Device),
                "Error returned from cuDeviceGetAttribute\n");
    checkResult(cuDeviceGetAttribute(
                    &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Maximum Grid Dimensions: \t\t%d x %d x %d \n", TmpInt, TmpInt2,
           TmpInt3);
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_PITCH, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Maximum Memory Pitch: \t\t%d bytes \n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Texture Alignment: \t\t\t%d bytes \n", TmpInt);
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Clock Rate: \t\t\t%d kHz\n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Execution Timeout: \t\t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_INTEGRATED, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Integrated Device: \t\t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Can Map Host Memory: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, Device),
        "Error returned from cuDeviceGetAttribute\n");
    if (TmpInt == CU_COMPUTEMODE_DEFAULT)
      TmpStr = "DEFAULT";
    else if (TmpInt == CU_COMPUTEMODE_PROHIBITED)
      TmpStr = "PROHIBITED";
    else if (TmpInt == CU_COMPUTEMODE_EXCLUSIVE_PROCESS)
      TmpStr = "EXCLUSIVE PROCESS";
    else
      TmpStr = "unknown";
    printf("    Compute Mode: \t\t\t%s \n", TmpStr.c_str());
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Concurrent Kernels: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    ECC Enabled: \t\t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Memory Clock Rate: \t\t\t%d kHz\n", TmpInt);
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Memory Bus Width: \t\t\t%d bits\n", TmpInt);
    checkResult(cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE,
                                     Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    L2 Cache Size: \t\t\t%d bytes \n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
                    Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Max Threads Per SMP: \t\t%d \n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Async Engines: \t\t\t%s (%d) \n", BOOL2TEXT(TmpInt), TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Unified Addressing: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Managed Memory: \t\t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Concurrent Managed Memory: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, Device),
1415         "Error returned from cuDeviceGetAttribute\n");
1416     printf("    Preemption Supported: \t\t%s \n", BOOL2TEXT(TmpInt));
1417     checkResult(cuDeviceGetAttribute(
1418                     &TmpInt, CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, Device),
1419                 "Error returned from cuDeviceGetAttribute\n");
1420     printf("    Cooperative Launch: \t\t%s \n", BOOL2TEXT(TmpInt));
1421     checkResult(cuDeviceGetAttribute(
1422                     &TmpInt, CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD, Device),
1423                 "Error returned from cuDeviceGetAttribute\n");
1424     printf("    Multi-Device Boars: \t\t%s \n", BOOL2TEXT(TmpInt));
1425     checkResult(
1426         cuDeviceGetAttribute(
1427             &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, Device),
1428         "Error returned from cuDeviceGetAttribute\n");
1429     checkResult(
1430         cuDeviceGetAttribute(
1431             &TmpInt2, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, Device),
1432         "Error returned from cuDeviceGetAttribute\n");
    printf("    Compute Capabilities: \t\t%d.%d \n", TmpInt, TmpInt2);
  }

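  // Acquire an event from the per-device event pool and return it via P.
  // Events are recycled through the pool rather than created and destroyed on
  // every use.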
  int createEvent(int DeviceId, void **P) {
    CUevent Event = nullptr;
    if (EventPool[DeviceId]->acquire(Event) != OFFLOAD_SUCCESS)
      return OFFLOAD_FAIL;
    *P = Event;
    return OFFLOAD_SUCCESS;
  }

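  // Return the event to the per-device pool so that it can be reused by a
  // later createEvent call.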
  int destroyEvent(int DeviceId, void *EventPtr) {
    EventPool[DeviceId]->release(reinterpret_cast<CUevent>(EventPtr));
    return OFFLOAD_SUCCESS;
  }

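  // Make future work submitted to the stream in AsyncInfo wait for EventPtr.
  // The dependency is enqueued on the stream; the host thread is not blocked.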
  int waitEvent(const int DeviceId, __tgt_async_info *AsyncInfo,
                void *EventPtr) const {
    CUstream Stream = getStream(DeviceId, AsyncInfo);
    CUevent Event = reinterpret_cast<CUevent>(EventPtr);

    // We don't use CU_EVENT_WAIT_DEFAULT here as it is only available from a
    // specific CUDA version onwards, and it is defined as 0x0. In previous
    // versions, per the CUDA API documentation, that argument has to be 0x0.
    CUresult Err = cuStreamWaitEvent(Stream, Event, 0);
    if (Err != CUDA_SUCCESS) {
      DP("Error when waiting event. stream = " DPxMOD ", event = " DPxMOD "\n",
         DPxPTR(Stream), DPxPTR(Event));
      CUDA_ERR_STRING(Err);
      return OFFLOAD_FAIL;
    }

    return OFFLOAD_SUCCESS;
  }

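  // Give the stream held by AsyncInfo (if any) back to the per-device stream
  // pool and clear the queue so the handle can be reused.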
  int releaseAsyncInfo(int DeviceId, __tgt_async_info *AsyncInfo) const {
    if (AsyncInfo->Queue) {
      StreamPool[DeviceId]->release(
          reinterpret_cast<CUstream>(AsyncInfo->Queue));
      AsyncInfo->Queue = nullptr;
    }

    return OFFLOAD_SUCCESS;
  }

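  // Allocate a fresh async info handle and eagerly associate a stream with it.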
  int initAsyncInfo(int DeviceId, __tgt_async_info **AsyncInfo) const {
    *AsyncInfo = new __tgt_async_info;
    getStream(DeviceId, *AsyncInfo);
    return OFFLOAD_SUCCESS;
  }

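  // Fill the caller-provided device info with this device's CUDA context and
  // device handle, leaving any fields the caller has already set untouched.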
  int initDeviceInfo(int DeviceId, __tgt_device_info *DeviceInfo,
                     const char **ErrStr) const {
    assert(DeviceInfo && "DeviceInfo is nullptr");

    if (!DeviceInfo->Context)
      DeviceInfo->Context = DeviceData[DeviceId].Context;
    if (!DeviceInfo->Device) {
      CUdevice Dev;
      CUresult Err = cuDeviceGet(&Dev, DeviceId);
      if (Err == CUDA_SUCCESS) {
        DeviceInfo->Device = reinterpret_cast<void *>(Dev);
      } else {
        cuGetErrorString(Err, ErrStr);
        return OFFLOAD_FAIL;
      }
    }
    return OFFLOAD_SUCCESS;
  }

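  // Make the CUDA context of DeviceId current on the calling thread. Most of
  // the exposed entry points below call this before touching the driver API.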
  int setContext(int DeviceId) {
    assert(InitializedFlags[DeviceId] && "Device is not initialized");

    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "error returned from cuCtxSetCurrent"))
      return OFFLOAD_FAIL;

    return OFFLOAD_SUCCESS;
  }
};

DeviceRTLTy DeviceRTL;
} // namespace

// Exposed library API functions
#ifdef __cplusplus
extern "C" {
#endif

int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
  return elf_check_machine(image, /* EM_CUDA */ 190);
}

int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); }

int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
  DP("Init requires flags to %" PRId64 "\n", RequiresFlags);
  DeviceRTL.setRequiresFlag(RequiresFlags);
  return RequiresFlags;
}

int32_t __tgt_rtl_is_data_exchangable(int32_t src_dev_id, int32_t dst_dev_id) {
  if (DeviceRTL.isValidDeviceId(src_dev_id) &&
      DeviceRTL.isValidDeviceId(dst_dev_id))
    return 1;

  return 0;
}

int32_t __tgt_rtl_init_device(int32_t device_id) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  // The context is set when the device is initialized.

  return DeviceRTL.initDevice(device_id);
}

int32_t __tgt_rtl_deinit_device(int32_t device_id) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  // The context is set when the device is deinitialized.

  return DeviceRTL.deinitDevice(device_id);
}

__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
                                          __tgt_device_image *image) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  if (DeviceRTL.setContext(device_id) != OFFLOAD_SUCCESS)
    return nullptr;

  return DeviceRTL.loadBinary(device_id, image);
}

void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *,
                           int32_t kind) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  if (DeviceRTL.setContext(device_id) != OFFLOAD_SUCCESS)
    return nullptr;

  return DeviceRTL.dataAlloc(device_id, size, (TargetAllocTy)kind);
}

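// The synchronous data-transfer and kernel-launch entry points below are thin
// wrappers: each creates a local __tgt_async_info, forwards the work to its
// *_async counterpart, and then synchronizes on the resulting stream before
// returning.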
int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
                              int64_t size) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  // Context is set in __tgt_rtl_data_submit_async.

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_data_submit_async(device_id, tgt_ptr, hst_ptr,
                                                 size, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &AsyncInfo);
}

int32_t __tgt_rtl_data_submit_async(int32_t device_id, void *tgt_ptr,
                                    void *hst_ptr, int64_t size,
                                    __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");

  if (DeviceRTL.setContext(device_id) != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return DeviceRTL.dataSubmit(device_id, tgt_ptr, hst_ptr, size,
                              async_info_ptr);
}

int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
                                int64_t size) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  // Context is set in __tgt_rtl_data_retrieve_async.

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_data_retrieve_async(device_id, hst_ptr, tgt_ptr,
                                                   size, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &AsyncInfo);
}

int32_t __tgt_rtl_data_retrieve_async(int32_t device_id, void *hst_ptr,
                                      void *tgt_ptr, int64_t size,
                                      __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");

  if (DeviceRTL.setContext(device_id) != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return DeviceRTL.dataRetrieve(device_id, hst_ptr, tgt_ptr, size,
                                async_info_ptr);
}

int32_t __tgt_rtl_data_exchange_async(int32_t src_dev_id, void *src_ptr,
                                      int32_t dst_dev_id, void *dst_ptr,
                                      int64_t size,
                                      __tgt_async_info *AsyncInfo) {
  assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
  assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");
  assert(AsyncInfo && "AsyncInfo is nullptr");

  if (DeviceRTL.setContext(src_dev_id) != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return DeviceRTL.dataExchange(src_dev_id, src_ptr, dst_dev_id, dst_ptr, size,
                                AsyncInfo);
}

int32_t __tgt_rtl_data_exchange(int32_t src_dev_id, void *src_ptr,
                                int32_t dst_dev_id, void *dst_ptr,
                                int64_t size) {
  assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
  assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");
  // Context is set in __tgt_rtl_data_exchange_async.

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_data_exchange_async(
      src_dev_id, src_ptr, dst_dev_id, dst_ptr, size, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(src_dev_id, &AsyncInfo);
}

int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  if (DeviceRTL.setContext(device_id) != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return DeviceRTL.dataDelete(device_id, tgt_ptr);
}

int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
                                         void **tgt_args,
                                         ptrdiff_t *tgt_offsets,
                                         int32_t arg_num, int32_t team_num,
                                         int32_t thread_limit,
                                         uint64_t loop_tripcount) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  // Context is set in __tgt_rtl_run_target_team_region_async.

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_run_target_team_region_async(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
      thread_limit, loop_tripcount, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &AsyncInfo);
}

int32_t __tgt_rtl_run_target_team_region_async(
    int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
    ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
    int32_t thread_limit, uint64_t loop_tripcount,
    __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  if (DeviceRTL.setContext(device_id) != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return DeviceRTL.runTargetTeamRegion(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
      thread_limit, loop_tripcount, async_info_ptr);
}

int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
                                    void **tgt_args, ptrdiff_t *tgt_offsets,
                                    int32_t arg_num) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  // Context is set in __tgt_rtl_run_target_region_async.

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_run_target_region_async(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &AsyncInfo);
}

int32_t __tgt_rtl_run_target_region_async(int32_t device_id,
                                          void *tgt_entry_ptr, void **tgt_args,
                                          ptrdiff_t *tgt_offsets,
                                          int32_t arg_num,
                                          __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  // Context is set in __tgt_rtl_run_target_team_region_async.
  return __tgt_rtl_run_target_team_region_async(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num,
      /* team_num */ 1, /* thread_limit */ 1, /* loop_tripcount */ 0,
      async_info_ptr);
}

int32_t __tgt_rtl_synchronize(int32_t device_id,
                              __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");
  assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr");
  // NOTE: We don't need to set context for stream sync.
  return DeviceRTL.synchronize(device_id, async_info_ptr);
}

void __tgt_rtl_set_info_flag(uint32_t NewInfoLevel) {
  std::atomic<uint32_t> &InfoLevel = getInfoLevelInternal();
  InfoLevel.store(NewInfoLevel);
}

void __tgt_rtl_print_device_info(int32_t device_id) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  // NOTE: We don't need to set context for print device info.
  DeviceRTL.printDeviceInfo(device_id);
}

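// Event entry points. A typical host-side sequence (a sketch, not a prescribed
// order) looks like:
//
//   void *Event;
//   __tgt_rtl_create_event(DeviceId, &Event);
//   __tgt_rtl_record_event(DeviceId, Event, ProducerAsyncInfo);
//   __tgt_rtl_wait_event(DeviceId, Event, ConsumerAsyncInfo);
//   __tgt_rtl_destroy_event(DeviceId, Event);
//
// ProducerAsyncInfo and ConsumerAsyncInfo stand in for two queues owned by the
// caller; they are illustrative names, not symbols defined in this file.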
int32_t __tgt_rtl_create_event(int32_t device_id, void **event) {
  assert(event && "event is nullptr");

  if (DeviceRTL.setContext(device_id) != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return DeviceRTL.createEvent(device_id, event);
}

int32_t __tgt_rtl_record_event(int32_t device_id, void *event_ptr,
                               __tgt_async_info *async_info_ptr) {
  assert(async_info_ptr && "async_info_ptr is nullptr");
  assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr");
  assert(event_ptr && "event_ptr is nullptr");
  // NOTE: We might not need to set context for event record.
  return recordEvent(event_ptr, async_info_ptr);
}

int32_t __tgt_rtl_wait_event(int32_t device_id, void *event_ptr,
                             __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");
  assert(event_ptr && "event is nullptr");
  // If we don't have a queue we need to set the context.
  if (!async_info_ptr->Queue &&
      DeviceRTL.setContext(device_id) != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;
  return DeviceRTL.waitEvent(device_id, async_info_ptr, event_ptr);
}

int32_t __tgt_rtl_sync_event(int32_t device_id, void *event_ptr) {
  assert(event_ptr && "event is nullptr");
  // NOTE: We might not need to set context for event sync.
  return syncEvent(event_ptr);
}

int32_t __tgt_rtl_destroy_event(int32_t device_id, void *event_ptr) {
  assert(event_ptr && "event is nullptr");

  if (DeviceRTL.setContext(device_id) != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return DeviceRTL.destroyEvent(device_id, event_ptr);
}

int32_t __tgt_rtl_release_async_info(int32_t device_id,
                                     __tgt_async_info *async_info) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info && "async_info is nullptr");

  if (DeviceRTL.setContext(device_id) != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return DeviceRTL.releaseAsyncInfo(device_id, async_info);
}

int32_t __tgt_rtl_init_async_info(int32_t device_id,
                                  __tgt_async_info **async_info) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info && "async_info is nullptr");

  if (DeviceRTL.setContext(device_id) != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return DeviceRTL.initAsyncInfo(device_id, async_info);
}

int32_t __tgt_rtl_init_device_info(int32_t device_id,
                                   __tgt_device_info *device_info_ptr,
                                   const char **err_str) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(device_info_ptr && "device_info_ptr is nullptr");

  if (DeviceRTL.setContext(device_id) != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return DeviceRTL.initDeviceInfo(device_id, device_info_ptr, err_str);
}

#ifdef __cplusplus
}
#endif