1 //===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // RTL for CUDA machine
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include <algorithm>
14 #include <cassert>
15 #include <cstddef>
16 #include <cuda.h>
17 #include <list>
18 #include <memory>
19 #include <mutex>
20 #include <string>
21 #include <unordered_map>
22 #include <vector>
23 
24 #include "Debug.h"
25 #include "DeviceEnvironment.h"
26 #include "omptarget.h"
27 #include "omptargetplugin.h"
28 
29 #define TARGET_NAME CUDA
30 #define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"
31 
32 #include "MemoryManager.h"
33 
34 #include "llvm/Frontend/OpenMP/OMPConstants.h"
35 
36 // Utility for retrieving and printing CUDA error string.
37 #ifdef OMPTARGET_DEBUG
38 #define CUDA_ERR_STRING(err)                                                   \
39   do {                                                                         \
40     if (getDebugLevel() > 0) {                                                 \
41       const char *errStr = nullptr;                                            \
42       CUresult errStr_status = cuGetErrorString(err, &errStr);                 \
43       if (errStr_status == CUDA_ERROR_INVALID_VALUE)                           \
44         REPORT("Unrecognized CUDA error code: %d\n", err);                     \
45       else if (errStr_status == CUDA_SUCCESS)                                  \
46         REPORT("CUDA error is: %s\n", errStr);                                 \
47       else {                                                                   \
48         REPORT("Unresolved CUDA error code: %d\n", err);                       \
49         REPORT("Unsuccessful cuGetErrorString return status: %d\n",            \
50                errStr_status);                                                 \
51       }                                                                        \
52     } else {                                                                   \
53       const char *errStr = nullptr;                                            \
54       CUresult errStr_status = cuGetErrorString(err, &errStr);                 \
55       if (errStr_status == CUDA_SUCCESS)                                       \
56         REPORT("%s \n", errStr);                                               \
57     }                                                                          \
58   } while (false)
59 #else // OMPTARGET_DEBUG
60 #define CUDA_ERR_STRING(err)                                                   \
61   do {                                                                         \
62     const char *errStr = nullptr;                                              \
63     CUresult errStr_status = cuGetErrorString(err, &errStr);                   \
64     if (errStr_status == CUDA_SUCCESS)                                         \
65       REPORT("%s \n", errStr);                                                 \
66   } while (false)
67 #endif // OMPTARGET_DEBUG
68 
69 #define BOOL2TEXT(b) ((b) ? "Yes" : "No")
70 
71 #include "elf_common.h"
72 
73 /// Keep entries table per device.
74 struct FuncOrGblEntryTy {
75   __tgt_target_table Table;
76   std::vector<__tgt_offload_entry> Entries;
77 };
78 
79 /// Use a single entity to encode a kernel and a set of flags.
80 struct KernelTy {
81   CUfunction Func;
82 
83   // execution mode of kernel
84   llvm::omp::OMPTgtExecModeFlags ExecutionMode;
85 
86   /// Maximal number of threads per block for this kernel.
87   int MaxThreadsPerBlock = 0;
88 
89   KernelTy(CUfunction Func, llvm::omp::OMPTgtExecModeFlags ExecutionMode)
90       : Func(Func), ExecutionMode(ExecutionMode) {}
91 };
92 
93 namespace {
94 bool checkResult(CUresult Err, const char *ErrMsg) {
95   if (Err == CUDA_SUCCESS)
96     return true;
97 
98   REPORT("%s", ErrMsg);
99   CUDA_ERR_STRING(Err);
100   return false;
101 }
102 
103 int memcpyDtoD(const void *SrcPtr, void *DstPtr, int64_t Size,
104                CUstream Stream) {
105   CUresult Err =
106       cuMemcpyDtoDAsync((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, Size, Stream);
107 
108   if (Err != CUDA_SUCCESS) {
109     DP("Error when copying data from device to device. Pointers: src "
110        "= " DPxMOD ", dst = " DPxMOD ", size = %" PRId64 "\n",
111        DPxPTR(SrcPtr), DPxPTR(DstPtr), Size);
112     CUDA_ERR_STRING(Err);
113     return OFFLOAD_FAIL;
114   }
115 
116   return OFFLOAD_SUCCESS;
117 }
118 
119 int recordEvent(void *EventPtr, __tgt_async_info *AsyncInfo) {
120   CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue);
121   CUevent Event = reinterpret_cast<CUevent>(EventPtr);
122 
123   CUresult Err = cuEventRecord(Event, Stream);
124   if (Err != CUDA_SUCCESS) {
125     DP("Error when recording event. stream = " DPxMOD ", event = " DPxMOD "\n",
126        DPxPTR(Stream), DPxPTR(Event));
127     CUDA_ERR_STRING(Err);
128     return OFFLOAD_FAIL;
129   }
130 
131   return OFFLOAD_SUCCESS;
132 }
133 
134 int syncEvent(void *EventPtr) {
135   CUevent Event = reinterpret_cast<CUevent>(EventPtr);
136 
137   CUresult Err = cuEventSynchronize(Event);
138   if (Err != CUDA_SUCCESS) {
139     DP("Error when syncing event = " DPxMOD "\n", DPxPTR(Event));
140     CUDA_ERR_STRING(Err);
141     return OFFLOAD_FAIL;
142   }
143 
144   return OFFLOAD_SUCCESS;
145 }
146 
147 namespace {
148 
149 // Structure contains per-device data
150 struct DeviceDataTy {
151   /// List that contains all the kernels.
152   std::list<KernelTy> KernelsList;
153 
154   std::list<FuncOrGblEntryTy> FuncGblEntries;
155 
156   CUcontext Context = nullptr;
157   // Device properties
158   unsigned int ThreadsPerBlock = 0;
159   unsigned int BlocksPerGrid = 0;
160   unsigned int WarpSize = 0;
161   // OpenMP properties
162   int NumTeams = 0;
163   int NumThreads = 0;
164 };
165 
166 /// Resource allocator where \p T is the resource type.
167 /// Functions \p create and \p destroy return OFFLOAD_SUCCESS and OFFLOAD_FAIL
168 /// accordingly. The implementation should not raise any exception.
169 template <typename T> struct AllocatorTy {
170   using ElementTy = T;
171   virtual ~AllocatorTy() {}
172 
173   /// Create a resource and assign to R.
174   virtual int create(T &R) noexcept = 0;
175   /// Destroy the resource.
176   virtual int destroy(T) noexcept = 0;
177 };
178 
179 /// Allocator for CUstream.
180 struct StreamAllocatorTy final : public AllocatorTy<CUstream> {
181   /// See AllocatorTy<T>::create.
182   int create(CUstream &Stream) noexcept override {
183     if (!checkResult(cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING),
184                      "Error returned from cuStreamCreate\n"))
185       return OFFLOAD_FAIL;
186 
187     return OFFLOAD_SUCCESS;
188   }
189 
190   /// See AllocatorTy<T>::destroy.
191   int destroy(CUstream Stream) noexcept override {
192     if (!checkResult(cuStreamDestroy(Stream),
193                      "Error returned from cuStreamDestroy\n"))
194       return OFFLOAD_FAIL;
195 
196     return OFFLOAD_SUCCESS;
197   }
198 };
199 
200 /// Allocator for CUevent.
201 struct EventAllocatorTy final : public AllocatorTy<CUevent> {
202   /// See AllocatorTy<T>::create.
203   int create(CUevent &Event) noexcept override {
204     if (!checkResult(cuEventCreate(&Event, CU_EVENT_DEFAULT),
205                      "Error returned from cuEventCreate\n"))
206       return OFFLOAD_FAIL;
207 
208     return OFFLOAD_SUCCESS;
209   }
210 
211   /// See AllocatorTy<T>::destroy.
212   int destroy(CUevent Event) noexcept override {
213     if (!checkResult(cuEventDestroy(Event),
214                      "Error returned from cuEventDestroy\n"))
215       return OFFLOAD_FAIL;
216 
217     return OFFLOAD_SUCCESS;
218   }
219 };
220 
221 /// A generic pool of resources where \p T is the resource type.
222 /// \p T should be copyable as the object is stored in \p std::vector .
223 template <typename AllocTy> class ResourcePoolTy {
224   using ElementTy = typename AllocTy::ElementTy;
225   /// Index of the next available resource.
226   size_t Next = 0;
227   /// Mutex to guard the pool.
228   std::mutex Mutex;
229   /// Pool of resources. The difference between \p Resources and \p Pool is,
230   /// when a resource is acquired and released, it is all on \p Resources. When
231   /// a batch of new resources are needed, they are both added to \p Resources
232   /// and \p Pool. The reason for this setting is, \p Resources could contain
233   /// redundant elements because resources are not released, which can cause
234   /// double free. This setting makes sure that \p Pool always has every
235   /// resource allocated from the device.
236   std::vector<ElementTy> Resources;
237   std::vector<ElementTy> Pool;
238   /// A reference to the corresponding allocator.
239   AllocTy Allocator;
240 
241   /// If `Resources` is used up, we will fill in more resources. It assumes that
242   /// the new size `Size` should be always larger than the current size.
243   bool resize(size_t Size) {
244     assert(Resources.size() == Pool.size() && "size mismatch");
245     auto CurSize = Resources.size();
246     assert(Size > CurSize && "Unexpected smaller size");
247     Pool.reserve(Size);
248     Resources.reserve(Size);
249     for (auto I = CurSize; I < Size; ++I) {
250       ElementTy NewItem;
251       int Ret = Allocator.create(NewItem);
252       if (Ret != OFFLOAD_SUCCESS)
253         return false;
254       Pool.push_back(NewItem);
255       Resources.push_back(NewItem);
256     }
257     return true;
258   }
259 
260 public:
261   ResourcePoolTy(AllocTy &&A, size_t Size = 0) noexcept
262       : Allocator(std::move(A)) {
263     if (Size)
264       (void)resize(Size);
265   }
266 
267   ~ResourcePoolTy() noexcept { clear(); }
268 
269   /// Get a resource from pool. `Next` always points to the next available
270   /// resource. That means, `[0, next-1]` have been assigned, and `[id,]` are
271   /// still available. If there is no resource left, we will ask for more. Each
272   /// time a resource is assigned, the id will increase one.
273   /// xxxxxs+++++++++
274   ///      ^
275   ///      Next
276   /// After assignment, the pool becomes the following and s is assigned.
277   /// xxxxxs+++++++++
278   ///       ^
279   ///       Next
280   int acquire(ElementTy &R) noexcept {
281     std::lock_guard<std::mutex> LG(Mutex);
282     if (Next == Resources.size()) {
283       auto NewSize = Resources.size() ? Resources.size() * 2 : 1;
284       if (!resize(NewSize))
285         return OFFLOAD_FAIL;
286     }
287 
288     assert(Next < Resources.size());
289 
290     R = Resources[Next++];
291 
292     return OFFLOAD_SUCCESS;
293   }
294 
295   /// Return the resource back to the pool. When we return a resource, we need
296   /// to first decrease `Next`, and then copy the resource back. It is worth
297   /// noting that, the order of resources return might be different from that
298   /// they're assigned, that saying, at some point, there might be two identical
299   /// resources.
300   /// xxax+a+++++
301   ///     ^
302   ///     Next
303   /// However, it doesn't matter, because they're always on the two sides of
304   /// `Next`. The left one will in the end be overwritten by another resource.
305   /// Therefore, after several execution, the order of pool might be different
306   /// from its initial state.
307   void release(ElementTy R) noexcept {
308     std::lock_guard<std::mutex> LG(Mutex);
309     Resources[--Next] = R;
310   }
311 
312   /// Released all stored resources and clear the pool.
313   /// Note: This function is not thread safe. Be sure to guard it if necessary.
314   void clear() noexcept {
315     for (auto &R : Pool)
316       (void)Allocator.destroy(R);
317     Pool.clear();
318     Resources.clear();
319   }
320 };
321 
322 } // namespace
323 
324 class DeviceRTLTy {
325   int NumberOfDevices;
326   // OpenMP environment properties
327   int EnvNumTeams;
328   int EnvTeamLimit;
329   int EnvTeamThreadLimit;
330   // OpenMP requires flags
331   int64_t RequiresFlags;
332   // Amount of dynamic shared memory to use at launch.
333   uint64_t DynamicMemorySize;
334 
335   /// Number of initial streams for each device.
336   int NumInitialStreams = 32;
337 
338   /// Number of initial events for each device.
339   int NumInitialEvents = 8;
340 
341   static constexpr const int32_t HardThreadLimit = 1024;
342   static constexpr const int32_t DefaultNumTeams = 128;
343   static constexpr const int32_t DefaultNumThreads = 128;
344 
345   using StreamPoolTy = ResourcePoolTy<StreamAllocatorTy>;
346   std::vector<std::unique_ptr<StreamPoolTy>> StreamPool;
347 
348   using EventPoolTy = ResourcePoolTy<EventAllocatorTy>;
349   std::vector<std::unique_ptr<EventPoolTy>> EventPool;
350 
351   std::vector<DeviceDataTy> DeviceData;
352   std::vector<std::vector<CUmodule>> Modules;
353 
354   /// Vector of flags indicating the initalization status of all associated
355   /// devices.
356   std::vector<bool> InitializedFlags;
357 
358   enum class PeerAccessState : uint8_t { Unkown, Yes, No };
359   std::vector<std::vector<PeerAccessState>> PeerAccessMatrix;
360   std::mutex PeerAccessMatrixLock;
361 
362   /// A class responsible for interacting with device native runtime library to
363   /// allocate and free memory.
364   class CUDADeviceAllocatorTy : public DeviceAllocatorTy {
365     std::unordered_map<void *, TargetAllocTy> HostPinnedAllocs;
366 
367   public:
368     void *allocate(size_t Size, void *, TargetAllocTy Kind) override {
369       if (Size == 0)
370         return nullptr;
371 
372       void *MemAlloc = nullptr;
373       CUresult Err;
374       switch (Kind) {
375       case TARGET_ALLOC_DEFAULT:
376       case TARGET_ALLOC_DEVICE:
377         CUdeviceptr DevicePtr;
378         Err = cuMemAlloc(&DevicePtr, Size);
379         MemAlloc = (void *)DevicePtr;
380         if (!checkResult(Err, "Error returned from cuMemAlloc\n"))
381           return nullptr;
382         break;
383       case TARGET_ALLOC_HOST:
384         void *HostPtr;
385         Err = cuMemAllocHost(&HostPtr, Size);
386         MemAlloc = HostPtr;
387         if (!checkResult(Err, "Error returned from cuMemAllocHost\n"))
388           return nullptr;
389         HostPinnedAllocs[MemAlloc] = Kind;
390         break;
391       case TARGET_ALLOC_SHARED:
392         CUdeviceptr SharedPtr;
393         Err = cuMemAllocManaged(&SharedPtr, Size, CU_MEM_ATTACH_GLOBAL);
394         MemAlloc = (void *)SharedPtr;
395         if (!checkResult(Err, "Error returned from cuMemAllocManaged\n"))
396           return nullptr;
397         break;
398       }
399 
400       return MemAlloc;
401     }
402 
403     int free(void *TgtPtr) override {
404       CUresult Err;
405       // Host pinned memory must be freed differently.
406       TargetAllocTy Kind =
407           (HostPinnedAllocs.find(TgtPtr) == HostPinnedAllocs.end())
408               ? TARGET_ALLOC_DEFAULT
409               : TARGET_ALLOC_HOST;
410       switch (Kind) {
411       case TARGET_ALLOC_DEFAULT:
412       case TARGET_ALLOC_DEVICE:
413       case TARGET_ALLOC_SHARED:
414         Err = cuMemFree((CUdeviceptr)TgtPtr);
415         if (!checkResult(Err, "Error returned from cuMemFree\n"))
416           return OFFLOAD_FAIL;
417         break;
418       case TARGET_ALLOC_HOST:
419         Err = cuMemFreeHost(TgtPtr);
420         if (!checkResult(Err, "Error returned from cuMemFreeHost\n"))
421           return OFFLOAD_FAIL;
422         break;
423       }
424 
425       return OFFLOAD_SUCCESS;
426     }
427   };
428 
429   /// A vector of device allocators
430   std::vector<CUDADeviceAllocatorTy> DeviceAllocators;
431 
432   /// A vector of memory managers. Since the memory manager is non-copyable and
433   // non-removable, we wrap them into std::unique_ptr.
434   std::vector<std::unique_ptr<MemoryManagerTy>> MemoryManagers;
435 
436   /// Whether use memory manager
437   bool UseMemoryManager = true;
438 
439   // Record entry point associated with device
440   void addOffloadEntry(const int DeviceId, const __tgt_offload_entry Entry) {
441     FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
442     E.Entries.push_back(Entry);
443   }
444 
445   // Return a pointer to the entry associated with the pointer
446   const __tgt_offload_entry *getOffloadEntry(const int DeviceId,
447                                              const void *Addr) const {
448     for (const __tgt_offload_entry &Itr :
449          DeviceData[DeviceId].FuncGblEntries.back().Entries)
450       if (Itr.addr == Addr)
451         return &Itr;
452 
453     return nullptr;
454   }
455 
456   // Return the pointer to the target entries table
457   __tgt_target_table *getOffloadEntriesTable(const int DeviceId) {
458     FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
459 
460     if (E.Entries.empty())
461       return nullptr;
462 
463     // Update table info according to the entries and return the pointer
464     E.Table.EntriesBegin = E.Entries.data();
465     E.Table.EntriesEnd = E.Entries.data() + E.Entries.size();
466 
467     return &E.Table;
468   }
469 
470   // Clear entries table for a device
471   void clearOffloadEntriesTable(const int DeviceId) {
472     DeviceData[DeviceId].FuncGblEntries.emplace_back();
473     FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
474     E.Entries.clear();
475     E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr;
476   }
477 
478 public:
479   CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfo) const {
480     assert(AsyncInfo && "AsyncInfo is nullptr");
481 
482     if (!AsyncInfo->Queue) {
483       CUstream S;
484       if (StreamPool[DeviceId]->acquire(S) != OFFLOAD_SUCCESS)
485         return nullptr;
486 
487       AsyncInfo->Queue = S;
488     }
489 
490     return reinterpret_cast<CUstream>(AsyncInfo->Queue);
491   }
492 
493   // This class should not be copied
494   DeviceRTLTy(const DeviceRTLTy &) = delete;
495   DeviceRTLTy(DeviceRTLTy &&) = delete;
496 
497   DeviceRTLTy()
498       : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1),
499         EnvTeamThreadLimit(-1), RequiresFlags(OMP_REQ_UNDEFINED),
500         DynamicMemorySize(0) {
501 
502     DP("Start initializing CUDA\n");
503 
504     CUresult Err = cuInit(0);
505     if (Err == CUDA_ERROR_INVALID_HANDLE) {
506       // Can't call cuGetErrorString if dlsym failed
507       DP("Failed to load CUDA shared library\n");
508       return;
509     }
510     if (!checkResult(Err, "Error returned from cuInit\n")) {
511       return;
512     }
513 
514     Err = cuDeviceGetCount(&NumberOfDevices);
515     if (!checkResult(Err, "Error returned from cuDeviceGetCount\n"))
516       return;
517 
518     if (NumberOfDevices == 0) {
519       DP("There are no devices supporting CUDA.\n");
520       return;
521     }
522 
523     DeviceData.resize(NumberOfDevices);
524     Modules.resize(NumberOfDevices);
525     StreamPool.resize(NumberOfDevices);
526     EventPool.resize(NumberOfDevices);
527     PeerAccessMatrix.resize(NumberOfDevices);
528     for (auto &V : PeerAccessMatrix)
529       V.resize(NumberOfDevices, PeerAccessState::Unkown);
530 
531     // Get environment variables regarding teams
532     if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) {
533       // OMP_TEAM_LIMIT has been set
534       EnvTeamLimit = std::stoi(EnvStr);
535       DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
536     }
537     if (const char *EnvStr = getenv("OMP_TEAMS_THREAD_LIMIT")) {
538       // OMP_TEAMS_THREAD_LIMIT has been set
539       EnvTeamThreadLimit = std::stoi(EnvStr);
540       DP("Parsed OMP_TEAMS_THREAD_LIMIT=%d\n", EnvTeamThreadLimit);
541     }
542     if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) {
543       // OMP_NUM_TEAMS has been set
544       EnvNumTeams = std::stoi(EnvStr);
545       DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
546     }
547     if (const char *EnvStr = getenv("LIBOMPTARGET_SHARED_MEMORY_SIZE")) {
548       // LIBOMPTARGET_SHARED_MEMORY_SIZE has been set
549       DynamicMemorySize = std::stoi(EnvStr);
550       DP("Parsed LIBOMPTARGET_SHARED_MEMORY_SIZE = %" PRIu64 "\n",
551          DynamicMemorySize);
552     }
553     if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS")) {
554       // LIBOMPTARGET_NUM_INITIAL_STREAMS has been set
555       NumInitialStreams = std::stoi(EnvStr);
556       DP("Parsed LIBOMPTARGET_NUM_INITIAL_STREAMS=%d\n", NumInitialStreams);
557     }
558 
559     for (int I = 0; I < NumberOfDevices; ++I)
560       DeviceAllocators.emplace_back();
561 
562     // Get the size threshold from environment variable
563     std::pair<size_t, bool> Res = MemoryManagerTy::getSizeThresholdFromEnv();
564     UseMemoryManager = Res.second;
565     size_t MemoryManagerThreshold = Res.first;
566 
567     if (UseMemoryManager)
568       for (int I = 0; I < NumberOfDevices; ++I)
569         MemoryManagers.emplace_back(std::make_unique<MemoryManagerTy>(
570             DeviceAllocators[I], MemoryManagerThreshold));
571 
572     // We lazily initialize all devices later.
573     InitializedFlags.assign(NumberOfDevices, false);
574   }
575 
576   ~DeviceRTLTy() {
577     for (int DeviceId = 0; DeviceId < NumberOfDevices; ++DeviceId)
578       deinitDevice(DeviceId);
579   }
580 
581   // Check whether a given DeviceId is valid
582   bool isValidDeviceId(const int DeviceId) const {
583     return DeviceId >= 0 && DeviceId < NumberOfDevices;
584   }
585 
586   int getNumOfDevices() const { return NumberOfDevices; }
587 
588   void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; }
589 
590   int initDevice(const int DeviceId) {
591     CUdevice Device;
592 
593     DP("Getting device %d\n", DeviceId);
594     CUresult Err = cuDeviceGet(&Device, DeviceId);
595     if (!checkResult(Err, "Error returned from cuDeviceGet\n"))
596       return OFFLOAD_FAIL;
597 
598     assert(InitializedFlags[DeviceId] == false && "Reinitializing device!");
599     InitializedFlags[DeviceId] = true;
600 
601     // Query the current flags of the primary context and set its flags if
602     // it is inactive
603     unsigned int FormerPrimaryCtxFlags = 0;
604     int FormerPrimaryCtxIsActive = 0;
605     Err = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags,
606                                      &FormerPrimaryCtxIsActive);
607     if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxGetState\n"))
608       return OFFLOAD_FAIL;
609 
610     if (FormerPrimaryCtxIsActive) {
611       DP("The primary context is active, no change to its flags\n");
612       if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) !=
613           CU_CTX_SCHED_BLOCKING_SYNC)
614         DP("Warning the current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n");
615     } else {
616       DP("The primary context is inactive, set its flags to "
617          "CU_CTX_SCHED_BLOCKING_SYNC\n");
618       Err = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC);
619       if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxSetFlags\n"))
620         return OFFLOAD_FAIL;
621     }
622 
623     // Retain the per device primary context and save it to use whenever this
624     // device is selected.
625     Err = cuDevicePrimaryCtxRetain(&DeviceData[DeviceId].Context, Device);
626     if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxRetain\n"))
627       return OFFLOAD_FAIL;
628 
629     Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
630     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
631       return OFFLOAD_FAIL;
632 
633     // Initialize the stream pool.
634     if (!StreamPool[DeviceId])
635       StreamPool[DeviceId] = std::make_unique<StreamPoolTy>(StreamAllocatorTy(),
636                                                             NumInitialStreams);
637 
638     // Initialize the event pool.
639     if (!EventPool[DeviceId])
640       EventPool[DeviceId] =
641           std::make_unique<EventPoolTy>(EventAllocatorTy(), NumInitialEvents);
642 
643     // Query attributes to determine number of threads/block and blocks/grid.
644     int MaxGridDimX;
645     Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
646                                Device);
647     if (Err != CUDA_SUCCESS) {
648       DP("Error getting max grid dimension, use default value %d\n",
649          DeviceRTLTy::DefaultNumTeams);
650       DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::DefaultNumTeams;
651     } else {
652       DP("Using %d CUDA blocks per grid\n", MaxGridDimX);
653       DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX;
654     }
655 
656     // We are only exploiting threads along the x axis.
657     int MaxBlockDimX;
658     Err = cuDeviceGetAttribute(&MaxBlockDimX,
659                                CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device);
660     if (Err != CUDA_SUCCESS) {
661       DP("Error getting max block dimension, use default value %d\n",
662          DeviceRTLTy::DefaultNumThreads);
663       DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads;
664     } else {
665       DP("Using %d CUDA threads per block\n", MaxBlockDimX);
666       DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX;
667 
668       if (EnvTeamThreadLimit > 0 &&
669           DeviceData[DeviceId].ThreadsPerBlock > EnvTeamThreadLimit) {
670         DP("Max CUDA threads per block %d exceeds the thread limit %d set by "
671            "OMP_TEAMS_THREAD_LIMIT, capping at the limit\n",
672            DeviceData[DeviceId].ThreadsPerBlock, EnvTeamThreadLimit);
673         DeviceData[DeviceId].ThreadsPerBlock = EnvTeamThreadLimit;
674       }
675       if (DeviceData[DeviceId].ThreadsPerBlock > DeviceRTLTy::HardThreadLimit) {
676         DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
677            "capping at the hard limit\n",
678            DeviceData[DeviceId].ThreadsPerBlock, DeviceRTLTy::HardThreadLimit);
679         DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit;
680       }
681     }
682 
683     // Get and set warp size
684     int WarpSize;
685     Err =
686         cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device);
687     if (Err != CUDA_SUCCESS) {
688       DP("Error getting warp size, assume default value 32\n");
689       DeviceData[DeviceId].WarpSize = 32;
690     } else {
691       DP("Using warp size %d\n", WarpSize);
692       DeviceData[DeviceId].WarpSize = WarpSize;
693     }
694 
695     // Adjust teams to the env variables
696     if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) {
697       DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
698          EnvTeamLimit);
699       DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit;
700     }
701 
702     size_t StackLimit;
703     size_t HeapLimit;
704     if (const char *EnvStr = getenv("LIBOMPTARGET_STACK_SIZE")) {
705       StackLimit = std::stol(EnvStr);
706       if (cuCtxSetLimit(CU_LIMIT_STACK_SIZE, StackLimit) != CUDA_SUCCESS)
707         return OFFLOAD_FAIL;
708     } else {
709       if (cuCtxGetLimit(&StackLimit, CU_LIMIT_STACK_SIZE) != CUDA_SUCCESS)
710         return OFFLOAD_FAIL;
711     }
712     if (const char *EnvStr = getenv("LIBOMPTARGET_HEAP_SIZE")) {
713       HeapLimit = std::stol(EnvStr);
714       if (cuCtxSetLimit(CU_LIMIT_MALLOC_HEAP_SIZE, HeapLimit) != CUDA_SUCCESS)
715         return OFFLOAD_FAIL;
716     } else {
717       if (cuCtxGetLimit(&HeapLimit, CU_LIMIT_MALLOC_HEAP_SIZE) != CUDA_SUCCESS)
718         return OFFLOAD_FAIL;
719     }
720 
721     INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
722          "Device supports up to %d CUDA blocks and %d threads with a "
723          "warp size of %d\n",
724          DeviceData[DeviceId].BlocksPerGrid,
725          DeviceData[DeviceId].ThreadsPerBlock, DeviceData[DeviceId].WarpSize);
726     INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
727          "Device heap size is %d Bytes, device stack size is %d Bytes per "
728          "thread\n",
729          (int)HeapLimit, (int)StackLimit);
730 
731     // Set default number of teams
732     if (EnvNumTeams > 0) {
733       DP("Default number of teams set according to environment %d\n",
734          EnvNumTeams);
735       DeviceData[DeviceId].NumTeams = EnvNumTeams;
736     } else {
737       DeviceData[DeviceId].NumTeams = DeviceRTLTy::DefaultNumTeams;
738       DP("Default number of teams set according to library's default %d\n",
739          DeviceRTLTy::DefaultNumTeams);
740     }
741 
742     if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) {
743       DP("Default number of teams exceeds device limit, capping at %d\n",
744          DeviceData[DeviceId].BlocksPerGrid);
745       DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid;
746     }
747 
748     // Set default number of threads
749     DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads;
750     DP("Default number of threads set according to library's default %d\n",
751        DeviceRTLTy::DefaultNumThreads);
752     if (DeviceData[DeviceId].NumThreads >
753         DeviceData[DeviceId].ThreadsPerBlock) {
754       DP("Default number of threads exceeds device limit, capping at %d\n",
755          DeviceData[DeviceId].ThreadsPerBlock);
756       DeviceData[DeviceId].NumThreads = DeviceData[DeviceId].ThreadsPerBlock;
757     }
758 
759     return OFFLOAD_SUCCESS;
760   }
761 
762   int deinitDevice(const int DeviceId) {
763     auto IsInitialized = InitializedFlags[DeviceId];
764     if (!IsInitialized)
765       return OFFLOAD_SUCCESS;
766     InitializedFlags[DeviceId] = false;
767 
768     if (UseMemoryManager)
769       MemoryManagers[DeviceId].release();
770 
771     StreamPool[DeviceId].reset();
772     EventPool[DeviceId].reset();
773 
774     DeviceDataTy &D = DeviceData[DeviceId];
775     if (!checkResult(cuCtxSetCurrent(D.Context),
776                      "Error returned from cuCtxSetCurrent\n"))
777       return OFFLOAD_FAIL;
778 
779     // Unload all modules.
780     for (auto &M : Modules[DeviceId])
781       if (!checkResult(cuModuleUnload(M),
782                        "Error returned from cuModuleUnload\n"))
783         return OFFLOAD_FAIL;
784 
785     // Destroy context.
786     CUdevice Device;
787     if (!checkResult(cuCtxGetDevice(&Device),
788                      "Error returned from cuCtxGetDevice\n"))
789       return OFFLOAD_FAIL;
790 
791     if (!checkResult(cuDevicePrimaryCtxRelease(Device),
792                      "Error returned from cuDevicePrimaryCtxRelease\n"))
793       return OFFLOAD_FAIL;
794 
795     return OFFLOAD_SUCCESS;
796   }
797 
798   __tgt_target_table *loadBinary(const int DeviceId,
799                                  const __tgt_device_image *Image) {
800     // Clear the offload table as we are going to create a new one.
801     clearOffloadEntriesTable(DeviceId);
802 
803     // Create the module and extract the function pointers.
804     CUmodule Module;
805     DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart));
806     CUresult Err =
807         cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr);
808     if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n"))
809       return nullptr;
810 
811     DP("CUDA module successfully loaded!\n");
812 
813     Modules[DeviceId].push_back(Module);
814 
815     // Find the symbols in the module by name.
816     const __tgt_offload_entry *HostBegin = Image->EntriesBegin;
817     const __tgt_offload_entry *HostEnd = Image->EntriesEnd;
818 
819     std::list<KernelTy> &KernelsList = DeviceData[DeviceId].KernelsList;
820     for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) {
821       if (!E->addr) {
822         // We return nullptr when something like this happens, the host should
823         // have always something in the address to uniquely identify the target
824         // region.
825         DP("Invalid binary: host entry '<null>' (size = %zd)...\n", E->size);
826         return nullptr;
827       }
828 
829       if (E->size) {
830         __tgt_offload_entry Entry = *E;
831         CUdeviceptr CUPtr;
832         size_t CUSize;
833         Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name);
834         // We keep this style here because we need the name
835         if (Err != CUDA_SUCCESS) {
836           REPORT("Loading global '%s' Failed\n", E->name);
837           CUDA_ERR_STRING(Err);
838           return nullptr;
839         }
840 
841         if (CUSize != E->size) {
842           DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name,
843              CUSize, E->size);
844           return nullptr;
845         }
846 
847         DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
848            DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr));
849 
850         Entry.addr = (void *)(CUPtr);
851 
852         // Note: In the current implementation declare target variables
853         // can either be link or to. This means that once unified
854         // memory is activated via the requires directive, the variable
855         // can be used directly from the host in both cases.
856         // TODO: when variables types other than to or link are added,
857         // the below condition should be changed to explicitly
858         // check for to and link variables types:
859         // (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags &
860         // OMP_DECLARE_TARGET_LINK || e->flags == OMP_DECLARE_TARGET_TO))
861         if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
862           // If unified memory is present any target link or to variables
863           // can access host addresses directly. There is no longer a
864           // need for device copies.
865           cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *));
866           DP("Copy linked variable host address (" DPxMOD
867              ") to device address (" DPxMOD ")\n",
868              DPxPTR(*((void **)E->addr)), DPxPTR(CUPtr));
869         }
870 
871         addOffloadEntry(DeviceId, Entry);
872 
873         continue;
874       }
875 
876       CUfunction Func;
877       Err = cuModuleGetFunction(&Func, Module, E->name);
878       // We keep this style here because we need the name
879       if (Err != CUDA_SUCCESS) {
880         REPORT("Loading '%s' Failed\n", E->name);
881         CUDA_ERR_STRING(Err);
882         return nullptr;
883       }
884 
885       DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
886          DPxPTR(E - HostBegin), E->name, DPxPTR(Func));
887 
888       // default value GENERIC (in case symbol is missing from cubin file)
889       llvm::omp::OMPTgtExecModeFlags ExecModeVal;
890       std::string ExecModeNameStr(E->name);
891       ExecModeNameStr += "_exec_mode";
892       const char *ExecModeName = ExecModeNameStr.c_str();
893 
894       CUdeviceptr ExecModePtr;
895       size_t CUSize;
896       Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName);
897       if (Err == CUDA_SUCCESS) {
898         if (CUSize != sizeof(llvm::omp::OMPTgtExecModeFlags)) {
899           DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
900              ExecModeName, CUSize, sizeof(llvm::omp::OMPTgtExecModeFlags));
901           return nullptr;
902         }
903 
904         Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize);
905         if (Err != CUDA_SUCCESS) {
906           REPORT("Error when copying data from device to host. Pointers: "
907                  "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
908                  DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize);
909           CUDA_ERR_STRING(Err);
910           return nullptr;
911         }
912       } else {
913         DP("Loading global exec_mode '%s' - symbol missing, using default "
914            "value GENERIC (1)\n",
915            ExecModeName);
916       }
917 
918       KernelsList.emplace_back(Func, ExecModeVal);
919 
920       __tgt_offload_entry Entry = *E;
921       Entry.addr = &KernelsList.back();
922       addOffloadEntry(DeviceId, Entry);
923     }
924 
925     // send device environment data to the device
926     {
927       // TODO: The device ID used here is not the real device ID used by OpenMP.
928       DeviceEnvironmentTy DeviceEnv{0, static_cast<uint32_t>(NumberOfDevices),
929                                     static_cast<uint32_t>(DeviceId),
930                                     static_cast<uint32_t>(DynamicMemorySize)};
931 
932       if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG"))
933         DeviceEnv.DebugKind = std::stoi(EnvStr);
934 
935       const char *DeviceEnvName = "omptarget_device_environment";
936       CUdeviceptr DeviceEnvPtr;
937       size_t CUSize;
938 
939       Err = cuModuleGetGlobal(&DeviceEnvPtr, &CUSize, Module, DeviceEnvName);
940       if (Err == CUDA_SUCCESS) {
941         if (CUSize != sizeof(DeviceEnv)) {
942           REPORT(
943               "Global device_environment '%s' - size mismatch (%zu != %zu)\n",
944               DeviceEnvName, CUSize, sizeof(int32_t));
945           CUDA_ERR_STRING(Err);
946           return nullptr;
947         }
948 
949         Err = cuMemcpyHtoD(DeviceEnvPtr, &DeviceEnv, CUSize);
950         if (Err != CUDA_SUCCESS) {
951           REPORT("Error when copying data from host to device. Pointers: "
952                  "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
953                  DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize);
954           CUDA_ERR_STRING(Err);
955           return nullptr;
956         }
957 
958         DP("Sending global device environment data %zu bytes\n", CUSize);
959       } else {
960         DP("Finding global device environment '%s' - symbol missing.\n",
961            DeviceEnvName);
962         DP("Continue, considering this is a device RTL which does not accept "
963            "environment setting.\n");
964       }
965     }
966 
967     return getOffloadEntriesTable(DeviceId);
968   }
969 
970   void *dataAlloc(const int DeviceId, const int64_t Size,
971                   const TargetAllocTy Kind) {
972     switch (Kind) {
973     case TARGET_ALLOC_DEFAULT:
974     case TARGET_ALLOC_DEVICE:
975       if (UseMemoryManager)
976         return MemoryManagers[DeviceId]->allocate(Size, nullptr);
977       else
978         return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
979     case TARGET_ALLOC_HOST:
980     case TARGET_ALLOC_SHARED:
981       return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
982     }
983 
984     REPORT("Invalid target data allocation kind or requested allocator not "
985            "implemented yet\n");
986 
987     return nullptr;
988   }
989 
990   int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr,
991                  const int64_t Size, __tgt_async_info *AsyncInfo) const {
992     assert(AsyncInfo && "AsyncInfo is nullptr");
993 
994     CUstream Stream = getStream(DeviceId, AsyncInfo);
995     CUresult Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream);
996     if (Err != CUDA_SUCCESS) {
997       DP("Error when copying data from host to device. Pointers: host "
998          "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
999          DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
1000       CUDA_ERR_STRING(Err);
1001       return OFFLOAD_FAIL;
1002     }
1003 
1004     return OFFLOAD_SUCCESS;
1005   }
1006 
1007   int dataRetrieve(const int DeviceId, void *HstPtr, const void *TgtPtr,
1008                    const int64_t Size, __tgt_async_info *AsyncInfo) const {
1009     assert(AsyncInfo && "AsyncInfo is nullptr");
1010 
1011     CUstream Stream = getStream(DeviceId, AsyncInfo);
1012     CUresult Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
1013     if (Err != CUDA_SUCCESS) {
1014       DP("Error when copying data from device to host. Pointers: host "
1015          "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
1016          DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
1017       CUDA_ERR_STRING(Err);
1018       return OFFLOAD_FAIL;
1019     }
1020 
1021     return OFFLOAD_SUCCESS;
1022   }
1023 
1024   int dataExchange(int SrcDevId, const void *SrcPtr, int DstDevId, void *DstPtr,
1025                    int64_t Size, __tgt_async_info *AsyncInfo) {
1026     assert(AsyncInfo && "AsyncInfo is nullptr");
1027 
1028     CUresult Err;
1029     CUstream Stream = getStream(SrcDevId, AsyncInfo);
1030 
1031     // If they are two devices, we try peer to peer copy first
1032     if (SrcDevId != DstDevId) {
1033       std::lock_guard<std::mutex> LG(PeerAccessMatrixLock);
1034 
1035       switch (PeerAccessMatrix[SrcDevId][DstDevId]) {
1036       case PeerAccessState::No: {
1037         REPORT("Peer access from %" PRId32 " to %" PRId32
1038                " is not supported. Fall back to D2D memcpy.\n",
1039                SrcDevId, DstDevId);
1040         return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
1041       }
1042       case PeerAccessState::Unkown: {
1043         int CanAccessPeer = 0;
1044         Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId);
1045         if (Err != CUDA_SUCCESS) {
1046           REPORT("Error returned from cuDeviceCanAccessPeer. src = %" PRId32
1047                  ", dst = %" PRId32 ". Fall back to D2D memcpy.\n",
1048                  SrcDevId, DstDevId);
1049           CUDA_ERR_STRING(Err);
1050           PeerAccessMatrix[SrcDevId][DstDevId] = PeerAccessState::No;
1051           return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
1052         }
1053 
1054         if (!CanAccessPeer) {
1055           REPORT("P2P access from %d to %d is not supported. Fall back to D2D "
1056                  "memcpy.\n",
1057                  SrcDevId, DstDevId);
1058           PeerAccessMatrix[SrcDevId][DstDevId] = PeerAccessState::No;
1059           return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
1060         }
1061 
1062         Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0);
1063         if (Err != CUDA_SUCCESS) {
1064           REPORT("Error returned from cuCtxEnablePeerAccess. src = %" PRId32
1065                  ", dst = %" PRId32 ". Fall back to D2D memcpy.\n",
1066                  SrcDevId, DstDevId);
1067           CUDA_ERR_STRING(Err);
1068           PeerAccessMatrix[SrcDevId][DstDevId] = PeerAccessState::No;
1069           return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
1070         }
1071 
1072         PeerAccessMatrix[SrcDevId][DstDevId] = PeerAccessState::Yes;
1073 
1074         LLVM_FALLTHROUGH;
1075       }
1076       case PeerAccessState::Yes: {
1077         Err = cuMemcpyPeerAsync(
1078             (CUdeviceptr)DstPtr, DeviceData[DstDevId].Context,
1079             (CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context, Size, Stream);
1080         if (Err == CUDA_SUCCESS)
1081           return OFFLOAD_SUCCESS;
1082 
1083         DP("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD
1084            ", src_id =%" PRId32 ", dst_ptr = " DPxMOD ", dst_id =%" PRId32
1085            ". Fall back to D2D memcpy.\n",
1086            DPxPTR(SrcPtr), SrcDevId, DPxPTR(DstPtr), DstDevId);
1087         CUDA_ERR_STRING(Err);
1088 
1089         return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
1090       }
1091       }
1092     }
1093 
1094     return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
1095   }
1096 
1097   int dataDelete(const int DeviceId, void *TgtPtr) {
1098     if (UseMemoryManager)
1099       return MemoryManagers[DeviceId]->free(TgtPtr);
1100 
1101     return DeviceAllocators[DeviceId].free(TgtPtr);
1102   }
1103 
1104   int runTargetTeamRegion(const int DeviceId, void *TgtEntryPtr, void **TgtArgs,
1105                           ptrdiff_t *TgtOffsets, const int ArgNum,
1106                           const int TeamNum, const int ThreadLimit,
1107                           const unsigned int LoopTripCount,
1108                           __tgt_async_info *AsyncInfo) const {
1109     // All args are references.
1110     std::vector<void *> Args(ArgNum);
1111     std::vector<void *> Ptrs(ArgNum);
1112 
1113     for (int I = 0; I < ArgNum; ++I) {
1114       Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
1115       Args[I] = &Ptrs[I];
1116     }
1117 
1118     KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr);
1119 
1120     const bool IsSPMDGenericMode =
1121         KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD;
1122     const bool IsSPMDMode =
1123         KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_SPMD;
1124     const bool IsGenericMode =
1125         KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC;
1126 
1127     int CudaThreadsPerBlock;
1128     if (ThreadLimit > 0) {
1129       DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
1130       CudaThreadsPerBlock = ThreadLimit;
1131       // Add master warp if necessary
1132       if (IsGenericMode) {
1133         DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize);
1134         CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
1135       }
1136     } else {
1137       DP("Setting CUDA threads per block to default %d\n",
1138          DeviceData[DeviceId].NumThreads);
1139       CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads;
1140     }
1141 
1142     if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) {
1143       DP("Threads per block capped at device limit %d\n",
1144          DeviceData[DeviceId].ThreadsPerBlock);
1145       CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock;
1146     }
1147 
1148     CUresult Err;
1149     if (!KernelInfo->MaxThreadsPerBlock) {
1150       Err = cuFuncGetAttribute(&KernelInfo->MaxThreadsPerBlock,
1151                                CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
1152                                KernelInfo->Func);
1153       if (!checkResult(Err, "Error returned from cuFuncGetAttribute\n"))
1154         return OFFLOAD_FAIL;
1155     }
1156 
1157     if (KernelInfo->MaxThreadsPerBlock < CudaThreadsPerBlock) {
1158       DP("Threads per block capped at kernel limit %d\n",
1159          KernelInfo->MaxThreadsPerBlock);
1160       CudaThreadsPerBlock = KernelInfo->MaxThreadsPerBlock;
1161     }
1162 
1163     unsigned int CudaBlocksPerGrid;
1164     if (TeamNum <= 0) {
1165       if (LoopTripCount > 0 && EnvNumTeams < 0) {
1166         if (IsSPMDGenericMode) {
1167           // If we reach this point, then we are executing a kernel that was
1168           // transformed from Generic-mode to SPMD-mode. This kernel has
1169           // SPMD-mode execution, but needs its blocks to be scheduled
1170           // differently because the current loop trip count only applies to the
1171           // `teams distribute` region and will create var too few blocks using
1172           // the regular SPMD-mode method.
1173           CudaBlocksPerGrid = LoopTripCount;
1174         } else if (IsSPMDMode) {
1175           // We have a combined construct, i.e. `target teams distribute
1176           // parallel for [simd]`. We launch so many teams so that each thread
1177           // will execute one iteration of the loop. round up to the nearest
1178           // integer
1179           CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1;
1180         } else if (IsGenericMode) {
1181           // If we reach this point, then we have a non-combined construct, i.e.
1182           // `teams distribute` with a nested `parallel for` and each team is
1183           // assigned one iteration of the `distribute` loop. E.g.:
1184           //
1185           // #pragma omp target teams distribute
1186           // for(...loop_tripcount...) {
1187           //   #pragma omp parallel for
1188           //   for(...) {}
1189           // }
1190           //
1191           // Threads within a team will execute the iterations of the `parallel`
1192           // loop.
1193           CudaBlocksPerGrid = LoopTripCount;
1194         } else {
1195           REPORT("Unknown execution mode: %d\n",
1196                  static_cast<int8_t>(KernelInfo->ExecutionMode));
1197           return OFFLOAD_FAIL;
1198         }
1199         DP("Using %d teams due to loop trip count %" PRIu32
1200            " and number of threads per block %d\n",
1201            CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock);
1202       } else {
1203         DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams);
1204         CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams;
1205       }
1206     } else {
1207       DP("Using requested number of teams %d\n", TeamNum);
1208       CudaBlocksPerGrid = TeamNum;
1209     }
1210 
1211     if (CudaBlocksPerGrid > DeviceData[DeviceId].BlocksPerGrid) {
1212       DP("Capping number of teams to team limit %d\n",
1213          DeviceData[DeviceId].BlocksPerGrid);
1214       CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid;
1215     }
1216 
1217     INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
1218          "Launching kernel %s with %d blocks and %d threads in %s mode\n",
1219          (getOffloadEntry(DeviceId, TgtEntryPtr))
1220              ? getOffloadEntry(DeviceId, TgtEntryPtr)->name
1221              : "(null)",
1222          CudaBlocksPerGrid, CudaThreadsPerBlock,
1223          (!IsSPMDMode ? (IsGenericMode ? "Generic" : "SPMD-Generic") : "SPMD"));
1224 
1225     CUstream Stream = getStream(DeviceId, AsyncInfo);
1226     Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1,
1227                          /* gridDimZ */ 1, CudaThreadsPerBlock,
1228                          /* blockDimY */ 1, /* blockDimZ */ 1,
1229                          DynamicMemorySize, Stream, &Args[0], nullptr);
1230     if (!checkResult(Err, "Error returned from cuLaunchKernel\n"))
1231       return OFFLOAD_FAIL;
1232 
1233     DP("Launch of entry point at " DPxMOD " successful!\n",
1234        DPxPTR(TgtEntryPtr));
1235 
1236     return OFFLOAD_SUCCESS;
1237   }
1238 
1239   int synchronize(const int DeviceId, __tgt_async_info *AsyncInfo) const {
1240     CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue);
1241     CUresult Err = cuStreamSynchronize(Stream);
1242 
1243     // Once the stream is synchronized, return it to stream pool and reset
1244     // AsyncInfo. This is to make sure the synchronization only works for its
1245     // own tasks.
1246     StreamPool[DeviceId]->release(reinterpret_cast<CUstream>(AsyncInfo->Queue));
1247     AsyncInfo->Queue = nullptr;
1248 
1249     if (Err != CUDA_SUCCESS) {
1250       DP("Error when synchronizing stream. stream = " DPxMOD
1251          ", async info ptr = " DPxMOD "\n",
1252          DPxPTR(Stream), DPxPTR(AsyncInfo));
1253       CUDA_ERR_STRING(Err);
1254     }
1255     return (Err == CUDA_SUCCESS) ? OFFLOAD_SUCCESS : OFFLOAD_FAIL;
1256   }
1257 
1258   void printDeviceInfo(int32_t DeviceId) {
1259     char TmpChar[1000];
1260     std::string TmpStr;
1261     size_t TmpSt;
1262     int TmpInt, TmpInt2, TmpInt3;
1263 
1264     CUdevice Device;
1265     checkResult(cuDeviceGet(&Device, DeviceId),
1266                 "Error returned from cuCtxGetDevice\n");
1267 
1268     cuDriverGetVersion(&TmpInt);
1269     printf("    CUDA Driver Version: \t\t%d \n", TmpInt);
1270     printf("    CUDA Device Number: \t\t%d \n", DeviceId);
1271     checkResult(cuDeviceGetName(TmpChar, 1000, Device),
1272                 "Error returned from cuDeviceGetName\n");
1273     printf("    Device Name: \t\t\t%s \n", TmpChar);
1274     checkResult(cuDeviceTotalMem(&TmpSt, Device),
1275                 "Error returned from cuDeviceTotalMem\n");
1276     printf("    Global Memory Size: \t\t%zu bytes \n", TmpSt);
1277     checkResult(cuDeviceGetAttribute(
1278                     &TmpInt, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, Device),
1279                 "Error returned from cuDeviceGetAttribute\n");
1280     printf("    Number of Multiprocessors: \t\t%d \n", TmpInt);
1281     checkResult(
1282         cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, Device),
1283         "Error returned from cuDeviceGetAttribute\n");
1284     printf("    Concurrent Copy and Execution: \t%s \n", BOOL2TEXT(TmpInt));
1285     checkResult(cuDeviceGetAttribute(
1286                     &TmpInt, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, Device),
1287                 "Error returned from cuDeviceGetAttribute\n");
1288     printf("    Total Constant Memory: \t\t%d bytes\n", TmpInt);
1289     checkResult(
1290         cuDeviceGetAttribute(
1291             &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, Device),
1292         "Error returned from cuDeviceGetAttribute\n");
1293     printf("    Max Shared Memory per Block: \t%d bytes \n", TmpInt);
1294     checkResult(
1295         cuDeviceGetAttribute(
1296             &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, Device),
1297         "Error returned from cuDeviceGetAttribute\n");
1298     printf("    Registers per Block: \t\t%d \n", TmpInt);
1299     checkResult(
1300         cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device),
1301         "Error returned from cuDeviceGetAttribute\n");
1302     printf("    Warp Size: \t\t\t\t%d Threads \n", TmpInt);
1303     checkResult(cuDeviceGetAttribute(
1304                     &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, Device),
1305                 "Error returned from cuDeviceGetAttribute\n");
1306     printf("    Maximum Threads per Block: \t\t%d \n", TmpInt);
1307     checkResult(cuDeviceGetAttribute(
1308                     &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device),
1309                 "Error returned from cuDeviceGetAttribute\n");
1310     checkResult(cuDeviceGetAttribute(
1311                     &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, Device),
1312                 "Error returned from cuDeviceGetAttribute\n");
1313     checkResult(cuDeviceGetAttribute(
1314                     &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, Device),
1315                 "Error returned from cuDeviceGetAttribute\n");
1316     printf("    Maximum Block Dimensions: \t\t%d, %d, %d \n", TmpInt, TmpInt2,
1317            TmpInt3);
1318     checkResult(cuDeviceGetAttribute(
1319                     &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, Device),
1320                 "Error returned from cuDeviceGetAttribute\n");
1321     checkResult(cuDeviceGetAttribute(
1322                     &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, Device),
1323                 "Error returned from cuDeviceGetAttribute\n");
1324     checkResult(cuDeviceGetAttribute(
1325                     &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, Device),
1326                 "Error returned from cuDeviceGetAttribute\n");
1327     printf("    Maximum Grid Dimensions: \t\t%d x %d x %d \n", TmpInt, TmpInt2,
1328            TmpInt3);
1329     checkResult(
1330         cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_PITCH, Device),
1331         "Error returned from cuDeviceGetAttribute\n");
1332     printf("    Maximum Memory Pitch: \t\t%d bytes \n", TmpInt);
1333     checkResult(cuDeviceGetAttribute(
1334                     &TmpInt, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, Device),
1335                 "Error returned from cuDeviceGetAttribute\n");
1336     printf("    Texture Alignment: \t\t\t%d bytes \n", TmpInt);
1337     checkResult(
1338         cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, Device),
1339         "Error returned from cuDeviceGetAttribute\n");
1340     printf("    Clock Rate: \t\t\t%d kHz\n", TmpInt);
1341     checkResult(cuDeviceGetAttribute(
1342                     &TmpInt, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, Device),
1343                 "Error returned from cuDeviceGetAttribute\n");
1344     printf("    Execution Timeout: \t\t\t%s \n", BOOL2TEXT(TmpInt));
1345     checkResult(
1346         cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_INTEGRATED, Device),
1347         "Error returned from cuDeviceGetAttribute\n");
1348     printf("    Integrated Device: \t\t\t%s \n", BOOL2TEXT(TmpInt));
1349     checkResult(cuDeviceGetAttribute(
1350                     &TmpInt, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, Device),
1351                 "Error returned from cuDeviceGetAttribute\n");
1352     printf("    Can Map Host Memory: \t\t%s \n", BOOL2TEXT(TmpInt));
1353     checkResult(
1354         cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, Device),
1355         "Error returned from cuDeviceGetAttribute\n");
1356     if (TmpInt == CU_COMPUTEMODE_DEFAULT)
1357       TmpStr = "DEFAULT";
1358     else if (TmpInt == CU_COMPUTEMODE_PROHIBITED)
1359       TmpStr = "PROHIBITED";
1360     else if (TmpInt == CU_COMPUTEMODE_EXCLUSIVE_PROCESS)
1361       TmpStr = "EXCLUSIVE PROCESS";
1362     else
1363       TmpStr = "unknown";
1364     printf("    Compute Mode: \t\t\t%s \n", TmpStr.c_str());
1365     checkResult(cuDeviceGetAttribute(
1366                     &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, Device),
1367                 "Error returned from cuDeviceGetAttribute\n");
1368     printf("    Concurrent Kernels: \t\t%s \n", BOOL2TEXT(TmpInt));
1369     checkResult(
1370         cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, Device),
1371         "Error returned from cuDeviceGetAttribute\n");
1372     printf("    ECC Enabled: \t\t\t%s \n", BOOL2TEXT(TmpInt));
1373     checkResult(cuDeviceGetAttribute(
1374                     &TmpInt, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, Device),
1375                 "Error returned from cuDeviceGetAttribute\n");
1376     printf("    Memory Clock Rate: \t\t\t%d kHz\n", TmpInt);
1377     checkResult(
1378         cuDeviceGetAttribute(
1379             &TmpInt, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, Device),
1380         "Error returned from cuDeviceGetAttribute\n");
1381     printf("    Memory Bus Width: \t\t\t%d bits\n", TmpInt);
1382     checkResult(cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE,
1383                                      Device),
1384                 "Error returned from cuDeviceGetAttribute\n");
1385     printf("    L2 Cache Size: \t\t\t%d bytes \n", TmpInt);
1386     checkResult(cuDeviceGetAttribute(
1387                     &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
1388                     Device),
1389                 "Error returned from cuDeviceGetAttribute\n");
1390     printf("    Max Threads Per SMP: \t\t%d \n", TmpInt);
1391     checkResult(cuDeviceGetAttribute(
1392                     &TmpInt, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, Device),
1393                 "Error returned from cuDeviceGetAttribute\n");
1394     printf("    Async Engines: \t\t\t%s (%d) \n", BOOL2TEXT(TmpInt), TmpInt);
1395     checkResult(cuDeviceGetAttribute(
1396                     &TmpInt, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, Device),
1397                 "Error returned from cuDeviceGetAttribute\n");
1398     printf("    Unified Addressing: \t\t%s \n", BOOL2TEXT(TmpInt));
1399     checkResult(cuDeviceGetAttribute(
1400                     &TmpInt, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, Device),
1401                 "Error returned from cuDeviceGetAttribute\n");
1402     printf("    Managed Memory: \t\t\t%s \n", BOOL2TEXT(TmpInt));
1403     checkResult(
1404         cuDeviceGetAttribute(
1405             &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, Device),
1406         "Error returned from cuDeviceGetAttribute\n");
1407     printf("    Concurrent Managed Memory: \t\t%s \n", BOOL2TEXT(TmpInt));
1408     checkResult(
1409         cuDeviceGetAttribute(
1410             &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, Device),
1411         "Error returned from cuDeviceGetAttribute\n");
1412     printf("    Preemption Supported: \t\t%s \n", BOOL2TEXT(TmpInt));
1413     checkResult(cuDeviceGetAttribute(
1414                     &TmpInt, CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, Device),
1415                 "Error returned from cuDeviceGetAttribute\n");
1416     printf("    Cooperative Launch: \t\t%s \n", BOOL2TEXT(TmpInt));
1417     checkResult(cuDeviceGetAttribute(
1418                     &TmpInt, CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD, Device),
1419                 "Error returned from cuDeviceGetAttribute\n");
1420     printf("    Multi-Device Boars: \t\t%s \n", BOOL2TEXT(TmpInt));
1421     checkResult(
1422         cuDeviceGetAttribute(
1423             &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, Device),
1424         "Error returned from cuDeviceGetAttribute\n");
1425     checkResult(
1426         cuDeviceGetAttribute(
1427             &TmpInt2, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, Device),
1428         "Error returned from cuDeviceGetAttribute\n");
1429     printf("    Compute Capabilities: \t\t%d%d \n", TmpInt, TmpInt2);
1430   }
1431 
1432   int createEvent(int DeviceId, void **P) {
1433     CUevent Event = nullptr;
1434     if (EventPool[DeviceId]->acquire(Event) != OFFLOAD_SUCCESS)
1435       return OFFLOAD_FAIL;
1436     *P = Event;
1437     return OFFLOAD_SUCCESS;
1438   }
1439 
1440   int destroyEvent(int DeviceId, void *EventPtr) {
1441     EventPool[DeviceId]->release(reinterpret_cast<CUevent>(EventPtr));
1442     return OFFLOAD_SUCCESS;
1443   }
1444 
1445   int waitEvent(const int DeviceId, __tgt_async_info *AsyncInfo,
1446                 void *EventPtr) const {
1447     CUstream Stream = getStream(DeviceId, AsyncInfo);
1448     CUevent Event = reinterpret_cast<CUevent>(EventPtr);
1449 
1450     // We don't use CU_EVENT_WAIT_DEFAULT here as it is only available from
1451     // specific CUDA version, and defined as 0x0. In previous version, per CUDA
1452     // API document, that argument has to be 0x0.
1453     CUresult Err = cuStreamWaitEvent(Stream, Event, 0);
1454     if (Err != CUDA_SUCCESS) {
1455       DP("Error when waiting event. stream = " DPxMOD ", event = " DPxMOD "\n",
1456          DPxPTR(Stream), DPxPTR(Event));
1457       CUDA_ERR_STRING(Err);
1458       return OFFLOAD_FAIL;
1459     }
1460 
1461     return OFFLOAD_SUCCESS;
1462   }
1463 
1464   int releaseAsyncInfo(int DeviceId, __tgt_async_info *AsyncInfo) const {
1465     if (AsyncInfo->Queue) {
1466       StreamPool[DeviceId]->release(
1467           reinterpret_cast<CUstream>(AsyncInfo->Queue));
1468       AsyncInfo->Queue = nullptr;
1469     }
1470 
1471     return OFFLOAD_SUCCESS;
1472   }
1473 
1474   int initAsyncInfo(int DeviceId, __tgt_async_info **AsyncInfo) const {
1475     *AsyncInfo = new __tgt_async_info;
1476     getStream(DeviceId, *AsyncInfo);
1477     return OFFLOAD_SUCCESS;
1478   }
1479 
1480   int initDeviceInfo(int DeviceId, __tgt_device_info *DeviceInfo,
1481                      const char **ErrStr) const {
1482     assert(DeviceInfo && "DeviceInfo is nullptr");
1483 
1484     if (!DeviceInfo->Context)
1485       DeviceInfo->Context = DeviceData[DeviceId].Context;
1486     if (!DeviceInfo->Device) {
1487       CUdevice Dev;
1488       CUresult Err = cuDeviceGet(&Dev, DeviceId);
1489       if (Err == CUDA_SUCCESS) {
1490         DeviceInfo->Device = reinterpret_cast<void *>(Dev);
1491       } else {
1492         cuGetErrorString(Err, ErrStr);
1493         return OFFLOAD_FAIL;
1494       }
1495     }
1496     return OFFLOAD_SUCCESS;
1497   }
1498 
1499   int setContext(int DeviceId) {
1500     assert(InitializedFlags[DeviceId] && "Device is not initialized");
1501 
1502     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
1503     if (!checkResult(Err, "error returned from cuCtxSetCurrent"))
1504       return OFFLOAD_FAIL;
1505 
1506     return OFFLOAD_SUCCESS;
1507   }
1508 };
1509 
1510 DeviceRTLTy DeviceRTL;
1511 } // namespace
1512 
1513 // Exposed library API function
1514 #ifdef __cplusplus
1515 extern "C" {
1516 #endif
1517 
1518 int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image) {
1519   return elf_check_machine(Image, /* EM_CUDA */ 190);
1520 }
1521 
1522 int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); }
1523 
1524 int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
1525   DP("Init requires flags to %" PRId64 "\n", RequiresFlags);
1526   DeviceRTL.setRequiresFlag(RequiresFlags);
1527   return RequiresFlags;
1528 }
1529 
1530 int32_t __tgt_rtl_is_data_exchangable(int32_t SrcDevId, int DstDevId) {
1531   if (DeviceRTL.isValidDeviceId(SrcDevId) &&
1532       DeviceRTL.isValidDeviceId(DstDevId))
1533     return 1;
1534 
1535   return 0;
1536 }
1537 
1538 int32_t __tgt_rtl_init_device(int32_t DeviceId) {
1539   assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1540   // Context is set when init the device.
1541 
1542   return DeviceRTL.initDevice(DeviceId);
1543 }
1544 
1545 int32_t __tgt_rtl_deinit_device(int32_t DeviceId) {
1546   assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1547   // Context is set when deinit the device.
1548 
1549   return DeviceRTL.deinitDevice(DeviceId);
1550 }
1551 
1552 __tgt_target_table *__tgt_rtl_load_binary(int32_t DeviceId,
1553                                           __tgt_device_image *Image) {
1554   assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1555 
1556   if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
1557     return nullptr;
1558 
1559   return DeviceRTL.loadBinary(DeviceId, Image);
1560 }
1561 
1562 void *__tgt_rtl_data_alloc(int32_t DeviceId, int64_t Size, void *,
1563                            int32_t Kind) {
1564   assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1565 
1566   if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
1567     return nullptr;
1568 
1569   return DeviceRTL.dataAlloc(DeviceId, Size, (TargetAllocTy)Kind);
1570 }
1571 
1572 int32_t __tgt_rtl_data_submit(int32_t DeviceId, void *TgtPtr, void *HstPtr,
1573                               int64_t Size) {
1574   assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1575   // Context is set in __tgt_rtl_data_submit_async.
1576 
1577   __tgt_async_info AsyncInfo;
1578   const int32_t Rc =
1579       __tgt_rtl_data_submit_async(DeviceId, TgtPtr, HstPtr, Size, &AsyncInfo);
1580   if (Rc != OFFLOAD_SUCCESS)
1581     return OFFLOAD_FAIL;
1582 
1583   return __tgt_rtl_synchronize(DeviceId, &AsyncInfo);
1584 }
1585 
1586 int32_t __tgt_rtl_data_submit_async(int32_t DeviceId, void *TgtPtr,
1587                                     void *HstPtr, int64_t Size,
1588                                     __tgt_async_info *AsyncInfoPtr) {
1589   assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1590   assert(AsyncInfoPtr && "async_info_ptr is nullptr");
1591 
1592   if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
1593     return OFFLOAD_FAIL;
1594 
1595   return DeviceRTL.dataSubmit(DeviceId, TgtPtr, HstPtr, Size, AsyncInfoPtr);
1596 }
1597 
1598 int32_t __tgt_rtl_data_retrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr,
1599                                 int64_t Size) {
1600   assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1601   // Context is set in __tgt_rtl_data_retrieve_async.
1602 
1603   __tgt_async_info AsyncInfo;
1604   const int32_t Rc =
1605       __tgt_rtl_data_retrieve_async(DeviceId, HstPtr, TgtPtr, Size, &AsyncInfo);
1606   if (Rc != OFFLOAD_SUCCESS)
1607     return OFFLOAD_FAIL;
1608 
1609   return __tgt_rtl_synchronize(DeviceId, &AsyncInfo);
1610 }
1611 
1612 int32_t __tgt_rtl_data_retrieve_async(int32_t DeviceId, void *HstPtr,
1613                                       void *TgtPtr, int64_t Size,
1614                                       __tgt_async_info *AsyncInfoPtr) {
1615   assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1616   assert(AsyncInfoPtr && "async_info_ptr is nullptr");
1617 
1618   if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
1619     return OFFLOAD_FAIL;
1620 
1621   return DeviceRTL.dataRetrieve(DeviceId, HstPtr, TgtPtr, Size, AsyncInfoPtr);
1622 }
1623 
1624 int32_t __tgt_rtl_data_exchange_async(int32_t SrcDevId, void *SrcPtr,
1625                                       int DstDevId, void *DstPtr, int64_t Size,
1626                                       __tgt_async_info *AsyncInfo) {
1627   assert(DeviceRTL.isValidDeviceId(SrcDevId) && "src_dev_id is invalid");
1628   assert(DeviceRTL.isValidDeviceId(DstDevId) && "dst_dev_id is invalid");
1629   assert(AsyncInfo && "AsyncInfo is nullptr");
1630 
1631   if (DeviceRTL.setContext(SrcDevId) != OFFLOAD_SUCCESS)
1632     return OFFLOAD_FAIL;
1633 
1634   return DeviceRTL.dataExchange(SrcDevId, SrcPtr, DstDevId, DstPtr, Size,
1635                                 AsyncInfo);
1636 }
1637 
1638 int32_t __tgt_rtl_data_exchange(int32_t SrcDevId, void *SrcPtr,
1639                                 int32_t DstDevId, void *DstPtr, int64_t Size) {
1640   assert(DeviceRTL.isValidDeviceId(SrcDevId) && "src_dev_id is invalid");
1641   assert(DeviceRTL.isValidDeviceId(DstDevId) && "dst_dev_id is invalid");
1642   // Context is set in __tgt_rtl_data_exchange_async.
1643 
1644   __tgt_async_info AsyncInfo;
1645   const int32_t Rc = __tgt_rtl_data_exchange_async(SrcDevId, SrcPtr, DstDevId,
1646                                                    DstPtr, Size, &AsyncInfo);
1647   if (Rc != OFFLOAD_SUCCESS)
1648     return OFFLOAD_FAIL;
1649 
1650   return __tgt_rtl_synchronize(SrcDevId, &AsyncInfo);
1651 }
1652 
1653 int32_t __tgt_rtl_data_delete(int32_t DeviceId, void *TgtPtr) {
1654   assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1655 
1656   if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
1657     return OFFLOAD_FAIL;
1658 
1659   return DeviceRTL.dataDelete(DeviceId, TgtPtr);
1660 }
1661 
1662 int32_t __tgt_rtl_run_target_team_region(int32_t DeviceId, void *TgtEntryPtr,
1663                                          void **TgtArgs, ptrdiff_t *TgtOffsets,
1664                                          int32_t ArgNum, int32_t TeamNum,
1665                                          int32_t ThreadLimit,
1666                                          uint64_t LoopTripcount) {
1667   assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1668   // Context is set in __tgt_rtl_run_target_team_region_async.
1669 
1670   __tgt_async_info AsyncInfo;
1671   const int32_t Rc = __tgt_rtl_run_target_team_region_async(
1672       DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets, ArgNum, TeamNum, ThreadLimit,
1673       LoopTripcount, &AsyncInfo);
1674   if (Rc != OFFLOAD_SUCCESS)
1675     return OFFLOAD_FAIL;
1676 
1677   return __tgt_rtl_synchronize(DeviceId, &AsyncInfo);
1678 }
1679 
1680 int32_t __tgt_rtl_run_target_team_region_async(
1681     int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs, ptrdiff_t *TgtOffsets,
1682     int32_t ArgNum, int32_t TeamNum, int32_t ThreadLimit,
1683     uint64_t LoopTripcount, __tgt_async_info *AsyncInfoPtr) {
1684   assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1685 
1686   if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
1687     return OFFLOAD_FAIL;
1688 
1689   return DeviceRTL.runTargetTeamRegion(DeviceId, TgtEntryPtr, TgtArgs,
1690                                        TgtOffsets, ArgNum, TeamNum, ThreadLimit,
1691                                        LoopTripcount, AsyncInfoPtr);
1692 }
1693 
1694 int32_t __tgt_rtl_run_target_region(int32_t DeviceId, void *TgtEntryPtr,
1695                                     void **TgtArgs, ptrdiff_t *TgtOffsets,
1696                                     int32_t ArgNum) {
1697   assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1698   // Context is set in __tgt_rtl_run_target_region_async.
1699 
1700   __tgt_async_info AsyncInfo;
1701   const int32_t Rc = __tgt_rtl_run_target_region_async(
1702       DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets, ArgNum, &AsyncInfo);
1703   if (Rc != OFFLOAD_SUCCESS)
1704     return OFFLOAD_FAIL;
1705 
1706   return __tgt_rtl_synchronize(DeviceId, &AsyncInfo);
1707 }
1708 
1709 int32_t __tgt_rtl_run_target_region_async(int32_t DeviceId, void *TgtEntryPtr,
1710                                           void **TgtArgs, ptrdiff_t *TgtOffsets,
1711                                           int32_t ArgNum,
1712                                           __tgt_async_info *AsyncInfoPtr) {
1713   assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1714   // Context is set in __tgt_rtl_run_target_team_region_async.
1715   return __tgt_rtl_run_target_team_region_async(
1716       DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets, ArgNum,
1717       /* team num*/ 1, /* thread_limit */ 1, /* loop_tripcount */ 0,
1718       AsyncInfoPtr);
1719 }
1720 
1721 int32_t __tgt_rtl_synchronize(int32_t DeviceId,
1722                               __tgt_async_info *AsyncInfoPtr) {
1723   assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1724   assert(AsyncInfoPtr && "async_info_ptr is nullptr");
1725   assert(AsyncInfoPtr->Queue && "async_info_ptr->Queue is nullptr");
1726   // NOTE: We don't need to set context for stream sync.
1727   return DeviceRTL.synchronize(DeviceId, AsyncInfoPtr);
1728 }
1729 
1730 void __tgt_rtl_set_info_flag(uint32_t NewInfoLevel) {
1731   std::atomic<uint32_t> &InfoLevel = getInfoLevelInternal();
1732   InfoLevel.store(NewInfoLevel);
1733 }
1734 
1735 void __tgt_rtl_print_device_info(int32_t DeviceId) {
1736   assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1737   // NOTE: We don't need to set context for print device info.
1738   DeviceRTL.printDeviceInfo(DeviceId);
1739 }
1740 
1741 int32_t __tgt_rtl_create_event(int32_t DeviceId, void **Event) {
1742   assert(Event && "event is nullptr");
1743 
1744   if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
1745     return OFFLOAD_FAIL;
1746 
1747   return DeviceRTL.createEvent(DeviceId, Event);
1748 }
1749 
1750 int32_t __tgt_rtl_record_event(int32_t DeviceId, void *EventPtr,
1751                                __tgt_async_info *AsyncInfoPtr) {
1752   assert(AsyncInfoPtr && "async_info_ptr is nullptr");
1753   assert(AsyncInfoPtr->Queue && "async_info_ptr->Queue is nullptr");
1754   assert(EventPtr && "event_ptr is nullptr");
1755   // NOTE: We might not need to set context for event record.
1756   return recordEvent(EventPtr, AsyncInfoPtr);
1757 }
1758 
1759 int32_t __tgt_rtl_wait_event(int32_t DeviceId, void *EventPtr,
1760                              __tgt_async_info *AsyncInfoPtr) {
1761   assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1762   assert(AsyncInfoPtr && "async_info_ptr is nullptr");
1763   assert(EventPtr && "event is nullptr");
1764   // If we don't have a queue we need to set the context.
1765   if (!AsyncInfoPtr->Queue && DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
1766     return OFFLOAD_FAIL;
1767   return DeviceRTL.waitEvent(DeviceId, AsyncInfoPtr, EventPtr);
1768 }
1769 
1770 int32_t __tgt_rtl_sync_event(int32_t DeviceId, void *EventPtr) {
1771   assert(EventPtr && "event is nullptr");
1772   // NOTE: We might not need to set context for event sync.
1773   return syncEvent(EventPtr);
1774 }
1775 
1776 int32_t __tgt_rtl_destroy_event(int32_t DeviceId, void *EventPtr) {
1777   assert(EventPtr && "event is nullptr");
1778 
1779   if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
1780     return OFFLOAD_FAIL;
1781 
1782   return DeviceRTL.destroyEvent(DeviceId, EventPtr);
1783 }
1784 
1785 int32_t __tgt_rtl_release_async_info(int32_t DeviceId,
1786                                      __tgt_async_info *AsyncInfo) {
1787   assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1788   assert(AsyncInfo && "async_info is nullptr");
1789 
1790   if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
1791     return OFFLOAD_FAIL;
1792 
1793   return DeviceRTL.releaseAsyncInfo(DeviceId, AsyncInfo);
1794 }
1795 
1796 int32_t __tgt_rtl_init_async_info(int32_t DeviceId,
1797                                   __tgt_async_info **AsyncInfo) {
1798   assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1799   assert(AsyncInfo && "async_info is nullptr");
1800 
1801   if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
1802     return OFFLOAD_FAIL;
1803 
1804   return DeviceRTL.initAsyncInfo(DeviceId, AsyncInfo);
1805 }
1806 
1807 int32_t __tgt_rtl_init_device_info(int32_t DeviceId,
1808                                    __tgt_device_info *DeviceInfoPtr,
1809                                    const char **ErrStr) {
1810   assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1811   assert(DeviceInfoPtr && "device_info_ptr is nullptr");
1812 
1813   if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
1814     return OFFLOAD_FAIL;
1815 
1816   return DeviceRTL.initDeviceInfo(DeviceId, DeviceInfoPtr, ErrStr);
1817 }
1818 
1819 #ifdef __cplusplus
1820 }
1821 #endif
1822