1 //===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // RTL for CUDA machine
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include <algorithm>
14 #include <cassert>
15 #include <cstddef>
16 #include <cuda.h>
17 #include <list>
18 #include <memory>
19 #include <mutex>
20 #include <string>
21 #include <unordered_map>
22 #include <vector>
23 
24 #include "Debug.h"
25 #include "DeviceEnvironment.h"
26 #include "omptarget.h"
27 #include "omptargetplugin.h"
28 
29 #define TARGET_NAME CUDA
30 #define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"
31 
32 #include "MemoryManager.h"
33 
34 #include "llvm/Frontend/OpenMP/OMPConstants.h"
35 
36 // Utility for retrieving and printing CUDA error string.
37 #ifdef OMPTARGET_DEBUG
38 #define CUDA_ERR_STRING(err)                                                   \
39   do {                                                                         \
40     if (getDebugLevel() > 0) {                                                 \
41       const char *errStr = nullptr;                                            \
42       CUresult errStr_status = cuGetErrorString(err, &errStr);                 \
43       if (errStr_status == CUDA_ERROR_INVALID_VALUE)                           \
44         REPORT("Unrecognized CUDA error code: %d\n", err);                     \
45       else if (errStr_status == CUDA_SUCCESS)                                  \
46         REPORT("CUDA error is: %s\n", errStr);                                 \
47       else {                                                                   \
48         REPORT("Unresolved CUDA error code: %d\n", err);                       \
49         REPORT("Unsuccessful cuGetErrorString return status: %d\n",            \
50                errStr_status);                                                 \
51       }                                                                        \
52     } else {                                                                   \
53       const char *errStr = nullptr;                                            \
54       CUresult errStr_status = cuGetErrorString(err, &errStr);                 \
55       if (errStr_status == CUDA_SUCCESS)                                       \
56         REPORT("%s \n", errStr);                                               \
57     }                                                                          \
58   } while (false)
59 #else // OMPTARGET_DEBUG
60 #define CUDA_ERR_STRING(err)                                                   \
61   do {                                                                         \
62     const char *errStr = nullptr;                                              \
63     CUresult errStr_status = cuGetErrorString(err, &errStr);                   \
64     if (errStr_status == CUDA_SUCCESS)                                         \
65       REPORT("%s \n", errStr);                                                 \
66   } while (false)
67 #endif // OMPTARGET_DEBUG
68 
69 #define BOOL2TEXT(b) ((b) ? "Yes" : "No")
70 
71 #include "elf_common.h"
72 
73 /// Keep entries table per device.
74 struct FuncOrGblEntryTy {
75   __tgt_target_table Table;
76   std::vector<__tgt_offload_entry> Entries;
77 };
78 
79 /// Use a single entity to encode a kernel and a set of flags.
80 struct KernelTy {
81   CUfunction Func;
82 
83   // execution mode of kernel
84   llvm::omp::OMPTgtExecModeFlags ExecutionMode;
85 
86   /// Maximal number of threads per block for this kernel.
87   int MaxThreadsPerBlock = 0;
88 
89   KernelTy(CUfunction _Func, llvm::omp::OMPTgtExecModeFlags _ExecutionMode)
90       : Func(_Func), ExecutionMode(_ExecutionMode) {}
91 };
92 
93 namespace {
94 bool checkResult(CUresult Err, const char *ErrMsg) {
95   if (Err == CUDA_SUCCESS)
96     return true;
97 
98   REPORT("%s", ErrMsg);
99   CUDA_ERR_STRING(Err);
100   return false;
101 }
102 
103 int memcpyDtoD(const void *SrcPtr, void *DstPtr, int64_t Size,
104                CUstream Stream) {
105   CUresult Err =
106       cuMemcpyDtoDAsync((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, Size, Stream);
107 
108   if (Err != CUDA_SUCCESS) {
109     DP("Error when copying data from device to device. Pointers: src "
110        "= " DPxMOD ", dst = " DPxMOD ", size = %" PRId64 "\n",
111        DPxPTR(SrcPtr), DPxPTR(DstPtr), Size);
112     CUDA_ERR_STRING(Err);
113     return OFFLOAD_FAIL;
114   }
115 
116   return OFFLOAD_SUCCESS;
117 }
118 
119 int recordEvent(void *EventPtr, __tgt_async_info *AsyncInfo) {
120   CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue);
121   CUevent Event = reinterpret_cast<CUevent>(EventPtr);
122 
123   CUresult Err = cuEventRecord(Event, Stream);
124   if (Err != CUDA_SUCCESS) {
125     DP("Error when recording event. stream = " DPxMOD ", event = " DPxMOD "\n",
126        DPxPTR(Stream), DPxPTR(Event));
127     CUDA_ERR_STRING(Err);
128     return OFFLOAD_FAIL;
129   }
130 
131   return OFFLOAD_SUCCESS;
132 }
133 
134 int syncEvent(void *EventPtr) {
135   CUevent Event = reinterpret_cast<CUevent>(EventPtr);
136 
137   CUresult Err = cuEventSynchronize(Event);
138   if (Err != CUDA_SUCCESS) {
139     DP("Error when syncing event = " DPxMOD "\n", DPxPTR(Event));
140     CUDA_ERR_STRING(Err);
141     return OFFLOAD_FAIL;
142   }
143 
144   return OFFLOAD_SUCCESS;
145 }
146 
147 namespace {
148 
149 // Structure contains per-device data
150 struct DeviceDataTy {
151   /// List that contains all the kernels.
152   std::list<KernelTy> KernelsList;
153 
154   std::list<FuncOrGblEntryTy> FuncGblEntries;
155 
156   CUcontext Context = nullptr;
157   // Device properties
158   int ThreadsPerBlock = 0;
159   int BlocksPerGrid = 0;
160   int WarpSize = 0;
161   // OpenMP properties
162   int NumTeams = 0;
163   int NumThreads = 0;
164 };
165 
166 /// Resource allocator where \p T is the resource type.
167 /// Functions \p create and \p destroy return OFFLOAD_SUCCESS and OFFLOAD_FAIL
168 /// accordingly. The implementation should not raise any exception.
169 template <typename T> struct AllocatorTy {
170   using ElementTy = T;
171   /// Create a resource and assign to R.
172   virtual int create(T &R) noexcept = 0;
173   /// Destroy the resource.
174   virtual int destroy(T) noexcept = 0;
175 };
176 
177 /// Allocator for CUstream.
178 struct StreamAllocatorTy final : public AllocatorTy<CUstream> {
179   /// See AllocatorTy<T>::create.
180   int create(CUstream &Stream) noexcept override {
181     if (!checkResult(cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING),
182                      "Error returned from cuStreamCreate\n"))
183       return OFFLOAD_FAIL;
184 
185     return OFFLOAD_SUCCESS;
186   }
187 
188   /// See AllocatorTy<T>::destroy.
189   int destroy(CUstream Stream) noexcept override {
190     if (!checkResult(cuStreamDestroy(Stream),
191                      "Error returned from cuStreamDestroy\n"))
192       return OFFLOAD_FAIL;
193 
194     return OFFLOAD_SUCCESS;
195   }
196 };
197 
198 /// Allocator for CUevent.
199 struct EventAllocatorTy final : public AllocatorTy<CUevent> {
200   /// See AllocatorTy<T>::create.
201   int create(CUevent &Event) noexcept override {
202     if (!checkResult(cuEventCreate(&Event, CU_EVENT_DEFAULT),
203                      "Error returned from cuEventCreate\n"))
204       return OFFLOAD_FAIL;
205 
206     return OFFLOAD_SUCCESS;
207   }
208 
209   /// See AllocatorTy<T>::destroy.
210   int destroy(CUevent Event) noexcept override {
211     if (!checkResult(cuEventDestroy(Event),
212                      "Error returned from cuEventDestroy\n"))
213       return OFFLOAD_FAIL;
214 
215     return OFFLOAD_SUCCESS;
216   }
217 };
218 
219 /// A generic pool of resources where \p T is the resource type.
220 /// \p T should be copyable as the object is stored in \p std::vector .
221 template <typename AllocTy> class ResourcePoolTy {
222   using ElementTy = typename AllocTy::ElementTy;
223   /// Index of the next available resource.
224   size_t Next = 0;
225   /// Mutex to guard the pool.
226   std::mutex Mutex;
227   /// Pool of resources.
228   std::vector<ElementTy> Resources;
229   /// A reference to the corresponding allocator.
230   AllocTy Allocator;
231 
232   /// If `Resources` is used up, we will fill in more resources. It assumes that
233   /// the new size `Size` should be always larger than the current size.
234   bool resize(size_t Size) {
235     auto CurSize = Resources.size();
236     assert(Size > CurSize && "Unexpected smaller size");
237     Resources.reserve(Size);
238     for (auto I = CurSize; I < Size; ++I) {
239       ElementTy NewItem;
240       int Ret = Allocator.create(NewItem);
241       if (Ret != OFFLOAD_SUCCESS)
242         return false;
243       Resources.push_back(NewItem);
244     }
245     return true;
246   }
247 
248 public:
249   ResourcePoolTy(AllocTy &&A, size_t Size = 0) noexcept
250       : Allocator(std::move(A)) {
251     if (Size)
252       (void)resize(Size);
253   }
254 
255   ~ResourcePoolTy() noexcept { clear(); }
256 
257   /// Get a resource from pool. `Next` always points to the next available
258   /// resource. That means, `[0, next-1]` have been assigned, and `[id,]` are
259   /// still available. If there is no resource left, we will ask for more. Each
260   /// time a resource is assigned, the id will increase one.
261   /// xxxxxs+++++++++
262   ///      ^
263   ///      Next
264   /// After assignment, the pool becomes the following and s is assigned.
265   /// xxxxxs+++++++++
266   ///       ^
267   ///       Next
268   int acquire(ElementTy &R) noexcept {
269     std::lock_guard<std::mutex> LG(Mutex);
270     if (Next == Resources.size()) {
271       auto NewSize = Resources.size() ? Resources.size() * 2 : 1;
272       if (!resize(NewSize))
273         return OFFLOAD_FAIL;
274     }
275 
276     assert(Next < Resources.size());
277 
278     R = Resources[Next++];
279 
280     return OFFLOAD_SUCCESS;
281   }
282 
283   /// Return the resource back to the pool. When we return a resource, we need
284   /// to first decrease `Next`, and then copy the resource back. It is worth
285   /// noting that, the order of resources return might be different from that
286   /// they're assigned, that saying, at some point, there might be two identical
287   /// resources.
288   /// xxax+a+++++
289   ///     ^
290   ///     Next
291   /// However, it doesn't matter, because they're always on the two sides of
292   /// `Next`. The left one will in the end be overwritten by another resource.
293   /// Therefore, after several execution, the order of pool might be different
294   /// from its initial state.
295   void release(ElementTy R) noexcept {
296     std::lock_guard<std::mutex> LG(Mutex);
297     Resources[--Next] = R;
298   }
299 
300   /// Released all stored resources and clear the pool.
301   /// Note: This function is not thread safe. Be sure to guard it if necessary.
302   void clear() noexcept {
303     for (auto &R : Resources)
304       (void)Allocator.destroy(R);
305     Resources.clear();
306   }
307 };
308 
309 } // namespace
310 
311 class DeviceRTLTy {
312   int NumberOfDevices;
313   // OpenMP environment properties
314   int EnvNumTeams;
315   int EnvTeamLimit;
316   int EnvTeamThreadLimit;
317   // OpenMP requires flags
318   int64_t RequiresFlags;
319   // Amount of dynamic shared memory to use at launch.
320   uint64_t DynamicMemorySize;
321 
322   /// Number of initial streams for each device.
323   int NumInitialStreams = 32;
324 
325   /// Number of initial events for each device.
326   int NumInitialEvents = 8;
327 
328   static constexpr const int32_t HardThreadLimit = 1024;
329   static constexpr const int32_t DefaultNumTeams = 128;
330   static constexpr const int32_t DefaultNumThreads = 128;
331 
332   using StreamPoolTy = ResourcePoolTy<StreamAllocatorTy>;
333   std::vector<std::unique_ptr<StreamPoolTy>> StreamPool;
334 
335   using EventPoolTy = ResourcePoolTy<EventAllocatorTy>;
336   std::vector<std::unique_ptr<EventPoolTy>> EventPool;
337 
338   std::vector<DeviceDataTy> DeviceData;
339   std::vector<std::vector<CUmodule>> Modules;
340 
341   /// Vector of flags indicating the initalization status of all associated
342   /// devices.
343   std::vector<bool> InitializedFlags;
344 
345   /// A class responsible for interacting with device native runtime library to
346   /// allocate and free memory.
347   class CUDADeviceAllocatorTy : public DeviceAllocatorTy {
348     std::unordered_map<void *, TargetAllocTy> HostPinnedAllocs;
349 
350   public:
351     void *allocate(size_t Size, void *, TargetAllocTy Kind) override {
352       if (Size == 0)
353         return nullptr;
354 
355       void *MemAlloc = nullptr;
356       CUresult Err;
357       switch (Kind) {
358       case TARGET_ALLOC_DEFAULT:
359       case TARGET_ALLOC_DEVICE:
360         CUdeviceptr DevicePtr;
361         Err = cuMemAlloc(&DevicePtr, Size);
362         MemAlloc = (void *)DevicePtr;
363         if (!checkResult(Err, "Error returned from cuMemAlloc\n"))
364           return nullptr;
365         break;
366       case TARGET_ALLOC_HOST:
367         void *HostPtr;
368         Err = cuMemAllocHost(&HostPtr, Size);
369         MemAlloc = HostPtr;
370         if (!checkResult(Err, "Error returned from cuMemAllocHost\n"))
371           return nullptr;
372         HostPinnedAllocs[MemAlloc] = Kind;
373         break;
374       case TARGET_ALLOC_SHARED:
375         CUdeviceptr SharedPtr;
376         Err = cuMemAllocManaged(&SharedPtr, Size, CU_MEM_ATTACH_GLOBAL);
377         MemAlloc = (void *)SharedPtr;
378         if (!checkResult(Err, "Error returned from cuMemAllocManaged\n"))
379           return nullptr;
380         break;
381       }
382 
383       return MemAlloc;
384     }
385 
386     int free(void *TgtPtr) override {
387       CUresult Err;
388       // Host pinned memory must be freed differently.
389       TargetAllocTy Kind =
390           (HostPinnedAllocs.find(TgtPtr) == HostPinnedAllocs.end())
391               ? TARGET_ALLOC_DEFAULT
392               : TARGET_ALLOC_HOST;
393       switch (Kind) {
394       case TARGET_ALLOC_DEFAULT:
395       case TARGET_ALLOC_DEVICE:
396       case TARGET_ALLOC_SHARED:
397         Err = cuMemFree((CUdeviceptr)TgtPtr);
398         if (!checkResult(Err, "Error returned from cuMemFree\n"))
399           return OFFLOAD_FAIL;
400         break;
401       case TARGET_ALLOC_HOST:
402         Err = cuMemFreeHost(TgtPtr);
403         if (!checkResult(Err, "Error returned from cuMemFreeHost\n"))
404           return OFFLOAD_FAIL;
405         break;
406       }
407 
408       return OFFLOAD_SUCCESS;
409     }
410   };
411 
412   /// A vector of device allocators
413   std::vector<CUDADeviceAllocatorTy> DeviceAllocators;
414 
415   /// A vector of memory managers. Since the memory manager is non-copyable and
416   // non-removable, we wrap them into std::unique_ptr.
417   std::vector<std::unique_ptr<MemoryManagerTy>> MemoryManagers;
418 
419   /// Whether use memory manager
420   bool UseMemoryManager = true;
421 
422   // Record entry point associated with device
423   void addOffloadEntry(const int DeviceId, const __tgt_offload_entry entry) {
424     FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
425     E.Entries.push_back(entry);
426   }
427 
428   // Return a pointer to the entry associated with the pointer
429   const __tgt_offload_entry *getOffloadEntry(const int DeviceId,
430                                              const void *Addr) const {
431     for (const __tgt_offload_entry &Itr :
432          DeviceData[DeviceId].FuncGblEntries.back().Entries)
433       if (Itr.addr == Addr)
434         return &Itr;
435 
436     return nullptr;
437   }
438 
439   // Return the pointer to the target entries table
440   __tgt_target_table *getOffloadEntriesTable(const int DeviceId) {
441     FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
442 
443     if (E.Entries.empty())
444       return nullptr;
445 
446     // Update table info according to the entries and return the pointer
447     E.Table.EntriesBegin = E.Entries.data();
448     E.Table.EntriesEnd = E.Entries.data() + E.Entries.size();
449 
450     return &E.Table;
451   }
452 
453   // Clear entries table for a device
454   void clearOffloadEntriesTable(const int DeviceId) {
455     DeviceData[DeviceId].FuncGblEntries.emplace_back();
456     FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
457     E.Entries.clear();
458     E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr;
459   }
460 
461 public:
462   CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfo) const {
463     assert(AsyncInfo && "AsyncInfo is nullptr");
464 
465     if (!AsyncInfo->Queue) {
466       CUstream S;
467       if (StreamPool[DeviceId]->acquire(S) != OFFLOAD_SUCCESS)
468         return nullptr;
469 
470       AsyncInfo->Queue = S;
471     }
472 
473     return reinterpret_cast<CUstream>(AsyncInfo->Queue);
474   }
475 
476   // This class should not be copied
477   DeviceRTLTy(const DeviceRTLTy &) = delete;
478   DeviceRTLTy(DeviceRTLTy &&) = delete;
479 
480   DeviceRTLTy()
481       : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1),
482         EnvTeamThreadLimit(-1), RequiresFlags(OMP_REQ_UNDEFINED),
483         DynamicMemorySize(0) {
484 
485     DP("Start initializing CUDA\n");
486 
487     CUresult Err = cuInit(0);
488     if (Err == CUDA_ERROR_INVALID_HANDLE) {
489       // Can't call cuGetErrorString if dlsym failed
490       DP("Failed to load CUDA shared library\n");
491       return;
492     }
493     if (!checkResult(Err, "Error returned from cuInit\n")) {
494       return;
495     }
496 
497     Err = cuDeviceGetCount(&NumberOfDevices);
498     if (!checkResult(Err, "Error returned from cuDeviceGetCount\n"))
499       return;
500 
501     if (NumberOfDevices == 0) {
502       DP("There are no devices supporting CUDA.\n");
503       return;
504     }
505 
506     DeviceData.resize(NumberOfDevices);
507     Modules.resize(NumberOfDevices);
508     StreamPool.resize(NumberOfDevices);
509     EventPool.resize(NumberOfDevices);
510 
511     // Get environment variables regarding teams
512     if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) {
513       // OMP_TEAM_LIMIT has been set
514       EnvTeamLimit = std::stoi(EnvStr);
515       DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
516     }
517     if (const char *EnvStr = getenv("OMP_TEAMS_THREAD_LIMIT")) {
518       // OMP_TEAMS_THREAD_LIMIT has been set
519       EnvTeamThreadLimit = std::stoi(EnvStr);
520       DP("Parsed OMP_TEAMS_THREAD_LIMIT=%d\n", EnvTeamThreadLimit);
521     }
522     if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) {
523       // OMP_NUM_TEAMS has been set
524       EnvNumTeams = std::stoi(EnvStr);
525       DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
526     }
527     if (const char *EnvStr = getenv("LIBOMPTARGET_SHARED_MEMORY_SIZE")) {
528       // LIBOMPTARGET_SHARED_MEMORY_SIZE has been set
529       DynamicMemorySize = std::stoi(EnvStr);
530       DP("Parsed LIBOMPTARGET_SHARED_MEMORY_SIZE = %" PRIu64 "\n",
531          DynamicMemorySize);
532     }
533     if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS")) {
534       // LIBOMPTARGET_NUM_INITIAL_STREAMS has been set
535       NumInitialStreams = std::stoi(EnvStr);
536       DP("Parsed LIBOMPTARGET_NUM_INITIAL_STREAMS=%d\n", NumInitialStreams);
537     }
538 
539     for (int I = 0; I < NumberOfDevices; ++I)
540       DeviceAllocators.emplace_back();
541 
542     // Get the size threshold from environment variable
543     std::pair<size_t, bool> Res = MemoryManagerTy::getSizeThresholdFromEnv();
544     UseMemoryManager = Res.second;
545     size_t MemoryManagerThreshold = Res.first;
546 
547     if (UseMemoryManager)
548       for (int I = 0; I < NumberOfDevices; ++I)
549         MemoryManagers.emplace_back(std::make_unique<MemoryManagerTy>(
550             DeviceAllocators[I], MemoryManagerThreshold));
551 
552     // We lazily initialize all devices later.
553     InitializedFlags.assign(NumberOfDevices, false);
554   }
555 
556   ~DeviceRTLTy() {
557     for (int DeviceId = 0; DeviceId < NumberOfDevices; ++DeviceId)
558       deinitDevice(DeviceId);
559   }
560 
561   // Check whether a given DeviceId is valid
562   bool isValidDeviceId(const int DeviceId) const {
563     return DeviceId >= 0 && DeviceId < NumberOfDevices;
564   }
565 
566   int getNumOfDevices() const { return NumberOfDevices; }
567 
568   void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; }
569 
570   int initDevice(const int DeviceId) {
571     CUdevice Device;
572 
573     DP("Getting device %d\n", DeviceId);
574     CUresult Err = cuDeviceGet(&Device, DeviceId);
575     if (!checkResult(Err, "Error returned from cuDeviceGet\n"))
576       return OFFLOAD_FAIL;
577 
578     assert(InitializedFlags[DeviceId] == false && "Reinitializing device!");
579     InitializedFlags[DeviceId] = true;
580 
581     // Query the current flags of the primary context and set its flags if
582     // it is inactive
583     unsigned int FormerPrimaryCtxFlags = 0;
584     int FormerPrimaryCtxIsActive = 0;
585     Err = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags,
586                                      &FormerPrimaryCtxIsActive);
587     if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxGetState\n"))
588       return OFFLOAD_FAIL;
589 
590     if (FormerPrimaryCtxIsActive) {
591       DP("The primary context is active, no change to its flags\n");
592       if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) !=
593           CU_CTX_SCHED_BLOCKING_SYNC)
594         DP("Warning the current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n");
595     } else {
596       DP("The primary context is inactive, set its flags to "
597          "CU_CTX_SCHED_BLOCKING_SYNC\n");
598       Err = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC);
599       if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxSetFlags\n"))
600         return OFFLOAD_FAIL;
601     }
602 
603     // Retain the per device primary context and save it to use whenever this
604     // device is selected.
605     Err = cuDevicePrimaryCtxRetain(&DeviceData[DeviceId].Context, Device);
606     if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxRetain\n"))
607       return OFFLOAD_FAIL;
608 
609     Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
610     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
611       return OFFLOAD_FAIL;
612 
613     // Initialize the stream pool.
614     if (!StreamPool[DeviceId])
615       StreamPool[DeviceId] = std::make_unique<StreamPoolTy>(StreamAllocatorTy(),
616                                                             NumInitialStreams);
617 
618     // Initialize the event pool.
619     if (!EventPool[DeviceId])
620       EventPool[DeviceId] =
621           std::make_unique<EventPoolTy>(EventAllocatorTy(), NumInitialEvents);
622 
623     // Query attributes to determine number of threads/block and blocks/grid.
624     int MaxGridDimX;
625     Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
626                                Device);
627     if (Err != CUDA_SUCCESS) {
628       DP("Error getting max grid dimension, use default value %d\n",
629          DeviceRTLTy::DefaultNumTeams);
630       DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::DefaultNumTeams;
631     } else {
632       DP("Using %d CUDA blocks per grid\n", MaxGridDimX);
633       DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX;
634     }
635 
636     // We are only exploiting threads along the x axis.
637     int MaxBlockDimX;
638     Err = cuDeviceGetAttribute(&MaxBlockDimX,
639                                CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device);
640     if (Err != CUDA_SUCCESS) {
641       DP("Error getting max block dimension, use default value %d\n",
642          DeviceRTLTy::DefaultNumThreads);
643       DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads;
644     } else {
645       DP("Using %d CUDA threads per block\n", MaxBlockDimX);
646       DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX;
647 
648       if (EnvTeamThreadLimit > 0 &&
649           DeviceData[DeviceId].ThreadsPerBlock > EnvTeamThreadLimit) {
650         DP("Max CUDA threads per block %d exceeds the thread limit %d set by "
651            "OMP_TEAMS_THREAD_LIMIT, capping at the limit\n",
652            DeviceData[DeviceId].ThreadsPerBlock, EnvTeamThreadLimit);
653         DeviceData[DeviceId].ThreadsPerBlock = EnvTeamThreadLimit;
654       }
655       if (DeviceData[DeviceId].ThreadsPerBlock > DeviceRTLTy::HardThreadLimit) {
656         DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
657            "capping at the hard limit\n",
658            DeviceData[DeviceId].ThreadsPerBlock, DeviceRTLTy::HardThreadLimit);
659         DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit;
660       }
661     }
662 
663     // Get and set warp size
664     int WarpSize;
665     Err =
666         cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device);
667     if (Err != CUDA_SUCCESS) {
668       DP("Error getting warp size, assume default value 32\n");
669       DeviceData[DeviceId].WarpSize = 32;
670     } else {
671       DP("Using warp size %d\n", WarpSize);
672       DeviceData[DeviceId].WarpSize = WarpSize;
673     }
674 
675     // Adjust teams to the env variables
676     if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) {
677       DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
678          EnvTeamLimit);
679       DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit;
680     }
681 
682     size_t StackLimit;
683     size_t HeapLimit;
684     if (const char *EnvStr = getenv("LIBOMPTARGET_STACK_SIZE")) {
685       StackLimit = std::stol(EnvStr);
686       if (cuCtxSetLimit(CU_LIMIT_STACK_SIZE, StackLimit) != CUDA_SUCCESS)
687         return OFFLOAD_FAIL;
688     } else {
689       if (cuCtxGetLimit(&StackLimit, CU_LIMIT_STACK_SIZE) != CUDA_SUCCESS)
690         return OFFLOAD_FAIL;
691     }
692     if (const char *EnvStr = getenv("LIBOMPTARGET_HEAP_SIZE")) {
693       HeapLimit = std::stol(EnvStr);
694       if (cuCtxSetLimit(CU_LIMIT_MALLOC_HEAP_SIZE, HeapLimit) != CUDA_SUCCESS)
695         return OFFLOAD_FAIL;
696     } else {
697       if (cuCtxGetLimit(&HeapLimit, CU_LIMIT_MALLOC_HEAP_SIZE) != CUDA_SUCCESS)
698         return OFFLOAD_FAIL;
699     }
700 
701     INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
702          "Device supports up to %d CUDA blocks and %d threads with a "
703          "warp size of %d\n",
704          DeviceData[DeviceId].BlocksPerGrid,
705          DeviceData[DeviceId].ThreadsPerBlock, DeviceData[DeviceId].WarpSize);
706     INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
707          "Device heap size is %d Bytes, device stack size is %d Bytes per "
708          "thread\n",
709          (int)HeapLimit, (int)StackLimit);
710 
711     // Set default number of teams
712     if (EnvNumTeams > 0) {
713       DP("Default number of teams set according to environment %d\n",
714          EnvNumTeams);
715       DeviceData[DeviceId].NumTeams = EnvNumTeams;
716     } else {
717       DeviceData[DeviceId].NumTeams = DeviceRTLTy::DefaultNumTeams;
718       DP("Default number of teams set according to library's default %d\n",
719          DeviceRTLTy::DefaultNumTeams);
720     }
721 
722     if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) {
723       DP("Default number of teams exceeds device limit, capping at %d\n",
724          DeviceData[DeviceId].BlocksPerGrid);
725       DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid;
726     }
727 
728     // Set default number of threads
729     DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads;
730     DP("Default number of threads set according to library's default %d\n",
731        DeviceRTLTy::DefaultNumThreads);
732     if (DeviceData[DeviceId].NumThreads >
733         DeviceData[DeviceId].ThreadsPerBlock) {
734       DP("Default number of threads exceeds device limit, capping at %d\n",
735          DeviceData[DeviceId].ThreadsPerBlock);
736       DeviceData[DeviceId].NumThreads = DeviceData[DeviceId].ThreadsPerBlock;
737     }
738 
739     return OFFLOAD_SUCCESS;
740   }
741 
742   int deinitDevice(const int DeviceId) {
743     auto IsInitialized = InitializedFlags[DeviceId];
744     if (!IsInitialized)
745       return OFFLOAD_SUCCESS;
746     InitializedFlags[DeviceId] = false;
747 
748     if (UseMemoryManager)
749       MemoryManagers[DeviceId].release();
750 
751     StreamPool[DeviceId].reset();
752     EventPool[DeviceId].reset();
753 
754     DeviceDataTy &D = DeviceData[DeviceId];
755     if (!checkResult(cuCtxSetCurrent(D.Context),
756                      "Error returned from cuCtxSetCurrent\n"))
757       return OFFLOAD_FAIL;
758 
759     // Unload all modules.
760     for (auto &M : Modules[DeviceId])
761       if (!checkResult(cuModuleUnload(M),
762                        "Error returned from cuModuleUnload\n"))
763         return OFFLOAD_FAIL;
764 
765     // Destroy context.
766     CUdevice Device;
767     if (!checkResult(cuCtxGetDevice(&Device),
768                      "Error returned from cuCtxGetDevice\n"))
769       return OFFLOAD_FAIL;
770 
771     if (!checkResult(cuDevicePrimaryCtxRelease(Device),
772                      "Error returned from cuDevicePrimaryCtxRelease\n"))
773       return OFFLOAD_FAIL;
774 
775     return OFFLOAD_SUCCESS;
776   }
777 
778   __tgt_target_table *loadBinary(const int DeviceId,
779                                  const __tgt_device_image *Image) {
780     // Clear the offload table as we are going to create a new one.
781     clearOffloadEntriesTable(DeviceId);
782 
783     // Create the module and extract the function pointers.
784     CUmodule Module;
785     DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart));
786     CUresult Err =
787         cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr);
788     if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n"))
789       return nullptr;
790 
791     DP("CUDA module successfully loaded!\n");
792 
793     Modules[DeviceId].push_back(Module);
794 
795     // Find the symbols in the module by name.
796     const __tgt_offload_entry *HostBegin = Image->EntriesBegin;
797     const __tgt_offload_entry *HostEnd = Image->EntriesEnd;
798 
799     std::list<KernelTy> &KernelsList = DeviceData[DeviceId].KernelsList;
800     for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) {
801       if (!E->addr) {
802         // We return nullptr when something like this happens, the host should
803         // have always something in the address to uniquely identify the target
804         // region.
805         DP("Invalid binary: host entry '<null>' (size = %zd)...\n", E->size);
806         return nullptr;
807       }
808 
809       if (E->size) {
810         __tgt_offload_entry Entry = *E;
811         CUdeviceptr CUPtr;
812         size_t CUSize;
813         Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name);
814         // We keep this style here because we need the name
815         if (Err != CUDA_SUCCESS) {
816           REPORT("Loading global '%s' Failed\n", E->name);
817           CUDA_ERR_STRING(Err);
818           return nullptr;
819         }
820 
821         if (CUSize != E->size) {
822           DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name,
823              CUSize, E->size);
824           return nullptr;
825         }
826 
827         DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
828            DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr));
829 
830         Entry.addr = (void *)(CUPtr);
831 
832         // Note: In the current implementation declare target variables
833         // can either be link or to. This means that once unified
834         // memory is activated via the requires directive, the variable
835         // can be used directly from the host in both cases.
836         // TODO: when variables types other than to or link are added,
837         // the below condition should be changed to explicitly
838         // check for to and link variables types:
839         // (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags &
840         // OMP_DECLARE_TARGET_LINK || e->flags == OMP_DECLARE_TARGET_TO))
841         if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
842           // If unified memory is present any target link or to variables
843           // can access host addresses directly. There is no longer a
844           // need for device copies.
845           cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *));
846           DP("Copy linked variable host address (" DPxMOD
847              ") to device address (" DPxMOD ")\n",
848              DPxPTR(*((void **)E->addr)), DPxPTR(CUPtr));
849         }
850 
851         addOffloadEntry(DeviceId, Entry);
852 
853         continue;
854       }
855 
856       CUfunction Func;
857       Err = cuModuleGetFunction(&Func, Module, E->name);
858       // We keep this style here because we need the name
859       if (Err != CUDA_SUCCESS) {
860         REPORT("Loading '%s' Failed\n", E->name);
861         CUDA_ERR_STRING(Err);
862         return nullptr;
863       }
864 
865       DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
866          DPxPTR(E - HostBegin), E->name, DPxPTR(Func));
867 
868       // default value GENERIC (in case symbol is missing from cubin file)
869       llvm::omp::OMPTgtExecModeFlags ExecModeVal;
870       std::string ExecModeNameStr(E->name);
871       ExecModeNameStr += "_exec_mode";
872       const char *ExecModeName = ExecModeNameStr.c_str();
873 
874       CUdeviceptr ExecModePtr;
875       size_t CUSize;
876       Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName);
877       if (Err == CUDA_SUCCESS) {
878         if (CUSize != sizeof(llvm::omp::OMPTgtExecModeFlags)) {
879           DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
880              ExecModeName, CUSize, sizeof(llvm::omp::OMPTgtExecModeFlags));
881           return nullptr;
882         }
883 
884         Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize);
885         if (Err != CUDA_SUCCESS) {
886           REPORT("Error when copying data from device to host. Pointers: "
887                  "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
888                  DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize);
889           CUDA_ERR_STRING(Err);
890           return nullptr;
891         }
892       } else {
893         DP("Loading global exec_mode '%s' - symbol missing, using default "
894            "value GENERIC (1)\n",
895            ExecModeName);
896       }
897 
898       KernelsList.emplace_back(Func, ExecModeVal);
899 
900       __tgt_offload_entry Entry = *E;
901       Entry.addr = &KernelsList.back();
902       addOffloadEntry(DeviceId, Entry);
903     }
904 
905     // send device environment data to the device
906     {
907       // TODO: The device ID used here is not the real device ID used by OpenMP.
908       DeviceEnvironmentTy DeviceEnv{0, static_cast<uint32_t>(NumberOfDevices),
909                                     static_cast<uint32_t>(DeviceId),
910                                     static_cast<uint32_t>(DynamicMemorySize)};
911 
912       if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG"))
913         DeviceEnv.DebugKind = std::stoi(EnvStr);
914 
915       const char *DeviceEnvName = "omptarget_device_environment";
916       CUdeviceptr DeviceEnvPtr;
917       size_t CUSize;
918 
919       Err = cuModuleGetGlobal(&DeviceEnvPtr, &CUSize, Module, DeviceEnvName);
920       if (Err == CUDA_SUCCESS) {
921         if (CUSize != sizeof(DeviceEnv)) {
922           REPORT(
923               "Global device_environment '%s' - size mismatch (%zu != %zu)\n",
924               DeviceEnvName, CUSize, sizeof(int32_t));
925           CUDA_ERR_STRING(Err);
926           return nullptr;
927         }
928 
929         Err = cuMemcpyHtoD(DeviceEnvPtr, &DeviceEnv, CUSize);
930         if (Err != CUDA_SUCCESS) {
931           REPORT("Error when copying data from host to device. Pointers: "
932                  "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
933                  DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize);
934           CUDA_ERR_STRING(Err);
935           return nullptr;
936         }
937 
938         DP("Sending global device environment data %zu bytes\n", CUSize);
939       } else {
940         DP("Finding global device environment '%s' - symbol missing.\n",
941            DeviceEnvName);
942         DP("Continue, considering this is a device RTL which does not accept "
943            "environment setting.\n");
944       }
945     }
946 
947     return getOffloadEntriesTable(DeviceId);
948   }
949 
950   void *dataAlloc(const int DeviceId, const int64_t Size,
951                   const TargetAllocTy Kind) {
952     switch (Kind) {
953     case TARGET_ALLOC_DEFAULT:
954     case TARGET_ALLOC_DEVICE:
955       if (UseMemoryManager)
956         return MemoryManagers[DeviceId]->allocate(Size, nullptr);
957       else
958         return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
959     case TARGET_ALLOC_HOST:
960     case TARGET_ALLOC_SHARED:
961       return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
962     }
963 
964     REPORT("Invalid target data allocation kind or requested allocator not "
965            "implemented yet\n");
966 
967     return nullptr;
968   }
969 
970   int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr,
971                  const int64_t Size, __tgt_async_info *AsyncInfo) const {
972     assert(AsyncInfo && "AsyncInfo is nullptr");
973 
974     CUstream Stream = getStream(DeviceId, AsyncInfo);
975     CUresult Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream);
976     if (Err != CUDA_SUCCESS) {
977       DP("Error when copying data from host to device. Pointers: host "
978          "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
979          DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
980       CUDA_ERR_STRING(Err);
981       return OFFLOAD_FAIL;
982     }
983 
984     return OFFLOAD_SUCCESS;
985   }
986 
987   int dataRetrieve(const int DeviceId, void *HstPtr, const void *TgtPtr,
988                    const int64_t Size, __tgt_async_info *AsyncInfo) const {
989     assert(AsyncInfo && "AsyncInfo is nullptr");
990 
991     CUstream Stream = getStream(DeviceId, AsyncInfo);
992     CUresult Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
993     if (Err != CUDA_SUCCESS) {
994       DP("Error when copying data from device to host. Pointers: host "
995          "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
996          DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
997       CUDA_ERR_STRING(Err);
998       return OFFLOAD_FAIL;
999     }
1000 
1001     return OFFLOAD_SUCCESS;
1002   }
1003 
1004   int dataExchange(int SrcDevId, const void *SrcPtr, int DstDevId, void *DstPtr,
1005                    int64_t Size, __tgt_async_info *AsyncInfo) const {
1006     assert(AsyncInfo && "AsyncInfo is nullptr");
1007 
1008     CUresult Err;
1009     CUstream Stream = getStream(SrcDevId, AsyncInfo);
1010 
1011     // If they are two devices, we try peer to peer copy first
1012     if (SrcDevId != DstDevId) {
1013       int CanAccessPeer = 0;
1014       Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId);
1015       if (Err != CUDA_SUCCESS) {
1016         REPORT("Error returned from cuDeviceCanAccessPeer. src = %" PRId32
1017                ", dst = %" PRId32 "\n",
1018                SrcDevId, DstDevId);
1019         CUDA_ERR_STRING(Err);
1020         return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
1021       }
1022 
1023       if (!CanAccessPeer) {
1024         DP("P2P memcpy not supported so fall back to D2D memcpy");
1025         return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
1026       }
1027 
1028       Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0);
1029       if (Err != CUDA_SUCCESS) {
1030         REPORT("Error returned from cuCtxEnablePeerAccess. src = %" PRId32
1031                ", dst = %" PRId32 "\n",
1032                SrcDevId, DstDevId);
1033         CUDA_ERR_STRING(Err);
1034         return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
1035       }
1036 
1037       Err = cuMemcpyPeerAsync((CUdeviceptr)DstPtr, DeviceData[DstDevId].Context,
1038                               (CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context,
1039                               Size, Stream);
1040       if (Err == CUDA_SUCCESS)
1041         return OFFLOAD_SUCCESS;
1042 
1043       DP("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD
1044          ", src_id =%" PRId32 ", dst_ptr = " DPxMOD ", dst_id =%" PRId32 "\n",
1045          DPxPTR(SrcPtr), SrcDevId, DPxPTR(DstPtr), DstDevId);
1046       CUDA_ERR_STRING(Err);
1047     }
1048 
1049     return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
1050   }
1051 
1052   int dataDelete(const int DeviceId, void *TgtPtr) {
1053     if (UseMemoryManager)
1054       return MemoryManagers[DeviceId]->free(TgtPtr);
1055 
1056     return DeviceAllocators[DeviceId].free(TgtPtr);
1057   }
1058 
1059   int runTargetTeamRegion(const int DeviceId, void *TgtEntryPtr, void **TgtArgs,
1060                           ptrdiff_t *TgtOffsets, const int ArgNum,
1061                           const int TeamNum, const int ThreadLimit,
1062                           const unsigned int LoopTripCount,
1063                           __tgt_async_info *AsyncInfo) const {
1064     // All args are references.
1065     std::vector<void *> Args(ArgNum);
1066     std::vector<void *> Ptrs(ArgNum);
1067 
1068     for (int I = 0; I < ArgNum; ++I) {
1069       Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
1070       Args[I] = &Ptrs[I];
1071     }
1072 
1073     KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr);
1074 
1075     const bool IsSPMDGenericMode =
1076         KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD;
1077     const bool IsSPMDMode =
1078         KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_SPMD;
1079     const bool IsGenericMode =
1080         KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC;
1081 
1082     int CudaThreadsPerBlock;
1083     if (ThreadLimit > 0) {
1084       DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
1085       CudaThreadsPerBlock = ThreadLimit;
1086       // Add master warp if necessary
1087       if (IsGenericMode) {
1088         DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize);
1089         CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
1090       }
1091     } else {
1092       DP("Setting CUDA threads per block to default %d\n",
1093          DeviceData[DeviceId].NumThreads);
1094       CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads;
1095     }
1096 
1097     if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) {
1098       DP("Threads per block capped at device limit %d\n",
1099          DeviceData[DeviceId].ThreadsPerBlock);
1100       CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock;
1101     }
1102 
1103     CUresult Err;
1104     if (!KernelInfo->MaxThreadsPerBlock) {
1105       Err = cuFuncGetAttribute(&KernelInfo->MaxThreadsPerBlock,
1106                                CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
1107                                KernelInfo->Func);
1108       if (!checkResult(Err, "Error returned from cuFuncGetAttribute\n"))
1109         return OFFLOAD_FAIL;
1110     }
1111 
1112     if (KernelInfo->MaxThreadsPerBlock < CudaThreadsPerBlock) {
1113       DP("Threads per block capped at kernel limit %d\n",
1114          KernelInfo->MaxThreadsPerBlock);
1115       CudaThreadsPerBlock = KernelInfo->MaxThreadsPerBlock;
1116     }
1117 
1118     unsigned int CudaBlocksPerGrid;
1119     if (TeamNum <= 0) {
1120       if (LoopTripCount > 0 && EnvNumTeams < 0) {
1121         if (IsSPMDGenericMode) {
1122           // If we reach this point, then we are executing a kernel that was
1123           // transformed from Generic-mode to SPMD-mode. This kernel has
1124           // SPMD-mode execution, but needs its blocks to be scheduled
1125           // differently because the current loop trip count only applies to the
1126           // `teams distribute` region and will create var too few blocks using
1127           // the regular SPMD-mode method.
1128           CudaBlocksPerGrid = LoopTripCount;
1129         } else if (IsSPMDMode) {
1130           // We have a combined construct, i.e. `target teams distribute
1131           // parallel for [simd]`. We launch so many teams so that each thread
1132           // will execute one iteration of the loop. round up to the nearest
1133           // integer
1134           CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1;
1135         } else if (IsGenericMode) {
1136           // If we reach this point, then we have a non-combined construct, i.e.
1137           // `teams distribute` with a nested `parallel for` and each team is
1138           // assigned one iteration of the `distribute` loop. E.g.:
1139           //
1140           // #pragma omp target teams distribute
1141           // for(...loop_tripcount...) {
1142           //   #pragma omp parallel for
1143           //   for(...) {}
1144           // }
1145           //
1146           // Threads within a team will execute the iterations of the `parallel`
1147           // loop.
1148           CudaBlocksPerGrid = LoopTripCount;
1149         } else {
1150           REPORT("Unknown execution mode: %d\n",
1151                  static_cast<int8_t>(KernelInfo->ExecutionMode));
1152           return OFFLOAD_FAIL;
1153         }
1154         DP("Using %d teams due to loop trip count %" PRIu32
1155            " and number of threads per block %d\n",
1156            CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock);
1157       } else {
1158         DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams);
1159         CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams;
1160       }
1161     } else {
1162       DP("Using requested number of teams %d\n", TeamNum);
1163       CudaBlocksPerGrid = TeamNum;
1164     }
1165 
1166     if (CudaBlocksPerGrid > DeviceData[DeviceId].BlocksPerGrid) {
1167       DP("Capping number of teams to team limit %d\n",
1168          DeviceData[DeviceId].BlocksPerGrid);
1169       CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid;
1170     }
1171 
1172     INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
1173          "Launching kernel %s with %d blocks and %d threads in %s mode\n",
1174          (getOffloadEntry(DeviceId, TgtEntryPtr))
1175              ? getOffloadEntry(DeviceId, TgtEntryPtr)->name
1176              : "(null)",
1177          CudaBlocksPerGrid, CudaThreadsPerBlock,
1178          (!IsSPMDMode ? (IsGenericMode ? "Generic" : "SPMD-Generic") : "SPMD"));
1179 
1180     CUstream Stream = getStream(DeviceId, AsyncInfo);
1181     Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1,
1182                          /* gridDimZ */ 1, CudaThreadsPerBlock,
1183                          /* blockDimY */ 1, /* blockDimZ */ 1,
1184                          DynamicMemorySize, Stream, &Args[0], nullptr);
1185     if (!checkResult(Err, "Error returned from cuLaunchKernel\n"))
1186       return OFFLOAD_FAIL;
1187 
1188     DP("Launch of entry point at " DPxMOD " successful!\n",
1189        DPxPTR(TgtEntryPtr));
1190 
1191     return OFFLOAD_SUCCESS;
1192   }
1193 
1194   int synchronize(const int DeviceId, __tgt_async_info *AsyncInfo) const {
1195     CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue);
1196     CUresult Err = cuStreamSynchronize(Stream);
1197 
1198     // Once the stream is synchronized, return it to stream pool and reset
1199     // AsyncInfo. This is to make sure the synchronization only works for its
1200     // own tasks.
1201     StreamPool[DeviceId]->release(reinterpret_cast<CUstream>(AsyncInfo->Queue));
1202     AsyncInfo->Queue = nullptr;
1203 
1204     if (Err != CUDA_SUCCESS) {
1205       DP("Error when synchronizing stream. stream = " DPxMOD
1206          ", async info ptr = " DPxMOD "\n",
1207          DPxPTR(Stream), DPxPTR(AsyncInfo));
1208       CUDA_ERR_STRING(Err);
1209     }
1210     return (Err == CUDA_SUCCESS) ? OFFLOAD_SUCCESS : OFFLOAD_FAIL;
1211   }
1212 
1213   void printDeviceInfo(int32_t device_id) {
1214     char TmpChar[1000];
1215     std::string TmpStr;
1216     size_t TmpSt;
1217     int TmpInt, TmpInt2, TmpInt3;
1218 
1219     CUdevice Device;
1220     checkResult(cuDeviceGet(&Device, device_id),
1221                 "Error returned from cuCtxGetDevice\n");
1222 
1223     cuDriverGetVersion(&TmpInt);
1224     printf("    CUDA Driver Version: \t\t%d \n", TmpInt);
1225     printf("    CUDA Device Number: \t\t%d \n", device_id);
1226     checkResult(cuDeviceGetName(TmpChar, 1000, Device),
1227                 "Error returned from cuDeviceGetName\n");
1228     printf("    Device Name: \t\t\t%s \n", TmpChar);
1229     checkResult(cuDeviceTotalMem(&TmpSt, Device),
1230                 "Error returned from cuDeviceTotalMem\n");
1231     printf("    Global Memory Size: \t\t%zu bytes \n", TmpSt);
1232     checkResult(cuDeviceGetAttribute(
1233                     &TmpInt, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, Device),
1234                 "Error returned from cuDeviceGetAttribute\n");
1235     printf("    Number of Multiprocessors: \t\t%d \n", TmpInt);
1236     checkResult(
1237         cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, Device),
1238         "Error returned from cuDeviceGetAttribute\n");
1239     printf("    Concurrent Copy and Execution: \t%s \n", BOOL2TEXT(TmpInt));
1240     checkResult(cuDeviceGetAttribute(
1241                     &TmpInt, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, Device),
1242                 "Error returned from cuDeviceGetAttribute\n");
1243     printf("    Total Constant Memory: \t\t%d bytes\n", TmpInt);
1244     checkResult(
1245         cuDeviceGetAttribute(
1246             &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, Device),
1247         "Error returned from cuDeviceGetAttribute\n");
1248     printf("    Max Shared Memory per Block: \t%d bytes \n", TmpInt);
1249     checkResult(
1250         cuDeviceGetAttribute(
1251             &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, Device),
1252         "Error returned from cuDeviceGetAttribute\n");
1253     printf("    Registers per Block: \t\t%d \n", TmpInt);
1254     checkResult(
1255         cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device),
1256         "Error returned from cuDeviceGetAttribute\n");
1257     printf("    Warp Size: \t\t\t\t%d Threads \n", TmpInt);
1258     checkResult(cuDeviceGetAttribute(
1259                     &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, Device),
1260                 "Error returned from cuDeviceGetAttribute\n");
1261     printf("    Maximum Threads per Block: \t\t%d \n", TmpInt);
1262     checkResult(cuDeviceGetAttribute(
1263                     &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device),
1264                 "Error returned from cuDeviceGetAttribute\n");
1265     checkResult(cuDeviceGetAttribute(
1266                     &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, Device),
1267                 "Error returned from cuDeviceGetAttribute\n");
1268     checkResult(cuDeviceGetAttribute(
1269                     &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, Device),
1270                 "Error returned from cuDeviceGetAttribute\n");
1271     printf("    Maximum Block Dimensions: \t\t%d, %d, %d \n", TmpInt, TmpInt2,
1272            TmpInt3);
1273     checkResult(cuDeviceGetAttribute(
1274                     &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, Device),
1275                 "Error returned from cuDeviceGetAttribute\n");
1276     checkResult(cuDeviceGetAttribute(
1277                     &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, Device),
1278                 "Error returned from cuDeviceGetAttribute\n");
1279     checkResult(cuDeviceGetAttribute(
1280                     &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, Device),
1281                 "Error returned from cuDeviceGetAttribute\n");
1282     printf("    Maximum Grid Dimensions: \t\t%d x %d x %d \n", TmpInt, TmpInt2,
1283            TmpInt3);
1284     checkResult(
1285         cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_PITCH, Device),
1286         "Error returned from cuDeviceGetAttribute\n");
1287     printf("    Maximum Memory Pitch: \t\t%d bytes \n", TmpInt);
1288     checkResult(cuDeviceGetAttribute(
1289                     &TmpInt, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, Device),
1290                 "Error returned from cuDeviceGetAttribute\n");
1291     printf("    Texture Alignment: \t\t\t%d bytes \n", TmpInt);
1292     checkResult(
1293         cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, Device),
1294         "Error returned from cuDeviceGetAttribute\n");
1295     printf("    Clock Rate: \t\t\t%d kHz\n", TmpInt);
1296     checkResult(cuDeviceGetAttribute(
1297                     &TmpInt, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, Device),
1298                 "Error returned from cuDeviceGetAttribute\n");
1299     printf("    Execution Timeout: \t\t\t%s \n", BOOL2TEXT(TmpInt));
1300     checkResult(
1301         cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_INTEGRATED, Device),
1302         "Error returned from cuDeviceGetAttribute\n");
1303     printf("    Integrated Device: \t\t\t%s \n", BOOL2TEXT(TmpInt));
1304     checkResult(cuDeviceGetAttribute(
1305                     &TmpInt, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, Device),
1306                 "Error returned from cuDeviceGetAttribute\n");
1307     printf("    Can Map Host Memory: \t\t%s \n", BOOL2TEXT(TmpInt));
1308     checkResult(
1309         cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, Device),
1310         "Error returned from cuDeviceGetAttribute\n");
1311     if (TmpInt == CU_COMPUTEMODE_DEFAULT)
1312       TmpStr = "DEFAULT";
1313     else if (TmpInt == CU_COMPUTEMODE_PROHIBITED)
1314       TmpStr = "PROHIBITED";
1315     else if (TmpInt == CU_COMPUTEMODE_EXCLUSIVE_PROCESS)
1316       TmpStr = "EXCLUSIVE PROCESS";
1317     else
1318       TmpStr = "unknown";
1319     printf("    Compute Mode: \t\t\t%s \n", TmpStr.c_str());
1320     checkResult(cuDeviceGetAttribute(
1321                     &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, Device),
1322                 "Error returned from cuDeviceGetAttribute\n");
1323     printf("    Concurrent Kernels: \t\t%s \n", BOOL2TEXT(TmpInt));
1324     checkResult(
1325         cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, Device),
1326         "Error returned from cuDeviceGetAttribute\n");
1327     printf("    ECC Enabled: \t\t\t%s \n", BOOL2TEXT(TmpInt));
1328     checkResult(cuDeviceGetAttribute(
1329                     &TmpInt, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, Device),
1330                 "Error returned from cuDeviceGetAttribute\n");
1331     printf("    Memory Clock Rate: \t\t\t%d kHz\n", TmpInt);
1332     checkResult(
1333         cuDeviceGetAttribute(
1334             &TmpInt, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, Device),
1335         "Error returned from cuDeviceGetAttribute\n");
1336     printf("    Memory Bus Width: \t\t\t%d bits\n", TmpInt);
1337     checkResult(cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE,
1338                                      Device),
1339                 "Error returned from cuDeviceGetAttribute\n");
1340     printf("    L2 Cache Size: \t\t\t%d bytes \n", TmpInt);
1341     checkResult(cuDeviceGetAttribute(
1342                     &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
1343                     Device),
1344                 "Error returned from cuDeviceGetAttribute\n");
1345     printf("    Max Threads Per SMP: \t\t%d \n", TmpInt);
1346     checkResult(cuDeviceGetAttribute(
1347                     &TmpInt, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, Device),
1348                 "Error returned from cuDeviceGetAttribute\n");
1349     printf("    Async Engines: \t\t\t%s (%d) \n", BOOL2TEXT(TmpInt), TmpInt);
1350     checkResult(cuDeviceGetAttribute(
1351                     &TmpInt, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, Device),
1352                 "Error returned from cuDeviceGetAttribute\n");
1353     printf("    Unified Addressing: \t\t%s \n", BOOL2TEXT(TmpInt));
1354     checkResult(cuDeviceGetAttribute(
1355                     &TmpInt, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, Device),
1356                 "Error returned from cuDeviceGetAttribute\n");
1357     printf("    Managed Memory: \t\t\t%s \n", BOOL2TEXT(TmpInt));
1358     checkResult(
1359         cuDeviceGetAttribute(
1360             &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, Device),
1361         "Error returned from cuDeviceGetAttribute\n");
1362     printf("    Concurrent Managed Memory: \t\t%s \n", BOOL2TEXT(TmpInt));
1363     checkResult(
1364         cuDeviceGetAttribute(
1365             &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, Device),
1366         "Error returned from cuDeviceGetAttribute\n");
1367     printf("    Preemption Supported: \t\t%s \n", BOOL2TEXT(TmpInt));
1368     checkResult(cuDeviceGetAttribute(
1369                     &TmpInt, CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, Device),
1370                 "Error returned from cuDeviceGetAttribute\n");
1371     printf("    Cooperative Launch: \t\t%s \n", BOOL2TEXT(TmpInt));
1372     checkResult(cuDeviceGetAttribute(
1373                     &TmpInt, CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD, Device),
1374                 "Error returned from cuDeviceGetAttribute\n");
1375     printf("    Multi-Device Boars: \t\t%s \n", BOOL2TEXT(TmpInt));
1376     checkResult(
1377         cuDeviceGetAttribute(
1378             &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, Device),
1379         "Error returned from cuDeviceGetAttribute\n");
1380     checkResult(
1381         cuDeviceGetAttribute(
1382             &TmpInt2, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, Device),
1383         "Error returned from cuDeviceGetAttribute\n");
1384     printf("    Compute Capabilities: \t\t%d%d \n", TmpInt, TmpInt2);
1385   }
1386 
1387   int createEvent(int DeviceId, void **P) {
1388     CUevent Event = nullptr;
1389     if (EventPool[DeviceId]->acquire(Event) != OFFLOAD_SUCCESS)
1390       return OFFLOAD_FAIL;
1391     *P = Event;
1392     return OFFLOAD_SUCCESS;
1393   }
1394 
1395   int destroyEvent(int DeviceId, void *EventPtr) {
1396     EventPool[DeviceId]->release(reinterpret_cast<CUevent>(EventPtr));
1397     return OFFLOAD_SUCCESS;
1398   }
1399 
1400   int waitEvent(const int DeviceId, __tgt_async_info *AsyncInfo,
1401                 void *EventPtr) const {
1402     CUstream Stream = getStream(DeviceId, AsyncInfo);
1403     CUevent Event = reinterpret_cast<CUevent>(EventPtr);
1404 
1405     // We don't use CU_EVENT_WAIT_DEFAULT here as it is only available from
1406     // specific CUDA version, and defined as 0x0. In previous version, per CUDA
1407     // API document, that argument has to be 0x0.
1408     CUresult Err = cuStreamWaitEvent(Stream, Event, 0);
1409     if (Err != CUDA_SUCCESS) {
1410       DP("Error when waiting event. stream = " DPxMOD ", event = " DPxMOD "\n",
1411          DPxPTR(Stream), DPxPTR(Event));
1412       CUDA_ERR_STRING(Err);
1413       return OFFLOAD_FAIL;
1414     }
1415 
1416     return OFFLOAD_SUCCESS;
1417   }
1418 
1419   int releaseAsyncInfo(int DeviceId, __tgt_async_info *AsyncInfo) const {
1420     if (AsyncInfo->Queue) {
1421       StreamPool[DeviceId]->release(
1422           reinterpret_cast<CUstream>(AsyncInfo->Queue));
1423       AsyncInfo->Queue = nullptr;
1424     }
1425 
1426     return OFFLOAD_SUCCESS;
1427   }
1428 
1429   int initAsyncInfo(int DeviceId, __tgt_async_info **AsyncInfo) const {
1430     *AsyncInfo = new __tgt_async_info;
1431     getStream(DeviceId, *AsyncInfo);
1432     return OFFLOAD_SUCCESS;
1433   }
1434 
1435   int initDeviceInfo(int DeviceId, __tgt_device_info *DeviceInfo,
1436                      const char **ErrStr) const {
1437     assert(DeviceInfo && "DeviceInfo is nullptr");
1438 
1439     if (!DeviceInfo->Context)
1440       DeviceInfo->Context = DeviceData[DeviceId].Context;
1441     if (!DeviceInfo->Device) {
1442       CUdevice Dev;
1443       CUresult Err = cuDeviceGet(&Dev, DeviceId);
1444       if (Err == CUDA_SUCCESS) {
1445         DeviceInfo->Device = reinterpret_cast<void *>(Dev);
1446       } else {
1447         cuGetErrorString(Err, ErrStr);
1448         return OFFLOAD_FAIL;
1449       }
1450     }
1451     return OFFLOAD_SUCCESS;
1452   }
1453 
1454   int setContext(int DeviceId) {
1455     assert(InitializedFlags[DeviceId] && "Device is not initialized");
1456 
1457     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
1458     if (!checkResult(Err, "error returned from cuCtxSetCurrent"))
1459       return OFFLOAD_FAIL;
1460 
1461     return OFFLOAD_SUCCESS;
1462   }
1463 };
1464 
1465 DeviceRTLTy DeviceRTL;
1466 } // namespace
1467 
1468 // Exposed library API function
1469 #ifdef __cplusplus
1470 extern "C" {
1471 #endif
1472 
1473 int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
1474   return elf_check_machine(image, /* EM_CUDA */ 190);
1475 }
1476 
1477 int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); }
1478 
1479 int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
1480   DP("Init requires flags to %" PRId64 "\n", RequiresFlags);
1481   DeviceRTL.setRequiresFlag(RequiresFlags);
1482   return RequiresFlags;
1483 }
1484 
1485 int32_t __tgt_rtl_is_data_exchangable(int32_t src_dev_id, int dst_dev_id) {
1486   if (DeviceRTL.isValidDeviceId(src_dev_id) &&
1487       DeviceRTL.isValidDeviceId(dst_dev_id))
1488     return 1;
1489 
1490   return 0;
1491 }
1492 
1493 int32_t __tgt_rtl_init_device(int32_t device_id) {
1494   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1495   // Context is set when init the device.
1496 
1497   return DeviceRTL.initDevice(device_id);
1498 }
1499 
1500 int32_t __tgt_rtl_deinit_device(int32_t device_id) {
1501   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1502   // Context is set when deinit the device.
1503 
1504   return DeviceRTL.deinitDevice(device_id);
1505 }
1506 
1507 __tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
1508                                           __tgt_device_image *image) {
1509   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1510 
1511   if (DeviceRTL.setContext(device_id) != OFFLOAD_SUCCESS)
1512     return nullptr;
1513 
1514   return DeviceRTL.loadBinary(device_id, image);
1515 }
1516 
1517 void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *,
1518                            int32_t kind) {
1519   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1520 
1521   if (DeviceRTL.setContext(device_id) != OFFLOAD_SUCCESS)
1522     return nullptr;
1523 
1524   return DeviceRTL.dataAlloc(device_id, size, (TargetAllocTy)kind);
1525 }
1526 
1527 int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
1528                               int64_t size) {
1529   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1530   // Context is set in __tgt_rtl_data_submit_async.
1531 
1532   __tgt_async_info AsyncInfo;
1533   const int32_t rc = __tgt_rtl_data_submit_async(device_id, tgt_ptr, hst_ptr,
1534                                                  size, &AsyncInfo);
1535   if (rc != OFFLOAD_SUCCESS)
1536     return OFFLOAD_FAIL;
1537 
1538   return __tgt_rtl_synchronize(device_id, &AsyncInfo);
1539 }
1540 
1541 int32_t __tgt_rtl_data_submit_async(int32_t device_id, void *tgt_ptr,
1542                                     void *hst_ptr, int64_t size,
1543                                     __tgt_async_info *async_info_ptr) {
1544   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1545   assert(async_info_ptr && "async_info_ptr is nullptr");
1546 
1547   if (DeviceRTL.setContext(device_id) != OFFLOAD_SUCCESS)
1548     return OFFLOAD_FAIL;
1549 
1550   return DeviceRTL.dataSubmit(device_id, tgt_ptr, hst_ptr, size,
1551                               async_info_ptr);
1552 }
1553 
1554 int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
1555                                 int64_t size) {
1556   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1557   // Context is set in __tgt_rtl_data_retrieve_async.
1558 
1559   __tgt_async_info AsyncInfo;
1560   const int32_t rc = __tgt_rtl_data_retrieve_async(device_id, hst_ptr, tgt_ptr,
1561                                                    size, &AsyncInfo);
1562   if (rc != OFFLOAD_SUCCESS)
1563     return OFFLOAD_FAIL;
1564 
1565   return __tgt_rtl_synchronize(device_id, &AsyncInfo);
1566 }
1567 
1568 int32_t __tgt_rtl_data_retrieve_async(int32_t device_id, void *hst_ptr,
1569                                       void *tgt_ptr, int64_t size,
1570                                       __tgt_async_info *async_info_ptr) {
1571   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1572   assert(async_info_ptr && "async_info_ptr is nullptr");
1573 
1574   if (DeviceRTL.setContext(device_id) != OFFLOAD_SUCCESS)
1575     return OFFLOAD_FAIL;
1576 
1577   return DeviceRTL.dataRetrieve(device_id, hst_ptr, tgt_ptr, size,
1578                                 async_info_ptr);
1579 }
1580 
1581 int32_t __tgt_rtl_data_exchange_async(int32_t src_dev_id, void *src_ptr,
1582                                       int dst_dev_id, void *dst_ptr,
1583                                       int64_t size,
1584                                       __tgt_async_info *AsyncInfo) {
1585   assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
1586   assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");
1587   assert(AsyncInfo && "AsyncInfo is nullptr");
1588   // NOTE: We don't need to set context for data exchange as the device contexts
1589   // are passed to CUDA function directly.
1590   return DeviceRTL.dataExchange(src_dev_id, src_ptr, dst_dev_id, dst_ptr, size,
1591                                 AsyncInfo);
1592 }
1593 
1594 int32_t __tgt_rtl_data_exchange(int32_t src_dev_id, void *src_ptr,
1595                                 int32_t dst_dev_id, void *dst_ptr,
1596                                 int64_t size) {
1597   assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
1598   assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");
1599   // Context is set in __tgt_rtl_data_exchange_async.
1600 
1601   __tgt_async_info AsyncInfo;
1602   const int32_t rc = __tgt_rtl_data_exchange_async(
1603       src_dev_id, src_ptr, dst_dev_id, dst_ptr, size, &AsyncInfo);
1604   if (rc != OFFLOAD_SUCCESS)
1605     return OFFLOAD_FAIL;
1606 
1607   return __tgt_rtl_synchronize(src_dev_id, &AsyncInfo);
1608 }
1609 
1610 int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
1611   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1612 
1613   if (DeviceRTL.setContext(device_id) != OFFLOAD_SUCCESS)
1614     return OFFLOAD_FAIL;
1615 
1616   return DeviceRTL.dataDelete(device_id, tgt_ptr);
1617 }
1618 
1619 int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
1620                                          void **tgt_args,
1621                                          ptrdiff_t *tgt_offsets,
1622                                          int32_t arg_num, int32_t team_num,
1623                                          int32_t thread_limit,
1624                                          uint64_t loop_tripcount) {
1625   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1626   // Context is set in __tgt_rtl_run_target_team_region_async.
1627 
1628   __tgt_async_info AsyncInfo;
1629   const int32_t rc = __tgt_rtl_run_target_team_region_async(
1630       device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
1631       thread_limit, loop_tripcount, &AsyncInfo);
1632   if (rc != OFFLOAD_SUCCESS)
1633     return OFFLOAD_FAIL;
1634 
1635   return __tgt_rtl_synchronize(device_id, &AsyncInfo);
1636 }
1637 
1638 int32_t __tgt_rtl_run_target_team_region_async(
1639     int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
1640     ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
1641     int32_t thread_limit, uint64_t loop_tripcount,
1642     __tgt_async_info *async_info_ptr) {
1643   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1644 
1645   if (DeviceRTL.setContext(device_id) != OFFLOAD_SUCCESS)
1646     return OFFLOAD_FAIL;
1647 
1648   return DeviceRTL.runTargetTeamRegion(
1649       device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
1650       thread_limit, loop_tripcount, async_info_ptr);
1651 }
1652 
1653 int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
1654                                     void **tgt_args, ptrdiff_t *tgt_offsets,
1655                                     int32_t arg_num) {
1656   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1657   // Context is set in __tgt_rtl_run_target_region_async.
1658 
1659   __tgt_async_info AsyncInfo;
1660   const int32_t rc = __tgt_rtl_run_target_region_async(
1661       device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, &AsyncInfo);
1662   if (rc != OFFLOAD_SUCCESS)
1663     return OFFLOAD_FAIL;
1664 
1665   return __tgt_rtl_synchronize(device_id, &AsyncInfo);
1666 }
1667 
1668 int32_t __tgt_rtl_run_target_region_async(int32_t device_id,
1669                                           void *tgt_entry_ptr, void **tgt_args,
1670                                           ptrdiff_t *tgt_offsets,
1671                                           int32_t arg_num,
1672                                           __tgt_async_info *async_info_ptr) {
1673   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1674   // Context is set in __tgt_rtl_run_target_team_region_async.
1675   return __tgt_rtl_run_target_team_region_async(
1676       device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num,
1677       /* team num*/ 1, /* thread_limit */ 1, /* loop_tripcount */ 0,
1678       async_info_ptr);
1679 }
1680 
1681 int32_t __tgt_rtl_synchronize(int32_t device_id,
1682                               __tgt_async_info *async_info_ptr) {
1683   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1684   assert(async_info_ptr && "async_info_ptr is nullptr");
1685   assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr");
1686   // NOTE: We don't need to set context for stream sync.
1687   return DeviceRTL.synchronize(device_id, async_info_ptr);
1688 }
1689 
1690 void __tgt_rtl_set_info_flag(uint32_t NewInfoLevel) {
1691   std::atomic<uint32_t> &InfoLevel = getInfoLevelInternal();
1692   InfoLevel.store(NewInfoLevel);
1693 }
1694 
1695 void __tgt_rtl_print_device_info(int32_t device_id) {
1696   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1697   // NOTE: We don't need to set context for print device info.
1698   DeviceRTL.printDeviceInfo(device_id);
1699 }
1700 
1701 int32_t __tgt_rtl_create_event(int32_t device_id, void **event) {
1702   assert(event && "event is nullptr");
1703 
1704   if (DeviceRTL.setContext(device_id) != OFFLOAD_SUCCESS)
1705     return OFFLOAD_FAIL;
1706 
1707   return DeviceRTL.createEvent(device_id, event);
1708 }
1709 
1710 int32_t __tgt_rtl_record_event(int32_t device_id, void *event_ptr,
1711                                __tgt_async_info *async_info_ptr) {
1712   assert(async_info_ptr && "async_info_ptr is nullptr");
1713   assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr");
1714   assert(event_ptr && "event_ptr is nullptr");
1715   // NOTE: We might not need to set context for event record.
1716   return recordEvent(event_ptr, async_info_ptr);
1717 }
1718 
1719 int32_t __tgt_rtl_wait_event(int32_t device_id, void *event_ptr,
1720                              __tgt_async_info *async_info_ptr) {
1721   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1722   assert(async_info_ptr && "async_info_ptr is nullptr");
1723   assert(event_ptr && "event is nullptr");
1724   // NOTE: We might not need to set context for event sync.
1725   return DeviceRTL.waitEvent(device_id, async_info_ptr, event_ptr);
1726 }
1727 
1728 int32_t __tgt_rtl_sync_event(int32_t device_id, void *event_ptr) {
1729   assert(event_ptr && "event is nullptr");
1730   // NOTE: We might not need to set context for event sync.
1731   return syncEvent(event_ptr);
1732 }
1733 
1734 int32_t __tgt_rtl_destroy_event(int32_t device_id, void *event_ptr) {
1735   assert(event_ptr && "event is nullptr");
1736 
1737   if (DeviceRTL.setContext(device_id) != OFFLOAD_SUCCESS)
1738     return OFFLOAD_FAIL;
1739 
1740   return DeviceRTL.destroyEvent(device_id, event_ptr);
1741 }
1742 
1743 int32_t __tgt_rtl_release_async_info(int32_t device_id,
1744                                      __tgt_async_info *async_info) {
1745   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1746   assert(async_info && "async_info is nullptr");
1747 
1748   if (DeviceRTL.setContext(device_id) != OFFLOAD_SUCCESS)
1749     return OFFLOAD_FAIL;
1750 
1751   return DeviceRTL.releaseAsyncInfo(device_id, async_info);
1752 }
1753 
1754 int32_t __tgt_rtl_init_async_info(int32_t device_id,
1755                                   __tgt_async_info **async_info) {
1756   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1757   assert(async_info && "async_info is nullptr");
1758 
1759   if (DeviceRTL.setContext(device_id) != OFFLOAD_SUCCESS)
1760     return OFFLOAD_FAIL;
1761 
1762   return DeviceRTL.initAsyncInfo(device_id, async_info);
1763 }
1764 
1765 int32_t __tgt_rtl_init_device_info(int32_t device_id,
1766                                    __tgt_device_info *device_info_ptr,
1767                                    const char **err_str) {
1768   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1769   assert(device_info_ptr && "device_info_ptr is nullptr");
1770 
1771   if (DeviceRTL.setContext(device_id) != OFFLOAD_SUCCESS)
1772     return OFFLOAD_FAIL;
1773 
1774   return DeviceRTL.initDeviceInfo(device_id, device_info_ptr, err_str);
1775 }
1776 
1777 #ifdef __cplusplus
1778 }
1779 #endif
1780