1 //===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // RTL for CUDA machine
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include <cassert>
14 #include <cstddef>
15 #include <cuda.h>
16 #include <list>
17 #include <memory>
18 #include <mutex>
19 #include <string>
20 #include <unordered_map>
21 #include <vector>
22 
23 #include "Debug.h"
24 #include "DeviceEnvironment.h"
25 #include "omptargetplugin.h"
26 
27 #define TARGET_NAME CUDA
28 #define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"
29 
30 #include "MemoryManager.h"
31 
32 #include "llvm/Frontend/OpenMP/OMPConstants.h"
33 
34 // Utility for retrieving and printing CUDA error string.
35 #ifdef OMPTARGET_DEBUG
36 #define CUDA_ERR_STRING(err)                                                   \
37   do {                                                                         \
38     if (getDebugLevel() > 0) {                                                 \
39       const char *errStr = nullptr;                                            \
40       CUresult errStr_status = cuGetErrorString(err, &errStr);                 \
41       if (errStr_status == CUDA_ERROR_INVALID_VALUE)                           \
42         REPORT("Unrecognized CUDA error code: %d\n", err);                     \
43       else if (errStr_status == CUDA_SUCCESS)                                  \
44         REPORT("CUDA error is: %s\n", errStr);                                 \
45       else {                                                                   \
46         REPORT("Unresolved CUDA error code: %d\n", err);                       \
47         REPORT("Unsuccessful cuGetErrorString return status: %d\n",            \
48                errStr_status);                                                 \
49       }                                                                        \
50     } else {                                                                   \
51       const char *errStr = nullptr;                                            \
52       CUresult errStr_status = cuGetErrorString(err, &errStr);                 \
53       if (errStr_status == CUDA_SUCCESS)                                       \
54         REPORT("%s \n", errStr);                                               \
55     }                                                                          \
56   } while (false)
57 #else // OMPTARGET_DEBUG
58 #define CUDA_ERR_STRING(err)                                                   \
59   do {                                                                         \
60     const char *errStr = nullptr;                                              \
61     CUresult errStr_status = cuGetErrorString(err, &errStr);                   \
62     if (errStr_status == CUDA_SUCCESS)                                         \
63       REPORT("%s \n", errStr);                                                 \
64   } while (false)
65 #endif // OMPTARGET_DEBUG
66 
67 #define BOOL2TEXT(b) ((b) ? "Yes" : "No")
68 
69 #include "elf_common.h"
70 
71 /// Keep entries table per device.
72 struct FuncOrGblEntryTy {
73   __tgt_target_table Table;
74   std::vector<__tgt_offload_entry> Entries;
75 };
76 
77 /// Use a single entity to encode a kernel and a set of flags.
78 struct KernelTy {
79   CUfunction Func;
80 
  /// Execution mode of the kernel.
82   llvm::omp::OMPTgtExecModeFlags ExecutionMode;
83 
84   /// Maximal number of threads per block for this kernel.
85   int MaxThreadsPerBlock = 0;
86 
87   KernelTy(CUfunction _Func, llvm::omp::OMPTgtExecModeFlags _ExecutionMode)
88       : Func(_Func), ExecutionMode(_ExecutionMode) {}
89 };
90 
91 namespace {
92 bool checkResult(CUresult Err, const char *ErrMsg) {
93   if (Err == CUDA_SUCCESS)
94     return true;
95 
96   REPORT("%s", ErrMsg);
97   CUDA_ERR_STRING(Err);
98   return false;
99 }
100 
101 int memcpyDtoD(const void *SrcPtr, void *DstPtr, int64_t Size,
102                CUstream Stream) {
103   CUresult Err =
104       cuMemcpyDtoDAsync((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, Size, Stream);
105 
106   if (Err != CUDA_SUCCESS) {
107     DP("Error when copying data from device to device. Pointers: src "
108        "= " DPxMOD ", dst = " DPxMOD ", size = %" PRId64 "\n",
109        DPxPTR(SrcPtr), DPxPTR(DstPtr), Size);
110     CUDA_ERR_STRING(Err);
111     return OFFLOAD_FAIL;
112   }
113 
114   return OFFLOAD_SUCCESS;
115 }
116 
117 int recordEvent(void *EventPtr, __tgt_async_info *AsyncInfo) {
118   CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue);
119   CUevent Event = reinterpret_cast<CUevent>(EventPtr);
120 
121   CUresult Err = cuEventRecord(Event, Stream);
122   if (Err != CUDA_SUCCESS) {
123     DP("Error when recording event. stream = " DPxMOD ", event = " DPxMOD "\n",
124        DPxPTR(Stream), DPxPTR(Event));
125     CUDA_ERR_STRING(Err);
126     return OFFLOAD_FAIL;
127   }
128 
129   return OFFLOAD_SUCCESS;
130 }
131 
132 int syncEvent(void *EventPtr) {
133   CUevent Event = reinterpret_cast<CUevent>(EventPtr);
134 
135   CUresult Err = cuEventSynchronize(Event);
136   if (Err != CUDA_SUCCESS) {
137     DP("Error when syncing event = " DPxMOD "\n", DPxPTR(Event));
138     CUDA_ERR_STRING(Err);
139     return OFFLOAD_FAIL;
140   }
141 
142   return OFFLOAD_SUCCESS;
143 }
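// Illustrative sketch (not called by the RTL itself): combining the helpers
// above to mark a point on a queue and later block the host on it. `Info` is
// a hypothetical __tgt_async_info whose Queue already holds a CUstream, and
// `Ev` is a CUevent obtained elsewhere (e.g. from the event pool below).
//
//   void *Ev = /* a valid CUevent */;
//   if (recordEvent(Ev, &Info) == OFFLOAD_SUCCESS)
//     (void)syncEvent(Ev); // Blocks until all work recorded before Ev is done.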
144 
/// Structure containing per-device data.
146 struct DeviceDataTy {
147   /// List that contains all the kernels.
148   std::list<KernelTy> KernelsList;
149 
150   std::list<FuncOrGblEntryTy> FuncGblEntries;
151 
152   CUcontext Context = nullptr;
153   // Device properties
154   int ThreadsPerBlock = 0;
155   int BlocksPerGrid = 0;
156   int WarpSize = 0;
157   // OpenMP properties
158   int NumTeams = 0;
159   int NumThreads = 0;
160 };
161 
162 /// Resource allocator where \p T is the resource type.
/// Functions \p create and \p destroy return OFFLOAD_SUCCESS or OFFLOAD_FAIL
/// accordingly. The implementations must not throw exceptions.
165 template <typename T> class AllocatorTy {
166 public:
167   /// Create a resource and assign to R.
168   int create(T &R) noexcept;
169   /// Destroy the resource.
170   int destroy(T) noexcept;
171 };
172 
173 /// Allocator for CUstream.
174 template <> class AllocatorTy<CUstream> {
175   CUcontext Context;
176 
177 public:
178   AllocatorTy(CUcontext C) noexcept : Context(C) {}
179 
180   /// See AllocatorTy<T>::create.
181   int create(CUstream &Stream) noexcept {
182     if (!checkResult(cuCtxSetCurrent(Context),
183                      "Error returned from cuCtxSetCurrent\n"))
184       return OFFLOAD_FAIL;
185 
186     if (!checkResult(cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING),
187                      "Error returned from cuStreamCreate\n"))
188       return OFFLOAD_FAIL;
189 
190     return OFFLOAD_SUCCESS;
191   }
192 
193   /// See AllocatorTy<T>::destroy.
194   int destroy(CUstream Stream) noexcept {
195     if (!checkResult(cuCtxSetCurrent(Context),
196                      "Error returned from cuCtxSetCurrent\n"))
197       return OFFLOAD_FAIL;
198     if (!checkResult(cuStreamDestroy(Stream),
199                      "Error returned from cuStreamDestroy\n"))
200       return OFFLOAD_FAIL;
201 
202     return OFFLOAD_SUCCESS;
203   }
204 };
205 
206 /// Allocator for CUevent.
207 template <> class AllocatorTy<CUevent> {
208 public:
209   /// See AllocatorTy<T>::create.
210   int create(CUevent &Event) noexcept {
211     if (!checkResult(cuEventCreate(&Event, CU_EVENT_DEFAULT),
212                      "Error returned from cuEventCreate\n"))
213       return OFFLOAD_FAIL;
214 
215     return OFFLOAD_SUCCESS;
216   }
217 
218   /// See AllocatorTy<T>::destroy.
219   int destroy(CUevent Event) noexcept {
220     if (!checkResult(cuEventDestroy(Event),
221                      "Error returned from cuEventDestroy\n"))
222       return OFFLOAD_FAIL;
223 
224     return OFFLOAD_SUCCESS;
225   }
226 };
227 
228 /// A generic pool of resources where \p T is the resource type.
229 /// \p T should be copyable as the object is stored in \p std::vector .
230 template <typename T> class ResourcePoolTy {
231   /// Index of the next available resource.
232   size_t Next = 0;
233   /// Mutex to guard the pool.
234   std::mutex Mutex;
235   /// Pool of resources.
236   std::vector<T> Resources;
237   /// A reference to the corresponding allocator.
238   AllocatorTy<T> Allocator;
239 
  /// If `Resources` is used up, we will fill in more resources. It assumes
  /// that the new size `Size` is always larger than the current size.
242   bool resize(size_t Size) {
243     auto CurSize = Resources.size();
244     assert(Size > CurSize && "Unexpected smaller size");
245     Resources.reserve(Size);
246     for (auto I = CurSize; I < Size; ++I) {
247       T NewItem;
248       int Ret = Allocator.create(NewItem);
249       if (Ret != OFFLOAD_SUCCESS)
250         return false;
251       Resources.push_back(NewItem);
252     }
253     return true;
254   }
255 
256 public:
257   ResourcePoolTy(AllocatorTy<T> &&A, size_t Size = 0) noexcept
258       : Allocator(std::move(A)) {
259     if (Size)
260       (void)resize(Size);
261   }
262 
263   ~ResourcePoolTy() noexcept { clear(); }
264 
  /// Get a resource from the pool. `Next` always points to the next available
  /// resource, i.e. `[0, Next)` have already been assigned and `[Next, size)`
  /// are still available. If no resource is left, the pool is grown. Each time
  /// a resource is handed out, `Next` is incremented by one.
  /// xxxxxs+++++++++
  ///      ^
  ///      Next
  /// After the assignment, `s` has been handed out and the pool becomes:
  /// xxxxxs+++++++++
  ///       ^
  ///       Next
276   int acquire(T &R) noexcept {
277     std::lock_guard<std::mutex> LG(Mutex);
278     if (Next == Resources.size()) {
279       auto NewSize = Resources.size() ? Resources.size() * 2 : 1;
280       if (!resize(NewSize))
281         return OFFLOAD_FAIL;
282     }
283 
284     assert(Next < Resources.size());
285 
286     R = Resources[Next++];
287 
288     return OFFLOAD_SUCCESS;
289   }
290 
  /// Return the resource back to the pool. When we return a resource, we first
  /// decrease `Next` and then copy the resource back. Note that resources may
  /// be returned in a different order than they were acquired, so at some
  /// point there may be two identical entries in the pool:
  /// xxax+a+++++
  ///     ^
  ///     Next
  /// This does not matter because the two copies are always on opposite sides
  /// of `Next`; the one on the left will eventually be overwritten by another
  /// returned resource. Therefore, after several acquire/release cycles, the
  /// order of the pool may differ from its initial state.
303   void release(T R) noexcept {
304     std::lock_guard<std::mutex> LG(Mutex);
305     Resources[--Next] = R;
306   }
307 
  /// Release all stored resources and clear the pool.
309   /// Note: This function is not thread safe. Be sure to guard it if necessary.
310   void clear() noexcept {
311     for (auto &R : Resources)
312       (void)Allocator.destroy(R);
313     Resources.clear();
314   }
315 };
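// Illustrative usage sketch of the pool in isolation; it assumes a valid
// CUcontext `Ctx` and an initialized CUDA driver. The RTL itself only uses
// pools through DeviceRTLTy below.
//
//   ResourcePoolTy<CUstream> Pool(AllocatorTy<CUstream>(Ctx), /*Size=*/4);
//   CUstream S;
//   if (Pool.acquire(S) == OFFLOAD_SUCCESS) {
//     // ... enqueue work on S ...
//     Pool.release(S); // Hand the stream back; it is not destroyed here.
//   }
//   // All pooled streams are destroyed when the pool is cleared or destructed.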
316 
317 class DeviceRTLTy {
318   int NumberOfDevices;
319   // OpenMP environment properties
320   int EnvNumTeams;
321   int EnvTeamLimit;
322   int EnvTeamThreadLimit;
323   // OpenMP requires flags
324   int64_t RequiresFlags;
325   // Amount of dynamic shared memory to use at launch.
326   uint64_t DynamicMemorySize;
327   // Number of initial streams for each device.
328   int NumInitialStreams = 32;
329 
330   static constexpr const int32_t HardThreadLimit = 1024;
331   static constexpr const int32_t DefaultNumTeams = 128;
332   static constexpr const int32_t DefaultNumThreads = 128;
333 
334   using StreamPoolTy = ResourcePoolTy<CUstream>;
335   std::vector<std::unique_ptr<StreamPoolTy>> StreamPool;
336 
337   ResourcePoolTy<CUevent> EventPool;
338 
339   std::vector<DeviceDataTy> DeviceData;
340   std::vector<CUmodule> Modules;
341 
342   /// A class responsible for interacting with device native runtime library to
343   /// allocate and free memory.
344   class CUDADeviceAllocatorTy : public DeviceAllocatorTy {
345     const int DeviceId;
346     const std::vector<DeviceDataTy> &DeviceData;
347     std::unordered_map<void *, TargetAllocTy> HostPinnedAllocs;
348 
349   public:
350     CUDADeviceAllocatorTy(int DeviceId, std::vector<DeviceDataTy> &DeviceData)
351         : DeviceId(DeviceId), DeviceData(DeviceData) {}
352 
353     void *allocate(size_t Size, void *, TargetAllocTy Kind) override {
354       if (Size == 0)
355         return nullptr;
356 
357       CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
358       if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
359         return nullptr;
360 
361       void *MemAlloc = nullptr;
362       switch (Kind) {
363       case TARGET_ALLOC_DEFAULT:
364       case TARGET_ALLOC_DEVICE:
365         CUdeviceptr DevicePtr;
366         Err = cuMemAlloc(&DevicePtr, Size);
367         MemAlloc = (void *)DevicePtr;
368         if (!checkResult(Err, "Error returned from cuMemAlloc\n"))
369           return nullptr;
370         break;
371       case TARGET_ALLOC_HOST:
372         void *HostPtr;
373         Err = cuMemAllocHost(&HostPtr, Size);
374         MemAlloc = HostPtr;
375         if (!checkResult(Err, "Error returned from cuMemAllocHost\n"))
376           return nullptr;
377         HostPinnedAllocs[MemAlloc] = Kind;
378         break;
379       case TARGET_ALLOC_SHARED:
380         CUdeviceptr SharedPtr;
381         Err = cuMemAllocManaged(&SharedPtr, Size, CU_MEM_ATTACH_GLOBAL);
382         MemAlloc = (void *)SharedPtr;
383         if (!checkResult(Err, "Error returned from cuMemAllocManaged\n"))
384           return nullptr;
385         break;
386       }
387 
388       return MemAlloc;
389     }
390 
391     int free(void *TgtPtr) override {
392       CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
393       if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
394         return OFFLOAD_FAIL;
395 
396       // Host pinned memory must be freed differently.
397       TargetAllocTy Kind =
398           (HostPinnedAllocs.find(TgtPtr) == HostPinnedAllocs.end())
399               ? TARGET_ALLOC_DEFAULT
400               : TARGET_ALLOC_HOST;
401       switch (Kind) {
402       case TARGET_ALLOC_DEFAULT:
403       case TARGET_ALLOC_DEVICE:
404       case TARGET_ALLOC_SHARED:
405         Err = cuMemFree((CUdeviceptr)TgtPtr);
406         if (!checkResult(Err, "Error returned from cuMemFree\n"))
407           return OFFLOAD_FAIL;
408         break;
409       case TARGET_ALLOC_HOST:
410         Err = cuMemFreeHost(TgtPtr);
411         if (!checkResult(Err, "Error returned from cuMemFreeHost\n"))
412           return OFFLOAD_FAIL;
413         break;
414       }
415 
416       return OFFLOAD_SUCCESS;
417     }
418   };
419 
420   /// A vector of device allocators
421   std::vector<CUDADeviceAllocatorTy> DeviceAllocators;
422 
  /// A vector of memory managers. Since the memory manager is non-copyable and
  /// non-removable, we wrap each one in a std::unique_ptr.
425   std::vector<std::unique_ptr<MemoryManagerTy>> MemoryManagers;
426 
  /// Whether to use the memory manager.
428   bool UseMemoryManager = true;
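  // For example (behavior defined by MemoryManagerTy, noted here for
  // reference): exporting LIBOMPTARGET_MEMORY_MANAGER_THRESHOLD=0 disables the
  // manager, so every dataAlloc/dataDelete goes straight to the CUDA allocator
  // above.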
429 
430   // Record entry point associated with device
431   void addOffloadEntry(const int DeviceId, const __tgt_offload_entry entry) {
432     FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
433     E.Entries.push_back(entry);
434   }
435 
436   // Return a pointer to the entry associated with the pointer
437   const __tgt_offload_entry *getOffloadEntry(const int DeviceId,
438                                              const void *Addr) const {
439     for (const __tgt_offload_entry &Itr :
440          DeviceData[DeviceId].FuncGblEntries.back().Entries)
441       if (Itr.addr == Addr)
442         return &Itr;
443 
444     return nullptr;
445   }
446 
447   // Return the pointer to the target entries table
448   __tgt_target_table *getOffloadEntriesTable(const int DeviceId) {
449     FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
450 
451     if (E.Entries.empty())
452       return nullptr;
453 
454     // Update table info according to the entries and return the pointer
455     E.Table.EntriesBegin = E.Entries.data();
456     E.Table.EntriesEnd = E.Entries.data() + E.Entries.size();
457 
458     return &E.Table;
459   }
460 
461   // Clear entries table for a device
462   void clearOffloadEntriesTable(const int DeviceId) {
463     DeviceData[DeviceId].FuncGblEntries.emplace_back();
464     FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
465     E.Entries.clear();
466     E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr;
467   }
468 
469 public:
470 
471   CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfo) const {
472     assert(AsyncInfo && "AsyncInfo is nullptr");
473 
474     if (!AsyncInfo->Queue) {
475       CUstream S;
476       if (StreamPool[DeviceId]->acquire(S) != OFFLOAD_SUCCESS)
477         return nullptr;
478 
479       AsyncInfo->Queue = S;
480     }
481 
482     return reinterpret_cast<CUstream>(AsyncInfo->Queue);
483   }
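  // Illustrative flow (sketch): a data transfer first binds a stream to its
  // AsyncInfo via getStream, and synchronize() later drains and recycles that
  // same stream.
  //
  //   __tgt_async_info Info;                    // Queue starts out as nullptr.
  //   CUstream S = getStream(DeviceId, &Info);  // Acquired from StreamPool.
  //   // ... cuMemcpyHtoDAsync(..., S) ...
  //   synchronize(DeviceId, &Info);             // Releases S back to the pool.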
484 
  // This class should not be copied or moved.
486   DeviceRTLTy(const DeviceRTLTy &) = delete;
487   DeviceRTLTy(DeviceRTLTy &&) = delete;
488 
489   DeviceRTLTy()
490       : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1),
491         EnvTeamThreadLimit(-1), RequiresFlags(OMP_REQ_UNDEFINED),
492         DynamicMemorySize(0), EventPool(AllocatorTy<CUevent>()) {
493 
494     DP("Start initializing CUDA\n");
495 
496     CUresult Err = cuInit(0);
497     if (Err == CUDA_ERROR_INVALID_HANDLE) {
498       // Can't call cuGetErrorString if dlsym failed
499       DP("Failed to load CUDA shared library\n");
500       return;
501     }
502     if (!checkResult(Err, "Error returned from cuInit\n")) {
503       return;
504     }
505 
506     Err = cuDeviceGetCount(&NumberOfDevices);
507     if (!checkResult(Err, "Error returned from cuDeviceGetCount\n"))
508       return;
509 
510     if (NumberOfDevices == 0) {
511       DP("There are no devices supporting CUDA.\n");
512       return;
513     }
514 
515     DeviceData.resize(NumberOfDevices);
516     StreamPool.resize(NumberOfDevices);
517 
518     // Get environment variables regarding teams
519     if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) {
520       // OMP_TEAM_LIMIT has been set
521       EnvTeamLimit = std::stoi(EnvStr);
522       DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
523     }
524     if (const char *EnvStr = getenv("OMP_TEAMS_THREAD_LIMIT")) {
525       // OMP_TEAMS_THREAD_LIMIT has been set
526       EnvTeamThreadLimit = std::stoi(EnvStr);
527       DP("Parsed OMP_TEAMS_THREAD_LIMIT=%d\n", EnvTeamThreadLimit);
528     }
529     if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) {
530       // OMP_NUM_TEAMS has been set
531       EnvNumTeams = std::stoi(EnvStr);
532       DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
533     }
534     if (const char *EnvStr = getenv("LIBOMPTARGET_SHARED_MEMORY_SIZE")) {
535       // LIBOMPTARGET_SHARED_MEMORY_SIZE has been set
536       DynamicMemorySize = std::stoi(EnvStr);
537       DP("Parsed LIBOMPTARGET_SHARED_MEMORY_SIZE = %" PRIu64 "\n",
538          DynamicMemorySize);
539     }
540     if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS")) {
541       // LIBOMPTARGET_NUM_INITIAL_STREAMS has been set
542       NumInitialStreams = std::stoi(EnvStr);
543       DP("Parsed LIBOMPTARGET_NUM_INITIAL_STREAMS=%d\n", NumInitialStreams);
544     }
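    // Example (illustrative values): LIBOMPTARGET_SHARED_MEMORY_SIZE=4096
    // reserves 4 KiB of dynamic shared memory for every kernel launch, and
    // LIBOMPTARGET_NUM_INITIAL_STREAMS=64 pre-creates 64 streams per device.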
545 
546     for (int I = 0; I < NumberOfDevices; ++I)
547       DeviceAllocators.emplace_back(I, DeviceData);
548 
549     // Get the size threshold from environment variable
550     std::pair<size_t, bool> Res = MemoryManagerTy::getSizeThresholdFromEnv();
551     UseMemoryManager = Res.second;
552     size_t MemoryManagerThreshold = Res.first;
553 
554     if (UseMemoryManager)
555       for (int I = 0; I < NumberOfDevices; ++I)
556         MemoryManagers.emplace_back(std::make_unique<MemoryManagerTy>(
557             DeviceAllocators[I], MemoryManagerThreshold));
558   }
559 
560   ~DeviceRTLTy() {
    // Destruct the memory managers first so that they are destroyed before the
    // data they depend on.
    for (auto &M : MemoryManagers)
      M.reset();
565 
566     for (CUmodule &M : Modules)
567       // Close module
568       if (M)
569         checkResult(cuModuleUnload(M), "Error returned from cuModuleUnload\n");
570 
571     for (auto &S : StreamPool)
572       S.reset();
573 
574     EventPool.clear();
575 
576     for (DeviceDataTy &D : DeviceData) {
577       // Destroy context
578       if (D.Context) {
579         checkResult(cuCtxSetCurrent(D.Context),
580                     "Error returned from cuCtxSetCurrent\n");
581         CUdevice Device;
582         checkResult(cuCtxGetDevice(&Device),
583                     "Error returned from cuCtxGetDevice\n");
584         checkResult(cuDevicePrimaryCtxRelease(Device),
585                     "Error returned from cuDevicePrimaryCtxRelease\n");
586       }
587     }
588   }
589 
590   // Check whether a given DeviceId is valid
591   bool isValidDeviceId(const int DeviceId) const {
592     return DeviceId >= 0 && DeviceId < NumberOfDevices;
593   }
594 
595   int getNumOfDevices() const { return NumberOfDevices; }
596 
597   void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; }
598 
599   int initDevice(const int DeviceId) {
600     CUdevice Device;
601 
602     DP("Getting device %d\n", DeviceId);
603     CUresult Err = cuDeviceGet(&Device, DeviceId);
604     if (!checkResult(Err, "Error returned from cuDeviceGet\n"))
605       return OFFLOAD_FAIL;
606 
607     // Query the current flags of the primary context and set its flags if
608     // it is inactive
609     unsigned int FormerPrimaryCtxFlags = 0;
610     int FormerPrimaryCtxIsActive = 0;
611     Err = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags,
612                                      &FormerPrimaryCtxIsActive);
613     if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxGetState\n"))
614       return OFFLOAD_FAIL;
615 
616     if (FormerPrimaryCtxIsActive) {
617       DP("The primary context is active, no change to its flags\n");
618       if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) !=
619           CU_CTX_SCHED_BLOCKING_SYNC)
        DP("Warning: the current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n");
621     } else {
622       DP("The primary context is inactive, set its flags to "
623          "CU_CTX_SCHED_BLOCKING_SYNC\n");
624       Err = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC);
625       if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxSetFlags\n"))
626         return OFFLOAD_FAIL;
627     }
628 
629     // Retain the per device primary context and save it to use whenever this
630     // device is selected.
631     Err = cuDevicePrimaryCtxRetain(&DeviceData[DeviceId].Context, Device);
632     if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxRetain\n"))
633       return OFFLOAD_FAIL;
634 
635     Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
636     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
637       return OFFLOAD_FAIL;
638 
639     // Initialize stream pool
640     if (!StreamPool[DeviceId])
641       StreamPool[DeviceId] = std::make_unique<StreamPoolTy>(
642           AllocatorTy<CUstream>(DeviceData[DeviceId].Context),
643           NumInitialStreams);
644 
645     // Query attributes to determine number of threads/block and blocks/grid.
646     int MaxGridDimX;
647     Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
648                                Device);
649     if (Err != CUDA_SUCCESS) {
      DP("Error getting max grid dimension, using default value %d\n",
651          DeviceRTLTy::DefaultNumTeams);
652       DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::DefaultNumTeams;
653     } else {
654       DP("Using %d CUDA blocks per grid\n", MaxGridDimX);
655       DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX;
656     }
657 
658     // We are only exploiting threads along the x axis.
659     int MaxBlockDimX;
660     Err = cuDeviceGetAttribute(&MaxBlockDimX,
661                                CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device);
662     if (Err != CUDA_SUCCESS) {
      DP("Error getting max block dimension, using default value %d\n",
664          DeviceRTLTy::DefaultNumThreads);
665       DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads;
666     } else {
667       DP("Using %d CUDA threads per block\n", MaxBlockDimX);
668       DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX;
669 
670       if (EnvTeamThreadLimit > 0 &&
671           DeviceData[DeviceId].ThreadsPerBlock > EnvTeamThreadLimit) {
672         DP("Max CUDA threads per block %d exceeds the thread limit %d set by "
673            "OMP_TEAMS_THREAD_LIMIT, capping at the limit\n",
674            DeviceData[DeviceId].ThreadsPerBlock, EnvTeamThreadLimit);
675         DeviceData[DeviceId].ThreadsPerBlock = EnvTeamThreadLimit;
676       }
677       if (DeviceData[DeviceId].ThreadsPerBlock > DeviceRTLTy::HardThreadLimit) {
678         DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
679            "capping at the hard limit\n",
680            DeviceData[DeviceId].ThreadsPerBlock, DeviceRTLTy::HardThreadLimit);
681         DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit;
682       }
683     }
684 
685     // Get and set warp size
686     int WarpSize;
687     Err =
688         cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device);
689     if (Err != CUDA_SUCCESS) {
      DP("Error getting warp size, assuming default value 32\n");
691       DeviceData[DeviceId].WarpSize = 32;
692     } else {
693       DP("Using warp size %d\n", WarpSize);
694       DeviceData[DeviceId].WarpSize = WarpSize;
695     }
696 
697     // Adjust teams to the env variables
698     if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) {
699       DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
700          EnvTeamLimit);
701       DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit;
702     }
703 
704     size_t StackLimit;
705     size_t HeapLimit;
706     if (const char *EnvStr = getenv("LIBOMPTARGET_STACK_SIZE")) {
707       StackLimit = std::stol(EnvStr);
708       if (cuCtxSetLimit(CU_LIMIT_STACK_SIZE, StackLimit) != CUDA_SUCCESS)
709         return OFFLOAD_FAIL;
710     } else {
711       if (cuCtxGetLimit(&StackLimit, CU_LIMIT_STACK_SIZE) != CUDA_SUCCESS)
712         return OFFLOAD_FAIL;
713     }
714     if (const char *EnvStr = getenv("LIBOMPTARGET_HEAP_SIZE")) {
715       HeapLimit = std::stol(EnvStr);
716       if (cuCtxSetLimit(CU_LIMIT_MALLOC_HEAP_SIZE, HeapLimit) != CUDA_SUCCESS)
717         return OFFLOAD_FAIL;
718     } else {
719       if (cuCtxGetLimit(&HeapLimit, CU_LIMIT_MALLOC_HEAP_SIZE) != CUDA_SUCCESS)
720         return OFFLOAD_FAIL;
721     }
722 
723     INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
724          "Device supports up to %d CUDA blocks and %d threads with a "
725          "warp size of %d\n",
726          DeviceData[DeviceId].BlocksPerGrid,
727          DeviceData[DeviceId].ThreadsPerBlock, DeviceData[DeviceId].WarpSize);
728     INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
729          "Device heap size is %d Bytes, device stack size is %d Bytes per "
730          "thread\n",
731          (int)HeapLimit, (int)StackLimit);
732 
733     // Set default number of teams
734     if (EnvNumTeams > 0) {
735       DP("Default number of teams set according to environment %d\n",
736          EnvNumTeams);
737       DeviceData[DeviceId].NumTeams = EnvNumTeams;
738     } else {
739       DeviceData[DeviceId].NumTeams = DeviceRTLTy::DefaultNumTeams;
740       DP("Default number of teams set according to library's default %d\n",
741          DeviceRTLTy::DefaultNumTeams);
742     }
743 
744     if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) {
745       DP("Default number of teams exceeds device limit, capping at %d\n",
746          DeviceData[DeviceId].BlocksPerGrid);
747       DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid;
748     }
749 
750     // Set default number of threads
751     DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads;
752     DP("Default number of threads set according to library's default %d\n",
753        DeviceRTLTy::DefaultNumThreads);
754     if (DeviceData[DeviceId].NumThreads >
755         DeviceData[DeviceId].ThreadsPerBlock) {
756       DP("Default number of threads exceeds device limit, capping at %d\n",
757          DeviceData[DeviceId].ThreadsPerBlock);
758       DeviceData[DeviceId].NumThreads = DeviceData[DeviceId].ThreadsPerBlock;
759     }
760 
761     return OFFLOAD_SUCCESS;
762   }
763 
764   __tgt_target_table *loadBinary(const int DeviceId,
765                                  const __tgt_device_image *Image) {
766     // Set the context we are using
767     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
768     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
769       return nullptr;
770 
771     // Clear the offload table as we are going to create a new one.
772     clearOffloadEntriesTable(DeviceId);
773 
774     // Create the module and extract the function pointers.
775     CUmodule Module;
776     DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart));
777     Err = cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr);
778     if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n"))
779       return nullptr;
780 
781     DP("CUDA module successfully loaded!\n");
782 
783     Modules.push_back(Module);
784 
785     // Find the symbols in the module by name.
786     const __tgt_offload_entry *HostBegin = Image->EntriesBegin;
787     const __tgt_offload_entry *HostEnd = Image->EntriesEnd;
788 
789     std::list<KernelTy> &KernelsList = DeviceData[DeviceId].KernelsList;
790     for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) {
791       if (!E->addr) {
        // We return nullptr when something like this happens: the host should
        // always have something at the address to uniquely identify the target
        // region.
795         DP("Invalid binary: host entry '<null>' (size = %zd)...\n", E->size);
796         return nullptr;
797       }
798 
799       if (E->size) {
800         __tgt_offload_entry Entry = *E;
801         CUdeviceptr CUPtr;
802         size_t CUSize;
803         Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name);
804         // We keep this style here because we need the name
805         if (Err != CUDA_SUCCESS) {
806           REPORT("Loading global '%s' Failed\n", E->name);
807           CUDA_ERR_STRING(Err);
808           return nullptr;
809         }
810 
811         if (CUSize != E->size) {
812           DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name,
813              CUSize, E->size);
814           return nullptr;
815         }
816 
817         DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
818            DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr));
819 
820         Entry.addr = (void *)(CUPtr);
821 
822         // Note: In the current implementation declare target variables
823         // can either be link or to. This means that once unified
824         // memory is activated via the requires directive, the variable
825         // can be used directly from the host in both cases.
826         // TODO: when variables types other than to or link are added,
827         // the below condition should be changed to explicitly
828         // check for to and link variables types:
829         // (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags &
830         // OMP_DECLARE_TARGET_LINK || e->flags == OMP_DECLARE_TARGET_TO))
831         if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
832           // If unified memory is present any target link or to variables
833           // can access host addresses directly. There is no longer a
834           // need for device copies.
835           cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *));
836           DP("Copy linked variable host address (" DPxMOD
837              ") to device address (" DPxMOD ")\n",
838              DPxPTR(*((void **)E->addr)), DPxPTR(CUPtr));
839         }
840 
841         addOffloadEntry(DeviceId, Entry);
842 
843         continue;
844       }
845 
846       CUfunction Func;
847       Err = cuModuleGetFunction(&Func, Module, E->name);
848       // We keep this style here because we need the name
849       if (Err != CUDA_SUCCESS) {
850         REPORT("Loading '%s' Failed\n", E->name);
851         CUDA_ERR_STRING(Err);
852         return nullptr;
853       }
854 
855       DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
856          DPxPTR(E - HostBegin), E->name, DPxPTR(Func));
857 
      // Default to GENERIC execution mode in case the symbol is missing from
      // the cubin file.
      llvm::omp::OMPTgtExecModeFlags ExecModeVal =
          llvm::omp::OMP_TGT_EXEC_MODE_GENERIC;
860       std::string ExecModeNameStr(E->name);
861       ExecModeNameStr += "_exec_mode";
862       const char *ExecModeName = ExecModeNameStr.c_str();
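      // For instance, for a hypothetical entry named
      // `__omp_offloading_fd02_1_main_l16`, the plugin looks up the device
      // global `__omp_offloading_fd02_1_main_l16_exec_mode`.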
863 
864       CUdeviceptr ExecModePtr;
865       size_t CUSize;
866       Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName);
867       if (Err == CUDA_SUCCESS) {
868         if (CUSize != sizeof(llvm::omp::OMPTgtExecModeFlags)) {
869           DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
870              ExecModeName, CUSize, sizeof(llvm::omp::OMPTgtExecModeFlags));
871           return nullptr;
872         }
873 
874         Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize);
875         if (Err != CUDA_SUCCESS) {
876           REPORT("Error when copying data from device to host. Pointers: "
877                  "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
878                  DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize);
879           CUDA_ERR_STRING(Err);
880           return nullptr;
881         }
882       } else {
883         DP("Loading global exec_mode '%s' - symbol missing, using default "
884            "value GENERIC (1)\n",
885            ExecModeName);
886       }
887 
888       KernelsList.emplace_back(Func, ExecModeVal);
889 
890       __tgt_offload_entry Entry = *E;
891       Entry.addr = &KernelsList.back();
892       addOffloadEntry(DeviceId, Entry);
893     }
894 
895     // send device environment data to the device
896     {
897       // TODO: The device ID used here is not the real device ID used by OpenMP.
898       DeviceEnvironmentTy DeviceEnv{0, static_cast<uint32_t>(NumberOfDevices),
899                                     static_cast<uint32_t>(DeviceId),
900                                     static_cast<uint32_t>(DynamicMemorySize)};
901 
902       if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG"))
903         DeviceEnv.DebugKind = std::stoi(EnvStr);
904 
905       const char *DeviceEnvName = "omptarget_device_environment";
906       CUdeviceptr DeviceEnvPtr;
907       size_t CUSize;
908 
909       Err = cuModuleGetGlobal(&DeviceEnvPtr, &CUSize, Module, DeviceEnvName);
910       if (Err == CUDA_SUCCESS) {
911         if (CUSize != sizeof(DeviceEnv)) {
          REPORT(
              "Global device_environment '%s' - size mismatch (%zu != %zu)\n",
              DeviceEnvName, CUSize, sizeof(DeviceEnv));
916           return nullptr;
917         }
918 
919         Err = cuMemcpyHtoD(DeviceEnvPtr, &DeviceEnv, CUSize);
920         if (Err != CUDA_SUCCESS) {
921           REPORT("Error when copying data from host to device. Pointers: "
922                  "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
923                  DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize);
924           CUDA_ERR_STRING(Err);
925           return nullptr;
926         }
927 
928         DP("Sending global device environment data %zu bytes\n", CUSize);
929       } else {
930         DP("Finding global device environment '%s' - symbol missing.\n",
931            DeviceEnvName);
        DP("Continuing, assuming this is a device RTL which does not accept "
           "an environment setting.\n");
934       }
935     }
936 
937     return getOffloadEntriesTable(DeviceId);
938   }
939 
940   void *dataAlloc(const int DeviceId, const int64_t Size,
941                   const TargetAllocTy Kind) {
942     switch (Kind) {
943     case TARGET_ALLOC_DEFAULT:
944     case TARGET_ALLOC_DEVICE:
945       if (UseMemoryManager)
946         return MemoryManagers[DeviceId]->allocate(Size, nullptr);
947       else
948         return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
949     case TARGET_ALLOC_HOST:
950     case TARGET_ALLOC_SHARED:
951       return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
952     }
953 
954     REPORT("Invalid target data allocation kind or requested allocator not "
955            "implemented yet\n");
956 
957     return nullptr;
958   }
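  // Illustrative mapping (a sketch, not normative): omp_target_alloc() reaches
  // this function with TARGET_ALLOC_DEFAULT, while the
  // llvm_omp_target_alloc_host/_shared entry points request
  // TARGET_ALLOC_HOST/TARGET_ALLOC_SHARED and end up in
  // cuMemAllocHost/cuMemAllocManaged above.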
959 
960   int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr,
961                  const int64_t Size, __tgt_async_info *AsyncInfo) const {
962     assert(AsyncInfo && "AsyncInfo is nullptr");
963 
964     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
965     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
966       return OFFLOAD_FAIL;
967 
968     CUstream Stream = getStream(DeviceId, AsyncInfo);
969 
970     Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream);
971     if (Err != CUDA_SUCCESS) {
972       DP("Error when copying data from host to device. Pointers: host "
973          "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
974          DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
975       CUDA_ERR_STRING(Err);
976       return OFFLOAD_FAIL;
977     }
978 
979     return OFFLOAD_SUCCESS;
980   }
981 
982   int dataRetrieve(const int DeviceId, void *HstPtr, const void *TgtPtr,
983                    const int64_t Size, __tgt_async_info *AsyncInfo) const {
984     assert(AsyncInfo && "AsyncInfo is nullptr");
985 
986     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
987     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
988       return OFFLOAD_FAIL;
989 
990     CUstream Stream = getStream(DeviceId, AsyncInfo);
991 
992     Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
993     if (Err != CUDA_SUCCESS) {
994       DP("Error when copying data from device to host. Pointers: host "
995          "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
996          DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
997       CUDA_ERR_STRING(Err);
998       return OFFLOAD_FAIL;
999     }
1000 
1001     return OFFLOAD_SUCCESS;
1002   }
1003 
1004   int dataExchange(int SrcDevId, const void *SrcPtr, int DstDevId, void *DstPtr,
1005                    int64_t Size, __tgt_async_info *AsyncInfo) const {
1006     assert(AsyncInfo && "AsyncInfo is nullptr");
1007 
1008     CUresult Err = cuCtxSetCurrent(DeviceData[SrcDevId].Context);
1009     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
1010       return OFFLOAD_FAIL;
1011 
1012     CUstream Stream = getStream(SrcDevId, AsyncInfo);
1013 
    // If source and destination are two different devices, we try a
    // peer-to-peer copy first.
1015     if (SrcDevId != DstDevId) {
1016       int CanAccessPeer = 0;
1017       Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId);
1018       if (Err != CUDA_SUCCESS) {
1019         REPORT("Error returned from cuDeviceCanAccessPeer. src = %" PRId32
1020                ", dst = %" PRId32 "\n",
1021                SrcDevId, DstDevId);
1022         CUDA_ERR_STRING(Err);
1023         return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
1024       }
1025 
1026       if (!CanAccessPeer) {
        DP("P2P memcpy not supported, falling back to D2D memcpy\n");
1028         return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
1029       }
1030 
1031       Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0);
1032       if (Err != CUDA_SUCCESS) {
1033         REPORT("Error returned from cuCtxEnablePeerAccess. src = %" PRId32
1034                ", dst = %" PRId32 "\n",
1035                SrcDevId, DstDevId);
1036         CUDA_ERR_STRING(Err);
1037         return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
1038       }
1039 
1040       Err = cuMemcpyPeerAsync((CUdeviceptr)DstPtr, DeviceData[DstDevId].Context,
1041                               (CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context,
1042                               Size, Stream);
1043       if (Err == CUDA_SUCCESS)
1044         return OFFLOAD_SUCCESS;
1045 
1046       DP("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD
1047          ", src_id =%" PRId32 ", dst_ptr = " DPxMOD ", dst_id =%" PRId32 "\n",
1048          DPxPTR(SrcPtr), SrcDevId, DPxPTR(DstPtr), DstDevId);
1049       CUDA_ERR_STRING(Err);
1050     }
1051 
1052     return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
1053   }
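  // In short: a peer-to-peer copy is attempted only across distinct devices;
  // on any failure (no peer access, cuCtxEnablePeerAccess or cuMemcpyPeerAsync
  // error) the transfer falls back to a plain device-to-device copy through
  // memcpyDtoD() above.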
1054 
1055   int dataDelete(const int DeviceId, void *TgtPtr) {
1056     if (UseMemoryManager)
1057       return MemoryManagers[DeviceId]->free(TgtPtr);
1058 
1059     return DeviceAllocators[DeviceId].free(TgtPtr);
1060   }
1061 
1062   int runTargetTeamRegion(const int DeviceId, void *TgtEntryPtr, void **TgtArgs,
1063                           ptrdiff_t *TgtOffsets, const int ArgNum,
1064                           const int TeamNum, const int ThreadLimit,
1065                           const unsigned int LoopTripCount,
1066                           __tgt_async_info *AsyncInfo) const {
1067     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
1068     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
1069       return OFFLOAD_FAIL;
1070 
1071     // All args are references.
1072     std::vector<void *> Args(ArgNum);
1073     std::vector<void *> Ptrs(ArgNum);
1074 
1075     for (int I = 0; I < ArgNum; ++I) {
1076       Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
1077       Args[I] = &Ptrs[I];
1078     }
1079 
1080     KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr);
1081 
1082     const bool IsSPMDGenericMode =
1083         KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD;
1084     const bool IsSPMDMode =
1085         KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_SPMD;
1086     const bool IsGenericMode =
1087         KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC;
1088 
1089     int CudaThreadsPerBlock;
1090     if (ThreadLimit > 0) {
1091       DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
1092       CudaThreadsPerBlock = ThreadLimit;
1093       // Add master warp if necessary
1094       if (IsGenericMode) {
1095         DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize);
1096         CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
1097       }
1098     } else {
1099       DP("Setting CUDA threads per block to default %d\n",
1100          DeviceData[DeviceId].NumThreads);
1101       CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads;
1102     }
1103 
1104     if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) {
1105       DP("Threads per block capped at device limit %d\n",
1106          DeviceData[DeviceId].ThreadsPerBlock);
1107       CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock;
1108     }
1109 
1110     if (!KernelInfo->MaxThreadsPerBlock) {
1111       Err = cuFuncGetAttribute(&KernelInfo->MaxThreadsPerBlock,
1112                                CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
1113                                KernelInfo->Func);
1114       if (!checkResult(Err, "Error returned from cuFuncGetAttribute\n"))
1115         return OFFLOAD_FAIL;
1116     }
1117 
1118     if (KernelInfo->MaxThreadsPerBlock < CudaThreadsPerBlock) {
1119       DP("Threads per block capped at kernel limit %d\n",
1120          KernelInfo->MaxThreadsPerBlock);
1121       CudaThreadsPerBlock = KernelInfo->MaxThreadsPerBlock;
1122     }
1123 
1124     unsigned int CudaBlocksPerGrid;
1125     if (TeamNum <= 0) {
1126       if (LoopTripCount > 0 && EnvNumTeams < 0) {
1127         if (IsSPMDGenericMode) {
          // If we reach this point, then we are executing a kernel that was
          // transformed from Generic-mode to SPMD-mode. This kernel has
          // SPMD-mode execution, but needs its blocks to be scheduled
          // differently because the current loop trip count only applies to
          // the `teams distribute` region and would create far too few blocks
          // using the regular SPMD-mode method.
1134           CudaBlocksPerGrid = LoopTripCount;
1135         } else if (IsSPMDMode) {
          // We have a combined construct, i.e. `target teams distribute
          // parallel for [simd]`. We launch enough teams so that each thread
          // will execute one iteration of the loop; round up to the nearest
          // integer.
1140           CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1;
1141         } else if (IsGenericMode) {
1142           // If we reach this point, then we have a non-combined construct, i.e.
1143           // `teams distribute` with a nested `parallel for` and each team is
1144           // assigned one iteration of the `distribute` loop. E.g.:
1145           //
1146           // #pragma omp target teams distribute
1147           // for(...loop_tripcount...) {
1148           //   #pragma omp parallel for
1149           //   for(...) {}
1150           // }
1151           //
1152           // Threads within a team will execute the iterations of the `parallel`
1153           // loop.
1154           CudaBlocksPerGrid = LoopTripCount;
1155         } else {
1156           REPORT("Unknown execution mode: %d\n",
1157                  static_cast<int8_t>(KernelInfo->ExecutionMode));
1158           return OFFLOAD_FAIL;
1159         }
1160         DP("Using %d teams due to loop trip count %" PRIu32
1161            " and number of threads per block %d\n",
1162            CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock);
1163       } else {
1164         DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams);
1165         CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams;
1166       }
1167     } else {
1168       DP("Using requested number of teams %d\n", TeamNum);
1169       CudaBlocksPerGrid = TeamNum;
1170     }
1171 
1172     if (CudaBlocksPerGrid > DeviceData[DeviceId].BlocksPerGrid) {
1173       DP("Capping number of teams to team limit %d\n",
1174          DeviceData[DeviceId].BlocksPerGrid);
1175       CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid;
1176     }
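    // Worked example (illustrative numbers only): an SPMD kernel with
    // LoopTripCount = 10000 and CudaThreadsPerBlock = 128 requests
    // ((10000 - 1) / 128) + 1 = 79 blocks, while a Generic kernel with a
    // requested ThreadLimit of 128 and WarpSize 32 launches 128 + 32 = 160
    // threads per block (the extra warp is the master warp).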
1177 
1178     INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
1179          "Launching kernel %s with %d blocks and %d threads in %s mode\n",
1180          (getOffloadEntry(DeviceId, TgtEntryPtr))
1181              ? getOffloadEntry(DeviceId, TgtEntryPtr)->name
1182              : "(null)",
1183          CudaBlocksPerGrid, CudaThreadsPerBlock,
1184          (!IsSPMDMode ? (IsGenericMode ? "Generic" : "SPMD-Generic") : "SPMD"));
1185 
1186     CUstream Stream = getStream(DeviceId, AsyncInfo);
1187     Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1,
1188                          /* gridDimZ */ 1, CudaThreadsPerBlock,
1189                          /* blockDimY */ 1, /* blockDimZ */ 1,
1190                          DynamicMemorySize, Stream, &Args[0], nullptr);
1191     if (!checkResult(Err, "Error returned from cuLaunchKernel\n"))
1192       return OFFLOAD_FAIL;
1193 
1194     DP("Launch of entry point at " DPxMOD " successful!\n",
1195        DPxPTR(TgtEntryPtr));
1196 
1197     return OFFLOAD_SUCCESS;
1198   }
1199 
1200   int synchronize(const int DeviceId, __tgt_async_info *AsyncInfo) const {
1201     CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue);
1202     CUresult Err = cuStreamSynchronize(Stream);
1203 
    // Once the stream is synchronized, return it to the stream pool and reset
    // AsyncInfo. This makes sure the synchronization only covers the stream's
    // own tasks.
1207     StreamPool[DeviceId]->release(reinterpret_cast<CUstream>(AsyncInfo->Queue));
1208     AsyncInfo->Queue = nullptr;
1209 
1210     if (Err != CUDA_SUCCESS) {
1211       DP("Error when synchronizing stream. stream = " DPxMOD
1212          ", async info ptr = " DPxMOD "\n",
1213          DPxPTR(Stream), DPxPTR(AsyncInfo));
1214       CUDA_ERR_STRING(Err);
1215     }
1216     return (Err == CUDA_SUCCESS) ? OFFLOAD_SUCCESS : OFFLOAD_FAIL;
1217   }
1218 
1219   void printDeviceInfo(int32_t device_id) {
1220     char TmpChar[1000];
1221     std::string TmpStr;
1222     size_t TmpSt;
1223     int TmpInt, TmpInt2, TmpInt3;
1224 
1225     CUdevice Device;
1226     checkResult(cuDeviceGet(&Device, device_id),
                "Error returned from cuDeviceGet\n");
1228 
1229     cuDriverGetVersion(&TmpInt);
1230     printf("    CUDA Driver Version: \t\t%d \n", TmpInt);
1231     printf("    CUDA Device Number: \t\t%d \n", device_id);
1232     checkResult(cuDeviceGetName(TmpChar, 1000, Device),
1233                 "Error returned from cuDeviceGetName\n");
1234     printf("    Device Name: \t\t\t%s \n", TmpChar);
1235     checkResult(cuDeviceTotalMem(&TmpSt, Device),
1236                 "Error returned from cuDeviceTotalMem\n");
1237     printf("    Global Memory Size: \t\t%zu bytes \n", TmpSt);
1238     checkResult(cuDeviceGetAttribute(
1239                     &TmpInt, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, Device),
1240                 "Error returned from cuDeviceGetAttribute\n");
1241     printf("    Number of Multiprocessors: \t\t%d \n", TmpInt);
1242     checkResult(
1243         cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, Device),
1244         "Error returned from cuDeviceGetAttribute\n");
1245     printf("    Concurrent Copy and Execution: \t%s \n", BOOL2TEXT(TmpInt));
1246     checkResult(cuDeviceGetAttribute(
1247                     &TmpInt, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, Device),
1248                 "Error returned from cuDeviceGetAttribute\n");
1249     printf("    Total Constant Memory: \t\t%d bytes\n", TmpInt);
1250     checkResult(
1251         cuDeviceGetAttribute(
1252             &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, Device),
1253         "Error returned from cuDeviceGetAttribute\n");
1254     printf("    Max Shared Memory per Block: \t%d bytes \n", TmpInt);
1255     checkResult(
1256         cuDeviceGetAttribute(
1257             &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, Device),
1258         "Error returned from cuDeviceGetAttribute\n");
1259     printf("    Registers per Block: \t\t%d \n", TmpInt);
1260     checkResult(
1261         cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device),
1262         "Error returned from cuDeviceGetAttribute\n");
1263     printf("    Warp Size: \t\t\t\t%d Threads \n", TmpInt);
1264     checkResult(cuDeviceGetAttribute(
1265                     &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, Device),
1266                 "Error returned from cuDeviceGetAttribute\n");
1267     printf("    Maximum Threads per Block: \t\t%d \n", TmpInt);
1268     checkResult(cuDeviceGetAttribute(
1269                     &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device),
1270                 "Error returned from cuDeviceGetAttribute\n");
1271     checkResult(cuDeviceGetAttribute(
1272                     &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, Device),
1273                 "Error returned from cuDeviceGetAttribute\n");
1274     checkResult(cuDeviceGetAttribute(
1275                     &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, Device),
1276                 "Error returned from cuDeviceGetAttribute\n");
1277     printf("    Maximum Block Dimensions: \t\t%d, %d, %d \n", TmpInt, TmpInt2,
1278            TmpInt3);
1279     checkResult(cuDeviceGetAttribute(
1280                     &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, Device),
1281                 "Error returned from cuDeviceGetAttribute\n");
1282     checkResult(cuDeviceGetAttribute(
1283                     &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, Device),
1284                 "Error returned from cuDeviceGetAttribute\n");
1285     checkResult(cuDeviceGetAttribute(
1286                     &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, Device),
1287                 "Error returned from cuDeviceGetAttribute\n");
1288     printf("    Maximum Grid Dimensions: \t\t%d x %d x %d \n", TmpInt, TmpInt2,
1289            TmpInt3);
1290     checkResult(
1291         cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_PITCH, Device),
1292         "Error returned from cuDeviceGetAttribute\n");
1293     printf("    Maximum Memory Pitch: \t\t%d bytes \n", TmpInt);
1294     checkResult(cuDeviceGetAttribute(
1295                     &TmpInt, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, Device),
1296                 "Error returned from cuDeviceGetAttribute\n");
1297     printf("    Texture Alignment: \t\t\t%d bytes \n", TmpInt);
1298     checkResult(
1299         cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, Device),
1300         "Error returned from cuDeviceGetAttribute\n");
1301     printf("    Clock Rate: \t\t\t%d kHz\n", TmpInt);
1302     checkResult(cuDeviceGetAttribute(
1303                     &TmpInt, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, Device),
1304                 "Error returned from cuDeviceGetAttribute\n");
1305     printf("    Execution Timeout: \t\t\t%s \n", BOOL2TEXT(TmpInt));
1306     checkResult(
1307         cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_INTEGRATED, Device),
1308         "Error returned from cuDeviceGetAttribute\n");
1309     printf("    Integrated Device: \t\t\t%s \n", BOOL2TEXT(TmpInt));
1310     checkResult(cuDeviceGetAttribute(
1311                     &TmpInt, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, Device),
1312                 "Error returned from cuDeviceGetAttribute\n");
1313     printf("    Can Map Host Memory: \t\t%s \n", BOOL2TEXT(TmpInt));
1314     checkResult(
1315         cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, Device),
1316         "Error returned from cuDeviceGetAttribute\n");
1317     if (TmpInt == CU_COMPUTEMODE_DEFAULT)
1318       TmpStr = "DEFAULT";
1319     else if (TmpInt == CU_COMPUTEMODE_PROHIBITED)
1320       TmpStr = "PROHIBITED";
1321     else if (TmpInt == CU_COMPUTEMODE_EXCLUSIVE_PROCESS)
1322       TmpStr = "EXCLUSIVE PROCESS";
1323     else
1324       TmpStr = "unknown";
1325     printf("    Compute Mode: \t\t\t%s \n", TmpStr.c_str());
1326     checkResult(cuDeviceGetAttribute(
1327                     &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, Device),
1328                 "Error returned from cuDeviceGetAttribute\n");
1329     printf("    Concurrent Kernels: \t\t%s \n", BOOL2TEXT(TmpInt));
1330     checkResult(
1331         cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, Device),
1332         "Error returned from cuDeviceGetAttribute\n");
1333     printf("    ECC Enabled: \t\t\t%s \n", BOOL2TEXT(TmpInt));
1334     checkResult(cuDeviceGetAttribute(
1335                     &TmpInt, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, Device),
1336                 "Error returned from cuDeviceGetAttribute\n");
1337     printf("    Memory Clock Rate: \t\t\t%d kHz\n", TmpInt);
1338     checkResult(
1339         cuDeviceGetAttribute(
1340             &TmpInt, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, Device),
1341         "Error returned from cuDeviceGetAttribute\n");
1342     printf("    Memory Bus Width: \t\t\t%d bits\n", TmpInt);
1343     checkResult(cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE,
1344                                      Device),
1345                 "Error returned from cuDeviceGetAttribute\n");
1346     printf("    L2 Cache Size: \t\t\t%d bytes \n", TmpInt);
1347     checkResult(cuDeviceGetAttribute(
1348                     &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
1349                     Device),
1350                 "Error returned from cuDeviceGetAttribute\n");
1351     printf("    Max Threads Per SMP: \t\t%d \n", TmpInt);
1352     checkResult(cuDeviceGetAttribute(
1353                     &TmpInt, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, Device),
1354                 "Error returned from cuDeviceGetAttribute\n");
1355     printf("    Async Engines: \t\t\t%s (%d) \n", BOOL2TEXT(TmpInt), TmpInt);
1356     checkResult(cuDeviceGetAttribute(
1357                     &TmpInt, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, Device),
1358                 "Error returned from cuDeviceGetAttribute\n");
1359     printf("    Unified Addressing: \t\t%s \n", BOOL2TEXT(TmpInt));
1360     checkResult(cuDeviceGetAttribute(
1361                     &TmpInt, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, Device),
1362                 "Error returned from cuDeviceGetAttribute\n");
1363     printf("    Managed Memory: \t\t\t%s \n", BOOL2TEXT(TmpInt));
1364     checkResult(
1365         cuDeviceGetAttribute(
1366             &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, Device),
1367         "Error returned from cuDeviceGetAttribute\n");
1368     printf("    Concurrent Managed Memory: \t\t%s \n", BOOL2TEXT(TmpInt));
1369     checkResult(
1370         cuDeviceGetAttribute(
1371             &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, Device),
1372         "Error returned from cuDeviceGetAttribute\n");
1373     printf("    Preemption Supported: \t\t%s \n", BOOL2TEXT(TmpInt));
1374     checkResult(cuDeviceGetAttribute(
1375                     &TmpInt, CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, Device),
1376                 "Error returned from cuDeviceGetAttribute\n");
1377     printf("    Cooperative Launch: \t\t%s \n", BOOL2TEXT(TmpInt));
1378     checkResult(cuDeviceGetAttribute(
1379                     &TmpInt, CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD, Device),
1380                 "Error returned from cuDeviceGetAttribute\n");
1381     printf("    Multi-Device Boards: \t\t%s \n", BOOL2TEXT(TmpInt));
1382     checkResult(
1383         cuDeviceGetAttribute(
1384             &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, Device),
1385         "Error returned from cuDeviceGetAttribute\n");
1386     checkResult(
1387         cuDeviceGetAttribute(
1388             &TmpInt2, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, Device),
1389         "Error returned from cuDeviceGetAttribute\n");
1390     printf("    Compute Capabilities: \t\t%d%d \n", TmpInt, TmpInt2);
1391   }
1392 
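  /// Acquire a CUevent from the event pool and hand it back to the caller
  /// through \p P.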
1393   int createEvent(void **P) {
1394     CUevent Event = nullptr;
1395     if (EventPool.acquire(Event) != OFFLOAD_SUCCESS)
1396       return OFFLOAD_FAIL;
1397     *P = Event;
1398     return OFFLOAD_SUCCESS;
1399   }
1400 
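  /// Return a previously created event to the pool so it can be reused.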
1401   int destroyEvent(void *EventPtr) {
1402     EventPool.release(reinterpret_cast<CUevent>(EventPtr));
1403     return OFFLOAD_SUCCESS;
1404   }
1405 
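  /// Enqueue a wait on \p EventPtr into the stream associated with
  /// \p AsyncInfo; work submitted to that stream afterwards only runs once the
  /// event has completed.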
1406   int waitEvent(const int DeviceId, __tgt_async_info *AsyncInfo,
1407                 void *EventPtr) const {
1408     CUstream Stream = getStream(DeviceId, AsyncInfo);
1409     CUevent Event = reinterpret_cast<CUevent>(EventPtr);
1410 
1411     // We don't use CU_EVENT_WAIT_DEFAULT here because it is only available
1412     // from a specific CUDA version onwards, where it is defined as 0x0. In
1413     // earlier versions the CUDA API requires the flags argument to be 0x0.
1414     CUresult Err = cuStreamWaitEvent(Stream, Event, 0);
1415     if (Err != CUDA_SUCCESS) {
1416       DP("Error when waiting event. stream = " DPxMOD ", event = " DPxMOD "\n",
1417          DPxPTR(Stream), DPxPTR(Event));
1418       CUDA_ERR_STRING(Err);
1419       return OFFLOAD_FAIL;
1420     }
1421 
1422     return OFFLOAD_SUCCESS;
1423   }
1424 
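  /// Hand the stream owned by \p AsyncInfo back to the per-device stream pool
  /// and clear the queue pointer.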
1425   int releaseAsyncInfo(int DeviceId, __tgt_async_info *AsyncInfo) const {
1426     if (AsyncInfo->Queue) {
1427       StreamPool[DeviceId]->release(
1428           reinterpret_cast<CUstream>(AsyncInfo->Queue));
1429       AsyncInfo->Queue = nullptr;
1430     }
1431 
1432     return OFFLOAD_SUCCESS;
1433   }
1434 
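  /// Allocate a fresh __tgt_async_info for \p DeviceId and eagerly attach a
  /// stream to it so callers can start enqueuing work right away.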
1435   int initAsyncInfo(int DeviceId, __tgt_async_info **AsyncInfo) const {
1436     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
1437     if (!checkResult(Err, "Error returned from cuCtxSetCurrent"))
1438       return OFFLOAD_FAIL;
1439 
1440     *AsyncInfo = new __tgt_async_info;
1441     getStream(DeviceId, *AsyncInfo);
1442     return OFFLOAD_SUCCESS;
1443   }
1444 
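  /// Populate \p DeviceInfo with the CUDA context and device handle of
  /// \p DeviceId; on failure the CUDA error string is reported via \p ErrStr.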
1445   int initDeviceInfo(int DeviceId, __tgt_device_info *DeviceInfo,
1446                      const char **ErrStr) const {
1447     assert(DeviceInfo && "DeviceInfo is nullptr");
1448 
1449     if (!DeviceInfo->Context)
1450       DeviceInfo->Context = DeviceData[DeviceId].Context;
1451     if (!DeviceInfo->Device) {
1452       CUdevice Dev;
1453       CUresult Err = cuDeviceGet(&Dev, DeviceId);
1454       if (Err == CUDA_SUCCESS) {
1455         DeviceInfo->Device = reinterpret_cast<void *>(Dev);
1456       } else {
1457         cuGetErrorString(Err, ErrStr);
1458         return OFFLOAD_FAIL;
1459       }
1460     }
1461     return OFFLOAD_SUCCESS;
1462   }
1463 };
1464 
1465 DeviceRTLTy DeviceRTL;
1466 } // namespace
1467 
1468 // Exposed library API functions
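//
// libomptarget typically drives these entry points roughly in this order:
// is_valid_binary, init_device and load_binary first, followed by
// data_alloc/data_submit, run_target_*_region, data_retrieve and data_delete
// as target regions execute.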
1469 #ifdef __cplusplus
1470 extern "C" {
1471 #endif
1472 
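// Report whether the device image is an ELF binary for the CUDA machine
// (EM_CUDA, value 190).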
1473 int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
1474   return elf_check_machine(image, /* EM_CUDA */ 190);
1475 }
1476 
1477 int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); }
1478 
1479 int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
1480   DP("Init requires flags to %" PRId64 "\n", RequiresFlags);
1481   DeviceRTL.setRequiresFlag(RequiresFlags);
1482   return RequiresFlags;
1483 }
1484 
1485 int32_t __tgt_rtl_is_data_exchangable(int32_t src_dev_id, int32_t dst_dev_id) {
1486   if (DeviceRTL.isValidDeviceId(src_dev_id) &&
1487       DeviceRTL.isValidDeviceId(dst_dev_id))
1488     return 1;
1489 
1490   return 0;
1491 }
1492 
1493 int32_t __tgt_rtl_init_device(int32_t device_id) {
1494   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1495 
1496   return DeviceRTL.initDevice(device_id);
1497 }
1498 
1499 __tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
1500                                           __tgt_device_image *image) {
1501   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1502 
1503   return DeviceRTL.loadBinary(device_id, image);
1504 }
1505 
1506 void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *,
1507                            int32_t kind) {
1508   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1509 
1510   return DeviceRTL.dataAlloc(device_id, size, (TargetAllocTy)kind);
1511 }
1512 
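// Synchronous host-to-device copy: issue the asynchronous transfer on a
// temporary async info object, then block until it completes.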
1513 int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
1514                               int64_t size) {
1515   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1516 
1517   __tgt_async_info AsyncInfo;
1518   const int32_t rc = __tgt_rtl_data_submit_async(device_id, tgt_ptr, hst_ptr,
1519                                                  size, &AsyncInfo);
1520   if (rc != OFFLOAD_SUCCESS)
1521     return OFFLOAD_FAIL;
1522 
1523   return __tgt_rtl_synchronize(device_id, &AsyncInfo);
1524 }
1525 
1526 int32_t __tgt_rtl_data_submit_async(int32_t device_id, void *tgt_ptr,
1527                                     void *hst_ptr, int64_t size,
1528                                     __tgt_async_info *async_info_ptr) {
1529   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1530   assert(async_info_ptr && "async_info_ptr is nullptr");
1531 
1532   return DeviceRTL.dataSubmit(device_id, tgt_ptr, hst_ptr, size,
1533                               async_info_ptr);
1534 }
1535 
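// Synchronous device-to-host copy, implemented as the asynchronous transfer
// followed by an explicit synchronize.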
1536 int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
1537                                 int64_t size) {
1538   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1539 
1540   __tgt_async_info AsyncInfo;
1541   const int32_t rc = __tgt_rtl_data_retrieve_async(device_id, hst_ptr, tgt_ptr,
1542                                                    size, &AsyncInfo);
1543   if (rc != OFFLOAD_SUCCESS)
1544     return OFFLOAD_FAIL;
1545 
1546   return __tgt_rtl_synchronize(device_id, &AsyncInfo);
1547 }
1548 
1549 int32_t __tgt_rtl_data_retrieve_async(int32_t device_id, void *hst_ptr,
1550                                       void *tgt_ptr, int64_t size,
1551                                       __tgt_async_info *async_info_ptr) {
1552   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1553   assert(async_info_ptr && "async_info_ptr is nullptr");
1554 
1555   return DeviceRTL.dataRetrieve(device_id, hst_ptr, tgt_ptr, size,
1556                                 async_info_ptr);
1557 }
1558 
1559 int32_t __tgt_rtl_data_exchange_async(int32_t src_dev_id, void *src_ptr,
1560                                       int32_t dst_dev_id, void *dst_ptr,
1561                                       int64_t size,
1562                                       __tgt_async_info *AsyncInfo) {
1563   assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
1564   assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");
1565   assert(AsyncInfo && "AsyncInfo is nullptr");
1566 
1567   return DeviceRTL.dataExchange(src_dev_id, src_ptr, dst_dev_id, dst_ptr, size,
1568                                 AsyncInfo);
1569 }
1570 
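// Synchronous device-to-device copy built on top of the asynchronous variant.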
1571 int32_t __tgt_rtl_data_exchange(int32_t src_dev_id, void *src_ptr,
1572                                 int32_t dst_dev_id, void *dst_ptr,
1573                                 int64_t size) {
1574   assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
1575   assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");
1576 
1577   __tgt_async_info AsyncInfo;
1578   const int32_t rc = __tgt_rtl_data_exchange_async(
1579       src_dev_id, src_ptr, dst_dev_id, dst_ptr, size, &AsyncInfo);
1580   if (rc != OFFLOAD_SUCCESS)
1581     return OFFLOAD_FAIL;
1582 
1583   return __tgt_rtl_synchronize(src_dev_id, &AsyncInfo);
1584 }
1585 
1586 int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
1587   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1588 
1589   return DeviceRTL.dataDelete(device_id, tgt_ptr);
1590 }
1591 
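// Synchronous kernel launch: run the asynchronous variant, then wait for the
// stream to drain before returning.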
1592 int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
1593                                          void **tgt_args,
1594                                          ptrdiff_t *tgt_offsets,
1595                                          int32_t arg_num, int32_t team_num,
1596                                          int32_t thread_limit,
1597                                          uint64_t loop_tripcount) {
1598   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1599 
1600   __tgt_async_info AsyncInfo;
1601   const int32_t rc = __tgt_rtl_run_target_team_region_async(
1602       device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
1603       thread_limit, loop_tripcount, &AsyncInfo);
1604   if (rc != OFFLOAD_SUCCESS)
1605     return OFFLOAD_FAIL;
1606 
1607   return __tgt_rtl_synchronize(device_id, &AsyncInfo);
1608 }
1609 
1610 int32_t __tgt_rtl_run_target_team_region_async(
1611     int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
1612     ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
1613     int32_t thread_limit, uint64_t loop_tripcount,
1614     __tgt_async_info *async_info_ptr) {
1615   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1616 
1617   return DeviceRTL.runTargetTeamRegion(
1618       device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
1619       thread_limit, loop_tripcount, async_info_ptr);
1620 }
1621 
1622 int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
1623                                     void **tgt_args, ptrdiff_t *tgt_offsets,
1624                                     int32_t arg_num) {
1625   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1626 
1627   __tgt_async_info AsyncInfo;
1628   const int32_t rc = __tgt_rtl_run_target_region_async(
1629       device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, &AsyncInfo);
1630   if (rc != OFFLOAD_SUCCESS)
1631     return OFFLOAD_FAIL;
1632 
1633   return __tgt_rtl_synchronize(device_id, &AsyncInfo);
1634 }
1635 
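// A plain target region reuses the team-region path with a single team, a
// thread limit of one and no loop trip count.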
1636 int32_t __tgt_rtl_run_target_region_async(int32_t device_id,
1637                                           void *tgt_entry_ptr, void **tgt_args,
1638                                           ptrdiff_t *tgt_offsets,
1639                                           int32_t arg_num,
1640                                           __tgt_async_info *async_info_ptr) {
1641   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1642 
1643   return __tgt_rtl_run_target_team_region_async(
1644       device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num,
1645       /* team_num */ 1, /* thread_limit */ 1, /* loop_tripcount */ 0,
1646       async_info_ptr);
1647 }
1648 
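// Block until every operation previously enqueued on the queue carried by
// async_info_ptr has completed.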
1649 int32_t __tgt_rtl_synchronize(int32_t device_id,
1650                               __tgt_async_info *async_info_ptr) {
1651   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1652   assert(async_info_ptr && "async_info_ptr is nullptr");
1653   assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr");
1654 
1655   return DeviceRTL.synchronize(device_id, async_info_ptr);
1656 }
1657 
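// Atomically update the runtime's information verbosity level.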
1658 void __tgt_rtl_set_info_flag(uint32_t NewInfoLevel) {
1659   std::atomic<uint32_t> &InfoLevel = getInfoLevelInternal();
1660   InfoLevel.store(NewInfoLevel);
1661 }
1662 
1663 void __tgt_rtl_print_device_info(int32_t device_id) {
1664   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1665   DeviceRTL.printDeviceInfo(device_id);
1666 }
1667 
1668 int32_t __tgt_rtl_create_event(int32_t device_id, void **event) {
1669   assert(event && "event is nullptr");
1670   return DeviceRTL.createEvent(event);
1671 }
1672 
1673 int32_t __tgt_rtl_record_event(int32_t device_id, void *event_ptr,
1674                                __tgt_async_info *async_info_ptr) {
1675   assert(async_info_ptr && "async_info_ptr is nullptr");
1676   assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr");
1677   assert(event_ptr && "event_ptr is nullptr");
1678 
1679   return recordEvent(event_ptr, async_info_ptr);
1680 }
1681 
1682 int32_t __tgt_rtl_wait_event(int32_t device_id, void *event_ptr,
1683                              __tgt_async_info *async_info_ptr) {
1684   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1685   assert(async_info_ptr && "async_info_ptr is nullptr");
1686   assert(event_ptr && "event is nullptr");
1687 
1688   return DeviceRTL.waitEvent(device_id, async_info_ptr, event_ptr);
1689 }
1690 
1691 int32_t __tgt_rtl_sync_event(int32_t device_id, void *event_ptr) {
1692   assert(event_ptr && "event is nullptr");
1693 
1694   return syncEvent(event_ptr);
1695 }
1696 
1697 int32_t __tgt_rtl_destroy_event(int32_t device_id, void *event_ptr) {
1698   assert(event_ptr && "event is nullptr");
1699 
1700   return DeviceRTL.destroyEvent(event_ptr);
1701 }
1702 
1703 int32_t __tgt_rtl_release_async_info(int32_t device_id,
1704                                      __tgt_async_info *async_info) {
1705   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1706   assert(async_info && "async_info is nullptr");
1707 
1708   return DeviceRTL.releaseAsyncInfo(device_id, async_info);
1709 }
1710 
1711 int32_t __tgt_rtl_init_async_info(int32_t device_id,
1712                                   __tgt_async_info **async_info) {
1713   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1714   assert(async_info && "async_info is nullptr");
1715 
1716   return DeviceRTL.initAsyncInfo(device_id, async_info);
1717 }
1718 
1719 int32_t __tgt_rtl_init_device_info(int32_t device_id,
1720                                    __tgt_device_info *device_info_ptr,
1721                                    const char **err_str) {
1722   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1723   assert(device_info_ptr && "device_info_ptr is nullptr");
1724 
1725   return DeviceRTL.initDeviceInfo(device_id, device_info_ptr, err_str);
1726 }
1727 
1728 #ifdef __cplusplus
1729 }
1730 #endif
1731