1 //===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // RTL for CUDA machine
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include <algorithm>
14 #include <cassert>
15 #include <cstddef>
16 #include <cuda.h>
17 #include <list>
18 #include <memory>
19 #include <mutex>
20 #include <string>
21 #include <unordered_map>
22 #include <vector>
23 
24 #include "Debug.h"
25 #include "DeviceEnvironment.h"
26 #include "omptarget.h"
27 #include "omptargetplugin.h"
28 
29 #define TARGET_NAME CUDA
30 #define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"
31 
32 #include "MemoryManager.h"
33 
34 #include "llvm/Frontend/OpenMP/OMPConstants.h"
35 
36 // Utility for retrieving and printing CUDA error string.
37 #ifdef OMPTARGET_DEBUG
38 #define CUDA_ERR_STRING(err)                                                   \
39   do {                                                                         \
40     if (getDebugLevel() > 0) {                                                 \
41       const char *errStr = nullptr;                                            \
42       CUresult errStr_status = cuGetErrorString(err, &errStr);                 \
43       if (errStr_status == CUDA_ERROR_INVALID_VALUE)                           \
44         REPORT("Unrecognized CUDA error code: %d\n", err);                     \
45       else if (errStr_status == CUDA_SUCCESS)                                  \
46         REPORT("CUDA error is: %s\n", errStr);                                 \
47       else {                                                                   \
48         REPORT("Unresolved CUDA error code: %d\n", err);                       \
49         REPORT("Unsuccessful cuGetErrorString return status: %d\n",            \
50                errStr_status);                                                 \
51       }                                                                        \
52     } else {                                                                   \
53       const char *errStr = nullptr;                                            \
54       CUresult errStr_status = cuGetErrorString(err, &errStr);                 \
55       if (errStr_status == CUDA_SUCCESS)                                       \
56         REPORT("%s \n", errStr);                                               \
57     }                                                                          \
58   } while (false)
59 #else // OMPTARGET_DEBUG
60 #define CUDA_ERR_STRING(err)                                                   \
61   do {                                                                         \
62     const char *errStr = nullptr;                                              \
63     CUresult errStr_status = cuGetErrorString(err, &errStr);                   \
64     if (errStr_status == CUDA_SUCCESS)                                         \
65       REPORT("%s \n", errStr);                                                 \
66   } while (false)
67 #endif // OMPTARGET_DEBUG
68 
69 #define BOOL2TEXT(b) ((b) ? "Yes" : "No")
70 
71 #include "elf_common.h"
72 
73 /// Keep entries table per device.
struct FuncOrGblEntryTy {
  /// Table whose begin/end pointers reference Entries; refreshed by
  /// DeviceRTLTy::getOffloadEntriesTable before being handed out.
  __tgt_target_table Table;
  /// Backing storage for the offload entries exposed through Table.
  std::vector<__tgt_offload_entry> Entries;
};
78 
79 /// Use a single entity to encode a kernel and a set of flags.
80 struct KernelTy {
81   CUfunction Func;
82 
83   // execution mode of kernel
84   llvm::omp::OMPTgtExecModeFlags ExecutionMode;
85 
86   /// Maximal number of threads per block for this kernel.
87   int MaxThreadsPerBlock = 0;
88 
89   KernelTy(CUfunction _Func, llvm::omp::OMPTgtExecModeFlags _ExecutionMode)
90       : Func(_Func), ExecutionMode(_ExecutionMode) {}
91 };
92 
93 namespace {
94 bool checkResult(CUresult Err, const char *ErrMsg) {
95   if (Err == CUDA_SUCCESS)
96     return true;
97 
98   REPORT("%s", ErrMsg);
99   CUDA_ERR_STRING(Err);
100   return false;
101 }
102 
103 int memcpyDtoD(const void *SrcPtr, void *DstPtr, int64_t Size,
104                CUstream Stream) {
105   CUresult Err =
106       cuMemcpyDtoDAsync((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, Size, Stream);
107 
108   if (Err != CUDA_SUCCESS) {
109     DP("Error when copying data from device to device. Pointers: src "
110        "= " DPxMOD ", dst = " DPxMOD ", size = %" PRId64 "\n",
111        DPxPTR(SrcPtr), DPxPTR(DstPtr), Size);
112     CUDA_ERR_STRING(Err);
113     return OFFLOAD_FAIL;
114   }
115 
116   return OFFLOAD_SUCCESS;
117 }
118 
119 int recordEvent(void *EventPtr, __tgt_async_info *AsyncInfo) {
120   CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue);
121   CUevent Event = reinterpret_cast<CUevent>(EventPtr);
122 
123   CUresult Err = cuEventRecord(Event, Stream);
124   if (Err != CUDA_SUCCESS) {
125     DP("Error when recording event. stream = " DPxMOD ", event = " DPxMOD "\n",
126        DPxPTR(Stream), DPxPTR(Event));
127     CUDA_ERR_STRING(Err);
128     return OFFLOAD_FAIL;
129   }
130 
131   return OFFLOAD_SUCCESS;
132 }
133 
134 int syncEvent(void *EventPtr) {
135   CUevent Event = reinterpret_cast<CUevent>(EventPtr);
136 
137   CUresult Err = cuEventSynchronize(Event);
138   if (Err != CUDA_SUCCESS) {
139     DP("Error when syncing event = " DPxMOD "\n", DPxPTR(Event));
140     CUDA_ERR_STRING(Err);
141     return OFFLOAD_FAIL;
142   }
143 
144   return OFFLOAD_SUCCESS;
145 }
146 
147 namespace {
148 
149 // Structure contains per-device data
struct DeviceDataTy {
  /// List that contains all the kernels.
  std::list<KernelTy> KernelsList;

  /// Per-image entry tables; a fresh element is appended each time
  /// clearOffloadEntriesTable runs (i.e. per loadBinary call).
  std::list<FuncOrGblEntryTy> FuncGblEntries;

  /// Retained primary context for this device; nullptr until initDevice.
  CUcontext Context = nullptr;
  // Device properties (queried in initDevice, with library defaults used
  // when the corresponding attribute query fails).
  int ThreadsPerBlock = 0; // Max threads per block (possibly env/hard capped).
  int BlocksPerGrid = 0;   // Max blocks per grid (possibly OMP_TEAM_LIMIT capped).
  int WarpSize = 0;        // Device warp size; falls back to 32 on query failure.
  // OpenMP properties: defaults used when a kernel launch does not specify.
  int NumTeams = 0;
  int NumThreads = 0;
};
165 
166 /// Resource allocator where \p T is the resource type.
167 /// Functions \p create and \p destroy return OFFLOAD_SUCCESS and OFFLOAD_FAIL
168 /// accordingly. The implementation should not raise any exception.
template <typename T> struct AllocatorTy {
  /// Bind the allocator to context \p C; derived classes make it current
  /// before issuing driver calls.
  AllocatorTy(CUcontext C) noexcept : Context(C) {}
  using ElementTy = T;

  virtual ~AllocatorTy() {}

  /// Create a resource and assign to R.
  virtual int create(T &R) noexcept = 0;
  /// Destroy the resource.
  virtual int destroy(T) noexcept = 0;

protected:
  /// Context used by the concrete allocators when creating/destroying.
  CUcontext Context;
};
183 
184 /// Allocator for CUstream.
185 struct StreamAllocatorTy final : public AllocatorTy<CUstream> {
186   StreamAllocatorTy(CUcontext C) noexcept : AllocatorTy<CUstream>(C) {}
187 
188   /// See AllocatorTy<T>::create.
189   int create(CUstream &Stream) noexcept override {
190     if (!checkResult(cuCtxSetCurrent(Context),
191                      "Error returned from cuCtxSetCurrent\n"))
192       return OFFLOAD_FAIL;
193 
194     if (!checkResult(cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING),
195                      "Error returned from cuStreamCreate\n"))
196       return OFFLOAD_FAIL;
197 
198     return OFFLOAD_SUCCESS;
199   }
200 
201   /// See AllocatorTy<T>::destroy.
202   int destroy(CUstream Stream) noexcept override {
203     if (!checkResult(cuCtxSetCurrent(Context),
204                      "Error returned from cuCtxSetCurrent\n"))
205       return OFFLOAD_FAIL;
206     if (!checkResult(cuStreamDestroy(Stream),
207                      "Error returned from cuStreamDestroy\n"))
208       return OFFLOAD_FAIL;
209 
210     return OFFLOAD_SUCCESS;
211   }
212 };
213 
214 /// Allocator for CUevent.
215 struct EventAllocatorTy final : public AllocatorTy<CUevent> {
216   EventAllocatorTy(CUcontext C) noexcept : AllocatorTy<CUevent>(C) {}
217 
218   /// See AllocatorTy<T>::create.
219   int create(CUevent &Event) noexcept override {
220     if (!checkResult(cuEventCreate(&Event, CU_EVENT_DEFAULT),
221                      "Error returned from cuEventCreate\n"))
222       return OFFLOAD_FAIL;
223 
224     return OFFLOAD_SUCCESS;
225   }
226 
227   /// See AllocatorTy<T>::destroy.
228   int destroy(CUevent Event) noexcept override {
229     if (!checkResult(cuEventDestroy(Event),
230                      "Error returned from cuEventDestroy\n"))
231       return OFFLOAD_FAIL;
232 
233     return OFFLOAD_SUCCESS;
234   }
235 };
236 
237 /// A generic pool of resources where \p T is the resource type.
238 /// \p T should be copyable as the object is stored in \p std::vector .
239 template <typename AllocTy> class ResourcePoolTy {
240   using ElementTy = typename AllocTy::ElementTy;
241   /// Index of the next available resource.
242   size_t Next = 0;
243   /// Mutex to guard the pool.
244   std::mutex Mutex;
245   /// Pool of resources.
246   std::vector<ElementTy> Resources;
247   /// A reference to the corresponding allocator.
248   AllocTy Allocator;
249 
250   /// If `Resources` is used up, we will fill in more resources. It assumes that
251   /// the new size `Size` should be always larger than the current size.
252   bool resize(size_t Size) {
253     auto CurSize = Resources.size();
254     assert(Size > CurSize && "Unexpected smaller size");
255     Resources.reserve(Size);
256     for (auto I = CurSize; I < Size; ++I) {
257       ElementTy NewItem;
258       int Ret = Allocator.create(NewItem);
259       if (Ret != OFFLOAD_SUCCESS)
260         return false;
261       Resources.push_back(NewItem);
262     }
263     return true;
264   }
265 
266 public:
267   ResourcePoolTy(AllocTy &&A, size_t Size = 0) noexcept
268       : Allocator(std::move(A)) {
269     if (Size)
270       (void)resize(Size);
271   }
272 
273   ~ResourcePoolTy() noexcept { clear(); }
274 
275   /// Get a resource from pool. `Next` always points to the next available
276   /// resource. That means, `[0, next-1]` have been assigned, and `[id,]` are
277   /// still available. If there is no resource left, we will ask for more. Each
278   /// time a resource is assigned, the id will increase one.
279   /// xxxxxs+++++++++
280   ///      ^
281   ///      Next
282   /// After assignment, the pool becomes the following and s is assigned.
283   /// xxxxxs+++++++++
284   ///       ^
285   ///       Next
286   int acquire(ElementTy &R) noexcept {
287     std::lock_guard<std::mutex> LG(Mutex);
288     if (Next == Resources.size()) {
289       auto NewSize = Resources.size() ? Resources.size() * 2 : 1;
290       if (!resize(NewSize))
291         return OFFLOAD_FAIL;
292     }
293 
294     assert(Next < Resources.size());
295 
296     R = Resources[Next++];
297 
298     return OFFLOAD_SUCCESS;
299   }
300 
301   /// Return the resource back to the pool. When we return a resource, we need
302   /// to first decrease `Next`, and then copy the resource back. It is worth
303   /// noting that, the order of resources return might be different from that
304   /// they're assigned, that saying, at some point, there might be two identical
305   /// resources.
306   /// xxax+a+++++
307   ///     ^
308   ///     Next
309   /// However, it doesn't matter, because they're always on the two sides of
310   /// `Next`. The left one will in the end be overwritten by another resource.
311   /// Therefore, after several execution, the order of pool might be different
312   /// from its initial state.
313   void release(ElementTy R) noexcept {
314     std::lock_guard<std::mutex> LG(Mutex);
315     Resources[--Next] = R;
316   }
317 
318   /// Released all stored resources and clear the pool.
319   /// Note: This function is not thread safe. Be sure to guard it if necessary.
320   void clear() noexcept {
321     for (auto &R : Resources)
322       (void)Allocator.destroy(R);
323     Resources.clear();
324   }
325 };
326 
327 } // namespace
328 
329 class DeviceRTLTy {
330   int NumberOfDevices;
331   // OpenMP environment properties
332   int EnvNumTeams;
333   int EnvTeamLimit;
334   int EnvTeamThreadLimit;
335   // OpenMP requires flags
336   int64_t RequiresFlags;
337   // Amount of dynamic shared memory to use at launch.
338   uint64_t DynamicMemorySize;
339 
340   /// Number of initial streams for each device.
341   int NumInitialStreams = 32;
342 
343   /// Number of initial events for each device.
344   int NumInitialEvents = 8;
345 
346   static constexpr const int32_t HardThreadLimit = 1024;
347   static constexpr const int32_t DefaultNumTeams = 128;
348   static constexpr const int32_t DefaultNumThreads = 128;
349 
350   using StreamPoolTy = ResourcePoolTy<StreamAllocatorTy>;
351   std::vector<std::unique_ptr<StreamPoolTy>> StreamPool;
352 
353   using EventPoolTy = ResourcePoolTy<EventAllocatorTy>;
354   std::vector<std::unique_ptr<EventPoolTy>> EventPool;
355 
356   std::vector<DeviceDataTy> DeviceData;
357   std::vector<CUmodule> Modules;
358 
359   /// Vector of flags indicating the initalization status of all associated
360   /// devices.
361   std::vector<bool> InitializedFlags;
362 
363   /// A class responsible for interacting with device native runtime library to
364   /// allocate and free memory.
365   class CUDADeviceAllocatorTy : public DeviceAllocatorTy {
366     const int DeviceId;
367     const std::vector<DeviceDataTy> &DeviceData;
368     std::unordered_map<void *, TargetAllocTy> HostPinnedAllocs;
369 
370   public:
371     CUDADeviceAllocatorTy(int DeviceId, std::vector<DeviceDataTy> &DeviceData)
372         : DeviceId(DeviceId), DeviceData(DeviceData) {}
373 
374     void *allocate(size_t Size, void *, TargetAllocTy Kind) override {
375       if (Size == 0)
376         return nullptr;
377 
378       CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
379       if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
380         return nullptr;
381 
382       void *MemAlloc = nullptr;
383       switch (Kind) {
384       case TARGET_ALLOC_DEFAULT:
385       case TARGET_ALLOC_DEVICE:
386         CUdeviceptr DevicePtr;
387         Err = cuMemAlloc(&DevicePtr, Size);
388         MemAlloc = (void *)DevicePtr;
389         if (!checkResult(Err, "Error returned from cuMemAlloc\n"))
390           return nullptr;
391         break;
392       case TARGET_ALLOC_HOST:
393         void *HostPtr;
394         Err = cuMemAllocHost(&HostPtr, Size);
395         MemAlloc = HostPtr;
396         if (!checkResult(Err, "Error returned from cuMemAllocHost\n"))
397           return nullptr;
398         HostPinnedAllocs[MemAlloc] = Kind;
399         break;
400       case TARGET_ALLOC_SHARED:
401         CUdeviceptr SharedPtr;
402         Err = cuMemAllocManaged(&SharedPtr, Size, CU_MEM_ATTACH_GLOBAL);
403         MemAlloc = (void *)SharedPtr;
404         if (!checkResult(Err, "Error returned from cuMemAllocManaged\n"))
405           return nullptr;
406         break;
407       }
408 
409       return MemAlloc;
410     }
411 
412     int free(void *TgtPtr) override {
413       CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
414       if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
415         return OFFLOAD_FAIL;
416 
417       // Host pinned memory must be freed differently.
418       TargetAllocTy Kind =
419           (HostPinnedAllocs.find(TgtPtr) == HostPinnedAllocs.end())
420               ? TARGET_ALLOC_DEFAULT
421               : TARGET_ALLOC_HOST;
422       switch (Kind) {
423       case TARGET_ALLOC_DEFAULT:
424       case TARGET_ALLOC_DEVICE:
425       case TARGET_ALLOC_SHARED:
426         Err = cuMemFree((CUdeviceptr)TgtPtr);
427         if (!checkResult(Err, "Error returned from cuMemFree\n"))
428           return OFFLOAD_FAIL;
429         break;
430       case TARGET_ALLOC_HOST:
431         Err = cuMemFreeHost(TgtPtr);
432         if (!checkResult(Err, "Error returned from cuMemFreeHost\n"))
433           return OFFLOAD_FAIL;
434         break;
435       }
436 
437       return OFFLOAD_SUCCESS;
438     }
439   };
440 
441   /// A vector of device allocators
442   std::vector<CUDADeviceAllocatorTy> DeviceAllocators;
443 
  /// A vector of memory managers. Since the memory manager is non-copyable
  /// and non-removable, we wrap each one in a std::unique_ptr.
446   std::vector<std::unique_ptr<MemoryManagerTy>> MemoryManagers;
447 
448   /// Whether use memory manager
449   bool UseMemoryManager = true;
450 
451   // Record entry point associated with device
452   void addOffloadEntry(const int DeviceId, const __tgt_offload_entry entry) {
453     FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
454     E.Entries.push_back(entry);
455   }
456 
457   // Return a pointer to the entry associated with the pointer
458   const __tgt_offload_entry *getOffloadEntry(const int DeviceId,
459                                              const void *Addr) const {
460     for (const __tgt_offload_entry &Itr :
461          DeviceData[DeviceId].FuncGblEntries.back().Entries)
462       if (Itr.addr == Addr)
463         return &Itr;
464 
465     return nullptr;
466   }
467 
468   // Return the pointer to the target entries table
469   __tgt_target_table *getOffloadEntriesTable(const int DeviceId) {
470     FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
471 
472     if (E.Entries.empty())
473       return nullptr;
474 
475     // Update table info according to the entries and return the pointer
476     E.Table.EntriesBegin = E.Entries.data();
477     E.Table.EntriesEnd = E.Entries.data() + E.Entries.size();
478 
479     return &E.Table;
480   }
481 
482   // Clear entries table for a device
483   void clearOffloadEntriesTable(const int DeviceId) {
484     DeviceData[DeviceId].FuncGblEntries.emplace_back();
485     FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
486     E.Entries.clear();
487     E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr;
488   }
489 
490 public:
491   CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfo) const {
492     assert(AsyncInfo && "AsyncInfo is nullptr");
493 
494     if (!AsyncInfo->Queue) {
495       CUstream S;
496       if (StreamPool[DeviceId]->acquire(S) != OFFLOAD_SUCCESS)
497         return nullptr;
498 
499       AsyncInfo->Queue = S;
500     }
501 
502     return reinterpret_cast<CUstream>(AsyncInfo->Queue);
503   }
504 
505   // This class should not be copied
506   DeviceRTLTy(const DeviceRTLTy &) = delete;
507   DeviceRTLTy(DeviceRTLTy &&) = delete;
508 
509   DeviceRTLTy()
510       : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1),
511         EnvTeamThreadLimit(-1), RequiresFlags(OMP_REQ_UNDEFINED),
512         DynamicMemorySize(0) {
513 
514     DP("Start initializing CUDA\n");
515 
516     CUresult Err = cuInit(0);
517     if (Err == CUDA_ERROR_INVALID_HANDLE) {
518       // Can't call cuGetErrorString if dlsym failed
519       DP("Failed to load CUDA shared library\n");
520       return;
521     }
522     if (!checkResult(Err, "Error returned from cuInit\n")) {
523       return;
524     }
525 
526     Err = cuDeviceGetCount(&NumberOfDevices);
527     if (!checkResult(Err, "Error returned from cuDeviceGetCount\n"))
528       return;
529 
530     if (NumberOfDevices == 0) {
531       DP("There are no devices supporting CUDA.\n");
532       return;
533     }
534 
535     DeviceData.resize(NumberOfDevices);
536     StreamPool.resize(NumberOfDevices);
537     EventPool.resize(NumberOfDevices);
538 
539     // Get environment variables regarding teams
540     if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) {
541       // OMP_TEAM_LIMIT has been set
542       EnvTeamLimit = std::stoi(EnvStr);
543       DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
544     }
545     if (const char *EnvStr = getenv("OMP_TEAMS_THREAD_LIMIT")) {
546       // OMP_TEAMS_THREAD_LIMIT has been set
547       EnvTeamThreadLimit = std::stoi(EnvStr);
548       DP("Parsed OMP_TEAMS_THREAD_LIMIT=%d\n", EnvTeamThreadLimit);
549     }
550     if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) {
551       // OMP_NUM_TEAMS has been set
552       EnvNumTeams = std::stoi(EnvStr);
553       DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
554     }
555     if (const char *EnvStr = getenv("LIBOMPTARGET_SHARED_MEMORY_SIZE")) {
556       // LIBOMPTARGET_SHARED_MEMORY_SIZE has been set
557       DynamicMemorySize = std::stoi(EnvStr);
558       DP("Parsed LIBOMPTARGET_SHARED_MEMORY_SIZE = %" PRIu64 "\n",
559          DynamicMemorySize);
560     }
561     if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS")) {
562       // LIBOMPTARGET_NUM_INITIAL_STREAMS has been set
563       NumInitialStreams = std::stoi(EnvStr);
564       DP("Parsed LIBOMPTARGET_NUM_INITIAL_STREAMS=%d\n", NumInitialStreams);
565     }
566 
567     for (int I = 0; I < NumberOfDevices; ++I)
568       DeviceAllocators.emplace_back(I, DeviceData);
569 
570     // Get the size threshold from environment variable
571     std::pair<size_t, bool> Res = MemoryManagerTy::getSizeThresholdFromEnv();
572     UseMemoryManager = Res.second;
573     size_t MemoryManagerThreshold = Res.first;
574 
575     if (UseMemoryManager)
576       for (int I = 0; I < NumberOfDevices; ++I)
577         MemoryManagers.emplace_back(std::make_unique<MemoryManagerTy>(
578             DeviceAllocators[I], MemoryManagerThreshold));
579 
580     // We lazily initialize all devices later.
581     InitializedFlags.assign(NumberOfDevices, false);
582   }
583 
584   ~DeviceRTLTy() {
585     for (int DeviceId = 0; DeviceId < NumberOfDevices; ++DeviceId)
586       deinitDevice(DeviceId);
587   }
588 
589   // Check whether a given DeviceId is valid
590   bool isValidDeviceId(const int DeviceId) const {
591     return DeviceId >= 0 && DeviceId < NumberOfDevices;
592   }
593 
  /// Number of CUDA devices detected at construction (0 if cuInit failed).
  int getNumOfDevices() const { return NumberOfDevices; }
595 
  /// Record the OpenMP `requires` clause flags reported by the host runtime.
  void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; }
597 
  /// Initialize device \p DeviceId: retain and bind its primary context,
  /// create the stream/event pools, query hardware limits (with library
  /// defaults as fallbacks), apply env-var caps, and set the default launch
  /// configuration. Returns OFFLOAD_SUCCESS or OFFLOAD_FAIL.
  int initDevice(const int DeviceId) {
    CUdevice Device;

    DP("Getting device %d\n", DeviceId);
    CUresult Err = cuDeviceGet(&Device, DeviceId);
    if (!checkResult(Err, "Error returned from cuDeviceGet\n"))
      return OFFLOAD_FAIL;

    assert(InitializedFlags[DeviceId] == false && "Reinitializing device!");
    // NOTE(review): the flag is set before initialization completes; on a
    // later failure the device remains marked initialized, so deinitDevice()
    // will still run its (pointer-guarded) cleanup for the partial state.
    InitializedFlags[DeviceId] = true;

    // Query the current flags of the primary context and set its flags if
    // it is inactive
    unsigned int FormerPrimaryCtxFlags = 0;
    int FormerPrimaryCtxIsActive = 0;
    Err = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags,
                                     &FormerPrimaryCtxIsActive);
    if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxGetState\n"))
      return OFFLOAD_FAIL;

    if (FormerPrimaryCtxIsActive) {
      // Another user of the primary context got there first; we only warn
      // if its scheduling flags differ from what we would have chosen.
      DP("The primary context is active, no change to its flags\n");
      if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) !=
          CU_CTX_SCHED_BLOCKING_SYNC)
        DP("Warning the current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n");
    } else {
      DP("The primary context is inactive, set its flags to "
         "CU_CTX_SCHED_BLOCKING_SYNC\n");
      Err = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC);
      if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxSetFlags\n"))
        return OFFLOAD_FAIL;
    }

    // Retain the per device primary context and save it to use whenever this
    // device is selected.
    Err = cuDevicePrimaryCtxRetain(&DeviceData[DeviceId].Context, Device);
    if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxRetain\n"))
      return OFFLOAD_FAIL;

    Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    // Initialize the stream pool.
    if (!StreamPool[DeviceId])
      StreamPool[DeviceId] = std::make_unique<StreamPoolTy>(
          StreamAllocatorTy(DeviceData[DeviceId].Context), NumInitialStreams);

    // Initialize the event pool.
    if (!EventPool[DeviceId])
      EventPool[DeviceId] = std::make_unique<EventPoolTy>(
          EventAllocatorTy(DeviceData[DeviceId].Context), NumInitialEvents);

    // Query attributes to determine number of threads/block and blocks/grid.
    int MaxGridDimX;
    Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
                               Device);
    if (Err != CUDA_SUCCESS) {
      DP("Error getting max grid dimension, use default value %d\n",
         DeviceRTLTy::DefaultNumTeams);
      DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::DefaultNumTeams;
    } else {
      DP("Using %d CUDA blocks per grid\n", MaxGridDimX);
      DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX;
    }

    // We are only exploiting threads along the x axis.
    int MaxBlockDimX;
    Err = cuDeviceGetAttribute(&MaxBlockDimX,
                               CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device);
    if (Err != CUDA_SUCCESS) {
      DP("Error getting max block dimension, use default value %d\n",
         DeviceRTLTy::DefaultNumThreads);
      DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads;
    } else {
      DP("Using %d CUDA threads per block\n", MaxBlockDimX);
      DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX;

      // Cap first by OMP_TEAMS_THREAD_LIMIT, then by the plugin's hard limit.
      if (EnvTeamThreadLimit > 0 &&
          DeviceData[DeviceId].ThreadsPerBlock > EnvTeamThreadLimit) {
        DP("Max CUDA threads per block %d exceeds the thread limit %d set by "
           "OMP_TEAMS_THREAD_LIMIT, capping at the limit\n",
           DeviceData[DeviceId].ThreadsPerBlock, EnvTeamThreadLimit);
        DeviceData[DeviceId].ThreadsPerBlock = EnvTeamThreadLimit;
      }
      if (DeviceData[DeviceId].ThreadsPerBlock > DeviceRTLTy::HardThreadLimit) {
        DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
           "capping at the hard limit\n",
           DeviceData[DeviceId].ThreadsPerBlock, DeviceRTLTy::HardThreadLimit);
        DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit;
      }
    }

    // Get and set warp size
    int WarpSize;
    Err =
        cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device);
    if (Err != CUDA_SUCCESS) {
      DP("Error getting warp size, assume default value 32\n");
      DeviceData[DeviceId].WarpSize = 32;
    } else {
      DP("Using warp size %d\n", WarpSize);
      DeviceData[DeviceId].WarpSize = WarpSize;
    }

    // Adjust teams to the env variables
    if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) {
      DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
         EnvTeamLimit);
      DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit;
    }

    // Apply (or read back) per-thread stack and malloc-heap limits.
    // NOTE(review): std::stol of a negative env value would convert to a
    // huge size_t here — assumes the env vars hold non-negative sizes.
    size_t StackLimit;
    size_t HeapLimit;
    if (const char *EnvStr = getenv("LIBOMPTARGET_STACK_SIZE")) {
      StackLimit = std::stol(EnvStr);
      if (cuCtxSetLimit(CU_LIMIT_STACK_SIZE, StackLimit) != CUDA_SUCCESS)
        return OFFLOAD_FAIL;
    } else {
      if (cuCtxGetLimit(&StackLimit, CU_LIMIT_STACK_SIZE) != CUDA_SUCCESS)
        return OFFLOAD_FAIL;
    }
    if (const char *EnvStr = getenv("LIBOMPTARGET_HEAP_SIZE")) {
      HeapLimit = std::stol(EnvStr);
      if (cuCtxSetLimit(CU_LIMIT_MALLOC_HEAP_SIZE, HeapLimit) != CUDA_SUCCESS)
        return OFFLOAD_FAIL;
    } else {
      if (cuCtxGetLimit(&HeapLimit, CU_LIMIT_MALLOC_HEAP_SIZE) != CUDA_SUCCESS)
        return OFFLOAD_FAIL;
    }

    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
         "Device supports up to %d CUDA blocks and %d threads with a "
         "warp size of %d\n",
         DeviceData[DeviceId].BlocksPerGrid,
         DeviceData[DeviceId].ThreadsPerBlock, DeviceData[DeviceId].WarpSize);
    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
         "Device heap size is %d Bytes, device stack size is %d Bytes per "
         "thread\n",
         (int)HeapLimit, (int)StackLimit);

    // Set default number of teams
    if (EnvNumTeams > 0) {
      DP("Default number of teams set according to environment %d\n",
         EnvNumTeams);
      DeviceData[DeviceId].NumTeams = EnvNumTeams;
    } else {
      DeviceData[DeviceId].NumTeams = DeviceRTLTy::DefaultNumTeams;
      DP("Default number of teams set according to library's default %d\n",
         DeviceRTLTy::DefaultNumTeams);
    }

    if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) {
      DP("Default number of teams exceeds device limit, capping at %d\n",
         DeviceData[DeviceId].BlocksPerGrid);
      DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid;
    }

    // Set default number of threads
    DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads;
    DP("Default number of threads set according to library's default %d\n",
       DeviceRTLTy::DefaultNumThreads);
    if (DeviceData[DeviceId].NumThreads >
        DeviceData[DeviceId].ThreadsPerBlock) {
      DP("Default number of threads exceeds device limit, capping at %d\n",
         DeviceData[DeviceId].ThreadsPerBlock);
      DeviceData[DeviceId].NumThreads = DeviceData[DeviceId].ThreadsPerBlock;
    }

    return OFFLOAD_SUCCESS;
  }
769 
770   int deinitDevice(const int DeviceId) {
771     auto IsInitialized = InitializedFlags[DeviceId];
772     if (!IsInitialized)
773       return OFFLOAD_SUCCESS;
774     InitializedFlags[DeviceId] = false;
775 
776     if (UseMemoryManager)
777       MemoryManagers[DeviceId].release();
778 
779     // Close module
780     if (CUmodule &M = Modules[DeviceId])
781       checkResult(cuModuleUnload(M), "Error returned from cuModuleUnload\n");
782 
783     StreamPool[DeviceId].reset();
784     EventPool[DeviceId].reset();
785 
786     // Destroy context
787     DeviceDataTy &D = DeviceData[DeviceId];
788     if (D.Context) {
789       if (checkResult(cuCtxSetCurrent(D.Context),
790                       "Error returned from cuCtxSetCurrent\n")) {
791         CUdevice Device;
792         if (checkResult(cuCtxGetDevice(&Device),
793                         "Error returned from cuCtxGetDevice\n"))
794           checkResult(cuDevicePrimaryCtxRelease(Device),
795                       "Error returned from cuDevicePrimaryCtxRelease\n");
796       }
797     }
798     return OFFLOAD_SUCCESS;
799   }
800 
801   __tgt_target_table *loadBinary(const int DeviceId,
802                                  const __tgt_device_image *Image) {
803     // Set the context we are using
804     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
805     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
806       return nullptr;
807 
808     // Clear the offload table as we are going to create a new one.
809     clearOffloadEntriesTable(DeviceId);
810 
811     // Create the module and extract the function pointers.
812     CUmodule Module;
813     DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart));
814     Err = cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr);
815     if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n"))
816       return nullptr;
817 
818     DP("CUDA module successfully loaded!\n");
819 
820     Modules.push_back(Module);
821 
822     // Find the symbols in the module by name.
823     const __tgt_offload_entry *HostBegin = Image->EntriesBegin;
824     const __tgt_offload_entry *HostEnd = Image->EntriesEnd;
825 
826     std::list<KernelTy> &KernelsList = DeviceData[DeviceId].KernelsList;
827     for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) {
828       if (!E->addr) {
829         // We return nullptr when something like this happens, the host should
830         // have always something in the address to uniquely identify the target
831         // region.
832         DP("Invalid binary: host entry '<null>' (size = %zd)...\n", E->size);
833         return nullptr;
834       }
835 
836       if (E->size) {
837         __tgt_offload_entry Entry = *E;
838         CUdeviceptr CUPtr;
839         size_t CUSize;
840         Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name);
841         // We keep this style here because we need the name
842         if (Err != CUDA_SUCCESS) {
843           REPORT("Loading global '%s' Failed\n", E->name);
844           CUDA_ERR_STRING(Err);
845           return nullptr;
846         }
847 
848         if (CUSize != E->size) {
849           DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name,
850              CUSize, E->size);
851           return nullptr;
852         }
853 
854         DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
855            DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr));
856 
857         Entry.addr = (void *)(CUPtr);
858 
859         // Note: In the current implementation declare target variables
860         // can either be link or to. This means that once unified
861         // memory is activated via the requires directive, the variable
862         // can be used directly from the host in both cases.
863         // TODO: when variables types other than to or link are added,
864         // the below condition should be changed to explicitly
865         // check for to and link variables types:
866         // (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags &
867         // OMP_DECLARE_TARGET_LINK || e->flags == OMP_DECLARE_TARGET_TO))
868         if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
869           // If unified memory is present any target link or to variables
870           // can access host addresses directly. There is no longer a
871           // need for device copies.
872           cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *));
873           DP("Copy linked variable host address (" DPxMOD
874              ") to device address (" DPxMOD ")\n",
875              DPxPTR(*((void **)E->addr)), DPxPTR(CUPtr));
876         }
877 
878         addOffloadEntry(DeviceId, Entry);
879 
880         continue;
881       }
882 
883       CUfunction Func;
884       Err = cuModuleGetFunction(&Func, Module, E->name);
885       // We keep this style here because we need the name
886       if (Err != CUDA_SUCCESS) {
887         REPORT("Loading '%s' Failed\n", E->name);
888         CUDA_ERR_STRING(Err);
889         return nullptr;
890       }
891 
892       DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
893          DPxPTR(E - HostBegin), E->name, DPxPTR(Func));
894 
895       // default value GENERIC (in case symbol is missing from cubin file)
896       llvm::omp::OMPTgtExecModeFlags ExecModeVal;
897       std::string ExecModeNameStr(E->name);
898       ExecModeNameStr += "_exec_mode";
899       const char *ExecModeName = ExecModeNameStr.c_str();
900 
901       CUdeviceptr ExecModePtr;
902       size_t CUSize;
903       Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName);
904       if (Err == CUDA_SUCCESS) {
905         if (CUSize != sizeof(llvm::omp::OMPTgtExecModeFlags)) {
906           DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
907              ExecModeName, CUSize, sizeof(llvm::omp::OMPTgtExecModeFlags));
908           return nullptr;
909         }
910 
911         Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize);
912         if (Err != CUDA_SUCCESS) {
913           REPORT("Error when copying data from device to host. Pointers: "
914                  "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
915                  DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize);
916           CUDA_ERR_STRING(Err);
917           return nullptr;
918         }
919       } else {
920         DP("Loading global exec_mode '%s' - symbol missing, using default "
921            "value GENERIC (1)\n",
922            ExecModeName);
923       }
924 
925       KernelsList.emplace_back(Func, ExecModeVal);
926 
927       __tgt_offload_entry Entry = *E;
928       Entry.addr = &KernelsList.back();
929       addOffloadEntry(DeviceId, Entry);
930     }
931 
932     // send device environment data to the device
933     {
934       // TODO: The device ID used here is not the real device ID used by OpenMP.
935       DeviceEnvironmentTy DeviceEnv{0, static_cast<uint32_t>(NumberOfDevices),
936                                     static_cast<uint32_t>(DeviceId),
937                                     static_cast<uint32_t>(DynamicMemorySize)};
938 
939       if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG"))
940         DeviceEnv.DebugKind = std::stoi(EnvStr);
941 
942       const char *DeviceEnvName = "omptarget_device_environment";
943       CUdeviceptr DeviceEnvPtr;
944       size_t CUSize;
945 
946       Err = cuModuleGetGlobal(&DeviceEnvPtr, &CUSize, Module, DeviceEnvName);
947       if (Err == CUDA_SUCCESS) {
948         if (CUSize != sizeof(DeviceEnv)) {
949           REPORT(
950               "Global device_environment '%s' - size mismatch (%zu != %zu)\n",
951               DeviceEnvName, CUSize, sizeof(int32_t));
952           CUDA_ERR_STRING(Err);
953           return nullptr;
954         }
955 
956         Err = cuMemcpyHtoD(DeviceEnvPtr, &DeviceEnv, CUSize);
957         if (Err != CUDA_SUCCESS) {
958           REPORT("Error when copying data from host to device. Pointers: "
959                  "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
960                  DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize);
961           CUDA_ERR_STRING(Err);
962           return nullptr;
963         }
964 
965         DP("Sending global device environment data %zu bytes\n", CUSize);
966       } else {
967         DP("Finding global device environment '%s' - symbol missing.\n",
968            DeviceEnvName);
969         DP("Continue, considering this is a device RTL which does not accept "
970            "environment setting.\n");
971       }
972     }
973 
974     return getOffloadEntriesTable(DeviceId);
975   }
976 
977   void *dataAlloc(const int DeviceId, const int64_t Size,
978                   const TargetAllocTy Kind) {
979     switch (Kind) {
980     case TARGET_ALLOC_DEFAULT:
981     case TARGET_ALLOC_DEVICE:
982       if (UseMemoryManager)
983         return MemoryManagers[DeviceId]->allocate(Size, nullptr);
984       else
985         return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
986     case TARGET_ALLOC_HOST:
987     case TARGET_ALLOC_SHARED:
988       return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
989     }
990 
991     REPORT("Invalid target data allocation kind or requested allocator not "
992            "implemented yet\n");
993 
994     return nullptr;
995   }
996 
997   int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr,
998                  const int64_t Size, __tgt_async_info *AsyncInfo) const {
999     assert(AsyncInfo && "AsyncInfo is nullptr");
1000 
1001     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
1002     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
1003       return OFFLOAD_FAIL;
1004 
1005     CUstream Stream = getStream(DeviceId, AsyncInfo);
1006 
1007     Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream);
1008     if (Err != CUDA_SUCCESS) {
1009       DP("Error when copying data from host to device. Pointers: host "
1010          "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
1011          DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
1012       CUDA_ERR_STRING(Err);
1013       return OFFLOAD_FAIL;
1014     }
1015 
1016     return OFFLOAD_SUCCESS;
1017   }
1018 
1019   int dataRetrieve(const int DeviceId, void *HstPtr, const void *TgtPtr,
1020                    const int64_t Size, __tgt_async_info *AsyncInfo) const {
1021     assert(AsyncInfo && "AsyncInfo is nullptr");
1022 
1023     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
1024     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
1025       return OFFLOAD_FAIL;
1026 
1027     CUstream Stream = getStream(DeviceId, AsyncInfo);
1028 
1029     Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
1030     if (Err != CUDA_SUCCESS) {
1031       DP("Error when copying data from device to host. Pointers: host "
1032          "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
1033          DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
1034       CUDA_ERR_STRING(Err);
1035       return OFFLOAD_FAIL;
1036     }
1037 
1038     return OFFLOAD_SUCCESS;
1039   }
1040 
  // Copy \p Size bytes from \p SrcPtr on device \p SrcDevId to \p DstPtr on
  // device \p DstDevId, asynchronously on the stream bound to \p AsyncInfo.
  // Strategy: for distinct devices try a peer-to-peer copy first; any failure
  // along the way falls back to memcpyDtoD instead of erroring out.
  int dataExchange(int SrcDevId, const void *SrcPtr, int DstDevId, void *DstPtr,
                   int64_t Size, __tgt_async_info *AsyncInfo) const {
    assert(AsyncInfo && "AsyncInfo is nullptr");

    // All stream operations below run in the source device's context.
    CUresult Err = cuCtxSetCurrent(DeviceData[SrcDevId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    CUstream Stream = getStream(SrcDevId, AsyncInfo);

    // If they are two devices, we try peer to peer copy first
    if (SrcDevId != DstDevId) {
      int CanAccessPeer = 0;
      Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId);
      if (Err != CUDA_SUCCESS) {
        // Query failure is non-fatal: log and use the generic D2D path.
        REPORT("Error returned from cuDeviceCanAccessPeer. src = %" PRId32
               ", dst = %" PRId32 "\n",
               SrcDevId, DstDevId);
        CUDA_ERR_STRING(Err);
        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
      }

      if (!CanAccessPeer) {
        DP("P2P memcpy not supported so fall back to D2D memcpy");
        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
      }

      // NOTE(review): cuCtxEnablePeerAccess returns an error when access was
      // already enabled for this pair; that case also takes the D2D fallback
      // below. Confirm that is acceptable for repeated exchanges.
      Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0);
      if (Err != CUDA_SUCCESS) {
        REPORT("Error returned from cuCtxEnablePeerAccess. src = %" PRId32
               ", dst = %" PRId32 "\n",
               SrcDevId, DstDevId);
        CUDA_ERR_STRING(Err);
        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
      }

      Err = cuMemcpyPeerAsync((CUdeviceptr)DstPtr, DeviceData[DstDevId].Context,
                              (CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context,
                              Size, Stream);
      if (Err == CUDA_SUCCESS)
        return OFFLOAD_SUCCESS;

      // Peer copy failed despite all checks; log and fall through to D2D.
      DP("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD
         ", src_id =%" PRId32 ", dst_ptr = " DPxMOD ", dst_id =%" PRId32 "\n",
         DPxPTR(SrcPtr), SrcDevId, DPxPTR(DstPtr), DstDevId);
      CUDA_ERR_STRING(Err);
    }

    return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
  }
1091 
1092   int dataDelete(const int DeviceId, void *TgtPtr) {
1093     if (UseMemoryManager)
1094       return MemoryManagers[DeviceId]->free(TgtPtr);
1095 
1096     return DeviceAllocators[DeviceId].free(TgtPtr);
1097   }
1098 
  // Launch the kernel identified by \p TgtEntryPtr (a KernelTy* stored in the
  // offload table) with \p ArgNum pointer arguments, each computed as
  // TgtArgs[I] + TgtOffsets[I]. \p TeamNum and \p ThreadLimit are values
  // requested by the OpenMP runtime (<= 0 means unspecified); final grid and
  // block sizes are clamped against device and kernel limits below. The
  // launch is asynchronous on the stream attached to \p AsyncInfo.
  int runTargetTeamRegion(const int DeviceId, void *TgtEntryPtr, void **TgtArgs,
                          ptrdiff_t *TgtOffsets, const int ArgNum,
                          const int TeamNum, const int ThreadLimit,
                          const unsigned int LoopTripCount,
                          __tgt_async_info *AsyncInfo) const {
    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    // All args are references.
    // cuLaunchKernel expects an array of pointers to argument values, so each
    // adjusted pointer is materialized in Ptrs and its address put in Args.
    std::vector<void *> Args(ArgNum);
    std::vector<void *> Ptrs(ArgNum);

    for (int I = 0; I < ArgNum; ++I) {
      Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
      Args[I] = &Ptrs[I];
    }

    KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr);

    // Exactly one of these describes how the kernel was compiled; it drives
    // both the thread-count adjustment and the default-team heuristics below.
    const bool IsSPMDGenericMode =
        KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD;
    const bool IsSPMDMode =
        KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_SPMD;
    const bool IsGenericMode =
        KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC;

    // Threads per block: honor ThreadLimit when given (plus an extra master
    // warp in generic mode), otherwise use the device default.
    int CudaThreadsPerBlock;
    if (ThreadLimit > 0) {
      DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
      CudaThreadsPerBlock = ThreadLimit;
      // Add master warp if necessary
      if (IsGenericMode) {
        DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize);
        CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
      }
    } else {
      DP("Setting CUDA threads per block to default %d\n",
         DeviceData[DeviceId].NumThreads);
      CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads;
    }

    // Clamp against the device-wide hardware limit.
    if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) {
      DP("Threads per block capped at device limit %d\n",
         DeviceData[DeviceId].ThreadsPerBlock);
      CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock;
    }

    // Lazily query and cache the kernel's own thread limit on first launch.
    // NOTE(review): this write to shared KernelInfo state is not obviously
    // synchronized -- confirm concurrent first launches cannot race here.
    if (!KernelInfo->MaxThreadsPerBlock) {
      Err = cuFuncGetAttribute(&KernelInfo->MaxThreadsPerBlock,
                               CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                               KernelInfo->Func);
      if (!checkResult(Err, "Error returned from cuFuncGetAttribute\n"))
        return OFFLOAD_FAIL;
    }

    if (KernelInfo->MaxThreadsPerBlock < CudaThreadsPerBlock) {
      DP("Threads per block capped at kernel limit %d\n",
         KernelInfo->MaxThreadsPerBlock);
      CudaThreadsPerBlock = KernelInfo->MaxThreadsPerBlock;
    }

    // Number of blocks: requested TeamNum wins; otherwise derive from the
    // loop trip count (mode-dependent) or fall back to the device default.
    unsigned int CudaBlocksPerGrid;
    if (TeamNum <= 0) {
      if (LoopTripCount > 0 && EnvNumTeams < 0) {
        if (IsSPMDGenericMode) {
          // If we reach this point, then we are executing a kernel that was
          // transformed from Generic-mode to SPMD-mode. This kernel has
          // SPMD-mode execution, but needs its blocks to be scheduled
          // differently because the current loop trip count only applies to the
          // `teams distribute` region and will create var too few blocks using
          // the regular SPMD-mode method.
          CudaBlocksPerGrid = LoopTripCount;
        } else if (IsSPMDMode) {
          // We have a combined construct, i.e. `target teams distribute
          // parallel for [simd]`. We launch so many teams so that each thread
          // will execute one iteration of the loop. round up to the nearest
          // integer
          CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1;
        } else if (IsGenericMode) {
          // If we reach this point, then we have a non-combined construct, i.e.
          // `teams distribute` with a nested `parallel for` and each team is
          // assigned one iteration of the `distribute` loop. E.g.:
          //
          // #pragma omp target teams distribute
          // for(...loop_tripcount...) {
          //   #pragma omp parallel for
          //   for(...) {}
          // }
          //
          // Threads within a team will execute the iterations of the `parallel`
          // loop.
          CudaBlocksPerGrid = LoopTripCount;
        } else {
          REPORT("Unknown execution mode: %d\n",
                 static_cast<int8_t>(KernelInfo->ExecutionMode));
          return OFFLOAD_FAIL;
        }
        DP("Using %d teams due to loop trip count %" PRIu32
           " and number of threads per block %d\n",
           CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock);
      } else {
        DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams);
        CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams;
      }
    } else {
      DP("Using requested number of teams %d\n", TeamNum);
      CudaBlocksPerGrid = TeamNum;
    }

    // Clamp against the device's grid-size limit.
    if (CudaBlocksPerGrid > DeviceData[DeviceId].BlocksPerGrid) {
      DP("Capping number of teams to team limit %d\n",
         DeviceData[DeviceId].BlocksPerGrid);
      CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid;
    }

    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
         "Launching kernel %s with %d blocks and %d threads in %s mode\n",
         (getOffloadEntry(DeviceId, TgtEntryPtr))
             ? getOffloadEntry(DeviceId, TgtEntryPtr)->name
             : "(null)",
         CudaBlocksPerGrid, CudaThreadsPerBlock,
         (!IsSPMDMode ? (IsGenericMode ? "Generic" : "SPMD-Generic") : "SPMD"));

    // 1-D launch; DynamicMemorySize is the extra dynamic shared memory per
    // block configured for the plugin.
    CUstream Stream = getStream(DeviceId, AsyncInfo);
    Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1,
                         /* gridDimZ */ 1, CudaThreadsPerBlock,
                         /* blockDimY */ 1, /* blockDimZ */ 1,
                         DynamicMemorySize, Stream, &Args[0], nullptr);
    if (!checkResult(Err, "Error returned from cuLaunchKernel\n"))
      return OFFLOAD_FAIL;

    DP("Launch of entry point at " DPxMOD " successful!\n",
       DPxPTR(TgtEntryPtr));

    return OFFLOAD_SUCCESS;
  }
1236 
1237   int synchronize(const int DeviceId, __tgt_async_info *AsyncInfo) const {
1238     CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue);
1239     CUresult Err = cuStreamSynchronize(Stream);
1240 
1241     // Once the stream is synchronized, return it to stream pool and reset
1242     // AsyncInfo. This is to make sure the synchronization only works for its
1243     // own tasks.
1244     StreamPool[DeviceId]->release(reinterpret_cast<CUstream>(AsyncInfo->Queue));
1245     AsyncInfo->Queue = nullptr;
1246 
1247     if (Err != CUDA_SUCCESS) {
1248       DP("Error when synchronizing stream. stream = " DPxMOD
1249          ", async info ptr = " DPxMOD "\n",
1250          DPxPTR(Stream), DPxPTR(AsyncInfo));
1251       CUDA_ERR_STRING(Err);
1252     }
1253     return (Err == CUDA_SUCCESS) ? OFFLOAD_SUCCESS : OFFLOAD_FAIL;
1254   }
1255 
1256   void printDeviceInfo(int32_t device_id) {
1257     char TmpChar[1000];
1258     std::string TmpStr;
1259     size_t TmpSt;
1260     int TmpInt, TmpInt2, TmpInt3;
1261 
1262     CUdevice Device;
1263     checkResult(cuDeviceGet(&Device, device_id),
1264                 "Error returned from cuCtxGetDevice\n");
1265 
1266     cuDriverGetVersion(&TmpInt);
1267     printf("    CUDA Driver Version: \t\t%d \n", TmpInt);
1268     printf("    CUDA Device Number: \t\t%d \n", device_id);
1269     checkResult(cuDeviceGetName(TmpChar, 1000, Device),
1270                 "Error returned from cuDeviceGetName\n");
1271     printf("    Device Name: \t\t\t%s \n", TmpChar);
1272     checkResult(cuDeviceTotalMem(&TmpSt, Device),
1273                 "Error returned from cuDeviceTotalMem\n");
1274     printf("    Global Memory Size: \t\t%zu bytes \n", TmpSt);
1275     checkResult(cuDeviceGetAttribute(
1276                     &TmpInt, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, Device),
1277                 "Error returned from cuDeviceGetAttribute\n");
1278     printf("    Number of Multiprocessors: \t\t%d \n", TmpInt);
1279     checkResult(
1280         cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, Device),
1281         "Error returned from cuDeviceGetAttribute\n");
1282     printf("    Concurrent Copy and Execution: \t%s \n", BOOL2TEXT(TmpInt));
1283     checkResult(cuDeviceGetAttribute(
1284                     &TmpInt, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, Device),
1285                 "Error returned from cuDeviceGetAttribute\n");
1286     printf("    Total Constant Memory: \t\t%d bytes\n", TmpInt);
1287     checkResult(
1288         cuDeviceGetAttribute(
1289             &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, Device),
1290         "Error returned from cuDeviceGetAttribute\n");
1291     printf("    Max Shared Memory per Block: \t%d bytes \n", TmpInt);
1292     checkResult(
1293         cuDeviceGetAttribute(
1294             &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, Device),
1295         "Error returned from cuDeviceGetAttribute\n");
1296     printf("    Registers per Block: \t\t%d \n", TmpInt);
1297     checkResult(
1298         cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device),
1299         "Error returned from cuDeviceGetAttribute\n");
1300     printf("    Warp Size: \t\t\t\t%d Threads \n", TmpInt);
1301     checkResult(cuDeviceGetAttribute(
1302                     &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, Device),
1303                 "Error returned from cuDeviceGetAttribute\n");
1304     printf("    Maximum Threads per Block: \t\t%d \n", TmpInt);
1305     checkResult(cuDeviceGetAttribute(
1306                     &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device),
1307                 "Error returned from cuDeviceGetAttribute\n");
1308     checkResult(cuDeviceGetAttribute(
1309                     &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, Device),
1310                 "Error returned from cuDeviceGetAttribute\n");
1311     checkResult(cuDeviceGetAttribute(
1312                     &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, Device),
1313                 "Error returned from cuDeviceGetAttribute\n");
1314     printf("    Maximum Block Dimensions: \t\t%d, %d, %d \n", TmpInt, TmpInt2,
1315            TmpInt3);
1316     checkResult(cuDeviceGetAttribute(
1317                     &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, Device),
1318                 "Error returned from cuDeviceGetAttribute\n");
1319     checkResult(cuDeviceGetAttribute(
1320                     &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, Device),
1321                 "Error returned from cuDeviceGetAttribute\n");
1322     checkResult(cuDeviceGetAttribute(
1323                     &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, Device),
1324                 "Error returned from cuDeviceGetAttribute\n");
1325     printf("    Maximum Grid Dimensions: \t\t%d x %d x %d \n", TmpInt, TmpInt2,
1326            TmpInt3);
1327     checkResult(
1328         cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_PITCH, Device),
1329         "Error returned from cuDeviceGetAttribute\n");
1330     printf("    Maximum Memory Pitch: \t\t%d bytes \n", TmpInt);
1331     checkResult(cuDeviceGetAttribute(
1332                     &TmpInt, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, Device),
1333                 "Error returned from cuDeviceGetAttribute\n");
1334     printf("    Texture Alignment: \t\t\t%d bytes \n", TmpInt);
1335     checkResult(
1336         cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, Device),
1337         "Error returned from cuDeviceGetAttribute\n");
1338     printf("    Clock Rate: \t\t\t%d kHz\n", TmpInt);
1339     checkResult(cuDeviceGetAttribute(
1340                     &TmpInt, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, Device),
1341                 "Error returned from cuDeviceGetAttribute\n");
1342     printf("    Execution Timeout: \t\t\t%s \n", BOOL2TEXT(TmpInt));
1343     checkResult(
1344         cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_INTEGRATED, Device),
1345         "Error returned from cuDeviceGetAttribute\n");
1346     printf("    Integrated Device: \t\t\t%s \n", BOOL2TEXT(TmpInt));
1347     checkResult(cuDeviceGetAttribute(
1348                     &TmpInt, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, Device),
1349                 "Error returned from cuDeviceGetAttribute\n");
1350     printf("    Can Map Host Memory: \t\t%s \n", BOOL2TEXT(TmpInt));
1351     checkResult(
1352         cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, Device),
1353         "Error returned from cuDeviceGetAttribute\n");
1354     if (TmpInt == CU_COMPUTEMODE_DEFAULT)
1355       TmpStr = "DEFAULT";
1356     else if (TmpInt == CU_COMPUTEMODE_PROHIBITED)
1357       TmpStr = "PROHIBITED";
1358     else if (TmpInt == CU_COMPUTEMODE_EXCLUSIVE_PROCESS)
1359       TmpStr = "EXCLUSIVE PROCESS";
1360     else
1361       TmpStr = "unknown";
1362     printf("    Compute Mode: \t\t\t%s \n", TmpStr.c_str());
1363     checkResult(cuDeviceGetAttribute(
1364                     &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, Device),
1365                 "Error returned from cuDeviceGetAttribute\n");
1366     printf("    Concurrent Kernels: \t\t%s \n", BOOL2TEXT(TmpInt));
1367     checkResult(
1368         cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, Device),
1369         "Error returned from cuDeviceGetAttribute\n");
1370     printf("    ECC Enabled: \t\t\t%s \n", BOOL2TEXT(TmpInt));
1371     checkResult(cuDeviceGetAttribute(
1372                     &TmpInt, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, Device),
1373                 "Error returned from cuDeviceGetAttribute\n");
1374     printf("    Memory Clock Rate: \t\t\t%d kHz\n", TmpInt);
1375     checkResult(
1376         cuDeviceGetAttribute(
1377             &TmpInt, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, Device),
1378         "Error returned from cuDeviceGetAttribute\n");
1379     printf("    Memory Bus Width: \t\t\t%d bits\n", TmpInt);
1380     checkResult(cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE,
1381                                      Device),
1382                 "Error returned from cuDeviceGetAttribute\n");
1383     printf("    L2 Cache Size: \t\t\t%d bytes \n", TmpInt);
1384     checkResult(cuDeviceGetAttribute(
1385                     &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
1386                     Device),
1387                 "Error returned from cuDeviceGetAttribute\n");
1388     printf("    Max Threads Per SMP: \t\t%d \n", TmpInt);
1389     checkResult(cuDeviceGetAttribute(
1390                     &TmpInt, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, Device),
1391                 "Error returned from cuDeviceGetAttribute\n");
1392     printf("    Async Engines: \t\t\t%s (%d) \n", BOOL2TEXT(TmpInt), TmpInt);
1393     checkResult(cuDeviceGetAttribute(
1394                     &TmpInt, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, Device),
1395                 "Error returned from cuDeviceGetAttribute\n");
1396     printf("    Unified Addressing: \t\t%s \n", BOOL2TEXT(TmpInt));
1397     checkResult(cuDeviceGetAttribute(
1398                     &TmpInt, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, Device),
1399                 "Error returned from cuDeviceGetAttribute\n");
1400     printf("    Managed Memory: \t\t\t%s \n", BOOL2TEXT(TmpInt));
1401     checkResult(
1402         cuDeviceGetAttribute(
1403             &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, Device),
1404         "Error returned from cuDeviceGetAttribute\n");
1405     printf("    Concurrent Managed Memory: \t\t%s \n", BOOL2TEXT(TmpInt));
1406     checkResult(
1407         cuDeviceGetAttribute(
1408             &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, Device),
1409         "Error returned from cuDeviceGetAttribute\n");
1410     printf("    Preemption Supported: \t\t%s \n", BOOL2TEXT(TmpInt));
1411     checkResult(cuDeviceGetAttribute(
1412                     &TmpInt, CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, Device),
1413                 "Error returned from cuDeviceGetAttribute\n");
1414     printf("    Cooperative Launch: \t\t%s \n", BOOL2TEXT(TmpInt));
1415     checkResult(cuDeviceGetAttribute(
1416                     &TmpInt, CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD, Device),
1417                 "Error returned from cuDeviceGetAttribute\n");
1418     printf("    Multi-Device Boars: \t\t%s \n", BOOL2TEXT(TmpInt));
1419     checkResult(
1420         cuDeviceGetAttribute(
1421             &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, Device),
1422         "Error returned from cuDeviceGetAttribute\n");
1423     checkResult(
1424         cuDeviceGetAttribute(
1425             &TmpInt2, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, Device),
1426         "Error returned from cuDeviceGetAttribute\n");
1427     printf("    Compute Capabilities: \t\t%d%d \n", TmpInt, TmpInt2);
1428   }
1429 
1430   int createEvent(int DeviceId, void **P) {
1431     CUevent Event = nullptr;
1432     if (EventPool[DeviceId]->acquire(Event) != OFFLOAD_SUCCESS)
1433       return OFFLOAD_FAIL;
1434     *P = Event;
1435     return OFFLOAD_SUCCESS;
1436   }
1437 
1438   int destroyEvent(int DeviceId, void *EventPtr) {
1439     EventPool[DeviceId]->release(reinterpret_cast<CUevent>(EventPtr));
1440     return OFFLOAD_SUCCESS;
1441   }
1442 
1443   int waitEvent(const int DeviceId, __tgt_async_info *AsyncInfo,
1444                 void *EventPtr) const {
1445     CUstream Stream = getStream(DeviceId, AsyncInfo);
1446     CUevent Event = reinterpret_cast<CUevent>(EventPtr);
1447 
1448     // We don't use CU_EVENT_WAIT_DEFAULT here as it is only available from
1449     // specific CUDA version, and defined as 0x0. In previous version, per CUDA
1450     // API document, that argument has to be 0x0.
1451     CUresult Err = cuStreamWaitEvent(Stream, Event, 0);
1452     if (Err != CUDA_SUCCESS) {
1453       DP("Error when waiting event. stream = " DPxMOD ", event = " DPxMOD "\n",
1454          DPxPTR(Stream), DPxPTR(Event));
1455       CUDA_ERR_STRING(Err);
1456       return OFFLOAD_FAIL;
1457     }
1458 
1459     return OFFLOAD_SUCCESS;
1460   }
1461 
1462   int releaseAsyncInfo(int DeviceId, __tgt_async_info *AsyncInfo) const {
1463     if (AsyncInfo->Queue) {
1464       StreamPool[DeviceId]->release(
1465           reinterpret_cast<CUstream>(AsyncInfo->Queue));
1466       AsyncInfo->Queue = nullptr;
1467     }
1468 
1469     return OFFLOAD_SUCCESS;
1470   }
1471 
1472   int initAsyncInfo(int DeviceId, __tgt_async_info **AsyncInfo) const {
1473     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
1474     if (!checkResult(Err, "error returned from cuCtxSetCurrent"))
1475       return OFFLOAD_FAIL;
1476 
1477     *AsyncInfo = new __tgt_async_info;
1478     getStream(DeviceId, *AsyncInfo);
1479     return OFFLOAD_SUCCESS;
1480   }
1481 
1482   int initDeviceInfo(int DeviceId, __tgt_device_info *DeviceInfo,
1483                      const char **ErrStr) const {
1484     assert(DeviceInfo && "DeviceInfo is nullptr");
1485 
1486     if (!DeviceInfo->Context)
1487       DeviceInfo->Context = DeviceData[DeviceId].Context;
1488     if (!DeviceInfo->Device) {
1489       CUdevice Dev;
1490       CUresult Err = cuDeviceGet(&Dev, DeviceId);
1491       if (Err == CUDA_SUCCESS) {
1492         DeviceInfo->Device = reinterpret_cast<void *>(Dev);
1493       } else {
1494         cuGetErrorString(Err, ErrStr);
1495         return OFFLOAD_FAIL;
1496       }
1497     }
1498     return OFFLOAD_SUCCESS;
1499   }
1500 };
1501 
1502 DeviceRTLTy DeviceRTL;
1503 } // namespace
1504 
1505 // Exposed library API function
1506 #ifdef __cplusplus
1507 extern "C" {
1508 #endif
1509 
// Return nonzero if \p image is an ELF image for the CUDA target
// (e_machine == 190, EM_CUDA).
int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
  return elf_check_machine(image, /* EM_CUDA */ 190);
}
1513 
// Number of CUDA devices this plugin can offload to.
int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); }
1515 
// Record the OpenMP 'requires' clauses flags for later use (e.g. unified
// shared memory). Returns the flags unchanged as an acknowledgement.
int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
  DP("Init requires flags to %" PRId64 "\n", RequiresFlags);
  DeviceRTL.setRequiresFlag(RequiresFlags);
  return RequiresFlags;
}
1521 
1522 int32_t __tgt_rtl_is_data_exchangable(int32_t src_dev_id, int dst_dev_id) {
1523   if (DeviceRTL.isValidDeviceId(src_dev_id) &&
1524       DeviceRTL.isValidDeviceId(dst_dev_id))
1525     return 1;
1526 
1527   return 0;
1528 }
1529 
// Initialize plugin state for \p device_id. Returns OFFLOAD_SUCCESS/FAIL.
int32_t __tgt_rtl_init_device(int32_t device_id) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.initDevice(device_id);
}
1535 
// Tear down plugin state for \p device_id. Returns OFFLOAD_SUCCESS/FAIL.
int32_t __tgt_rtl_deinit_device(int32_t device_id) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.deinitDevice(device_id);
}
1541 
// Load \p image onto \p device_id and return the table of device entry
// points, or nullptr on failure.
__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
                                          __tgt_device_image *image) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.loadBinary(device_id, image);
}
1548 
// Allocate \p size bytes on \p device_id with allocation kind \p kind.
// The unnamed third parameter (the host pointer hint) is ignored by this
// plugin. Returns the device pointer, or nullptr on failure.
void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *,
                           int32_t kind) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.dataAlloc(device_id, size, (TargetAllocTy)kind);
}
1555 
1556 int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
1557                               int64_t size) {
1558   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1559 
1560   __tgt_async_info AsyncInfo;
1561   const int32_t rc = __tgt_rtl_data_submit_async(device_id, tgt_ptr, hst_ptr,
1562                                                  size, &AsyncInfo);
1563   if (rc != OFFLOAD_SUCCESS)
1564     return OFFLOAD_FAIL;
1565 
1566   return __tgt_rtl_synchronize(device_id, &AsyncInfo);
1567 }
1568 
// Asynchronous host-to-device copy of \p size bytes, enqueued on the stream
// carried by \p async_info_ptr. Caller must synchronize before reusing the
// host buffer.
int32_t __tgt_rtl_data_submit_async(int32_t device_id, void *tgt_ptr,
                                    void *hst_ptr, int64_t size,
                                    __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");

  return DeviceRTL.dataSubmit(device_id, tgt_ptr, hst_ptr, size,
                              async_info_ptr);
}
1578 
1579 int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
1580                                 int64_t size) {
1581   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1582 
1583   __tgt_async_info AsyncInfo;
1584   const int32_t rc = __tgt_rtl_data_retrieve_async(device_id, hst_ptr, tgt_ptr,
1585                                                    size, &AsyncInfo);
1586   if (rc != OFFLOAD_SUCCESS)
1587     return OFFLOAD_FAIL;
1588 
1589   return __tgt_rtl_synchronize(device_id, &AsyncInfo);
1590 }
1591 
// Asynchronous device-to-host copy of \p size bytes, enqueued on the stream
// carried by \p async_info_ptr. Caller must synchronize before reading the
// host buffer.
int32_t __tgt_rtl_data_retrieve_async(int32_t device_id, void *hst_ptr,
                                      void *tgt_ptr, int64_t size,
                                      __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");

  return DeviceRTL.dataRetrieve(device_id, hst_ptr, tgt_ptr, size,
                                async_info_ptr);
}
1601 
1602 int32_t __tgt_rtl_data_exchange_async(int32_t src_dev_id, void *src_ptr,
1603                                       int dst_dev_id, void *dst_ptr,
1604                                       int64_t size,
1605                                       __tgt_async_info *AsyncInfo) {
1606   assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
1607   assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");
1608   assert(AsyncInfo && "AsyncInfo is nullptr");
1609 
1610   return DeviceRTL.dataExchange(src_dev_id, src_ptr, dst_dev_id, dst_ptr, size,
1611                                 AsyncInfo);
1612 }
1613 
1614 int32_t __tgt_rtl_data_exchange(int32_t src_dev_id, void *src_ptr,
1615                                 int32_t dst_dev_id, void *dst_ptr,
1616                                 int64_t size) {
1617   assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
1618   assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");
1619 
1620   __tgt_async_info AsyncInfo;
1621   const int32_t rc = __tgt_rtl_data_exchange_async(
1622       src_dev_id, src_ptr, dst_dev_id, dst_ptr, size, &AsyncInfo);
1623   if (rc != OFFLOAD_SUCCESS)
1624     return OFFLOAD_FAIL;
1625 
1626   return __tgt_rtl_synchronize(src_dev_id, &AsyncInfo);
1627 }
1628 
// Free the device allocation at \p tgt_ptr on \p device_id.
int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.dataDelete(device_id, tgt_ptr);
}
1634 
1635 int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
1636                                          void **tgt_args,
1637                                          ptrdiff_t *tgt_offsets,
1638                                          int32_t arg_num, int32_t team_num,
1639                                          int32_t thread_limit,
1640                                          uint64_t loop_tripcount) {
1641   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1642 
1643   __tgt_async_info AsyncInfo;
1644   const int32_t rc = __tgt_rtl_run_target_team_region_async(
1645       device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
1646       thread_limit, loop_tripcount, &AsyncInfo);
1647   if (rc != OFFLOAD_SUCCESS)
1648     return OFFLOAD_FAIL;
1649 
1650   return __tgt_rtl_synchronize(device_id, &AsyncInfo);
1651 }
1652 
// Asynchronously launch the kernel at \p tgt_entry_ptr with \p arg_num
// arguments (pointers plus offsets), \p team_num teams and
// \p thread_limit threads per team; \p loop_tripcount may inform the
// launch configuration. Enqueued on the stream in \p async_info_ptr.
int32_t __tgt_rtl_run_target_team_region_async(
    int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
    ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
    int32_t thread_limit, uint64_t loop_tripcount,
    __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.runTargetTeamRegion(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
      thread_limit, loop_tripcount, async_info_ptr);
}
1664 
1665 int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
1666                                     void **tgt_args, ptrdiff_t *tgt_offsets,
1667                                     int32_t arg_num) {
1668   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1669 
1670   __tgt_async_info AsyncInfo;
1671   const int32_t rc = __tgt_rtl_run_target_region_async(
1672       device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, &AsyncInfo);
1673   if (rc != OFFLOAD_SUCCESS)
1674     return OFFLOAD_FAIL;
1675 
1676   return __tgt_rtl_synchronize(device_id, &AsyncInfo);
1677 }
1678 
// Asynchronous single-team launch: delegates to the team-region entry point
// with one team, one thread, and no trip count.
int32_t __tgt_rtl_run_target_region_async(int32_t device_id,
                                          void *tgt_entry_ptr, void **tgt_args,
                                          ptrdiff_t *tgt_offsets,
                                          int32_t arg_num,
                                          __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return __tgt_rtl_run_target_team_region_async(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num,
      /*team_num=*/1, /*thread_limit=*/1, /*loop_tripcount=*/0,
      async_info_ptr);
}
1691 
// Block until all work previously enqueued on the queue carried by
// \p async_info_ptr has completed. The queue must already be populated.
int32_t __tgt_rtl_synchronize(int32_t device_id,
                              __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");
  assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr");

  return DeviceRTL.synchronize(device_id, async_info_ptr);
}
1700 
1701 void __tgt_rtl_set_info_flag(uint32_t NewInfoLevel) {
1702   std::atomic<uint32_t> &InfoLevel = getInfoLevelInternal();
1703   InfoLevel.store(NewInfoLevel);
1704 }
1705 
// Print diagnostic information about \p device_id.
void __tgt_rtl_print_device_info(int32_t device_id) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  DeviceRTL.printDeviceInfo(device_id);
}
1710 
1711 int32_t __tgt_rtl_create_event(int32_t device_id, void **event) {
1712   assert(event && "event is nullptr");
1713   return DeviceRTL.createEvent(device_id, event);
1714 }
1715 
// Record \p event_ptr on the queue carried by \p async_info_ptr. The
// device_id parameter is unused by this plugin's implementation.
int32_t __tgt_rtl_record_event(int32_t device_id, void *event_ptr,
                               __tgt_async_info *async_info_ptr) {
  assert(async_info_ptr && "async_info_ptr is nullptr");
  assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr");
  assert(event_ptr && "event_ptr is nullptr");

  return recordEvent(event_ptr, async_info_ptr);
}
1724 
// Make the queue in \p async_info_ptr wait until \p event_ptr has been
// recorded before executing subsequent work (non-blocking for the host).
int32_t __tgt_rtl_wait_event(int32_t device_id, void *event_ptr,
                             __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");
  assert(event_ptr && "event is nullptr");

  return DeviceRTL.waitEvent(device_id, async_info_ptr, event_ptr);
}
1733 
// Block the host until \p event_ptr has completed. The device_id parameter
// is unused by this plugin's implementation.
int32_t __tgt_rtl_sync_event(int32_t device_id, void *event_ptr) {
  assert(event_ptr && "event is nullptr");

  return syncEvent(event_ptr);
}
1739 
1740 int32_t __tgt_rtl_destroy_event(int32_t device_id, void *event_ptr) {
1741   assert(event_ptr && "event is nullptr");
1742 
1743   return DeviceRTL.destroyEvent(device_id, event_ptr);
1744 }
1745 
// Return the stream held by \p async_info to the per-device pool; the
// __tgt_async_info object itself remains owned by the caller.
int32_t __tgt_rtl_release_async_info(int32_t device_id,
                                     __tgt_async_info *async_info) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info && "async_info is nullptr");

  return DeviceRTL.releaseAsyncInfo(device_id, async_info);
}
1753 
// Allocate a new __tgt_async_info for \p device_id, with a stream attached,
// and return it through *\p async_info. The caller takes ownership.
int32_t __tgt_rtl_init_async_info(int32_t device_id,
                                  __tgt_async_info **async_info) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info && "async_info is nullptr");

  return DeviceRTL.initAsyncInfo(device_id, async_info);
}
1761 
// Fill any unset fields of *\p device_info_ptr (context, device handle) for
// \p device_id. On failure, *\p err_str points to a CUDA error string.
int32_t __tgt_rtl_init_device_info(int32_t device_id,
                                   __tgt_device_info *device_info_ptr,
                                   const char **err_str) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(device_info_ptr && "device_info_ptr is nullptr");

  return DeviceRTL.initDeviceInfo(device_id, device_info_ptr, err_str);
}
1770 
1771 #ifdef __cplusplus
1772 }
1773 #endif
1774