1 //===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // RTL for CUDA machine
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include <algorithm>
14 #include <cassert>
15 #include <cstddef>
16 #include <cuda.h>
17 #include <list>
18 #include <memory>
19 #include <mutex>
20 #include <string>
21 #include <unordered_map>
22 #include <vector>
23 
24 #include "Debug.h"
25 #include "DeviceEnvironment.h"
26 #include "omptarget.h"
27 #include "omptargetplugin.h"
28 
29 #define TARGET_NAME CUDA
30 #define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"
31 
32 #include "MemoryManager.h"
33 
34 #include "llvm/Frontend/OpenMP/OMPConstants.h"
35 
36 // Utility for retrieving and printing CUDA error string.
37 #ifdef OMPTARGET_DEBUG
38 #define CUDA_ERR_STRING(err)                                                   \
39   do {                                                                         \
40     if (getDebugLevel() > 0) {                                                 \
41       const char *errStr = nullptr;                                            \
42       CUresult errStr_status = cuGetErrorString(err, &errStr);                 \
43       if (errStr_status == CUDA_ERROR_INVALID_VALUE)                           \
44         REPORT("Unrecognized CUDA error code: %d\n", err);                     \
45       else if (errStr_status == CUDA_SUCCESS)                                  \
46         REPORT("CUDA error is: %s\n", errStr);                                 \
47       else {                                                                   \
48         REPORT("Unresolved CUDA error code: %d\n", err);                       \
49         REPORT("Unsuccessful cuGetErrorString return status: %d\n",            \
50                errStr_status);                                                 \
51       }                                                                        \
52     } else {                                                                   \
53       const char *errStr = nullptr;                                            \
54       CUresult errStr_status = cuGetErrorString(err, &errStr);                 \
55       if (errStr_status == CUDA_SUCCESS)                                       \
56         REPORT("%s \n", errStr);                                               \
57     }                                                                          \
58   } while (false)
59 #else // OMPTARGET_DEBUG
60 #define CUDA_ERR_STRING(err)                                                   \
61   do {                                                                         \
62     const char *errStr = nullptr;                                              \
63     CUresult errStr_status = cuGetErrorString(err, &errStr);                   \
64     if (errStr_status == CUDA_SUCCESS)                                         \
65       REPORT("%s \n", errStr);                                                 \
66   } while (false)
67 #endif // OMPTARGET_DEBUG
68 
69 #define BOOL2TEXT(b) ((b) ? "Yes" : "No")
70 
71 #include "elf_common.h"
72 
73 /// Keep entries table per device.
74 struct FuncOrGblEntryTy {
75   __tgt_target_table Table;
76   std::vector<__tgt_offload_entry> Entries;
77 };
78 
79 /// Use a single entity to encode a kernel and a set of flags.
80 struct KernelTy {
81   CUfunction Func;
82 
  /// Execution mode of the kernel.
84   llvm::omp::OMPTgtExecModeFlags ExecutionMode;
85 
86   /// Maximal number of threads per block for this kernel.
87   int MaxThreadsPerBlock = 0;
88 
89   KernelTy(CUfunction Func, llvm::omp::OMPTgtExecModeFlags ExecutionMode)
90       : Func(Func), ExecutionMode(ExecutionMode) {}
91 };
92 
93 namespace {
94 bool checkResult(CUresult Err, const char *ErrMsg) {
95   if (Err == CUDA_SUCCESS)
96     return true;
97 
98   REPORT("%s", ErrMsg);
99   CUDA_ERR_STRING(Err);
100   return false;
101 }
102 
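/// Enqueue an asynchronous device-to-device copy of \p Size bytes from
/// \p SrcPtr to \p DstPtr on \p Stream. Returns OFFLOAD_SUCCESS or
/// OFFLOAD_FAIL.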
103 int memcpyDtoD(const void *SrcPtr, void *DstPtr, int64_t Size,
104                CUstream Stream) {
105   CUresult Err =
106       cuMemcpyDtoDAsync((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, Size, Stream);
107 
108   if (Err != CUDA_SUCCESS) {
109     DP("Error when copying data from device to device. Pointers: src "
110        "= " DPxMOD ", dst = " DPxMOD ", size = %" PRId64 "\n",
111        DPxPTR(SrcPtr), DPxPTR(DstPtr), Size);
112     CUDA_ERR_STRING(Err);
113     return OFFLOAD_FAIL;
114   }
115 
116   return OFFLOAD_SUCCESS;
117 }
118 
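/// Record \p EventPtr on the stream carried by \p AsyncInfo. Returns
/// OFFLOAD_SUCCESS or OFFLOAD_FAIL.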
119 int recordEvent(void *EventPtr, __tgt_async_info *AsyncInfo) {
120   CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue);
121   CUevent Event = reinterpret_cast<CUevent>(EventPtr);
122 
123   CUresult Err = cuEventRecord(Event, Stream);
124   if (Err != CUDA_SUCCESS) {
125     DP("Error when recording event. stream = " DPxMOD ", event = " DPxMOD "\n",
126        DPxPTR(Stream), DPxPTR(Event));
127     CUDA_ERR_STRING(Err);
128     return OFFLOAD_FAIL;
129   }
130 
131   return OFFLOAD_SUCCESS;
132 }
133 
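/// Block the calling host thread until \p EventPtr has completed. Returns
/// OFFLOAD_SUCCESS or OFFLOAD_FAIL.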
134 int syncEvent(void *EventPtr) {
135   CUevent Event = reinterpret_cast<CUevent>(EventPtr);
136 
137   CUresult Err = cuEventSynchronize(Event);
138   if (Err != CUDA_SUCCESS) {
139     DP("Error when syncing event = " DPxMOD "\n", DPxPTR(Event));
140     CUDA_ERR_STRING(Err);
141     return OFFLOAD_FAIL;
142   }
143 
144   return OFFLOAD_SUCCESS;
145 }
146 
147 namespace {
148 
/// Structure containing per-device data.
150 struct DeviceDataTy {
151   /// List that contains all the kernels.
152   std::list<KernelTy> KernelsList;
153 
154   std::list<FuncOrGblEntryTy> FuncGblEntries;
155 
156   CUcontext Context = nullptr;
157   // Device properties
158   unsigned int ThreadsPerBlock = 0;
159   unsigned int BlocksPerGrid = 0;
160   unsigned int WarpSize = 0;
161   // OpenMP properties
162   int NumTeams = 0;
163   int NumThreads = 0;
164 };
165 
/// Resource allocator where \p T is the resource type.
/// The \p create and \p destroy functions return OFFLOAD_SUCCESS or
/// OFFLOAD_FAIL accordingly. Implementations must not throw exceptions.
169 template <typename T> struct AllocatorTy {
170   using ElementTy = T;
171   virtual ~AllocatorTy() {}
172 
173   /// Create a resource and assign to R.
174   virtual int create(T &R) noexcept = 0;
175   /// Destroy the resource.
176   virtual int destroy(T) noexcept = 0;
177 };
178 
179 /// Allocator for CUstream.
180 struct StreamAllocatorTy final : public AllocatorTy<CUstream> {
181   /// See AllocatorTy<T>::create.
182   int create(CUstream &Stream) noexcept override {
183     if (!checkResult(cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING),
184                      "Error returned from cuStreamCreate\n"))
185       return OFFLOAD_FAIL;
186 
187     return OFFLOAD_SUCCESS;
188   }
189 
190   /// See AllocatorTy<T>::destroy.
191   int destroy(CUstream Stream) noexcept override {
192     if (!checkResult(cuStreamDestroy(Stream),
193                      "Error returned from cuStreamDestroy\n"))
194       return OFFLOAD_FAIL;
195 
196     return OFFLOAD_SUCCESS;
197   }
198 };
199 
200 /// Allocator for CUevent.
201 struct EventAllocatorTy final : public AllocatorTy<CUevent> {
202   /// See AllocatorTy<T>::create.
203   int create(CUevent &Event) noexcept override {
204     if (!checkResult(cuEventCreate(&Event, CU_EVENT_DEFAULT),
205                      "Error returned from cuEventCreate\n"))
206       return OFFLOAD_FAIL;
207 
208     return OFFLOAD_SUCCESS;
209   }
210 
211   /// See AllocatorTy<T>::destroy.
212   int destroy(CUevent Event) noexcept override {
213     if (!checkResult(cuEventDestroy(Event),
214                      "Error returned from cuEventDestroy\n"))
215       return OFFLOAD_FAIL;
216 
217     return OFFLOAD_SUCCESS;
218   }
219 };
220 
/// A generic pool of resources where \p AllocTy is the allocator type and
/// \p AllocTy::ElementTy is the resource type. The resource type must be
/// copyable because resources are stored by value in \p std::vector .
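/// A minimal usage sketch (illustrative only; assumes a CUDA context is
/// current, which the pool itself does not check):
/// \code
///   ResourcePoolTy<StreamAllocatorTy> Streams(StreamAllocatorTy(),
///                                             /*Size=*/4);
///   CUstream S;
///   if (Streams.acquire(S) == OFFLOAD_SUCCESS) {
///     // ... enqueue work on S ...
///     Streams.release(S);
///   }
/// \endcode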
223 template <typename AllocTy> class ResourcePoolTy {
224   using ElementTy = typename AllocTy::ElementTy;
225   /// Index of the next available resource.
226   size_t Next = 0;
227   /// Mutex to guard the pool.
228   std::mutex Mutex;
  /// Pool of resources. The difference between \p Resources and \p Pool is:
  /// acquiring and releasing a resource only touches \p Resources, while a
  /// batch of newly created resources is added to both. Because out-of-order
  /// releases can temporarily leave duplicate entries in \p Resources,
  /// destroying from it could double free. \p Pool therefore keeps exactly
  /// one copy of every resource ever allocated from the device and is the
  /// list used to destroy them.
236   std::vector<ElementTy> Resources;
237   std::vector<ElementTy> Pool;
238   /// A reference to the corresponding allocator.
239   AllocTy Allocator;
240 
  /// If `Resources` has been exhausted, create more resources. The new size
  /// \p Size is assumed to be strictly larger than the current size.
243   bool resize(size_t Size) {
244     assert(Resources.size() == Pool.size() && "size mismatch");
245     auto CurSize = Resources.size();
246     assert(Size > CurSize && "Unexpected smaller size");
247     Pool.reserve(Size);
248     Resources.reserve(Size);
249     for (auto I = CurSize; I < Size; ++I) {
250       ElementTy NewItem;
251       int Ret = Allocator.create(NewItem);
252       if (Ret != OFFLOAD_SUCCESS)
253         return false;
254       Pool.push_back(NewItem);
255       Resources.push_back(NewItem);
256     }
257     return true;
258   }
259 
260 public:
261   ResourcePoolTy(AllocTy &&A, size_t Size = 0) noexcept
262       : Allocator(std::move(A)) {
263     if (Size)
264       (void)resize(Size);
265   }
266 
267   ~ResourcePoolTy() noexcept { clear(); }
268 
  /// Get a resource from the pool. `Next` always points to the next available
  /// resource, i.e. `[0, Next-1]` have been assigned and `[Next, ...)` are
  /// still available. If no resource is left, more are created. Each time a
  /// resource is assigned, `Next` is incremented by one.
  /// xxxxxs+++++++++
  ///      ^
  ///      Next
  /// After assignment, the pool becomes the following and s is assigned.
  /// xxxxxs+++++++++
  ///       ^
  ///       Next
280   int acquire(ElementTy &R) noexcept {
281     std::lock_guard<std::mutex> LG(Mutex);
282     if (Next == Resources.size()) {
283       auto NewSize = Resources.size() ? Resources.size() * 2 : 1;
284       if (!resize(NewSize))
285         return OFFLOAD_FAIL;
286     }
287 
288     assert(Next < Resources.size());
289 
290     R = Resources[Next++];
291 
292     return OFFLOAD_SUCCESS;
293   }
294 
  /// Return the resource back to the pool. Returning a resource first
  /// decrements `Next` and then copies the resource back. Note that resources
  /// may be returned in a different order than they were assigned, so at some
  /// point there may be two identical entries:
  /// xxax+a+++++
  ///     ^
  ///     Next
  /// This does not matter because they always sit on opposite sides of
  /// `Next`; the left one will eventually be overwritten by another returned
  /// resource. As a consequence, after several executions the order of the
  /// pool may differ from its initial state.
307   void release(ElementTy R) noexcept {
308     std::lock_guard<std::mutex> LG(Mutex);
309     Resources[--Next] = R;
310   }
311 
  /// Release all stored resources and clear the pool.
313   /// Note: This function is not thread safe. Be sure to guard it if necessary.
314   void clear() noexcept {
315     for (auto &R : Pool)
316       (void)Allocator.destroy(R);
317     Pool.clear();
318     Resources.clear();
319   }
320 };
321 
322 } // namespace
323 
324 class DeviceRTLTy {
325   int NumberOfDevices;
326   // OpenMP environment properties
327   int EnvNumTeams;
328   int EnvTeamLimit;
329   int EnvTeamThreadLimit;
330   // OpenMP requires flags
331   int64_t RequiresFlags;
332   // Amount of dynamic shared memory to use at launch.
333   uint64_t DynamicMemorySize;
334 
335   /// Number of initial streams for each device.
336   int NumInitialStreams = 32;
337 
338   /// Number of initial events for each device.
339   int NumInitialEvents = 8;
340 
341   static constexpr const int32_t HardThreadLimit = 1024;
342   static constexpr const int32_t DefaultNumTeams = 128;
343   static constexpr const int32_t DefaultNumThreads = 128;
344 
345   using StreamPoolTy = ResourcePoolTy<StreamAllocatorTy>;
346   std::vector<std::unique_ptr<StreamPoolTy>> StreamPool;
347 
348   using EventPoolTy = ResourcePoolTy<EventAllocatorTy>;
349   std::vector<std::unique_ptr<EventPoolTy>> EventPool;
350 
351   std::vector<DeviceDataTy> DeviceData;
352   std::vector<std::vector<CUmodule>> Modules;
353 
  /// Vector of flags indicating the initialization status of all associated
  /// devices.
356   std::vector<bool> InitializedFlags;
357 
  enum class PeerAccessState : uint8_t { Unknown, Yes, No };
359   std::vector<std::vector<PeerAccessState>> PeerAccessMatrix;
360   std::mutex PeerAccessMatrixLock;
361 
362   /// A class responsible for interacting with device native runtime library to
363   /// allocate and free memory.
364   class CUDADeviceAllocatorTy : public DeviceAllocatorTy {
365     std::unordered_map<void *, TargetAllocTy> HostPinnedAllocs;
366 
367   public:
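    /// Allocate \p Size bytes of device, pinned-host, or managed memory
    /// depending on \p Kind; returns nullptr on failure or when \p Size is 0.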
368     void *allocate(size_t Size, void *, TargetAllocTy Kind) override {
369       if (Size == 0)
370         return nullptr;
371 
372       void *MemAlloc = nullptr;
373       CUresult Err;
374       switch (Kind) {
375       case TARGET_ALLOC_DEFAULT:
376       case TARGET_ALLOC_DEVICE:
377         CUdeviceptr DevicePtr;
378         Err = cuMemAlloc(&DevicePtr, Size);
379         MemAlloc = (void *)DevicePtr;
380         if (!checkResult(Err, "Error returned from cuMemAlloc\n"))
381           return nullptr;
382         break;
383       case TARGET_ALLOC_HOST:
384         void *HostPtr;
385         Err = cuMemAllocHost(&HostPtr, Size);
386         MemAlloc = HostPtr;
387         if (!checkResult(Err, "Error returned from cuMemAllocHost\n"))
388           return nullptr;
389         HostPinnedAllocs[MemAlloc] = Kind;
390         break;
391       case TARGET_ALLOC_SHARED:
392         CUdeviceptr SharedPtr;
393         Err = cuMemAllocManaged(&SharedPtr, Size, CU_MEM_ATTACH_GLOBAL);
394         MemAlloc = (void *)SharedPtr;
395         if (!checkResult(Err, "Error returned from cuMemAllocManaged\n"))
396           return nullptr;
397         break;
398       }
399 
400       return MemAlloc;
401     }
402 
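    /// Free memory previously returned by allocate(); pinned-host allocations
    /// are tracked in HostPinnedAllocs and released with cuMemFreeHost.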
403     int free(void *TgtPtr) override {
404       CUresult Err;
405       // Host pinned memory must be freed differently.
406       TargetAllocTy Kind =
407           (HostPinnedAllocs.find(TgtPtr) == HostPinnedAllocs.end())
408               ? TARGET_ALLOC_DEFAULT
409               : TARGET_ALLOC_HOST;
410       switch (Kind) {
411       case TARGET_ALLOC_DEFAULT:
412       case TARGET_ALLOC_DEVICE:
413       case TARGET_ALLOC_SHARED:
414         Err = cuMemFree((CUdeviceptr)TgtPtr);
415         if (!checkResult(Err, "Error returned from cuMemFree\n"))
416           return OFFLOAD_FAIL;
417         break;
418       case TARGET_ALLOC_HOST:
419         Err = cuMemFreeHost(TgtPtr);
420         if (!checkResult(Err, "Error returned from cuMemFreeHost\n"))
421           return OFFLOAD_FAIL;
422         break;
423       }
424 
425       return OFFLOAD_SUCCESS;
426     }
427   };
428 
429   /// A vector of device allocators
430   std::vector<CUDADeviceAllocatorTy> DeviceAllocators;
431 
  /// A vector of memory managers. Since the memory manager is non-copyable
  /// and non-movable, we wrap each one in a std::unique_ptr.
434   std::vector<std::unique_ptr<MemoryManagerTy>> MemoryManagers;
435 
  /// Whether to use the memory manager.
437   bool UseMemoryManager = true;
438 
439   // Record entry point associated with device
440   void addOffloadEntry(const int DeviceId, const __tgt_offload_entry Entry) {
441     FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
442     E.Entries.push_back(Entry);
443   }
444 
445   // Return a pointer to the entry associated with the pointer
446   const __tgt_offload_entry *getOffloadEntry(const int DeviceId,
447                                              const void *Addr) const {
448     for (const __tgt_offload_entry &Itr :
449          DeviceData[DeviceId].FuncGblEntries.back().Entries)
450       if (Itr.addr == Addr)
451         return &Itr;
452 
453     return nullptr;
454   }
455 
456   // Return the pointer to the target entries table
457   __tgt_target_table *getOffloadEntriesTable(const int DeviceId) {
458     FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
459 
460     if (E.Entries.empty())
461       return nullptr;
462 
463     // Update table info according to the entries and return the pointer
464     E.Table.EntriesBegin = E.Entries.data();
465     E.Table.EntriesEnd = E.Entries.data() + E.Entries.size();
466 
467     return &E.Table;
468   }
469 
470   // Clear entries table for a device
471   void clearOffloadEntriesTable(const int DeviceId) {
472     DeviceData[DeviceId].FuncGblEntries.emplace_back();
473     FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
474     E.Entries.clear();
475     E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr;
476   }
477 
478 public:
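  /// Return the CUDA stream associated with \p AsyncInfo, lazily acquiring
  /// one from the per-device stream pool on first use.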
479   CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfo) const {
480     assert(AsyncInfo && "AsyncInfo is nullptr");
481 
482     if (!AsyncInfo->Queue) {
483       CUstream S;
484       if (StreamPool[DeviceId]->acquire(S) != OFFLOAD_SUCCESS)
485         return nullptr;
486 
487       AsyncInfo->Queue = S;
488     }
489 
490     return reinterpret_cast<CUstream>(AsyncInfo->Queue);
491   }
492 
493   // This class should not be copied
494   DeviceRTLTy(const DeviceRTLTy &) = delete;
495   DeviceRTLTy(DeviceRTLTy &&) = delete;
496 
497   DeviceRTLTy()
498       : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1),
499         EnvTeamThreadLimit(-1), RequiresFlags(OMP_REQ_UNDEFINED),
500         DynamicMemorySize(0) {
501 
502     DP("Start initializing CUDA\n");
503 
504     CUresult Err = cuInit(0);
505     if (Err == CUDA_ERROR_INVALID_HANDLE) {
506       // Can't call cuGetErrorString if dlsym failed
507       DP("Failed to load CUDA shared library\n");
508       return;
509     }
510     if (Err == CUDA_ERROR_NO_DEVICE) {
511       DP("There are no devices supporting CUDA.\n");
512       return;
513     }
514     if (!checkResult(Err, "Error returned from cuInit\n")) {
515       return;
516     }
517 
518     Err = cuDeviceGetCount(&NumberOfDevices);
519     if (!checkResult(Err, "Error returned from cuDeviceGetCount\n"))
520       return;
521 
522     if (NumberOfDevices == 0) {
523       DP("There are no devices supporting CUDA.\n");
524       return;
525     }
526 
527     DeviceData.resize(NumberOfDevices);
528     Modules.resize(NumberOfDevices);
529     StreamPool.resize(NumberOfDevices);
530     EventPool.resize(NumberOfDevices);
531     PeerAccessMatrix.resize(NumberOfDevices);
532     for (auto &V : PeerAccessMatrix)
      V.resize(NumberOfDevices, PeerAccessState::Unknown);
534 
535     // Get environment variables regarding teams
536     if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) {
537       // OMP_TEAM_LIMIT has been set
538       EnvTeamLimit = std::stoi(EnvStr);
539       DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
540     }
541     if (const char *EnvStr = getenv("OMP_TEAMS_THREAD_LIMIT")) {
542       // OMP_TEAMS_THREAD_LIMIT has been set
543       EnvTeamThreadLimit = std::stoi(EnvStr);
544       DP("Parsed OMP_TEAMS_THREAD_LIMIT=%d\n", EnvTeamThreadLimit);
545     }
546     if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) {
547       // OMP_NUM_TEAMS has been set
548       EnvNumTeams = std::stoi(EnvStr);
549       DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
550     }
551     if (const char *EnvStr = getenv("LIBOMPTARGET_SHARED_MEMORY_SIZE")) {
552       // LIBOMPTARGET_SHARED_MEMORY_SIZE has been set
      DynamicMemorySize = std::stoull(EnvStr);
554       DP("Parsed LIBOMPTARGET_SHARED_MEMORY_SIZE = %" PRIu64 "\n",
555          DynamicMemorySize);
556     }
557     if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS")) {
558       // LIBOMPTARGET_NUM_INITIAL_STREAMS has been set
559       NumInitialStreams = std::stoi(EnvStr);
560       DP("Parsed LIBOMPTARGET_NUM_INITIAL_STREAMS=%d\n", NumInitialStreams);
561     }
562 
563     for (int I = 0; I < NumberOfDevices; ++I)
564       DeviceAllocators.emplace_back();
565 
566     // Get the size threshold from environment variable
567     std::pair<size_t, bool> Res = MemoryManagerTy::getSizeThresholdFromEnv();
568     UseMemoryManager = Res.second;
569     size_t MemoryManagerThreshold = Res.first;
570 
571     if (UseMemoryManager)
572       for (int I = 0; I < NumberOfDevices; ++I)
573         MemoryManagers.emplace_back(std::make_unique<MemoryManagerTy>(
574             DeviceAllocators[I], MemoryManagerThreshold));
575 
576     // We lazily initialize all devices later.
577     InitializedFlags.assign(NumberOfDevices, false);
578   }
579 
580   ~DeviceRTLTy() {
581     for (int DeviceId = 0; DeviceId < NumberOfDevices; ++DeviceId)
582       deinitDevice(DeviceId);
583   }
584 
585   // Check whether a given DeviceId is valid
586   bool isValidDeviceId(const int DeviceId) const {
587     return DeviceId >= 0 && DeviceId < NumberOfDevices;
588   }
589 
590   int getNumOfDevices() const { return NumberOfDevices; }
591 
592   void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; }
593 
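  /// Initialize the given device: set up its primary context, stream and
  /// event pools, and query/cap the launch limits used for kernel execution.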
594   int initDevice(const int DeviceId) {
595     CUdevice Device;
596 
597     DP("Getting device %d\n", DeviceId);
598     CUresult Err = cuDeviceGet(&Device, DeviceId);
599     if (!checkResult(Err, "Error returned from cuDeviceGet\n"))
600       return OFFLOAD_FAIL;
601 
602     assert(InitializedFlags[DeviceId] == false && "Reinitializing device!");
603     InitializedFlags[DeviceId] = true;
604 
605     // Query the current flags of the primary context and set its flags if
606     // it is inactive
607     unsigned int FormerPrimaryCtxFlags = 0;
608     int FormerPrimaryCtxIsActive = 0;
609     Err = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags,
610                                      &FormerPrimaryCtxIsActive);
611     if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxGetState\n"))
612       return OFFLOAD_FAIL;
613 
614     if (FormerPrimaryCtxIsActive) {
615       DP("The primary context is active, no change to its flags\n");
616       if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) !=
617           CU_CTX_SCHED_BLOCKING_SYNC)
        DP("Warning: the current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n");
619     } else {
620       DP("The primary context is inactive, set its flags to "
621          "CU_CTX_SCHED_BLOCKING_SYNC\n");
622       Err = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC);
623       if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxSetFlags\n"))
624         return OFFLOAD_FAIL;
625     }
626 
627     // Retain the per device primary context and save it to use whenever this
628     // device is selected.
629     Err = cuDevicePrimaryCtxRetain(&DeviceData[DeviceId].Context, Device);
630     if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxRetain\n"))
631       return OFFLOAD_FAIL;
632 
633     Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
634     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
635       return OFFLOAD_FAIL;
636 
637     // Initialize the stream pool.
638     if (!StreamPool[DeviceId])
639       StreamPool[DeviceId] = std::make_unique<StreamPoolTy>(StreamAllocatorTy(),
640                                                             NumInitialStreams);
641 
642     // Initialize the event pool.
643     if (!EventPool[DeviceId])
644       EventPool[DeviceId] =
645           std::make_unique<EventPoolTy>(EventAllocatorTy(), NumInitialEvents);
646 
647     // Query attributes to determine number of threads/block and blocks/grid.
648     int MaxGridDimX;
649     Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
650                                Device);
651     if (Err != CUDA_SUCCESS) {
652       DP("Error getting max grid dimension, use default value %d\n",
653          DeviceRTLTy::DefaultNumTeams);
654       DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::DefaultNumTeams;
655     } else {
656       DP("Using %d CUDA blocks per grid\n", MaxGridDimX);
657       DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX;
658     }
659 
660     // We are only exploiting threads along the x axis.
661     int MaxBlockDimX;
662     Err = cuDeviceGetAttribute(&MaxBlockDimX,
663                                CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device);
664     if (Err != CUDA_SUCCESS) {
665       DP("Error getting max block dimension, use default value %d\n",
666          DeviceRTLTy::DefaultNumThreads);
667       DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads;
668     } else {
669       DP("Using %d CUDA threads per block\n", MaxBlockDimX);
670       DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX;
671 
672       if (EnvTeamThreadLimit > 0 &&
673           DeviceData[DeviceId].ThreadsPerBlock > EnvTeamThreadLimit) {
674         DP("Max CUDA threads per block %d exceeds the thread limit %d set by "
675            "OMP_TEAMS_THREAD_LIMIT, capping at the limit\n",
676            DeviceData[DeviceId].ThreadsPerBlock, EnvTeamThreadLimit);
677         DeviceData[DeviceId].ThreadsPerBlock = EnvTeamThreadLimit;
678       }
679       if (DeviceData[DeviceId].ThreadsPerBlock > DeviceRTLTy::HardThreadLimit) {
680         DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
681            "capping at the hard limit\n",
682            DeviceData[DeviceId].ThreadsPerBlock, DeviceRTLTy::HardThreadLimit);
683         DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit;
684       }
685     }
686 
687     // Get and set warp size
688     int WarpSize;
689     Err =
690         cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device);
691     if (Err != CUDA_SUCCESS) {
692       DP("Error getting warp size, assume default value 32\n");
693       DeviceData[DeviceId].WarpSize = 32;
694     } else {
695       DP("Using warp size %d\n", WarpSize);
696       DeviceData[DeviceId].WarpSize = WarpSize;
697     }
698 
699     // Adjust teams to the env variables
700     if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) {
701       DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
702          EnvTeamLimit);
703       DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit;
704     }
705 
706     size_t StackLimit;
707     size_t HeapLimit;
708     if (const char *EnvStr = getenv("LIBOMPTARGET_STACK_SIZE")) {
709       StackLimit = std::stol(EnvStr);
710       if (cuCtxSetLimit(CU_LIMIT_STACK_SIZE, StackLimit) != CUDA_SUCCESS)
711         return OFFLOAD_FAIL;
712     } else {
713       if (cuCtxGetLimit(&StackLimit, CU_LIMIT_STACK_SIZE) != CUDA_SUCCESS)
714         return OFFLOAD_FAIL;
715     }
716     if (const char *EnvStr = getenv("LIBOMPTARGET_HEAP_SIZE")) {
717       HeapLimit = std::stol(EnvStr);
718       if (cuCtxSetLimit(CU_LIMIT_MALLOC_HEAP_SIZE, HeapLimit) != CUDA_SUCCESS)
719         return OFFLOAD_FAIL;
720     } else {
721       if (cuCtxGetLimit(&HeapLimit, CU_LIMIT_MALLOC_HEAP_SIZE) != CUDA_SUCCESS)
722         return OFFLOAD_FAIL;
723     }
724 
725     INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
726          "Device supports up to %d CUDA blocks and %d threads with a "
727          "warp size of %d\n",
728          DeviceData[DeviceId].BlocksPerGrid,
729          DeviceData[DeviceId].ThreadsPerBlock, DeviceData[DeviceId].WarpSize);
730     INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
731          "Device heap size is %d Bytes, device stack size is %d Bytes per "
732          "thread\n",
733          (int)HeapLimit, (int)StackLimit);
734 
735     // Set default number of teams
736     if (EnvNumTeams > 0) {
737       DP("Default number of teams set according to environment %d\n",
738          EnvNumTeams);
739       DeviceData[DeviceId].NumTeams = EnvNumTeams;
740     } else {
741       DeviceData[DeviceId].NumTeams = DeviceRTLTy::DefaultNumTeams;
742       DP("Default number of teams set according to library's default %d\n",
743          DeviceRTLTy::DefaultNumTeams);
744     }
745 
746     if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) {
747       DP("Default number of teams exceeds device limit, capping at %d\n",
748          DeviceData[DeviceId].BlocksPerGrid);
749       DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid;
750     }
751 
752     // Set default number of threads
753     DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads;
754     DP("Default number of threads set according to library's default %d\n",
755        DeviceRTLTy::DefaultNumThreads);
756     if (DeviceData[DeviceId].NumThreads >
757         DeviceData[DeviceId].ThreadsPerBlock) {
758       DP("Default number of threads exceeds device limit, capping at %d\n",
759          DeviceData[DeviceId].ThreadsPerBlock);
760       DeviceData[DeviceId].NumThreads = DeviceData[DeviceId].ThreadsPerBlock;
761     }
762 
763     return OFFLOAD_SUCCESS;
764   }
765 
766   int deinitDevice(const int DeviceId) {
767     auto IsInitialized = InitializedFlags[DeviceId];
768     if (!IsInitialized)
769       return OFFLOAD_SUCCESS;
770     InitializedFlags[DeviceId] = false;
771 
772     if (UseMemoryManager)
773       MemoryManagers[DeviceId].release();
774 
775     StreamPool[DeviceId].reset();
776     EventPool[DeviceId].reset();
777 
778     DeviceDataTy &D = DeviceData[DeviceId];
779     if (!checkResult(cuCtxSetCurrent(D.Context),
780                      "Error returned from cuCtxSetCurrent\n"))
781       return OFFLOAD_FAIL;
782 
783     // Unload all modules.
784     for (auto &M : Modules[DeviceId])
785       if (!checkResult(cuModuleUnload(M),
786                        "Error returned from cuModuleUnload\n"))
787         return OFFLOAD_FAIL;
788 
789     // Destroy context.
790     CUdevice Device;
791     if (!checkResult(cuCtxGetDevice(&Device),
792                      "Error returned from cuCtxGetDevice\n"))
793       return OFFLOAD_FAIL;
794 
795     if (!checkResult(cuDevicePrimaryCtxRelease(Device),
796                      "Error returned from cuDevicePrimaryCtxRelease\n"))
797       return OFFLOAD_FAIL;
798 
799     return OFFLOAD_SUCCESS;
800   }
801 
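  /// Load a device image: create a CUDA module, resolve global variables and
  /// kernel entry points, and push the device environment to the device.
  /// Returns the populated offload entries table, or nullptr on error.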
802   __tgt_target_table *loadBinary(const int DeviceId,
803                                  const __tgt_device_image *Image) {
804     // Clear the offload table as we are going to create a new one.
805     clearOffloadEntriesTable(DeviceId);
806 
807     // Create the module and extract the function pointers.
808     CUmodule Module;
809     DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart));
810     CUresult Err =
811         cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr);
812     if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n"))
813       return nullptr;
814 
815     DP("CUDA module successfully loaded!\n");
816 
817     Modules[DeviceId].push_back(Module);
818 
819     // Find the symbols in the module by name.
820     const __tgt_offload_entry *HostBegin = Image->EntriesBegin;
821     const __tgt_offload_entry *HostEnd = Image->EntriesEnd;
822 
823     std::list<KernelTy> &KernelsList = DeviceData[DeviceId].KernelsList;
824     for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) {
825       if (!E->addr) {
        // We return nullptr when something like this happens: the host should
        // always have something in the address to uniquely identify the
        // target region.
829         DP("Invalid binary: host entry '<null>' (size = %zd)...\n", E->size);
830         return nullptr;
831       }
832 
833       if (E->size) {
834         __tgt_offload_entry Entry = *E;
835         CUdeviceptr CUPtr;
836         size_t CUSize;
837         Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name);
838         // We keep this style here because we need the name
839         if (Err != CUDA_SUCCESS) {
840           REPORT("Loading global '%s' Failed\n", E->name);
841           CUDA_ERR_STRING(Err);
842           return nullptr;
843         }
844 
845         if (CUSize != E->size) {
846           DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name,
847              CUSize, E->size);
848           return nullptr;
849         }
850 
851         DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
852            DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr));
853 
854         Entry.addr = (void *)(CUPtr);
855 
856         // Note: In the current implementation declare target variables
857         // can either be link or to. This means that once unified
858         // memory is activated via the requires directive, the variable
859         // can be used directly from the host in both cases.
860         // TODO: when variables types other than to or link are added,
861         // the below condition should be changed to explicitly
862         // check for to and link variables types:
863         // (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags &
864         // OMP_DECLARE_TARGET_LINK || e->flags == OMP_DECLARE_TARGET_TO))
865         if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
866           // If unified memory is present any target link or to variables
867           // can access host addresses directly. There is no longer a
868           // need for device copies.
869           cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *));
870           DP("Copy linked variable host address (" DPxMOD
871              ") to device address (" DPxMOD ")\n",
872              DPxPTR(*((void **)E->addr)), DPxPTR(CUPtr));
873         }
874 
875         addOffloadEntry(DeviceId, Entry);
876 
877         continue;
878       }
879 
880       CUfunction Func;
881       Err = cuModuleGetFunction(&Func, Module, E->name);
882       // We keep this style here because we need the name
883       if (Err != CUDA_SUCCESS) {
884         REPORT("Loading '%s' Failed\n", E->name);
885         CUDA_ERR_STRING(Err);
886         return nullptr;
887       }
888 
889       DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
890          DPxPTR(E - HostBegin), E->name, DPxPTR(Func));
891 
      // Default value GENERIC (in case the symbol is missing from the cubin
      // file).
      llvm::omp::OMPTgtExecModeFlags ExecModeVal =
          llvm::omp::OMP_TGT_EXEC_MODE_GENERIC;
894       std::string ExecModeNameStr(E->name);
895       ExecModeNameStr += "_exec_mode";
896       const char *ExecModeName = ExecModeNameStr.c_str();
897 
898       CUdeviceptr ExecModePtr;
899       size_t CUSize;
900       Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName);
901       if (Err == CUDA_SUCCESS) {
902         if (CUSize != sizeof(llvm::omp::OMPTgtExecModeFlags)) {
903           DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
904              ExecModeName, CUSize, sizeof(llvm::omp::OMPTgtExecModeFlags));
905           return nullptr;
906         }
907 
908         Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize);
909         if (Err != CUDA_SUCCESS) {
910           REPORT("Error when copying data from device to host. Pointers: "
911                  "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
912                  DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize);
913           CUDA_ERR_STRING(Err);
914           return nullptr;
915         }
916       } else {
917         DP("Loading global exec_mode '%s' - symbol missing, using default "
918            "value GENERIC (1)\n",
919            ExecModeName);
920       }
921 
922       KernelsList.emplace_back(Func, ExecModeVal);
923 
924       __tgt_offload_entry Entry = *E;
925       Entry.addr = &KernelsList.back();
926       addOffloadEntry(DeviceId, Entry);
927     }
928 
929     // send device environment data to the device
930     {
931       // TODO: The device ID used here is not the real device ID used by OpenMP.
932       DeviceEnvironmentTy DeviceEnv{0, static_cast<uint32_t>(NumberOfDevices),
933                                     static_cast<uint32_t>(DeviceId),
934                                     static_cast<uint32_t>(DynamicMemorySize)};
935 
936       if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG"))
937         DeviceEnv.DebugKind = std::stoi(EnvStr);
938 
939       const char *DeviceEnvName = "omptarget_device_environment";
940       CUdeviceptr DeviceEnvPtr;
941       size_t CUSize;
942 
943       Err = cuModuleGetGlobal(&DeviceEnvPtr, &CUSize, Module, DeviceEnvName);
944       if (Err == CUDA_SUCCESS) {
945         if (CUSize != sizeof(DeviceEnv)) {
          REPORT(
              "Global device_environment '%s' - size mismatch (%zu != %zu)\n",
              DeviceEnvName, CUSize, sizeof(DeviceEnv));
949           CUDA_ERR_STRING(Err);
950           return nullptr;
951         }
952 
953         Err = cuMemcpyHtoD(DeviceEnvPtr, &DeviceEnv, CUSize);
954         if (Err != CUDA_SUCCESS) {
955           REPORT("Error when copying data from host to device. Pointers: "
956                  "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
957                  DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize);
958           CUDA_ERR_STRING(Err);
959           return nullptr;
960         }
961 
962         DP("Sending global device environment data %zu bytes\n", CUSize);
963       } else {
964         DP("Finding global device environment '%s' - symbol missing.\n",
965            DeviceEnvName);
966         DP("Continue, considering this is a device RTL which does not accept "
967            "environment setting.\n");
968       }
969     }
970 
971     return getOffloadEntriesTable(DeviceId);
972   }
973 
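  /// Allocate \p Size bytes on device \p DeviceId. Device allocations go
  /// through the memory manager when it is enabled; host-pinned and shared
  /// allocations always go straight to the device allocator.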
974   void *dataAlloc(const int DeviceId, const int64_t Size,
975                   const TargetAllocTy Kind) {
976     switch (Kind) {
977     case TARGET_ALLOC_DEFAULT:
978     case TARGET_ALLOC_DEVICE:
979       if (UseMemoryManager)
980         return MemoryManagers[DeviceId]->allocate(Size, nullptr);
981       else
982         return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
983     case TARGET_ALLOC_HOST:
984     case TARGET_ALLOC_SHARED:
985       return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
986     }
987 
988     REPORT("Invalid target data allocation kind or requested allocator not "
989            "implemented yet\n");
990 
991     return nullptr;
992   }
993 
994   int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr,
995                  const int64_t Size, __tgt_async_info *AsyncInfo) const {
996     assert(AsyncInfo && "AsyncInfo is nullptr");
997 
998     CUstream Stream = getStream(DeviceId, AsyncInfo);
999     CUresult Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream);
1000     if (Err != CUDA_SUCCESS) {
1001       DP("Error when copying data from host to device. Pointers: host "
1002          "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
1003          DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
1004       CUDA_ERR_STRING(Err);
1005       return OFFLOAD_FAIL;
1006     }
1007 
1008     return OFFLOAD_SUCCESS;
1009   }
1010 
1011   int dataRetrieve(const int DeviceId, void *HstPtr, const void *TgtPtr,
1012                    const int64_t Size, __tgt_async_info *AsyncInfo) const {
1013     assert(AsyncInfo && "AsyncInfo is nullptr");
1014 
1015     CUstream Stream = getStream(DeviceId, AsyncInfo);
1016     CUresult Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
1017     if (Err != CUDA_SUCCESS) {
1018       DP("Error when copying data from device to host. Pointers: host "
1019          "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
1020          DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
1021       CUDA_ERR_STRING(Err);
1022       return OFFLOAD_FAIL;
1023     }
1024 
1025     return OFFLOAD_SUCCESS;
1026   }
1027 
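  /// Copy \p Size bytes between two devices. For distinct devices, peer
  /// access is probed and enabled once (cached in PeerAccessMatrix); if a
  /// peer-to-peer copy is unavailable or fails, fall back to an ordinary
  /// device-to-device copy.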
1028   int dataExchange(int SrcDevId, const void *SrcPtr, int DstDevId, void *DstPtr,
1029                    int64_t Size, __tgt_async_info *AsyncInfo) {
1030     assert(AsyncInfo && "AsyncInfo is nullptr");
1031 
1032     CUresult Err;
1033     CUstream Stream = getStream(SrcDevId, AsyncInfo);
1034 
    // If the source and destination are two different devices, try a
    // peer-to-peer copy first.
1036     if (SrcDevId != DstDevId) {
1037       std::lock_guard<std::mutex> LG(PeerAccessMatrixLock);
1038 
1039       switch (PeerAccessMatrix[SrcDevId][DstDevId]) {
1040       case PeerAccessState::No: {
1041         REPORT("Peer access from %" PRId32 " to %" PRId32
1042                " is not supported. Fall back to D2D memcpy.\n",
1043                SrcDevId, DstDevId);
1044         return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
1045       }
      case PeerAccessState::Unknown: {
1047         int CanAccessPeer = 0;
1048         Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId);
1049         if (Err != CUDA_SUCCESS) {
1050           REPORT("Error returned from cuDeviceCanAccessPeer. src = %" PRId32
1051                  ", dst = %" PRId32 ". Fall back to D2D memcpy.\n",
1052                  SrcDevId, DstDevId);
1053           CUDA_ERR_STRING(Err);
1054           PeerAccessMatrix[SrcDevId][DstDevId] = PeerAccessState::No;
1055           return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
1056         }
1057 
1058         if (!CanAccessPeer) {
1059           REPORT("P2P access from %d to %d is not supported. Fall back to D2D "
1060                  "memcpy.\n",
1061                  SrcDevId, DstDevId);
1062           PeerAccessMatrix[SrcDevId][DstDevId] = PeerAccessState::No;
1063           return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
1064         }
1065 
1066         Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0);
1067         if (Err != CUDA_SUCCESS) {
1068           REPORT("Error returned from cuCtxEnablePeerAccess. src = %" PRId32
1069                  ", dst = %" PRId32 ". Fall back to D2D memcpy.\n",
1070                  SrcDevId, DstDevId);
1071           CUDA_ERR_STRING(Err);
1072           PeerAccessMatrix[SrcDevId][DstDevId] = PeerAccessState::No;
1073           return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
1074         }
1075 
1076         PeerAccessMatrix[SrcDevId][DstDevId] = PeerAccessState::Yes;
1077 
1078         LLVM_FALLTHROUGH;
1079       }
1080       case PeerAccessState::Yes: {
1081         Err = cuMemcpyPeerAsync(
1082             (CUdeviceptr)DstPtr, DeviceData[DstDevId].Context,
1083             (CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context, Size, Stream);
1084         if (Err == CUDA_SUCCESS)
1085           return OFFLOAD_SUCCESS;
1086 
1087         DP("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD
1088            ", src_id =%" PRId32 ", dst_ptr = " DPxMOD ", dst_id =%" PRId32
1089            ". Fall back to D2D memcpy.\n",
1090            DPxPTR(SrcPtr), SrcDevId, DPxPTR(DstPtr), DstDevId);
1091         CUDA_ERR_STRING(Err);
1092 
1093         return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
1094       }
1095       }
1096     }
1097 
1098     return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
1099   }
1100 
1101   int dataDelete(const int DeviceId, void *TgtPtr) {
1102     if (UseMemoryManager)
1103       return MemoryManagers[DeviceId]->free(TgtPtr);
1104 
1105     return DeviceAllocators[DeviceId].free(TgtPtr);
1106   }
1107 
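  /// Launch the kernel behind \p TgtEntryPtr with \p ArgNum arguments,
  /// deriving the CUDA grid and block sizes from the requested team/thread
  /// counts, the kernel execution mode, and the device limits.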
1108   int runTargetTeamRegion(const int DeviceId, void *TgtEntryPtr, void **TgtArgs,
1109                           ptrdiff_t *TgtOffsets, const int ArgNum,
1110                           const int TeamNum, const int ThreadLimit,
1111                           const unsigned int LoopTripCount,
1112                           __tgt_async_info *AsyncInfo) const {
1113     // All args are references.
1114     std::vector<void *> Args(ArgNum);
1115     std::vector<void *> Ptrs(ArgNum);
1116 
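    // cuLaunchKernel expects an array of pointers to the argument values, so
    // each Args[I] points at the corresponding Ptrs[I].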
1117     for (int I = 0; I < ArgNum; ++I) {
1118       Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
1119       Args[I] = &Ptrs[I];
1120     }
1121 
1122     KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr);
1123 
1124     const bool IsSPMDGenericMode =
1125         KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD;
1126     const bool IsSPMDMode =
1127         KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_SPMD;
1128     const bool IsGenericMode =
1129         KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC;
1130 
1131     int CudaThreadsPerBlock;
1132     if (ThreadLimit > 0) {
1133       DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
1134       CudaThreadsPerBlock = ThreadLimit;
1135       // Add master warp if necessary
1136       if (IsGenericMode) {
1137         DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize);
1138         CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
1139       }
1140     } else {
1141       DP("Setting CUDA threads per block to default %d\n",
1142          DeviceData[DeviceId].NumThreads);
1143       CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads;
1144     }
1145 
1146     if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) {
1147       DP("Threads per block capped at device limit %d\n",
1148          DeviceData[DeviceId].ThreadsPerBlock);
1149       CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock;
1150     }
1151 
1152     CUresult Err;
1153     if (!KernelInfo->MaxThreadsPerBlock) {
1154       Err = cuFuncGetAttribute(&KernelInfo->MaxThreadsPerBlock,
1155                                CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
1156                                KernelInfo->Func);
1157       if (!checkResult(Err, "Error returned from cuFuncGetAttribute\n"))
1158         return OFFLOAD_FAIL;
1159     }
1160 
1161     if (KernelInfo->MaxThreadsPerBlock < CudaThreadsPerBlock) {
1162       DP("Threads per block capped at kernel limit %d\n",
1163          KernelInfo->MaxThreadsPerBlock);
1164       CudaThreadsPerBlock = KernelInfo->MaxThreadsPerBlock;
1165     }
1166 
1167     unsigned int CudaBlocksPerGrid;
1168     if (TeamNum <= 0) {
1169       if (LoopTripCount > 0 && EnvNumTeams < 0) {
1170         if (IsSPMDGenericMode) {
1171           // If we reach this point, then we are executing a kernel that was
1172           // transformed from Generic-mode to SPMD-mode. This kernel has
          // SPMD-mode execution, but needs its blocks to be scheduled
          // differently because the current loop trip count only applies to
          // the `teams distribute` region and would create far too few blocks
          // using the regular SPMD-mode method.
1177           CudaBlocksPerGrid = LoopTripCount;
1178         } else if (IsSPMDMode) {
1179           // We have a combined construct, i.e. `target teams distribute
          // parallel for [simd]`. We launch enough teams so that each thread
          // executes one iteration of the loop, rounding up to the nearest
          // integer.
1183           CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1;
1184         } else if (IsGenericMode) {
1185           // If we reach this point, then we have a non-combined construct, i.e.
1186           // `teams distribute` with a nested `parallel for` and each team is
1187           // assigned one iteration of the `distribute` loop. E.g.:
1188           //
1189           // #pragma omp target teams distribute
1190           // for(...loop_tripcount...) {
1191           //   #pragma omp parallel for
1192           //   for(...) {}
1193           // }
1194           //
1195           // Threads within a team will execute the iterations of the `parallel`
1196           // loop.
1197           CudaBlocksPerGrid = LoopTripCount;
1198         } else {
1199           REPORT("Unknown execution mode: %d\n",
1200                  static_cast<int8_t>(KernelInfo->ExecutionMode));
1201           return OFFLOAD_FAIL;
1202         }
1203         DP("Using %d teams due to loop trip count %" PRIu32
1204            " and number of threads per block %d\n",
1205            CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock);
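        // Illustrative example (numbers not from the source): in SPMD mode
        // with LoopTripCount = 10000 and CudaThreadsPerBlock = 128, the
        // formula above gives ((10000 - 1) / 128) + 1 = 79 blocks, i.e. one
        // loop iteration per thread.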
1206       } else {
1207         DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams);
1208         CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams;
1209       }
1210     } else {
1211       DP("Using requested number of teams %d\n", TeamNum);
1212       CudaBlocksPerGrid = TeamNum;
1213     }
1214 
1215     if (CudaBlocksPerGrid > DeviceData[DeviceId].BlocksPerGrid) {
1216       DP("Capping number of teams to team limit %d\n",
1217          DeviceData[DeviceId].BlocksPerGrid);
1218       CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid;
1219     }
1220 
1221     INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
1222          "Launching kernel %s with %d blocks and %d threads in %s mode\n",
1223          (getOffloadEntry(DeviceId, TgtEntryPtr))
1224              ? getOffloadEntry(DeviceId, TgtEntryPtr)->name
1225              : "(null)",
1226          CudaBlocksPerGrid, CudaThreadsPerBlock,
1227          (!IsSPMDMode ? (IsGenericMode ? "Generic" : "SPMD-Generic") : "SPMD"));
1228 
1229     CUstream Stream = getStream(DeviceId, AsyncInfo);
1230     Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1,
1231                          /* gridDimZ */ 1, CudaThreadsPerBlock,
1232                          /* blockDimY */ 1, /* blockDimZ */ 1,
1233                          DynamicMemorySize, Stream, &Args[0], nullptr);
1234     if (!checkResult(Err, "Error returned from cuLaunchKernel\n"))
1235       return OFFLOAD_FAIL;
1236 
1237     DP("Launch of entry point at " DPxMOD " successful!\n",
1238        DPxPTR(TgtEntryPtr));
1239 
1240     return OFFLOAD_SUCCESS;
1241   }
1242 
1243   int synchronize(const int DeviceId, __tgt_async_info *AsyncInfo) const {
1244     CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue);
1245     CUresult Err = cuStreamSynchronize(Stream);
1246 
1247     // Once the stream is synchronized, return it to stream pool and reset
1248     // AsyncInfo. This is to make sure the synchronization only works for its
1249     // own tasks.
1250     StreamPool[DeviceId]->release(reinterpret_cast<CUstream>(AsyncInfo->Queue));
1251     AsyncInfo->Queue = nullptr;
1252 
1253     if (Err != CUDA_SUCCESS) {
1254       DP("Error when synchronizing stream. stream = " DPxMOD
1255          ", async info ptr = " DPxMOD "\n",
1256          DPxPTR(Stream), DPxPTR(AsyncInfo));
1257       CUDA_ERR_STRING(Err);
1258     }
1259     return (Err == CUDA_SUCCESS) ? OFFLOAD_SUCCESS : OFFLOAD_FAIL;
1260   }
1261 
1262   void printDeviceInfo(int32_t DeviceId) {
1263     char TmpChar[1000];
1264     std::string TmpStr;
1265     size_t TmpSt;
1266     int TmpInt, TmpInt2, TmpInt3;
1267 
1268     CUdevice Device;
1269     checkResult(cuDeviceGet(&Device, DeviceId),
                "Error returned from cuDeviceGet\n");
1271 
1272     cuDriverGetVersion(&TmpInt);
1273     printf("    CUDA Driver Version: \t\t%d \n", TmpInt);
1274     printf("    CUDA Device Number: \t\t%d \n", DeviceId);
1275     checkResult(cuDeviceGetName(TmpChar, 1000, Device),
1276                 "Error returned from cuDeviceGetName\n");
1277     printf("    Device Name: \t\t\t%s \n", TmpChar);
1278     checkResult(cuDeviceTotalMem(&TmpSt, Device),
1279                 "Error returned from cuDeviceTotalMem\n");
1280     printf("    Global Memory Size: \t\t%zu bytes \n", TmpSt);
1281     checkResult(cuDeviceGetAttribute(
1282                     &TmpInt, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, Device),
1283                 "Error returned from cuDeviceGetAttribute\n");
1284     printf("    Number of Multiprocessors: \t\t%d \n", TmpInt);
1285     checkResult(
1286         cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, Device),
1287         "Error returned from cuDeviceGetAttribute\n");
1288     printf("    Concurrent Copy and Execution: \t%s \n", BOOL2TEXT(TmpInt));
1289     checkResult(cuDeviceGetAttribute(
1290                     &TmpInt, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, Device),
1291                 "Error returned from cuDeviceGetAttribute\n");
1292     printf("    Total Constant Memory: \t\t%d bytes\n", TmpInt);
1293     checkResult(
1294         cuDeviceGetAttribute(
1295             &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, Device),
1296         "Error returned from cuDeviceGetAttribute\n");
1297     printf("    Max Shared Memory per Block: \t%d bytes \n", TmpInt);
1298     checkResult(
1299         cuDeviceGetAttribute(
1300             &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, Device),
1301         "Error returned from cuDeviceGetAttribute\n");
1302     printf("    Registers per Block: \t\t%d \n", TmpInt);
1303     checkResult(
1304         cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device),
1305         "Error returned from cuDeviceGetAttribute\n");
1306     printf("    Warp Size: \t\t\t\t%d Threads \n", TmpInt);
1307     checkResult(cuDeviceGetAttribute(
1308                     &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, Device),
1309                 "Error returned from cuDeviceGetAttribute\n");
1310     printf("    Maximum Threads per Block: \t\t%d \n", TmpInt);
1311     checkResult(cuDeviceGetAttribute(
1312                     &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device),
1313                 "Error returned from cuDeviceGetAttribute\n");
1314     checkResult(cuDeviceGetAttribute(
1315                     &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, Device),
1316                 "Error returned from cuDeviceGetAttribute\n");
1317     checkResult(cuDeviceGetAttribute(
1318                     &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, Device),
1319                 "Error returned from cuDeviceGetAttribute\n");
1320     printf("    Maximum Block Dimensions: \t\t%d, %d, %d \n", TmpInt, TmpInt2,
1321            TmpInt3);
1322     checkResult(cuDeviceGetAttribute(
1323                     &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, Device),
1324                 "Error returned from cuDeviceGetAttribute\n");
1325     checkResult(cuDeviceGetAttribute(
1326                     &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, Device),
1327                 "Error returned from cuDeviceGetAttribute\n");
1328     checkResult(cuDeviceGetAttribute(
1329                     &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, Device),
1330                 "Error returned from cuDeviceGetAttribute\n");
1331     printf("    Maximum Grid Dimensions: \t\t%d x %d x %d \n", TmpInt, TmpInt2,
1332            TmpInt3);
1333     checkResult(
1334         cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_PITCH, Device),
1335         "Error returned from cuDeviceGetAttribute\n");
1336     printf("    Maximum Memory Pitch: \t\t%d bytes \n", TmpInt);
1337     checkResult(cuDeviceGetAttribute(
1338                     &TmpInt, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, Device),
1339                 "Error returned from cuDeviceGetAttribute\n");
1340     printf("    Texture Alignment: \t\t\t%d bytes \n", TmpInt);
1341     checkResult(
1342         cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, Device),
1343         "Error returned from cuDeviceGetAttribute\n");
1344     printf("    Clock Rate: \t\t\t%d kHz\n", TmpInt);
1345     checkResult(cuDeviceGetAttribute(
1346                     &TmpInt, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, Device),
1347                 "Error returned from cuDeviceGetAttribute\n");
1348     printf("    Execution Timeout: \t\t\t%s \n", BOOL2TEXT(TmpInt));
1349     checkResult(
1350         cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_INTEGRATED, Device),
1351         "Error returned from cuDeviceGetAttribute\n");
1352     printf("    Integrated Device: \t\t\t%s \n", BOOL2TEXT(TmpInt));
1353     checkResult(cuDeviceGetAttribute(
1354                     &TmpInt, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, Device),
1355                 "Error returned from cuDeviceGetAttribute\n");
1356     printf("    Can Map Host Memory: \t\t%s \n", BOOL2TEXT(TmpInt));
1357     checkResult(
1358         cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, Device),
1359         "Error returned from cuDeviceGetAttribute\n");
1360     if (TmpInt == CU_COMPUTEMODE_DEFAULT)
1361       TmpStr = "DEFAULT";
1362     else if (TmpInt == CU_COMPUTEMODE_PROHIBITED)
1363       TmpStr = "PROHIBITED";
1364     else if (TmpInt == CU_COMPUTEMODE_EXCLUSIVE_PROCESS)
1365       TmpStr = "EXCLUSIVE PROCESS";
1366     else
1367       TmpStr = "unknown";
1368     printf("    Compute Mode: \t\t\t%s \n", TmpStr.c_str());
1369     checkResult(cuDeviceGetAttribute(
1370                     &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, Device),
1371                 "Error returned from cuDeviceGetAttribute\n");
1372     printf("    Concurrent Kernels: \t\t%s \n", BOOL2TEXT(TmpInt));
1373     checkResult(
1374         cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, Device),
1375         "Error returned from cuDeviceGetAttribute\n");
1376     printf("    ECC Enabled: \t\t\t%s \n", BOOL2TEXT(TmpInt));
1377     checkResult(cuDeviceGetAttribute(
1378                     &TmpInt, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, Device),
1379                 "Error returned from cuDeviceGetAttribute\n");
1380     printf("    Memory Clock Rate: \t\t\t%d kHz\n", TmpInt);
1381     checkResult(
1382         cuDeviceGetAttribute(
1383             &TmpInt, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, Device),
1384         "Error returned from cuDeviceGetAttribute\n");
1385     printf("    Memory Bus Width: \t\t\t%d bits\n", TmpInt);
1386     checkResult(cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE,
1387                                      Device),
1388                 "Error returned from cuDeviceGetAttribute\n");
1389     printf("    L2 Cache Size: \t\t\t%d bytes \n", TmpInt);
1390     checkResult(cuDeviceGetAttribute(
1391                     &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
1392                     Device),
1393                 "Error returned from cuDeviceGetAttribute\n");
1394     printf("    Max Threads Per SMP: \t\t%d \n", TmpInt);
1395     checkResult(cuDeviceGetAttribute(
1396                     &TmpInt, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, Device),
1397                 "Error returned from cuDeviceGetAttribute\n");
1398     printf("    Async Engines: \t\t\t%s (%d) \n", BOOL2TEXT(TmpInt), TmpInt);
1399     checkResult(cuDeviceGetAttribute(
1400                     &TmpInt, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, Device),
1401                 "Error returned from cuDeviceGetAttribute\n");
1402     printf("    Unified Addressing: \t\t%s \n", BOOL2TEXT(TmpInt));
1403     checkResult(cuDeviceGetAttribute(
1404                     &TmpInt, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, Device),
1405                 "Error returned from cuDeviceGetAttribute\n");
1406     printf("    Managed Memory: \t\t\t%s \n", BOOL2TEXT(TmpInt));
1407     checkResult(
1408         cuDeviceGetAttribute(
1409             &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, Device),
1410         "Error returned from cuDeviceGetAttribute\n");
1411     printf("    Concurrent Managed Memory: \t\t%s \n", BOOL2TEXT(TmpInt));
1412     checkResult(
1413         cuDeviceGetAttribute(
1414             &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, Device),
1415         "Error returned from cuDeviceGetAttribute\n");
1416     printf("    Preemption Supported: \t\t%s \n", BOOL2TEXT(TmpInt));
1417     checkResult(cuDeviceGetAttribute(
1418                     &TmpInt, CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, Device),
1419                 "Error returned from cuDeviceGetAttribute\n");
1420     printf("    Cooperative Launch: \t\t%s \n", BOOL2TEXT(TmpInt));
1421     checkResult(cuDeviceGetAttribute(
1422                     &TmpInt, CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD, Device),
1423                 "Error returned from cuDeviceGetAttribute\n");
1424     printf("    Multi-Device Boars: \t\t%s \n", BOOL2TEXT(TmpInt));
1425     checkResult(
1426         cuDeviceGetAttribute(
1427             &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, Device),
1428         "Error returned from cuDeviceGetAttribute\n");
1429     checkResult(
1430         cuDeviceGetAttribute(
1431             &TmpInt2, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, Device),
1432         "Error returned from cuDeviceGetAttribute\n");
1433     printf("    Compute Capabilities: \t\t%d%d \n", TmpInt, TmpInt2);
1434   }
1435 
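  /// Acquire a CUDA event from the per-device event pool and return it via P.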
1436   int createEvent(int DeviceId, void **P) {
1437     CUevent Event = nullptr;
1438     if (EventPool[DeviceId]->acquire(Event) != OFFLOAD_SUCCESS)
1439       return OFFLOAD_FAIL;
1440     *P = Event;
1441     return OFFLOAD_SUCCESS;
1442   }
1443 
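  /// Return the event to the per-device event pool.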
1444   int destroyEvent(int DeviceId, void *EventPtr) {
1445     EventPool[DeviceId]->release(reinterpret_cast<CUevent>(EventPtr));
1446     return OFFLOAD_SUCCESS;
1447   }
1448 
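  /// Make future work on the stream in AsyncInfo wait until the given event
  /// has completed.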
1449   int waitEvent(const int DeviceId, __tgt_async_info *AsyncInfo,
1450                 void *EventPtr) const {
1451     CUstream Stream = getStream(DeviceId, AsyncInfo);
1452     CUevent Event = reinterpret_cast<CUevent>(EventPtr);
1453 
1454     // We don't use CU_EVENT_WAIT_DEFAULT here as it is only available in
1455     // newer CUDA versions, where it is defined as 0x0. In older versions, per
1456     // the CUDA API documentation, that argument has to be 0x0.
1457     CUresult Err = cuStreamWaitEvent(Stream, Event, 0);
1458     if (Err != CUDA_SUCCESS) {
1459       DP("Error when waiting event. stream = " DPxMOD ", event = " DPxMOD "\n",
1460          DPxPTR(Stream), DPxPTR(Event));
1461       CUDA_ERR_STRING(Err);
1462       return OFFLOAD_FAIL;
1463     }
1464 
1465     return OFFLOAD_SUCCESS;
1466   }
1467 
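  /// Return the stream held by AsyncInfo to the per-device stream pool.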
1468   int releaseAsyncInfo(int DeviceId, __tgt_async_info *AsyncInfo) const {
1469     if (AsyncInfo->Queue) {
1470       StreamPool[DeviceId]->release(
1471           reinterpret_cast<CUstream>(AsyncInfo->Queue));
1472       AsyncInfo->Queue = nullptr;
1473     }
1474 
1475     return OFFLOAD_SUCCESS;
1476   }
1477 
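  /// Allocate a new __tgt_async_info and eagerly bind a stream to it.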
1478   int initAsyncInfo(int DeviceId, __tgt_async_info **AsyncInfo) const {
1479     *AsyncInfo = new __tgt_async_info;
1480     getStream(DeviceId, *AsyncInfo);
1481     return OFFLOAD_SUCCESS;
1482   }
1483 
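  /// Fill DeviceInfo with the CUDA context and device handle if not set yet.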
1484   int initDeviceInfo(int DeviceId, __tgt_device_info *DeviceInfo,
1485                      const char **ErrStr) const {
1486     assert(DeviceInfo && "DeviceInfo is nullptr");
1487 
1488     if (!DeviceInfo->Context)
1489       DeviceInfo->Context = DeviceData[DeviceId].Context;
1490     if (!DeviceInfo->Device) {
1491       CUdevice Dev;
1492       CUresult Err = cuDeviceGet(&Dev, DeviceId);
1493       if (Err == CUDA_SUCCESS) {
1494         DeviceInfo->Device = reinterpret_cast<void *>(Dev);
1495       } else {
1496         cuGetErrorString(Err, ErrStr);
1497         return OFFLOAD_FAIL;
1498       }
1499     }
1500     return OFFLOAD_SUCCESS;
1501   }
1502 
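  /// Make the CUDA context of the given device current on the calling thread.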
1503   int setContext(int DeviceId) {
1504     assert(InitializedFlags[DeviceId] && "Device is not initialized");
1505 
1506     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
1507     if (!checkResult(Err, "error returned from cuCtxSetCurrent"))
1508       return OFFLOAD_FAIL;
1509 
1510     return OFFLOAD_SUCCESS;
1511   }
1512 };
1513 
1514 DeviceRTLTy DeviceRTL;
1515 } // namespace
1516 
1517 // Exposed library API function
1518 #ifdef __cplusplus
1519 extern "C" {
1520 #endif
1521 
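// Return non-zero if the image is an ELF binary whose machine is EM_CUDA.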
1522 int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image) {
1523   return elf_check_machine(Image, /* EM_CUDA */ 190);
1524 }
1525 
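// Besides the ELF machine check, verify that the image's "sm_<major><minor>"
// architecture string matches every visible device. An image that does not
// specify an architecture is assumed to be compatible.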
1526 int32_t __tgt_rtl_is_valid_binary_info(__tgt_device_image *Image,
1527                                        __tgt_image_info *Info) {
1528   if (!__tgt_rtl_is_valid_binary(Image))
1529     return false;
1530 
1531   // A subarchitecture was not specified. Assume it is compatible.
1532   if (!Info->Arch)
1533     return true;
1534 
1535   int32_t NumberOfDevices = 0;
1536   if (cuDeviceGetCount(&NumberOfDevices) != CUDA_SUCCESS)
1537     return false;
1538 
1539   for (int32_t DeviceId = 0; DeviceId < NumberOfDevices; ++DeviceId) {
1540     CUdevice Device;
1541     if (cuDeviceGet(&Device, DeviceId) != CUDA_SUCCESS)
1542       return false;
1543 
1544     int32_t Major, Minor;
1545     if (cuDeviceGetAttribute(&Major,
1546                              CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
1547                              Device) != CUDA_SUCCESS)
1548       return false;
1549     if (cuDeviceGetAttribute(&Minor,
1550                              CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
1551                              Device) != CUDA_SUCCESS)
1552       return false;
1553 
1554     std::string ArchStr = "sm_" + std::to_string(Major) + std::to_string(Minor);
1555     if (ArchStr != Info->Arch)
1556       return false;
1557   }
1558 
1559   DP("Image has compatible compute capability: %s\n", info->Arch);
1560   return true;
1561 }
1562 
1563 int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); }
1564 
1565 int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
1566   DP("Init requires flags to %" PRId64 "\n", RequiresFlags);
1567   DeviceRTL.setRequiresFlag(RequiresFlags);
1568   return RequiresFlags;
1569 }
1570 
1571 int32_t __tgt_rtl_is_data_exchangable(int32_t SrcDevId, int DstDevId) {
1572   if (DeviceRTL.isValidDeviceId(SrcDevId) &&
1573       DeviceRTL.isValidDeviceId(DstDevId))
1574     return 1;
1575 
1576   return 0;
1577 }
1578 
1579 int32_t __tgt_rtl_init_device(int32_t DeviceId) {
1580   assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1581   // Context is set when initializing the device.
1582 
1583   return DeviceRTL.initDevice(DeviceId);
1584 }
1585 
1586 int32_t __tgt_rtl_deinit_device(int32_t DeviceId) {
1587   assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1588   // Context is set when deinitializing the device.
1589 
1590   return DeviceRTL.deinitDevice(DeviceId);
1591 }
1592 
1593 __tgt_target_table *__tgt_rtl_load_binary(int32_t DeviceId,
1594                                           __tgt_device_image *Image) {
1595   assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1596 
1597   if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
1598     return nullptr;
1599 
1600   return DeviceRTL.loadBinary(DeviceId, Image);
1601 }
1602 
1603 void *__tgt_rtl_data_alloc(int32_t DeviceId, int64_t Size, void *,
1604                            int32_t Kind) {
1605   assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1606 
1607   if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
1608     return nullptr;
1609 
1610   return DeviceRTL.dataAlloc(DeviceId, Size, (TargetAllocTy)Kind);
1611 }
1612 
1613 int32_t __tgt_rtl_data_submit(int32_t DeviceId, void *TgtPtr, void *HstPtr,
1614                               int64_t Size) {
1615   assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1616   // Context is set in __tgt_rtl_data_submit_async.
1617 
1618   __tgt_async_info AsyncInfo;
1619   const int32_t Rc =
1620       __tgt_rtl_data_submit_async(DeviceId, TgtPtr, HstPtr, Size, &AsyncInfo);
1621   if (Rc != OFFLOAD_SUCCESS)
1622     return OFFLOAD_FAIL;
1623 
1624   return __tgt_rtl_synchronize(DeviceId, &AsyncInfo);
1625 }
1626 
1627 int32_t __tgt_rtl_data_submit_async(int32_t DeviceId, void *TgtPtr,
1628                                     void *HstPtr, int64_t Size,
1629                                     __tgt_async_info *AsyncInfoPtr) {
1630   assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1631   assert(AsyncInfoPtr && "async_info_ptr is nullptr");
1632 
1633   if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
1634     return OFFLOAD_FAIL;
1635 
1636   return DeviceRTL.dataSubmit(DeviceId, TgtPtr, HstPtr, Size, AsyncInfoPtr);
1637 }
1638 
1639 int32_t __tgt_rtl_data_retrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr,
1640                                 int64_t Size) {
1641   assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1642   // Context is set in __tgt_rtl_data_retrieve_async.
1643 
1644   __tgt_async_info AsyncInfo;
1645   const int32_t Rc =
1646       __tgt_rtl_data_retrieve_async(DeviceId, HstPtr, TgtPtr, Size, &AsyncInfo);
1647   if (Rc != OFFLOAD_SUCCESS)
1648     return OFFLOAD_FAIL;
1649 
1650   return __tgt_rtl_synchronize(DeviceId, &AsyncInfo);
1651 }
1652 
1653 int32_t __tgt_rtl_data_retrieve_async(int32_t DeviceId, void *HstPtr,
1654                                       void *TgtPtr, int64_t Size,
1655                                       __tgt_async_info *AsyncInfoPtr) {
1656   assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1657   assert(AsyncInfoPtr && "async_info_ptr is nullptr");
1658 
1659   if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
1660     return OFFLOAD_FAIL;
1661 
1662   return DeviceRTL.dataRetrieve(DeviceId, HstPtr, TgtPtr, Size, AsyncInfoPtr);
1663 }
1664 
1665 int32_t __tgt_rtl_data_exchange_async(int32_t SrcDevId, void *SrcPtr,
1666                                       int DstDevId, void *DstPtr, int64_t Size,
1667                                       __tgt_async_info *AsyncInfo) {
1668   assert(DeviceRTL.isValidDeviceId(SrcDevId) && "src_dev_id is invalid");
1669   assert(DeviceRTL.isValidDeviceId(DstDevId) && "dst_dev_id is invalid");
1670   assert(AsyncInfo && "AsyncInfo is nullptr");
1671 
1672   if (DeviceRTL.setContext(SrcDevId) != OFFLOAD_SUCCESS)
1673     return OFFLOAD_FAIL;
1674 
1675   return DeviceRTL.dataExchange(SrcDevId, SrcPtr, DstDevId, DstPtr, Size,
1676                                 AsyncInfo);
1677 }
1678 
1679 int32_t __tgt_rtl_data_exchange(int32_t SrcDevId, void *SrcPtr,
1680                                 int32_t DstDevId, void *DstPtr, int64_t Size) {
1681   assert(DeviceRTL.isValidDeviceId(SrcDevId) && "src_dev_id is invalid");
1682   assert(DeviceRTL.isValidDeviceId(DstDevId) && "dst_dev_id is invalid");
1683   // Context is set in __tgt_rtl_data_exchange_async.
1684 
1685   __tgt_async_info AsyncInfo;
1686   const int32_t Rc = __tgt_rtl_data_exchange_async(SrcDevId, SrcPtr, DstDevId,
1687                                                    DstPtr, Size, &AsyncInfo);
1688   if (Rc != OFFLOAD_SUCCESS)
1689     return OFFLOAD_FAIL;
1690 
1691   return __tgt_rtl_synchronize(SrcDevId, &AsyncInfo);
1692 }
1693 
1694 int32_t __tgt_rtl_data_delete(int32_t DeviceId, void *TgtPtr) {
1695   assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1696 
1697   if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
1698     return OFFLOAD_FAIL;
1699 
1700   return DeviceRTL.dataDelete(DeviceId, TgtPtr);
1701 }
1702 
1703 int32_t __tgt_rtl_run_target_team_region(int32_t DeviceId, void *TgtEntryPtr,
1704                                          void **TgtArgs, ptrdiff_t *TgtOffsets,
1705                                          int32_t ArgNum, int32_t TeamNum,
1706                                          int32_t ThreadLimit,
1707                                          uint64_t LoopTripcount) {
1708   assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1709   // Context is set in __tgt_rtl_run_target_team_region_async.
1710 
1711   __tgt_async_info AsyncInfo;
1712   const int32_t Rc = __tgt_rtl_run_target_team_region_async(
1713       DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets, ArgNum, TeamNum, ThreadLimit,
1714       LoopTripcount, &AsyncInfo);
1715   if (Rc != OFFLOAD_SUCCESS)
1716     return OFFLOAD_FAIL;
1717 
1718   return __tgt_rtl_synchronize(DeviceId, &AsyncInfo);
1719 }
1720 
1721 int32_t __tgt_rtl_run_target_team_region_async(
1722     int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs, ptrdiff_t *TgtOffsets,
1723     int32_t ArgNum, int32_t TeamNum, int32_t ThreadLimit,
1724     uint64_t LoopTripcount, __tgt_async_info *AsyncInfoPtr) {
1725   assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1726 
1727   if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
1728     return OFFLOAD_FAIL;
1729 
1730   return DeviceRTL.runTargetTeamRegion(DeviceId, TgtEntryPtr, TgtArgs,
1731                                        TgtOffsets, ArgNum, TeamNum, ThreadLimit,
1732                                        LoopTripcount, AsyncInfoPtr);
1733 }
1734 
1735 int32_t __tgt_rtl_run_target_region(int32_t DeviceId, void *TgtEntryPtr,
1736                                     void **TgtArgs, ptrdiff_t *TgtOffsets,
1737                                     int32_t ArgNum) {
1738   assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1739   // Context is set in __tgt_rtl_run_target_region_async.
1740 
1741   __tgt_async_info AsyncInfo;
1742   const int32_t Rc = __tgt_rtl_run_target_region_async(
1743       DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets, ArgNum, &AsyncInfo);
1744   if (Rc != OFFLOAD_SUCCESS)
1745     return OFFLOAD_FAIL;
1746 
1747   return __tgt_rtl_synchronize(DeviceId, &AsyncInfo);
1748 }
1749 
1750 int32_t __tgt_rtl_run_target_region_async(int32_t DeviceId, void *TgtEntryPtr,
1751                                           void **TgtArgs, ptrdiff_t *TgtOffsets,
1752                                           int32_t ArgNum,
1753                                           __tgt_async_info *AsyncInfoPtr) {
1754   assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1755   // Context is set in __tgt_rtl_run_target_team_region_async.
1756   return __tgt_rtl_run_target_team_region_async(
1757       DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets, ArgNum,
1758       /* team num */ 1, /* thread_limit */ 1, /* loop_tripcount */ 0,
1759       AsyncInfoPtr);
1760 }
1761 
1762 int32_t __tgt_rtl_synchronize(int32_t DeviceId,
1763                               __tgt_async_info *AsyncInfoPtr) {
1764   assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1765   assert(AsyncInfoPtr && "async_info_ptr is nullptr");
1766   assert(AsyncInfoPtr->Queue && "async_info_ptr->Queue is nullptr");
1767   // NOTE: We don't need to set context for stream sync.
1768   return DeviceRTL.synchronize(DeviceId, AsyncInfoPtr);
1769 }
1770 
1771 void __tgt_rtl_set_info_flag(uint32_t NewInfoLevel) {
1772   std::atomic<uint32_t> &InfoLevel = getInfoLevelInternal();
1773   InfoLevel.store(NewInfoLevel);
1774 }
1775 
1776 void __tgt_rtl_print_device_info(int32_t DeviceId) {
1777   assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1778   // NOTE: We don't need to set context for print device info.
1779   DeviceRTL.printDeviceInfo(DeviceId);
1780 }
1781 
1782 int32_t __tgt_rtl_create_event(int32_t DeviceId, void **Event) {
1783   assert(Event && "event is nullptr");
1784 
1785   if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
1786     return OFFLOAD_FAIL;
1787 
1788   return DeviceRTL.createEvent(DeviceId, Event);
1789 }
1790 
1791 int32_t __tgt_rtl_record_event(int32_t DeviceId, void *EventPtr,
1792                                __tgt_async_info *AsyncInfoPtr) {
1793   assert(AsyncInfoPtr && "async_info_ptr is nullptr");
1794   assert(AsyncInfoPtr->Queue && "async_info_ptr->Queue is nullptr");
1795   assert(EventPtr && "event_ptr is nullptr");
1796   // NOTE: We might not need to set context for event record.
1797   return recordEvent(EventPtr, AsyncInfoPtr);
1798 }
1799 
1800 int32_t __tgt_rtl_wait_event(int32_t DeviceId, void *EventPtr,
1801                              __tgt_async_info *AsyncInfoPtr) {
1802   assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1803   assert(AsyncInfoPtr && "async_info_ptr is nullptr");
1804   assert(EventPtr && "event is nullptr");
1805   // If we don't have a queue we need to set the context.
1806   if (!AsyncInfoPtr->Queue && DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
1807     return OFFLOAD_FAIL;
1808   return DeviceRTL.waitEvent(DeviceId, AsyncInfoPtr, EventPtr);
1809 }
1810 
1811 int32_t __tgt_rtl_sync_event(int32_t DeviceId, void *EventPtr) {
1812   assert(EventPtr && "event is nullptr");
1813   // NOTE: We might not need to set context for event sync.
1814   return syncEvent(EventPtr);
1815 }
1816 
1817 int32_t __tgt_rtl_destroy_event(int32_t DeviceId, void *EventPtr) {
1818   assert(EventPtr && "event is nullptr");
1819 
1820   if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
1821     return OFFLOAD_FAIL;
1822 
1823   return DeviceRTL.destroyEvent(DeviceId, EventPtr);
1824 }
1825 
1826 int32_t __tgt_rtl_release_async_info(int32_t DeviceId,
1827                                      __tgt_async_info *AsyncInfo) {
1828   assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1829   assert(AsyncInfo && "async_info is nullptr");
1830 
1831   if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
1832     return OFFLOAD_FAIL;
1833 
1834   return DeviceRTL.releaseAsyncInfo(DeviceId, AsyncInfo);
1835 }
1836 
1837 int32_t __tgt_rtl_init_async_info(int32_t DeviceId,
1838                                   __tgt_async_info **AsyncInfo) {
1839   assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1840   assert(AsyncInfo && "async_info is nullptr");
1841 
1842   if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
1843     return OFFLOAD_FAIL;
1844 
1845   return DeviceRTL.initAsyncInfo(DeviceId, AsyncInfo);
1846 }
1847 
1848 int32_t __tgt_rtl_init_device_info(int32_t DeviceId,
1849                                    __tgt_device_info *DeviceInfoPtr,
1850                                    const char **ErrStr) {
1851   assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1852   assert(DeviceInfoPtr && "device_info_ptr is nullptr");
1853 
1854   if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
1855     return OFFLOAD_FAIL;
1856 
1857   return DeviceRTL.initDeviceInfo(DeviceId, DeviceInfoPtr, ErrStr);
1858 }
1859 
1860 #ifdef __cplusplus
1861 }
1862 #endif
1863