1 //===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // RTL for CUDA machine
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include <cassert>
14 #include <cstddef>
15 #include <cuda.h>
16 #include <list>
17 #include <memory>
18 #include <mutex>
19 #include <string>
20 #include <unordered_map>
21 #include <vector>
22 
23 #include "Debug.h"
24 #include "omptargetplugin.h"
25 
26 #define TARGET_NAME CUDA
27 #define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"
28 
29 #include "MemoryManager.h"
30 
31 #include "llvm/Frontend/OpenMP/OMPConstants.h"
32 
33 // Utility for retrieving and printing CUDA error string.
34 #ifdef OMPTARGET_DEBUG
35 #define CUDA_ERR_STRING(err)                                                   \
36   do {                                                                         \
37     if (getDebugLevel() > 0) {                                                 \
38       const char *errStr = nullptr;                                            \
39       CUresult errStr_status = cuGetErrorString(err, &errStr);                 \
40       if (errStr_status == CUDA_ERROR_INVALID_VALUE)                           \
41         REPORT("Unrecognized CUDA error code: %d\n", err);                     \
42       else if (errStr_status == CUDA_SUCCESS)                                  \
43         REPORT("CUDA error is: %s\n", errStr);                                 \
44       else {                                                                   \
45         REPORT("Unresolved CUDA error code: %d\n", err);                       \
46         REPORT("Unsuccessful cuGetErrorString return status: %d\n",            \
47                errStr_status);                                                 \
48       }                                                                        \
49     } else {                                                                   \
50       const char *errStr = nullptr;                                            \
51       CUresult errStr_status = cuGetErrorString(err, &errStr);                 \
52       if (errStr_status == CUDA_SUCCESS)                                       \
53         REPORT("%s \n", errStr);                                               \
54     }                                                                          \
55   } while (false)
56 #else // OMPTARGET_DEBUG
57 #define CUDA_ERR_STRING(err)                                                   \
58   do {                                                                         \
59     const char *errStr = nullptr;                                              \
60     CUresult errStr_status = cuGetErrorString(err, &errStr);                   \
61     if (errStr_status == CUDA_SUCCESS)                                         \
62       REPORT("%s \n", errStr);                                                 \
63   } while (false)
64 #endif // OMPTARGET_DEBUG
65 
66 #define BOOL2TEXT(b) ((b) ? "Yes" : "No")
67 
68 #include "elf_common.h"
69 
70 /// Keep entries table per device.
71 struct FuncOrGblEntryTy {
72   __tgt_target_table Table;
73   std::vector<__tgt_offload_entry> Entries;
74 };
75 
76 /// Use a single entity to encode a kernel and a set of flags.
77 struct KernelTy {
78   CUfunction Func;
79 
  /// Execution mode of the kernel.
81   llvm::omp::OMPTgtExecModeFlags ExecutionMode;
82 
83   /// Maximal number of threads per block for this kernel.
84   int MaxThreadsPerBlock = 0;
85 
86   KernelTy(CUfunction _Func, llvm::omp::OMPTgtExecModeFlags _ExecutionMode)
87       : Func(_Func), ExecutionMode(_ExecutionMode) {}
88 };
89 
/// Device environment data.
/// Manually kept in sync with the deviceRTL side for now; move to a dedicated
/// header file later.
93 struct omptarget_device_environmentTy {
94   int32_t debug_level;
95   uint32_t num_devices;
96   uint32_t device_num;
97   uint64_t dynamic_shared_size;
98 };
99 
100 namespace {
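/// Return true if \p Err is CUDA_SUCCESS; otherwise print \p ErrMsg together
/// with the CUDA error string and return false.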
101 bool checkResult(CUresult Err, const char *ErrMsg) {
102   if (Err == CUDA_SUCCESS)
103     return true;
104 
105   REPORT("%s", ErrMsg);
106   CUDA_ERR_STRING(Err);
107   return false;
108 }
109 
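/// Enqueue an asynchronous device-to-device copy of \p Size bytes from
/// \p SrcPtr to \p DstPtr on \p Stream.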
110 int memcpyDtoD(const void *SrcPtr, void *DstPtr, int64_t Size,
111                CUstream Stream) {
112   CUresult Err =
113       cuMemcpyDtoDAsync((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, Size, Stream);
114 
115   if (Err != CUDA_SUCCESS) {
116     DP("Error when copying data from device to device. Pointers: src "
117        "= " DPxMOD ", dst = " DPxMOD ", size = %" PRId64 "\n",
118        DPxPTR(SrcPtr), DPxPTR(DstPtr), Size);
119     CUDA_ERR_STRING(Err);
120     return OFFLOAD_FAIL;
121   }
122 
123   return OFFLOAD_SUCCESS;
124 }
125 
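/// Create a CUDA event and return it through \p P.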
126 int createEvent(void **P) {
127   CUevent Event = nullptr;
128 
129   CUresult Err = cuEventCreate(&Event, CU_EVENT_DEFAULT);
130   if (Err != CUDA_SUCCESS) {
    DP("Error when creating event. Event = " DPxMOD "\n", DPxPTR(Event));
132     CUDA_ERR_STRING(Err);
133     return OFFLOAD_FAIL;
134   }
135 
136   *P = Event;
137 
138   return OFFLOAD_SUCCESS;
139 }
140 
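/// Record the event \p EventPtr on the stream bound to \p AsyncInfo.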
141 int recordEvent(void *EventPtr, __tgt_async_info *AsyncInfo) {
142   CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue);
143   CUevent Event = reinterpret_cast<CUevent>(EventPtr);
144 
145   CUresult Err = cuEventRecord(Event, Stream);
146   if (Err != CUDA_SUCCESS) {
147     DP("Error when recording event. stream = " DPxMOD ", event = " DPxMOD "\n",
148        DPxPTR(Stream), DPxPTR(Event));
149     CUDA_ERR_STRING(Err);
150     return OFFLOAD_FAIL;
151   }
152 
153   return OFFLOAD_SUCCESS;
154 }
155 
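/// Block the calling host thread until the event \p EventPtr has completed.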
156 int syncEvent(void *EventPtr) {
157   CUevent Event = reinterpret_cast<CUevent>(EventPtr);
158 
159   CUresult Err = cuEventSynchronize(Event);
160   if (Err != CUDA_SUCCESS) {
161     DP("Error when syncing event = " DPxMOD "\n", DPxPTR(Event));
162     CUDA_ERR_STRING(Err);
163     return OFFLOAD_FAIL;
164   }
165 
166   return OFFLOAD_SUCCESS;
167 }
168 
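/// Destroy the CUDA event \p EventPtr.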
169 int destroyEvent(void *EventPtr) {
170   CUevent Event = reinterpret_cast<CUevent>(EventPtr);
171 
172   CUresult Err = cuEventDestroy(Event);
173   if (Err != CUDA_SUCCESS) {
174     DP("Error when destroying event = " DPxMOD "\n", DPxPTR(Event));
175     CUDA_ERR_STRING(Err);
176     return OFFLOAD_FAIL;
177   }
178 
179   return OFFLOAD_SUCCESS;
180 }
181 
/// Structure containing per-device data.
183 struct DeviceDataTy {
184   /// List that contains all the kernels.
185   std::list<KernelTy> KernelsList;
186 
187   std::list<FuncOrGblEntryTy> FuncGblEntries;
188 
189   CUcontext Context = nullptr;
190   // Device properties
191   int ThreadsPerBlock = 0;
192   int BlocksPerGrid = 0;
193   int WarpSize = 0;
194   // OpenMP properties
195   int NumTeams = 0;
196   int NumThreads = 0;
197 };
198 
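/// Manages a growable pool of CUstreams per device so that streams can be
/// reused across target tasks instead of being created and destroyed for
/// every asynchronous operation.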
199 class StreamManagerTy {
200   int NumberOfDevices;
201   // The initial size of stream pool
202   int EnvNumInitialStreams;
203   // Per-device stream mutex
204   std::vector<std::unique_ptr<std::mutex>> StreamMtx;
205   // Per-device stream Id indicates the next available stream in the pool
206   std::vector<int> NextStreamId;
207   // Per-device stream pool
208   std::vector<std::vector<CUstream>> StreamPool;
209   // Reference to per-device data
210   std::vector<DeviceDataTy> &DeviceData;
211 
  // If there is no CUstream left in the pool, we resize the pool to allocate
  // more CUstreams. This function must be called while holding the per-device
  // stream mutex, and it never shrinks the pool.
215   void resizeStreamPool(const int DeviceId, const size_t NewSize) {
216     std::vector<CUstream> &Pool = StreamPool[DeviceId];
217     const size_t CurrentSize = Pool.size();
218     assert(NewSize > CurrentSize && "new size is not larger than current size");
219 
220     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
221     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) {
      // Return early if we cannot switch to the right context, so that we do
      // not create a bunch of streams that do not belong to the right device.
      // Offloading will fail later because the selected CUstream is nullptr.
226       return;
227     }
228 
229     Pool.resize(NewSize, nullptr);
230 
231     for (size_t I = CurrentSize; I < NewSize; ++I) {
232       checkResult(cuStreamCreate(&Pool[I], CU_STREAM_NON_BLOCKING),
233                   "Error returned from cuStreamCreate\n");
234     }
235   }
236 
237 public:
238   StreamManagerTy(const int NumberOfDevices,
239                   std::vector<DeviceDataTy> &DeviceData)
240       : NumberOfDevices(NumberOfDevices), EnvNumInitialStreams(32),
241         DeviceData(DeviceData) {
242     StreamPool.resize(NumberOfDevices);
243     NextStreamId.resize(NumberOfDevices);
244     StreamMtx.resize(NumberOfDevices);
245 
246     if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS"))
247       EnvNumInitialStreams = std::stoi(EnvStr);
248 
249     // Initialize the next stream id
250     std::fill(NextStreamId.begin(), NextStreamId.end(), 0);
251 
252     // Initialize stream mutex
253     for (std::unique_ptr<std::mutex> &Ptr : StreamMtx)
254       Ptr = std::make_unique<std::mutex>();
255   }
256 
257   ~StreamManagerTy() {
258     // Destroy streams
259     for (int I = 0; I < NumberOfDevices; ++I) {
260       checkResult(cuCtxSetCurrent(DeviceData[I].Context),
261                   "Error returned from cuCtxSetCurrent\n");
262 
263       for (CUstream &S : StreamPool[I]) {
264         if (S)
265           checkResult(cuStreamDestroy(S),
266                       "Error returned from cuStreamDestroy\n");
267       }
268     }
269   }
270 
  // Get a CUstream from the pool. The per-device next stream id always points
  // to the next available CUstream. That means CUstreams [0, id-1] have been
  // assigned and [id, ...) are still available. If there is no CUstream left,
  // we ask the CUDA RT for more. Each time a CUstream is assigned, the id
  // increases by one.
276   // xxxxxs+++++++++
277   //      ^
278   //      id
279   // After assignment, the pool becomes the following and s is assigned.
280   // xxxxxs+++++++++
281   //       ^
282   //       id
283   CUstream getStream(const int DeviceId) {
284     const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
285     int &Id = NextStreamId[DeviceId];
286     // No CUstream left in the pool, we need to request from CUDA RT
287     if (Id == static_cast<int>(StreamPool[DeviceId].size())) {
288       // By default we double the stream pool every time
289       resizeStreamPool(DeviceId, Id * 2);
290     }
291     return StreamPool[DeviceId][Id++];
292   }
293 
  // Return a CUstream back to the pool. As mentioned above, the per-device
  // next stream id always points to the next available CUstream, so when we
  // return a CUstream, we first decrease the id and then copy the CUstream
  // back.
  // It is worth noting that the order in which streams are returned might
  // differ from the order in which they were assigned; that is, at some point
  // there might be two identical CUstreams in the pool.
  // xxax+a+++++
  //     ^
  //     id
  // However, this does not matter because they are always on the two sides of
  // id. The one on the left will eventually be overwritten by another
  // CUstream. Therefore, after several executions, the order of the pool
  // might differ from its initial state.
308   void returnStream(const int DeviceId, CUstream Stream) {
309     const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
310     int &Id = NextStreamId[DeviceId];
311     assert(Id > 0 && "Wrong stream ID");
312     StreamPool[DeviceId][--Id] = Stream;
313   }
314 
315   bool initializeDeviceStreamPool(const int DeviceId) {
316     assert(StreamPool[DeviceId].empty() && "stream pool has been initialized");
317 
318     resizeStreamPool(DeviceId, EnvNumInitialStreams);
319 
320     // Check the size of stream pool
321     if (static_cast<int>(StreamPool[DeviceId].size()) != EnvNumInitialStreams)
322       return false;
323 
324     // Check whether each stream is valid
325     for (CUstream &S : StreamPool[DeviceId])
326       if (!S)
327         return false;
328 
329     return true;
330   }
331 };
332 
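/// Main per-process state of the CUDA plugin: per-device data, the stream
/// manager, allocators and memory managers, and launch defaults derived from
/// the environment.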
333 class DeviceRTLTy {
334   int NumberOfDevices;
335   // OpenMP environment properties
336   int EnvNumTeams;
337   int EnvTeamLimit;
338   int EnvTeamThreadLimit;
339   // OpenMP requires flags
340   int64_t RequiresFlags;
341   // Amount of dynamic shared memory to use at launch.
342   uint64_t DynamicMemorySize;
343 
344   static constexpr const int HardTeamLimit = 1U << 16U; // 64k
345   static constexpr const int HardThreadLimit = 1024;
346   static constexpr const int DefaultNumTeams = 128;
347   static constexpr const int DefaultNumThreads = 128;
348 
349   std::unique_ptr<StreamManagerTy> StreamManager;
350   std::vector<DeviceDataTy> DeviceData;
351   std::vector<CUmodule> Modules;
352 
353   /// A class responsible for interacting with device native runtime library to
354   /// allocate and free memory.
355   class CUDADeviceAllocatorTy : public DeviceAllocatorTy {
356     const int DeviceId;
357     const std::vector<DeviceDataTy> &DeviceData;
358     std::unordered_map<void *, TargetAllocTy> HostPinnedAllocs;
359 
360   public:
361     CUDADeviceAllocatorTy(int DeviceId, std::vector<DeviceDataTy> &DeviceData)
362         : DeviceId(DeviceId), DeviceData(DeviceData) {}
363 
364     void *allocate(size_t Size, void *, TargetAllocTy Kind) override {
365       if (Size == 0)
366         return nullptr;
367 
368       CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
369       if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
370         return nullptr;
371 
372       void *MemAlloc = nullptr;
373       switch (Kind) {
374       case TARGET_ALLOC_DEFAULT:
375       case TARGET_ALLOC_DEVICE:
376         CUdeviceptr DevicePtr;
377         Err = cuMemAlloc(&DevicePtr, Size);
378         MemAlloc = (void *)DevicePtr;
379         if (!checkResult(Err, "Error returned from cuMemAlloc\n"))
380           return nullptr;
381         break;
382       case TARGET_ALLOC_HOST:
383         void *HostPtr;
384         Err = cuMemAllocHost(&HostPtr, Size);
385         MemAlloc = HostPtr;
386         if (!checkResult(Err, "Error returned from cuMemAllocHost\n"))
387           return nullptr;
388         HostPinnedAllocs[MemAlloc] = Kind;
389         break;
390       case TARGET_ALLOC_SHARED:
391         CUdeviceptr SharedPtr;
392         Err = cuMemAllocManaged(&SharedPtr, Size, CU_MEM_ATTACH_GLOBAL);
393         MemAlloc = (void *)SharedPtr;
394         if (!checkResult(Err, "Error returned from cuMemAllocManaged\n"))
395           return nullptr;
396         break;
397       }
398 
399       return MemAlloc;
400     }
401 
402     int free(void *TgtPtr) override {
403       CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
404       if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
405         return OFFLOAD_FAIL;
406 
407       // Host pinned memory must be freed differently.
408       TargetAllocTy Kind =
409           (HostPinnedAllocs.find(TgtPtr) == HostPinnedAllocs.end())
410               ? TARGET_ALLOC_DEFAULT
411               : TARGET_ALLOC_HOST;
412       switch (Kind) {
413       case TARGET_ALLOC_DEFAULT:
414       case TARGET_ALLOC_DEVICE:
415       case TARGET_ALLOC_SHARED:
416         Err = cuMemFree((CUdeviceptr)TgtPtr);
417         if (!checkResult(Err, "Error returned from cuMemFree\n"))
418           return OFFLOAD_FAIL;
419         break;
420       case TARGET_ALLOC_HOST:
421         Err = cuMemFreeHost(TgtPtr);
422         if (!checkResult(Err, "Error returned from cuMemFreeHost\n"))
423           return OFFLOAD_FAIL;
424         break;
425       }
426 
427       return OFFLOAD_SUCCESS;
428     }
429   };
430 
431   /// A vector of device allocators
432   std::vector<CUDADeviceAllocatorTy> DeviceAllocators;
433 
  /// A vector of memory managers. Since the memory manager is non-copyable
  /// and non-movable, we wrap them in std::unique_ptr.
436   std::vector<std::unique_ptr<MemoryManagerTy>> MemoryManagers;
437 
  /// Whether to use the memory manager.
439   bool UseMemoryManager = true;
440 
441   // Record entry point associated with device
  void addOffloadEntry(const int DeviceId, const __tgt_offload_entry Entry) {
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
    E.Entries.push_back(Entry);
445   }
446 
447   // Return a pointer to the entry associated with the pointer
448   const __tgt_offload_entry *getOffloadEntry(const int DeviceId,
449                                              const void *Addr) const {
450     for (const __tgt_offload_entry &Itr :
451          DeviceData[DeviceId].FuncGblEntries.back().Entries)
452       if (Itr.addr == Addr)
453         return &Itr;
454 
455     return nullptr;
456   }
457 
458   // Return the pointer to the target entries table
459   __tgt_target_table *getOffloadEntriesTable(const int DeviceId) {
460     FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
461 
462     if (E.Entries.empty())
463       return nullptr;
464 
465     // Update table info according to the entries and return the pointer
466     E.Table.EntriesBegin = E.Entries.data();
467     E.Table.EntriesEnd = E.Entries.data() + E.Entries.size();
468 
469     return &E.Table;
470   }
471 
472   // Clear entries table for a device
473   void clearOffloadEntriesTable(const int DeviceId) {
474     DeviceData[DeviceId].FuncGblEntries.emplace_back();
475     FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
476     E.Entries.clear();
477     E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr;
478   }
479 
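  /// Return the CUstream bound to \p AsyncInfo, taking a fresh stream from
  /// the pool the first time the async info is used.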
480   CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfo) const {
481     assert(AsyncInfo && "AsyncInfo is nullptr");
482 
483     if (!AsyncInfo->Queue)
484       AsyncInfo->Queue = StreamManager->getStream(DeviceId);
485 
486     return reinterpret_cast<CUstream>(AsyncInfo->Queue);
487   }
488 
489 public:
490   // This class should not be copied
491   DeviceRTLTy(const DeviceRTLTy &) = delete;
492   DeviceRTLTy(DeviceRTLTy &&) = delete;
493 
494   DeviceRTLTy()
495       : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1),
496         EnvTeamThreadLimit(-1), RequiresFlags(OMP_REQ_UNDEFINED),
497         DynamicMemorySize(0) {
498 
499     DP("Start initializing CUDA\n");
500 
501     CUresult Err = cuInit(0);
502     if (Err == CUDA_ERROR_INVALID_HANDLE) {
503       // Can't call cuGetErrorString if dlsym failed
504       DP("Failed to load CUDA shared library\n");
505       return;
506     }
507     if (!checkResult(Err, "Error returned from cuInit\n")) {
508       return;
509     }
510 
511     Err = cuDeviceGetCount(&NumberOfDevices);
512     if (!checkResult(Err, "Error returned from cuDeviceGetCount\n"))
513       return;
514 
515     if (NumberOfDevices == 0) {
516       DP("There are no devices supporting CUDA.\n");
517       return;
518     }
519 
520     DeviceData.resize(NumberOfDevices);
521 
522     // Get environment variables regarding teams
523     if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) {
524       // OMP_TEAM_LIMIT has been set
525       EnvTeamLimit = std::stoi(EnvStr);
526       DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
527     }
528     if (const char *EnvStr = getenv("OMP_TEAMS_THREAD_LIMIT")) {
529       // OMP_TEAMS_THREAD_LIMIT has been set
530       EnvTeamThreadLimit = std::stoi(EnvStr);
531       DP("Parsed OMP_TEAMS_THREAD_LIMIT=%d\n", EnvTeamThreadLimit);
532     }
533     if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) {
534       // OMP_NUM_TEAMS has been set
535       EnvNumTeams = std::stoi(EnvStr);
536       DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
537     }
538     if (const char *EnvStr = getenv("LIBOMPTARGET_SHARED_MEMORY_SIZE")) {
539       // LIBOMPTARGET_SHARED_MEMORY_SIZE has been set
540       DynamicMemorySize = std::stoi(EnvStr);
541       DP("Parsed LIBOMPTARGET_SHARED_MEMORY_SIZE = %" PRIu64 "\n",
542          DynamicMemorySize);
543     }
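    // Example (hypothetical values): launching the application as
    //   OMP_NUM_TEAMS=256 OMP_TEAMS_THREAD_LIMIT=128 \
    //   LIBOMPTARGET_SHARED_MEMORY_SIZE=4096 ./app
    // makes 256 the default team count, caps the threads per team at 128, and
    // reserves 4096 bytes of dynamic shared memory for each kernel launch.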
544 
545     StreamManager =
546         std::make_unique<StreamManagerTy>(NumberOfDevices, DeviceData);
547 
548     for (int I = 0; I < NumberOfDevices; ++I)
549       DeviceAllocators.emplace_back(I, DeviceData);
550 
551     // Get the size threshold from environment variable
552     std::pair<size_t, bool> Res = MemoryManagerTy::getSizeThresholdFromEnv();
553     UseMemoryManager = Res.second;
554     size_t MemoryManagerThreshold = Res.first;
555 
556     if (UseMemoryManager)
557       for (int I = 0; I < NumberOfDevices; ++I)
558         MemoryManagers.emplace_back(std::make_unique<MemoryManagerTy>(
559             DeviceAllocators[I], MemoryManagerThreshold));
560   }
561 
562   ~DeviceRTLTy() {
    // We first destruct the memory managers in case their dependent data are
    // destroyed before them.
565     for (auto &M : MemoryManagers)
566       M.release();
567 
568     StreamManager = nullptr;
569 
570     for (CUmodule &M : Modules)
571       // Close module
572       if (M)
573         checkResult(cuModuleUnload(M), "Error returned from cuModuleUnload\n");
574 
575     for (DeviceDataTy &D : DeviceData) {
576       // Destroy context
577       if (D.Context) {
578         checkResult(cuCtxSetCurrent(D.Context),
579                     "Error returned from cuCtxSetCurrent\n");
580         CUdevice Device;
581         checkResult(cuCtxGetDevice(&Device),
582                     "Error returned from cuCtxGetDevice\n");
583         checkResult(cuDevicePrimaryCtxRelease(Device),
584                     "Error returned from cuDevicePrimaryCtxRelease\n");
585       }
586     }
587   }
588 
589   // Check whether a given DeviceId is valid
590   bool isValidDeviceId(const int DeviceId) const {
591     return DeviceId >= 0 && DeviceId < NumberOfDevices;
592   }
593 
594   int getNumOfDevices() const { return NumberOfDevices; }
595 
596   void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; }
597 
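  /// Initialize device \p DeviceId: retain its primary context, set up the
  /// stream pool, and derive the launch limits (blocks per grid, threads per
  /// block, warp size) from the CUDA device attributes and the environment.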
598   int initDevice(const int DeviceId) {
599     CUdevice Device;
600 
601     DP("Getting device %d\n", DeviceId);
602     CUresult Err = cuDeviceGet(&Device, DeviceId);
603     if (!checkResult(Err, "Error returned from cuDeviceGet\n"))
604       return OFFLOAD_FAIL;
605 
606     // Query the current flags of the primary context and set its flags if
607     // it is inactive
608     unsigned int FormerPrimaryCtxFlags = 0;
609     int FormerPrimaryCtxIsActive = 0;
610     Err = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags,
611                                      &FormerPrimaryCtxIsActive);
612     if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxGetState\n"))
613       return OFFLOAD_FAIL;
614 
615     if (FormerPrimaryCtxIsActive) {
616       DP("The primary context is active, no change to its flags\n");
617       if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) !=
618           CU_CTX_SCHED_BLOCKING_SYNC)
        DP("Warning: the current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n");
620     } else {
621       DP("The primary context is inactive, set its flags to "
622          "CU_CTX_SCHED_BLOCKING_SYNC\n");
623       Err = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC);
624       if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxSetFlags\n"))
625         return OFFLOAD_FAIL;
626     }
627 
628     // Retain the per device primary context and save it to use whenever this
629     // device is selected.
630     Err = cuDevicePrimaryCtxRetain(&DeviceData[DeviceId].Context, Device);
631     if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxRetain\n"))
632       return OFFLOAD_FAIL;
633 
634     Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
635     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
636       return OFFLOAD_FAIL;
637 
638     // Initialize stream pool
639     if (!StreamManager->initializeDeviceStreamPool(DeviceId))
640       return OFFLOAD_FAIL;
641 
642     // Query attributes to determine number of threads/block and blocks/grid.
643     int MaxGridDimX;
644     Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
645                                Device);
646     if (Err != CUDA_SUCCESS) {
      DP("Error getting max grid dimension, using default value %d\n",
648          DeviceRTLTy::DefaultNumTeams);
649       DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::DefaultNumTeams;
650     } else if (MaxGridDimX <= DeviceRTLTy::HardTeamLimit) {
651       DP("Using %d CUDA blocks per grid\n", MaxGridDimX);
652       DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX;
653     } else {
654       DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping "
655          "at the hard limit\n",
656          MaxGridDimX, DeviceRTLTy::HardTeamLimit);
657       DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::HardTeamLimit;
658     }
659 
660     // We are only exploiting threads along the x axis.
661     int MaxBlockDimX;
662     Err = cuDeviceGetAttribute(&MaxBlockDimX,
663                                CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device);
664     if (Err != CUDA_SUCCESS) {
      DP("Error getting max block dimension, using default value %d\n",
666          DeviceRTLTy::DefaultNumThreads);
667       DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads;
668     } else {
669       DP("Using %d CUDA threads per block\n", MaxBlockDimX);
670       DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX;
671 
672       if (EnvTeamThreadLimit > 0 &&
673           DeviceData[DeviceId].ThreadsPerBlock > EnvTeamThreadLimit) {
674         DP("Max CUDA threads per block %d exceeds the thread limit %d set by "
675            "OMP_TEAMS_THREAD_LIMIT, capping at the limit\n",
676            DeviceData[DeviceId].ThreadsPerBlock, EnvTeamThreadLimit);
677         DeviceData[DeviceId].ThreadsPerBlock = EnvTeamThreadLimit;
678       }
679       if (DeviceData[DeviceId].ThreadsPerBlock > DeviceRTLTy::HardThreadLimit) {
680         DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
681            "capping at the hard limit\n",
682            DeviceData[DeviceId].ThreadsPerBlock, DeviceRTLTy::HardThreadLimit);
683         DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit;
684       }
685     }
686 
687     // Get and set warp size
688     int WarpSize;
689     Err =
690         cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device);
691     if (Err != CUDA_SUCCESS) {
      DP("Error getting warp size, assuming default value 32\n");
693       DeviceData[DeviceId].WarpSize = 32;
694     } else {
695       DP("Using warp size %d\n", WarpSize);
696       DeviceData[DeviceId].WarpSize = WarpSize;
697     }
698 
699     // Adjust teams to the env variables
700     if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) {
701       DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
702          EnvTeamLimit);
703       DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit;
704     }
705 
706     size_t StackLimit;
707     size_t HeapLimit;
708     if (const char *EnvStr = getenv("LIBOMPTARGET_STACK_SIZE")) {
709       StackLimit = std::stol(EnvStr);
710       if (cuCtxSetLimit(CU_LIMIT_STACK_SIZE, StackLimit) != CUDA_SUCCESS)
711         return OFFLOAD_FAIL;
712     } else {
713       if (cuCtxGetLimit(&StackLimit, CU_LIMIT_STACK_SIZE) != CUDA_SUCCESS)
714         return OFFLOAD_FAIL;
715     }
716     if (const char *EnvStr = getenv("LIBOMPTARGET_HEAP_SIZE")) {
717       HeapLimit = std::stol(EnvStr);
718       if (cuCtxSetLimit(CU_LIMIT_MALLOC_HEAP_SIZE, HeapLimit) != CUDA_SUCCESS)
719         return OFFLOAD_FAIL;
720     } else {
721       if (cuCtxGetLimit(&HeapLimit, CU_LIMIT_MALLOC_HEAP_SIZE) != CUDA_SUCCESS)
722         return OFFLOAD_FAIL;
723     }
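    // Example (hypothetical values): LIBOMPTARGET_STACK_SIZE=8192 requests an
    // 8 KiB stack per device thread and LIBOMPTARGET_HEAP_SIZE=268435456
    // requests a 256 MiB device malloc heap, both applied via cuCtxSetLimit.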
724 
725     INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
726          "Device supports up to %d CUDA blocks and %d threads with a "
727          "warp size of %d\n",
728          DeviceData[DeviceId].BlocksPerGrid,
729          DeviceData[DeviceId].ThreadsPerBlock, DeviceData[DeviceId].WarpSize);
730     INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
731          "Device heap size is %d Bytes, device stack size is %d Bytes per "
732          "thread\n",
733          (int)HeapLimit, (int)StackLimit);
734 
735     // Set default number of teams
736     if (EnvNumTeams > 0) {
737       DP("Default number of teams set according to environment %d\n",
738          EnvNumTeams);
739       DeviceData[DeviceId].NumTeams = EnvNumTeams;
740     } else {
741       DeviceData[DeviceId].NumTeams = DeviceRTLTy::DefaultNumTeams;
742       DP("Default number of teams set according to library's default %d\n",
743          DeviceRTLTy::DefaultNumTeams);
744     }
745 
746     if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) {
747       DP("Default number of teams exceeds device limit, capping at %d\n",
748          DeviceData[DeviceId].BlocksPerGrid);
749       DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid;
750     }
751 
752     // Set default number of threads
753     DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads;
754     DP("Default number of threads set according to library's default %d\n",
755        DeviceRTLTy::DefaultNumThreads);
756     if (DeviceData[DeviceId].NumThreads >
757         DeviceData[DeviceId].ThreadsPerBlock) {
758       DP("Default number of threads exceeds device limit, capping at %d\n",
759          DeviceData[DeviceId].ThreadsPerBlock);
      DeviceData[DeviceId].NumThreads = DeviceData[DeviceId].ThreadsPerBlock;
761     }
762 
763     return OFFLOAD_SUCCESS;
764   }
765 
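  /// Load a device image on device \p DeviceId: create a CUmodule, resolve
  /// global variables and kernel functions by name, read each kernel's
  /// execution mode, and send the device environment block. Returns the
  /// offload entries table, or nullptr on error.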
766   __tgt_target_table *loadBinary(const int DeviceId,
767                                  const __tgt_device_image *Image) {
768     // Set the context we are using
769     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
770     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
771       return nullptr;
772 
773     // Clear the offload table as we are going to create a new one.
774     clearOffloadEntriesTable(DeviceId);
775 
776     // Create the module and extract the function pointers.
777     CUmodule Module;
778     DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart));
779     Err = cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr);
780     if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n"))
781       return nullptr;
782 
783     DP("CUDA module successfully loaded!\n");
784 
785     Modules.push_back(Module);
786 
787     // Find the symbols in the module by name.
788     const __tgt_offload_entry *HostBegin = Image->EntriesBegin;
789     const __tgt_offload_entry *HostEnd = Image->EntriesEnd;
790 
791     std::list<KernelTy> &KernelsList = DeviceData[DeviceId].KernelsList;
792     for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) {
793       if (!E->addr) {
        // We return nullptr when something like this happens; the host should
        // always have something in the address field to uniquely identify the
        // target region.
797         DP("Invalid binary: host entry '<null>' (size = %zd)...\n", E->size);
798         return nullptr;
799       }
800 
801       if (E->size) {
802         __tgt_offload_entry Entry = *E;
803         CUdeviceptr CUPtr;
804         size_t CUSize;
805         Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name);
806         // We keep this style here because we need the name
807         if (Err != CUDA_SUCCESS) {
808           REPORT("Loading global '%s' Failed\n", E->name);
809           CUDA_ERR_STRING(Err);
810           return nullptr;
811         }
812 
813         if (CUSize != E->size) {
814           DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name,
815              CUSize, E->size);
816           return nullptr;
817         }
818 
819         DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
820            DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr));
821 
822         Entry.addr = (void *)(CUPtr);
823 
824         // Note: In the current implementation declare target variables
825         // can either be link or to. This means that once unified
826         // memory is activated via the requires directive, the variable
827         // can be used directly from the host in both cases.
828         // TODO: when variables types other than to or link are added,
829         // the below condition should be changed to explicitly
830         // check for to and link variables types:
831         // (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags &
832         // OMP_DECLARE_TARGET_LINK || e->flags == OMP_DECLARE_TARGET_TO))
833         if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
834           // If unified memory is present any target link or to variables
835           // can access host addresses directly. There is no longer a
836           // need for device copies.
837           cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *));
838           DP("Copy linked variable host address (" DPxMOD
839              ") to device address (" DPxMOD ")\n",
840              DPxPTR(*((void **)E->addr)), DPxPTR(CUPtr));
841         }
842 
843         addOffloadEntry(DeviceId, Entry);
844 
845         continue;
846       }
847 
848       CUfunction Func;
849       Err = cuModuleGetFunction(&Func, Module, E->name);
850       // We keep this style here because we need the name
851       if (Err != CUDA_SUCCESS) {
852         REPORT("Loading '%s' Failed\n", E->name);
853         CUDA_ERR_STRING(Err);
854         return nullptr;
855       }
856 
857       DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
858          DPxPTR(E - HostBegin), E->name, DPxPTR(Func));
859 
      // Default to GENERIC in case the exec_mode symbol is missing from the
      // cubin file.
      llvm::omp::OMPTgtExecModeFlags ExecModeVal =
          llvm::omp::OMP_TGT_EXEC_MODE_GENERIC;
862       std::string ExecModeNameStr(E->name);
863       ExecModeNameStr += "_exec_mode";
864       const char *ExecModeName = ExecModeNameStr.c_str();
865 
866       CUdeviceptr ExecModePtr;
867       size_t CUSize;
868       Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName);
869       if (Err == CUDA_SUCCESS) {
870         if (CUSize != sizeof(llvm::omp::OMPTgtExecModeFlags)) {
871           DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
872              ExecModeName, CUSize, sizeof(llvm::omp::OMPTgtExecModeFlags));
873           return nullptr;
874         }
875 
876         Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize);
877         if (Err != CUDA_SUCCESS) {
878           REPORT("Error when copying data from device to host. Pointers: "
879                  "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
880                  DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize);
881           CUDA_ERR_STRING(Err);
882           return nullptr;
883         }
884       } else {
885         DP("Loading global exec_mode '%s' - symbol missing, using default "
886            "value GENERIC (1)\n",
887            ExecModeName);
888       }
889 
890       KernelsList.emplace_back(Func, ExecModeVal);
891 
892       __tgt_offload_entry Entry = *E;
893       Entry.addr = &KernelsList.back();
894       addOffloadEntry(DeviceId, Entry);
895     }
896 
897     // send device environment data to the device
898     {
899       // TODO: The device ID used here is not the real device ID used by OpenMP.
900       omptarget_device_environmentTy DeviceEnv{
901           0, static_cast<uint32_t>(NumberOfDevices),
902           static_cast<uint32_t>(DeviceId), DynamicMemorySize};
903 
904 #ifdef OMPTARGET_DEBUG
905       if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG"))
906         DeviceEnv.debug_level = std::stoi(EnvStr);
907 #endif
908 
909       const char *DeviceEnvName = "omptarget_device_environment";
910       CUdeviceptr DeviceEnvPtr;
911       size_t CUSize;
912 
913       Err = cuModuleGetGlobal(&DeviceEnvPtr, &CUSize, Module, DeviceEnvName);
914       if (Err == CUDA_SUCCESS) {
915         if (CUSize != sizeof(DeviceEnv)) {
          REPORT(
              "Global device_environment '%s' - size mismatch (%zu != %zu)\n",
              DeviceEnvName, CUSize, sizeof(DeviceEnv));
919           CUDA_ERR_STRING(Err);
920           return nullptr;
921         }
922 
923         Err = cuMemcpyHtoD(DeviceEnvPtr, &DeviceEnv, CUSize);
924         if (Err != CUDA_SUCCESS) {
925           REPORT("Error when copying data from host to device. Pointers: "
926                  "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
927                  DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize);
928           CUDA_ERR_STRING(Err);
929           return nullptr;
930         }
931 
932         DP("Sending global device environment data %zu bytes\n", CUSize);
933       } else {
934         DP("Finding global device environment '%s' - symbol missing.\n",
935            DeviceEnvName);
        DP("Continuing, assuming this is a device RTL that does not accept "
           "environment settings.\n");
938       }
939     }
940 
941     return getOffloadEntriesTable(DeviceId);
942   }
943 
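  /// Allocate \p Size bytes of the requested kind on device \p DeviceId.
  /// Device allocations go through the memory manager when it is enabled;
  /// host-pinned and shared allocations always use the device allocator
  /// directly.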
944   void *dataAlloc(const int DeviceId, const int64_t Size,
945                   const TargetAllocTy Kind) {
946     switch (Kind) {
947     case TARGET_ALLOC_DEFAULT:
948     case TARGET_ALLOC_DEVICE:
949       if (UseMemoryManager)
950         return MemoryManagers[DeviceId]->allocate(Size, nullptr);
951       else
952         return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
953     case TARGET_ALLOC_HOST:
954     case TARGET_ALLOC_SHARED:
955       return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
956     }
957 
958     REPORT("Invalid target data allocation kind or requested allocator not "
959            "implemented yet\n");
960 
961     return nullptr;
962   }
963 
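  /// Asynchronously copy \p Size bytes from host \p HstPtr to device
  /// \p TgtPtr on the stream bound to \p AsyncInfo.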
964   int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr,
965                  const int64_t Size, __tgt_async_info *AsyncInfo) const {
966     assert(AsyncInfo && "AsyncInfo is nullptr");
967 
968     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
969     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
970       return OFFLOAD_FAIL;
971 
972     CUstream Stream = getStream(DeviceId, AsyncInfo);
973 
974     Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream);
975     if (Err != CUDA_SUCCESS) {
976       DP("Error when copying data from host to device. Pointers: host "
977          "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
978          DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
979       CUDA_ERR_STRING(Err);
980       return OFFLOAD_FAIL;
981     }
982 
983     return OFFLOAD_SUCCESS;
984   }
985 
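  /// Asynchronously copy \p Size bytes from device \p TgtPtr to host
  /// \p HstPtr on the stream bound to \p AsyncInfo.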
986   int dataRetrieve(const int DeviceId, void *HstPtr, const void *TgtPtr,
987                    const int64_t Size, __tgt_async_info *AsyncInfo) const {
988     assert(AsyncInfo && "AsyncInfo is nullptr");
989 
990     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
991     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
992       return OFFLOAD_FAIL;
993 
994     CUstream Stream = getStream(DeviceId, AsyncInfo);
995 
996     Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
997     if (Err != CUDA_SUCCESS) {
998       DP("Error when copying data from device to host. Pointers: host "
999          "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
1000          DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
1001       CUDA_ERR_STRING(Err);
1002       return OFFLOAD_FAIL;
1003     }
1004 
1005     return OFFLOAD_SUCCESS;
1006   }
1007 
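  /// Copy \p Size bytes from \p SrcDevId to \p DstDevId, preferring a
  /// peer-to-peer copy and falling back to an ordinary device-to-device copy
  /// when peer access is not available.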
1008   int dataExchange(int SrcDevId, const void *SrcPtr, int DstDevId, void *DstPtr,
1009                    int64_t Size, __tgt_async_info *AsyncInfo) const {
1010     assert(AsyncInfo && "AsyncInfo is nullptr");
1011 
1012     CUresult Err = cuCtxSetCurrent(DeviceData[SrcDevId].Context);
1013     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
1014       return OFFLOAD_FAIL;
1015 
1016     CUstream Stream = getStream(SrcDevId, AsyncInfo);
1017 
    // If the source and destination are different devices, we try a
    // peer-to-peer copy first.
1019     if (SrcDevId != DstDevId) {
1020       int CanAccessPeer = 0;
1021       Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId);
1022       if (Err != CUDA_SUCCESS) {
1023         REPORT("Error returned from cuDeviceCanAccessPeer. src = %" PRId32
1024                ", dst = %" PRId32 "\n",
1025                SrcDevId, DstDevId);
1026         CUDA_ERR_STRING(Err);
1027         return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
1028       }
1029 
1030       if (!CanAccessPeer) {
        DP("P2P memcpy not supported, falling back to D2D memcpy\n");
1032         return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
1033       }
1034 
1035       Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0);
1036       if (Err != CUDA_SUCCESS) {
1037         REPORT("Error returned from cuCtxEnablePeerAccess. src = %" PRId32
1038                ", dst = %" PRId32 "\n",
1039                SrcDevId, DstDevId);
1040         CUDA_ERR_STRING(Err);
1041         return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
1042       }
1043 
1044       Err = cuMemcpyPeerAsync((CUdeviceptr)DstPtr, DeviceData[DstDevId].Context,
1045                               (CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context,
1046                               Size, Stream);
1047       if (Err == CUDA_SUCCESS)
1048         return OFFLOAD_SUCCESS;
1049 
1050       DP("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD
1051          ", src_id =%" PRId32 ", dst_ptr = " DPxMOD ", dst_id =%" PRId32 "\n",
1052          DPxPTR(SrcPtr), SrcDevId, DPxPTR(DstPtr), DstDevId);
1053       CUDA_ERR_STRING(Err);
1054     }
1055 
1056     return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
1057   }
1058 
1059   int dataDelete(const int DeviceId, void *TgtPtr) {
1060     if (UseMemoryManager)
1061       return MemoryManagers[DeviceId]->free(TgtPtr);
1062 
1063     return DeviceAllocators[DeviceId].free(TgtPtr);
1064   }
1065 
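  /// Launch the kernel behind \p TgtEntryPtr with the given arguments. The
  /// grid and block sizes are derived from the team/thread limits, the loop
  /// trip count, and the kernel's execution mode.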
1066   int runTargetTeamRegion(const int DeviceId, void *TgtEntryPtr, void **TgtArgs,
1067                           ptrdiff_t *TgtOffsets, const int ArgNum,
1068                           const int TeamNum, const int ThreadLimit,
1069                           const unsigned int LoopTripCount,
1070                           __tgt_async_info *AsyncInfo) const {
1071     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
1072     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
1073       return OFFLOAD_FAIL;
1074 
1075     // All args are references.
1076     std::vector<void *> Args(ArgNum);
1077     std::vector<void *> Ptrs(ArgNum);
1078 
1079     for (int I = 0; I < ArgNum; ++I) {
1080       Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
1081       Args[I] = &Ptrs[I];
1082     }
1083 
1084     KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr);
1085 
1086     const bool IsSPMDGenericMode =
1087         KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD;
1088     const bool IsSPMDMode =
1089         KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_SPMD;
1090     const bool IsGenericMode =
1091         KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC;
1092 
1093     int CudaThreadsPerBlock;
1094     if (ThreadLimit > 0) {
1095       DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
1096       CudaThreadsPerBlock = ThreadLimit;
1097       // Add master warp if necessary
1098       if (IsGenericMode) {
1099         DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize);
1100         CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
1101       }
1102     } else {
1103       DP("Setting CUDA threads per block to default %d\n",
1104          DeviceData[DeviceId].NumThreads);
1105       CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads;
1106     }
1107 
1108     if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) {
1109       DP("Threads per block capped at device limit %d\n",
1110          DeviceData[DeviceId].ThreadsPerBlock);
1111       CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock;
1112     }
1113 
1114     if (!KernelInfo->MaxThreadsPerBlock) {
1115       Err = cuFuncGetAttribute(&KernelInfo->MaxThreadsPerBlock,
1116                                CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
1117                                KernelInfo->Func);
1118       if (!checkResult(Err, "Error returned from cuFuncGetAttribute\n"))
1119         return OFFLOAD_FAIL;
1120     }
1121 
1122     if (KernelInfo->MaxThreadsPerBlock < CudaThreadsPerBlock) {
1123       DP("Threads per block capped at kernel limit %d\n",
1124          KernelInfo->MaxThreadsPerBlock);
1125       CudaThreadsPerBlock = KernelInfo->MaxThreadsPerBlock;
1126     }
1127 
1128     unsigned int CudaBlocksPerGrid;
1129     if (TeamNum <= 0) {
1130       if (LoopTripCount > 0 && EnvNumTeams < 0) {
1131         if (IsSPMDGenericMode) {
          // If we reach this point, then we are executing a kernel that was
          // transformed from Generic-mode to SPMD-mode. This kernel has
          // SPMD-mode execution, but its blocks need to be scheduled
          // differently because the current loop trip count only applies to
          // the `teams distribute` region and would create far too few blocks
          // using the regular SPMD-mode method.
1138           CudaBlocksPerGrid = LoopTripCount;
1139         } else if (IsSPMDMode) {
          // We have a combined construct, i.e. `target teams distribute
          // parallel for [simd]`. We launch enough teams so that each thread
          // will execute one iteration of the loop; round up to the nearest
          // integer.
1144           CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1;
1145         } else if (IsGenericMode) {
1146           // If we reach this point, then we have a non-combined construct, i.e.
1147           // `teams distribute` with a nested `parallel for` and each team is
1148           // assigned one iteration of the `distribute` loop. E.g.:
1149           //
1150           // #pragma omp target teams distribute
1151           // for(...loop_tripcount...) {
1152           //   #pragma omp parallel for
1153           //   for(...) {}
1154           // }
1155           //
1156           // Threads within a team will execute the iterations of the `parallel`
1157           // loop.
1158           CudaBlocksPerGrid = LoopTripCount;
1159         } else {
1160           REPORT("Unknown execution mode: %d\n",
1161                  static_cast<int8_t>(KernelInfo->ExecutionMode));
1162           return OFFLOAD_FAIL;
1163         }
1164         DP("Using %d teams due to loop trip count %" PRIu32
1165            " and number of threads per block %d\n",
1166            CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock);
1167       } else {
1168         DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams);
1169         CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams;
1170       }
1171     } else if (TeamNum > DeviceData[DeviceId].BlocksPerGrid) {
1172       DP("Capping number of teams to team limit %d\n",
1173          DeviceData[DeviceId].BlocksPerGrid);
1174       CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid;
1175     } else {
1176       DP("Using requested number of teams %d\n", TeamNum);
1177       CudaBlocksPerGrid = TeamNum;
1178     }
1179 
1180     INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
1181          "Launching kernel %s with %d blocks and %d threads in %s mode\n",
1182          (getOffloadEntry(DeviceId, TgtEntryPtr))
1183              ? getOffloadEntry(DeviceId, TgtEntryPtr)->name
1184              : "(null)",
1185          CudaBlocksPerGrid, CudaThreadsPerBlock,
1186          (!IsSPMDMode ? (IsGenericMode ? "Generic" : "SPMD-Generic") : "SPMD"));
1187 
1188     CUstream Stream = getStream(DeviceId, AsyncInfo);
1189     Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1,
1190                          /* gridDimZ */ 1, CudaThreadsPerBlock,
1191                          /* blockDimY */ 1, /* blockDimZ */ 1,
1192                          DynamicMemorySize, Stream, &Args[0], nullptr);
1193     if (!checkResult(Err, "Error returned from cuLaunchKernel\n"))
1194       return OFFLOAD_FAIL;
1195 
1196     DP("Launch of entry point at " DPxMOD " successful!\n",
1197        DPxPTR(TgtEntryPtr));
1198 
1199     return OFFLOAD_SUCCESS;
1200   }
1201 
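  /// Wait for all operations enqueued on the stream bound to \p AsyncInfo,
  /// then return the stream to the pool and reset the queue.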
1202   int synchronize(const int DeviceId, __tgt_async_info *AsyncInfo) const {
1203     CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue);
1204     CUresult Err = cuStreamSynchronize(Stream);
1205 
1206     // Once the stream is synchronized, return it to stream pool and reset
1207     // AsyncInfo. This is to make sure the synchronization only works for its
1208     // own tasks.
1209     StreamManager->returnStream(DeviceId,
1210                                 reinterpret_cast<CUstream>(AsyncInfo->Queue));
1211     AsyncInfo->Queue = nullptr;
1212 
1213     if (Err != CUDA_SUCCESS) {
1214       DP("Error when synchronizing stream. stream = " DPxMOD
1215          ", async info ptr = " DPxMOD "\n",
1216          DPxPTR(Stream), DPxPTR(AsyncInfo));
1217       CUDA_ERR_STRING(Err);
1218     }
1219     return (Err == CUDA_SUCCESS) ? OFFLOAD_SUCCESS : OFFLOAD_FAIL;
1220   }
1221 
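  /// Print a summary of the device's properties (driver version, memory
  /// sizes, limits, and capability flags) to stdout.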
1222   void printDeviceInfo(int32_t device_id) {
1223     char TmpChar[1000];
1224     std::string TmpStr;
1225     size_t TmpSt;
1226     int TmpInt, TmpInt2, TmpInt3;
1227 
1228     CUdevice Device;
1229     checkResult(cuDeviceGet(&Device, device_id),
                "Error returned from cuDeviceGet\n");
1231 
1232     cuDriverGetVersion(&TmpInt);
1233     printf("    CUDA Driver Version: \t\t%d \n", TmpInt);
1234     printf("    CUDA Device Number: \t\t%d \n", device_id);
1235     checkResult(cuDeviceGetName(TmpChar, 1000, Device),
1236                 "Error returned from cuDeviceGetName\n");
1237     printf("    Device Name: \t\t\t%s \n", TmpChar);
1238     checkResult(cuDeviceTotalMem(&TmpSt, Device),
1239                 "Error returned from cuDeviceTotalMem\n");
1240     printf("    Global Memory Size: \t\t%zu bytes \n", TmpSt);
1241     checkResult(cuDeviceGetAttribute(
1242                     &TmpInt, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, Device),
1243                 "Error returned from cuDeviceGetAttribute\n");
1244     printf("    Number of Multiprocessors: \t\t%d \n", TmpInt);
1245     checkResult(
1246         cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, Device),
1247         "Error returned from cuDeviceGetAttribute\n");
1248     printf("    Concurrent Copy and Execution: \t%s \n", BOOL2TEXT(TmpInt));
1249     checkResult(cuDeviceGetAttribute(
1250                     &TmpInt, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, Device),
1251                 "Error returned from cuDeviceGetAttribute\n");
1252     printf("    Total Constant Memory: \t\t%d bytes\n", TmpInt);
1253     checkResult(
1254         cuDeviceGetAttribute(
1255             &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, Device),
1256         "Error returned from cuDeviceGetAttribute\n");
1257     printf("    Max Shared Memory per Block: \t%d bytes \n", TmpInt);
1258     checkResult(
1259         cuDeviceGetAttribute(
1260             &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, Device),
1261         "Error returned from cuDeviceGetAttribute\n");
1262     printf("    Registers per Block: \t\t%d \n", TmpInt);
1263     checkResult(
1264         cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device),
1265         "Error returned from cuDeviceGetAttribute\n");
1266     printf("    Warp Size: \t\t\t\t%d Threads \n", TmpInt);
1267     checkResult(cuDeviceGetAttribute(
1268                     &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, Device),
1269                 "Error returned from cuDeviceGetAttribute\n");
1270     printf("    Maximum Threads per Block: \t\t%d \n", TmpInt);
1271     checkResult(cuDeviceGetAttribute(
1272                     &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device),
1273                 "Error returned from cuDeviceGetAttribute\n");
1274     checkResult(cuDeviceGetAttribute(
1275                     &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, Device),
                "Error returned from cuDeviceGetAttribute\n");
    checkResult(cuDeviceGetAttribute(
                    &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Maximum Block Dimensions: \t\t%d, %d, %d \n", TmpInt, TmpInt2,
           TmpInt3);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, Device),
                "Error returned from cuDeviceGetAttribute\n");
    checkResult(cuDeviceGetAttribute(
                    &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, Device),
                "Error returned from cuDeviceGetAttribute\n");
    checkResult(cuDeviceGetAttribute(
                    &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Maximum Grid Dimensions: \t\t%d x %d x %d \n", TmpInt, TmpInt2,
           TmpInt3);
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_PITCH, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Maximum Memory Pitch: \t\t%d bytes \n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Texture Alignment: \t\t\t%d bytes \n", TmpInt);
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Clock Rate: \t\t\t%d kHz\n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Execution Timeout: \t\t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_INTEGRATED, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Integrated Device: \t\t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Can Map Host Memory: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, Device),
        "Error returned from cuDeviceGetAttribute\n");
    if (TmpInt == CU_COMPUTEMODE_DEFAULT)
      TmpStr = "DEFAULT";
    else if (TmpInt == CU_COMPUTEMODE_PROHIBITED)
      TmpStr = "PROHIBITED";
    else if (TmpInt == CU_COMPUTEMODE_EXCLUSIVE_PROCESS)
      TmpStr = "EXCLUSIVE PROCESS";
    else
      TmpStr = "unknown";
    printf("    Compute Mode: \t\t\t%s \n", TmpStr.c_str());
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Concurrent Kernels: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    ECC Enabled: \t\t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Memory Clock Rate: \t\t\t%d kHz\n", TmpInt);
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Memory Bus Width: \t\t\t%d bits\n", TmpInt);
    checkResult(cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE,
                                     Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    L2 Cache Size: \t\t\t%d bytes \n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
                    Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Max Threads Per SMP: \t\t%d \n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Async Engines: \t\t\t%s (%d) \n", BOOL2TEXT(TmpInt), TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Unified Addressing: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Managed Memory: \t\t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Concurrent Managed Memory: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Preemption Supported: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Cooperative Launch: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Multi-Device Boards: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, Device),
        "Error returned from cuDeviceGetAttribute\n");
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt2, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Compute Capabilities: \t\t%d%d \n", TmpInt, TmpInt2);
  }

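  // Make the stream in \p AsyncInfo wait on the event \p EventPtr: work
  // enqueued on the stream afterwards will not start until the event has been
  // fulfilled. This does not block the host.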
  int waitEvent(const int DeviceId, __tgt_async_info *AsyncInfo,
                void *EventPtr) const {
    CUstream Stream = getStream(DeviceId, AsyncInfo);
    CUevent Event = reinterpret_cast<CUevent>(EventPtr);

    // We don't use CU_EVENT_WAIT_DEFAULT here as it is only available from a
    // specific CUDA version onwards, where it is defined as 0x0. In earlier
    // versions, per the CUDA API documentation, the flags argument has to be
    // 0x0.
    CUresult Err = cuStreamWaitEvent(Stream, Event, 0);
    if (Err != CUDA_SUCCESS) {
      DP("Error when waiting on event. stream = " DPxMOD ", event = " DPxMOD
         "\n", DPxPTR(Stream), DPxPTR(Event));
      CUDA_ERR_STRING(Err);
      return OFFLOAD_FAIL;
    }

    return OFFLOAD_SUCCESS;
  }
};

DeviceRTLTy DeviceRTL;
} // namespace

// Exposed library API functions
#ifdef __cplusplus
extern "C" {
#endif

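// Return non-zero if \p image is an ELF binary with the CUDA machine type
// (EM_CUDA), i.e. an image this plugin can load.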
int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
  return elf_check_machine(image, /* EM_CUDA */ 190);
}

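// Return the number of CUDA devices detected by this plugin.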
int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); }

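// Record the OpenMP 'requires' flags passed down from libomptarget so they
// can be taken into account during device setup.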
int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
  DP("Init requires flags to %" PRId64 "\n", RequiresFlags);
  DeviceRTL.setRequiresFlag(RequiresFlags);
  return RequiresFlags;
}

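// Return 1 if data can be copied directly between the two devices, which for
// this plugin only requires both device ids to be valid.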
int32_t __tgt_rtl_is_data_exchangable(int32_t src_dev_id, int32_t dst_dev_id) {
  if (DeviceRTL.isValidDeviceId(src_dev_id) &&
      DeviceRTL.isValidDeviceId(dst_dev_id))
    return 1;

  return 0;
}

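// Perform per-device initialization for \p device_id before any data or
// kernel operations are issued to it.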
int32_t __tgt_rtl_init_device(int32_t device_id) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.initDevice(device_id);
}

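// Load the device image onto \p device_id and return the table of offload
// entries (kernels and globals) it provides.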
__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
                                          __tgt_device_image *image) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.loadBinary(device_id, image);
}

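// Allocate \p size bytes for \p device_id. \p kind selects the allocation
// target (see TargetAllocTy).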
void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *,
                           int32_t kind) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.dataAlloc(device_id, size, (TargetAllocTy)kind);
}

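// Synchronous host-to-device copy. Like the other synchronous entry points,
// this is a thin wrapper that creates a local __tgt_async_info, calls the
// *_async variant, and then synchronizes on the underlying stream.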
int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
                              int64_t size) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_data_submit_async(device_id, tgt_ptr, hst_ptr,
                                                 size, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &AsyncInfo);
}

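// Asynchronous host-to-device copy; the transfer is only enqueued on the
// stream held by \p async_info_ptr, and completion is observed later via
// __tgt_rtl_synchronize.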
int32_t __tgt_rtl_data_submit_async(int32_t device_id, void *tgt_ptr,
                                    void *hst_ptr, int64_t size,
                                    __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");

  return DeviceRTL.dataSubmit(device_id, tgt_ptr, hst_ptr, size,
                              async_info_ptr);
}

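// Synchronous device-to-host copy; like __tgt_rtl_data_submit, it wraps the
// asynchronous variant and then synchronizes the stream.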
int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
                                int64_t size) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_data_retrieve_async(device_id, hst_ptr, tgt_ptr,
                                                   size, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &AsyncInfo);
}

int32_t __tgt_rtl_data_retrieve_async(int32_t device_id, void *hst_ptr,
                                      void *tgt_ptr, int64_t size,
                                      __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");

  return DeviceRTL.dataRetrieve(device_id, hst_ptr, tgt_ptr, size,
                                async_info_ptr);
}

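// Asynchronous device-to-device copy between two devices managed by this
// plugin, enqueued on the stream held by \p AsyncInfo.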
int32_t __tgt_rtl_data_exchange_async(int32_t src_dev_id, void *src_ptr,
                                      int32_t dst_dev_id, void *dst_ptr,
                                      int64_t size,
                                      __tgt_async_info *AsyncInfo) {
  assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
  assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");
  assert(AsyncInfo && "AsyncInfo is nullptr");

  return DeviceRTL.dataExchange(src_dev_id, src_ptr, dst_dev_id, dst_ptr, size,
                                AsyncInfo);
}

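// Synchronous device-to-device copy, implemented on top of the asynchronous
// variant plus a synchronization on the source device's stream.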
int32_t __tgt_rtl_data_exchange(int32_t src_dev_id, void *src_ptr,
                                int32_t dst_dev_id, void *dst_ptr,
                                int64_t size) {
  assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
  assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_data_exchange_async(
      src_dev_id, src_ptr, dst_dev_id, dst_ptr, size, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(src_dev_id, &AsyncInfo);
}

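// Free device memory previously returned by __tgt_rtl_data_alloc.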
int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.dataDelete(device_id, tgt_ptr);
}

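// Synchronously launch the kernel for \p tgt_entry_ptr with the given team
// and thread counts; implemented as the *_async variant followed by a stream
// synchronization.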
int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
                                         void **tgt_args,
                                         ptrdiff_t *tgt_offsets,
                                         int32_t arg_num, int32_t team_num,
                                         int32_t thread_limit,
                                         uint64_t loop_tripcount) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_run_target_team_region_async(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
      thread_limit, loop_tripcount, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &AsyncInfo);
}

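// Asynchronously launch the kernel with explicit team number, thread limit,
// and loop trip count hints; the launch is enqueued on the stream held by
// \p async_info_ptr.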
int32_t __tgt_rtl_run_target_team_region_async(
    int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
    ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
    int32_t thread_limit, uint64_t loop_tripcount,
    __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.runTargetTeamRegion(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
      thread_limit, loop_tripcount, async_info_ptr);
}

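// Synchronous launch without explicit team/thread counts, implemented on top
// of the asynchronous variant plus a stream synchronization.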
int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
                                    void **tgt_args, ptrdiff_t *tgt_offsets,
                                    int32_t arg_num) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_run_target_region_async(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &AsyncInfo);
}

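// A target region without teams is launched as a team region with a single
// team, a thread limit of 1, and no trip count.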
int32_t __tgt_rtl_run_target_region_async(int32_t device_id,
                                          void *tgt_entry_ptr, void **tgt_args,
                                          ptrdiff_t *tgt_offsets,
                                          int32_t arg_num,
                                          __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return __tgt_rtl_run_target_team_region_async(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num,
      /* team_num */ 1, /* thread_limit */ 1, /* loop_tripcount */ 0,
      async_info_ptr);
}

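// Block the calling host thread until all operations enqueued on the stream
// held by \p async_info_ptr have completed.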
int32_t __tgt_rtl_synchronize(int32_t device_id,
                              __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");
  assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr");

  return DeviceRTL.synchronize(device_id, async_info_ptr);
}

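// Update the runtime information level used by the INFO/REPORT machinery.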
void __tgt_rtl_set_info_flag(uint32_t NewInfoLevel) {
  std::atomic<uint32_t> &InfoLevel = getInfoLevelInternal();
  InfoLevel.store(NewInfoLevel);
}

void __tgt_rtl_print_device_info(int32_t device_id) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  DeviceRTL.printDeviceInfo(device_id);
}

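// The following entry points expose CUDA events to libomptarget for
// inter-stream synchronization. A minimal (hypothetical) usage sketch from
// the host runtime side:
//
//   void *Event;
//   __tgt_rtl_create_event(DeviceId, &Event);
//   __tgt_rtl_record_event(DeviceId, Event, &AsyncInfoA); // record on stream A
//   __tgt_rtl_wait_event(DeviceId, Event, &AsyncInfoB);   // stream B waits
//   __tgt_rtl_sync_event(DeviceId, Event);                // host waits
//   __tgt_rtl_destroy_event(DeviceId, Event);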
int32_t __tgt_rtl_create_event(int32_t device_id, void **event) {
  assert(event && "event is nullptr");
  return createEvent(event);
}

int32_t __tgt_rtl_record_event(int32_t device_id, void *event_ptr,
                               __tgt_async_info *async_info_ptr) {
  assert(async_info_ptr && "async_info_ptr is nullptr");
  assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr");
  assert(event_ptr && "event_ptr is nullptr");

  return recordEvent(event_ptr, async_info_ptr);
}

int32_t __tgt_rtl_wait_event(int32_t device_id, void *event_ptr,
                             __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");
  assert(event_ptr && "event is nullptr");

  return DeviceRTL.waitEvent(device_id, async_info_ptr, event_ptr);
}

int32_t __tgt_rtl_sync_event(int32_t device_id, void *event_ptr) {
  assert(event_ptr && "event is nullptr");

  return syncEvent(event_ptr);
}

int32_t __tgt_rtl_destroy_event(int32_t device_id, void *event_ptr) {
  assert(event_ptr && "event is nullptr");

  return destroyEvent(event_ptr);
}

#ifdef __cplusplus
}
#endif