1 //===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // RTL for CUDA machine
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include <cassert>
14 #include <cstddef>
15 #include <cuda.h>
16 #include <list>
17 #include <memory>
18 #include <mutex>
19 #include <string>
20 #include <unordered_map>
21 #include <vector>
22 
23 #include "Debug.h"
24 #include "omptargetplugin.h"
25 
26 #define TARGET_NAME CUDA
27 #define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"
28 
29 #include "MemoryManager.h"
30 
31 // Utility for retrieving and printing CUDA error string.
32 #ifdef OMPTARGET_DEBUG
33 #define CUDA_ERR_STRING(err)                                                   \
34   do {                                                                         \
35     if (getDebugLevel() > 0) {                                                 \
36       const char *errStr = nullptr;                                            \
37       CUresult errStr_status = cuGetErrorString(err, &errStr);                 \
38       if (errStr_status == CUDA_ERROR_INVALID_VALUE)                           \
39         REPORT("Unrecognized CUDA error code: %d\n", err);                     \
40       else if (errStr_status == CUDA_SUCCESS)                                  \
41         REPORT("CUDA error is: %s\n", errStr);                                 \
42       else {                                                                   \
43         REPORT("Unresolved CUDA error code: %d\n", err);                       \
44         REPORT("Unsuccessful cuGetErrorString return status: %d\n",            \
45                errStr_status);                                                 \
46       }                                                                        \
47     } else {                                                                   \
48       const char *errStr = nullptr;                                            \
49       CUresult errStr_status = cuGetErrorString(err, &errStr);                 \
50       if (errStr_status == CUDA_SUCCESS)                                       \
51         REPORT("%s \n", errStr);                                               \
52     }                                                                          \
53   } while (false)
54 #else // OMPTARGET_DEBUG
55 #define CUDA_ERR_STRING(err)                                                   \
56   do {                                                                         \
57     const char *errStr = nullptr;                                              \
58     CUresult errStr_status = cuGetErrorString(err, &errStr);                   \
59     if (errStr_status == CUDA_SUCCESS)                                         \
60       REPORT("%s \n", errStr);                                                 \
61   } while (false)
62 #endif // OMPTARGET_DEBUG
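
// Typical use (illustrative sketch, not code from this file): call a CUDA
// driver API and, on failure, report a message followed by the driver's
// error string, e.g.
//   CUresult Err = cuMemAlloc(&DevicePtr, Size);
//   if (Err != CUDA_SUCCESS) {
//     REPORT("cuMemAlloc failed\n");
//     CUDA_ERR_STRING(Err);
//   }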
63 
64 #define BOOL2TEXT(b) ((b) ? "Yes" : "No")
65 
66 #include "elf_common.h"
67 
68 /// Keep entries table per device.
69 struct FuncOrGblEntryTy {
70   __tgt_target_table Table;
71   std::vector<__tgt_offload_entry> Entries;
72 };
73 
74 enum ExecutionModeType {
75   SPMD,         // constructors, destructors,
76                 // combined constructs (`teams distribute parallel for [simd]`)
77   GENERIC,      // everything else
78   SPMD_GENERIC, // Generic kernel with SPMD execution
79   NONE
80 };
81 
82 /// Use a single entity to encode a kernel and a set of flags.
83 struct KernelTy {
84   CUfunction Func;
85 
86   // execution mode of kernel
87   // 0 - SPMD mode (without master warp)
88   // 1 - Generic mode (with master warp)
89   // 2 - SPMD mode execution with Generic mode semantics.
90   int8_t ExecutionMode;
91 
92   /// Maximal number of threads per block for this kernel.
93   int MaxThreadsPerBlock = 0;
94 
95   KernelTy(CUfunction _Func, int8_t _ExecutionMode)
96       : Func(_Func), ExecutionMode(_ExecutionMode) {}
97 };
98 
99 /// Device environment data
100 /// Manually sync with the deviceRTL side for now, move to a dedicated header
101 /// file later.
102 struct omptarget_device_environmentTy {
103   int32_t debug_level;
104 };
105 
106 namespace {
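/// Check the result of a CUDA driver API call. On failure, print \p ErrMsg
/// followed by the CUDA error string and return false; return true otherwise.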
107 bool checkResult(CUresult Err, const char *ErrMsg) {
108   if (Err == CUDA_SUCCESS)
109     return true;
110 
111   REPORT("%s", ErrMsg);
112   CUDA_ERR_STRING(Err);
113   return false;
114 }
115 
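/// Enqueue an asynchronous device-to-device copy of \p Size bytes from
/// \p SrcPtr to \p DstPtr on \p Stream. Completion must be awaited by
/// synchronizing the stream.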
116 int memcpyDtoD(const void *SrcPtr, void *DstPtr, int64_t Size,
117                CUstream Stream) {
118   CUresult Err =
119       cuMemcpyDtoDAsync((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, Size, Stream);
120 
121   if (Err != CUDA_SUCCESS) {
122     DP("Error when copying data from device to device. Pointers: src "
123        "= " DPxMOD ", dst = " DPxMOD ", size = %" PRId64 "\n",
124        DPxPTR(SrcPtr), DPxPTR(DstPtr), Size);
125     CUDA_ERR_STRING(Err);
126     return OFFLOAD_FAIL;
127   }
128 
129   return OFFLOAD_SUCCESS;
130 }
131 
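/// Create a CUDA event with default flags and return it through \p P.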
132 int createEvent(void **P) {
133   CUevent Event = nullptr;
134 
135   CUresult Err = cuEventCreate(&Event, CU_EVENT_DEFAULT);
136   if (Err != CUDA_SUCCESS) {
    DP("Error when creating event. event = " DPxMOD "\n", DPxPTR(Event));
138     CUDA_ERR_STRING(Err);
139     return OFFLOAD_FAIL;
140   }
141 
142   *P = Event;
143 
144   return OFFLOAD_SUCCESS;
145 }
146 
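/// Record \p EventPtr on the stream held by \p AsyncInfo so that it can be
/// waited on or synchronized later.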
147 int recordEvent(void *EventPtr, __tgt_async_info *AsyncInfo) {
148   CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue);
149   CUevent Event = reinterpret_cast<CUevent>(EventPtr);
150 
151   CUresult Err = cuEventRecord(Event, Stream);
152   if (Err != CUDA_SUCCESS) {
153     DP("Error when recording event. stream = " DPxMOD ", event = " DPxMOD "\n",
154        DPxPTR(Stream), DPxPTR(Event));
155     CUDA_ERR_STRING(Err);
156     return OFFLOAD_FAIL;
157   }
158 
159   return OFFLOAD_SUCCESS;
160 }
161 
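/// Block the calling host thread until \p EventPtr has completed.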
162 int syncEvent(void *EventPtr) {
163   CUevent Event = reinterpret_cast<CUevent>(EventPtr);
164 
165   CUresult Err = cuEventSynchronize(Event);
166   if (Err != CUDA_SUCCESS) {
167     DP("Error when syncing event = " DPxMOD "\n", DPxPTR(Event));
168     CUDA_ERR_STRING(Err);
169     return OFFLOAD_FAIL;
170   }
171 
172   return OFFLOAD_SUCCESS;
173 }
174 
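/// Destroy \p EventPtr and release its resources.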
175 int destroyEvent(void *EventPtr) {
176   CUevent Event = reinterpret_cast<CUevent>(EventPtr);
177 
178   CUresult Err = cuEventDestroy(Event);
179   if (Err != CUDA_SUCCESS) {
180     DP("Error when destroying event = " DPxMOD "\n", DPxPTR(Event));
181     CUDA_ERR_STRING(Err);
182     return OFFLOAD_FAIL;
183   }
184 
185   return OFFLOAD_SUCCESS;
186 }
187 
// Structure that contains per-device data.
189 struct DeviceDataTy {
190   /// List that contains all the kernels.
191   std::list<KernelTy> KernelsList;
192 
193   std::list<FuncOrGblEntryTy> FuncGblEntries;
194 
195   CUcontext Context = nullptr;
196   // Device properties
197   int ThreadsPerBlock = 0;
198   int BlocksPerGrid = 0;
199   int WarpSize = 0;
200   // OpenMP properties
201   int NumTeams = 0;
202   int NumThreads = 0;
203 };
204 
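/// Manager of per-device pools of CUDA streams. Each pool is created when its
/// device is initialized, grows on demand, and hands out streams via
/// getStream() that are later recycled through returnStream().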
205 class StreamManagerTy {
206   int NumberOfDevices;
207   // The initial size of stream pool
208   int EnvNumInitialStreams;
209   // Per-device stream mutex
210   std::vector<std::unique_ptr<std::mutex>> StreamMtx;
211   // Per-device stream Id indicates the next available stream in the pool
212   std::vector<int> NextStreamId;
213   // Per-device stream pool
214   std::vector<std::vector<CUstream>> StreamPool;
215   // Reference to per-device data
216   std::vector<DeviceDataTy> &DeviceData;
217 
  // If there is no CUstream left in the pool, we will resize the pool to
  // allocate more CUstreams. This function should be called with the device
  // mutex held, and we never shrink the pool.
221   void resizeStreamPool(const int DeviceId, const size_t NewSize) {
222     std::vector<CUstream> &Pool = StreamPool[DeviceId];
223     const size_t CurrentSize = Pool.size();
224     assert(NewSize > CurrentSize && "new size is not larger than current size");
225 
226     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
227     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) {
      // Return early if we cannot switch to the right context, so that we do
      // not create a bunch of streams that do not belong to the right device.
      // Offloading will fail later because the selected CUstream is nullptr.
232       return;
233     }
234 
235     Pool.resize(NewSize, nullptr);
236 
237     for (size_t I = CurrentSize; I < NewSize; ++I) {
238       checkResult(cuStreamCreate(&Pool[I], CU_STREAM_NON_BLOCKING),
239                   "Error returned from cuStreamCreate\n");
240     }
241   }
242 
243 public:
244   StreamManagerTy(const int NumberOfDevices,
245                   std::vector<DeviceDataTy> &DeviceData)
246       : NumberOfDevices(NumberOfDevices), EnvNumInitialStreams(32),
247         DeviceData(DeviceData) {
248     StreamPool.resize(NumberOfDevices);
249     NextStreamId.resize(NumberOfDevices);
250     StreamMtx.resize(NumberOfDevices);
251 
252     if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS"))
253       EnvNumInitialStreams = std::stoi(EnvStr);
254 
255     // Initialize the next stream id
256     std::fill(NextStreamId.begin(), NextStreamId.end(), 0);
257 
258     // Initialize stream mutex
259     for (std::unique_ptr<std::mutex> &Ptr : StreamMtx)
260       Ptr = std::make_unique<std::mutex>();
261   }
262 
263   ~StreamManagerTy() {
264     // Destroy streams
265     for (int I = 0; I < NumberOfDevices; ++I) {
266       checkResult(cuCtxSetCurrent(DeviceData[I].Context),
267                   "Error returned from cuCtxSetCurrent\n");
268 
269       for (CUstream &S : StreamPool[I]) {
270         if (S)
271           checkResult(cuStreamDestroy(S),
272                       "Error returned from cuStreamDestroy\n");
273       }
274     }
275   }
276 
  // Get a CUstream from the pool. The per-device next stream id always points
  // to the next available CUstream. That means CUstreams [0, id-1] have been
  // assigned, and [id, ...) are still available. If there is no CUstream
  // left, we request more CUstreams from the CUDA RT. Each time a CUstream is
  // assigned, the id increases by one.
282   // xxxxxs+++++++++
283   //      ^
284   //      id
285   // After assignment, the pool becomes the following and s is assigned.
286   // xxxxxs+++++++++
287   //       ^
288   //       id
289   CUstream getStream(const int DeviceId) {
290     const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
291     int &Id = NextStreamId[DeviceId];
292     // No CUstream left in the pool, we need to request from CUDA RT
293     if (Id == static_cast<int>(StreamPool[DeviceId].size())) {
294       // By default we double the stream pool every time
295       resizeStreamPool(DeviceId, Id * 2);
296     }
297     return StreamPool[DeviceId][Id++];
298   }
299 
  // Return a CUstream back to the pool. As mentioned above, the per-device
  // next stream id always points to the next available CUstream, so when we
  // return a CUstream, we first decrease the id and then copy the CUstream
  // back into the pool.
  // It is worth noting that streams may be returned in a different order than
  // they were assigned, so at some point there might be two identical
  // CUstreams in the pool.
  // xxax+a+++++
  //     ^
  //     id
  // However, this does not matter because they are always on the two sides of
  // id: the left one will eventually be overwritten by another CUstream.
  // Therefore, after several executions, the order of the pool might differ
  // from its initial state.
314   void returnStream(const int DeviceId, CUstream Stream) {
315     const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
316     int &Id = NextStreamId[DeviceId];
317     assert(Id > 0 && "Wrong stream ID");
318     StreamPool[DeviceId][--Id] = Stream;
319   }
320 
321   bool initializeDeviceStreamPool(const int DeviceId) {
322     assert(StreamPool[DeviceId].empty() && "stream pool has been initialized");
323 
324     resizeStreamPool(DeviceId, EnvNumInitialStreams);
325 
326     // Check the size of stream pool
327     if (static_cast<int>(StreamPool[DeviceId].size()) != EnvNumInitialStreams)
328       return false;
329 
330     // Check whether each stream is valid
331     for (CUstream &S : StreamPool[DeviceId])
332       if (!S)
333         return false;
334 
335     return true;
336   }
337 };
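
// Illustrative use of StreamManagerTy (a sketch, not code used by the
// plugin): a stream is borrowed for the lifetime of one asynchronous
// operation and returned once the work has been synchronized, e.g.
//   CUstream S = SM.getStream(DeviceId);      // SM: some StreamManagerTy
//   cuMemcpyHtoDAsync(DevPtr, HstPtr, Size, S);
//   cuStreamSynchronize(S);
//   SM.returnStream(DeviceId, S);
// In this RTL the same pattern is driven through __tgt_async_info, see
// DeviceRTLTy::getStream() and DeviceRTLTy::synchronize() below.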
338 
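/// Core state of the CUDA plugin. It owns the per-device data, the stream
/// manager, the memory managers and the loaded modules, and implements the
/// device initialization, binary loading, data transfer and kernel launch
/// operations that this RTL exposes.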
339 class DeviceRTLTy {
340   int NumberOfDevices;
341   // OpenMP environment properties
342   int EnvNumTeams;
343   int EnvTeamLimit;
344   int EnvTeamThreadLimit;
345   // OpenMP requires flags
346   int64_t RequiresFlags;
347 
348   static constexpr const int HardTeamLimit = 1U << 16U; // 64k
349   static constexpr const int HardThreadLimit = 1024;
350   static constexpr const int DefaultNumTeams = 128;
351   static constexpr const int DefaultNumThreads = 128;
352 
353   std::unique_ptr<StreamManagerTy> StreamManager;
354   std::vector<DeviceDataTy> DeviceData;
355   std::vector<CUmodule> Modules;
356 
357   /// A class responsible for interacting with device native runtime library to
358   /// allocate and free memory.
359   class CUDADeviceAllocatorTy : public DeviceAllocatorTy {
360     const int DeviceId;
361     const std::vector<DeviceDataTy> &DeviceData;
362     std::unordered_map<void *, TargetAllocTy> HostPinnedAllocs;
363 
364   public:
365     CUDADeviceAllocatorTy(int DeviceId, std::vector<DeviceDataTy> &DeviceData)
366         : DeviceId(DeviceId), DeviceData(DeviceData) {}
367 
368     void *allocate(size_t Size, void *, TargetAllocTy Kind) override {
369       if (Size == 0)
370         return nullptr;
371 
372       CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
373       if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
374         return nullptr;
375 
376       void *MemAlloc = nullptr;
377       switch (Kind) {
378       case TARGET_ALLOC_DEFAULT:
379       case TARGET_ALLOC_DEVICE:
380         CUdeviceptr DevicePtr;
381         Err = cuMemAlloc(&DevicePtr, Size);
382         MemAlloc = (void *)DevicePtr;
383         if (!checkResult(Err, "Error returned from cuMemAlloc\n"))
384           return nullptr;
385         break;
386       case TARGET_ALLOC_HOST:
387         void *HostPtr;
388         Err = cuMemAllocHost(&HostPtr, Size);
389         MemAlloc = HostPtr;
390         if (!checkResult(Err, "Error returned from cuMemAllocHost\n"))
391           return nullptr;
392         HostPinnedAllocs[MemAlloc] = Kind;
393         break;
394       case TARGET_ALLOC_SHARED:
395         CUdeviceptr SharedPtr;
396         Err = cuMemAllocManaged(&SharedPtr, Size, CU_MEM_ATTACH_GLOBAL);
397         MemAlloc = (void *)SharedPtr;
398         if (!checkResult(Err, "Error returned from cuMemAllocManaged\n"))
399           return nullptr;
400         break;
401       }
402 
403       return MemAlloc;
404     }
405 
406     int free(void *TgtPtr) override {
407       CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
408       if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
409         return OFFLOAD_FAIL;
410 
411       // Host pinned memory must be freed differently.
412       TargetAllocTy Kind =
413           (HostPinnedAllocs.find(TgtPtr) == HostPinnedAllocs.end())
414               ? TARGET_ALLOC_DEFAULT
415               : TARGET_ALLOC_HOST;
416       switch (Kind) {
417       case TARGET_ALLOC_DEFAULT:
418       case TARGET_ALLOC_DEVICE:
419       case TARGET_ALLOC_SHARED:
420         Err = cuMemFree((CUdeviceptr)TgtPtr);
421         if (!checkResult(Err, "Error returned from cuMemFree\n"))
422           return OFFLOAD_FAIL;
423         break;
424       case TARGET_ALLOC_HOST:
425         Err = cuMemFreeHost(TgtPtr);
426         if (!checkResult(Err, "Error returned from cuMemFreeHost\n"))
427           return OFFLOAD_FAIL;
428         break;
429       }
430 
431       return OFFLOAD_SUCCESS;
432     }
433   };
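
  // Note: TARGET_ALLOC_HOST allocations are tracked in HostPinnedAllocs so
  // that free() can route them to cuMemFreeHost; device, shared and default
  // allocations are released with cuMemFree.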
434 
435   /// A vector of device allocators
436   std::vector<CUDADeviceAllocatorTy> DeviceAllocators;
437 
438   /// A vector of memory managers. Since the memory manager is non-copyable and
  /// non-removable, we wrap them into std::unique_ptr.
440   std::vector<std::unique_ptr<MemoryManagerTy>> MemoryManagers;
441 
  /// Whether to use the memory manager
443   bool UseMemoryManager = true;
444 
445   // Record entry point associated with device
446   void addOffloadEntry(const int DeviceId, const __tgt_offload_entry entry) {
447     FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
448     E.Entries.push_back(entry);
449   }
450 
451   // Return a pointer to the entry associated with the pointer
452   const __tgt_offload_entry *getOffloadEntry(const int DeviceId,
453                                              const void *Addr) const {
454     for (const __tgt_offload_entry &Itr :
455          DeviceData[DeviceId].FuncGblEntries.back().Entries)
456       if (Itr.addr == Addr)
457         return &Itr;
458 
459     return nullptr;
460   }
461 
462   // Return the pointer to the target entries table
463   __tgt_target_table *getOffloadEntriesTable(const int DeviceId) {
464     FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
465 
466     if (E.Entries.empty())
467       return nullptr;
468 
469     // Update table info according to the entries and return the pointer
470     E.Table.EntriesBegin = E.Entries.data();
471     E.Table.EntriesEnd = E.Entries.data() + E.Entries.size();
472 
473     return &E.Table;
474   }
475 
476   // Clear entries table for a device
477   void clearOffloadEntriesTable(const int DeviceId) {
478     DeviceData[DeviceId].FuncGblEntries.emplace_back();
479     FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
480     E.Entries.clear();
481     E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr;
482   }
483 
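  // Return the CUstream associated with AsyncInfo, taking one from the stream
  // pool on first use. The stream is handed back to the pool when the async
  // info is synchronized (see synchronize()).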
484   CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfo) const {
485     assert(AsyncInfo && "AsyncInfo is nullptr");
486 
487     if (!AsyncInfo->Queue)
488       AsyncInfo->Queue = StreamManager->getStream(DeviceId);
489 
490     return reinterpret_cast<CUstream>(AsyncInfo->Queue);
491   }
492 
493 public:
494   // This class should not be copied
495   DeviceRTLTy(const DeviceRTLTy &) = delete;
496   DeviceRTLTy(DeviceRTLTy &&) = delete;
497 
498   DeviceRTLTy()
499       : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1),
500         EnvTeamThreadLimit(-1), RequiresFlags(OMP_REQ_UNDEFINED) {
501 
502     DP("Start initializing CUDA\n");
503 
504     CUresult Err = cuInit(0);
505     if (Err == CUDA_ERROR_INVALID_HANDLE) {
506       // Can't call cuGetErrorString if dlsym failed
507       DP("Failed to load CUDA shared library\n");
508       return;
509     }
510     if (!checkResult(Err, "Error returned from cuInit\n")) {
511       return;
512     }
513 
514     Err = cuDeviceGetCount(&NumberOfDevices);
515     if (!checkResult(Err, "Error returned from cuDeviceGetCount\n"))
516       return;
517 
518     if (NumberOfDevices == 0) {
519       DP("There are no devices supporting CUDA.\n");
520       return;
521     }
522 
523     DeviceData.resize(NumberOfDevices);
524 
525     // Get environment variables regarding teams
526     if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) {
527       // OMP_TEAM_LIMIT has been set
528       EnvTeamLimit = std::stoi(EnvStr);
529       DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
530     }
531     if (const char *EnvStr = getenv("OMP_TEAMS_THREAD_LIMIT")) {
532       // OMP_TEAMS_THREAD_LIMIT has been set
533       EnvTeamThreadLimit = std::stoi(EnvStr);
534       DP("Parsed OMP_TEAMS_THREAD_LIMIT=%d\n", EnvTeamThreadLimit);
535     }
536     if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) {
537       // OMP_NUM_TEAMS has been set
538       EnvNumTeams = std::stoi(EnvStr);
539       DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
540     }
541 
542     StreamManager =
543         std::make_unique<StreamManagerTy>(NumberOfDevices, DeviceData);
544 
545     for (int I = 0; I < NumberOfDevices; ++I)
546       DeviceAllocators.emplace_back(I, DeviceData);
547 
548     // Get the size threshold from environment variable
549     std::pair<size_t, bool> Res = MemoryManagerTy::getSizeThresholdFromEnv();
550     UseMemoryManager = Res.second;
551     size_t MemoryManagerThreshold = Res.first;
552 
553     if (UseMemoryManager)
554       for (int I = 0; I < NumberOfDevices; ++I)
555         MemoryManagers.emplace_back(std::make_unique<MemoryManagerTy>(
556             DeviceAllocators[I], MemoryManagerThreshold));
557   }
558 
559   ~DeviceRTLTy() {
    // We destruct the memory managers first so that the data they depend on
    // is not destroyed before them.
562     for (auto &M : MemoryManagers)
563       M.release();
564 
565     StreamManager = nullptr;
566 
567     for (CUmodule &M : Modules)
568       // Close module
569       if (M)
570         checkResult(cuModuleUnload(M), "Error returned from cuModuleUnload\n");
571 
572     for (DeviceDataTy &D : DeviceData) {
573       // Destroy context
574       if (D.Context) {
575         checkResult(cuCtxSetCurrent(D.Context),
576                     "Error returned from cuCtxSetCurrent\n");
577         CUdevice Device;
578         checkResult(cuCtxGetDevice(&Device),
579                     "Error returned from cuCtxGetDevice\n");
580         checkResult(cuDevicePrimaryCtxRelease(Device),
581                     "Error returned from cuDevicePrimaryCtxRelease\n");
582       }
583     }
584   }
585 
586   // Check whether a given DeviceId is valid
587   bool isValidDeviceId(const int DeviceId) const {
588     return DeviceId >= 0 && DeviceId < NumberOfDevices;
589   }
590 
591   int getNumOfDevices() const { return NumberOfDevices; }
592 
593   void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; }
594 
595   int initDevice(const int DeviceId) {
596     CUdevice Device;
597 
598     DP("Getting device %d\n", DeviceId);
599     CUresult Err = cuDeviceGet(&Device, DeviceId);
600     if (!checkResult(Err, "Error returned from cuDeviceGet\n"))
601       return OFFLOAD_FAIL;
602 
603     // Query the current flags of the primary context and set its flags if
604     // it is inactive
605     unsigned int FormerPrimaryCtxFlags = 0;
606     int FormerPrimaryCtxIsActive = 0;
607     Err = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags,
608                                      &FormerPrimaryCtxIsActive);
609     if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxGetState\n"))
610       return OFFLOAD_FAIL;
611 
612     if (FormerPrimaryCtxIsActive) {
613       DP("The primary context is active, no change to its flags\n");
614       if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) !=
615           CU_CTX_SCHED_BLOCKING_SYNC)
        DP("Warning: the current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n");
617     } else {
618       DP("The primary context is inactive, set its flags to "
619          "CU_CTX_SCHED_BLOCKING_SYNC\n");
620       Err = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC);
621       if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxSetFlags\n"))
622         return OFFLOAD_FAIL;
623     }
624 
625     // Retain the per device primary context and save it to use whenever this
626     // device is selected.
627     Err = cuDevicePrimaryCtxRetain(&DeviceData[DeviceId].Context, Device);
628     if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxRetain\n"))
629       return OFFLOAD_FAIL;
630 
631     Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
632     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
633       return OFFLOAD_FAIL;
634 
635     // Initialize stream pool
636     if (!StreamManager->initializeDeviceStreamPool(DeviceId))
637       return OFFLOAD_FAIL;
638 
639     // Query attributes to determine number of threads/block and blocks/grid.
640     int MaxGridDimX;
641     Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
642                                Device);
643     if (Err != CUDA_SUCCESS) {
      DP("Error getting max grid dimension, using default value %d\n",
645          DeviceRTLTy::DefaultNumTeams);
646       DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::DefaultNumTeams;
647     } else if (MaxGridDimX <= DeviceRTLTy::HardTeamLimit) {
648       DP("Using %d CUDA blocks per grid\n", MaxGridDimX);
649       DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX;
650     } else {
651       DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping "
652          "at the hard limit\n",
653          MaxGridDimX, DeviceRTLTy::HardTeamLimit);
654       DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::HardTeamLimit;
655     }
656 
657     // We are only exploiting threads along the x axis.
658     int MaxBlockDimX;
659     Err = cuDeviceGetAttribute(&MaxBlockDimX,
660                                CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device);
661     if (Err != CUDA_SUCCESS) {
      DP("Error getting max block dimension, using default value %d\n",
663          DeviceRTLTy::DefaultNumThreads);
664       DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads;
665     } else {
666       DP("Using %d CUDA threads per block\n", MaxBlockDimX);
667       DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX;
668 
669       if (EnvTeamThreadLimit > 0 &&
670           DeviceData[DeviceId].ThreadsPerBlock > EnvTeamThreadLimit) {
671         DP("Max CUDA threads per block %d exceeds the thread limit %d set by "
672            "OMP_TEAMS_THREAD_LIMIT, capping at the limit\n",
673            DeviceData[DeviceId].ThreadsPerBlock, EnvTeamThreadLimit);
674         DeviceData[DeviceId].ThreadsPerBlock = EnvTeamThreadLimit;
675       }
676       if (DeviceData[DeviceId].ThreadsPerBlock > DeviceRTLTy::HardThreadLimit) {
677         DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
678            "capping at the hard limit\n",
679            DeviceData[DeviceId].ThreadsPerBlock, DeviceRTLTy::HardThreadLimit);
680         DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit;
681       }
682     }
683 
684     // Get and set warp size
685     int WarpSize;
686     Err =
687         cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device);
688     if (Err != CUDA_SUCCESS) {
      DP("Error getting warp size, assuming default value 32\n");
690       DeviceData[DeviceId].WarpSize = 32;
691     } else {
692       DP("Using warp size %d\n", WarpSize);
693       DeviceData[DeviceId].WarpSize = WarpSize;
694     }
695 
696     // Adjust teams to the env variables
697     if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) {
698       DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
699          EnvTeamLimit);
700       DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit;
701     }
702 
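    // The stack and heap limits can be overridden via LIBOMPTARGET_STACK_SIZE
    // and LIBOMPTARGET_HEAP_SIZE; both are byte counts, so for example
    // LIBOMPTARGET_STACK_SIZE=8192 requests an 8 KiB stack per device thread.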
703     size_t StackLimit;
704     size_t HeapLimit;
705     if (const char *EnvStr = getenv("LIBOMPTARGET_STACK_SIZE")) {
706       StackLimit = std::stol(EnvStr);
707       if (cuCtxSetLimit(CU_LIMIT_STACK_SIZE, StackLimit) != CUDA_SUCCESS)
708         return OFFLOAD_FAIL;
709     } else {
710       if (cuCtxGetLimit(&StackLimit, CU_LIMIT_STACK_SIZE) != CUDA_SUCCESS)
711         return OFFLOAD_FAIL;
712     }
713     if (const char *EnvStr = getenv("LIBOMPTARGET_HEAP_SIZE")) {
714       HeapLimit = std::stol(EnvStr);
715       if (cuCtxSetLimit(CU_LIMIT_MALLOC_HEAP_SIZE, HeapLimit) != CUDA_SUCCESS)
716         return OFFLOAD_FAIL;
717     } else {
718       if (cuCtxGetLimit(&HeapLimit, CU_LIMIT_MALLOC_HEAP_SIZE) != CUDA_SUCCESS)
719         return OFFLOAD_FAIL;
720     }
721 
722     INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
723          "Device supports up to %d CUDA blocks and %d threads with a "
724          "warp size of %d\n",
725          DeviceData[DeviceId].BlocksPerGrid,
726          DeviceData[DeviceId].ThreadsPerBlock, DeviceData[DeviceId].WarpSize);
727     INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
         "Device heap size is %d bytes, device stack size is %d bytes per "
729          "thread\n",
730          (int)HeapLimit, (int)StackLimit);
731 
732     // Set default number of teams
733     if (EnvNumTeams > 0) {
734       DP("Default number of teams set according to environment %d\n",
735          EnvNumTeams);
736       DeviceData[DeviceId].NumTeams = EnvNumTeams;
737     } else {
738       DeviceData[DeviceId].NumTeams = DeviceRTLTy::DefaultNumTeams;
739       DP("Default number of teams set according to library's default %d\n",
740          DeviceRTLTy::DefaultNumTeams);
741     }
742 
743     if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) {
744       DP("Default number of teams exceeds device limit, capping at %d\n",
745          DeviceData[DeviceId].BlocksPerGrid);
746       DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid;
747     }
748 
749     // Set default number of threads
750     DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads;
751     DP("Default number of threads set according to library's default %d\n",
752        DeviceRTLTy::DefaultNumThreads);
753     if (DeviceData[DeviceId].NumThreads >
754         DeviceData[DeviceId].ThreadsPerBlock) {
755       DP("Default number of threads exceeds device limit, capping at %d\n",
756          DeviceData[DeviceId].ThreadsPerBlock);
      DeviceData[DeviceId].NumThreads = DeviceData[DeviceId].ThreadsPerBlock;
758     }
759 
760     return OFFLOAD_SUCCESS;
761   }
762 
763   __tgt_target_table *loadBinary(const int DeviceId,
764                                  const __tgt_device_image *Image) {
765     // Set the context we are using
766     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
767     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
768       return nullptr;
769 
770     // Clear the offload table as we are going to create a new one.
771     clearOffloadEntriesTable(DeviceId);
772 
773     // Create the module and extract the function pointers.
774     CUmodule Module;
775     DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart));
776     Err = cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr);
777     if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n"))
778       return nullptr;
779 
780     DP("CUDA module successfully loaded!\n");
781 
782     Modules.push_back(Module);
783 
784     // Find the symbols in the module by name.
785     const __tgt_offload_entry *HostBegin = Image->EntriesBegin;
786     const __tgt_offload_entry *HostEnd = Image->EntriesEnd;
787 
788     std::list<KernelTy> &KernelsList = DeviceData[DeviceId].KernelsList;
789     for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) {
790       if (!E->addr) {
791         // We return nullptr when something like this happens, the host should
792         // have always something in the address to uniquely identify the target
793         // region.
794         DP("Invalid binary: host entry '<null>' (size = %zd)...\n", E->size);
795         return nullptr;
796       }
797 
798       if (E->size) {
799         __tgt_offload_entry Entry = *E;
800         CUdeviceptr CUPtr;
801         size_t CUSize;
802         Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name);
803         // We keep this style here because we need the name
804         if (Err != CUDA_SUCCESS) {
          REPORT("Loading global '%s' failed\n", E->name);
806           CUDA_ERR_STRING(Err);
807           return nullptr;
808         }
809 
810         if (CUSize != E->size) {
811           DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name,
812              CUSize, E->size);
813           return nullptr;
814         }
815 
816         DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
817            DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr));
818 
819         Entry.addr = (void *)(CUPtr);
820 
821         // Note: In the current implementation declare target variables
822         // can either be link or to. This means that once unified
823         // memory is activated via the requires directive, the variable
824         // can be used directly from the host in both cases.
825         // TODO: when variables types other than to or link are added,
826         // the below condition should be changed to explicitly
827         // check for to and link variables types:
828         // (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags &
829         // OMP_DECLARE_TARGET_LINK || e->flags == OMP_DECLARE_TARGET_TO))
830         if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
831           // If unified memory is present any target link or to variables
832           // can access host addresses directly. There is no longer a
833           // need for device copies.
834           cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *));
835           DP("Copy linked variable host address (" DPxMOD
836              ") to device address (" DPxMOD ")\n",
837              DPxPTR(*((void **)E->addr)), DPxPTR(CUPtr));
838         }
839 
840         addOffloadEntry(DeviceId, Entry);
841 
842         continue;
843       }
844 
845       CUfunction Func;
846       Err = cuModuleGetFunction(&Func, Module, E->name);
847       // We keep this style here because we need the name
848       if (Err != CUDA_SUCCESS) {
        REPORT("Loading '%s' failed\n", E->name);
850         CUDA_ERR_STRING(Err);
851         return nullptr;
852       }
853 
854       DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
855          DPxPTR(E - HostBegin), E->name, DPxPTR(Func));
856 
857       // default value GENERIC (in case symbol is missing from cubin file)
858       int8_t ExecModeVal = ExecutionModeType::GENERIC;
859       std::string ExecModeNameStr(E->name);
860       ExecModeNameStr += "_exec_mode";
861       const char *ExecModeName = ExecModeNameStr.c_str();
862 
863       CUdeviceptr ExecModePtr;
864       size_t CUSize;
865       Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName);
866       if (Err == CUDA_SUCCESS) {
867         if (CUSize != sizeof(int8_t)) {
868           DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
869              ExecModeName, CUSize, sizeof(int8_t));
870           return nullptr;
871         }
872 
873         Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize);
874         if (Err != CUDA_SUCCESS) {
875           REPORT("Error when copying data from device to host. Pointers: "
876                  "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
877                  DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize);
878           CUDA_ERR_STRING(Err);
879           return nullptr;
880         }
881 
882         if (ExecModeVal < 0 || ExecModeVal > 2) {
          DP("Error: wrong exec_mode value specified in cubin file: %d\n",
884              ExecModeVal);
885           return nullptr;
886         }
887       } else {
888         DP("Loading global exec_mode '%s' - symbol missing, using default "
889            "value GENERIC (1)\n",
890            ExecModeName);
891       }
892 
893       KernelsList.emplace_back(Func, ExecModeVal);
894 
895       __tgt_offload_entry Entry = *E;
896       Entry.addr = &KernelsList.back();
897       addOffloadEntry(DeviceId, Entry);
898     }
899 
900     // send device environment data to the device
901     {
902       omptarget_device_environmentTy DeviceEnv{0};
903 
904 #ifdef OMPTARGET_DEBUG
905       if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG"))
906         DeviceEnv.debug_level = std::stoi(EnvStr);
907 #endif
908 
909       const char *DeviceEnvName = "omptarget_device_environment";
910       CUdeviceptr DeviceEnvPtr;
911       size_t CUSize;
912 
913       Err = cuModuleGetGlobal(&DeviceEnvPtr, &CUSize, Module, DeviceEnvName);
914       if (Err == CUDA_SUCCESS) {
915         if (CUSize != sizeof(DeviceEnv)) {
916           REPORT(
917               "Global device_environment '%s' - size mismatch (%zu != %zu)\n",
              DeviceEnvName, CUSize, sizeof(DeviceEnv));
919           CUDA_ERR_STRING(Err);
920           return nullptr;
921         }
922 
923         Err = cuMemcpyHtoD(DeviceEnvPtr, &DeviceEnv, CUSize);
924         if (Err != CUDA_SUCCESS) {
925           REPORT("Error when copying data from host to device. Pointers: "
926                  "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
927                  DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize);
928           CUDA_ERR_STRING(Err);
929           return nullptr;
930         }
931 
932         DP("Sending global device environment data %zu bytes\n", CUSize);
933       } else {
934         DP("Finding global device environment '%s' - symbol missing.\n",
935            DeviceEnvName);
        DP("Continuing, assuming this is a device RTL which does not accept "
           "an environment setting.\n");
938       }
939     }
940 
941     return getOffloadEntriesTable(DeviceId);
942   }
943 
944   void *dataAlloc(const int DeviceId, const int64_t Size,
945                   const TargetAllocTy Kind) {
946     switch (Kind) {
947     case TARGET_ALLOC_DEFAULT:
948     case TARGET_ALLOC_DEVICE:
949       if (UseMemoryManager)
950         return MemoryManagers[DeviceId]->allocate(Size, nullptr);
951       else
952         return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
953     case TARGET_ALLOC_HOST:
954     case TARGET_ALLOC_SHARED:
955       return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
956     }
957 
958     REPORT("Invalid target data allocation kind or requested allocator not "
959            "implemented yet\n");
960 
961     return nullptr;
962   }
963 
964   int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr,
965                  const int64_t Size, __tgt_async_info *AsyncInfo) const {
966     assert(AsyncInfo && "AsyncInfo is nullptr");
967 
968     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
969     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
970       return OFFLOAD_FAIL;
971 
972     CUstream Stream = getStream(DeviceId, AsyncInfo);
973 
974     Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream);
975     if (Err != CUDA_SUCCESS) {
976       DP("Error when copying data from host to device. Pointers: host "
977          "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
978          DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
979       CUDA_ERR_STRING(Err);
980       return OFFLOAD_FAIL;
981     }
982 
983     return OFFLOAD_SUCCESS;
984   }
985 
986   int dataRetrieve(const int DeviceId, void *HstPtr, const void *TgtPtr,
987                    const int64_t Size, __tgt_async_info *AsyncInfo) const {
988     assert(AsyncInfo && "AsyncInfo is nullptr");
989 
990     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
991     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
992       return OFFLOAD_FAIL;
993 
994     CUstream Stream = getStream(DeviceId, AsyncInfo);
995 
996     Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
997     if (Err != CUDA_SUCCESS) {
998       DP("Error when copying data from device to host. Pointers: host "
999          "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
1000          DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
1001       CUDA_ERR_STRING(Err);
1002       return OFFLOAD_FAIL;
1003     }
1004 
1005     return OFFLOAD_SUCCESS;
1006   }
1007 
1008   int dataExchange(int SrcDevId, const void *SrcPtr, int DstDevId, void *DstPtr,
1009                    int64_t Size, __tgt_async_info *AsyncInfo) const {
1010     assert(AsyncInfo && "AsyncInfo is nullptr");
1011 
1012     CUresult Err = cuCtxSetCurrent(DeviceData[SrcDevId].Context);
1013     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
1014       return OFFLOAD_FAIL;
1015 
1016     CUstream Stream = getStream(SrcDevId, AsyncInfo);
1017 
    // If the source and destination are two different devices, we try a
    // peer-to-peer copy first.
1019     if (SrcDevId != DstDevId) {
1020       int CanAccessPeer = 0;
1021       Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId);
1022       if (Err != CUDA_SUCCESS) {
1023         REPORT("Error returned from cuDeviceCanAccessPeer. src = %" PRId32
1024                ", dst = %" PRId32 "\n",
1025                SrcDevId, DstDevId);
1026         CUDA_ERR_STRING(Err);
1027         return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
1028       }
1029 
1030       if (!CanAccessPeer) {
        DP("P2P memcpy not supported, falling back to D2D memcpy\n");
1032         return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
1033       }
1034 
1035       Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0);
1036       if (Err != CUDA_SUCCESS) {
1037         REPORT("Error returned from cuCtxEnablePeerAccess. src = %" PRId32
1038                ", dst = %" PRId32 "\n",
1039                SrcDevId, DstDevId);
1040         CUDA_ERR_STRING(Err);
1041         return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
1042       }
1043 
1044       Err = cuMemcpyPeerAsync((CUdeviceptr)DstPtr, DeviceData[DstDevId].Context,
1045                               (CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context,
1046                               Size, Stream);
1047       if (Err == CUDA_SUCCESS)
1048         return OFFLOAD_SUCCESS;
1049 
1050       DP("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD
1051          ", src_id =%" PRId32 ", dst_ptr = " DPxMOD ", dst_id =%" PRId32 "\n",
1052          DPxPTR(SrcPtr), SrcDevId, DPxPTR(DstPtr), DstDevId);
1053       CUDA_ERR_STRING(Err);
1054     }
1055 
1056     return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
1057   }
1058 
1059   int dataDelete(const int DeviceId, void *TgtPtr) {
1060     if (UseMemoryManager)
1061       return MemoryManagers[DeviceId]->free(TgtPtr);
1062 
1063     return DeviceAllocators[DeviceId].free(TgtPtr);
1064   }
1065 
1066   int runTargetTeamRegion(const int DeviceId, void *TgtEntryPtr, void **TgtArgs,
1067                           ptrdiff_t *TgtOffsets, const int ArgNum,
1068                           const int TeamNum, const int ThreadLimit,
1069                           const unsigned int LoopTripCount,
1070                           __tgt_async_info *AsyncInfo) const {
1071     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
1072     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
1073       return OFFLOAD_FAIL;
1074 
1075     // All args are references.
1076     std::vector<void *> Args(ArgNum);
1077     std::vector<void *> Ptrs(ArgNum);
1078 
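    // cuLaunchKernel expects an array of pointers to the actual argument
    // values, so each translated target pointer is stored in Ptrs[I] and
    // Args[I] holds the address of that slot.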
1079     for (int I = 0; I < ArgNum; ++I) {
1080       Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
1081       Args[I] = &Ptrs[I];
1082     }
1083 
1084     KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr);
1085 
1086     int CudaThreadsPerBlock;
1087     if (ThreadLimit > 0) {
1088       DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
1089       CudaThreadsPerBlock = ThreadLimit;
1090       // Add master warp if necessary
1091       if (KernelInfo->ExecutionMode == GENERIC) {
1092         DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize);
1093         CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
1094       }
1095     } else {
1096       DP("Setting CUDA threads per block to default %d\n",
1097          DeviceData[DeviceId].NumThreads);
1098       CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads;
1099     }
1100 
1101     if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) {
1102       DP("Threads per block capped at device limit %d\n",
1103          DeviceData[DeviceId].ThreadsPerBlock);
1104       CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock;
1105     }
1106 
1107     if (!KernelInfo->MaxThreadsPerBlock) {
1108       Err = cuFuncGetAttribute(&KernelInfo->MaxThreadsPerBlock,
1109                                CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
1110                                KernelInfo->Func);
1111       if (!checkResult(Err, "Error returned from cuFuncGetAttribute\n"))
1112         return OFFLOAD_FAIL;
1113     }
1114 
1115     if (KernelInfo->MaxThreadsPerBlock < CudaThreadsPerBlock) {
1116       DP("Threads per block capped at kernel limit %d\n",
1117          KernelInfo->MaxThreadsPerBlock);
1118       CudaThreadsPerBlock = KernelInfo->MaxThreadsPerBlock;
1119     }
1120 
1121     unsigned int CudaBlocksPerGrid;
1122     if (TeamNum <= 0) {
1123       if (LoopTripCount > 0 && EnvNumTeams < 0) {
1124         if (KernelInfo->ExecutionMode == SPMD) {
          // We have a combined construct, i.e. `target teams distribute
          // parallel for [simd]`. We launch enough teams so that each thread
          // will execute one iteration of the loop, rounding up to the
          // nearest integer.
1129           CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1;
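          // For example, LoopTripCount = 1000 with CudaThreadsPerBlock = 128
          // gives ((1000 - 1) / 128) + 1 = 8 blocks.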
1130         } else if (KernelInfo->ExecutionMode == GENERIC) {
1131           // If we reach this point, then we have a non-combined construct, i.e.
1132           // `teams distribute` with a nested `parallel for` and each team is
1133           // assigned one iteration of the `distribute` loop. E.g.:
1134           //
1135           // #pragma omp target teams distribute
1136           // for(...loop_tripcount...) {
1137           //   #pragma omp parallel for
1138           //   for(...) {}
1139           // }
1140           //
1141           // Threads within a team will execute the iterations of the `parallel`
1142           // loop.
1143           CudaBlocksPerGrid = LoopTripCount;
1144         } else if (KernelInfo->ExecutionMode == SPMD_GENERIC) {
1145           // If we reach this point, then we are executing a kernel that was
1146           // transformed from Generic-mode to SPMD-mode. This kernel has
1147           // SPMD-mode execution, but needs its blocks to be scheduled
1148           // differently because the current loop trip count only applies to the
          // `teams distribute` region and would create far too few blocks using
1150           // the regular SPMD-mode method.
1151           CudaBlocksPerGrid = LoopTripCount;
1152         } else {
1153           REPORT("Unknown execution mode: %d\n", KernelInfo->ExecutionMode);
1154           return OFFLOAD_FAIL;
1155         }
1156         DP("Using %d teams due to loop trip count %" PRIu32
1157            " and number of threads per block %d\n",
1158            CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock);
1159       } else {
1160         DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams);
1161         CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams;
1162       }
1163     } else if (TeamNum > DeviceData[DeviceId].BlocksPerGrid) {
1164       DP("Capping number of teams to team limit %d\n",
1165          DeviceData[DeviceId].BlocksPerGrid);
1166       CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid;
1167     } else {
1168       DP("Using requested number of teams %d\n", TeamNum);
1169       CudaBlocksPerGrid = TeamNum;
1170     }
1171 
1172     INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
1173          "Launching kernel %s with %d blocks and %d threads in %s "
1174          "mode\n",
1175          (getOffloadEntry(DeviceId, TgtEntryPtr))
1176              ? getOffloadEntry(DeviceId, TgtEntryPtr)->name
1177              : "(null)",
1178          CudaBlocksPerGrid, CudaThreadsPerBlock,
1179          (KernelInfo->ExecutionMode != SPMD
1180               ? (KernelInfo->ExecutionMode == GENERIC ? "Generic"
1181                                                       : "SPMD-Generic")
1182               : "SPMD"));
1183 
1184     CUstream Stream = getStream(DeviceId, AsyncInfo);
1185     Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1,
1186                          /* gridDimZ */ 1, CudaThreadsPerBlock,
1187                          /* blockDimY */ 1, /* blockDimZ */ 1,
1188                          /* sharedMemBytes */ 0, Stream, &Args[0], nullptr);
1189     if (!checkResult(Err, "Error returned from cuLaunchKernel\n"))
1190       return OFFLOAD_FAIL;
1191 
1192     DP("Launch of entry point at " DPxMOD " successful!\n",
1193        DPxPTR(TgtEntryPtr));
1194 
1195     return OFFLOAD_SUCCESS;
1196   }
1197 
1198   int synchronize(const int DeviceId, __tgt_async_info *AsyncInfo) const {
1199     CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue);
1200     CUresult Err = cuStreamSynchronize(Stream);
1201 
    // Once the stream is synchronized, return it to the stream pool and reset
1203     // AsyncInfo. This is to make sure the synchronization only works for its
1204     // own tasks.
1205     StreamManager->returnStream(DeviceId,
1206                                 reinterpret_cast<CUstream>(AsyncInfo->Queue));
1207     AsyncInfo->Queue = nullptr;
1208 
1209     if (Err != CUDA_SUCCESS) {
1210       DP("Error when synchronizing stream. stream = " DPxMOD
1211          ", async info ptr = " DPxMOD "\n",
1212          DPxPTR(Stream), DPxPTR(AsyncInfo));
1213       CUDA_ERR_STRING(Err);
1214     }
1215     return (Err == CUDA_SUCCESS) ? OFFLOAD_SUCCESS : OFFLOAD_FAIL;
1216   }
1217 
1218   void printDeviceInfo(int32_t device_id) {
1219     char TmpChar[1000];
1220     std::string TmpStr;
1221     size_t TmpSt;
1222     int TmpInt, TmpInt2, TmpInt3;
1223 
1224     CUdevice Device;
1225     checkResult(cuDeviceGet(&Device, device_id),
                "Error returned from cuDeviceGet\n");
1227 
1228     cuDriverGetVersion(&TmpInt);
1229     printf("    CUDA Driver Version: \t\t%d \n", TmpInt);
1230     printf("    CUDA Device Number: \t\t%d \n", device_id);
1231     checkResult(cuDeviceGetName(TmpChar, 1000, Device),
1232                 "Error returned from cuDeviceGetName\n");
1233     printf("    Device Name: \t\t\t%s \n", TmpChar);
1234     checkResult(cuDeviceTotalMem(&TmpSt, Device),
1235                 "Error returned from cuDeviceTotalMem\n");
1236     printf("    Global Memory Size: \t\t%zu bytes \n", TmpSt);
1237     checkResult(cuDeviceGetAttribute(
1238                     &TmpInt, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, Device),
1239                 "Error returned from cuDeviceGetAttribute\n");
1240     printf("    Number of Multiprocessors: \t\t%d \n", TmpInt);
1241     checkResult(
1242         cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, Device),
1243         "Error returned from cuDeviceGetAttribute\n");
1244     printf("    Concurrent Copy and Execution: \t%s \n", BOOL2TEXT(TmpInt));
1245     checkResult(cuDeviceGetAttribute(
1246                     &TmpInt, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, Device),
1247                 "Error returned from cuDeviceGetAttribute\n");
1248     printf("    Total Constant Memory: \t\t%d bytes\n", TmpInt);
1249     checkResult(
1250         cuDeviceGetAttribute(
1251             &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, Device),
1252         "Error returned from cuDeviceGetAttribute\n");
1253     printf("    Max Shared Memory per Block: \t%d bytes \n", TmpInt);
1254     checkResult(
1255         cuDeviceGetAttribute(
1256             &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, Device),
1257         "Error returned from cuDeviceGetAttribute\n");
1258     printf("    Registers per Block: \t\t%d \n", TmpInt);
1259     checkResult(
1260         cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device),
1261         "Error returned from cuDeviceGetAttribute\n");
1262     printf("    Warp Size: \t\t\t\t%d Threads \n", TmpInt);
1263     checkResult(cuDeviceGetAttribute(
1264                     &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, Device),
1265                 "Error returned from cuDeviceGetAttribute\n");
1266     printf("    Maximum Threads per Block: \t\t%d \n", TmpInt);
1267     checkResult(cuDeviceGetAttribute(
1268                     &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device),
1269                 "Error returned from cuDeviceGetAttribute\n");
1270     checkResult(cuDeviceGetAttribute(
1271                     &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, Device),
1272                 "Error returned from cuDeviceGetAttribute\n");
1273     checkResult(cuDeviceGetAttribute(
1274                     &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, Device),
1275                 "Error returned from cuDeviceGetAttribute\n");
1276     printf("    Maximum Block Dimensions: \t\t%d, %d, %d \n", TmpInt, TmpInt2,
1277            TmpInt3);
1278     checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, Device),
                "Error returned from cuDeviceGetAttribute\n");
    checkResult(cuDeviceGetAttribute(
                    &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, Device),
                "Error returned from cuDeviceGetAttribute\n");
    checkResult(cuDeviceGetAttribute(
                    &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Maximum Grid Dimensions: \t\t%d x %d x %d \n", TmpInt, TmpInt2,
           TmpInt3);
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_PITCH, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Maximum Memory Pitch: \t\t%d bytes \n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Texture Alignment: \t\t\t%d bytes \n", TmpInt);
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Clock Rate: \t\t\t%d kHz\n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Execution Timeout: \t\t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_INTEGRATED, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Integrated Device: \t\t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Can Map Host Memory: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, Device),
        "Error returned from cuDeviceGetAttribute\n");
    if (TmpInt == CU_COMPUTEMODE_DEFAULT)
      TmpStr = "DEFAULT";
    else if (TmpInt == CU_COMPUTEMODE_PROHIBITED)
      TmpStr = "PROHIBITED";
    else if (TmpInt == CU_COMPUTEMODE_EXCLUSIVE_PROCESS)
      TmpStr = "EXCLUSIVE PROCESS";
    else
      TmpStr = "unknown";
    printf("    Compute Mode: \t\t\t%s \n", TmpStr.c_str());
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Concurrent Kernels: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    ECC Enabled: \t\t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Memory Clock Rate: \t\t\t%d kHz\n", TmpInt);
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Memory Bus Width: \t\t\t%d bits\n", TmpInt);
    checkResult(cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE,
                                     Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    L2 Cache Size: \t\t\t%d bytes \n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
                    Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Max Threads Per SMP: \t\t%d \n", TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Async Engines: \t\t\t%s (%d) \n", BOOL2TEXT(TmpInt), TmpInt);
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Unified Addressing: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Managed Memory: \t\t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Concurrent Managed Memory: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Preemption Supported: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Cooperative Launch: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(cuDeviceGetAttribute(
                    &TmpInt, CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD, Device),
                "Error returned from cuDeviceGetAttribute\n");
    printf("    Multi-Device Boards: \t\t%s \n", BOOL2TEXT(TmpInt));
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, Device),
        "Error returned from cuDeviceGetAttribute\n");
    checkResult(
        cuDeviceGetAttribute(
            &TmpInt2, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, Device),
        "Error returned from cuDeviceGetAttribute\n");
    printf("    Compute Capabilities: \t\t%d%d \n", TmpInt, TmpInt2);
  }

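  // Make the stream associated with \p AsyncInfo wait on the event referred
  // to by \p EventPtr. Work enqueued on the stream after this call does not
  // start until the event has completed.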
  int waitEvent(const int DeviceId, __tgt_async_info *AsyncInfo,
                void *EventPtr) const {
    CUstream Stream = getStream(DeviceId, AsyncInfo);
    CUevent Event = reinterpret_cast<CUevent>(EventPtr);

    // We don't use CU_EVENT_WAIT_DEFAULT here as it is only available from a
    // specific CUDA version onwards, where it is defined as 0x0. In earlier
    // versions, per the CUDA API documentation, that argument has to be 0x0.
    CUresult Err = cuStreamWaitEvent(Stream, Event, 0);
    if (Err != CUDA_SUCCESS) {
      DP("Error when waiting for event. stream = " DPxMOD ", event = " DPxMOD
         "\n", DPxPTR(Stream), DPxPTR(Event));
      CUDA_ERR_STRING(Err);
      return OFFLOAD_FAIL;
    }

    return OFFLOAD_SUCCESS;
  }
};

DeviceRTLTy DeviceRTL;
} // namespace

// Exposed library API functions
#ifdef __cplusplus
extern "C" {
#endif

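// Report whether this plugin can handle the given device image by checking
// that its ELF header carries the CUDA machine value (EM_CUDA, 190).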
int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
  return elf_check_machine(image, /* EM_CUDA */ 190);
}

int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); }

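// Record the requirement flags reported by the application (from
// '#pragma omp requires') so the plugin can honor them later.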
int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
  DP("Init requires flags to %" PRId64 "\n", RequiresFlags);
  DeviceRTL.setRequiresFlag(RequiresFlags);
  return RequiresFlags;
}

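// Any two valid devices managed by this plugin are reported as able to
// exchange data with each other directly.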
int32_t __tgt_rtl_is_data_exchangable(int32_t src_dev_id, int32_t dst_dev_id) {
  if (DeviceRTL.isValidDeviceId(src_dev_id) &&
      DeviceRTL.isValidDeviceId(dst_dev_id))
    return 1;

  return 0;
}

int32_t __tgt_rtl_init_device(int32_t device_id) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.initDevice(device_id);
}

__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
                                          __tgt_device_image *image) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.loadBinary(device_id, image);
}

void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *,
                           int32_t kind) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.dataAlloc(device_id, size, (TargetAllocTy)kind);
}

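// The synchronous data-transfer entry points below are implemented on top of
// their *_async counterparts: the operation is enqueued on a fresh
// __tgt_async_info and then completed with a blocking synchronize.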
int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
                              int64_t size) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_data_submit_async(device_id, tgt_ptr, hst_ptr,
                                                 size, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &AsyncInfo);
}

int32_t __tgt_rtl_data_submit_async(int32_t device_id, void *tgt_ptr,
                                    void *hst_ptr, int64_t size,
                                    __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");

  return DeviceRTL.dataSubmit(device_id, tgt_ptr, hst_ptr, size,
                              async_info_ptr);
}

int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
                                int64_t size) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_data_retrieve_async(device_id, hst_ptr, tgt_ptr,
                                                   size, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &AsyncInfo);
}

int32_t __tgt_rtl_data_retrieve_async(int32_t device_id, void *hst_ptr,
                                      void *tgt_ptr, int64_t size,
                                      __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");

  return DeviceRTL.dataRetrieve(device_id, hst_ptr, tgt_ptr, size,
                                async_info_ptr);
}

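// Asynchronous device-to-device copy between two devices owned by this plugin.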
int32_t __tgt_rtl_data_exchange_async(int32_t src_dev_id, void *src_ptr,
                                      int32_t dst_dev_id, void *dst_ptr,
                                      int64_t size,
                                      __tgt_async_info *AsyncInfo) {
  assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
  assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");
  assert(AsyncInfo && "AsyncInfo is nullptr");

  return DeviceRTL.dataExchange(src_dev_id, src_ptr, dst_dev_id, dst_ptr, size,
                                AsyncInfo);
}

int32_t __tgt_rtl_data_exchange(int32_t src_dev_id, void *src_ptr,
                                int32_t dst_dev_id, void *dst_ptr,
                                int64_t size) {
  assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
  assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_data_exchange_async(
      src_dev_id, src_ptr, dst_dev_id, dst_ptr, size, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(src_dev_id, &AsyncInfo);
}

int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.dataDelete(device_id, tgt_ptr);
}

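// Launch the kernel for \p tgt_entry_ptr with an explicit number of teams and
// thread limit, blocking until the kernel has finished.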
int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
                                         void **tgt_args,
                                         ptrdiff_t *tgt_offsets,
                                         int32_t arg_num, int32_t team_num,
                                         int32_t thread_limit,
                                         uint64_t loop_tripcount) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_run_target_team_region_async(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
      thread_limit, loop_tripcount, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &AsyncInfo);
}

int32_t __tgt_rtl_run_target_team_region_async(
    int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
    ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
    int32_t thread_limit, uint64_t loop_tripcount,
    __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.runTargetTeamRegion(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
      thread_limit, loop_tripcount, async_info_ptr);
}

int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
                                    void **tgt_args, ptrdiff_t *tgt_offsets,
                                    int32_t arg_num) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info AsyncInfo;
  const int32_t rc = __tgt_rtl_run_target_region_async(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, &AsyncInfo);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &AsyncInfo);
}

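// A plain target region is launched as a team region with a single team and a
// thread limit of one.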
int32_t __tgt_rtl_run_target_region_async(int32_t device_id,
                                          void *tgt_entry_ptr, void **tgt_args,
                                          ptrdiff_t *tgt_offsets,
                                          int32_t arg_num,
                                          __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return __tgt_rtl_run_target_team_region_async(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num,
      /* team_num */ 1, /* thread_limit */ 1, /* loop_tripcount */ 0,
      async_info_ptr);
}

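// Block until all operations previously enqueued on the queue (CUDA stream)
// held by \p async_info_ptr have completed.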
int32_t __tgt_rtl_synchronize(int32_t device_id,
                              __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");
  assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr");

  return DeviceRTL.synchronize(device_id, async_info_ptr);
}

void __tgt_rtl_set_info_flag(uint32_t NewInfoLevel) {
  std::atomic<uint32_t> &InfoLevel = getInfoLevelInternal();
  InfoLevel.store(NewInfoLevel);
}

void __tgt_rtl_print_device_info(int32_t device_id) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  DeviceRTL.printDeviceInfo(device_id);
}

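// Event management entry points. The expected lifecycle is: create an event,
// record it on a stream, make other work wait on it (or synchronize with it
// directly on the host), and finally destroy it.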
int32_t __tgt_rtl_create_event(int32_t device_id, void **event) {
  assert(event && "event is nullptr");
  return createEvent(event);
}

int32_t __tgt_rtl_record_event(int32_t device_id, void *event_ptr,
                               __tgt_async_info *async_info_ptr) {
  assert(async_info_ptr && "async_info_ptr is nullptr");
  assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr");
  assert(event_ptr && "event_ptr is nullptr");

  return recordEvent(event_ptr, async_info_ptr);
}

int32_t __tgt_rtl_wait_event(int32_t device_id, void *event_ptr,
                             __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");
  assert(event_ptr && "event is nullptr");

  return DeviceRTL.waitEvent(device_id, async_info_ptr, event_ptr);
}

int32_t __tgt_rtl_sync_event(int32_t device_id, void *event_ptr) {
  assert(event_ptr && "event is nullptr");

  return syncEvent(event_ptr);
}

int32_t __tgt_rtl_destroy_event(int32_t device_id, void *event_ptr) {
  assert(event_ptr && "event is nullptr");

  return destroyEvent(event_ptr);
}

#ifdef __cplusplus
}
#endif