1 //===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // RTL for CUDA machine
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include <cassert>
14 #include <cstddef>
15 #include <cuda.h>
16 #include <list>
17 #include <memory>
18 #include <mutex>
19 #include <string>
20 #include <unordered_map>
21 #include <vector>
22 
23 #include "Debug.h"
24 #include "omptargetplugin.h"
25 
26 #define TARGET_NAME CUDA
27 #define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"
28 
29 #include "MemoryManager.h"
30 
31 // Utility for retrieving and printing CUDA error string.
32 #ifdef OMPTARGET_DEBUG
33 #define CUDA_ERR_STRING(err)                                                   \
34   do {                                                                         \
35     if (getDebugLevel() > 0) {                                                 \
36       const char *errStr = nullptr;                                            \
37       CUresult errStr_status = cuGetErrorString(err, &errStr);                 \
38       if (errStr_status == CUDA_ERROR_INVALID_VALUE)                           \
39         REPORT("Unrecognized CUDA error code: %d\n", err);                     \
40       else if (errStr_status == CUDA_SUCCESS)                                  \
41         REPORT("CUDA error is: %s\n", errStr);                                 \
42       else {                                                                   \
43         REPORT("Unresolved CUDA error code: %d\n", err);                       \
44         REPORT("Unsuccessful cuGetErrorString return status: %d\n",            \
45                errStr_status);                                                 \
46       }                                                                        \
47     } else {                                                                   \
48       const char *errStr = nullptr;                                            \
49       CUresult errStr_status = cuGetErrorString(err, &errStr);                 \
50       if (errStr_status == CUDA_SUCCESS)                                       \
51         REPORT("%s \n", errStr);                                               \
52     }                                                                          \
53   } while (false)
54 #else // OMPTARGET_DEBUG
55 #define CUDA_ERR_STRING(err)                                                   \
56   do {                                                                         \
57     const char *errStr = nullptr;                                              \
58     CUresult errStr_status = cuGetErrorString(err, &errStr);                   \
59     if (errStr_status == CUDA_SUCCESS)                                         \
60       REPORT("%s \n", errStr);                                                 \
61   } while (false)
62 #endif // OMPTARGET_DEBUG
63 
64 #include "elf_common.h"
65 
66 /// Keep entries table per device.
67 struct FuncOrGblEntryTy {
68   __tgt_target_table Table;
69   std::vector<__tgt_offload_entry> Entries;
70 };
71 
72 enum ExecutionModeType {
  SPMD, // constructors, destructors, and
        // combined constructs (`teams distribute parallel for [simd]`)
75   GENERIC, // everything else
76   NONE
77 };
78 
79 /// Use a single entity to encode a kernel and a set of flags.
80 struct KernelTy {
81   CUfunction Func;
82 
83   // execution mode of kernel
84   // 0 - SPMD mode (without master warp)
85   // 1 - Generic mode (with master warp)
86   int8_t ExecutionMode;
87 
88   /// Maximal number of threads per block for this kernel.
89   int MaxThreadsPerBlock = 0;
90 
91   KernelTy(CUfunction _Func, int8_t _ExecutionMode)
92       : Func(_Func), ExecutionMode(_ExecutionMode) {}
93 };
94 
95 /// Device environment data
/// Manually kept in sync with the deviceRTL side for now; move to a dedicated
/// header file later.
98 struct omptarget_device_environmentTy {
99   int32_t debug_level;
100 };
101 
102 namespace {
103 bool checkResult(CUresult Err, const char *ErrMsg) {
104   if (Err == CUDA_SUCCESS)
105     return true;
106 
107   REPORT("%s", ErrMsg);
108   CUDA_ERR_STRING(Err);
109   return false;
110 }
111 
112 int memcpyDtoD(const void *SrcPtr, void *DstPtr, int64_t Size,
113                CUstream Stream) {
114   CUresult Err =
115       cuMemcpyDtoDAsync((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, Size, Stream);
116 
117   if (Err != CUDA_SUCCESS) {
118     DP("Error when copying data from device to device. Pointers: src "
119        "= " DPxMOD ", dst = " DPxMOD ", size = %" PRId64 "\n",
120        DPxPTR(SrcPtr), DPxPTR(DstPtr), Size);
121     CUDA_ERR_STRING(Err);
122     return OFFLOAD_FAIL;
123   }
124 
125   return OFFLOAD_SUCCESS;
126 }
127 
/// Structure containing per-device data
129 struct DeviceDataTy {
130   /// List that contains all the kernels.
131   std::list<KernelTy> KernelsList;
132 
133   std::list<FuncOrGblEntryTy> FuncGblEntries;
134 
135   CUcontext Context = nullptr;
136   // Device properties
137   int ThreadsPerBlock = 0;
138   int BlocksPerGrid = 0;
139   int WarpSize = 0;
140   // OpenMP properties
141   int NumTeams = 0;
142   int NumThreads = 0;
143 };
144 
145 class StreamManagerTy {
146   int NumberOfDevices;
147   // The initial size of stream pool
148   int EnvNumInitialStreams;
149   // Per-device stream mutex
150   std::vector<std::unique_ptr<std::mutex>> StreamMtx;
151   // Per-device stream Id indicates the next available stream in the pool
152   std::vector<int> NextStreamId;
153   // Per-device stream pool
154   std::vector<std::vector<CUstream>> StreamPool;
155   // Reference to per-device data
156   std::vector<DeviceDataTy> &DeviceData;
157 
  // If there is no CUstream left in the pool, we resize the pool to allocate
  // more CUstreams. This function must be called with the per-device mutex
  // held, and the pool is never shrunk.
161   void resizeStreamPool(const int DeviceId, const size_t NewSize) {
162     std::vector<CUstream> &Pool = StreamPool[DeviceId];
163     const size_t CurrentSize = Pool.size();
164     assert(NewSize > CurrentSize && "new size is not larger than current size");
165 
166     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
167     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) {
      // Return early if we cannot switch to the right context, to avoid
      // creating a bunch of streams that do not belong to the right device.
      // The offloading will fail later because the selected CUstream is
      // nullptr.
172       return;
173     }
174 
175     Pool.resize(NewSize, nullptr);
176 
177     for (size_t I = CurrentSize; I < NewSize; ++I) {
178       checkResult(cuStreamCreate(&Pool[I], CU_STREAM_NON_BLOCKING),
179                   "Error returned from cuStreamCreate\n");
180     }
181   }
182 
183 public:
184   StreamManagerTy(const int NumberOfDevices,
185                   std::vector<DeviceDataTy> &DeviceData)
186       : NumberOfDevices(NumberOfDevices), EnvNumInitialStreams(32),
187         DeviceData(DeviceData) {
188     StreamPool.resize(NumberOfDevices);
189     NextStreamId.resize(NumberOfDevices);
190     StreamMtx.resize(NumberOfDevices);
191 
192     if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS"))
193       EnvNumInitialStreams = std::stoi(EnvStr);
194 
195     // Initialize the next stream id
196     std::fill(NextStreamId.begin(), NextStreamId.end(), 0);
197 
198     // Initialize stream mutex
199     for (std::unique_ptr<std::mutex> &Ptr : StreamMtx)
200       Ptr = std::make_unique<std::mutex>();
201   }
202 
203   ~StreamManagerTy() {
204     // Destroy streams
205     for (int I = 0; I < NumberOfDevices; ++I) {
206       checkResult(cuCtxSetCurrent(DeviceData[I].Context),
207                   "Error returned from cuCtxSetCurrent\n");
208 
209       for (CUstream &S : StreamPool[I]) {
210         if (S)
211           checkResult(cuStreamDestroy(S),
212                       "Error returned from cuStreamDestroy\n");
213       }
214     }
215   }
216 
  // Get a CUstream from the pool. The per-device next stream id always points
  // to the next available CUstream. That is, CUstreams [0, id-1] have been
  // assigned and [id, ...) are still available. If there is no CUstream left,
  // we request more CUstreams from the CUDA RT. Each time a CUstream is
  // assigned, the id is increased by one.
222   // xxxxxs+++++++++
223   //      ^
224   //      id
225   // After assignment, the pool becomes the following and s is assigned.
226   // xxxxxs+++++++++
227   //       ^
228   //       id
229   CUstream getStream(const int DeviceId) {
230     const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
231     int &Id = NextStreamId[DeviceId];
    // No CUstream left in the pool; request more from the CUDA RT
233     if (Id == static_cast<int>(StreamPool[DeviceId].size())) {
234       // By default we double the stream pool every time
235       resizeStreamPool(DeviceId, Id * 2);
236     }
237     return StreamPool[DeviceId][Id++];
238   }
239 
  // Return a CUstream back to the pool. As mentioned above, the per-device
  // next stream id always points to the next available CUstream, so when we
  // return a CUstream, we first decrease the id and then copy the CUstream
  // back into that slot.
  // Note that streams may be returned in a different order than they were
  // assigned, so at some point the pool may contain two identical CUstreams.
247   // xxax+a+++++
248   //     ^
249   //     id
  // However, this does not matter because the two copies are always on
  // opposite sides of id: the left one will eventually be overwritten by
  // another returned CUstream. Therefore, after several executions, the order
  // of the pool may differ from its initial state.
254   void returnStream(const int DeviceId, CUstream Stream) {
255     const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
256     int &Id = NextStreamId[DeviceId];
257     assert(Id > 0 && "Wrong stream ID");
258     StreamPool[DeviceId][--Id] = Stream;
259   }
260 
261   bool initializeDeviceStreamPool(const int DeviceId) {
262     assert(StreamPool[DeviceId].empty() && "stream pool has been initialized");
263 
264     resizeStreamPool(DeviceId, EnvNumInitialStreams);
265 
266     // Check the size of stream pool
267     if (static_cast<int>(StreamPool[DeviceId].size()) != EnvNumInitialStreams)
268       return false;
269 
270     // Check whether each stream is valid
271     for (CUstream &S : StreamPool[DeviceId])
272       if (!S)
273         return false;
274 
275     return true;
276   }
277 };
278 
279 class DeviceRTLTy {
280   int NumberOfDevices;
281   // OpenMP environment properties
282   int EnvNumTeams;
283   int EnvTeamLimit;
284   // OpenMP requires flags
285   int64_t RequiresFlags;
286 
287   static constexpr const int HardTeamLimit = 1U << 16U; // 64k
288   static constexpr const int HardThreadLimit = 1024;
289   static constexpr const int DefaultNumTeams = 128;
290   static constexpr const int DefaultNumThreads = 128;
291 
292   std::unique_ptr<StreamManagerTy> StreamManager;
293   std::vector<DeviceDataTy> DeviceData;
294   std::vector<CUmodule> Modules;
295 
  /// A class responsible for interacting with the device's native runtime
  /// library to allocate and free memory.
298   class CUDADeviceAllocatorTy : public DeviceAllocatorTy {
299     const int DeviceId;
300     const std::vector<DeviceDataTy> &DeviceData;
301     std::unordered_map<void *, TargetAllocTy> HostPinnedAllocs;
302 
303   public:
304     CUDADeviceAllocatorTy(int DeviceId, std::vector<DeviceDataTy> &DeviceData)
305         : DeviceId(DeviceId), DeviceData(DeviceData) {}
306 
307     void *allocate(size_t Size, void *, TargetAllocTy Kind) override {
308       if (Size == 0)
309         return nullptr;
310 
311       CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
312       if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
313         return nullptr;
314 
315       void *MemAlloc = nullptr;
316       switch (Kind) {
317       case TARGET_ALLOC_DEFAULT:
318       case TARGET_ALLOC_DEVICE:
319         CUdeviceptr DevicePtr;
320         Err = cuMemAlloc(&DevicePtr, Size);
321         MemAlloc = (void *)DevicePtr;
322         if (!checkResult(Err, "Error returned from cuMemAlloc\n"))
323           return nullptr;
324         break;
325       case TARGET_ALLOC_HOST:
326         void *HostPtr;
327         Err = cuMemAllocHost(&HostPtr, Size);
328         MemAlloc = HostPtr;
329         if (!checkResult(Err, "Error returned from cuMemAllocHost\n"))
330           return nullptr;
331         HostPinnedAllocs[MemAlloc] = Kind;
332         break;
333       case TARGET_ALLOC_SHARED:
334         CUdeviceptr SharedPtr;
335         Err = cuMemAllocManaged(&SharedPtr, Size, CU_MEM_ATTACH_GLOBAL);
336         MemAlloc = (void *)SharedPtr;
337         if (!checkResult(Err, "Error returned from cuMemAllocManaged\n"))
338           return nullptr;
339         break;
340       }
341 
342       return MemAlloc;
343     }
344 
345     int free(void *TgtPtr) override {
346       CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
347       if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
348         return OFFLOAD_FAIL;
349 
350       // Host pinned memory must be freed differently.
351       TargetAllocTy Kind =
352           (HostPinnedAllocs.find(TgtPtr) == HostPinnedAllocs.end())
353               ? TARGET_ALLOC_DEFAULT
354               : TARGET_ALLOC_HOST;
355       switch (Kind) {
356       case TARGET_ALLOC_DEFAULT:
357       case TARGET_ALLOC_DEVICE:
358       case TARGET_ALLOC_SHARED:
359         Err = cuMemFree((CUdeviceptr)TgtPtr);
360         if (!checkResult(Err, "Error returned from cuMemFree\n"))
361           return OFFLOAD_FAIL;
362         break;
363       case TARGET_ALLOC_HOST:
364         Err = cuMemFreeHost(TgtPtr);
365         if (!checkResult(Err, "Error returned from cuMemFreeHost\n"))
366           return OFFLOAD_FAIL;
367         break;
368       }
369 
370       return OFFLOAD_SUCCESS;
371     }
372   };
373 
374   /// A vector of device allocators
375   std::vector<CUDADeviceAllocatorTy> DeviceAllocators;
376 
  /// A vector of memory managers. Since memory managers are non-copyable and
  /// non-removable, we wrap them in std::unique_ptr.
379   std::vector<std::unique_ptr<MemoryManagerTy>> MemoryManagers;
380 
  /// Whether to use the memory manager
382   bool UseMemoryManager = true;
383 
  // Record an entry point associated with the device
385   void addOffloadEntry(const int DeviceId, const __tgt_offload_entry entry) {
386     FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
387     E.Entries.push_back(entry);
388   }
389 
  // Return a pointer to the entry associated with the given address
391   const __tgt_offload_entry *getOffloadEntry(const int DeviceId,
392                                              const void *Addr) const {
393     for (const __tgt_offload_entry &Itr :
394          DeviceData[DeviceId].FuncGblEntries.back().Entries)
395       if (Itr.addr == Addr)
396         return &Itr;
397 
398     return nullptr;
399   }
400 
401   // Return the pointer to the target entries table
402   __tgt_target_table *getOffloadEntriesTable(const int DeviceId) {
403     FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
404 
405     if (E.Entries.empty())
406       return nullptr;
407 
408     // Update table info according to the entries and return the pointer
409     E.Table.EntriesBegin = E.Entries.data();
410     E.Table.EntriesEnd = E.Entries.data() + E.Entries.size();
411 
412     return &E.Table;
413   }
414 
415   // Clear entries table for a device
416   void clearOffloadEntriesTable(const int DeviceId) {
417     DeviceData[DeviceId].FuncGblEntries.emplace_back();
418     FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
419     E.Entries.clear();
420     E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr;
421   }
422 
423   CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfo) const {
424     assert(AsyncInfo && "AsyncInfo is nullptr");
425 
426     if (!AsyncInfo->Queue)
427       AsyncInfo->Queue = StreamManager->getStream(DeviceId);
428 
429     return reinterpret_cast<CUstream>(AsyncInfo->Queue);
430   }
431 
432 public:
  // This class should not be copied or moved
434   DeviceRTLTy(const DeviceRTLTy &) = delete;
435   DeviceRTLTy(DeviceRTLTy &&) = delete;
436 
437   DeviceRTLTy()
438       : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1),
439         RequiresFlags(OMP_REQ_UNDEFINED) {
440 
441     DP("Start initializing CUDA\n");
442 
443     CUresult Err = cuInit(0);
444     if (Err == CUDA_ERROR_INVALID_HANDLE) {
445       // Can't call cuGetErrorString if dlsym failed
446       DP("Failed to load CUDA shared library\n");
447       return;
448     }
449     if (!checkResult(Err, "Error returned from cuInit\n")) {
450       return;
451     }
452 
453     Err = cuDeviceGetCount(&NumberOfDevices);
454     if (!checkResult(Err, "Error returned from cuDeviceGetCount\n"))
455       return;
456 
457     if (NumberOfDevices == 0) {
458       DP("There are no devices supporting CUDA.\n");
459       return;
460     }
461 
462     DeviceData.resize(NumberOfDevices);
463 
464     // Get environment variables regarding teams
465     if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) {
466       // OMP_TEAM_LIMIT has been set
467       EnvTeamLimit = std::stoi(EnvStr);
468       DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
469     }
470     if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) {
471       // OMP_NUM_TEAMS has been set
472       EnvNumTeams = std::stoi(EnvStr);
473       DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
474     }
475 
476     StreamManager =
477         std::make_unique<StreamManagerTy>(NumberOfDevices, DeviceData);
478 
479     for (int I = 0; I < NumberOfDevices; ++I)
480       DeviceAllocators.emplace_back(I, DeviceData);
481 
482     // Get the size threshold from environment variable
483     std::pair<size_t, bool> Res = MemoryManagerTy::getSizeThresholdFromEnv();
484     UseMemoryManager = Res.second;
485     size_t MemoryManagerThreshold = Res.first;
486 
487     if (UseMemoryManager)
488       for (int I = 0; I < NumberOfDevices; ++I)
489         MemoryManagers.emplace_back(std::make_unique<MemoryManagerTy>(
490             DeviceAllocators[I], MemoryManagerThreshold));
491   }
492 
493   ~DeviceRTLTy() {
    // We destroy the memory managers first, in case data they depend on is
    // destroyed before them.
496     for (auto &M : MemoryManagers)
      M.reset();
498 
499     StreamManager = nullptr;
500 
501     for (CUmodule &M : Modules)
502       // Close module
503       if (M)
504         checkResult(cuModuleUnload(M), "Error returned from cuModuleUnload\n");
505 
506     for (DeviceDataTy &D : DeviceData) {
507       // Destroy context
508       if (D.Context) {
509         checkResult(cuCtxSetCurrent(D.Context),
510                     "Error returned from cuCtxSetCurrent\n");
511         CUdevice Device;
512         checkResult(cuCtxGetDevice(&Device),
513                     "Error returned from cuCtxGetDevice\n");
514         checkResult(cuDevicePrimaryCtxRelease(Device),
515                     "Error returned from cuDevicePrimaryCtxRelease\n");
516       }
517     }
518   }
519 
520   // Check whether a given DeviceId is valid
521   bool isValidDeviceId(const int DeviceId) const {
522     return DeviceId >= 0 && DeviceId < NumberOfDevices;
523   }
524 
525   int getNumOfDevices() const { return NumberOfDevices; }
526 
527   void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; }
528 
529   int initDevice(const int DeviceId) {
530     CUdevice Device;
531 
532     DP("Getting device %d\n", DeviceId);
533     CUresult Err = cuDeviceGet(&Device, DeviceId);
534     if (!checkResult(Err, "Error returned from cuDeviceGet\n"))
535       return OFFLOAD_FAIL;
536 
537     // Query the current flags of the primary context and set its flags if
538     // it is inactive
539     unsigned int FormerPrimaryCtxFlags = 0;
540     int FormerPrimaryCtxIsActive = 0;
541     Err = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags,
542                                      &FormerPrimaryCtxIsActive);
543     if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxGetState\n"))
544       return OFFLOAD_FAIL;
545 
546     if (FormerPrimaryCtxIsActive) {
547       DP("The primary context is active, no change to its flags\n");
548       if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) !=
549           CU_CTX_SCHED_BLOCKING_SYNC)
550         DP("Warning the current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n");
551     } else {
552       DP("The primary context is inactive, set its flags to "
553          "CU_CTX_SCHED_BLOCKING_SYNC\n");
554       Err = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC);
555       if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxSetFlags\n"))
556         return OFFLOAD_FAIL;
557     }
558 
    // Retain the per-device primary context and save it to use whenever this
    // device is selected.
561     Err = cuDevicePrimaryCtxRetain(&DeviceData[DeviceId].Context, Device);
562     if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxRetain\n"))
563       return OFFLOAD_FAIL;
564 
565     Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
566     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
567       return OFFLOAD_FAIL;
568 
569     // Initialize stream pool
570     if (!StreamManager->initializeDeviceStreamPool(DeviceId))
571       return OFFLOAD_FAIL;
572 
573     // Query attributes to determine number of threads/block and blocks/grid.
574     int MaxGridDimX;
575     Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
576                                Device);
577     if (Err != CUDA_SUCCESS) {
578       DP("Error getting max grid dimension, use default value %d\n",
579          DeviceRTLTy::DefaultNumTeams);
580       DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::DefaultNumTeams;
581     } else if (MaxGridDimX <= DeviceRTLTy::HardTeamLimit) {
582       DP("Using %d CUDA blocks per grid\n", MaxGridDimX);
583       DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX;
584     } else {
585       DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping "
586          "at the hard limit\n",
587          MaxGridDimX, DeviceRTLTy::HardTeamLimit);
588       DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::HardTeamLimit;
589     }
590 
591     // We are only exploiting threads along the x axis.
592     int MaxBlockDimX;
593     Err = cuDeviceGetAttribute(&MaxBlockDimX,
594                                CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device);
595     if (Err != CUDA_SUCCESS) {
596       DP("Error getting max block dimension, use default value %d\n",
597          DeviceRTLTy::DefaultNumThreads);
598       DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads;
599     } else if (MaxBlockDimX <= DeviceRTLTy::HardThreadLimit) {
600       DP("Using %d CUDA threads per block\n", MaxBlockDimX);
601       DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX;
602     } else {
603       DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
604          "capping at the hard limit\n",
605          MaxBlockDimX, DeviceRTLTy::HardThreadLimit);
606       DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit;
607     }
608 
609     // Get and set warp size
610     int WarpSize;
611     Err =
612         cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device);
613     if (Err != CUDA_SUCCESS) {
614       DP("Error getting warp size, assume default value 32\n");
615       DeviceData[DeviceId].WarpSize = 32;
616     } else {
617       DP("Using warp size %d\n", WarpSize);
618       DeviceData[DeviceId].WarpSize = WarpSize;
619     }
620 
621     // Adjust teams to the env variables
622     if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) {
623       DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
624          EnvTeamLimit);
625       DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit;
626     }
627 
628     INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
629          "Device supports up to %d CUDA blocks and %d threads with a "
630          "warp size of %d\n",
631          DeviceData[DeviceId].BlocksPerGrid,
632          DeviceData[DeviceId].ThreadsPerBlock, DeviceData[DeviceId].WarpSize);
633 
634     // Set default number of teams
635     if (EnvNumTeams > 0) {
636       DP("Default number of teams set according to environment %d\n",
637          EnvNumTeams);
638       DeviceData[DeviceId].NumTeams = EnvNumTeams;
639     } else {
640       DeviceData[DeviceId].NumTeams = DeviceRTLTy::DefaultNumTeams;
641       DP("Default number of teams set according to library's default %d\n",
642          DeviceRTLTy::DefaultNumTeams);
643     }
644 
645     if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) {
646       DP("Default number of teams exceeds device limit, capping at %d\n",
647          DeviceData[DeviceId].BlocksPerGrid);
648       DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid;
649     }
650 
651     // Set default number of threads
652     DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads;
653     DP("Default number of threads set according to library's default %d\n",
654        DeviceRTLTy::DefaultNumThreads);
655     if (DeviceData[DeviceId].NumThreads >
656         DeviceData[DeviceId].ThreadsPerBlock) {
657       DP("Default number of threads exceeds device limit, capping at %d\n",
658          DeviceData[DeviceId].ThreadsPerBlock);
      DeviceData[DeviceId].NumThreads = DeviceData[DeviceId].ThreadsPerBlock;
660     }
661 
662     return OFFLOAD_SUCCESS;
663   }
664 
665   __tgt_target_table *loadBinary(const int DeviceId,
666                                  const __tgt_device_image *Image) {
667     // Set the context we are using
668     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
669     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
670       return nullptr;
671 
672     // Clear the offload table as we are going to create a new one.
673     clearOffloadEntriesTable(DeviceId);
674 
675     // Create the module and extract the function pointers.
676     CUmodule Module;
677     DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart));
678     Err = cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr);
679     if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n"))
680       return nullptr;
681 
682     DP("CUDA module successfully loaded!\n");
683 
684     Modules.push_back(Module);
685 
686     // Find the symbols in the module by name.
687     const __tgt_offload_entry *HostBegin = Image->EntriesBegin;
688     const __tgt_offload_entry *HostEnd = Image->EntriesEnd;
689 
690     std::list<KernelTy> &KernelsList = DeviceData[DeviceId].KernelsList;
691     for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) {
692       if (!E->addr) {
        // We return nullptr when something like this happens: the host should
        // always have something in the address field to uniquely identify the
        // target region.
696         DP("Invalid binary: host entry '<null>' (size = %zd)...\n", E->size);
697         return nullptr;
698       }
699 
700       if (E->size) {
701         __tgt_offload_entry Entry = *E;
702         CUdeviceptr CUPtr;
703         size_t CUSize;
704         Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name);
705         // We keep this style here because we need the name
706         if (Err != CUDA_SUCCESS) {
707           REPORT("Loading global '%s' Failed\n", E->name);
708           CUDA_ERR_STRING(Err);
709           return nullptr;
710         }
711 
712         if (CUSize != E->size) {
713           DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name,
714              CUSize, E->size);
715           return nullptr;
716         }
717 
718         DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
719            DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr));
720 
721         Entry.addr = (void *)(CUPtr);
722 
723         // Note: In the current implementation declare target variables
724         // can either be link or to. This means that once unified
725         // memory is activated via the requires directive, the variable
726         // can be used directly from the host in both cases.
        // TODO: when variable types other than to or link are added,
        // the below condition should be changed to explicitly
        // check for to and link variable types:
730         // (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags &
731         // OMP_DECLARE_TARGET_LINK || e->flags == OMP_DECLARE_TARGET_TO))
732         if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
733           // If unified memory is present any target link or to variables
734           // can access host addresses directly. There is no longer a
735           // need for device copies.
736           cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *));
737           DP("Copy linked variable host address (" DPxMOD
738              ") to device address (" DPxMOD ")\n",
739              DPxPTR(*((void **)E->addr)), DPxPTR(CUPtr));
740         }
741 
742         addOffloadEntry(DeviceId, Entry);
743 
744         continue;
745       }
746 
747       CUfunction Func;
748       Err = cuModuleGetFunction(&Func, Module, E->name);
749       // We keep this style here because we need the name
750       if (Err != CUDA_SUCCESS) {
751         REPORT("Loading '%s' Failed\n", E->name);
752         CUDA_ERR_STRING(Err);
753         return nullptr;
754       }
755 
756       DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
757          DPxPTR(E - HostBegin), E->name, DPxPTR(Func));
758 
      // Default value is GENERIC (in case the symbol is missing from the
      // cubin file)
760       int8_t ExecModeVal = ExecutionModeType::GENERIC;
761       std::string ExecModeNameStr(E->name);
762       ExecModeNameStr += "_exec_mode";
763       const char *ExecModeName = ExecModeNameStr.c_str();
764 
765       CUdeviceptr ExecModePtr;
766       size_t CUSize;
767       Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName);
768       if (Err == CUDA_SUCCESS) {
769         if (CUSize != sizeof(int8_t)) {
770           DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
771              ExecModeName, CUSize, sizeof(int8_t));
772           return nullptr;
773         }
774 
775         Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize);
776         if (Err != CUDA_SUCCESS) {
777           REPORT("Error when copying data from device to host. Pointers: "
778                  "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
779                  DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize);
780           CUDA_ERR_STRING(Err);
781           return nullptr;
782         }
783 
784         if (ExecModeVal < 0 || ExecModeVal > 1) {
785           DP("Error wrong exec_mode value specified in cubin file: %d\n",
786              ExecModeVal);
787           return nullptr;
788         }
789       } else {
790         REPORT("Loading global exec_mode '%s' - symbol missing, using default "
791                "value GENERIC (1)\n",
792                ExecModeName);
793         CUDA_ERR_STRING(Err);
794       }
795 
796       KernelsList.emplace_back(Func, ExecModeVal);
797 
798       __tgt_offload_entry Entry = *E;
799       Entry.addr = &KernelsList.back();
800       addOffloadEntry(DeviceId, Entry);
801     }
802 
    // Send the device environment data to the device
804     {
805       omptarget_device_environmentTy DeviceEnv{0};
806 
807 #ifdef OMPTARGET_DEBUG
808       if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG"))
809         DeviceEnv.debug_level = std::stoi(EnvStr);
810 #endif
811 
812       const char *DeviceEnvName = "omptarget_device_environment";
813       CUdeviceptr DeviceEnvPtr;
814       size_t CUSize;
815 
816       Err = cuModuleGetGlobal(&DeviceEnvPtr, &CUSize, Module, DeviceEnvName);
817       if (Err == CUDA_SUCCESS) {
818         if (CUSize != sizeof(DeviceEnv)) {
          REPORT(
              "Global device_environment '%s' - size mismatch (%zu != %zu)\n",
              DeviceEnvName, CUSize, sizeof(DeviceEnv));
823           return nullptr;
824         }
825 
826         Err = cuMemcpyHtoD(DeviceEnvPtr, &DeviceEnv, CUSize);
827         if (Err != CUDA_SUCCESS) {
828           REPORT("Error when copying data from host to device. Pointers: "
829                  "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
830                  DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize);
831           CUDA_ERR_STRING(Err);
832           return nullptr;
833         }
834 
835         DP("Sending global device environment data %zu bytes\n", CUSize);
836       } else {
837         DP("Finding global device environment '%s' - symbol missing.\n",
838            DeviceEnvName);
839         DP("Continue, considering this is a device RTL which does not accept "
840            "environment setting.\n");
841       }
842     }
843 
844     return getOffloadEntriesTable(DeviceId);
845   }
846 
847   void *dataAlloc(const int DeviceId, const int64_t Size,
848                   const TargetAllocTy Kind) {
849     switch (Kind) {
850     case TARGET_ALLOC_DEFAULT:
851     case TARGET_ALLOC_DEVICE:
852       if (UseMemoryManager)
853         return MemoryManagers[DeviceId]->allocate(Size, nullptr);
854       else
855         return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
856     case TARGET_ALLOC_HOST:
857     case TARGET_ALLOC_SHARED:
858       return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
859     }
860 
861     REPORT("Invalid target data allocation kind or requested allocator not "
862            "implemented yet\n");
863 
864     return nullptr;
865   }
866 
867   int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr,
868                  const int64_t Size, __tgt_async_info *AsyncInfo) const {
869     assert(AsyncInfo && "AsyncInfo is nullptr");
870 
871     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
872     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
873       return OFFLOAD_FAIL;
874 
875     CUstream Stream = getStream(DeviceId, AsyncInfo);
876 
877     Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream);
878     if (Err != CUDA_SUCCESS) {
879       DP("Error when copying data from host to device. Pointers: host "
880          "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
881          DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
882       CUDA_ERR_STRING(Err);
883       return OFFLOAD_FAIL;
884     }
885 
886     return OFFLOAD_SUCCESS;
887   }
888 
889   int dataRetrieve(const int DeviceId, void *HstPtr, const void *TgtPtr,
890                    const int64_t Size, __tgt_async_info *AsyncInfo) const {
891     assert(AsyncInfo && "AsyncInfo is nullptr");
892 
893     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
894     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
895       return OFFLOAD_FAIL;
896 
897     CUstream Stream = getStream(DeviceId, AsyncInfo);
898 
899     Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
900     if (Err != CUDA_SUCCESS) {
901       DP("Error when copying data from device to host. Pointers: host "
902          "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
903          DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
904       CUDA_ERR_STRING(Err);
905       return OFFLOAD_FAIL;
906     }
907 
908     return OFFLOAD_SUCCESS;
909   }
910 
911   int dataExchange(int SrcDevId, const void *SrcPtr, int DstDevId, void *DstPtr,
912                    int64_t Size, __tgt_async_info *AsyncInfo) const {
913     assert(AsyncInfo && "AsyncInfo is nullptr");
914 
915     CUresult Err = cuCtxSetCurrent(DeviceData[SrcDevId].Context);
916     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
917       return OFFLOAD_FAIL;
918 
919     CUstream Stream = getStream(SrcDevId, AsyncInfo);
920 
    // If these are two different devices, we try a peer-to-peer copy first
922     if (SrcDevId != DstDevId) {
923       int CanAccessPeer = 0;
924       Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId);
925       if (Err != CUDA_SUCCESS) {
926         REPORT("Error returned from cuDeviceCanAccessPeer. src = %" PRId32
927                ", dst = %" PRId32 "\n",
928                SrcDevId, DstDevId);
929         CUDA_ERR_STRING(Err);
930         return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
931       }
932 
933       if (!CanAccessPeer) {
934         DP("P2P memcpy not supported so fall back to D2D memcpy");
935         return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
936       }
937 
938       Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0);
939       if (Err != CUDA_SUCCESS) {
940         REPORT("Error returned from cuCtxEnablePeerAccess. src = %" PRId32
941                ", dst = %" PRId32 "\n",
942                SrcDevId, DstDevId);
943         CUDA_ERR_STRING(Err);
944         return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
945       }
946 
947       Err = cuMemcpyPeerAsync((CUdeviceptr)DstPtr, DeviceData[DstDevId].Context,
948                               (CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context,
949                               Size, Stream);
950       if (Err == CUDA_SUCCESS)
951         return OFFLOAD_SUCCESS;
952 
953       DP("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD
954          ", src_id =%" PRId32 ", dst_ptr = " DPxMOD ", dst_id =%" PRId32 "\n",
955          DPxPTR(SrcPtr), SrcDevId, DPxPTR(DstPtr), DstDevId);
956       CUDA_ERR_STRING(Err);
957     }
958 
959     return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
960   }
961 
962   int dataDelete(const int DeviceId, void *TgtPtr) {
963     if (UseMemoryManager)
964       return MemoryManagers[DeviceId]->free(TgtPtr);
965 
966     return DeviceAllocators[DeviceId].free(TgtPtr);
967   }
968 
969   int runTargetTeamRegion(const int DeviceId, void *TgtEntryPtr, void **TgtArgs,
970                           ptrdiff_t *TgtOffsets, const int ArgNum,
971                           const int TeamNum, const int ThreadLimit,
972                           const unsigned int LoopTripCount,
973                           __tgt_async_info *AsyncInfo) const {
974     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
975     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
976       return OFFLOAD_FAIL;
977 
978     // All args are references.
979     std::vector<void *> Args(ArgNum);
980     std::vector<void *> Ptrs(ArgNum);
981 
982     for (int I = 0; I < ArgNum; ++I) {
983       Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
984       Args[I] = &Ptrs[I];
985     }
986 
987     KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr);
988 
989     int CudaThreadsPerBlock;
990     if (ThreadLimit > 0) {
991       DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
992       CudaThreadsPerBlock = ThreadLimit;
993       // Add master warp if necessary
994       if (KernelInfo->ExecutionMode == GENERIC) {
995         DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize);
996         CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
997       }
998     } else {
999       DP("Setting CUDA threads per block to default %d\n",
1000          DeviceData[DeviceId].NumThreads);
1001       CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads;
1002     }
1003 
1004     if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) {
1005       DP("Threads per block capped at device limit %d\n",
1006          DeviceData[DeviceId].ThreadsPerBlock);
1007       CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock;
1008     }
1009 
1010     if (!KernelInfo->MaxThreadsPerBlock) {
1011       Err = cuFuncGetAttribute(&KernelInfo->MaxThreadsPerBlock,
1012                                CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
1013                                KernelInfo->Func);
1014       if (!checkResult(Err, "Error returned from cuFuncGetAttribute\n"))
1015         return OFFLOAD_FAIL;
1016     }
1017 
1018     if (KernelInfo->MaxThreadsPerBlock < CudaThreadsPerBlock) {
1019       DP("Threads per block capped at kernel limit %d\n",
1020          KernelInfo->MaxThreadsPerBlock);
1021       CudaThreadsPerBlock = KernelInfo->MaxThreadsPerBlock;
1022     }
1023 
1024     unsigned int CudaBlocksPerGrid;
1025     if (TeamNum <= 0) {
1026       if (LoopTripCount > 0 && EnvNumTeams < 0) {
1027         if (KernelInfo->ExecutionMode == SPMD) {
1028           // We have a combined construct, i.e. `target teams distribute
          // parallel for [simd]`. We launch enough teams so that each thread
          // executes one iteration of the loop, rounding up to the nearest
          // integer.
1032           CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1;
1033         } else {
1034           // If we reach this point, then we have a non-combined construct, i.e.
1035           // `teams distribute` with a nested `parallel for` and each team is
1036           // assigned one iteration of the `distribute` loop. E.g.:
1037           //
1038           // #pragma omp target teams distribute
1039           // for(...loop_tripcount...) {
1040           //   #pragma omp parallel for
1041           //   for(...) {}
1042           // }
1043           //
1044           // Threads within a team will execute the iterations of the `parallel`
1045           // loop.
1046           CudaBlocksPerGrid = LoopTripCount;
1047         }
1048         DP("Using %d teams due to loop trip count %" PRIu32
1049            " and number of threads per block %d\n",
1050            CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock);
1051       } else {
1052         DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams);
1053         CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams;
1054       }
1055     } else if (TeamNum > DeviceData[DeviceId].BlocksPerGrid) {
1056       DP("Capping number of teams to team limit %d\n",
1057          DeviceData[DeviceId].BlocksPerGrid);
1058       CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid;
1059     } else {
1060       DP("Using requested number of teams %d\n", TeamNum);
1061       CudaBlocksPerGrid = TeamNum;
1062     }
1063 
1064     INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
1065          "Launching kernel %s with %d blocks and %d threads in %s "
1066          "mode\n",
1067          (getOffloadEntry(DeviceId, TgtEntryPtr))
1068              ? getOffloadEntry(DeviceId, TgtEntryPtr)->name
1069              : "(null)",
1070          CudaBlocksPerGrid, CudaThreadsPerBlock,
1071          (KernelInfo->ExecutionMode == SPMD) ? "SPMD" : "Generic");
1072 
1073     CUstream Stream = getStream(DeviceId, AsyncInfo);
1074     Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1,
1075                          /* gridDimZ */ 1, CudaThreadsPerBlock,
1076                          /* blockDimY */ 1, /* blockDimZ */ 1,
1077                          /* sharedMemBytes */ 0, Stream, &Args[0], nullptr);
1078     if (!checkResult(Err, "Error returned from cuLaunchKernel\n"))
1079       return OFFLOAD_FAIL;
1080 
1081     DP("Launch of entry point at " DPxMOD " successful!\n",
1082        DPxPTR(TgtEntryPtr));
1083 
1084     return OFFLOAD_SUCCESS;
1085   }
1086 
1087   int synchronize(const int DeviceId, __tgt_async_info *AsyncInfo) const {
1088     CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue);
1089     CUresult Err = cuStreamSynchronize(Stream);
1090 
    // Once the stream is synchronized, return it to the stream pool and reset
    // AsyncInfo. This makes sure the synchronization only covers its own
    // tasks.
1094     StreamManager->returnStream(DeviceId,
1095                                 reinterpret_cast<CUstream>(AsyncInfo->Queue));
1096     AsyncInfo->Queue = nullptr;
1097 
1098     if (Err != CUDA_SUCCESS) {
1099       DP("Error when synchronizing stream. stream = " DPxMOD
1100          ", async info ptr = " DPxMOD "\n",
1101          DPxPTR(Stream), DPxPTR(AsyncInfo));
1102       CUDA_ERR_STRING(Err);
1103     }
1104     return (Err == CUDA_SUCCESS) ? OFFLOAD_SUCCESS : OFFLOAD_FAIL;
1105   }
1106 };
1107 
1108 DeviceRTLTy DeviceRTL;
1109 } // namespace
1110 
// Exposed library API functions
1112 #ifdef __cplusplus
1113 extern "C" {
1114 #endif
1115 
1116 int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
1117   return elf_check_machine(image, /* EM_CUDA */ 190);
1118 }
1119 
1120 int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); }
1121 
1122 int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
1123   DP("Init requires flags to %" PRId64 "\n", RequiresFlags);
1124   DeviceRTL.setRequiresFlag(RequiresFlags);
1125   return RequiresFlags;
1126 }
1127 
int32_t __tgt_rtl_is_data_exchangable(int32_t src_dev_id, int32_t dst_dev_id) {
1129   if (DeviceRTL.isValidDeviceId(src_dev_id) &&
1130       DeviceRTL.isValidDeviceId(dst_dev_id))
1131     return 1;
1132 
1133   return 0;
1134 }
1135 
1136 int32_t __tgt_rtl_init_device(int32_t device_id) {
1137   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1138 
1139   return DeviceRTL.initDevice(device_id);
1140 }
1141 
1142 __tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
1143                                           __tgt_device_image *image) {
1144   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1145 
1146   return DeviceRTL.loadBinary(device_id, image);
1147 }
1148 
1149 void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *,
1150                            int32_t kind) {
1151   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1152 
1153   return DeviceRTL.dataAlloc(device_id, size, (TargetAllocTy)kind);
1154 }
1155 
1156 int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
1157                               int64_t size) {
1158   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1159 
1160   __tgt_async_info AsyncInfo;
1161   const int32_t rc = __tgt_rtl_data_submit_async(device_id, tgt_ptr, hst_ptr,
1162                                                  size, &AsyncInfo);
1163   if (rc != OFFLOAD_SUCCESS)
1164     return OFFLOAD_FAIL;
1165 
1166   return __tgt_rtl_synchronize(device_id, &AsyncInfo);
1167 }
1168 
1169 int32_t __tgt_rtl_data_submit_async(int32_t device_id, void *tgt_ptr,
1170                                     void *hst_ptr, int64_t size,
1171                                     __tgt_async_info *async_info_ptr) {
1172   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1173   assert(async_info_ptr && "async_info_ptr is nullptr");
1174 
1175   return DeviceRTL.dataSubmit(device_id, tgt_ptr, hst_ptr, size,
1176                               async_info_ptr);
1177 }
1178 
1179 int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
1180                                 int64_t size) {
1181   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1182 
1183   __tgt_async_info AsyncInfo;
1184   const int32_t rc = __tgt_rtl_data_retrieve_async(device_id, hst_ptr, tgt_ptr,
1185                                                    size, &AsyncInfo);
1186   if (rc != OFFLOAD_SUCCESS)
1187     return OFFLOAD_FAIL;
1188 
1189   return __tgt_rtl_synchronize(device_id, &AsyncInfo);
1190 }
1191 
1192 int32_t __tgt_rtl_data_retrieve_async(int32_t device_id, void *hst_ptr,
1193                                       void *tgt_ptr, int64_t size,
1194                                       __tgt_async_info *async_info_ptr) {
1195   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1196   assert(async_info_ptr && "async_info_ptr is nullptr");
1197 
1198   return DeviceRTL.dataRetrieve(device_id, hst_ptr, tgt_ptr, size,
1199                                 async_info_ptr);
1200 }
1201 
1202 int32_t __tgt_rtl_data_exchange_async(int32_t src_dev_id, void *src_ptr,
                                      int32_t dst_dev_id, void *dst_ptr,
1204                                       int64_t size,
1205                                       __tgt_async_info *AsyncInfo) {
1206   assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
1207   assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");
1208   assert(AsyncInfo && "AsyncInfo is nullptr");
1209 
1210   return DeviceRTL.dataExchange(src_dev_id, src_ptr, dst_dev_id, dst_ptr, size,
1211                                 AsyncInfo);
1212 }
1213 
1214 int32_t __tgt_rtl_data_exchange(int32_t src_dev_id, void *src_ptr,
1215                                 int32_t dst_dev_id, void *dst_ptr,
1216                                 int64_t size) {
1217   assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
1218   assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");
1219 
1220   __tgt_async_info AsyncInfo;
1221   const int32_t rc = __tgt_rtl_data_exchange_async(
1222       src_dev_id, src_ptr, dst_dev_id, dst_ptr, size, &AsyncInfo);
1223   if (rc != OFFLOAD_SUCCESS)
1224     return OFFLOAD_FAIL;
1225 
1226   return __tgt_rtl_synchronize(src_dev_id, &AsyncInfo);
1227 }
1228 
1229 int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
1230   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1231 
1232   return DeviceRTL.dataDelete(device_id, tgt_ptr);
1233 }
1234 
1235 int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
1236                                          void **tgt_args,
1237                                          ptrdiff_t *tgt_offsets,
1238                                          int32_t arg_num, int32_t team_num,
1239                                          int32_t thread_limit,
1240                                          uint64_t loop_tripcount) {
1241   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1242 
1243   __tgt_async_info AsyncInfo;
1244   const int32_t rc = __tgt_rtl_run_target_team_region_async(
1245       device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
1246       thread_limit, loop_tripcount, &AsyncInfo);
1247   if (rc != OFFLOAD_SUCCESS)
1248     return OFFLOAD_FAIL;
1249 
1250   return __tgt_rtl_synchronize(device_id, &AsyncInfo);
1251 }
1252 
1253 int32_t __tgt_rtl_run_target_team_region_async(
1254     int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
1255     ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
1256     int32_t thread_limit, uint64_t loop_tripcount,
1257     __tgt_async_info *async_info_ptr) {
1258   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1259 
1260   return DeviceRTL.runTargetTeamRegion(
1261       device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
1262       thread_limit, loop_tripcount, async_info_ptr);
1263 }
1264 
1265 int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
1266                                     void **tgt_args, ptrdiff_t *tgt_offsets,
1267                                     int32_t arg_num) {
1268   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1269 
1270   __tgt_async_info AsyncInfo;
1271   const int32_t rc = __tgt_rtl_run_target_region_async(
1272       device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, &AsyncInfo);
1273   if (rc != OFFLOAD_SUCCESS)
1274     return OFFLOAD_FAIL;
1275 
1276   return __tgt_rtl_synchronize(device_id, &AsyncInfo);
1277 }
1278 
1279 int32_t __tgt_rtl_run_target_region_async(int32_t device_id,
1280                                           void *tgt_entry_ptr, void **tgt_args,
1281                                           ptrdiff_t *tgt_offsets,
1282                                           int32_t arg_num,
1283                                           __tgt_async_info *async_info_ptr) {
1284   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1285 
1286   return __tgt_rtl_run_target_team_region_async(
1287       device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num,
      /* team num */ 1, /* thread_limit */ 1, /* loop_tripcount */ 0,
1289       async_info_ptr);
1290 }
1291 
1292 int32_t __tgt_rtl_synchronize(int32_t device_id,
1293                               __tgt_async_info *async_info_ptr) {
1294   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1295   assert(async_info_ptr && "async_info_ptr is nullptr");
1296   assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr");
1297 
1298   return DeviceRTL.synchronize(device_id, async_info_ptr);
1299 }
1300 
1301 void __tgt_rtl_set_info_flag(uint32_t NewInfoLevel) {
1302   std::atomic<uint32_t> &InfoLevel = getInfoLevelInternal();
1303   InfoLevel.store(NewInfoLevel);
1304 }
1305 
1306 #ifdef __cplusplus
1307 }
1308 #endif
1309