1 //===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // RTL for CUDA machine
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include <cassert>
14 #include <cstddef>
15 #include <cuda.h>
16 #include <list>
17 #include <memory>
18 #include <mutex>
19 #include <string>
20 #include <unordered_map>
21 #include <vector>
22 
23 #include "Debug.h"
24 #include "omptargetplugin.h"
25 
26 #define TARGET_NAME CUDA
27 #define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"
28 
29 #include "MemoryManager.h"
30 
// Utility for retrieving and printing CUDA error string.
#ifdef OMPTARGET_DEBUG
// Debug build: when the runtime debug level is enabled, report the raw error
// code alongside the driver's error string and diagnose cuGetErrorString
// itself failing; otherwise fall back to the terse string-only report below.
#define CUDA_ERR_STRING(err)                                                   \
  do {                                                                         \
    if (getDebugLevel() > 0) {                                                 \
      const char *errStr = nullptr;                                            \
      CUresult errStr_status = cuGetErrorString(err, &errStr);                 \
      if (errStr_status == CUDA_ERROR_INVALID_VALUE)                           \
        REPORT("Unrecognized CUDA error code: %d\n", err);                     \
      else if (errStr_status == CUDA_SUCCESS)                                  \
        REPORT("CUDA error is: %s\n", errStr);                                 \
      else {                                                                   \
        REPORT("Unresolved CUDA error code: %d\n", err);                       \
        REPORT("Unsuccessful cuGetErrorString return status: %d\n",            \
               errStr_status);                                                 \
      }                                                                        \
    } else {                                                                   \
      const char *errStr = nullptr;                                            \
      CUresult errStr_status = cuGetErrorString(err, &errStr);                 \
      if (errStr_status == CUDA_SUCCESS)                                       \
        REPORT("%s \n", errStr);                                               \
    }                                                                          \
  } while (false)
#else // OMPTARGET_DEBUG
// Release build: print only the driver's error string, and stay silent if
// the error code cannot be resolved to a string.
#define CUDA_ERR_STRING(err)                                                   \
  do {                                                                         \
    const char *errStr = nullptr;                                              \
    CUresult errStr_status = cuGetErrorString(err, &errStr);                   \
    if (errStr_status == CUDA_SUCCESS)                                         \
      REPORT("%s \n", errStr);                                                 \
  } while (false)
#endif // OMPTARGET_DEBUG
63 
64 #define BOOL2TEXT(b) ((b) ? "Yes" : "No")
65 
66 #include "elf_common.h"
67 
/// Keep entries table per device.
struct FuncOrGblEntryTy {
  // Table handed back to libomptarget; its begin/end pointers are refreshed
  // from Entries before being returned (see getOffloadEntriesTable).
  __tgt_target_table Table;
  // Backing storage for the table, one record per kernel or global symbol.
  std::vector<__tgt_offload_entry> Entries;
};
73 
/// Execution modes a target-region kernel can be compiled for. The numeric
/// values (0..2) must match the `<kernel>_exec_mode` global emitted into the
/// image by the device RTL, which loadBinary reads into an int8_t.
enum ExecutionModeType {
  SPMD,         // constructors, destructors,
                // combined constructs (`teams distribute parallel for [simd]`)
  GENERIC,      // everything else
  SPMD_GENERIC, // Generic kernel with SPMD execution
  NONE
};
81 
/// Use a single entity to encode a kernel and a set of flags.
struct KernelTy {
  // Handle to the device function, resolved from the CUmodule by entry name.
  CUfunction Func;

  // execution mode of kernel
  // 0 - SPMD mode (without master warp)
  // 1 - Generic mode (with master warp)
  // 2 - SPMD mode execution with Generic mode semantics.
  // These values correspond to ExecutionModeType above.
  int8_t ExecutionMode;

  /// Maximal number of threads per block for this kernel.
  /// 0 means "not yet queried"; presumably filled in lazily at launch time —
  /// the writer is outside this chunk, so confirm against the launch path.
  int MaxThreadsPerBlock = 0;

  KernelTy(CUfunction _Func, int8_t _ExecutionMode)
      : Func(_Func), ExecutionMode(_ExecutionMode) {}
};
98 
/// Device environment data
/// Manually sync with the deviceRTL side for now, move to a dedicated header
/// file later.
struct omptarget_device_environmentTy {
  // Debug verbosity propagated to the device runtime; layout must stay
  // byte-identical to the deviceRTL's definition.
  int32_t debug_level;
};
105 
106 namespace {
107 bool checkResult(CUresult Err, const char *ErrMsg) {
108   if (Err == CUDA_SUCCESS)
109     return true;
110 
111   REPORT("%s", ErrMsg);
112   CUDA_ERR_STRING(Err);
113   return false;
114 }
115 
116 int memcpyDtoD(const void *SrcPtr, void *DstPtr, int64_t Size,
117                CUstream Stream) {
118   CUresult Err =
119       cuMemcpyDtoDAsync((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, Size, Stream);
120 
121   if (Err != CUDA_SUCCESS) {
122     DP("Error when copying data from device to device. Pointers: src "
123        "= " DPxMOD ", dst = " DPxMOD ", size = %" PRId64 "\n",
124        DPxPTR(SrcPtr), DPxPTR(DstPtr), Size);
125     CUDA_ERR_STRING(Err);
126     return OFFLOAD_FAIL;
127   }
128 
129   return OFFLOAD_SUCCESS;
130 }
131 
// Structure contains per-device data
struct DeviceDataTy {
  /// List that contains all the kernels.
  std::list<KernelTy> KernelsList;

  // One entries table per loaded image; the back() element is the table
  // currently being filled (see clearOffloadEntriesTable/addOffloadEntry).
  std::list<FuncOrGblEntryTy> FuncGblEntries;

  // Retained primary context for this device; nullptr until initDevice runs.
  CUcontext Context = nullptr;
  // Device properties
  int ThreadsPerBlock = 0; // capped MAX_BLOCK_DIM_X
  int BlocksPerGrid = 0;   // capped MAX_GRID_DIM_X
  int WarpSize = 0;
  // OpenMP properties
  int NumTeams = 0;   // default team count used when the region gives none
  int NumThreads = 0; // default thread count used when the region gives none
};
148 
149 class StreamManagerTy {
150   int NumberOfDevices;
151   // The initial size of stream pool
152   int EnvNumInitialStreams;
153   // Per-device stream mutex
154   std::vector<std::unique_ptr<std::mutex>> StreamMtx;
155   // Per-device stream Id indicates the next available stream in the pool
156   std::vector<int> NextStreamId;
157   // Per-device stream pool
158   std::vector<std::vector<CUstream>> StreamPool;
159   // Reference to per-device data
160   std::vector<DeviceDataTy> &DeviceData;
161 
162   // If there is no CUstream left in the pool, we will resize the pool to
163   // allocate more CUstream. This function should be called with device mutex,
164   // and we do not resize to smaller one.
165   void resizeStreamPool(const int DeviceId, const size_t NewSize) {
166     std::vector<CUstream> &Pool = StreamPool[DeviceId];
167     const size_t CurrentSize = Pool.size();
168     assert(NewSize > CurrentSize && "new size is not larger than current size");
169 
170     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
171     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) {
172       // We will return if cannot switch to the right context in case of
173       // creating bunch of streams that are not corresponding to the right
174       // device. The offloading will fail later because selected CUstream is
175       // nullptr.
176       return;
177     }
178 
179     Pool.resize(NewSize, nullptr);
180 
181     for (size_t I = CurrentSize; I < NewSize; ++I) {
182       checkResult(cuStreamCreate(&Pool[I], CU_STREAM_NON_BLOCKING),
183                   "Error returned from cuStreamCreate\n");
184     }
185   }
186 
187 public:
188   StreamManagerTy(const int NumberOfDevices,
189                   std::vector<DeviceDataTy> &DeviceData)
190       : NumberOfDevices(NumberOfDevices), EnvNumInitialStreams(32),
191         DeviceData(DeviceData) {
192     StreamPool.resize(NumberOfDevices);
193     NextStreamId.resize(NumberOfDevices);
194     StreamMtx.resize(NumberOfDevices);
195 
196     if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS"))
197       EnvNumInitialStreams = std::stoi(EnvStr);
198 
199     // Initialize the next stream id
200     std::fill(NextStreamId.begin(), NextStreamId.end(), 0);
201 
202     // Initialize stream mutex
203     for (std::unique_ptr<std::mutex> &Ptr : StreamMtx)
204       Ptr = std::make_unique<std::mutex>();
205   }
206 
207   ~StreamManagerTy() {
208     // Destroy streams
209     for (int I = 0; I < NumberOfDevices; ++I) {
210       checkResult(cuCtxSetCurrent(DeviceData[I].Context),
211                   "Error returned from cuCtxSetCurrent\n");
212 
213       for (CUstream &S : StreamPool[I]) {
214         if (S)
215           checkResult(cuStreamDestroy(S),
216                       "Error returned from cuStreamDestroy\n");
217       }
218     }
219   }
220 
221   // Get a CUstream from pool. Per-device next stream id always points to the
222   // next available CUstream. That means, CUstreams [0, id-1] have been
223   // assigned, and [id,] are still available. If there is no CUstream left, we
224   // will ask more CUstreams from CUDA RT. Each time a CUstream is assigned,
225   // the id will increase one.
226   // xxxxxs+++++++++
227   //      ^
228   //      id
229   // After assignment, the pool becomes the following and s is assigned.
230   // xxxxxs+++++++++
231   //       ^
232   //       id
233   CUstream getStream(const int DeviceId) {
234     const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
235     int &Id = NextStreamId[DeviceId];
236     // No CUstream left in the pool, we need to request from CUDA RT
237     if (Id == static_cast<int>(StreamPool[DeviceId].size())) {
238       // By default we double the stream pool every time
239       resizeStreamPool(DeviceId, Id * 2);
240     }
241     return StreamPool[DeviceId][Id++];
242   }
243 
244   // Return a CUstream back to pool. As mentioned above, per-device next
245   // stream is always points to the next available CUstream, so when we return
246   // a CUstream, we need to first decrease the id, and then copy the CUstream
247   // back.
248   // It is worth noting that, the order of streams return might be different
249   // from that they're assigned, that saying, at some point, there might be
250   // two identical CUstreams.
251   // xxax+a+++++
252   //     ^
253   //     id
254   // However, it doesn't matter, because they're always on the two sides of
255   // id. The left one will in the end be overwritten by another CUstream.
256   // Therefore, after several execution, the order of pool might be different
257   // from its initial state.
258   void returnStream(const int DeviceId, CUstream Stream) {
259     const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
260     int &Id = NextStreamId[DeviceId];
261     assert(Id > 0 && "Wrong stream ID");
262     StreamPool[DeviceId][--Id] = Stream;
263   }
264 
265   bool initializeDeviceStreamPool(const int DeviceId) {
266     assert(StreamPool[DeviceId].empty() && "stream pool has been initialized");
267 
268     resizeStreamPool(DeviceId, EnvNumInitialStreams);
269 
270     // Check the size of stream pool
271     if (static_cast<int>(StreamPool[DeviceId].size()) != EnvNumInitialStreams)
272       return false;
273 
274     // Check whether each stream is valid
275     for (CUstream &S : StreamPool[DeviceId])
276       if (!S)
277         return false;
278 
279     return true;
280   }
281 };
282 
283 class DeviceRTLTy {
284   int NumberOfDevices;
285   // OpenMP environment properties
286   int EnvNumTeams;
287   int EnvTeamLimit;
288   int EnvTeamThreadLimit;
289   // OpenMP requires flags
290   int64_t RequiresFlags;
291 
292   static constexpr const int HardTeamLimit = 1U << 16U; // 64k
293   static constexpr const int HardThreadLimit = 1024;
294   static constexpr const int DefaultNumTeams = 128;
295   static constexpr const int DefaultNumThreads = 128;
296 
297   std::unique_ptr<StreamManagerTy> StreamManager;
298   std::vector<DeviceDataTy> DeviceData;
299   std::vector<CUmodule> Modules;
300 
301   /// A class responsible for interacting with device native runtime library to
302   /// allocate and free memory.
303   class CUDADeviceAllocatorTy : public DeviceAllocatorTy {
304     const int DeviceId;
305     const std::vector<DeviceDataTy> &DeviceData;
306     std::unordered_map<void *, TargetAllocTy> HostPinnedAllocs;
307 
308   public:
309     CUDADeviceAllocatorTy(int DeviceId, std::vector<DeviceDataTy> &DeviceData)
310         : DeviceId(DeviceId), DeviceData(DeviceData) {}
311 
312     void *allocate(size_t Size, void *, TargetAllocTy Kind) override {
313       if (Size == 0)
314         return nullptr;
315 
316       CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
317       if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
318         return nullptr;
319 
320       void *MemAlloc = nullptr;
321       switch (Kind) {
322       case TARGET_ALLOC_DEFAULT:
323       case TARGET_ALLOC_DEVICE:
324         CUdeviceptr DevicePtr;
325         Err = cuMemAlloc(&DevicePtr, Size);
326         MemAlloc = (void *)DevicePtr;
327         if (!checkResult(Err, "Error returned from cuMemAlloc\n"))
328           return nullptr;
329         break;
330       case TARGET_ALLOC_HOST:
331         void *HostPtr;
332         Err = cuMemAllocHost(&HostPtr, Size);
333         MemAlloc = HostPtr;
334         if (!checkResult(Err, "Error returned from cuMemAllocHost\n"))
335           return nullptr;
336         HostPinnedAllocs[MemAlloc] = Kind;
337         break;
338       case TARGET_ALLOC_SHARED:
339         CUdeviceptr SharedPtr;
340         Err = cuMemAllocManaged(&SharedPtr, Size, CU_MEM_ATTACH_GLOBAL);
341         MemAlloc = (void *)SharedPtr;
342         if (!checkResult(Err, "Error returned from cuMemAllocManaged\n"))
343           return nullptr;
344         break;
345       }
346 
347       return MemAlloc;
348     }
349 
350     int free(void *TgtPtr) override {
351       CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
352       if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
353         return OFFLOAD_FAIL;
354 
355       // Host pinned memory must be freed differently.
356       TargetAllocTy Kind =
357           (HostPinnedAllocs.find(TgtPtr) == HostPinnedAllocs.end())
358               ? TARGET_ALLOC_DEFAULT
359               : TARGET_ALLOC_HOST;
360       switch (Kind) {
361       case TARGET_ALLOC_DEFAULT:
362       case TARGET_ALLOC_DEVICE:
363       case TARGET_ALLOC_SHARED:
364         Err = cuMemFree((CUdeviceptr)TgtPtr);
365         if (!checkResult(Err, "Error returned from cuMemFree\n"))
366           return OFFLOAD_FAIL;
367         break;
368       case TARGET_ALLOC_HOST:
369         Err = cuMemFreeHost(TgtPtr);
370         if (!checkResult(Err, "Error returned from cuMemFreeHost\n"))
371           return OFFLOAD_FAIL;
372         break;
373       }
374 
375       return OFFLOAD_SUCCESS;
376     }
377   };
378 
379   /// A vector of device allocators
380   std::vector<CUDADeviceAllocatorTy> DeviceAllocators;
381 
382   /// A vector of memory managers. Since the memory manager is non-copyable and
383   // non-removable, we wrap them into std::unique_ptr.
384   std::vector<std::unique_ptr<MemoryManagerTy>> MemoryManagers;
385 
386   /// Whether use memory manager
387   bool UseMemoryManager = true;
388 
389   // Record entry point associated with device
390   void addOffloadEntry(const int DeviceId, const __tgt_offload_entry entry) {
391     FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
392     E.Entries.push_back(entry);
393   }
394 
395   // Return a pointer to the entry associated with the pointer
396   const __tgt_offload_entry *getOffloadEntry(const int DeviceId,
397                                              const void *Addr) const {
398     for (const __tgt_offload_entry &Itr :
399          DeviceData[DeviceId].FuncGblEntries.back().Entries)
400       if (Itr.addr == Addr)
401         return &Itr;
402 
403     return nullptr;
404   }
405 
406   // Return the pointer to the target entries table
407   __tgt_target_table *getOffloadEntriesTable(const int DeviceId) {
408     FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
409 
410     if (E.Entries.empty())
411       return nullptr;
412 
413     // Update table info according to the entries and return the pointer
414     E.Table.EntriesBegin = E.Entries.data();
415     E.Table.EntriesEnd = E.Entries.data() + E.Entries.size();
416 
417     return &E.Table;
418   }
419 
420   // Clear entries table for a device
421   void clearOffloadEntriesTable(const int DeviceId) {
422     DeviceData[DeviceId].FuncGblEntries.emplace_back();
423     FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
424     E.Entries.clear();
425     E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr;
426   }
427 
428   CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfo) const {
429     assert(AsyncInfo && "AsyncInfo is nullptr");
430 
431     if (!AsyncInfo->Queue)
432       AsyncInfo->Queue = StreamManager->getStream(DeviceId);
433 
434     return reinterpret_cast<CUstream>(AsyncInfo->Queue);
435   }
436 
437 public:
438   // This class should not be copied
439   DeviceRTLTy(const DeviceRTLTy &) = delete;
440   DeviceRTLTy(DeviceRTLTy &&) = delete;
441 
  /// Initialize the CUDA driver, count devices, read OpenMP environment
  /// variables, and build the stream manager, per-device allocators and
  /// (optionally) the memory managers. Returns early on any driver failure,
  /// leaving the object in a state where no device is usable.
  DeviceRTLTy()
      : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1),
        EnvTeamThreadLimit(-1), RequiresFlags(OMP_REQ_UNDEFINED) {

    DP("Start initializing CUDA\n");

    CUresult Err = cuInit(0);
    if (Err == CUDA_ERROR_INVALID_HANDLE) {
      // Can't call cuGetErrorString if dlsym failed
      DP("Failed to load CUDA shared library\n");
      return;
    }
    if (!checkResult(Err, "Error returned from cuInit\n")) {
      return;
    }

    Err = cuDeviceGetCount(&NumberOfDevices);
    if (!checkResult(Err, "Error returned from cuDeviceGetCount\n"))
      return;

    if (NumberOfDevices == 0) {
      DP("There are no devices supporting CUDA.\n");
      return;
    }

    DeviceData.resize(NumberOfDevices);

    // Get environment variables regarding teams
    // NOTE(review): std::stoi throws on malformed values, which would
    // propagate out of this constructor — confirm callers tolerate that.
    if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) {
      // OMP_TEAM_LIMIT has been set
      EnvTeamLimit = std::stoi(EnvStr);
      DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
    }
    if (const char *EnvStr = getenv("OMP_TEAMS_THREAD_LIMIT")) {
      // OMP_TEAMS_THREAD_LIMIT has been set
      EnvTeamThreadLimit = std::stoi(EnvStr);
      DP("Parsed OMP_TEAMS_THREAD_LIMIT=%d\n", EnvTeamThreadLimit);
    }
    if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) {
      // OMP_NUM_TEAMS has been set
      EnvNumTeams = std::stoi(EnvStr);
      DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
    }

    StreamManager =
        std::make_unique<StreamManagerTy>(NumberOfDevices, DeviceData);

    // Populate DeviceAllocators completely before any memory manager is
    // created below: the managers hold references into this vector, so it
    // must not reallocate afterwards.
    for (int I = 0; I < NumberOfDevices; ++I)
      DeviceAllocators.emplace_back(I, DeviceData);

    // Get the size threshold from environment variable
    std::pair<size_t, bool> Res = MemoryManagerTy::getSizeThresholdFromEnv();
    UseMemoryManager = Res.second;
    size_t MemoryManagerThreshold = Res.first;

    if (UseMemoryManager)
      for (int I = 0; I < NumberOfDevices; ++I)
        MemoryManagers.emplace_back(std::make_unique<MemoryManagerTy>(
            DeviceAllocators[I], MemoryManagerThreshold));
  }
502 
503   ~DeviceRTLTy() {
504     // We first destruct memory managers in case that its dependent data are
505     // destroyed before it.
506     for (auto &M : MemoryManagers)
507       M.release();
508 
509     StreamManager = nullptr;
510 
511     for (CUmodule &M : Modules)
512       // Close module
513       if (M)
514         checkResult(cuModuleUnload(M), "Error returned from cuModuleUnload\n");
515 
516     for (DeviceDataTy &D : DeviceData) {
517       // Destroy context
518       if (D.Context) {
519         checkResult(cuCtxSetCurrent(D.Context),
520                     "Error returned from cuCtxSetCurrent\n");
521         CUdevice Device;
522         checkResult(cuCtxGetDevice(&Device),
523                     "Error returned from cuCtxGetDevice\n");
524         checkResult(cuDevicePrimaryCtxRelease(Device),
525                     "Error returned from cuDevicePrimaryCtxRelease\n");
526       }
527     }
528   }
529 
530   // Check whether a given DeviceId is valid
531   bool isValidDeviceId(const int DeviceId) const {
532     return DeviceId >= 0 && DeviceId < NumberOfDevices;
533   }
534 
535   int getNumOfDevices() const { return NumberOfDevices; }
536 
537   void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; }
538 
539   int initDevice(const int DeviceId) {
540     CUdevice Device;
541 
542     DP("Getting device %d\n", DeviceId);
543     CUresult Err = cuDeviceGet(&Device, DeviceId);
544     if (!checkResult(Err, "Error returned from cuDeviceGet\n"))
545       return OFFLOAD_FAIL;
546 
547     // Query the current flags of the primary context and set its flags if
548     // it is inactive
549     unsigned int FormerPrimaryCtxFlags = 0;
550     int FormerPrimaryCtxIsActive = 0;
551     Err = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags,
552                                      &FormerPrimaryCtxIsActive);
553     if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxGetState\n"))
554       return OFFLOAD_FAIL;
555 
556     if (FormerPrimaryCtxIsActive) {
557       DP("The primary context is active, no change to its flags\n");
558       if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) !=
559           CU_CTX_SCHED_BLOCKING_SYNC)
560         DP("Warning the current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n");
561     } else {
562       DP("The primary context is inactive, set its flags to "
563          "CU_CTX_SCHED_BLOCKING_SYNC\n");
564       Err = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC);
565       if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxSetFlags\n"))
566         return OFFLOAD_FAIL;
567     }
568 
569     // Retain the per device primary context and save it to use whenever this
570     // device is selected.
571     Err = cuDevicePrimaryCtxRetain(&DeviceData[DeviceId].Context, Device);
572     if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxRetain\n"))
573       return OFFLOAD_FAIL;
574 
575     Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
576     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
577       return OFFLOAD_FAIL;
578 
579     // Initialize stream pool
580     if (!StreamManager->initializeDeviceStreamPool(DeviceId))
581       return OFFLOAD_FAIL;
582 
583     // Query attributes to determine number of threads/block and blocks/grid.
584     int MaxGridDimX;
585     Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
586                                Device);
587     if (Err != CUDA_SUCCESS) {
588       DP("Error getting max grid dimension, use default value %d\n",
589          DeviceRTLTy::DefaultNumTeams);
590       DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::DefaultNumTeams;
591     } else if (MaxGridDimX <= DeviceRTLTy::HardTeamLimit) {
592       DP("Using %d CUDA blocks per grid\n", MaxGridDimX);
593       DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX;
594     } else {
595       DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping "
596          "at the hard limit\n",
597          MaxGridDimX, DeviceRTLTy::HardTeamLimit);
598       DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::HardTeamLimit;
599     }
600 
601     // We are only exploiting threads along the x axis.
602     int MaxBlockDimX;
603     Err = cuDeviceGetAttribute(&MaxBlockDimX,
604                                CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device);
605     if (Err != CUDA_SUCCESS) {
606       DP("Error getting max block dimension, use default value %d\n",
607          DeviceRTLTy::DefaultNumThreads);
608       DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads;
609     } else {
610       DP("Using %d CUDA threads per block\n", MaxBlockDimX);
611       DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX;
612 
613       if (EnvTeamThreadLimit > 0 &&
614           DeviceData[DeviceId].ThreadsPerBlock > EnvTeamThreadLimit) {
615         DP("Max CUDA threads per block %d exceeds the thread limit %d set by "
616            "OMP_TEAMS_THREAD_LIMIT, capping at the limit\n",
617            DeviceData[DeviceId].ThreadsPerBlock, EnvTeamThreadLimit);
618         DeviceData[DeviceId].ThreadsPerBlock = EnvTeamThreadLimit;
619       }
620       if (DeviceData[DeviceId].ThreadsPerBlock > DeviceRTLTy::HardThreadLimit) {
621         DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
622            "capping at the hard limit\n",
623            DeviceData[DeviceId].ThreadsPerBlock, DeviceRTLTy::HardThreadLimit);
624         DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit;
625       }
626     }
627 
628     // Get and set warp size
629     int WarpSize;
630     Err =
631         cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device);
632     if (Err != CUDA_SUCCESS) {
633       DP("Error getting warp size, assume default value 32\n");
634       DeviceData[DeviceId].WarpSize = 32;
635     } else {
636       DP("Using warp size %d\n", WarpSize);
637       DeviceData[DeviceId].WarpSize = WarpSize;
638     }
639 
640     // Adjust teams to the env variables
641     if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) {
642       DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
643          EnvTeamLimit);
644       DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit;
645     }
646 
647     size_t StackLimit;
648     size_t HeapLimit;
649     if (const char *EnvStr = getenv("LIBOMPTARGET_STACK_SIZE")) {
650       StackLimit = std::stol(EnvStr);
651       if (cuCtxSetLimit(CU_LIMIT_STACK_SIZE, StackLimit) != CUDA_SUCCESS)
652         return OFFLOAD_FAIL;
653     } else {
654       if (cuCtxGetLimit(&StackLimit, CU_LIMIT_STACK_SIZE) != CUDA_SUCCESS)
655         return OFFLOAD_FAIL;
656     }
657     if (const char *EnvStr = getenv("LIBOMPTARGET_HEAP_SIZE")) {
658       HeapLimit = std::stol(EnvStr);
659       if (cuCtxSetLimit(CU_LIMIT_MALLOC_HEAP_SIZE, HeapLimit) != CUDA_SUCCESS)
660         return OFFLOAD_FAIL;
661     } else {
662       if (cuCtxGetLimit(&HeapLimit, CU_LIMIT_MALLOC_HEAP_SIZE) != CUDA_SUCCESS)
663         return OFFLOAD_FAIL;
664     }
665 
666     INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
667          "Device supports up to %d CUDA blocks and %d threads with a "
668          "warp size of %d\n",
669          DeviceData[DeviceId].BlocksPerGrid,
670          DeviceData[DeviceId].ThreadsPerBlock, DeviceData[DeviceId].WarpSize);
671     INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
672          "Device heap size is %d Bytes, device stack size is %d Bytes per "
673          "thread\n",
674          (int)HeapLimit, (int)StackLimit);
675 
676     // Set default number of teams
677     if (EnvNumTeams > 0) {
678       DP("Default number of teams set according to environment %d\n",
679          EnvNumTeams);
680       DeviceData[DeviceId].NumTeams = EnvNumTeams;
681     } else {
682       DeviceData[DeviceId].NumTeams = DeviceRTLTy::DefaultNumTeams;
683       DP("Default number of teams set according to library's default %d\n",
684          DeviceRTLTy::DefaultNumTeams);
685     }
686 
687     if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) {
688       DP("Default number of teams exceeds device limit, capping at %d\n",
689          DeviceData[DeviceId].BlocksPerGrid);
690       DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid;
691     }
692 
693     // Set default number of threads
694     DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads;
695     DP("Default number of threads set according to library's default %d\n",
696        DeviceRTLTy::DefaultNumThreads);
697     if (DeviceData[DeviceId].NumThreads >
698         DeviceData[DeviceId].ThreadsPerBlock) {
699       DP("Default number of threads exceeds device limit, capping at %d\n",
700          DeviceData[DeviceId].ThreadsPerBlock);
701       DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].ThreadsPerBlock;
702     }
703 
704     return OFFLOAD_SUCCESS;
705   }
706 
707   __tgt_target_table *loadBinary(const int DeviceId,
708                                  const __tgt_device_image *Image) {
709     // Set the context we are using
710     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
711     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
712       return nullptr;
713 
714     // Clear the offload table as we are going to create a new one.
715     clearOffloadEntriesTable(DeviceId);
716 
717     // Create the module and extract the function pointers.
718     CUmodule Module;
719     DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart));
720     Err = cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr);
721     if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n"))
722       return nullptr;
723 
724     DP("CUDA module successfully loaded!\n");
725 
726     Modules.push_back(Module);
727 
728     // Find the symbols in the module by name.
729     const __tgt_offload_entry *HostBegin = Image->EntriesBegin;
730     const __tgt_offload_entry *HostEnd = Image->EntriesEnd;
731 
732     std::list<KernelTy> &KernelsList = DeviceData[DeviceId].KernelsList;
733     for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) {
734       if (!E->addr) {
735         // We return nullptr when something like this happens, the host should
736         // have always something in the address to uniquely identify the target
737         // region.
738         DP("Invalid binary: host entry '<null>' (size = %zd)...\n", E->size);
739         return nullptr;
740       }
741 
742       if (E->size) {
743         __tgt_offload_entry Entry = *E;
744         CUdeviceptr CUPtr;
745         size_t CUSize;
746         Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name);
747         // We keep this style here because we need the name
748         if (Err != CUDA_SUCCESS) {
749           REPORT("Loading global '%s' Failed\n", E->name);
750           CUDA_ERR_STRING(Err);
751           return nullptr;
752         }
753 
754         if (CUSize != E->size) {
755           DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name,
756              CUSize, E->size);
757           return nullptr;
758         }
759 
760         DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
761            DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr));
762 
763         Entry.addr = (void *)(CUPtr);
764 
765         // Note: In the current implementation declare target variables
766         // can either be link or to. This means that once unified
767         // memory is activated via the requires directive, the variable
768         // can be used directly from the host in both cases.
769         // TODO: when variables types other than to or link are added,
770         // the below condition should be changed to explicitly
771         // check for to and link variables types:
772         // (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags &
773         // OMP_DECLARE_TARGET_LINK || e->flags == OMP_DECLARE_TARGET_TO))
774         if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
775           // If unified memory is present any target link or to variables
776           // can access host addresses directly. There is no longer a
777           // need for device copies.
778           cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *));
779           DP("Copy linked variable host address (" DPxMOD
780              ") to device address (" DPxMOD ")\n",
781              DPxPTR(*((void **)E->addr)), DPxPTR(CUPtr));
782         }
783 
784         addOffloadEntry(DeviceId, Entry);
785 
786         continue;
787       }
788 
789       CUfunction Func;
790       Err = cuModuleGetFunction(&Func, Module, E->name);
791       // We keep this style here because we need the name
792       if (Err != CUDA_SUCCESS) {
793         REPORT("Loading '%s' Failed\n", E->name);
794         CUDA_ERR_STRING(Err);
795         return nullptr;
796       }
797 
798       DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
799          DPxPTR(E - HostBegin), E->name, DPxPTR(Func));
800 
801       // default value GENERIC (in case symbol is missing from cubin file)
802       int8_t ExecModeVal = ExecutionModeType::GENERIC;
803       std::string ExecModeNameStr(E->name);
804       ExecModeNameStr += "_exec_mode";
805       const char *ExecModeName = ExecModeNameStr.c_str();
806 
807       CUdeviceptr ExecModePtr;
808       size_t CUSize;
809       Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName);
810       if (Err == CUDA_SUCCESS) {
811         if (CUSize != sizeof(int8_t)) {
812           DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
813              ExecModeName, CUSize, sizeof(int8_t));
814           return nullptr;
815         }
816 
817         Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize);
818         if (Err != CUDA_SUCCESS) {
819           REPORT("Error when copying data from device to host. Pointers: "
820                  "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
821                  DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize);
822           CUDA_ERR_STRING(Err);
823           return nullptr;
824         }
825 
826         if (ExecModeVal < 0 || ExecModeVal > 2) {
827           DP("Error wrong exec_mode value specified in cubin file: %d\n",
828              ExecModeVal);
829           return nullptr;
830         }
831       } else {
832         REPORT("Loading global exec_mode '%s' - symbol missing, using default "
833                "value GENERIC (1)\n",
834                ExecModeName);
835         CUDA_ERR_STRING(Err);
836       }
837 
838       KernelsList.emplace_back(Func, ExecModeVal);
839 
840       __tgt_offload_entry Entry = *E;
841       Entry.addr = &KernelsList.back();
842       addOffloadEntry(DeviceId, Entry);
843     }
844 
845     // send device environment data to the device
846     {
847       omptarget_device_environmentTy DeviceEnv{0};
848 
849 #ifdef OMPTARGET_DEBUG
850       if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG"))
851         DeviceEnv.debug_level = std::stoi(EnvStr);
852 #endif
853 
854       const char *DeviceEnvName = "omptarget_device_environment";
855       CUdeviceptr DeviceEnvPtr;
856       size_t CUSize;
857 
858       Err = cuModuleGetGlobal(&DeviceEnvPtr, &CUSize, Module, DeviceEnvName);
859       if (Err == CUDA_SUCCESS) {
860         if (CUSize != sizeof(DeviceEnv)) {
861           REPORT(
862               "Global device_environment '%s' - size mismatch (%zu != %zu)\n",
863               DeviceEnvName, CUSize, sizeof(int32_t));
864           CUDA_ERR_STRING(Err);
865           return nullptr;
866         }
867 
868         Err = cuMemcpyHtoD(DeviceEnvPtr, &DeviceEnv, CUSize);
869         if (Err != CUDA_SUCCESS) {
870           REPORT("Error when copying data from host to device. Pointers: "
871                  "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
872                  DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize);
873           CUDA_ERR_STRING(Err);
874           return nullptr;
875         }
876 
877         DP("Sending global device environment data %zu bytes\n", CUSize);
878       } else {
879         DP("Finding global device environment '%s' - symbol missing.\n",
880            DeviceEnvName);
881         DP("Continue, considering this is a device RTL which does not accept "
882            "environment setting.\n");
883       }
884     }
885 
886     return getOffloadEntriesTable(DeviceId);
887   }
888 
889   void *dataAlloc(const int DeviceId, const int64_t Size,
890                   const TargetAllocTy Kind) {
891     switch (Kind) {
892     case TARGET_ALLOC_DEFAULT:
893     case TARGET_ALLOC_DEVICE:
894       if (UseMemoryManager)
895         return MemoryManagers[DeviceId]->allocate(Size, nullptr);
896       else
897         return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
898     case TARGET_ALLOC_HOST:
899     case TARGET_ALLOC_SHARED:
900       return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
901     }
902 
903     REPORT("Invalid target data allocation kind or requested allocator not "
904            "implemented yet\n");
905 
906     return nullptr;
907   }
908 
909   int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr,
910                  const int64_t Size, __tgt_async_info *AsyncInfo) const {
911     assert(AsyncInfo && "AsyncInfo is nullptr");
912 
913     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
914     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
915       return OFFLOAD_FAIL;
916 
917     CUstream Stream = getStream(DeviceId, AsyncInfo);
918 
919     Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream);
920     if (Err != CUDA_SUCCESS) {
921       DP("Error when copying data from host to device. Pointers: host "
922          "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
923          DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
924       CUDA_ERR_STRING(Err);
925       return OFFLOAD_FAIL;
926     }
927 
928     return OFFLOAD_SUCCESS;
929   }
930 
931   int dataRetrieve(const int DeviceId, void *HstPtr, const void *TgtPtr,
932                    const int64_t Size, __tgt_async_info *AsyncInfo) const {
933     assert(AsyncInfo && "AsyncInfo is nullptr");
934 
935     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
936     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
937       return OFFLOAD_FAIL;
938 
939     CUstream Stream = getStream(DeviceId, AsyncInfo);
940 
941     Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
942     if (Err != CUDA_SUCCESS) {
943       DP("Error when copying data from device to host. Pointers: host "
944          "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
945          DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
946       CUDA_ERR_STRING(Err);
947       return OFFLOAD_FAIL;
948     }
949 
950     return OFFLOAD_SUCCESS;
951   }
952 
953   int dataExchange(int SrcDevId, const void *SrcPtr, int DstDevId, void *DstPtr,
954                    int64_t Size, __tgt_async_info *AsyncInfo) const {
955     assert(AsyncInfo && "AsyncInfo is nullptr");
956 
957     CUresult Err = cuCtxSetCurrent(DeviceData[SrcDevId].Context);
958     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
959       return OFFLOAD_FAIL;
960 
961     CUstream Stream = getStream(SrcDevId, AsyncInfo);
962 
963     // If they are two devices, we try peer to peer copy first
964     if (SrcDevId != DstDevId) {
965       int CanAccessPeer = 0;
966       Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId);
967       if (Err != CUDA_SUCCESS) {
968         REPORT("Error returned from cuDeviceCanAccessPeer. src = %" PRId32
969                ", dst = %" PRId32 "\n",
970                SrcDevId, DstDevId);
971         CUDA_ERR_STRING(Err);
972         return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
973       }
974 
975       if (!CanAccessPeer) {
976         DP("P2P memcpy not supported so fall back to D2D memcpy");
977         return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
978       }
979 
980       Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0);
981       if (Err != CUDA_SUCCESS) {
982         REPORT("Error returned from cuCtxEnablePeerAccess. src = %" PRId32
983                ", dst = %" PRId32 "\n",
984                SrcDevId, DstDevId);
985         CUDA_ERR_STRING(Err);
986         return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
987       }
988 
989       Err = cuMemcpyPeerAsync((CUdeviceptr)DstPtr, DeviceData[DstDevId].Context,
990                               (CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context,
991                               Size, Stream);
992       if (Err == CUDA_SUCCESS)
993         return OFFLOAD_SUCCESS;
994 
995       DP("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD
996          ", src_id =%" PRId32 ", dst_ptr = " DPxMOD ", dst_id =%" PRId32 "\n",
997          DPxPTR(SrcPtr), SrcDevId, DPxPTR(DstPtr), DstDevId);
998       CUDA_ERR_STRING(Err);
999     }
1000 
1001     return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
1002   }
1003 
1004   int dataDelete(const int DeviceId, void *TgtPtr) {
1005     if (UseMemoryManager)
1006       return MemoryManagers[DeviceId]->free(TgtPtr);
1007 
1008     return DeviceAllocators[DeviceId].free(TgtPtr);
1009   }
1010 
  // Launch the kernel behind \p TgtEntryPtr (a KernelTy *) on \p DeviceId.
  // Threads-per-block are derived from the thread_limit clause (or the
  // device default), then clamped to the device and kernel limits.
  // Teams are derived from the num_teams clause, the loop trip count, or
  // the device default, in that order of priority. The launch itself is
  // asynchronous on the stream attached to \p AsyncInfo.
  // Returns OFFLOAD_SUCCESS or OFFLOAD_FAIL.
  int runTargetTeamRegion(const int DeviceId, void *TgtEntryPtr, void **TgtArgs,
                          ptrdiff_t *TgtOffsets, const int ArgNum,
                          const int TeamNum, const int ThreadLimit,
                          const unsigned int LoopTripCount,
                          __tgt_async_info *AsyncInfo) const {
    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    // All args are references.
    std::vector<void *> Args(ArgNum);
    std::vector<void *> Ptrs(ArgNum);

    // Materialize each argument as (base + offset); cuLaunchKernel receives
    // the addresses of these pointers.
    for (int I = 0; I < ArgNum; ++I) {
      Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
      Args[I] = &Ptrs[I];
    }

    KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr);

    // Pick threads per block: explicit thread_limit clause wins, otherwise
    // use the device default.
    int CudaThreadsPerBlock;
    if (ThreadLimit > 0) {
      DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
      CudaThreadsPerBlock = ThreadLimit;
      // Add master warp if necessary
      if (KernelInfo->ExecutionMode == GENERIC) {
        DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize);
        CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
      }
    } else {
      DP("Setting CUDA threads per block to default %d\n",
         DeviceData[DeviceId].NumThreads);
      CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads;
    }

    // Clamp to the device-wide threads-per-block limit.
    if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) {
      DP("Threads per block capped at device limit %d\n",
         DeviceData[DeviceId].ThreadsPerBlock);
      CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock;
    }

    // Lazily query and cache the kernel's own thread limit (0 means "not
    // queried yet").
    if (!KernelInfo->MaxThreadsPerBlock) {
      Err = cuFuncGetAttribute(&KernelInfo->MaxThreadsPerBlock,
                               CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                               KernelInfo->Func);
      if (!checkResult(Err, "Error returned from cuFuncGetAttribute\n"))
        return OFFLOAD_FAIL;
    }

    if (KernelInfo->MaxThreadsPerBlock < CudaThreadsPerBlock) {
      DP("Threads per block capped at kernel limit %d\n",
         KernelInfo->MaxThreadsPerBlock);
      CudaThreadsPerBlock = KernelInfo->MaxThreadsPerBlock;
    }

    // Pick the team (block) count. With no num_teams clause, derive it from
    // the loop trip count when available and LIBOMPTARGET_NUM_TEAMS is not
    // set (EnvNumTeams < 0); otherwise use the device default.
    unsigned int CudaBlocksPerGrid;
    if (TeamNum <= 0) {
      if (LoopTripCount > 0 && EnvNumTeams < 0) {
        if (KernelInfo->ExecutionMode == SPMD) {
          // We have a combined construct, i.e. `target teams distribute
          // parallel for [simd]`. We launch so many teams so that each thread
          // will execute one iteration of the loop. round up to the nearest
          // integer
          CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1;
        } else if (KernelInfo->ExecutionMode == GENERIC) {
          // If we reach this point, then we have a non-combined construct, i.e.
          // `teams distribute` with a nested `parallel for` and each team is
          // assigned one iteration of the `distribute` loop. E.g.:
          //
          // #pragma omp target teams distribute
          // for(...loop_tripcount...) {
          //   #pragma omp parallel for
          //   for(...) {}
          // }
          //
          // Threads within a team will execute the iterations of the `parallel`
          // loop.
          CudaBlocksPerGrid = LoopTripCount;
        } else if (KernelInfo->ExecutionMode == SPMD_GENERIC) {
          // If we reach this point, then we are executing a kernel that was
          // transformed from Generic-mode to SPMD-mode. This kernel has
          // SPMD-mode execution, but needs its blocks to be scheduled
          // differently because the current loop trip count only applies to the
          // `teams distribute` region and will create var too few blocks using
          // the regular SPMD-mode method.
          CudaBlocksPerGrid = LoopTripCount;
        } else {
          REPORT("Unknown execution mode: %d\n", KernelInfo->ExecutionMode);
          return OFFLOAD_FAIL;
        }
        DP("Using %d teams due to loop trip count %" PRIu32
           " and number of threads per block %d\n",
           CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock);
      } else {
        DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams);
        CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams;
      }
    } else if (TeamNum > DeviceData[DeviceId].BlocksPerGrid) {
      DP("Capping number of teams to team limit %d\n",
         DeviceData[DeviceId].BlocksPerGrid);
      CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid;
    } else {
      DP("Using requested number of teams %d\n", TeamNum);
      CudaBlocksPerGrid = TeamNum;
    }

    // Report the final launch configuration when LIBOMPTARGET_INFO asks.
    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
         "Launching kernel %s with %d blocks and %d threads in %s "
         "mode\n",
         (getOffloadEntry(DeviceId, TgtEntryPtr))
             ? getOffloadEntry(DeviceId, TgtEntryPtr)->name
             : "(null)",
         CudaBlocksPerGrid, CudaThreadsPerBlock,
         (KernelInfo->ExecutionMode != SPMD
              ? (KernelInfo->ExecutionMode == GENERIC ? "Generic"
                                                      : "SPMD-Generic")
              : "SPMD"));

    // 1-D launch with no dynamic shared memory; parameters passed via Args.
    CUstream Stream = getStream(DeviceId, AsyncInfo);
    Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1,
                         /* gridDimZ */ 1, CudaThreadsPerBlock,
                         /* blockDimY */ 1, /* blockDimZ */ 1,
                         /* sharedMemBytes */ 0, Stream, &Args[0], nullptr);
    if (!checkResult(Err, "Error returned from cuLaunchKernel\n"))
      return OFFLOAD_FAIL;

    DP("Launch of entry point at " DPxMOD " successful!\n",
       DPxPTR(TgtEntryPtr));

    return OFFLOAD_SUCCESS;
  }
1142 
1143   int synchronize(const int DeviceId, __tgt_async_info *AsyncInfo) const {
1144     CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue);
1145     CUresult Err = cuStreamSynchronize(Stream);
1146 
1147     // Once the stream is synchronized, return it to stream pool and reset
1148     // AsyncInfo. This is to make sure the synchronization only works for its
1149     // own tasks.
1150     StreamManager->returnStream(DeviceId,
1151                                 reinterpret_cast<CUstream>(AsyncInfo->Queue));
1152     AsyncInfo->Queue = nullptr;
1153 
1154     if (Err != CUDA_SUCCESS) {
1155       DP("Error when synchronizing stream. stream = " DPxMOD
1156          ", async info ptr = " DPxMOD "\n",
1157          DPxPTR(Stream), DPxPTR(AsyncInfo));
1158       CUDA_ERR_STRING(Err);
1159     }
1160     return (Err == CUDA_SUCCESS) ? OFFLOAD_SUCCESS : OFFLOAD_FAIL;
1161   }
1162 
1163   void printDeviceInfo(int32_t device_id) {
1164     char TmpChar[1000];
1165     std::string TmpStr;
1166     size_t TmpSt;
1167     int TmpInt, TmpInt2, TmpInt3;
1168 
1169     CUdevice Device;
1170     checkResult(cuDeviceGet(&Device, device_id),
1171                 "Error returned from cuCtxGetDevice\n");
1172 
1173     cuDriverGetVersion(&TmpInt);
1174     printf("    CUDA Driver Version: \t\t%d \n", TmpInt);
1175     printf("    CUDA Device Number: \t\t%d \n", device_id);
1176     checkResult(cuDeviceGetName(TmpChar, 1000, Device),
1177                 "Error returned from cuDeviceGetName\n");
1178     printf("    Device Name: \t\t\t%s \n", TmpChar);
1179     checkResult(cuDeviceTotalMem(&TmpSt, Device),
1180                 "Error returned from cuDeviceTotalMem\n");
1181     printf("    Global Memory Size: \t\t%zu bytes \n", TmpSt);
1182     checkResult(cuDeviceGetAttribute(
1183                     &TmpInt, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, Device),
1184                 "Error returned from cuDeviceGetAttribute\n");
1185     printf("    Number of Multiprocessors: \t\t%d \n", TmpInt);
1186     checkResult(
1187         cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, Device),
1188         "Error returned from cuDeviceGetAttribute\n");
1189     printf("    Concurrent Copy and Execution: \t%s \n", BOOL2TEXT(TmpInt));
1190     checkResult(cuDeviceGetAttribute(
1191                     &TmpInt, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, Device),
1192                 "Error returned from cuDeviceGetAttribute\n");
1193     printf("    Total Constant Memory: \t\t%d bytes\n", TmpInt);
1194     checkResult(
1195         cuDeviceGetAttribute(
1196             &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, Device),
1197         "Error returned from cuDeviceGetAttribute\n");
1198     printf("    Max Shared Memory per Block: \t%d bytes \n", TmpInt);
1199     checkResult(
1200         cuDeviceGetAttribute(
1201             &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, Device),
1202         "Error returned from cuDeviceGetAttribute\n");
1203     printf("    Registers per Block: \t\t%d \n", TmpInt);
1204     checkResult(
1205         cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device),
1206         "Error returned from cuDeviceGetAttribute\n");
1207     printf("    Warp Size: \t\t\t\t%d Threads \n", TmpInt);
1208     checkResult(cuDeviceGetAttribute(
1209                     &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, Device),
1210                 "Error returned from cuDeviceGetAttribute\n");
1211     printf("    Maximum Threads per Block: \t\t%d \n", TmpInt);
1212     checkResult(cuDeviceGetAttribute(
1213                     &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device),
1214                 "Error returned from cuDeviceGetAttribute\n");
1215     checkResult(cuDeviceGetAttribute(
1216                     &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, Device),
1217                 "Error returned from cuDeviceGetAttribute\n");
1218     checkResult(cuDeviceGetAttribute(
1219                     &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, Device),
1220                 "Error returned from cuDeviceGetAttribute\n");
1221     printf("    Maximum Block Dimensions: \t\t%d, %d, %d \n", TmpInt, TmpInt2,
1222            TmpInt3);
1223     checkResult(cuDeviceGetAttribute(
1224                     &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, Device),
1225                 "Error returned from cuDeviceGetAttribute\n");
1226     checkResult(cuDeviceGetAttribute(
1227                     &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, Device),
1228                 "Error returned from cuDeviceGetAttribute\n");
1229     checkResult(cuDeviceGetAttribute(
1230                     &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, Device),
1231                 "Error returned from cuDeviceGetAttribute\n");
1232     printf("    Maximum Grid Dimensions: \t\t%d x %d x %d \n", TmpInt, TmpInt2,
1233            TmpInt3);
1234     checkResult(
1235         cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_PITCH, Device),
1236         "Error returned from cuDeviceGetAttribute\n");
1237     printf("    Maximum Memory Pitch: \t\t%d bytes \n", TmpInt);
1238     checkResult(cuDeviceGetAttribute(
1239                     &TmpInt, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, Device),
1240                 "Error returned from cuDeviceGetAttribute\n");
1241     printf("    Texture Alignment: \t\t\t%d bytes \n", TmpInt);
1242     checkResult(
1243         cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, Device),
1244         "Error returned from cuDeviceGetAttribute\n");
1245     printf("    Clock Rate: \t\t\t%d kHz\n", TmpInt);
1246     checkResult(cuDeviceGetAttribute(
1247                     &TmpInt, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, Device),
1248                 "Error returned from cuDeviceGetAttribute\n");
1249     printf("    Execution Timeout: \t\t\t%s \n", BOOL2TEXT(TmpInt));
1250     checkResult(
1251         cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_INTEGRATED, Device),
1252         "Error returned from cuDeviceGetAttribute\n");
1253     printf("    Integrated Device: \t\t\t%s \n", BOOL2TEXT(TmpInt));
1254     checkResult(cuDeviceGetAttribute(
1255                     &TmpInt, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, Device),
1256                 "Error returned from cuDeviceGetAttribute\n");
1257     printf("    Can Map Host Memory: \t\t%s \n", BOOL2TEXT(TmpInt));
1258     checkResult(
1259         cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, Device),
1260         "Error returned from cuDeviceGetAttribute\n");
1261     if (TmpInt == CU_COMPUTEMODE_DEFAULT)
1262       TmpStr = "DEFAULT";
1263     else if (TmpInt == CU_COMPUTEMODE_PROHIBITED)
1264       TmpStr = "PROHIBITED";
1265     else if (TmpInt == CU_COMPUTEMODE_EXCLUSIVE_PROCESS)
1266       TmpStr = "EXCLUSIVE PROCESS";
1267     else
1268       TmpStr = "unknown";
1269     printf("    Compute Mode: \t\t\t%s \n", TmpStr.c_str());
1270     checkResult(cuDeviceGetAttribute(
1271                     &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, Device),
1272                 "Error returned from cuDeviceGetAttribute\n");
1273     printf("    Concurrent Kernels: \t\t%s \n", BOOL2TEXT(TmpInt));
1274     checkResult(
1275         cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, Device),
1276         "Error returned from cuDeviceGetAttribute\n");
1277     printf("    ECC Enabled: \t\t\t%s \n", BOOL2TEXT(TmpInt));
1278     checkResult(cuDeviceGetAttribute(
1279                     &TmpInt, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, Device),
1280                 "Error returned from cuDeviceGetAttribute\n");
1281     printf("    Memory Clock Rate: \t\t\t%d kHz\n", TmpInt);
1282     checkResult(
1283         cuDeviceGetAttribute(
1284             &TmpInt, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, Device),
1285         "Error returned from cuDeviceGetAttribute\n");
1286     printf("    Memory Bus Width: \t\t\t%d bits\n", TmpInt);
1287     checkResult(cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE,
1288                                      Device),
1289                 "Error returned from cuDeviceGetAttribute\n");
1290     printf("    L2 Cache Size: \t\t\t%d bytes \n", TmpInt);
1291     checkResult(cuDeviceGetAttribute(
1292                     &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
1293                     Device),
1294                 "Error returned from cuDeviceGetAttribute\n");
1295     printf("    Max Threads Per SMP: \t\t%d \n", TmpInt);
1296     checkResult(cuDeviceGetAttribute(
1297                     &TmpInt, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, Device),
1298                 "Error returned from cuDeviceGetAttribute\n");
1299     printf("    Async Engines: \t\t\t%s (%d) \n", BOOL2TEXT(TmpInt), TmpInt);
1300     checkResult(cuDeviceGetAttribute(
1301                     &TmpInt, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, Device),
1302                 "Error returned from cuDeviceGetAttribute\n");
1303     printf("    Unified Addressing: \t\t%s \n", BOOL2TEXT(TmpInt));
1304     checkResult(cuDeviceGetAttribute(
1305                     &TmpInt, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, Device),
1306                 "Error returned from cuDeviceGetAttribute\n");
1307     printf("    Managed Memory: \t\t\t%s \n", BOOL2TEXT(TmpInt));
1308     checkResult(
1309         cuDeviceGetAttribute(
1310             &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, Device),
1311         "Error returned from cuDeviceGetAttribute\n");
1312     printf("    Concurrent Managed Memory: \t\t%s \n", BOOL2TEXT(TmpInt));
1313     checkResult(
1314         cuDeviceGetAttribute(
1315             &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, Device),
1316         "Error returned from cuDeviceGetAttribute\n");
1317     printf("    Preemption Supported: \t\t%s \n", BOOL2TEXT(TmpInt));
1318     checkResult(cuDeviceGetAttribute(
1319                     &TmpInt, CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, Device),
1320                 "Error returned from cuDeviceGetAttribute\n");
1321     printf("    Cooperative Launch: \t\t%s \n", BOOL2TEXT(TmpInt));
1322     checkResult(cuDeviceGetAttribute(
1323                     &TmpInt, CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD, Device),
1324                 "Error returned from cuDeviceGetAttribute\n");
1325     printf("    Multi-Device Boars: \t\t%s \n", BOOL2TEXT(TmpInt));
1326     checkResult(
1327         cuDeviceGetAttribute(
1328             &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, Device),
1329         "Error returned from cuDeviceGetAttribute\n");
1330     checkResult(
1331         cuDeviceGetAttribute(
1332             &TmpInt2, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, Device),
1333         "Error returned from cuDeviceGetAttribute\n");
1334     printf("    Compute Capabilities: \t\t%d%d \n", TmpInt, TmpInt2);
1335   }
1336 };
1337 
1338 DeviceRTLTy DeviceRTL;
1339 } // namespace
1340 
1341 // Exposed library API function
1342 #ifdef __cplusplus
1343 extern "C" {
1344 #endif
1345 
1346 int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
1347   return elf_check_machine(image, /* EM_CUDA */ 190);
1348 }
1349 
// Return the number of CUDA devices visible to the plugin.
int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); }
1351 
// Record the application's OpenMP `requires` flags (e.g. unified shared
// memory); loadBinary consults them when materializing declare-target
// globals. Returns the flags unchanged.
int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
  DP("Init requires flags to %" PRId64 "\n", RequiresFlags);
  DeviceRTL.setRequiresFlag(RequiresFlags);
  return RequiresFlags;
}
1357 
1358 int32_t __tgt_rtl_is_data_exchangable(int32_t src_dev_id, int dst_dev_id) {
1359   if (DeviceRTL.isValidDeviceId(src_dev_id) &&
1360       DeviceRTL.isValidDeviceId(dst_dev_id))
1361     return 1;
1362 
1363   return 0;
1364 }
1365 
// Initialize plugin state (context, streams, device properties) for
// \p device_id. Returns OFFLOAD_SUCCESS/OFFLOAD_FAIL.
int32_t __tgt_rtl_init_device(int32_t device_id) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.initDevice(device_id);
}
1371 
// Load the device \p image on \p device_id and return its offload entry
// table, or nullptr on failure.
__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
                                          __tgt_device_image *image) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.loadBinary(device_id, image);
}
1378 
1379 void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *,
1380                            int32_t kind) {
1381   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1382 
1383   return DeviceRTL.dataAlloc(device_id, size, (TargetAllocTy)kind);
1384 }
1385 
1386 int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
1387                               int64_t size) {
1388   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1389 
1390   __tgt_async_info AsyncInfo;
1391   const int32_t rc = __tgt_rtl_data_submit_async(device_id, tgt_ptr, hst_ptr,
1392                                                  size, &AsyncInfo);
1393   if (rc != OFFLOAD_SUCCESS)
1394     return OFFLOAD_FAIL;
1395 
1396   return __tgt_rtl_synchronize(device_id, &AsyncInfo);
1397 }
1398 
// Asynchronously copy \p size bytes from host \p hst_ptr to device
// \p tgt_ptr on the stream carried by \p async_info_ptr; completion is
// observed via __tgt_rtl_synchronize.
int32_t __tgt_rtl_data_submit_async(int32_t device_id, void *tgt_ptr,
                                    void *hst_ptr, int64_t size,
                                    __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");

  return DeviceRTL.dataSubmit(device_id, tgt_ptr, hst_ptr, size,
                              async_info_ptr);
}
1408 
1409 int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
1410                                 int64_t size) {
1411   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1412 
1413   __tgt_async_info AsyncInfo;
1414   const int32_t rc = __tgt_rtl_data_retrieve_async(device_id, hst_ptr, tgt_ptr,
1415                                                    size, &AsyncInfo);
1416   if (rc != OFFLOAD_SUCCESS)
1417     return OFFLOAD_FAIL;
1418 
1419   return __tgt_rtl_synchronize(device_id, &AsyncInfo);
1420 }
1421 
// Asynchronously copy \p size bytes from device \p tgt_ptr back to host
// \p hst_ptr on the stream carried by \p async_info_ptr; completion is
// observed via __tgt_rtl_synchronize.
int32_t __tgt_rtl_data_retrieve_async(int32_t device_id, void *hst_ptr,
                                      void *tgt_ptr, int64_t size,
                                      __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");

  return DeviceRTL.dataRetrieve(device_id, hst_ptr, tgt_ptr, size,
                                async_info_ptr);
}
1431 
// Asynchronous device-to-device copy: tries peer-to-peer first and falls
// back to a staged D2D copy; ordered on the source device's stream.
int32_t __tgt_rtl_data_exchange_async(int32_t src_dev_id, void *src_ptr,
                                      int dst_dev_id, void *dst_ptr,
                                      int64_t size,
                                      __tgt_async_info *AsyncInfo) {
  assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
  assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");
  assert(AsyncInfo && "AsyncInfo is nullptr");

  return DeviceRTL.dataExchange(src_dev_id, src_ptr, dst_dev_id, dst_ptr, size,
                                AsyncInfo);
}
1443 
1444 int32_t __tgt_rtl_data_exchange(int32_t src_dev_id, void *src_ptr,
1445                                 int32_t dst_dev_id, void *dst_ptr,
1446                                 int64_t size) {
1447   assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
1448   assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");
1449 
1450   __tgt_async_info AsyncInfo;
1451   const int32_t rc = __tgt_rtl_data_exchange_async(
1452       src_dev_id, src_ptr, dst_dev_id, dst_ptr, size, &AsyncInfo);
1453   if (rc != OFFLOAD_SUCCESS)
1454     return OFFLOAD_FAIL;
1455 
1456   return __tgt_rtl_synchronize(src_dev_id, &AsyncInfo);
1457 }
1458 
// Free the device allocation \p tgt_ptr previously obtained from
// __tgt_rtl_data_alloc.
int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.dataDelete(device_id, tgt_ptr);
}
1464 
1465 int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
1466                                          void **tgt_args,
1467                                          ptrdiff_t *tgt_offsets,
1468                                          int32_t arg_num, int32_t team_num,
1469                                          int32_t thread_limit,
1470                                          uint64_t loop_tripcount) {
1471   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1472 
1473   __tgt_async_info AsyncInfo;
1474   const int32_t rc = __tgt_rtl_run_target_team_region_async(
1475       device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
1476       thread_limit, loop_tripcount, &AsyncInfo);
1477   if (rc != OFFLOAD_SUCCESS)
1478     return OFFLOAD_FAIL;
1479 
1480   return __tgt_rtl_synchronize(device_id, &AsyncInfo);
1481 }
1482 
1483 int32_t __tgt_rtl_run_target_team_region_async(
1484     int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
1485     ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
1486     int32_t thread_limit, uint64_t loop_tripcount,
1487     __tgt_async_info *async_info_ptr) {
1488   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1489 
1490   return DeviceRTL.runTargetTeamRegion(
1491       device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
1492       thread_limit, loop_tripcount, async_info_ptr);
1493 }
1494 
1495 int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
1496                                     void **tgt_args, ptrdiff_t *tgt_offsets,
1497                                     int32_t arg_num) {
1498   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1499 
1500   __tgt_async_info AsyncInfo;
1501   const int32_t rc = __tgt_rtl_run_target_region_async(
1502       device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, &AsyncInfo);
1503   if (rc != OFFLOAD_SUCCESS)
1504     return OFFLOAD_FAIL;
1505 
1506   return __tgt_rtl_synchronize(device_id, &AsyncInfo);
1507 }
1508 
1509 int32_t __tgt_rtl_run_target_region_async(int32_t device_id,
1510                                           void *tgt_entry_ptr, void **tgt_args,
1511                                           ptrdiff_t *tgt_offsets,
1512                                           int32_t arg_num,
1513                                           __tgt_async_info *async_info_ptr) {
1514   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1515 
1516   return __tgt_rtl_run_target_team_region_async(
1517       device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num,
1518       /* team num*/ 1, /* thread_limit */ 1, /* loop_tripcount */ 0,
1519       async_info_ptr);
1520 }
1521 
// Block until every operation previously enqueued on the queue carried by
// \p async_info_ptr has completed on device \p device_id. The queue must
// already exist (it is created by the async entry points), hence the
// non-null Queue assertion. Returns the status code produced by
// DeviceRTL.synchronize.
int32_t __tgt_rtl_synchronize(int32_t device_id,
                              __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");
  assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr");

  return DeviceRTL.synchronize(device_id, async_info_ptr);
}
1530 
1531 void __tgt_rtl_set_info_flag(uint32_t NewInfoLevel) {
1532   std::atomic<uint32_t> &InfoLevel = getInfoLevelInternal();
1533   InfoLevel.store(NewInfoLevel);
1534 }
1535 
1536 void __tgt_rtl_print_device_info(int32_t device_id) {
1537   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1538   DeviceRTL.printDeviceInfo(device_id);
1539 }
1540 
1541 #ifdef __cplusplus
1542 }
1543 #endif
1544