//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// RTL for CUDA machine
//
//===----------------------------------------------------------------------===//

#include <cassert>
#include <cstddef>
#include <cuda.h>
#include <list>
#include <memory>
#include <mutex>
#include <string>
#include <vector>

#include "Debug.h"
#include "omptargetplugin.h"

#define TARGET_NAME CUDA
#define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"

// Utility for retrieving and printing CUDA error string.
#ifdef OMPTARGET_DEBUG
#define CUDA_ERR_STRING(err)                                                   \
  do {                                                                         \
    if (getDebugLevel() > 0) {                                                 \
      const char *errStr = nullptr;                                            \
      CUresult errStr_status = cuGetErrorString(err, &errStr);                 \
      if (errStr_status == CUDA_ERROR_INVALID_VALUE)                           \
        DP("Unrecognized CUDA error code: %d\n", err);                         \
      else if (errStr_status == CUDA_SUCCESS)                                  \
        DP("CUDA error is: %s\n", errStr);                                     \
      else {                                                                   \
        DP("Unresolved CUDA error code: %d\n", err);                           \
        DP("Unsuccessful cuGetErrorString return status: %d\n",                \
           errStr_status);                                                     \
      }                                                                        \
    }                                                                          \
  } while (false)
#else // OMPTARGET_DEBUG
#define CUDA_ERR_STRING(err) do {} while (false)
#endif // OMPTARGET_DEBUG
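
// Illustrative use of the macro above (a sketch, not part of the plugin's
// control flow): after a failing driver call, print the mapped error string
// in debug builds.
//   CUresult Err = cuInit(0);
//   if (Err != CUDA_SUCCESS)
//     CUDA_ERR_STRING(Err);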

#include "../../common/elf_common.c"

/// Keep entries table per device.
struct FuncOrGblEntryTy {
  __tgt_target_table Table;
  std::vector<__tgt_offload_entry> Entries;
};

enum ExecutionModeType {
  SPMD,    // constructors, destructors, and combined constructs
           // (`teams distribute parallel for [simd]`)
  GENERIC, // everything else
  NONE
};

/// Use a single entity to encode a kernel and a set of flags.
struct KernelTy {
  CUfunction Func;

  // execution mode of kernel
  // 0 - SPMD mode (without master warp)
  // 1 - Generic mode (with master warp)
  int8_t ExecutionMode;

  /// Maximal number of threads per block for this kernel.
  int MaxThreadsPerBlock = 0;

  KernelTy(CUfunction _Func, int8_t _ExecutionMode)
      : Func(_Func), ExecutionMode(_ExecutionMode) {}
};

/// Device environment data
/// Manually sync with the deviceRTL side for now, move to a dedicated header
/// file later.
struct omptarget_device_environmentTy {
  int32_t debug_level;
};
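
// For reference, a sketch of the matching declaration assumed on the
// deviceRTL side (kept in sync manually, as noted above):
//   __device__ omptarget_device_environmentTy omptarget_device_environment;
// loadBinary() below looks this symbol up by name and overwrites it with the
// host-side value.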

namespace {
bool checkResult(CUresult Err, const char *ErrMsg) {
  if (Err == CUDA_SUCCESS)
    return true;

  DP("%s", ErrMsg);
  CUDA_ERR_STRING(Err);
  return false;
}

int memcpyDtoD(const void *SrcPtr, void *DstPtr, int64_t Size,
               CUstream Stream) {
  CUresult Err =
      cuMemcpyDtoDAsync((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, Size, Stream);

  if (Err != CUDA_SUCCESS) {
    DP("Error when copying data from device to device. Pointers: src "
       "= " DPxMOD ", dst = " DPxMOD ", size = %" PRId64 "\n",
       DPxPTR(SrcPtr), DPxPTR(DstPtr), Size);
    CUDA_ERR_STRING(Err);
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}
// Structure containing all per-device data
struct DeviceDataTy {
  /// List that contains all the kernels.
  std::list<KernelTy> KernelsList;

  std::list<FuncOrGblEntryTy> FuncGblEntries;

  CUcontext Context = nullptr;
  // Device properties
  int ThreadsPerBlock = 0;
  int BlocksPerGrid = 0;
  int WarpSize = 0;
  // OpenMP properties
  int NumTeams = 0;
  int NumThreads = 0;
};

class StreamManagerTy {
  int NumberOfDevices;
  // The initial size of stream pool
  int EnvNumInitialStreams;
  // Per-device stream mutex
  std::vector<std::unique_ptr<std::mutex>> StreamMtx;
  // Per-device stream Id indicates the next available stream in the pool
  std::vector<int> NextStreamId;
  // Per-device stream pool
  std::vector<std::vector<CUstream>> StreamPool;
  // Reference to per-device data
  std::vector<DeviceDataTy> &DeviceData;

  // If there is no CUstream left in the pool, we resize the pool to allocate
  // more CUstreams. This function should be called with the device mutex held,
  // and it never shrinks the pool.
  void resizeStreamPool(const int DeviceId, const size_t NewSize) {
    std::vector<CUstream> &Pool = StreamPool[DeviceId];
    const size_t CurrentSize = Pool.size();
    assert(NewSize > CurrentSize && "new size is not larger than current size");

    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) {
      // Return early if we cannot switch to the right context, to avoid
      // creating a batch of streams that do not belong to the right device.
      // Offloading will fail later because the selected CUstream is nullptr.
      return;
    }

    Pool.resize(NewSize, nullptr);

    for (size_t I = CurrentSize; I < NewSize; ++I) {
      checkResult(cuStreamCreate(&Pool[I], CU_STREAM_NON_BLOCKING),
                  "Error returned from cuStreamCreate\n");
    }
  }

public:
  StreamManagerTy(const int NumberOfDevices,
                  std::vector<DeviceDataTy> &DeviceData)
      : NumberOfDevices(NumberOfDevices), EnvNumInitialStreams(32),
        DeviceData(DeviceData) {
    StreamPool.resize(NumberOfDevices);
    NextStreamId.resize(NumberOfDevices);
    StreamMtx.resize(NumberOfDevices);

    if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS"))
      EnvNumInitialStreams = std::stoi(EnvStr);
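    // For example (illustrative): `LIBOMPTARGET_NUM_INITIAL_STREAMS=64` makes
    // each device start with 64 pre-created streams instead of the default 32.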

    // Initialize the next stream id
    std::fill(NextStreamId.begin(), NextStreamId.end(), 0);

    // Initialize stream mutex
    for (std::unique_ptr<std::mutex> &Ptr : StreamMtx)
      Ptr = std::make_unique<std::mutex>();
  }

  ~StreamManagerTy() {
    // Destroy streams
    for (int I = 0; I < NumberOfDevices; ++I) {
      checkResult(cuCtxSetCurrent(DeviceData[I].Context),
                  "Error returned from cuCtxSetCurrent\n");

      for (CUstream &S : StreamPool[I]) {
        if (S)
          checkResult(cuStreamDestroy(S),
                      "Error returned from cuStreamDestroy\n");
      }
    }
  }

  // Get a CUstream from the pool. The per-device next stream id always points
  // to the next available CUstream. That means CUstreams [0, id-1] have been
  // assigned, and [id, ...] are still available. If there is no CUstream left,
  // we request more CUstreams from the CUDA RT. Each time a CUstream is
  // assigned, the id increases by one.
  // xxxxxs+++++++++
  //      ^
  //      id
  // After assignment, the pool becomes the following and s is assigned.
  // xxxxxs+++++++++
  //       ^
  //       id
  CUstream getStream(const int DeviceId) {
    const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
    int &Id = NextStreamId[DeviceId];
    // No CUstream left in the pool, we need to request from CUDA RT
    if (Id == StreamPool[DeviceId].size()) {
      // By default we double the stream pool every time
      resizeStreamPool(DeviceId, Id * 2);
    }
    return StreamPool[DeviceId][Id++];
  }

  // Return a CUstream to the pool. As mentioned above, the per-device next
  // stream id always points to the next available CUstream, so when we return
  // a CUstream, we first decrease the id and then copy the CUstream back.
  // It is worth noting that the order in which streams are returned may differ
  // from the order in which they were assigned; that is, at some point there
  // may be two identical CUstreams in the pool.
  // xxax+a+++++
  //     ^
  //     id
  // However, it does not matter, because they are always on the two sides of
  // id. The one on the left will eventually be overwritten by another CUstream.
  // Therefore, after several executions, the order of the pool may differ from
  // its initial state.
  void returnStream(const int DeviceId, CUstream Stream) {
    const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
    int &Id = NextStreamId[DeviceId];
    assert(Id > 0 && "Wrong stream ID");
    StreamPool[DeviceId][--Id] = Stream;
  }

  bool initializeDeviceStreamPool(const int DeviceId) {
    assert(StreamPool[DeviceId].empty() && "stream pool has been initialized");

    resizeStreamPool(DeviceId, EnvNumInitialStreams);

    // Check the size of stream pool
    if (StreamPool[DeviceId].size() != EnvNumInitialStreams)
      return false;

    // Check whether each stream is valid
    for (CUstream &S : StreamPool[DeviceId])
      if (!S)
        return false;

    return true;
  }
};
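
// Typical lifecycle of a pooled stream (an illustrative sketch of how the
// manager is used by DeviceRTLTy below, not an additional API):
//   CUstream S = StreamManager->getStream(DeviceId); // borrow
//   /* enqueue async copies / kernel launches on S */
//   cuStreamSynchronize(S);
//   StreamManager->returnStream(DeviceId, S);        // give back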

class DeviceRTLTy {
  int NumberOfDevices;
  // OpenMP environment properties
  int EnvNumTeams;
  int EnvTeamLimit;
  // OpenMP requires flags
  int64_t RequiresFlags;

  static constexpr const int HardTeamLimit = 1U << 16U; // 64k
  static constexpr const int HardThreadLimit = 1024;
  static constexpr const int DefaultNumTeams = 128;
  static constexpr const int DefaultNumThreads = 128;

  std::unique_ptr<StreamManagerTy> StreamManager;
  std::vector<DeviceDataTy> DeviceData;
  std::vector<CUmodule> Modules;

  // Record entry point associated with device
  void addOffloadEntry(const int DeviceId, const __tgt_offload_entry entry) {
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
    E.Entries.push_back(entry);
  }

  // Return a pointer to the entry associated with the pointer
  const __tgt_offload_entry *getOffloadEntry(const int DeviceId,
                                             const void *Addr) const {
    for (const __tgt_offload_entry &Itr :
         DeviceData[DeviceId].FuncGblEntries.back().Entries)
      if (Itr.addr == Addr)
        return &Itr;

    return nullptr;
  }

  // Return the pointer to the target entries table
  __tgt_target_table *getOffloadEntriesTable(const int DeviceId) {
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();

    if (E.Entries.empty())
      return nullptr;

    // Update table info according to the entries and return the pointer
    E.Table.EntriesBegin = E.Entries.data();
    E.Table.EntriesEnd = E.Entries.data() + E.Entries.size();

    return &E.Table;
  }

  // Clear entries table for a device
  void clearOffloadEntriesTable(const int DeviceId) {
    DeviceData[DeviceId].FuncGblEntries.emplace_back();
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
    E.Entries.clear();
    E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr;
  }

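  // Note: streams are acquired lazily. The first call to this helper with a
  // given __tgt_async_info borrows a CUstream from the pool and caches it in
  // Queue; subsequent calls reuse it until synchronize() returns it.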
  CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfoPtr) const {
    assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");

    if (!AsyncInfoPtr->Queue)
      AsyncInfoPtr->Queue = StreamManager->getStream(DeviceId);

    return reinterpret_cast<CUstream>(AsyncInfoPtr->Queue);
  }

public:
  // This class should not be copied
  DeviceRTLTy(const DeviceRTLTy &) = delete;
  DeviceRTLTy(DeviceRTLTy &&) = delete;

  DeviceRTLTy()
      : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1),
        RequiresFlags(OMP_REQ_UNDEFINED) {

    DP("Start initializing CUDA\n");

    CUresult Err = cuInit(0);
    if (!checkResult(Err, "Error returned from cuInit\n")) {
      return;
    }

    Err = cuDeviceGetCount(&NumberOfDevices);
    if (!checkResult(Err, "Error returned from cuDeviceGetCount\n"))
      return;

    if (NumberOfDevices == 0) {
      DP("There are no devices supporting CUDA.\n");
      return;
    }

    DeviceData.resize(NumberOfDevices);

    // Get environment variables regarding teams
    if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) {
      // OMP_TEAM_LIMIT has been set
      EnvTeamLimit = std::stoi(EnvStr);
      DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
    }
    if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) {
      // OMP_NUM_TEAMS has been set
      EnvNumTeams = std::stoi(EnvStr);
      DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
    }

    StreamManager =
        std::make_unique<StreamManagerTy>(NumberOfDevices, DeviceData);
  }

  ~DeviceRTLTy() {
    // Destroy the stream manager first, so that all streams are destroyed
    // before the contexts they belong to are released.
    StreamManager = nullptr;

    for (CUmodule &M : Modules)
      // Close module
      if (M)
        checkResult(cuModuleUnload(M), "Error returned from cuModuleUnload\n");

    for (DeviceDataTy &D : DeviceData) {
      // Destroy context
      if (D.Context) {
        checkResult(cuCtxSetCurrent(D.Context),
                    "Error returned from cuCtxSetCurrent\n");
        CUdevice Device;
        checkResult(cuCtxGetDevice(&Device),
                    "Error returned from cuCtxGetDevice\n");
        checkResult(cuDevicePrimaryCtxRelease(Device),
                    "Error returned from cuDevicePrimaryCtxRelease\n");
      }
    }
  }

  // Check whether a given DeviceId is valid
  bool isValidDeviceId(const int DeviceId) const {
    return DeviceId >= 0 && DeviceId < NumberOfDevices;
  }

  int getNumOfDevices() const { return NumberOfDevices; }

  void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; }

  int initDevice(const int DeviceId) {
    CUdevice Device;

    DP("Getting device %d\n", DeviceId);
    CUresult Err = cuDeviceGet(&Device, DeviceId);
    if (!checkResult(Err, "Error returned from cuDeviceGet\n"))
      return OFFLOAD_FAIL;

    // Query the current flags of the primary context and set its flags if
    // it is inactive
    unsigned int FormerPrimaryCtxFlags = 0;
    int FormerPrimaryCtxIsActive = 0;
    Err = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags,
                                     &FormerPrimaryCtxIsActive);
    if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxGetState\n"))
      return OFFLOAD_FAIL;

    if (FormerPrimaryCtxIsActive) {
      DP("The primary context is active, no change to its flags\n");
      if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) !=
          CU_CTX_SCHED_BLOCKING_SYNC)
        DP("Warning: the current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n");
    } else {
      DP("The primary context is inactive, set its flags to "
         "CU_CTX_SCHED_BLOCKING_SYNC\n");
      Err = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC);
      if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxSetFlags\n"))
        return OFFLOAD_FAIL;
    }

    // Retain the per device primary context and save it to use whenever this
    // device is selected.
    Err = cuDevicePrimaryCtxRetain(&DeviceData[DeviceId].Context, Device);
    if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxRetain\n"))
      return OFFLOAD_FAIL;

    Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    // Initialize stream pool
    if (!StreamManager->initializeDeviceStreamPool(DeviceId))
      return OFFLOAD_FAIL;

    // Query attributes to determine number of threads/block and blocks/grid.
    int MaxGridDimX;
    Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
                               Device);
    if (Err != CUDA_SUCCESS) {
      DP("Error getting max grid dimension, using default value %d\n",
         DeviceRTLTy::DefaultNumTeams);
      DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::DefaultNumTeams;
    } else if (MaxGridDimX <= DeviceRTLTy::HardTeamLimit) {
      DP("Using %d CUDA blocks per grid\n", MaxGridDimX);
      DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX;
    } else {
      DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping "
         "at the hard limit\n",
         MaxGridDimX, DeviceRTLTy::HardTeamLimit);
      DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::HardTeamLimit;
    }

    // We are only exploiting threads along the x axis.
    int MaxBlockDimX;
    Err = cuDeviceGetAttribute(&MaxBlockDimX,
                               CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device);
    if (Err != CUDA_SUCCESS) {
      DP("Error getting max block dimension, using default value %d\n",
         DeviceRTLTy::DefaultNumThreads);
      DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads;
    } else if (MaxBlockDimX <= DeviceRTLTy::HardThreadLimit) {
      DP("Using %d CUDA threads per block\n", MaxBlockDimX);
      DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX;
    } else {
      DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
         "capping at the hard limit\n",
         MaxBlockDimX, DeviceRTLTy::HardThreadLimit);
      DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit;
    }

    // Get and set warp size
    int WarpSize;
    Err =
        cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device);
    if (Err != CUDA_SUCCESS) {
      DP("Error getting warp size, assuming default value 32\n");
      DeviceData[DeviceId].WarpSize = 32;
    } else {
      DP("Using warp size %d\n", WarpSize);
      DeviceData[DeviceId].WarpSize = WarpSize;
    }

    // Adjust teams to the env variables
    if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) {
      DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
         EnvTeamLimit);
      DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit;
    }

    if (getDebugLevel() || (getInfoLevel() & OMP_INFOTYPE_PLUGIN_KERNEL))
      INFO(DeviceId,
           "Device supports up to %d CUDA blocks and %d threads with a "
           "warp size of %d\n",
           DeviceData[DeviceId].BlocksPerGrid,
           DeviceData[DeviceId].ThreadsPerBlock, DeviceData[DeviceId].WarpSize);

    // Set default number of teams
    if (EnvNumTeams > 0) {
      DP("Default number of teams set according to environment %d\n",
         EnvNumTeams);
      DeviceData[DeviceId].NumTeams = EnvNumTeams;
    } else {
      DeviceData[DeviceId].NumTeams = DeviceRTLTy::DefaultNumTeams;
      DP("Default number of teams set according to library's default %d\n",
         DeviceRTLTy::DefaultNumTeams);
    }

    if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) {
      DP("Default number of teams exceeds device limit, capping at %d\n",
         DeviceData[DeviceId].BlocksPerGrid);
      DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid;
    }

    // Set default number of threads
    DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads;
    DP("Default number of threads set according to library's default %d\n",
       DeviceRTLTy::DefaultNumThreads);
    if (DeviceData[DeviceId].NumThreads >
        DeviceData[DeviceId].ThreadsPerBlock) {
      DP("Default number of threads exceeds device limit, capping at %d\n",
         DeviceData[DeviceId].ThreadsPerBlock);
      DeviceData[DeviceId].NumThreads = DeviceData[DeviceId].ThreadsPerBlock;
    }

    return OFFLOAD_SUCCESS;
  }
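
  // To summarize the selection above (illustrative): NumTeams is OMP_NUM_TEAMS
  // if set and positive, otherwise DefaultNumTeams (128), and is then capped
  // by BlocksPerGrid, which itself was capped by OMP_TEAM_LIMIT and the 64k
  // hard limit. NumThreads is DefaultNumThreads (128) capped by
  // ThreadsPerBlock.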

  __tgt_target_table *loadBinary(const int DeviceId,
                                 const __tgt_device_image *Image) {
    // Set the context we are using
    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return nullptr;

    // Clear the offload table as we are going to create a new one.
    clearOffloadEntriesTable(DeviceId);

    // Create the module and extract the function pointers.
    CUmodule Module;
    DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart));
    Err = cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr);
    if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n"))
      return nullptr;

    DP("CUDA module successfully loaded!\n");

    Modules.push_back(Module);

    // Find the symbols in the module by name.
    const __tgt_offload_entry *HostBegin = Image->EntriesBegin;
    const __tgt_offload_entry *HostEnd = Image->EntriesEnd;

    std::list<KernelTy> &KernelsList = DeviceData[DeviceId].KernelsList;
    for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) {
      if (!E->addr) {
        // We return nullptr when this occurs: the host should always provide
        // an address that uniquely identifies the target region.
        DP("Invalid binary: host entry '<null>' (size = %zd)...\n", E->size);
        return nullptr;
      }

      if (E->size) {
        __tgt_offload_entry Entry = *E;
        CUdeviceptr CUPtr;
        size_t CUSize;
        Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name);
        // We keep this style here because we need the name
        if (Err != CUDA_SUCCESS) {
          DP("Loading global '%s' (Failed)\n", E->name);
          CUDA_ERR_STRING(Err);
          return nullptr;
        }

        if (CUSize != E->size) {
          DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name,
             CUSize, E->size);
          return nullptr;
        }

        DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
           DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr));

        Entry.addr = (void *)(CUPtr);

        // Note: In the current implementation declare target variables
        // can either be link or to. This means that once unified
        // memory is activated via the requires directive, the variable
        // can be used directly from the host in both cases.
        // TODO: when variable types other than to or link are added,
        // the below condition should be changed to explicitly
        // check for to and link variable types:
        // (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags &
        // OMP_DECLARE_TARGET_LINK || e->flags == OMP_DECLARE_TARGET_TO))
        if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
          // If unified memory is present any target link or to variables
          // can access host addresses directly. There is no longer a
          // need for device copies.
          cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *));
          DP("Copy linked variable host address (" DPxMOD
             ") to device address (" DPxMOD ")\n",
             DPxPTR(*((void **)E->addr)), DPxPTR(CUPtr));
        }

        addOffloadEntry(DeviceId, Entry);

        continue;
      }

      CUfunction Func;
      Err = cuModuleGetFunction(&Func, Module, E->name);
      // We keep this style here because we need the name
      if (Err != CUDA_SUCCESS) {
        DP("Loading '%s' (Failed)\n", E->name);
        CUDA_ERR_STRING(Err);
        return nullptr;
      }

      DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
         DPxPTR(E - HostBegin), E->name, DPxPTR(Func));

      // default value GENERIC (in case symbol is missing from cubin file)
      int8_t ExecModeVal = ExecutionModeType::GENERIC;
      std::string ExecModeNameStr(E->name);
      ExecModeNameStr += "_exec_mode";
      const char *ExecModeName = ExecModeNameStr.c_str();
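      // For a kernel entry named e.g. "__omp_offloading_401_abc_foo_l10"
      // (a hypothetical name), the compiler is expected to emit a companion
      // global "__omp_offloading_401_abc_foo_l10_exec_mode" holding 0 (SPMD)
      // or 1 (GENERIC).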

      CUdeviceptr ExecModePtr;
      size_t CUSize;
      Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName);
      if (Err == CUDA_SUCCESS) {
        if (CUSize != sizeof(int8_t)) {
          DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
             ExecModeName, CUSize, sizeof(int8_t));
          return nullptr;
        }

        Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize);
        if (Err != CUDA_SUCCESS) {
          DP("Error when copying data from device to host. Pointers: "
             "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
             DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize);
          CUDA_ERR_STRING(Err);
          return nullptr;
        }

        if (ExecModeVal < 0 || ExecModeVal > 1) {
          DP("Error: wrong exec_mode value specified in cubin file: %d\n",
             ExecModeVal);
          return nullptr;
        }
      } else {
        DP("Loading global exec_mode '%s' - symbol missing, using default "
           "value GENERIC (1)\n",
           ExecModeName);
        CUDA_ERR_STRING(Err);
      }

      KernelsList.emplace_back(Func, ExecModeVal);

      __tgt_offload_entry Entry = *E;
      Entry.addr = &KernelsList.back();
      addOffloadEntry(DeviceId, Entry);
    }

    // Send the device environment data to the device
    {
      omptarget_device_environmentTy DeviceEnv{0};

#ifdef OMPTARGET_DEBUG
      if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG"))
        DeviceEnv.debug_level = std::stoi(EnvStr);
#endif

      const char *DeviceEnvName = "omptarget_device_environment";
      CUdeviceptr DeviceEnvPtr;
      size_t CUSize;

      Err = cuModuleGetGlobal(&DeviceEnvPtr, &CUSize, Module, DeviceEnvName);
      if (Err == CUDA_SUCCESS) {
        if (CUSize != sizeof(DeviceEnv)) {
          DP("Global device_environment '%s' - size mismatch (%zu != %zu)\n",
             DeviceEnvName, CUSize, sizeof(DeviceEnv));
          return nullptr;
        }

        Err = cuMemcpyHtoD(DeviceEnvPtr, &DeviceEnv, CUSize);
        if (Err != CUDA_SUCCESS) {
          DP("Error when copying data from host to device. Pointers: "
             "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
             DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize);
          CUDA_ERR_STRING(Err);
          return nullptr;
        }

        DP("Sending global device environment data %zu bytes\n", CUSize);
      } else {
        DP("Finding global device environment '%s' - symbol missing.\n",
           DeviceEnvName);
        DP("Continue, assuming this is a device RTL that does not accept an "
           "environment setting.\n");
      }
    }

    return getOffloadEntriesTable(DeviceId);
  }
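
  // On success, the table returned above covers every entry registered for
  // this image: globals carry their device addresses in Entry.addr, while
  // kernels point at the corresponding KernelTy in KernelsList.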

  void *dataAlloc(const int DeviceId, const int64_t Size) const {
    if (Size == 0)
      return nullptr;

    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return nullptr;

    CUdeviceptr DevicePtr;
    Err = cuMemAlloc(&DevicePtr, Size);
    if (!checkResult(Err, "Error returned from cuMemAlloc\n"))
      return nullptr;

    return (void *)DevicePtr;
  }

  int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr,
                 const int64_t Size, __tgt_async_info *AsyncInfoPtr) const {
    assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");

    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    CUstream Stream = getStream(DeviceId, AsyncInfoPtr);

    Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream);
    if (Err != CUDA_SUCCESS) {
      DP("Error when copying data from host to device. Pointers: host = " DPxMOD
         ", device = " DPxMOD ", size = %" PRId64 "\n",
         DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
      CUDA_ERR_STRING(Err);
      return OFFLOAD_FAIL;
    }

    return OFFLOAD_SUCCESS;
  }

  int dataRetrieve(const int DeviceId, void *HstPtr, const void *TgtPtr,
                   const int64_t Size, __tgt_async_info *AsyncInfoPtr) const {
    assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");

    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    CUstream Stream = getStream(DeviceId, AsyncInfoPtr);

    Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
    if (Err != CUDA_SUCCESS) {
      DP("Error when copying data from device to host. Pointers: host = " DPxMOD
         ", device = " DPxMOD ", size = %" PRId64 "\n",
         DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
      CUDA_ERR_STRING(Err);
      return OFFLOAD_FAIL;
    }

    return OFFLOAD_SUCCESS;
  }

  int dataExchange(int SrcDevId, const void *SrcPtr, int DstDevId, void *DstPtr,
                   int64_t Size, __tgt_async_info *AsyncInfoPtr) const {
    assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");

    CUresult Err = cuCtxSetCurrent(DeviceData[SrcDevId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    CUstream Stream = getStream(SrcDevId, AsyncInfoPtr);

    // If the copy spans two devices, we try a peer-to-peer copy first
    if (SrcDevId != DstDevId) {
      int CanAccessPeer = 0;
      Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId);
      if (Err != CUDA_SUCCESS) {
        DP("Error returned from cuDeviceCanAccessPeer. src = %" PRId32
           ", dst = %" PRId32 "\n",
           SrcDevId, DstDevId);
        CUDA_ERR_STRING(Err);
        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
      }

      if (!CanAccessPeer) {
        DP("P2P memcpy not supported, falling back to D2D memcpy\n");
        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
      }

      Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0);
      if (Err != CUDA_SUCCESS) {
        DP("Error returned from cuCtxEnablePeerAccess. src = %" PRId32
           ", dst = %" PRId32 "\n",
           SrcDevId, DstDevId);
        CUDA_ERR_STRING(Err);
        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
      }

      Err = cuMemcpyPeerAsync((CUdeviceptr)DstPtr, DeviceData[DstDevId].Context,
                              (CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context,
                              Size, Stream);
      if (Err == CUDA_SUCCESS)
        return OFFLOAD_SUCCESS;

      DP("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD
         ", src_id = %" PRId32 ", dst_ptr = " DPxMOD ", dst_id = %" PRId32 "\n",
         DPxPTR(SrcPtr), SrcDevId, DPxPTR(DstPtr), DstDevId);
      CUDA_ERR_STRING(Err);
    }

    return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
  }

  int dataDelete(const int DeviceId, void *TgtPtr) const {
    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    Err = cuMemFree((CUdeviceptr)TgtPtr);
    if (!checkResult(Err, "Error returned from cuMemFree\n"))
      return OFFLOAD_FAIL;

    return OFFLOAD_SUCCESS;
  }

  int runTargetTeamRegion(const int DeviceId, void *TgtEntryPtr, void **TgtArgs,
                          ptrdiff_t *TgtOffsets, const int ArgNum,
                          const int TeamNum, const int ThreadLimit,
                          const unsigned int LoopTripCount,
                          __tgt_async_info *AsyncInfo) const {
    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    // All args are references.
    std::vector<void *> Args(ArgNum);
    std::vector<void *> Ptrs(ArgNum);

    for (int I = 0; I < ArgNum; ++I) {
      Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
      Args[I] = &Ptrs[I];
    }
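    // cuLaunchKernel expects an array of pointers to the argument values: for
    // a kernel parameter `int *A` (illustrative), Ptrs[I] holds the device
    // address of A and Args[I] points at Ptrs[I].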

    KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr);

    int CudaThreadsPerBlock;
    if (ThreadLimit > 0) {
      DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
      CudaThreadsPerBlock = ThreadLimit;
      // Add master warp if necessary
      if (KernelInfo->ExecutionMode == GENERIC) {
        DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize);
        CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
      }
    } else {
      DP("Setting CUDA threads per block to default %d\n",
         DeviceData[DeviceId].NumThreads);
      CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads;
    }

    if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) {
      DP("Threads per block capped at device limit %d\n",
         DeviceData[DeviceId].ThreadsPerBlock);
      CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock;
    }

    if (!KernelInfo->MaxThreadsPerBlock) {
      Err = cuFuncGetAttribute(&KernelInfo->MaxThreadsPerBlock,
                               CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                               KernelInfo->Func);
      if (!checkResult(Err, "Error returned from cuFuncGetAttribute\n"))
        return OFFLOAD_FAIL;
    }

    if (KernelInfo->MaxThreadsPerBlock < CudaThreadsPerBlock) {
      DP("Threads per block capped at kernel limit %d\n",
         KernelInfo->MaxThreadsPerBlock);
      CudaThreadsPerBlock = KernelInfo->MaxThreadsPerBlock;
    }

    unsigned int CudaBlocksPerGrid;
    if (TeamNum <= 0) {
      if (LoopTripCount > 0 && EnvNumTeams < 0) {
        if (KernelInfo->ExecutionMode == SPMD) {
          // We have a combined construct, i.e. `target teams distribute
          // parallel for [simd]`. We launch as many teams as needed so that
          // each thread executes one iteration of the loop, rounding up to
          // the nearest integer.
          CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1;
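          // E.g. (illustrative): LoopTripCount = 1000 and CudaThreadsPerBlock
          // = 128 yield CudaBlocksPerGrid = 8.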
        } else {
          // If we reach this point, then we have a non-combined construct, i.e.
          // `teams distribute` with a nested `parallel for` and each team is
          // assigned one iteration of the `distribute` loop. E.g.:
          //
          // #pragma omp target teams distribute
          // for(...loop_tripcount...) {
          //   #pragma omp parallel for
          //   for(...) {}
          // }
          //
          // Threads within a team will execute the iterations of the `parallel`
          // loop.
          CudaBlocksPerGrid = LoopTripCount;
        }
        DP("Using %d teams due to loop trip count %" PRIu32
           " and number of threads per block %d\n",
           CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock);
      } else {
        DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams);
        CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams;
      }
    } else if (TeamNum > DeviceData[DeviceId].BlocksPerGrid) {
      DP("Capping number of teams to team limit %d\n",
         DeviceData[DeviceId].BlocksPerGrid);
      CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid;
    } else {
      DP("Using requested number of teams %d\n", TeamNum);
      CudaBlocksPerGrid = TeamNum;
    }

    if (getDebugLevel() || (getInfoLevel() & OMP_INFOTYPE_PLUGIN_KERNEL))
      INFO(DeviceId,
           "Launching kernel %s with %d blocks and %d threads in %s "
           "mode\n",
           (getOffloadEntry(DeviceId, TgtEntryPtr))
               ? getOffloadEntry(DeviceId, TgtEntryPtr)->name
               : "(null)",
           CudaBlocksPerGrid, CudaThreadsPerBlock,
           (KernelInfo->ExecutionMode == SPMD) ? "SPMD" : "Generic");

    CUstream Stream = getStream(DeviceId, AsyncInfo);
    Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1,
                         /* gridDimZ */ 1, CudaThreadsPerBlock,
                         /* blockDimY */ 1, /* blockDimZ */ 1,
                         /* sharedMemBytes */ 0, Stream, &Args[0], nullptr);
    if (!checkResult(Err, "Error returned from cuLaunchKernel\n"))
      return OFFLOAD_FAIL;

    DP("Launch of entry point at " DPxMOD " successful!\n",
       DPxPTR(TgtEntryPtr));

    return OFFLOAD_SUCCESS;
  }

  int synchronize(const int DeviceId, __tgt_async_info *AsyncInfoPtr) const {
    CUstream Stream = reinterpret_cast<CUstream>(AsyncInfoPtr->Queue);
    CUresult Err = cuStreamSynchronize(Stream);
    if (Err != CUDA_SUCCESS) {
      DP("Error when synchronizing stream. stream = " DPxMOD
         ", async info ptr = " DPxMOD "\n",
         DPxPTR(Stream), DPxPTR(AsyncInfoPtr));
      CUDA_ERR_STRING(Err);
      return OFFLOAD_FAIL;
    }

    // Once the stream is synchronized, return it to the stream pool and reset
    // the async info. This makes sure the synchronization only covers its own
    // tasks.
    StreamManager->returnStream(
        DeviceId, reinterpret_cast<CUstream>(AsyncInfoPtr->Queue));
    AsyncInfoPtr->Queue = nullptr;

    return OFFLOAD_SUCCESS;
  }
};

DeviceRTLTy DeviceRTL;
} // namespace

// Exposed library API functions
#ifdef __cplusplus
extern "C" {
#endif

int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
  return elf_check_machine(image, /* EM_CUDA */ 190);
}

int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); }

int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
  DP("Init requires flags to %" PRId64 "\n", RequiresFlags);
  DeviceRTL.setRequiresFlag(RequiresFlags);
  return RequiresFlags;
}

int32_t __tgt_rtl_is_data_exchangable(int32_t src_dev_id, int32_t dst_dev_id) {
  if (DeviceRTL.isValidDeviceId(src_dev_id) &&
      DeviceRTL.isValidDeviceId(dst_dev_id))
    return 1;

  return 0;
}

int32_t __tgt_rtl_init_device(int32_t device_id) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.initDevice(device_id);
}

__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
                                          __tgt_device_image *image) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.loadBinary(device_id, image);
}

void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.dataAlloc(device_id, size);
}

int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
                              int64_t size) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info async_info;
  const int32_t rc = __tgt_rtl_data_submit_async(device_id, tgt_ptr, hst_ptr,
                                                 size, &async_info);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &async_info);
}
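
// The remaining synchronous entry points follow the same pattern as
// __tgt_rtl_data_submit above: create a local __tgt_async_info, call the
// corresponding *_async variant, then synchronize on the stream it acquired.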

int32_t __tgt_rtl_data_submit_async(int32_t device_id, void *tgt_ptr,
                                    void *hst_ptr, int64_t size,
                                    __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");

  return DeviceRTL.dataSubmit(device_id, tgt_ptr, hst_ptr, size,
                              async_info_ptr);
}

int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
                                int64_t size) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info async_info;
  const int32_t rc = __tgt_rtl_data_retrieve_async(device_id, hst_ptr, tgt_ptr,
                                                   size, &async_info);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &async_info);
}

int32_t __tgt_rtl_data_retrieve_async(int32_t device_id, void *hst_ptr,
                                      void *tgt_ptr, int64_t size,
                                      __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");

  return DeviceRTL.dataRetrieve(device_id, hst_ptr, tgt_ptr, size,
                                async_info_ptr);
}

int32_t __tgt_rtl_data_exchange_async(int32_t src_dev_id, void *src_ptr,
                                      int32_t dst_dev_id, void *dst_ptr,
                                      int64_t size,
                                      __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
  assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");

  return DeviceRTL.dataExchange(src_dev_id, src_ptr, dst_dev_id, dst_ptr, size,
                                async_info_ptr);
}

int32_t __tgt_rtl_data_exchange(int32_t src_dev_id, void *src_ptr,
                                int32_t dst_dev_id, void *dst_ptr,
                                int64_t size) {
  assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
  assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");

  __tgt_async_info async_info;
  const int32_t rc = __tgt_rtl_data_exchange_async(
      src_dev_id, src_ptr, dst_dev_id, dst_ptr, size, &async_info);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(src_dev_id, &async_info);
}

int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.dataDelete(device_id, tgt_ptr);
}

int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
                                         void **tgt_args,
                                         ptrdiff_t *tgt_offsets,
                                         int32_t arg_num, int32_t team_num,
                                         int32_t thread_limit,
                                         uint64_t loop_tripcount) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info async_info;
  const int32_t rc = __tgt_rtl_run_target_team_region_async(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
      thread_limit, loop_tripcount, &async_info);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &async_info);
}

int32_t __tgt_rtl_run_target_team_region_async(
    int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
    ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
    int32_t thread_limit, uint64_t loop_tripcount,
    __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.runTargetTeamRegion(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
      thread_limit, loop_tripcount, async_info_ptr);
}

int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
                                    void **tgt_args, ptrdiff_t *tgt_offsets,
                                    int32_t arg_num) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info async_info;
  const int32_t rc = __tgt_rtl_run_target_region_async(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, &async_info);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &async_info);
}

int32_t __tgt_rtl_run_target_region_async(int32_t device_id,
                                          void *tgt_entry_ptr, void **tgt_args,
                                          ptrdiff_t *tgt_offsets,
                                          int32_t arg_num,
                                          __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return __tgt_rtl_run_target_team_region_async(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num,
      /* team_num */ 1, /* thread_limit */ 1, /* loop_tripcount */ 0,
      async_info_ptr);
}

int32_t __tgt_rtl_synchronize(int32_t device_id,
                              __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");
  assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr");

  return DeviceRTL.synchronize(device_id, async_info_ptr);
}

#ifdef __cplusplus
}
#endif