1 //===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // RTL for CUDA machine
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include <cassert>
14 #include <cstddef>
15 #include <cuda.h>
16 #include <list>
17 #include <memory>
18 #include <mutex>
19 #include <string>
20 #include <vector>
21 
22 #include "Debug.h"
23 #include "omptargetplugin.h"
24 
25 #define TARGET_NAME CUDA
26 #define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"
27 
28 // Utility for retrieving and printing CUDA error string.
29 #ifdef OMPTARGET_DEBUG
30 #define CUDA_ERR_STRING(err)                                                   \
31   do {                                                                         \
32     if (getDebugLevel() > 0) {                                                 \
33       const char *errStr;                                                      \
34       cuGetErrorString(err, &errStr);                                          \
35       DP("CUDA error is: %s\n", errStr);                                       \
36     }                                                                          \
37   } while (false)
38 #else // OMPTARGET_DEBUG
39 #define CUDA_ERR_STRING(err) {}
40 #endif // OMPTARGET_DEBUG
41 
42 #include "../../common/elf_common.c"
43 
/// Keep entries table per device.
struct FuncOrGblEntryTy {
  // Table whose begin/end pointers reference \p Entries; refreshed by
  // DeviceRTLTy::getOffloadEntriesTable() before being handed out.
  __tgt_target_table Table;
  // Backing storage for the offload entries of the currently loaded image.
  std::vector<__tgt_offload_entry> Entries;
};
49 
/// Kernel execution modes. The numeric values must line up with the
/// `<kernel>_exec_mode` global read from the cubin in loadBinary()
/// (SPMD = 0, GENERIC = 1).
enum ExecutionModeType {
  SPMD, // constructors, destructors,
  // combined constructs (`teams distribute parallel for [simd]`)
  GENERIC, // everything else
  NONE
};
56 
/// Use a single entity to encode a kernel and a set of flags.
struct KernelTy {
  // Resolved device function handle for this target region.
  CUfunction Func;

  // execution mode of kernel
  // 0 - SPMD mode (without master warp)
  // 1 - Generic mode (with master warp)
  int8_t ExecutionMode;

  /// Maximal number of threads per block for this kernel.
  int MaxThreadsPerBlock = 0;

  KernelTy(CUfunction _Func, int8_t _ExecutionMode)
      : Func(_Func), ExecutionMode(_ExecutionMode) {}
};
72 
/// Device environment data
/// Manually sync with the deviceRTL side for now, move to a dedicated header
/// file later.
struct omptarget_device_environmentTy {
  // Debug verbosity pushed to the device RTL; set from
  // LIBOMPTARGET_DEVICE_RTL_DEBUG in loadBinary() (debug builds only).
  int32_t debug_level;
};
79 
80 namespace {
81 bool checkResult(CUresult Err, const char *ErrMsg) {
82   if (Err == CUDA_SUCCESS)
83     return true;
84 
85   DP("%s", ErrMsg);
86   CUDA_ERR_STRING(Err);
87   return false;
88 }
89 
90 int memcpyDtoD(const void *SrcPtr, void *DstPtr, int64_t Size,
91                CUstream Stream) {
92   CUresult Err =
93       cuMemcpyDtoDAsync((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, Size, Stream);
94 
95   if (Err != CUDA_SUCCESS) {
96     DP("Error when copying data from device to device. Pointers: src "
97        "= " DPxMOD ", dst = " DPxMOD ", size = %" PRId64 "\n",
98        DPxPTR(SrcPtr), DPxPTR(DstPtr), Size);
99     CUDA_ERR_STRING(Err);
100     return OFFLOAD_FAIL;
101   }
102 
103   return OFFLOAD_SUCCESS;
104 }
105 
// Structure contains per-device data
struct DeviceDataTy {
  /// List that contains all the kernels.
  // std::list so addresses stored in offload entries stay stable as
  // kernels are appended.
  std::list<KernelTy> KernelsList;

  // One entries table per loaded image; the back() element is the table
  // currently being built/used.
  std::list<FuncOrGblEntryTy> FuncGblEntries;

  // Retained primary context of the device; nullptr until initDevice().
  CUcontext Context = nullptr;
  // Device properties
  int ThreadsPerBlock = 0;
  int BlocksPerGrid = 0;
  int WarpSize = 0;
  // OpenMP properties
  int NumTeams = 0;
  int NumThreads = 0;
};
122 
123 class StreamManagerTy {
124   int NumberOfDevices;
125   // The initial size of stream pool
126   int EnvNumInitialStreams;
127   // Per-device stream mutex
128   std::vector<std::unique_ptr<std::mutex>> StreamMtx;
129   // Per-device stream Id indicates the next available stream in the pool
130   std::vector<int> NextStreamId;
131   // Per-device stream pool
132   std::vector<std::vector<CUstream>> StreamPool;
133   // Reference to per-device data
134   std::vector<DeviceDataTy> &DeviceData;
135 
136   // If there is no CUstream left in the pool, we will resize the pool to
137   // allocate more CUstream. This function should be called with device mutex,
138   // and we do not resize to smaller one.
139   void resizeStreamPool(const int DeviceId, const size_t NewSize) {
140     std::vector<CUstream> &Pool = StreamPool[DeviceId];
141     const size_t CurrentSize = Pool.size();
142     assert(NewSize > CurrentSize && "new size is not larger than current size");
143 
144     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
145     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) {
146       // We will return if cannot switch to the right context in case of
147       // creating bunch of streams that are not corresponding to the right
148       // device. The offloading will fail later because selected CUstream is
149       // nullptr.
150       return;
151     }
152 
153     Pool.resize(NewSize, nullptr);
154 
155     for (size_t I = CurrentSize; I < NewSize; ++I) {
156       checkResult(cuStreamCreate(&Pool[I], CU_STREAM_NON_BLOCKING),
157                   "Error returned from cuStreamCreate\n");
158     }
159   }
160 
161 public:
162   StreamManagerTy(const int NumberOfDevices,
163                   std::vector<DeviceDataTy> &DeviceData)
164       : NumberOfDevices(NumberOfDevices), EnvNumInitialStreams(32),
165         DeviceData(DeviceData) {
166     StreamPool.resize(NumberOfDevices);
167     NextStreamId.resize(NumberOfDevices);
168     StreamMtx.resize(NumberOfDevices);
169 
170     if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS"))
171       EnvNumInitialStreams = std::stoi(EnvStr);
172 
173     // Initialize the next stream id
174     std::fill(NextStreamId.begin(), NextStreamId.end(), 0);
175 
176     // Initialize stream mutex
177     for (std::unique_ptr<std::mutex> &Ptr : StreamMtx)
178       Ptr = std::make_unique<std::mutex>();
179   }
180 
181   ~StreamManagerTy() {
182     // Destroy streams
183     for (int I = 0; I < NumberOfDevices; ++I) {
184       checkResult(cuCtxSetCurrent(DeviceData[I].Context),
185                   "Error returned from cuCtxSetCurrent\n");
186 
187       for (CUstream &S : StreamPool[I]) {
188         if (S)
189           checkResult(cuStreamDestroy(S),
190                       "Error returned from cuStreamDestroy\n");
191       }
192     }
193   }
194 
195   // Get a CUstream from pool. Per-device next stream id always points to the
196   // next available CUstream. That means, CUstreams [0, id-1] have been
197   // assigned, and [id,] are still available. If there is no CUstream left, we
198   // will ask more CUstreams from CUDA RT. Each time a CUstream is assigned,
199   // the id will increase one.
200   // xxxxxs+++++++++
201   //      ^
202   //      id
203   // After assignment, the pool becomes the following and s is assigned.
204   // xxxxxs+++++++++
205   //       ^
206   //       id
207   CUstream getStream(const int DeviceId) {
208     const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
209     int &Id = NextStreamId[DeviceId];
210     // No CUstream left in the pool, we need to request from CUDA RT
211     if (Id == StreamPool[DeviceId].size()) {
212       // By default we double the stream pool every time
213       resizeStreamPool(DeviceId, Id * 2);
214     }
215     return StreamPool[DeviceId][Id++];
216   }
217 
218   // Return a CUstream back to pool. As mentioned above, per-device next
219   // stream is always points to the next available CUstream, so when we return
220   // a CUstream, we need to first decrease the id, and then copy the CUstream
221   // back.
222   // It is worth noting that, the order of streams return might be different
223   // from that they're assigned, that saying, at some point, there might be
224   // two identical CUstreams.
225   // xxax+a+++++
226   //     ^
227   //     id
228   // However, it doesn't matter, because they're always on the two sides of
229   // id. The left one will in the end be overwritten by another CUstream.
230   // Therefore, after several execution, the order of pool might be different
231   // from its initial state.
232   void returnStream(const int DeviceId, CUstream Stream) {
233     const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
234     int &Id = NextStreamId[DeviceId];
235     assert(Id > 0 && "Wrong stream ID");
236     StreamPool[DeviceId][--Id] = Stream;
237   }
238 
239   bool initializeDeviceStreamPool(const int DeviceId) {
240     assert(StreamPool[DeviceId].empty() && "stream pool has been initialized");
241 
242     resizeStreamPool(DeviceId, EnvNumInitialStreams);
243 
244     // Check the size of stream pool
245     if (StreamPool[DeviceId].size() != EnvNumInitialStreams)
246       return false;
247 
248     // Check whether each stream is valid
249     for (CUstream &S : StreamPool[DeviceId])
250       if (!S)
251         return false;
252 
253     return true;
254   }
255 };
256 
class DeviceRTLTy {
  // Number of devices reported by cuDeviceGetCount; 0 until the constructor
  // succeeds.
  int NumberOfDevices;
  // OpenMP environment properties
  int EnvNumTeams;  // from OMP_NUM_TEAMS; -1 when unset
  int EnvTeamLimit; // from OMP_TEAM_LIMIT; -1 when unset
  // OpenMP requires flags
  int64_t RequiresFlags;

  static constexpr const int HardTeamLimit = 1U << 16U; // 64k
  static constexpr const int HardThreadLimit = 1024;
  static constexpr const int DefaultNumTeams = 128;
  static constexpr const int DefaultNumThreads = 128;

  // Holds a reference to DeviceData; must be destroyed before the contexts
  // it uses (see the destructor).
  std::unique_ptr<StreamManagerTy> StreamManager;
  std::vector<DeviceDataTy> DeviceData;
  // Modules loaded by loadBinary(); unloaded in the destructor.
  std::vector<CUmodule> Modules;
273 
  // Record entry point associated with device
  // Appends \p entry to the entries table currently being built for
  // \p DeviceId (the back of FuncGblEntries).
  void addOffloadEntry(const int DeviceId, const __tgt_offload_entry entry) {
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
    E.Entries.push_back(entry);
  }
279 
280   // Return a pointer to the entry associated with the pointer
281   const __tgt_offload_entry *getOffloadEntry(const int DeviceId,
282                                              const void *Addr) const {
283     for (const __tgt_offload_entry &Itr :
284          DeviceData[DeviceId].FuncGblEntries.back().Entries)
285       if (Itr.addr == Addr)
286         return &Itr;
287 
288     return nullptr;
289   }
290 
  // Return the pointer to the target entries table
  // Refreshes the begin/end pointers of the table from the vector storage
  // before returning; nullptr when the table is empty.
  __tgt_target_table *getOffloadEntriesTable(const int DeviceId) {
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();

    if (E.Entries.empty())
      return nullptr;

    // Update table info according to the entries and return the pointer
    E.Table.EntriesBegin = E.Entries.data();
    E.Table.EntriesEnd = E.Entries.data() + E.Entries.size();

    return &E.Table;
  }
304 
  // Clear entries table for a device
  // Starts a fresh table by appending a new FuncOrGblEntryTy; previous
  // tables stay alive in the list so earlier pointers remain valid.
  void clearOffloadEntriesTable(const int DeviceId) {
    DeviceData[DeviceId].FuncGblEntries.emplace_back();
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
    E.Entries.clear();
    E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr;
  }
312 
  // Return the stream bound to \p AsyncInfoPtr, borrowing one from the
  // stream manager on first use so that all operations sharing the async
  // info object are serialized on the same CUstream.
  CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfoPtr) const {
    assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");

    if (!AsyncInfoPtr->Queue)
      AsyncInfoPtr->Queue = StreamManager->getStream(DeviceId);

    return reinterpret_cast<CUstream>(AsyncInfoPtr->Queue);
  }
321 
public:
  // This class should not be copied
  DeviceRTLTy(const DeviceRTLTy &) = delete;
  DeviceRTLTy(DeviceRTLTy &&) = delete;

  // Initialize the CUDA driver, count devices, read team-related environment
  // variables, and create the stream manager. On any driver failure the
  // constructor returns early with NumberOfDevices left at 0.
  DeviceRTLTy()
      : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1),
        RequiresFlags(OMP_REQ_UNDEFINED) {

    DP("Start initializing CUDA\n");

    CUresult Err = cuInit(0);
    if (!checkResult(Err, "Error returned from cuInit\n")) {
      return;
    }

    Err = cuDeviceGetCount(&NumberOfDevices);
    if (!checkResult(Err, "Error returned from cuDeviceGetCount\n"))
      return;

    if (NumberOfDevices == 0) {
      DP("There are no devices supporting CUDA.\n");
      return;
    }

    DeviceData.resize(NumberOfDevices);

    // Get environment variables regarding teams
    if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) {
      // OMP_TEAM_LIMIT has been set
      EnvTeamLimit = std::stoi(EnvStr);
      DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
    }
    if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) {
      // OMP_NUM_TEAMS has been set
      EnvNumTeams = std::stoi(EnvStr);
      DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
    }

    // The stream manager keeps a reference to DeviceData, so it must not
    // outlive this object (see the destructor ordering).
    StreamManager =
        std::make_unique<StreamManagerTy>(NumberOfDevices, DeviceData);
  }
364 
  // Tear down in dependency order: streams first (they need live contexts),
  // then modules, then the primary contexts themselves.
  ~DeviceRTLTy() {
    // First destruct stream manager in case of Contexts is destructed before it
    StreamManager = nullptr;

    for (CUmodule &M : Modules)
      // Close module
      if (M)
        checkResult(cuModuleUnload(M), "Error returned from cuModuleUnload\n");

    for (DeviceDataTy &D : DeviceData) {
      // Destroy context
      if (D.Context) {
        // The device is recovered from the context itself rather than stored
        // separately, then its primary context is released.
        checkResult(cuCtxSetCurrent(D.Context),
                    "Error returned from cuCtxSetCurrent\n");
        CUdevice Device;
        checkResult(cuCtxGetDevice(&Device),
                    "Error returned from cuCtxGetDevice\n");
        checkResult(cuDevicePrimaryCtxRelease(Device),
                    "Error returned from cuDevicePrimaryCtxRelease\n");
      }
    }
  }
387 
  // Check whether a given DeviceId is valid
  bool isValidDeviceId(const int DeviceId) const {
    return DeviceId >= 0 && DeviceId < NumberOfDevices;
  }

  // Number of CUDA devices found at construction time (0 on init failure).
  int getNumOfDevices() const { return NumberOfDevices; }

  // Record the OpenMP `requires` flags reported by the host runtime.
  void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; }
396 
397   int initDevice(const int DeviceId) {
398     CUdevice Device;
399 
400     DP("Getting device %d\n", DeviceId);
401     CUresult Err = cuDeviceGet(&Device, DeviceId);
402     if (!checkResult(Err, "Error returned from cuDeviceGet\n"))
403       return OFFLOAD_FAIL;
404 
405     // Query the current flags of the primary context and set its flags if
406     // it is inactive
407     unsigned int FormerPrimaryCtxFlags = 0;
408     int FormerPrimaryCtxIsActive = 0;
409     Err = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags,
410                                      &FormerPrimaryCtxIsActive);
411     if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxGetState\n"))
412       return OFFLOAD_FAIL;
413 
414     if (FormerPrimaryCtxIsActive) {
415       DP("The primary context is active, no change to its flags\n");
416       if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) !=
417           CU_CTX_SCHED_BLOCKING_SYNC)
418         DP("Warning the current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n");
419     } else {
420       DP("The primary context is inactive, set its flags to "
421          "CU_CTX_SCHED_BLOCKING_SYNC\n");
422       Err = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC);
423       if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxSetFlags\n"))
424         return OFFLOAD_FAIL;
425     }
426 
427     // Retain the per device primary context and save it to use whenever this
428     // device is selected.
429     Err = cuDevicePrimaryCtxRetain(&DeviceData[DeviceId].Context, Device);
430     if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxRetain\n"))
431       return OFFLOAD_FAIL;
432 
433     Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
434     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
435       return OFFLOAD_FAIL;
436 
437     // Initialize stream pool
438     if (!StreamManager->initializeDeviceStreamPool(DeviceId))
439       return OFFLOAD_FAIL;
440 
441     // Query attributes to determine number of threads/block and blocks/grid.
442     int MaxGridDimX;
443     Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
444                                Device);
445     if (Err != CUDA_SUCCESS) {
446       DP("Error getting max grid dimension, use default value %d\n",
447          DeviceRTLTy::DefaultNumTeams);
448       DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::DefaultNumTeams;
449     } else if (MaxGridDimX <= DeviceRTLTy::HardTeamLimit) {
450       DP("Using %d CUDA blocks per grid\n", MaxGridDimX);
451       DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX;
452     } else {
453       DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping "
454          "at the hard limit\n",
455          MaxGridDimX, DeviceRTLTy::HardTeamLimit);
456       DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::HardTeamLimit;
457     }
458 
459     // We are only exploiting threads along the x axis.
460     int MaxBlockDimX;
461     Err = cuDeviceGetAttribute(&MaxBlockDimX,
462                                CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device);
463     if (Err != CUDA_SUCCESS) {
464       DP("Error getting max block dimension, use default value %d\n",
465          DeviceRTLTy::DefaultNumThreads);
466       DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads;
467     } else if (MaxBlockDimX <= DeviceRTLTy::HardThreadLimit) {
468       DP("Using %d CUDA threads per block\n", MaxBlockDimX);
469       DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX;
470     } else {
471       DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
472          "capping at the hard limit\n",
473          MaxBlockDimX, DeviceRTLTy::HardThreadLimit);
474       DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit;
475     }
476 
477     // Get and set warp size
478     int WarpSize;
479     Err =
480         cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device);
481     if (Err != CUDA_SUCCESS) {
482       DP("Error getting warp size, assume default value 32\n");
483       DeviceData[DeviceId].WarpSize = 32;
484     } else {
485       DP("Using warp size %d\n", WarpSize);
486       DeviceData[DeviceId].WarpSize = WarpSize;
487     }
488 
489     // Adjust teams to the env variables
490     if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) {
491       DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
492          EnvTeamLimit);
493       DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit;
494     }
495 
496     INFO(DeviceId,
497          "Device supports up to %d CUDA blocks and %d threads with a "
498          "warp size of %d\n",
499          DeviceData[DeviceId].BlocksPerGrid,
500          DeviceData[DeviceId].ThreadsPerBlock, DeviceData[DeviceId].WarpSize);
501 
502     // Set default number of teams
503     if (EnvNumTeams > 0) {
504       DP("Default number of teams set according to environment %d\n",
505          EnvNumTeams);
506       DeviceData[DeviceId].NumTeams = EnvNumTeams;
507     } else {
508       DeviceData[DeviceId].NumTeams = DeviceRTLTy::DefaultNumTeams;
509       DP("Default number of teams set according to library's default %d\n",
510          DeviceRTLTy::DefaultNumTeams);
511     }
512 
513     if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) {
514       DP("Default number of teams exceeds device limit, capping at %d\n",
515          DeviceData[DeviceId].BlocksPerGrid);
516       DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid;
517     }
518 
519     // Set default number of threads
520     DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads;
521     DP("Default number of threads set according to library's default %d\n",
522        DeviceRTLTy::DefaultNumThreads);
523     if (DeviceData[DeviceId].NumThreads >
524         DeviceData[DeviceId].ThreadsPerBlock) {
525       DP("Default number of threads exceeds device limit, capping at %d\n",
526          DeviceData[DeviceId].ThreadsPerBlock);
527       DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].ThreadsPerBlock;
528     }
529 
530     return OFFLOAD_SUCCESS;
531   }
532 
533   __tgt_target_table *loadBinary(const int DeviceId,
534                                  const __tgt_device_image *Image) {
535     // Set the context we are using
536     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
537     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
538       return nullptr;
539 
540     // Clear the offload table as we are going to create a new one.
541     clearOffloadEntriesTable(DeviceId);
542 
543     // Create the module and extract the function pointers.
544     CUmodule Module;
545     DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart));
546     Err = cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr);
547     if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n"))
548       return nullptr;
549 
550     DP("CUDA module successfully loaded!\n");
551 
552     Modules.push_back(Module);
553 
554     // Find the symbols in the module by name.
555     const __tgt_offload_entry *HostBegin = Image->EntriesBegin;
556     const __tgt_offload_entry *HostEnd = Image->EntriesEnd;
557 
558     std::list<KernelTy> &KernelsList = DeviceData[DeviceId].KernelsList;
559     for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) {
560       if (!E->addr) {
561         // We return nullptr when something like this happens, the host should
562         // have always something in the address to uniquely identify the target
563         // region.
564         DP("Invalid binary: host entry '<null>' (size = %zd)...\n", E->size);
565         return nullptr;
566       }
567 
568       if (E->size) {
569         __tgt_offload_entry Entry = *E;
570         CUdeviceptr CUPtr;
571         size_t CUSize;
572         Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name);
573         // We keep this style here because we need the name
574         if (Err != CUDA_SUCCESS) {
575           DP("Loading global '%s' (Failed)\n", E->name);
576           CUDA_ERR_STRING(Err);
577           return nullptr;
578         }
579 
580         if (CUSize != E->size) {
581           DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name,
582              CUSize, E->size);
583           return nullptr;
584         }
585 
586         DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
587            DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr));
588 
589         Entry.addr = (void *)(CUPtr);
590 
591         // Note: In the current implementation declare target variables
592         // can either be link or to. This means that once unified
593         // memory is activated via the requires directive, the variable
594         // can be used directly from the host in both cases.
595         // TODO: when variables types other than to or link are added,
596         // the below condition should be changed to explicitly
597         // check for to and link variables types:
598         // (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags &
599         // OMP_DECLARE_TARGET_LINK || e->flags == OMP_DECLARE_TARGET_TO))
600         if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
601           // If unified memory is present any target link or to variables
602           // can access host addresses directly. There is no longer a
603           // need for device copies.
604           cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *));
605           DP("Copy linked variable host address (" DPxMOD
606              ") to device address (" DPxMOD ")\n",
607              DPxPTR(*((void **)E->addr)), DPxPTR(CUPtr));
608         }
609 
610         addOffloadEntry(DeviceId, Entry);
611 
612         continue;
613       }
614 
615       CUfunction Func;
616       Err = cuModuleGetFunction(&Func, Module, E->name);
617       // We keep this style here because we need the name
618       if (Err != CUDA_SUCCESS) {
619         DP("Loading '%s' (Failed)\n", E->name);
620         CUDA_ERR_STRING(Err);
621         return nullptr;
622       }
623 
624       DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
625          DPxPTR(E - HostBegin), E->name, DPxPTR(Func));
626 
627       // default value GENERIC (in case symbol is missing from cubin file)
628       int8_t ExecModeVal = ExecutionModeType::GENERIC;
629       std::string ExecModeNameStr(E->name);
630       ExecModeNameStr += "_exec_mode";
631       const char *ExecModeName = ExecModeNameStr.c_str();
632 
633       CUdeviceptr ExecModePtr;
634       size_t CUSize;
635       Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName);
636       if (Err == CUDA_SUCCESS) {
637         if (CUSize != sizeof(int8_t)) {
638           DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
639              ExecModeName, CUSize, sizeof(int8_t));
640           return nullptr;
641         }
642 
643         Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize);
644         if (Err != CUDA_SUCCESS) {
645           DP("Error when copying data from device to host. Pointers: "
646              "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
647              DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize);
648           CUDA_ERR_STRING(Err);
649           return nullptr;
650         }
651 
652         if (ExecModeVal < 0 || ExecModeVal > 1) {
653           DP("Error wrong exec_mode value specified in cubin file: %d\n",
654              ExecModeVal);
655           return nullptr;
656         }
657       } else {
658         DP("Loading global exec_mode '%s' - symbol missing, using default "
659            "value GENERIC (1)\n",
660            ExecModeName);
661         CUDA_ERR_STRING(Err);
662       }
663 
664       KernelsList.emplace_back(Func, ExecModeVal);
665 
666       __tgt_offload_entry Entry = *E;
667       Entry.addr = &KernelsList.back();
668       addOffloadEntry(DeviceId, Entry);
669     }
670 
671     // send device environment data to the device
672     {
673       omptarget_device_environmentTy DeviceEnv{0};
674 
675 #ifdef OMPTARGET_DEBUG
676       if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG"))
677         DeviceEnv.debug_level = std::stoi(EnvStr);
678 #endif
679 
680       const char *DeviceEnvName = "omptarget_device_environment";
681       CUdeviceptr DeviceEnvPtr;
682       size_t CUSize;
683 
684       Err = cuModuleGetGlobal(&DeviceEnvPtr, &CUSize, Module, DeviceEnvName);
685       if (Err == CUDA_SUCCESS) {
686         if (CUSize != sizeof(DeviceEnv)) {
687           DP("Global device_environment '%s' - size mismatch (%zu != %zu)\n",
688              DeviceEnvName, CUSize, sizeof(int32_t));
689           CUDA_ERR_STRING(Err);
690           return nullptr;
691         }
692 
693         Err = cuMemcpyHtoD(DeviceEnvPtr, &DeviceEnv, CUSize);
694         if (Err != CUDA_SUCCESS) {
695           DP("Error when copying data from host to device. Pointers: "
696              "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
697              DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize);
698           CUDA_ERR_STRING(Err);
699           return nullptr;
700         }
701 
702         DP("Sending global device environment data %zu bytes\n", CUSize);
703       } else {
704         DP("Finding global device environment '%s' - symbol missing.\n",
705            DeviceEnvName);
706         DP("Continue, considering this is a device RTL which does not accept "
707            "environment setting.\n");
708       }
709     }
710 
711     return getOffloadEntriesTable(DeviceId);
712   }
713 
714   void *dataAlloc(const int DeviceId, const int64_t Size) const {
715     if (Size == 0)
716       return nullptr;
717 
718     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
719     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
720       return nullptr;
721 
722     CUdeviceptr DevicePtr;
723     Err = cuMemAlloc(&DevicePtr, Size);
724     if (!checkResult(Err, "Error returned from cuMemAlloc\n"))
725       return nullptr;
726 
727     return (void *)DevicePtr;
728   }
729 
730   int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr,
731                  const int64_t Size, __tgt_async_info *AsyncInfoPtr) const {
732     assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");
733 
734     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
735     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
736       return OFFLOAD_FAIL;
737 
738     CUstream Stream = getStream(DeviceId, AsyncInfoPtr);
739 
740     Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream);
741     if (Err != CUDA_SUCCESS) {
742       DP("Error when copying data from host to device. Pointers: host = " DPxMOD
743          ", device = " DPxMOD ", size = %" PRId64 "\n",
744          DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
745       CUDA_ERR_STRING(Err);
746       return OFFLOAD_FAIL;
747     }
748 
749     return OFFLOAD_SUCCESS;
750   }
751 
752   int dataRetrieve(const int DeviceId, void *HstPtr, const void *TgtPtr,
753                    const int64_t Size, __tgt_async_info *AsyncInfoPtr) const {
754     assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");
755 
756     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
757     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
758       return OFFLOAD_FAIL;
759 
760     CUstream Stream = getStream(DeviceId, AsyncInfoPtr);
761 
762     Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
763     if (Err != CUDA_SUCCESS) {
764       DP("Error when copying data from device to host. Pointers: host = " DPxMOD
765          ", device = " DPxMOD ", size = %" PRId64 "\n",
766          DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
767       CUDA_ERR_STRING(Err);
768       return OFFLOAD_FAIL;
769     }
770 
771     return OFFLOAD_SUCCESS;
772   }
773 
  // Copy \p Size bytes between two devices. Tries peer-to-peer copy first
  // when the devices differ; any P2P setup failure falls back to a plain
  // device-to-device copy (which the driver stages through the host if
  // needed). Returns OFFLOAD_SUCCESS/OFFLOAD_FAIL.
  int dataExchange(int SrcDevId, const void *SrcPtr, int DstDevId, void *DstPtr,
                   int64_t Size, __tgt_async_info *AsyncInfoPtr) const {
    assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");

    CUresult Err = cuCtxSetCurrent(DeviceData[SrcDevId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    // The stream is taken from the source device's pool.
    CUstream Stream = getStream(SrcDevId, AsyncInfoPtr);

    // If they are two devices, we try peer to peer copy first
    if (SrcDevId != DstDevId) {
      int CanAccessPeer = 0;
      Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId);
      if (Err != CUDA_SUCCESS) {
        DP("Error returned from cuDeviceCanAccessPeer. src = %" PRId32
           ", dst = %" PRId32 "\n",
           SrcDevId, DstDevId);
        CUDA_ERR_STRING(Err);
        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
      }

      if (!CanAccessPeer) {
        DP("P2P memcpy not supported so fall back to D2D memcpy");
        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
      }

      Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0);
      if (Err != CUDA_SUCCESS) {
        DP("Error returned from cuCtxEnablePeerAccess. src = %" PRId32
           ", dst = %" PRId32 "\n",
           SrcDevId, DstDevId);
        CUDA_ERR_STRING(Err);
        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
      }

      Err = cuMemcpyPeerAsync((CUdeviceptr)DstPtr, DeviceData[DstDevId].Context,
                              (CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context,
                              Size, Stream);
      if (Err == CUDA_SUCCESS)
        return OFFLOAD_SUCCESS;

      // Peer copy failed; fall through to the generic D2D path below.
      DP("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD
         ", src_id =%" PRId32 ", dst_ptr = " DPxMOD ", dst_id =%" PRId32 "\n",
         DPxPTR(SrcPtr), SrcDevId, DPxPTR(DstPtr), DstDevId);
      CUDA_ERR_STRING(Err);
    }

    return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
  }
824 
825   int dataDelete(const int DeviceId, void *TgtPtr) const {
826     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
827     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
828       return OFFLOAD_FAIL;
829 
830     Err = cuMemFree((CUdeviceptr)TgtPtr);
831     if (!checkResult(Err, "Error returned from cuMemFree\n"))
832       return OFFLOAD_FAIL;
833 
834     return OFFLOAD_SUCCESS;
835   }
836 
837   int runTargetTeamRegion(const int DeviceId, void *TgtEntryPtr, void **TgtArgs,
838                           ptrdiff_t *TgtOffsets, const int ArgNum,
839                           const int TeamNum, const int ThreadLimit,
840                           const unsigned int LoopTripCount,
841                           __tgt_async_info *AsyncInfo) const {
842     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
843     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
844       return OFFLOAD_FAIL;
845 
846     // All args are references.
847     std::vector<void *> Args(ArgNum);
848     std::vector<void *> Ptrs(ArgNum);
849 
850     for (int I = 0; I < ArgNum; ++I) {
851       Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
852       Args[I] = &Ptrs[I];
853     }
854 
855     KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr);
856 
857     int CudaThreadsPerBlock;
858     if (ThreadLimit > 0) {
859       DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
860       CudaThreadsPerBlock = ThreadLimit;
861       // Add master warp if necessary
862       if (KernelInfo->ExecutionMode == GENERIC) {
863         DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize);
864         CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
865       }
866     } else {
867       DP("Setting CUDA threads per block to default %d\n",
868          DeviceData[DeviceId].NumThreads);
869       CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads;
870     }
871 
872     if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) {
873       DP("Threads per block capped at device limit %d\n",
874          DeviceData[DeviceId].ThreadsPerBlock);
875       CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock;
876     }
877 
878     if (!KernelInfo->MaxThreadsPerBlock) {
879       Err = cuFuncGetAttribute(&KernelInfo->MaxThreadsPerBlock,
880                                CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
881                                KernelInfo->Func);
882       if (!checkResult(Err, "Error returned from cuFuncGetAttribute\n"))
883         return OFFLOAD_FAIL;
884     }
885 
886     if (KernelInfo->MaxThreadsPerBlock < CudaThreadsPerBlock) {
887       DP("Threads per block capped at kernel limit %d\n",
888          KernelInfo->MaxThreadsPerBlock);
889       CudaThreadsPerBlock = KernelInfo->MaxThreadsPerBlock;
890     }
891 
892     unsigned int CudaBlocksPerGrid;
893     if (TeamNum <= 0) {
894       if (LoopTripCount > 0 && EnvNumTeams < 0) {
895         if (KernelInfo->ExecutionMode == SPMD) {
896           // We have a combined construct, i.e. `target teams distribute
897           // parallel for [simd]`. We launch so many teams so that each thread
898           // will execute one iteration of the loop. round up to the nearest
899           // integer
900           CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1;
901         } else {
902           // If we reach this point, then we have a non-combined construct, i.e.
903           // `teams distribute` with a nested `parallel for` and each team is
904           // assigned one iteration of the `distribute` loop. E.g.:
905           //
906           // #pragma omp target teams distribute
907           // for(...loop_tripcount...) {
908           //   #pragma omp parallel for
909           //   for(...) {}
910           // }
911           //
912           // Threads within a team will execute the iterations of the `parallel`
913           // loop.
914           CudaBlocksPerGrid = LoopTripCount;
915         }
916         DP("Using %d teams due to loop trip count %" PRIu32
917            " and number of threads per block %d\n",
918            CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock);
919       } else {
920         DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams);
921         CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams;
922       }
923     } else if (TeamNum > DeviceData[DeviceId].BlocksPerGrid) {
924       DP("Capping number of teams to team limit %d\n",
925          DeviceData[DeviceId].BlocksPerGrid);
926       CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid;
927     } else {
928       DP("Using requested number of teams %d\n", TeamNum);
929       CudaBlocksPerGrid = TeamNum;
930     }
931 
932     INFO(DeviceId,
933          "Launching kernel %s with %d blocks and %d threads in %s "
934          "mode\n",
935          (getOffloadEntry(DeviceId, TgtEntryPtr))
936              ? getOffloadEntry(DeviceId, TgtEntryPtr)->name
937              : "(null)",
938          CudaBlocksPerGrid, CudaThreadsPerBlock,
939          (KernelInfo->ExecutionMode == SPMD) ? "SPMD" : "Generic");
940 
941     CUstream Stream = getStream(DeviceId, AsyncInfo);
942     Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1,
943                          /* gridDimZ */ 1, CudaThreadsPerBlock,
944                          /* blockDimY */ 1, /* blockDimZ */ 1,
945                          /* sharedMemBytes */ 0, Stream, &Args[0], nullptr);
946     if (!checkResult(Err, "Error returned from cuLaunchKernel\n"))
947       return OFFLOAD_FAIL;
948 
949     DP("Launch of entry point at " DPxMOD " successful!\n",
950        DPxPTR(TgtEntryPtr));
951 
952     return OFFLOAD_SUCCESS;
953   }
954 
955   int synchronize(const int DeviceId, __tgt_async_info *AsyncInfoPtr) const {
956     CUstream Stream = reinterpret_cast<CUstream>(AsyncInfoPtr->Queue);
957     CUresult Err = cuStreamSynchronize(Stream);
958     if (Err != CUDA_SUCCESS) {
959       DP("Error when synchronizing stream. stream = " DPxMOD
960          ", async info ptr = " DPxMOD "\n",
961          DPxPTR(Stream), DPxPTR(AsyncInfoPtr));
962       CUDA_ERR_STRING(Err);
963       return OFFLOAD_FAIL;
964     }
965 
966     // Once the stream is synchronized, return it to stream pool and reset
967     // async_info. This is to make sure the synchronization only works for its
968     // own tasks.
969     StreamManager->returnStream(
970         DeviceId, reinterpret_cast<CUstream>(AsyncInfoPtr->Queue));
971     AsyncInfoPtr->Queue = nullptr;
972 
973     return OFFLOAD_SUCCESS;
974   }
975 };
976 
977 DeviceRTLTy DeviceRTL;
978 } // namespace
979 
980 // Exposed library API function
981 #ifdef __cplusplus
982 extern "C" {
983 #endif
984 
985 int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
986   return elf_check_machine(image, /* EM_CUDA */ 190);
987 }
988 
989 int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); }
990 
991 int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
992   DP("Init requires flags to %" PRId64 "\n", RequiresFlags);
993   DeviceRTL.setRequiresFlag(RequiresFlags);
994   return RequiresFlags;
995 }
996 
997 int32_t __tgt_rtl_is_data_exchangable(int32_t src_dev_id, int dst_dev_id) {
998   if (DeviceRTL.isValidDeviceId(src_dev_id) &&
999       DeviceRTL.isValidDeviceId(dst_dev_id))
1000     return 1;
1001 
1002   return 0;
1003 }
1004 
1005 int32_t __tgt_rtl_init_device(int32_t device_id) {
1006   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1007 
1008   return DeviceRTL.initDevice(device_id);
1009 }
1010 
1011 __tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
1012                                           __tgt_device_image *image) {
1013   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1014 
1015   return DeviceRTL.loadBinary(device_id, image);
1016 }
1017 
1018 void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *) {
1019   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1020 
1021   return DeviceRTL.dataAlloc(device_id, size);
1022 }
1023 
1024 int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
1025                               int64_t size) {
1026   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1027 
1028   __tgt_async_info async_info;
1029   const int32_t rc = __tgt_rtl_data_submit_async(device_id, tgt_ptr, hst_ptr,
1030                                                  size, &async_info);
1031   if (rc != OFFLOAD_SUCCESS)
1032     return OFFLOAD_FAIL;
1033 
1034   return __tgt_rtl_synchronize(device_id, &async_info);
1035 }
1036 
1037 int32_t __tgt_rtl_data_submit_async(int32_t device_id, void *tgt_ptr,
1038                                     void *hst_ptr, int64_t size,
1039                                     __tgt_async_info *async_info_ptr) {
1040   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1041   assert(async_info_ptr && "async_info_ptr is nullptr");
1042 
1043   return DeviceRTL.dataSubmit(device_id, tgt_ptr, hst_ptr, size,
1044                               async_info_ptr);
1045 }
1046 
1047 int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
1048                                 int64_t size) {
1049   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1050 
1051   __tgt_async_info async_info;
1052   const int32_t rc = __tgt_rtl_data_retrieve_async(device_id, hst_ptr, tgt_ptr,
1053                                                    size, &async_info);
1054   if (rc != OFFLOAD_SUCCESS)
1055     return OFFLOAD_FAIL;
1056 
1057   return __tgt_rtl_synchronize(device_id, &async_info);
1058 }
1059 
1060 int32_t __tgt_rtl_data_retrieve_async(int32_t device_id, void *hst_ptr,
1061                                       void *tgt_ptr, int64_t size,
1062                                       __tgt_async_info *async_info_ptr) {
1063   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1064   assert(async_info_ptr && "async_info_ptr is nullptr");
1065 
1066   return DeviceRTL.dataRetrieve(device_id, hst_ptr, tgt_ptr, size,
1067                                 async_info_ptr);
1068 }
1069 
1070 int32_t __tgt_rtl_data_exchange_async(int32_t src_dev_id, void *src_ptr,
1071                                       int dst_dev_id, void *dst_ptr,
1072                                       int64_t size,
1073                                       __tgt_async_info *async_info_ptr) {
1074   assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
1075   assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");
1076   assert(async_info_ptr && "async_info_ptr is nullptr");
1077 
1078   return DeviceRTL.dataExchange(src_dev_id, src_ptr, dst_dev_id, dst_ptr, size,
1079                                 async_info_ptr);
1080 }
1081 
1082 int32_t __tgt_rtl_data_exchange(int32_t src_dev_id, void *src_ptr,
1083                                 int32_t dst_dev_id, void *dst_ptr,
1084                                 int64_t size) {
1085   assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
1086   assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");
1087 
1088   __tgt_async_info async_info;
1089   const int32_t rc = __tgt_rtl_data_exchange_async(
1090       src_dev_id, src_ptr, dst_dev_id, dst_ptr, size, &async_info);
1091   if (rc != OFFLOAD_SUCCESS)
1092     return OFFLOAD_FAIL;
1093 
1094   return __tgt_rtl_synchronize(src_dev_id, &async_info);
1095 }
1096 
1097 int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
1098   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1099 
1100   return DeviceRTL.dataDelete(device_id, tgt_ptr);
1101 }
1102 
1103 int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
1104                                          void **tgt_args,
1105                                          ptrdiff_t *tgt_offsets,
1106                                          int32_t arg_num, int32_t team_num,
1107                                          int32_t thread_limit,
1108                                          uint64_t loop_tripcount) {
1109   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1110 
1111   __tgt_async_info async_info;
1112   const int32_t rc = __tgt_rtl_run_target_team_region_async(
1113       device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
1114       thread_limit, loop_tripcount, &async_info);
1115   if (rc != OFFLOAD_SUCCESS)
1116     return OFFLOAD_FAIL;
1117 
1118   return __tgt_rtl_synchronize(device_id, &async_info);
1119 }
1120 
1121 int32_t __tgt_rtl_run_target_team_region_async(
1122     int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
1123     ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
1124     int32_t thread_limit, uint64_t loop_tripcount,
1125     __tgt_async_info *async_info_ptr) {
1126   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1127 
1128   return DeviceRTL.runTargetTeamRegion(
1129       device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
1130       thread_limit, loop_tripcount, async_info_ptr);
1131 }
1132 
1133 int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
1134                                     void **tgt_args, ptrdiff_t *tgt_offsets,
1135                                     int32_t arg_num) {
1136   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1137 
1138   __tgt_async_info async_info;
1139   const int32_t rc = __tgt_rtl_run_target_region_async(
1140       device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, &async_info);
1141   if (rc != OFFLOAD_SUCCESS)
1142     return OFFLOAD_FAIL;
1143 
1144   return __tgt_rtl_synchronize(device_id, &async_info);
1145 }
1146 
1147 int32_t __tgt_rtl_run_target_region_async(int32_t device_id,
1148                                           void *tgt_entry_ptr, void **tgt_args,
1149                                           ptrdiff_t *tgt_offsets,
1150                                           int32_t arg_num,
1151                                           __tgt_async_info *async_info_ptr) {
1152   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1153 
1154   return __tgt_rtl_run_target_team_region_async(
1155       device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num,
1156       /* team num*/ 1, /* thread_limit */ 1, /* loop_tripcount */ 0,
1157       async_info_ptr);
1158 }
1159 
1160 int32_t __tgt_rtl_synchronize(int32_t device_id,
1161                               __tgt_async_info *async_info_ptr) {
1162   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1163   assert(async_info_ptr && "async_info_ptr is nullptr");
1164   assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr");
1165 
1166   return DeviceRTL.synchronize(device_id, async_info_ptr);
1167 }
1168 
1169 #ifdef __cplusplus
1170 }
1171 #endif
1172