1 //===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // RTL for CUDA machine
10 //
11 //===----------------------------------------------------------------------===//
12 
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdlib>
#include <cuda.h>
#include <list>
#include <memory>
#include <mutex>
#include <string>
#include <vector>
21 
22 #include "omptargetplugin.h"
23 
24 #ifndef TARGET_NAME
25 #define TARGET_NAME CUDA
26 #endif
27 
28 #ifdef OMPTARGET_DEBUG
29 static int DebugLevel = 0;
30 
31 #define GETNAME2(name) #name
32 #define GETNAME(name) GETNAME2(name)
33 #define DP(...) \
34   do { \
35     if (DebugLevel > 0) { \
36       DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", __VA_ARGS__); \
37     } \
38   } while (false)
39 
40 // Utility for retrieving and printing CUDA error string.
41 #define CUDA_ERR_STRING(err) \
42   do { \
43     if (DebugLevel > 0) { \
44       const char *errStr; \
45       cuGetErrorString(err, &errStr); \
46       DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", "CUDA error is: %s\n", errStr); \
47     } \
48   } while (false)
49 #else // OMPTARGET_DEBUG
50 #define DP(...) {}
51 #define CUDA_ERR_STRING(err) {}
52 #endif // OMPTARGET_DEBUG
53 
54 #include "../../common/elf_common.c"
55 
/// Keep entries table per device. A new instance is pushed onto the
/// per-device list for every image loaded (see clearOffloadEntriesTable).
struct FuncOrGblEntryTy {
  /// Table handed back to libomptarget; its begin/end pointers point into
  /// \p Entries below.
  __tgt_target_table Table;
  /// Storage for the offload entries (kernels and globals) of one image.
  std::vector<__tgt_offload_entry> Entries;
};
61 
/// Execution modes a target region can be compiled for. The numeric value
/// mirrors the "<kernel>_exec_mode" global read from the cubin in loadBinary.
enum ExecutionModeType {
  SPMD, // constructors, destructors,
  // combined constructs (`teams distribute parallel for [simd]`)
  GENERIC, // everything else
  NONE
};
68 
/// Use a single entity to encode a kernel and a set of flags.
struct KernelTy {
  /// Driver-API handle of the kernel function in the loaded module.
  CUfunction Func;

  // execution mode of kernel
  // 0 - SPMD mode (without master warp)
  // 1 - Generic mode (with master warp)
  int8_t ExecutionMode;

  /// Maximal number of threads per block for this kernel.
  int MaxThreadsPerBlock = 0;

  KernelTy(CUfunction _Func, int8_t _ExecutionMode)
      : Func(_Func), ExecutionMode(_ExecutionMode) {}
};
84 
/// Device environment data
/// Manually sync with the deviceRTL side for now, move to a dedicated header
/// file later. Do NOT reorder or resize fields without updating the deviceRTL:
/// loadBinary copies this struct byte-for-byte into the device global
/// "omptarget_device_environment".
struct omptarget_device_environmentTy {
  // Debug verbosity forwarded to the device runtime
  // (from LIBOMPTARGET_DEVICE_RTL_DEBUG).
  int32_t debug_level;
};
91 
92 namespace {
93 bool checkResult(CUresult Err, const char *ErrMsg) {
94   if (Err == CUDA_SUCCESS)
95     return true;
96 
97   DP("%s", ErrMsg);
98   CUDA_ERR_STRING(Err);
99   return false;
100 }
101 
102 int memcpyDtoD(const void *SrcPtr, void *DstPtr, int64_t Size,
103                CUstream Stream) {
104   CUresult Err =
105       cuMemcpyDtoDAsync((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, Size, Stream);
106 
107   if (Err != CUDA_SUCCESS) {
108     DP("Error when copying data from device to device. Pointers: src "
109        "= " DPxMOD ", dst = " DPxMOD ", size = %" PRId64 "\n",
110        DPxPTR(SrcPtr), DPxPTR(DstPtr), Size);
111     CUDA_ERR_STRING(Err);
112     return OFFLOAD_FAIL;
113   }
114 
115   return OFFLOAD_SUCCESS;
116 }
117 
// Structure contains per-device data
struct DeviceDataTy {
  /// List that contains all the kernels.
  std::list<KernelTy> KernelsList;

  // One FuncOrGblEntryTy per loaded image; the back() element is the one
  // currently being filled.
  std::list<FuncOrGblEntryTy> FuncGblEntries;

  // Retained primary context of the device (see initDevice).
  CUcontext Context = nullptr;
  // Device properties
  int ThreadsPerBlock = 0;
  int BlocksPerGrid = 0;
  int WarpSize = 0;
  // OpenMP properties
  int NumTeams = 0;
  int NumThreads = 0;
};
134 
135 class StreamManagerTy {
136   int NumberOfDevices;
137   // The initial size of stream pool
138   int EnvNumInitialStreams;
139   // Per-device stream mutex
140   std::vector<std::unique_ptr<std::mutex>> StreamMtx;
141   // Per-device stream Id indicates the next available stream in the pool
142   std::vector<int> NextStreamId;
143   // Per-device stream pool
144   std::vector<std::vector<CUstream>> StreamPool;
145   // Reference to per-device data
146   std::vector<DeviceDataTy> &DeviceData;
147 
148   // If there is no CUstream left in the pool, we will resize the pool to
149   // allocate more CUstream. This function should be called with device mutex,
150   // and we do not resize to smaller one.
151   void resizeStreamPool(const int DeviceId, const size_t NewSize) {
152     std::vector<CUstream> &Pool = StreamPool[DeviceId];
153     const size_t CurrentSize = Pool.size();
154     assert(NewSize > CurrentSize && "new size is not larger than current size");
155 
156     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
157     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) {
158       // We will return if cannot switch to the right context in case of
159       // creating bunch of streams that are not corresponding to the right
160       // device. The offloading will fail later because selected CUstream is
161       // nullptr.
162       return;
163     }
164 
165     Pool.resize(NewSize, nullptr);
166 
167     for (size_t I = CurrentSize; I < NewSize; ++I) {
168       checkResult(cuStreamCreate(&Pool[I], CU_STREAM_NON_BLOCKING),
169                   "Error returned from cuStreamCreate\n");
170     }
171   }
172 
173 public:
174   StreamManagerTy(const int NumberOfDevices,
175                   std::vector<DeviceDataTy> &DeviceData)
176       : NumberOfDevices(NumberOfDevices), EnvNumInitialStreams(32),
177         DeviceData(DeviceData) {
178     StreamPool.resize(NumberOfDevices);
179     NextStreamId.resize(NumberOfDevices);
180     StreamMtx.resize(NumberOfDevices);
181 
182     if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS"))
183       EnvNumInitialStreams = std::stoi(EnvStr);
184 
185     // Initialize the next stream id
186     std::fill(NextStreamId.begin(), NextStreamId.end(), 0);
187 
188     // Initialize stream mutex
189     for (std::unique_ptr<std::mutex> &Ptr : StreamMtx)
190       Ptr = std::make_unique<std::mutex>();
191   }
192 
193   ~StreamManagerTy() {
194     // Destroy streams
195     for (int I = 0; I < NumberOfDevices; ++I) {
196       checkResult(cuCtxSetCurrent(DeviceData[I].Context),
197                   "Error returned from cuCtxSetCurrent\n");
198 
199       for (CUstream &S : StreamPool[I]) {
200         if (S)
201           checkResult(cuStreamDestroy(S),
202                       "Error returned from cuStreamDestroy\n");
203       }
204     }
205   }
206 
207   // Get a CUstream from pool. Per-device next stream id always points to the
208   // next available CUstream. That means, CUstreams [0, id-1] have been
209   // assigned, and [id,] are still available. If there is no CUstream left, we
210   // will ask more CUstreams from CUDA RT. Each time a CUstream is assigned,
211   // the id will increase one.
212   // xxxxxs+++++++++
213   //      ^
214   //      id
215   // After assignment, the pool becomes the following and s is assigned.
216   // xxxxxs+++++++++
217   //       ^
218   //       id
219   CUstream getStream(const int DeviceId) {
220     const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
221     int &Id = NextStreamId[DeviceId];
222     // No CUstream left in the pool, we need to request from CUDA RT
223     if (Id == StreamPool[DeviceId].size()) {
224       // By default we double the stream pool every time
225       resizeStreamPool(DeviceId, Id * 2);
226     }
227     return StreamPool[DeviceId][Id++];
228   }
229 
230   // Return a CUstream back to pool. As mentioned above, per-device next
231   // stream is always points to the next available CUstream, so when we return
232   // a CUstream, we need to first decrease the id, and then copy the CUstream
233   // back.
234   // It is worth noting that, the order of streams return might be different
235   // from that they're assigned, that saying, at some point, there might be
236   // two identical CUstreams.
237   // xxax+a+++++
238   //     ^
239   //     id
240   // However, it doesn't matter, because they're always on the two sides of
241   // id. The left one will in the end be overwritten by another CUstream.
242   // Therefore, after several execution, the order of pool might be different
243   // from its initial state.
244   void returnStream(const int DeviceId, CUstream Stream) {
245     const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
246     int &Id = NextStreamId[DeviceId];
247     assert(Id > 0 && "Wrong stream ID");
248     StreamPool[DeviceId][--Id] = Stream;
249   }
250 
251   bool initializeDeviceStreamPool(const int DeviceId) {
252     assert(StreamPool[DeviceId].empty() && "stream pool has been initialized");
253 
254     resizeStreamPool(DeviceId, EnvNumInitialStreams);
255 
256     // Check the size of stream pool
257     if (StreamPool[DeviceId].size() != EnvNumInitialStreams)
258       return false;
259 
260     // Check whether each stream is valid
261     for (CUstream &S : StreamPool[DeviceId])
262       if (!S)
263         return false;
264 
265     return true;
266   }
267 };
268 
269 class DeviceRTLTy {
270   int NumberOfDevices;
271   // OpenMP environment properties
272   int EnvNumTeams;
273   int EnvTeamLimit;
274   // OpenMP requires flags
275   int64_t RequiresFlags;
276 
277   static constexpr const int HardTeamLimit = 1U << 16U; // 64k
278   static constexpr const int HardThreadLimit = 1024;
279   static constexpr const int DefaultNumTeams = 128;
280   static constexpr const int DefaultNumThreads = 128;
281 
282   std::unique_ptr<StreamManagerTy> StreamManager;
283   std::vector<DeviceDataTy> DeviceData;
284   std::vector<CUmodule> Modules;
285 
286   // Record entry point associated with device
287   void addOffloadEntry(const int DeviceId, const __tgt_offload_entry entry) {
288     FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
289     E.Entries.push_back(entry);
290   }
291 
292   // Return true if the entry is associated with device
293   bool findOffloadEntry(const int DeviceId, const void *Addr) const {
294     for (const __tgt_offload_entry &Itr :
295          DeviceData[DeviceId].FuncGblEntries.back().Entries)
296       if (Itr.addr == Addr)
297         return true;
298 
299     return false;
300   }
301 
302   // Return the pointer to the target entries table
303   __tgt_target_table *getOffloadEntriesTable(const int DeviceId) {
304     FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
305 
306     if (E.Entries.empty())
307       return nullptr;
308 
309     // Update table info according to the entries and return the pointer
310     E.Table.EntriesBegin = E.Entries.data();
311     E.Table.EntriesEnd = E.Entries.data() + E.Entries.size();
312 
313     return &E.Table;
314   }
315 
316   // Clear entries table for a device
317   void clearOffloadEntriesTable(const int DeviceId) {
318     DeviceData[DeviceId].FuncGblEntries.emplace_back();
319     FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
320     E.Entries.clear();
321     E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr;
322   }
323 
324   CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfoPtr) const {
325     assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");
326 
327     if (!AsyncInfoPtr->Queue)
328       AsyncInfoPtr->Queue = StreamManager->getStream(DeviceId);
329 
330     return reinterpret_cast<CUstream>(AsyncInfoPtr->Queue);
331   }
332 
public:
  // This class should not be copied or moved: it owns per-device CUDA
  // contexts, modules and the stream manager.
  DeviceRTLTy(const DeviceRTLTy &) = delete;
  DeviceRTLTy(DeviceRTLTy &&) = delete;
337 
338   DeviceRTLTy()
339       : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1),
340         RequiresFlags(OMP_REQ_UNDEFINED) {
341 #ifdef OMPTARGET_DEBUG
342     if (const char *EnvStr = getenv("LIBOMPTARGET_DEBUG"))
343       DebugLevel = std::stoi(EnvStr);
344 #endif // OMPTARGET_DEBUG
345 
346     DP("Start initializing CUDA\n");
347 
348     CUresult Err = cuInit(0);
349     if (!checkResult(Err, "Error returned from cuInit\n")) {
350       return;
351     }
352 
353     Err = cuDeviceGetCount(&NumberOfDevices);
354     if (!checkResult(Err, "Error returned from cuDeviceGetCount\n"))
355       return;
356 
357     if (NumberOfDevices == 0) {
358       DP("There are no devices supporting CUDA.\n");
359       return;
360     }
361 
362     DeviceData.resize(NumberOfDevices);
363 
364     // Get environment variables regarding teams
365     if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) {
366       // OMP_TEAM_LIMIT has been set
367       EnvTeamLimit = std::stoi(EnvStr);
368       DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
369     }
370     if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) {
371       // OMP_NUM_TEAMS has been set
372       EnvNumTeams = std::stoi(EnvStr);
373       DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
374     }
375 
376     StreamManager =
377         std::make_unique<StreamManagerTy>(NumberOfDevices, DeviceData);
378   }
379 
  /// Tear down in dependency order: streams first (they live inside the
  /// contexts), then modules, then the retained primary contexts.
  ~DeviceRTLTy() {
    // First destruct stream manager in case of Contexts is destructed before it
    StreamManager = nullptr;

    for (CUmodule &M : Modules)
      // Close module
      if (M)
        checkResult(cuModuleUnload(M), "Error returned from cuModuleUnload\n");

    for (DeviceDataTy &D : DeviceData) {
      // Destroy context
      if (D.Context) {
        // The primary context was retained in initDevice(); releasing it
        // requires the owning CUdevice, which we recover from the context.
        checkResult(cuCtxSetCurrent(D.Context),
                    "Error returned from cuCtxSetCurrent\n");
        CUdevice Device;
        checkResult(cuCtxGetDevice(&Device),
                    "Error returned from cuCtxGetDevice\n");
        checkResult(cuDevicePrimaryCtxRelease(Device),
                    "Error returned from cuDevicePrimaryCtxRelease\n");
      }
    }
  }
402 
  // Check whether a given DeviceId is valid
  bool isValidDeviceId(const int DeviceId) const {
    return DeviceId >= 0 && DeviceId < NumberOfDevices;
  }

  // Number of CUDA devices reported by cuDeviceGetCount() at construction.
  int getNumOfDevices() const { return NumberOfDevices; }

  // Record the OpenMP 'requires' flags (e.g. unified shared memory).
  void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; }
411 
412   int initDevice(const int DeviceId) {
413     CUdevice Device;
414 
415     DP("Getting device %d\n", DeviceId);
416     CUresult Err = cuDeviceGet(&Device, DeviceId);
417     if (!checkResult(Err, "Error returned from cuDeviceGet\n"))
418       return OFFLOAD_FAIL;
419 
420     // Query the current flags of the primary context and set its flags if
421     // it is inactive
422     unsigned int FormerPrimaryCtxFlags = 0;
423     int FormerPrimaryCtxIsActive = 0;
424     Err = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags,
425                                      &FormerPrimaryCtxIsActive);
426     if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxGetState\n"))
427       return OFFLOAD_FAIL;
428 
429     if (FormerPrimaryCtxIsActive) {
430       DP("The primary context is active, no change to its flags\n");
431       if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) !=
432           CU_CTX_SCHED_BLOCKING_SYNC)
433         DP("Warning the current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n");
434     } else {
435       DP("The primary context is inactive, set its flags to "
436          "CU_CTX_SCHED_BLOCKING_SYNC\n");
437       Err = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC);
438       if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxSetFlags\n"))
439         return OFFLOAD_FAIL;
440     }
441 
442     // Retain the per device primary context and save it to use whenever this
443     // device is selected.
444     Err = cuDevicePrimaryCtxRetain(&DeviceData[DeviceId].Context, Device);
445     if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxRetain\n"))
446       return OFFLOAD_FAIL;
447 
448     Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
449     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
450       return OFFLOAD_FAIL;
451 
452     // Initialize stream pool
453     if (!StreamManager->initializeDeviceStreamPool(DeviceId))
454       return OFFLOAD_FAIL;
455 
456     // Query attributes to determine number of threads/block and blocks/grid.
457     int MaxGridDimX;
458     Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
459                                Device);
460     if (Err != CUDA_SUCCESS) {
461       DP("Error getting max grid dimension, use default value %d\n",
462          DeviceRTLTy::DefaultNumTeams);
463       DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::DefaultNumTeams;
464     } else if (MaxGridDimX <= DeviceRTLTy::HardTeamLimit) {
465       DP("Using %d CUDA blocks per grid\n", MaxGridDimX);
466       DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX;
467     } else {
468       DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping "
469          "at the hard limit\n",
470          MaxGridDimX, DeviceRTLTy::HardTeamLimit);
471       DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::HardTeamLimit;
472     }
473 
474     // We are only exploiting threads along the x axis.
475     int MaxBlockDimX;
476     Err = cuDeviceGetAttribute(&MaxBlockDimX,
477                                CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device);
478     if (Err != CUDA_SUCCESS) {
479       DP("Error getting max block dimension, use default value %d\n",
480          DeviceRTLTy::DefaultNumThreads);
481       DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads;
482     } else if (MaxBlockDimX <= DeviceRTLTy::HardThreadLimit) {
483       DP("Using %d CUDA threads per block\n", MaxBlockDimX);
484       DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX;
485     } else {
486       DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
487          "capping at the hard limit\n",
488          MaxBlockDimX, DeviceRTLTy::HardThreadLimit);
489       DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit;
490     }
491 
492     // Get and set warp size
493     int WarpSize;
494     Err =
495         cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device);
496     if (Err != CUDA_SUCCESS) {
497       DP("Error getting warp size, assume default value 32\n");
498       DeviceData[DeviceId].WarpSize = 32;
499     } else {
500       DP("Using warp size %d\n", WarpSize);
501       DeviceData[DeviceId].WarpSize = WarpSize;
502     }
503 
504     // Adjust teams to the env variables
505     if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) {
506       DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
507          EnvTeamLimit);
508       DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit;
509     }
510 
511     DP("Max number of CUDA blocks %d, threads %d & warp size %d\n",
512        DeviceData[DeviceId].BlocksPerGrid, DeviceData[DeviceId].ThreadsPerBlock,
513        DeviceData[DeviceId].WarpSize);
514 
515     // Set default number of teams
516     if (EnvNumTeams > 0) {
517       DP("Default number of teams set according to environment %d\n",
518          EnvNumTeams);
519       DeviceData[DeviceId].NumTeams = EnvNumTeams;
520     } else {
521       DeviceData[DeviceId].NumTeams = DeviceRTLTy::DefaultNumTeams;
522       DP("Default number of teams set according to library's default %d\n",
523          DeviceRTLTy::DefaultNumTeams);
524     }
525 
526     if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) {
527       DP("Default number of teams exceeds device limit, capping at %d\n",
528          DeviceData[DeviceId].BlocksPerGrid);
529       DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid;
530     }
531 
532     // Set default number of threads
533     DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads;
534     DP("Default number of threads set according to library's default %d\n",
535        DeviceRTLTy::DefaultNumThreads);
536     if (DeviceData[DeviceId].NumThreads >
537         DeviceData[DeviceId].ThreadsPerBlock) {
538       DP("Default number of threads exceeds device limit, capping at %d\n",
539          DeviceData[DeviceId].ThreadsPerBlock);
540       DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].ThreadsPerBlock;
541     }
542 
543     return OFFLOAD_SUCCESS;
544   }
545 
546   __tgt_target_table *loadBinary(const int DeviceId,
547                                  const __tgt_device_image *Image) {
548     // Set the context we are using
549     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
550     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
551       return nullptr;
552 
553     // Clear the offload table as we are going to create a new one.
554     clearOffloadEntriesTable(DeviceId);
555 
556     // Create the module and extract the function pointers.
557     CUmodule Module;
558     DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart));
559     Err = cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr);
560     if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n"))
561       return nullptr;
562 
563     DP("CUDA module successfully loaded!\n");
564 
565     Modules.push_back(Module);
566 
567     // Find the symbols in the module by name.
568     const __tgt_offload_entry *HostBegin = Image->EntriesBegin;
569     const __tgt_offload_entry *HostEnd = Image->EntriesEnd;
570 
571     std::list<KernelTy> &KernelsList = DeviceData[DeviceId].KernelsList;
572     for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) {
573       if (!E->addr) {
574         // We return nullptr when something like this happens, the host should
575         // have always something in the address to uniquely identify the target
576         // region.
577         DP("Invalid binary: host entry '<null>' (size = %zd)...\n", E->size);
578         return nullptr;
579       }
580 
581       if (E->size) {
582         __tgt_offload_entry Entry = *E;
583         CUdeviceptr CUPtr;
584         size_t CUSize;
585         Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name);
586         // We keep this style here because we need the name
587         if (Err != CUDA_SUCCESS) {
588           DP("Loading global '%s' (Failed)\n", E->name);
589           CUDA_ERR_STRING(Err);
590           return nullptr;
591         }
592 
593         if (CUSize != E->size) {
594           DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name,
595              CUSize, E->size);
596           return nullptr;
597         }
598 
599         DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
600            DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr));
601 
602         Entry.addr = (void *)(CUPtr);
603 
604         // Note: In the current implementation declare target variables
605         // can either be link or to. This means that once unified
606         // memory is activated via the requires directive, the variable
607         // can be used directly from the host in both cases.
608         // TODO: when variables types other than to or link are added,
609         // the below condition should be changed to explicitly
610         // check for to and link variables types:
611         // (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags &
612         // OMP_DECLARE_TARGET_LINK || e->flags == OMP_DECLARE_TARGET_TO))
613         if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
614           // If unified memory is present any target link or to variables
615           // can access host addresses directly. There is no longer a
616           // need for device copies.
617           cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *));
618           DP("Copy linked variable host address (" DPxMOD
619              ") to device address (" DPxMOD ")\n",
620              DPxPTR(*((void **)E->addr)), DPxPTR(CUPtr));
621         }
622 
623         addOffloadEntry(DeviceId, Entry);
624 
625         continue;
626       }
627 
628       CUfunction Func;
629       Err = cuModuleGetFunction(&Func, Module, E->name);
630       // We keep this style here because we need the name
631       if (Err != CUDA_SUCCESS) {
632         DP("Loading '%s' (Failed)\n", E->name);
633         CUDA_ERR_STRING(Err);
634         return nullptr;
635       }
636 
637       DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
638          DPxPTR(E - HostBegin), E->name, DPxPTR(Func));
639 
640       // default value GENERIC (in case symbol is missing from cubin file)
641       int8_t ExecModeVal = ExecutionModeType::GENERIC;
642       std::string ExecModeNameStr(E->name);
643       ExecModeNameStr += "_exec_mode";
644       const char *ExecModeName = ExecModeNameStr.c_str();
645 
646       CUdeviceptr ExecModePtr;
647       size_t CUSize;
648       Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName);
649       if (Err == CUDA_SUCCESS) {
650         if (CUSize != sizeof(int8_t)) {
651           DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
652              ExecModeName, CUSize, sizeof(int8_t));
653           return nullptr;
654         }
655 
656         Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize);
657         if (Err != CUDA_SUCCESS) {
658           DP("Error when copying data from device to host. Pointers: "
659              "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
660              DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize);
661           CUDA_ERR_STRING(Err);
662           return nullptr;
663         }
664 
665         if (ExecModeVal < 0 || ExecModeVal > 1) {
666           DP("Error wrong exec_mode value specified in cubin file: %d\n",
667              ExecModeVal);
668           return nullptr;
669         }
670       } else {
671         DP("Loading global exec_mode '%s' - symbol missing, using default "
672            "value GENERIC (1)\n",
673            ExecModeName);
674         CUDA_ERR_STRING(Err);
675       }
676 
677       KernelsList.emplace_back(Func, ExecModeVal);
678 
679       __tgt_offload_entry Entry = *E;
680       Entry.addr = &KernelsList.back();
681       addOffloadEntry(DeviceId, Entry);
682     }
683 
684     // send device environment data to the device
685     {
686       omptarget_device_environmentTy DeviceEnv{0};
687 
688 #ifdef OMPTARGET_DEBUG
689       if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG"))
690         DeviceEnv.debug_level = std::stoi(EnvStr);
691 #endif
692 
693       const char *DeviceEnvName = "omptarget_device_environment";
694       CUdeviceptr DeviceEnvPtr;
695       size_t CUSize;
696 
697       Err = cuModuleGetGlobal(&DeviceEnvPtr, &CUSize, Module, DeviceEnvName);
698       if (Err == CUDA_SUCCESS) {
699         if (CUSize != sizeof(DeviceEnv)) {
700           DP("Global device_environment '%s' - size mismatch (%zu != %zu)\n",
701              DeviceEnvName, CUSize, sizeof(int32_t));
702           CUDA_ERR_STRING(Err);
703           return nullptr;
704         }
705 
706         Err = cuMemcpyHtoD(DeviceEnvPtr, &DeviceEnv, CUSize);
707         if (Err != CUDA_SUCCESS) {
708           DP("Error when copying data from host to device. Pointers: "
709              "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
710              DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize);
711           CUDA_ERR_STRING(Err);
712           return nullptr;
713         }
714 
715         DP("Sending global device environment data %zu bytes\n", CUSize);
716       } else {
717         DP("Finding global device environment '%s' - symbol missing.\n",
718            DeviceEnvName);
719         DP("Continue, considering this is a device RTL which does not accept "
720            "environment setting.\n");
721       }
722     }
723 
724     return getOffloadEntriesTable(DeviceId);
725   }
726 
727   void *dataAlloc(const int DeviceId, const int64_t Size) const {
728     if (Size == 0)
729       return nullptr;
730 
731     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
732     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
733       return nullptr;
734 
735     CUdeviceptr DevicePtr;
736     Err = cuMemAlloc(&DevicePtr, Size);
737     if (!checkResult(Err, "Error returned from cuMemAlloc\n"))
738       return nullptr;
739 
740     return (void *)DevicePtr;
741   }
742 
743   int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr,
744                  const int64_t Size, __tgt_async_info *AsyncInfoPtr) const {
745     assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");
746 
747     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
748     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
749       return OFFLOAD_FAIL;
750 
751     CUstream Stream = getStream(DeviceId, AsyncInfoPtr);
752 
753     Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream);
754     if (Err != CUDA_SUCCESS) {
755       DP("Error when copying data from host to device. Pointers: host = " DPxMOD
756          ", device = " DPxMOD ", size = %" PRId64 "\n",
757          DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
758       CUDA_ERR_STRING(Err);
759       return OFFLOAD_FAIL;
760     }
761 
762     return OFFLOAD_SUCCESS;
763   }
764 
765   int dataRetrieve(const int DeviceId, void *HstPtr, const void *TgtPtr,
766                    const int64_t Size, __tgt_async_info *AsyncInfoPtr) const {
767     assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");
768 
769     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
770     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
771       return OFFLOAD_FAIL;
772 
773     CUstream Stream = getStream(DeviceId, AsyncInfoPtr);
774 
775     Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
776     if (Err != CUDA_SUCCESS) {
777       DP("Error when copying data from device to host. Pointers: host = " DPxMOD
778          ", device = " DPxMOD ", size = %" PRId64 "\n",
779          DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
780       CUDA_ERR_STRING(Err);
781       return OFFLOAD_FAIL;
782     }
783 
784     return OFFLOAD_SUCCESS;
785   }
786 
  /// Copy \p Size bytes between two devices. For distinct devices a
  /// peer-to-peer copy is attempted first; on any P2P failure (query error,
  /// unsupported, enable error, or copy error) it falls back to a plain
  /// device-to-device copy on the source device's stream.
  int dataExchange(int SrcDevId, const void *SrcPtr, int DstDevId, void *DstPtr,
                   int64_t Size, __tgt_async_info *AsyncInfoPtr) const {
    assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");

    CUresult Err = cuCtxSetCurrent(DeviceData[SrcDevId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    CUstream Stream = getStream(SrcDevId, AsyncInfoPtr);

    // If they are two devices, we try peer to peer copy first
    if (SrcDevId != DstDevId) {
      int CanAccessPeer = 0;
      Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId);
      if (Err != CUDA_SUCCESS) {
        // Cannot even query P2P support; use the D2D fallback.
        DP("Error returned from cuDeviceCanAccessPeer. src = %" PRId32
           ", dst = %" PRId32 "\n",
           SrcDevId, DstDevId);
        CUDA_ERR_STRING(Err);
        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
      }

      if (!CanAccessPeer) {
        DP("P2P memcpy not supported so fall back to D2D memcpy");
        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
      }

      Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0);
      if (Err != CUDA_SUCCESS) {
        // NOTE(review): cuCtxEnablePeerAccess returns
        // CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED on repeat calls, which is
        // treated as a failure here and forces the D2D fallback — confirm
        // whether that error code should be accepted as success.
        DP("Error returned from cuCtxEnablePeerAccess. src = %" PRId32
           ", dst = %" PRId32 "\n",
           SrcDevId, DstDevId);
        CUDA_ERR_STRING(Err);
        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
      }

      Err = cuMemcpyPeerAsync((CUdeviceptr)DstPtr, DeviceData[DstDevId].Context,
                              (CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context,
                              Size, Stream);
      if (Err == CUDA_SUCCESS)
        return OFFLOAD_SUCCESS;

      DP("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD
         ", src_id =%" PRId32 ", dst_ptr = " DPxMOD ", dst_id =%" PRId32 "\n",
         DPxPTR(SrcPtr), SrcDevId, DPxPTR(DstPtr), DstDevId);
      CUDA_ERR_STRING(Err);
    }

    // Same device, or P2P copy failed: plain device-to-device copy.
    return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
  }
837 
838   int dataDelete(const int DeviceId, void *TgtPtr) const {
839     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
840     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
841       return OFFLOAD_FAIL;
842 
843     Err = cuMemFree((CUdeviceptr)TgtPtr);
844     if (!checkResult(Err, "Error returned from cuMemFree\n"))
845       return OFFLOAD_FAIL;
846 
847     return OFFLOAD_SUCCESS;
848   }
849 
850   int runTargetTeamRegion(const int DeviceId, void *TgtEntryPtr, void **TgtArgs,
851                           ptrdiff_t *TgtOffsets, const int ArgNum,
852                           const int TeamNum, const int ThreadLimit,
853                           const unsigned int LoopTripCount,
854                           __tgt_async_info *AsyncInfo) const {
855     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
856     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
857       return OFFLOAD_FAIL;
858 
859     // All args are references.
860     std::vector<void *> Args(ArgNum);
861     std::vector<void *> Ptrs(ArgNum);
862 
863     for (int I = 0; I < ArgNum; ++I) {
864       Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
865       Args[I] = &Ptrs[I];
866     }
867 
868     KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr);
869 
870     int CudaThreadsPerBlock;
871     if (ThreadLimit > 0) {
872       DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
873       CudaThreadsPerBlock = ThreadLimit;
874       // Add master warp if necessary
875       if (KernelInfo->ExecutionMode == GENERIC) {
876         DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize);
877         CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
878       }
879     } else {
880       DP("Setting CUDA threads per block to default %d\n",
881          DeviceData[DeviceId].NumThreads);
882       CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads;
883     }
884 
885     if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) {
886       DP("Threads per block capped at device limit %d\n",
887          DeviceData[DeviceId].ThreadsPerBlock);
888       CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock;
889     }
890 
891     if (!KernelInfo->MaxThreadsPerBlock) {
892       Err = cuFuncGetAttribute(&KernelInfo->MaxThreadsPerBlock,
893                                CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
894                                KernelInfo->Func);
895       if (!checkResult(Err, "Error returned from cuFuncGetAttribute\n"))
896         return OFFLOAD_FAIL;
897     }
898 
899     if (KernelInfo->MaxThreadsPerBlock < CudaThreadsPerBlock) {
900       DP("Threads per block capped at kernel limit %d\n",
901          KernelInfo->MaxThreadsPerBlock);
902       CudaThreadsPerBlock = KernelInfo->MaxThreadsPerBlock;
903     }
904 
905     unsigned int CudaBlocksPerGrid;
906     if (TeamNum <= 0) {
907       if (LoopTripCount > 0 && EnvNumTeams < 0) {
908         if (KernelInfo->ExecutionMode == SPMD) {
909           // We have a combined construct, i.e. `target teams distribute
910           // parallel for [simd]`. We launch so many teams so that each thread
911           // will execute one iteration of the loop. round up to the nearest
912           // integer
913           CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1;
914         } else {
915           // If we reach this point, then we have a non-combined construct, i.e.
916           // `teams distribute` with a nested `parallel for` and each team is
917           // assigned one iteration of the `distribute` loop. E.g.:
918           //
919           // #pragma omp target teams distribute
920           // for(...loop_tripcount...) {
921           //   #pragma omp parallel for
922           //   for(...) {}
923           // }
924           //
925           // Threads within a team will execute the iterations of the `parallel`
926           // loop.
927           CudaBlocksPerGrid = LoopTripCount;
928         }
929         DP("Using %d teams due to loop trip count %" PRIu32
930            " and number of threads per block %d\n",
931            CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock);
932       } else {
933         DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams);
934         CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams;
935       }
936     } else if (TeamNum > DeviceData[DeviceId].BlocksPerGrid) {
937       DP("Capping number of teams to team limit %d\n",
938          DeviceData[DeviceId].BlocksPerGrid);
939       CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid;
940     } else {
941       DP("Using requested number of teams %d\n", TeamNum);
942       CudaBlocksPerGrid = TeamNum;
943     }
944 
945     // Run on the device.
946     DP("Launch kernel with %d blocks and %d threads\n", CudaBlocksPerGrid,
947        CudaThreadsPerBlock);
948 
949     CUstream Stream = getStream(DeviceId, AsyncInfo);
950     Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1,
951                          /* gridDimZ */ 1, CudaThreadsPerBlock,
952                          /* blockDimY */ 1, /* blockDimZ */ 1,
953                          /* sharedMemBytes */ 0, Stream, &Args[0], nullptr);
954     if (!checkResult(Err, "Error returned from cuLaunchKernel\n"))
955       return OFFLOAD_FAIL;
956 
957     DP("Launch of entry point at " DPxMOD " successful!\n",
958        DPxPTR(TgtEntryPtr));
959 
960     return OFFLOAD_SUCCESS;
961   }
962 
963   int synchronize(const int DeviceId, __tgt_async_info *AsyncInfoPtr) const {
964     CUstream Stream = reinterpret_cast<CUstream>(AsyncInfoPtr->Queue);
965     CUresult Err = cuStreamSynchronize(Stream);
966     if (Err != CUDA_SUCCESS) {
967       DP("Error when synchronizing stream. stream = " DPxMOD
968          ", async info ptr = " DPxMOD "\n",
969          DPxPTR(Stream), DPxPTR(AsyncInfoPtr));
970       CUDA_ERR_STRING(Err);
971       return OFFLOAD_FAIL;
972     }
973 
974     // Once the stream is synchronized, return it to stream pool and reset
975     // async_info. This is to make sure the synchronization only works for its
976     // own tasks.
977     StreamManager->returnStream(
978         DeviceId, reinterpret_cast<CUstream>(AsyncInfoPtr->Queue));
979     AsyncInfoPtr->Queue = nullptr;
980 
981     return OFFLOAD_SUCCESS;
982   }
983 };
984 
985 DeviceRTLTy DeviceRTL;
986 } // namespace
987 
988 // Exposed library API function
989 #ifdef __cplusplus
990 extern "C" {
991 #endif
992 
993 int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
994   return elf_check_machine(image, /* EM_CUDA */ 190);
995 }
996 
997 int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); }
998 
999 int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
1000   DP("Init requires flags to %" PRId64 "\n", RequiresFlags);
1001   DeviceRTL.setRequiresFlag(RequiresFlags);
1002   return RequiresFlags;
1003 }
1004 
1005 int32_t __tgt_rtl_is_data_exchangable(int32_t src_dev_id, int dst_dev_id) {
1006   if (DeviceRTL.isValidDeviceId(src_dev_id) &&
1007       DeviceRTL.isValidDeviceId(dst_dev_id))
1008     return 1;
1009 
1010   return 0;
1011 }
1012 
1013 int32_t __tgt_rtl_init_device(int32_t device_id) {
1014   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1015 
1016   return DeviceRTL.initDevice(device_id);
1017 }
1018 
1019 __tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
1020                                           __tgt_device_image *image) {
1021   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1022 
1023   return DeviceRTL.loadBinary(device_id, image);
1024 }
1025 
1026 void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *) {
1027   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1028 
1029   return DeviceRTL.dataAlloc(device_id, size);
1030 }
1031 
1032 int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
1033                               int64_t size) {
1034   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1035 
1036   __tgt_async_info async_info;
1037   const int32_t rc = __tgt_rtl_data_submit_async(device_id, tgt_ptr, hst_ptr,
1038                                                  size, &async_info);
1039   if (rc != OFFLOAD_SUCCESS)
1040     return OFFLOAD_FAIL;
1041 
1042   return __tgt_rtl_synchronize(device_id, &async_info);
1043 }
1044 
1045 int32_t __tgt_rtl_data_submit_async(int32_t device_id, void *tgt_ptr,
1046                                     void *hst_ptr, int64_t size,
1047                                     __tgt_async_info *async_info_ptr) {
1048   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1049   assert(async_info_ptr && "async_info_ptr is nullptr");
1050 
1051   return DeviceRTL.dataSubmit(device_id, tgt_ptr, hst_ptr, size,
1052                               async_info_ptr);
1053 }
1054 
1055 int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
1056                                 int64_t size) {
1057   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1058 
1059   __tgt_async_info async_info;
1060   const int32_t rc = __tgt_rtl_data_retrieve_async(device_id, hst_ptr, tgt_ptr,
1061                                                    size, &async_info);
1062   if (rc != OFFLOAD_SUCCESS)
1063     return OFFLOAD_FAIL;
1064 
1065   return __tgt_rtl_synchronize(device_id, &async_info);
1066 }
1067 
1068 int32_t __tgt_rtl_data_retrieve_async(int32_t device_id, void *hst_ptr,
1069                                       void *tgt_ptr, int64_t size,
1070                                       __tgt_async_info *async_info_ptr) {
1071   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1072   assert(async_info_ptr && "async_info_ptr is nullptr");
1073 
1074   return DeviceRTL.dataRetrieve(device_id, hst_ptr, tgt_ptr, size,
1075                                 async_info_ptr);
1076 }
1077 
1078 int32_t __tgt_rtl_data_exchange_async(int32_t src_dev_id, void *src_ptr,
1079                                       int dst_dev_id, void *dst_ptr,
1080                                       int64_t size,
1081                                       __tgt_async_info *async_info_ptr) {
1082   assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
1083   assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");
1084   assert(async_info_ptr && "async_info_ptr is nullptr");
1085 
1086   return DeviceRTL.dataExchange(src_dev_id, src_ptr, dst_dev_id, dst_ptr, size,
1087                                 async_info_ptr);
1088 }
1089 
1090 int32_t __tgt_rtl_data_exchange(int32_t src_dev_id, void *src_ptr,
1091                                 int32_t dst_dev_id, void *dst_ptr,
1092                                 int64_t size) {
1093   assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
1094   assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");
1095 
1096   __tgt_async_info async_info;
1097   const int32_t rc = __tgt_rtl_data_exchange_async(
1098       src_dev_id, src_ptr, dst_dev_id, dst_ptr, size, &async_info);
1099   if (rc != OFFLOAD_SUCCESS)
1100     return OFFLOAD_FAIL;
1101 
1102   return __tgt_rtl_synchronize(src_dev_id, &async_info);
1103 }
1104 
1105 int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
1106   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1107 
1108   return DeviceRTL.dataDelete(device_id, tgt_ptr);
1109 }
1110 
1111 int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
1112                                          void **tgt_args,
1113                                          ptrdiff_t *tgt_offsets,
1114                                          int32_t arg_num, int32_t team_num,
1115                                          int32_t thread_limit,
1116                                          uint64_t loop_tripcount) {
1117   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1118 
1119   __tgt_async_info async_info;
1120   const int32_t rc = __tgt_rtl_run_target_team_region_async(
1121       device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
1122       thread_limit, loop_tripcount, &async_info);
1123   if (rc != OFFLOAD_SUCCESS)
1124     return OFFLOAD_FAIL;
1125 
1126   return __tgt_rtl_synchronize(device_id, &async_info);
1127 }
1128 
// Asynchronous team-region kernel launch on the stream carried by
// \p async_info_ptr; forwards all launch parameters to the plugin.
// NOTE(review): loop_tripcount arrives as uint64_t but the plugin method
// takes an unsigned int, so values >= 2^32 are silently truncated — confirm
// such trip counts are not expected here.
int32_t __tgt_rtl_run_target_team_region_async(
    int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
    ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
    int32_t thread_limit, uint64_t loop_tripcount,
    __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.runTargetTeamRegion(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
      thread_limit, loop_tripcount, async_info_ptr);
}
1140 
1141 int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
1142                                     void **tgt_args, ptrdiff_t *tgt_offsets,
1143                                     int32_t arg_num) {
1144   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1145 
1146   __tgt_async_info async_info;
1147   const int32_t rc = __tgt_rtl_run_target_region_async(
1148       device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, &async_info);
1149   if (rc != OFFLOAD_SUCCESS)
1150     return OFFLOAD_FAIL;
1151 
1152   return __tgt_rtl_synchronize(device_id, &async_info);
1153 }
1154 
1155 int32_t __tgt_rtl_run_target_region_async(int32_t device_id,
1156                                           void *tgt_entry_ptr, void **tgt_args,
1157                                           ptrdiff_t *tgt_offsets,
1158                                           int32_t arg_num,
1159                                           __tgt_async_info *async_info_ptr) {
1160   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1161 
1162   return __tgt_rtl_run_target_team_region_async(
1163       device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num,
1164       /* team num*/ 1, /* thread_limit */ 1, /* loop_tripcount */ 0,
1165       async_info_ptr);
1166 }
1167 
// Block until all work previously enqueued on the queue inside
// \p async_info_ptr has completed; the plugin then recycles the queue and
// resets it to nullptr.
int32_t __tgt_rtl_synchronize(int32_t device_id,
                              __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");
  // A null queue means nothing was ever enqueued; synchronizing it is a
  // caller bug, not a runtime condition.
  assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr");

  return DeviceRTL.synchronize(device_id, async_info_ptr);
}
1176 
1177 #ifdef __cplusplus
1178 }
1179 #endif
1180