1 //===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // RTL for CUDA machine
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include <cassert>
14 #include <cstddef>
15 #include <cuda.h>
16 #include <list>
17 #include <memory>
18 #include <mutex>
19 #include <string>
20 #include <vector>
21 
22 #include "omptargetplugin.h"
23 
24 #ifndef TARGET_NAME
25 #define TARGET_NAME CUDA
26 #endif
27 
28 #ifdef OMPTARGET_DEBUG
29 static int DebugLevel = 0;
30 
31 #define GETNAME2(name) #name
32 #define GETNAME(name) GETNAME2(name)
33 #define DP(...) \
34   do { \
35     if (DebugLevel > 0) { \
36       DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", __VA_ARGS__); \
37     } \
38   } while (false)
39 
40 // Utility for retrieving and printing CUDA error string.
41 #define CUDA_ERR_STRING(err) \
42   do { \
43     if (DebugLevel > 0) { \
44       const char *errStr; \
45       cuGetErrorString(err, &errStr); \
46       DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", "CUDA error is: %s\n", errStr); \
47     } \
48   } while (false)
49 #else // OMPTARGET_DEBUG
50 #define DP(...) {}
51 #define CUDA_ERR_STRING(err) {}
52 #endif // OMPTARGET_DEBUG
53 
54 #include "../../common/elf_common.c"
55 
56 /// Keep entries table per device.
57 struct FuncOrGblEntryTy {
58   __tgt_target_table Table;
59   std::vector<__tgt_offload_entry> Entries;
60 };
61 
enum ExecutionModeType {
  SPMD,    // constructors, destructors, and combined constructs
           // (`teams distribute parallel for [simd]`)
  GENERIC, // everything else
  NONE
};
68 
69 /// Use a single entity to encode a kernel and a set of flags.
70 struct KernelTy {
71   CUfunction Func;
72 
73   // execution mode of kernel
74   // 0 - SPMD mode (without master warp)
75   // 1 - Generic mode (with master warp)
76   int8_t ExecutionMode;
77 
78   KernelTy(CUfunction _Func, int8_t _ExecutionMode)
79       : Func(_Func), ExecutionMode(_ExecutionMode) {}
80 };
81 
82 /// Device environment data
83 /// Manually sync with the deviceRTL side for now, move to a dedicated header
84 /// file later.
85 struct omptarget_device_environmentTy {
86   int32_t debug_level;
87 };
88 
89 /// List that contains all the kernels.
90 /// FIXME: we may need this to be per device and per library.
91 std::list<KernelTy> KernelsList;
92 
93 namespace {
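// Return true if Err is CUDA_SUCCESS; otherwise print ErrMsg together with
// the CUDA error string (in debug builds) and return false.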
94 bool checkResult(CUresult Err, const char *ErrMsg) {
95   if (Err == CUDA_SUCCESS)
96     return true;
97 
  DP("%s", ErrMsg);
99   CUDA_ERR_STRING(Err);
100   return false;
101 }
102 
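// Enqueue an asynchronous device-to-device copy of Size bytes from SrcPtr to
// DstPtr on Stream. Returns OFFLOAD_SUCCESS or OFFLOAD_FAIL.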
103 int memcpyDtoD(const void *SrcPtr, void *DstPtr, int64_t Size,
104                CUstream Stream) {
105   CUresult Err =
106       cuMemcpyDtoDAsync((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, Size, Stream);
107 
108   if (Err != CUDA_SUCCESS) {
109     DP("Error when copying data from device to device. Pointers: src "
110        "= " DPxMOD ", dst = " DPxMOD ", size = %" PRId64 "\n",
111        DPxPTR(SrcPtr), DPxPTR(DstPtr), Size);
112     CUDA_ERR_STRING(Err);
113     return OFFLOAD_FAIL;
114   }
115 
116   return OFFLOAD_SUCCESS;
117 }
118 
// Structure containing per-device data.
120 struct DeviceDataTy {
121   std::list<FuncOrGblEntryTy> FuncGblEntries;
122   CUcontext Context = nullptr;
123   // Device properties
124   int ThreadsPerBlock = 0;
125   int BlocksPerGrid = 0;
126   int WarpSize = 0;
127   // OpenMP properties
128   int NumTeams = 0;
129   int NumThreads = 0;
130 };
131 
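/// Manager of a per-device pool of CUstreams. Streams are handed out by
/// getStream() and recycled by returnStream(); the pool grows on demand and
/// all streams are destroyed together with the manager.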
132 class StreamManagerTy {
133   int NumberOfDevices;
  // The initial size of the stream pool
135   int EnvNumInitialStreams;
136   // Per-device stream mutex
137   std::vector<std::unique_ptr<std::mutex>> StreamMtx;
138   // Per-device stream Id indicates the next available stream in the pool
139   std::vector<int> NextStreamId;
140   // Per-device stream pool
141   std::vector<std::vector<CUstream>> StreamPool;
142   // Reference to per-device data
143   std::vector<DeviceDataTy> &DeviceData;
144 
  // If no CUstream is left in the pool, resize the pool to allocate more
  // CUstreams. This function must be called while holding the per-device
  // mutex, and the pool is never shrunk.
148   void resizeStreamPool(const int DeviceId, const size_t NewSize) {
149     std::vector<CUstream> &Pool = StreamPool[DeviceId];
150     const size_t CurrentSize = Pool.size();
151     assert(NewSize > CurrentSize && "new size is not larger than current size");
152 
153     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
154     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) {
      // Return early if we cannot switch to the right context, so that we
      // do not create a batch of streams that belong to the wrong device.
      // Offloading will fail later because the selected CUstream will be
      // nullptr.
159       return;
160     }
161 
162     Pool.resize(NewSize, nullptr);
163 
164     for (size_t I = CurrentSize; I < NewSize; ++I) {
165       checkResult(cuStreamCreate(&Pool[I], CU_STREAM_NON_BLOCKING),
166                   "Error returned from cuStreamCreate\n");
167     }
168   }
169 
170 public:
171   StreamManagerTy(const int NumberOfDevices,
172                   std::vector<DeviceDataTy> &DeviceData)
173       : NumberOfDevices(NumberOfDevices), EnvNumInitialStreams(32),
174         DeviceData(DeviceData) {
175     StreamPool.resize(NumberOfDevices);
176     NextStreamId.resize(NumberOfDevices);
177     StreamMtx.resize(NumberOfDevices);
178 
179     if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS"))
180       EnvNumInitialStreams = std::stoi(EnvStr);
181 
182     // Initialize the next stream id
183     std::fill(NextStreamId.begin(), NextStreamId.end(), 0);
184 
185     // Initialize stream mutex
186     for (std::unique_ptr<std::mutex> &Ptr : StreamMtx)
187       Ptr = std::make_unique<std::mutex>();
188   }
189 
190   ~StreamManagerTy() {
191     // Destroy streams
192     for (int I = 0; I < NumberOfDevices; ++I) {
193       checkResult(cuCtxSetCurrent(DeviceData[I].Context),
194                   "Error returned from cuCtxSetCurrent\n");
195 
196       for (CUstream &S : StreamPool[I]) {
197         if (S)
198           checkResult(cuStreamDestroy(S),
199                       "Error returned from cuStreamDestroy\n");
200       }
201     }
202   }
203 
  // Get a CUstream from the pool. The per-device next stream id always
  // points to the next available CUstream; that is, CUstreams [0, id-1] have
  // been assigned and [id, ...) are still available. If no CUstream is left,
  // we request more CUstreams from the CUDA RT. Each time a CUstream is
  // assigned, the id is incremented by one.
  // xxxxxs+++++++++
  //      ^
  //      id
  // After assignment, the pool becomes the following and s is assigned.
  // xxxxxs+++++++++
  //       ^
  //       id
216   CUstream getStream(const int DeviceId) {
217     const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
218     int &Id = NextStreamId[DeviceId];
219     // No CUstream left in the pool, we need to request from CUDA RT
220     if (Id == StreamPool[DeviceId].size()) {
221       // By default we double the stream pool every time
222       resizeStreamPool(DeviceId, Id * 2);
223     }
224     return StreamPool[DeviceId][Id++];
225   }
226 
  // Return a CUstream to the pool. As mentioned above, the per-device next
  // stream id always points to the next available CUstream, so when a
  // CUstream is returned we first decrease the id and then copy the CUstream
  // back into the pool.
  // Note that streams may be returned in a different order than they were
  // assigned, so at some point the pool may hold two identical CUstreams:
  // xxax+a+++++
  //     ^
  //     id
  // This does not matter because the duplicates always sit on opposite sides
  // of id, and the one to the left of id will eventually be overwritten by
  // another returned CUstream. Consequently, after several executions the
  // order of the pool may differ from its initial state.
241   void returnStream(const int DeviceId, CUstream Stream) {
242     const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
243     int &Id = NextStreamId[DeviceId];
244     assert(Id > 0 && "Wrong stream ID");
245     StreamPool[DeviceId][--Id] = Stream;
246   }
247 
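  // Create the initial stream pool for a device. Returns true only if the
  // pool reaches EnvNumInitialStreams streams and every stream was created
  // successfully.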
248   bool initializeDeviceStreamPool(const int DeviceId) {
249     assert(StreamPool[DeviceId].empty() && "stream pool has been initialized");
250 
251     resizeStreamPool(DeviceId, EnvNumInitialStreams);
252 
253     // Check the size of stream pool
254     if (StreamPool[DeviceId].size() != EnvNumInitialStreams)
255       return false;
256 
257     // Check whether each stream is valid
258     for (CUstream &S : StreamPool[DeviceId])
259       if (!S)
260         return false;
261 
262     return true;
263   }
264 };
265 
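/// Central plugin state: per-device contexts, loaded CUDA modules, offload
/// entry tables and the stream manager, together with the operations that
/// back the __tgt_rtl_* entry points.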
266 class DeviceRTLTy {
267   int NumberOfDevices;
268   // OpenMP environment properties
269   int EnvNumTeams;
270   int EnvTeamLimit;
271   // OpenMP requires flags
272   int64_t RequiresFlags;
273 
274   static constexpr const int HardTeamLimit = 1U << 16U; // 64k
275   static constexpr const int HardThreadLimit = 1024;
276   static constexpr const int DefaultNumTeams = 128;
277   static constexpr const int DefaultNumThreads = 128;
278 
279   std::unique_ptr<StreamManagerTy> StreamManager;
280   std::vector<DeviceDataTy> DeviceData;
281   std::vector<CUmodule> Modules;
282 
  // Record an entry point associated with the device
284   void addOffloadEntry(const int DeviceId, const __tgt_offload_entry entry) {
285     FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
286     E.Entries.push_back(entry);
287   }
288 
  // Return true if the entry is associated with the device
290   bool findOffloadEntry(const int DeviceId, const void *Addr) const {
291     for (const __tgt_offload_entry &Itr :
292          DeviceData[DeviceId].FuncGblEntries.back().Entries)
293       if (Itr.addr == Addr)
294         return true;
295 
296     return false;
297   }
298 
299   // Return the pointer to the target entries table
300   __tgt_target_table *getOffloadEntriesTable(const int DeviceId) {
301     FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
302 
303     if (E.Entries.empty())
304       return nullptr;
305 
306     // Update table info according to the entries and return the pointer
307     E.Table.EntriesBegin = E.Entries.data();
308     E.Table.EntriesEnd = E.Entries.data() + E.Entries.size();
309 
310     return &E.Table;
311   }
312 
313   // Clear entries table for a device
314   void clearOffloadEntriesTable(const int DeviceId) {
315     DeviceData[DeviceId].FuncGblEntries.emplace_back();
316     FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
317     E.Entries.clear();
318     E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr;
319   }
320 
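  // Return the CUstream recorded in the async info, acquiring one from the
  // stream pool on first use.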
321   CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfoPtr) const {
322     assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");
323 
324     if (!AsyncInfoPtr->Queue)
325       AsyncInfoPtr->Queue = StreamManager->getStream(DeviceId);
326 
327     return reinterpret_cast<CUstream>(AsyncInfoPtr->Queue);
328   }
329 
330 public:
331   // This class should not be copied
332   DeviceRTLTy(const DeviceRTLTy &) = delete;
333   DeviceRTLTy(DeviceRTLTy &&) = delete;
334 
335   DeviceRTLTy()
336       : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1),
337         RequiresFlags(OMP_REQ_UNDEFINED) {
338 #ifdef OMPTARGET_DEBUG
339     if (const char *EnvStr = getenv("LIBOMPTARGET_DEBUG"))
340       DebugLevel = std::stoi(EnvStr);
341 #endif // OMPTARGET_DEBUG
342 
343     DP("Start initializing CUDA\n");
344 
345     CUresult Err = cuInit(0);
346     if (!checkResult(Err, "Error returned from cuInit\n")) {
347       return;
348     }
349 
350     Err = cuDeviceGetCount(&NumberOfDevices);
351     if (!checkResult(Err, "Error returned from cuDeviceGetCount\n"))
352       return;
353 
354     if (NumberOfDevices == 0) {
355       DP("There are no devices supporting CUDA.\n");
356       return;
357     }
358 
359     DeviceData.resize(NumberOfDevices);
360 
361     // Get environment variables regarding teams
362     if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) {
363       // OMP_TEAM_LIMIT has been set
364       EnvTeamLimit = std::stoi(EnvStr);
365       DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
366     }
367     if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) {
368       // OMP_NUM_TEAMS has been set
369       EnvNumTeams = std::stoi(EnvStr);
370       DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
371     }
372 
373     StreamManager =
374         std::make_unique<StreamManagerTy>(NumberOfDevices, DeviceData);
375   }
376 
377   ~DeviceRTLTy() {
    // Destroy the stream manager first so that its streams are released
    // before the contexts they belong to are destroyed.
379     StreamManager = nullptr;
380 
381     for (CUmodule &M : Modules)
382       // Close module
383       if (M)
384         checkResult(cuModuleUnload(M), "Error returned from cuModuleUnload\n");
385 
386     for (DeviceDataTy &D : DeviceData) {
387       // Destroy context
388       if (D.Context)
389         checkResult(cuCtxDestroy(D.Context),
390                     "Error returned from cuCtxDestroy\n");
391     }
392   }
393 
394   // Check whether a given DeviceId is valid
395   bool isValidDeviceId(const int DeviceId) const {
396     return DeviceId >= 0 && DeviceId < NumberOfDevices;
397   }
398 
399   int getNumOfDevices() const { return NumberOfDevices; }
400 
401   void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; }
402 
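  // Initialize a device: create its context and stream pool, query block/grid
  // limits and the warp size, and derive the default numbers of teams and
  // threads, honoring OMP_TEAM_LIMIT and OMP_NUM_TEAMS.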
403   int initDevice(const int DeviceId) {
404     CUdevice Device;
405 
406     DP("Getting device %d\n", DeviceId);
407     CUresult Err = cuDeviceGet(&Device, DeviceId);
408     if (!checkResult(Err, "Error returned from cuDeviceGet\n"))
409       return OFFLOAD_FAIL;
410 
411     // Create the context and save it to use whenever this device is selected.
412     Err = cuCtxCreate(&DeviceData[DeviceId].Context, CU_CTX_SCHED_BLOCKING_SYNC,
413                       Device);
414     if (!checkResult(Err, "Error returned from cuCtxCreate\n"))
415       return OFFLOAD_FAIL;
416 
417     Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
418     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
419       return OFFLOAD_FAIL;
420 
421     // Initialize stream pool
422     if (!StreamManager->initializeDeviceStreamPool(DeviceId))
423       return OFFLOAD_FAIL;
424 
425     // Query attributes to determine number of threads/block and blocks/grid.
426     int MaxGridDimX;
427     Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
428                                Device);
429     if (Err != CUDA_SUCCESS) {
      DP("Error getting max grid dimension, using default value %d\n",
431          DeviceRTLTy::DefaultNumTeams);
432       DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::DefaultNumTeams;
433     } else if (MaxGridDimX <= DeviceRTLTy::HardTeamLimit) {
434       DP("Using %d CUDA blocks per grid\n", MaxGridDimX);
435       DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX;
436     } else {
437       DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping "
438          "at the hard limit\n",
439          MaxGridDimX, DeviceRTLTy::HardTeamLimit);
440       DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::HardTeamLimit;
441     }
442 
443     // We are only exploiting threads along the x axis.
444     int MaxBlockDimX;
445     Err = cuDeviceGetAttribute(&MaxBlockDimX,
446                                CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device);
447     if (Err != CUDA_SUCCESS) {
      DP("Error getting max block dimension, using default value %d\n",
449          DeviceRTLTy::DefaultNumThreads);
450       DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads;
451     } else if (MaxBlockDimX <= DeviceRTLTy::HardThreadLimit) {
452       DP("Using %d CUDA threads per block\n", MaxBlockDimX);
453       DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX;
454     } else {
455       DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
456          "capping at the hard limit\n",
457          MaxBlockDimX, DeviceRTLTy::HardThreadLimit);
458       DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit;
459     }
460 
461     // Get and set warp size
462     int WarpSize;
463     Err =
464         cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device);
465     if (Err != CUDA_SUCCESS) {
      DP("Error getting warp size, assuming default value 32\n");
467       DeviceData[DeviceId].WarpSize = 32;
468     } else {
469       DP("Using warp size %d\n", WarpSize);
470       DeviceData[DeviceId].WarpSize = WarpSize;
471     }
472 
    // Adjust the number of teams according to the environment variables
474     if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) {
475       DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
476          EnvTeamLimit);
477       DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit;
478     }
479 
480     DP("Max number of CUDA blocks %d, threads %d & warp size %d\n",
481        DeviceData[DeviceId].BlocksPerGrid, DeviceData[DeviceId].ThreadsPerBlock,
482        DeviceData[DeviceId].WarpSize);
483 
484     // Set default number of teams
485     if (EnvNumTeams > 0) {
486       DP("Default number of teams set according to environment %d\n",
487          EnvNumTeams);
488       DeviceData[DeviceId].NumTeams = EnvNumTeams;
489     } else {
490       DeviceData[DeviceId].NumTeams = DeviceRTLTy::DefaultNumTeams;
491       DP("Default number of teams set according to library's default %d\n",
492          DeviceRTLTy::DefaultNumTeams);
493     }
494 
495     if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) {
496       DP("Default number of teams exceeds device limit, capping at %d\n",
497          DeviceData[DeviceId].BlocksPerGrid);
498       DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid;
499     }
500 
501     // Set default number of threads
502     DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads;
503     DP("Default number of threads set according to library's default %d\n",
504        DeviceRTLTy::DefaultNumThreads);
505     if (DeviceData[DeviceId].NumThreads >
506         DeviceData[DeviceId].ThreadsPerBlock) {
507       DP("Default number of threads exceeds device limit, capping at %d\n",
508          DeviceData[DeviceId].ThreadsPerBlock);
      DeviceData[DeviceId].NumThreads =
          DeviceData[DeviceId].ThreadsPerBlock;
510     }
511 
512     return OFFLOAD_SUCCESS;
513   }
514 
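  // Load a device image as a CUmodule, resolve its global variables and
  // kernel functions into the per-device offload entry table, and send the
  // device environment (e.g. the debug level) to the module if it exposes
  // the omptarget_device_environment symbol.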
515   __tgt_target_table *loadBinary(const int DeviceId,
516                                  const __tgt_device_image *Image) {
517     // Set the context we are using
518     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
519     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
520       return nullptr;
521 
522     // Clear the offload table as we are going to create a new one.
523     clearOffloadEntriesTable(DeviceId);
524 
525     // Create the module and extract the function pointers.
526     CUmodule Module;
527     DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart));
528     Err = cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr);
529     if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n"))
530       return nullptr;
531 
532     DP("CUDA module successfully loaded!\n");
533 
534     Modules.push_back(Module);
535 
536     // Find the symbols in the module by name.
537     const __tgt_offload_entry *HostBegin = Image->EntriesBegin;
538     const __tgt_offload_entry *HostEnd = Image->EntriesEnd;
539 
540     for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) {
541       if (!E->addr) {
        // We return nullptr when this happens: the host entry should always
        // carry an address that uniquely identifies the target region.
545         DP("Invalid binary: host entry '<null>' (size = %zd)...\n", E->size);
546         return nullptr;
547       }
548 
549       if (E->size) {
550         __tgt_offload_entry Entry = *E;
551         CUdeviceptr CUPtr;
552         size_t CUSize;
553         Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name);
        // We keep this style (rather than checkResult) because we need the
        // entry name for the message.
555         if (Err != CUDA_SUCCESS) {
556           DP("Loading global '%s' (Failed)\n", E->name);
557           CUDA_ERR_STRING(Err);
558           return nullptr;
559         }
560 
561         if (CUSize != E->size) {
562           DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name,
563              CUSize, E->size);
564           return nullptr;
565         }
566 
567         DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
568            DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr));
569 
570         Entry.addr = (void *)(CUPtr);
571 
572         // Note: In the current implementation declare target variables
573         // can either be link or to. This means that once unified
574         // memory is activated via the requires directive, the variable
575         // can be used directly from the host in both cases.
576         // TODO: when variables types other than to or link are added,
577         // the below condition should be changed to explicitly
578         // check for to and link variables types:
579         // (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags &
580         // OMP_DECLARE_TARGET_LINK || e->flags == OMP_DECLARE_TARGET_TO))
581         if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
582           // If unified memory is present any target link or to variables
583           // can access host addresses directly. There is no longer a
584           // need for device copies.
585           cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *));
586           DP("Copy linked variable host address (" DPxMOD
587              ") to device address (" DPxMOD ")\n",
588              DPxPTR(*((void **)E->addr)), DPxPTR(CUPtr));
589         }
590 
591         addOffloadEntry(DeviceId, Entry);
592 
593         continue;
594       }
595 
596       CUfunction Func;
597       Err = cuModuleGetFunction(&Func, Module, E->name);
      // We keep this style (rather than checkResult) because we need the
      // entry name for the message.
599       if (Err != CUDA_SUCCESS) {
600         DP("Loading '%s' (Failed)\n", E->name);
601         CUDA_ERR_STRING(Err);
602         return nullptr;
603       }
604 
605       DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
606          DPxPTR(E - HostBegin), E->name, DPxPTR(Func));
607 
      // Default to GENERIC in case the exec_mode symbol is missing from the
      // cubin file.
609       int8_t ExecModeVal = ExecutionModeType::GENERIC;
610       std::string ExecModeNameStr(E->name);
611       ExecModeNameStr += "_exec_mode";
612       const char *ExecModeName = ExecModeNameStr.c_str();
613 
614       CUdeviceptr ExecModePtr;
615       size_t CUSize;
616       Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName);
617       if (Err == CUDA_SUCCESS) {
618         if (CUSize != sizeof(int8_t)) {
619           DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
620              ExecModeName, CUSize, sizeof(int8_t));
621           return nullptr;
622         }
623 
624         Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize);
625         if (Err != CUDA_SUCCESS) {
626           DP("Error when copying data from device to host. Pointers: "
627              "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
628              DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize);
629           CUDA_ERR_STRING(Err);
630           return nullptr;
631         }
632 
633         if (ExecModeVal < 0 || ExecModeVal > 1) {
          DP("Error: wrong exec_mode value specified in cubin file: %d\n",
635              ExecModeVal);
636           return nullptr;
637         }
638       } else {
639         DP("Loading global exec_mode '%s' - symbol missing, using default "
640            "value GENERIC (1)\n",
641            ExecModeName);
642         CUDA_ERR_STRING(Err);
643       }
644 
645       KernelsList.emplace_back(Func, ExecModeVal);
646 
647       __tgt_offload_entry Entry = *E;
648       Entry.addr = &KernelsList.back();
649       addOffloadEntry(DeviceId, Entry);
650     }
651 
    // Send the device environment data to the device
653     {
654       omptarget_device_environmentTy DeviceEnv{0};
655 
656 #ifdef OMPTARGET_DEBUG
657       if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG"))
658         DeviceEnv.debug_level = std::stoi(EnvStr);
659 #endif
660 
661       const char *DeviceEnvName = "omptarget_device_environment";
662       CUdeviceptr DeviceEnvPtr;
663       size_t CUSize;
664 
665       Err = cuModuleGetGlobal(&DeviceEnvPtr, &CUSize, Module, DeviceEnvName);
666       if (Err == CUDA_SUCCESS) {
667         if (CUSize != sizeof(DeviceEnv)) {
          DP("Global device_environment '%s' - size mismatch (%zu != %zu)\n",
             DeviceEnvName, CUSize, sizeof(DeviceEnv));
          return nullptr;
672         }
673 
674         Err = cuMemcpyHtoD(DeviceEnvPtr, &DeviceEnv, CUSize);
675         if (Err != CUDA_SUCCESS) {
676           DP("Error when copying data from host to device. Pointers: "
677              "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
678              DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize);
679           CUDA_ERR_STRING(Err);
680           return nullptr;
681         }
682 
683         DP("Sending global device environment data %zu bytes\n", CUSize);
684       } else {
        DP("Finding global device environment '%s' - symbol missing.\n",
           DeviceEnvName);
        DP("Continuing, assuming this is a device RTL that does not accept "
           "an environment setting.\n");
689       }
690     }
691 
692     return getOffloadEntriesTable(DeviceId);
693   }
694 
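  // Allocate Size bytes of device memory; returns nullptr if Size is zero or
  // the allocation fails.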
695   void *dataAlloc(const int DeviceId, const int64_t Size) const {
696     if (Size == 0)
697       return nullptr;
698 
699     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
700     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
701       return nullptr;
702 
703     CUdeviceptr DevicePtr;
704     Err = cuMemAlloc(&DevicePtr, Size);
705     if (!checkResult(Err, "Error returned from cuMemAlloc\n"))
706       return nullptr;
707 
708     return (void *)DevicePtr;
709   }
710 
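  // Enqueue an asynchronous host-to-device copy on the stream associated with
  // AsyncInfoPtr.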
711   int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr,
712                  const int64_t Size, __tgt_async_info *AsyncInfoPtr) const {
713     assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");
714 
715     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
716     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
717       return OFFLOAD_FAIL;
718 
719     CUstream Stream = getStream(DeviceId, AsyncInfoPtr);
720 
721     Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream);
722     if (Err != CUDA_SUCCESS) {
723       DP("Error when copying data from host to device. Pointers: host = " DPxMOD
724          ", device = " DPxMOD ", size = %" PRId64 "\n",
725          DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
726       CUDA_ERR_STRING(Err);
727       return OFFLOAD_FAIL;
728     }
729 
730     return OFFLOAD_SUCCESS;
731   }
732 
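  // Enqueue an asynchronous device-to-host copy on the stream associated with
  // AsyncInfoPtr.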
733   int dataRetrieve(const int DeviceId, void *HstPtr, const void *TgtPtr,
734                    const int64_t Size, __tgt_async_info *AsyncInfoPtr) const {
735     assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");
736 
737     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
738     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
739       return OFFLOAD_FAIL;
740 
741     CUstream Stream = getStream(DeviceId, AsyncInfoPtr);
742 
743     Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
744     if (Err != CUDA_SUCCESS) {
745       DP("Error when copying data from device to host. Pointers: host = " DPxMOD
746          ", device = " DPxMOD ", size = %" PRId64 "\n",
747          DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
748       CUDA_ERR_STRING(Err);
749       return OFFLOAD_FAIL;
750     }
751 
752     return OFFLOAD_SUCCESS;
753   }
754 
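  // Copy data between two devices: try a peer-to-peer copy when the devices
  // differ and peer access can be enabled, otherwise fall back to an ordinary
  // device-to-device copy.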
755   int dataExchange(int SrcDevId, const void *SrcPtr, int DstDevId, void *DstPtr,
756                    int64_t Size, __tgt_async_info *AsyncInfoPtr) const {
757     assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");
758 
759     CUresult Err = cuCtxSetCurrent(DeviceData[SrcDevId].Context);
760     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
761       return OFFLOAD_FAIL;
762 
763     CUstream Stream = getStream(SrcDevId, AsyncInfoPtr);
764 
    // If source and destination are different devices, try a peer-to-peer
    // copy first
766     if (SrcDevId != DstDevId) {
767       int CanAccessPeer = 0;
768       Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId);
769       if (Err != CUDA_SUCCESS) {
770         DP("Error returned from cuDeviceCanAccessPeer. src = %" PRId32
771            ", dst = %" PRId32 "\n",
772            SrcDevId, DstDevId);
773         CUDA_ERR_STRING(Err);
774         return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
775       }
776 
777       if (!CanAccessPeer) {
        DP("P2P memcpy not supported, falling back to D2D memcpy\n");
779         return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
780       }
781 
782       Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0);
783       if (Err != CUDA_SUCCESS) {
784         DP("Error returned from cuCtxEnablePeerAccess. src = %" PRId32
785            ", dst = %" PRId32 "\n",
786            SrcDevId, DstDevId);
787         CUDA_ERR_STRING(Err);
788         return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
789       }
790 
791       Err = cuMemcpyPeerAsync((CUdeviceptr)DstPtr, DeviceData[DstDevId].Context,
792                               (CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context,
793                               Size, Stream);
794       if (Err == CUDA_SUCCESS)
795         return OFFLOAD_SUCCESS;
796 
      DP("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD
         ", src_id =%" PRId32 ", dst_ptr = " DPxMOD ", dst_id =%" PRId32 "\n",
         DPxPTR(SrcPtr), SrcDevId, DPxPTR(DstPtr), DstDevId);
800       CUDA_ERR_STRING(Err);
801     }
802 
803     return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
804   }
805 
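  // Free device memory previously allocated by dataAlloc.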
806   int dataDelete(const int DeviceId, void *TgtPtr) const {
807     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
808     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
809       return OFFLOAD_FAIL;
810 
811     Err = cuMemFree((CUdeviceptr)TgtPtr);
812     if (!checkResult(Err, "Error returned from cuMemFree\n"))
813       return OFFLOAD_FAIL;
814 
815     return OFFLOAD_SUCCESS;
816   }
817 
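  // Launch a target region: marshal the kernel arguments, compute the number
  // of threads per block and blocks per grid from the requested team/thread
  // counts, the loop trip count and the device limits, then launch the kernel
  // asynchronously on the stream associated with AsyncInfo.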
818   int runTargetTeamRegion(const int DeviceId, const void *TgtEntryPtr,
819                           void **TgtArgs, ptrdiff_t *TgtOffsets,
820                           const int ArgNum, const int TeamNum,
821                           const int ThreadLimit,
822                           const unsigned int LoopTripCount,
823                           __tgt_async_info *AsyncInfo) const {
824     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
825     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
826       return OFFLOAD_FAIL;
827 
    // All kernel arguments are passed by reference: Args[I] points to the
    // adjusted pointer stored in Ptrs[I].
829     std::vector<void *> Args(ArgNum);
830     std::vector<void *> Ptrs(ArgNum);
831 
832     for (int I = 0; I < ArgNum; ++I) {
833       Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
834       Args[I] = &Ptrs[I];
835     }
836 
837     const KernelTy *KernelInfo =
838         reinterpret_cast<const KernelTy *>(TgtEntryPtr);
839 
840     unsigned int CudaThreadsPerBlock;
841     if (ThreadLimit > 0) {
842       DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
843       CudaThreadsPerBlock = ThreadLimit;
844       // Add master warp if necessary
845       if (KernelInfo->ExecutionMode == GENERIC) {
846         DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize);
847         CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
848       }
849     } else {
850       DP("Setting CUDA threads per block to default %d\n",
851          DeviceData[DeviceId].NumThreads);
852       CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads;
853     }
854 
855     if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) {
856       DP("Threads per block capped at device limit %d\n",
857          DeviceData[DeviceId].ThreadsPerBlock);
858       CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock;
859     }
860 
861     int KernelLimit;
862     Err = cuFuncGetAttribute(&KernelLimit,
863                              CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
864                              KernelInfo->Func);
865     if (Err == CUDA_SUCCESS && KernelLimit < CudaThreadsPerBlock) {
866       DP("Threads per block capped at kernel limit %d\n", KernelLimit);
867       CudaThreadsPerBlock = KernelLimit;
868     }
869 
870     unsigned int CudaBlocksPerGrid;
871     if (TeamNum <= 0) {
872       if (LoopTripCount > 0 && EnvNumTeams < 0) {
873         if (KernelInfo->ExecutionMode == SPMD) {
          // We have a combined construct, i.e. `target teams distribute
          // parallel for [simd]`. We launch enough teams that each thread
          // executes one iteration of the loop, rounding up to the nearest
          // integer.
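          // For example (an illustrative sketch, not taken from this file):
          //
          //   #pragma omp target teams distribute parallel for
          //   for (int i = 0; i < LoopTripCount; ++i) {...}
          //
          // Each of the LoopTripCount iterations is executed by one thread.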
878           CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1;
879         } else {
880           // If we reach this point, then we have a non-combined construct, i.e.
881           // `teams distribute` with a nested `parallel for` and each team is
882           // assigned one iteration of the `distribute` loop. E.g.:
883           //
884           // #pragma omp target teams distribute
885           // for(...loop_tripcount...) {
886           //   #pragma omp parallel for
887           //   for(...) {}
888           // }
889           //
890           // Threads within a team will execute the iterations of the `parallel`
891           // loop.
892           CudaBlocksPerGrid = LoopTripCount;
893         }
        DP("Using %u teams due to loop trip count %u and number of threads "
           "per block %u\n",
           CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock);
897       } else {
898         DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams);
899         CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams;
900       }
901     } else if (TeamNum > DeviceData[DeviceId].BlocksPerGrid) {
902       DP("Capping number of teams to team limit %d\n",
903          DeviceData[DeviceId].BlocksPerGrid);
904       CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid;
905     } else {
906       DP("Using requested number of teams %d\n", TeamNum);
907       CudaBlocksPerGrid = TeamNum;
908     }
909 
910     // Run on the device.
911     DP("Launch kernel with %d blocks and %d threads\n", CudaBlocksPerGrid,
912        CudaThreadsPerBlock);
913 
914     CUstream Stream = getStream(DeviceId, AsyncInfo);
915     Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1,
916                          /* gridDimZ */ 1, CudaThreadsPerBlock,
917                          /* blockDimY */ 1, /* blockDimZ */ 1,
918                          /* sharedMemBytes */ 0, Stream, &Args[0], nullptr);
919     if (!checkResult(Err, "Error returned from cuLaunchKernel\n"))
920       return OFFLOAD_FAIL;
921 
922     DP("Launch of entry point at " DPxMOD " successful!\n",
923        DPxPTR(TgtEntryPtr));
924 
925     return OFFLOAD_SUCCESS;
926   }
927 
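  // Block until all work enqueued on the stream of AsyncInfoPtr has
  // completed, then return the stream to the pool and reset the queue.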
928   int synchronize(const int DeviceId, __tgt_async_info *AsyncInfoPtr) const {
929     CUstream Stream = reinterpret_cast<CUstream>(AsyncInfoPtr->Queue);
930     CUresult Err = cuStreamSynchronize(Stream);
931     if (Err != CUDA_SUCCESS) {
932       DP("Error when synchronizing stream. stream = " DPxMOD
933          ", async info ptr = " DPxMOD "\n",
934          DPxPTR(Stream), DPxPTR(AsyncInfoPtr));
935       CUDA_ERR_STRING(Err);
936       return OFFLOAD_FAIL;
937     }
938 
    // Once the stream is synchronized, return it to the stream pool and
    // reset AsyncInfoPtr->Queue so that later synchronizations only wait on
    // their own tasks.
942     StreamManager->returnStream(
943         DeviceId, reinterpret_cast<CUstream>(AsyncInfoPtr->Queue));
944     AsyncInfoPtr->Queue = nullptr;
945 
946     return OFFLOAD_SUCCESS;
947   }
948 };
949 
950 DeviceRTLTy DeviceRTL;
951 } // namespace
952 
953 // Exposed library API function
954 #ifdef __cplusplus
955 extern "C" {
956 #endif
957 
958 int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
959   return elf_check_machine(image, /* EM_CUDA */ 190);
960 }
961 
962 int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); }
963 
964 int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
  DP("Init requires flags to %" PRId64 "\n", RequiresFlags);
966   DeviceRTL.setRequiresFlag(RequiresFlags);
967   return RequiresFlags;
968 }
969 
int32_t __tgt_rtl_is_data_exchangable(int32_t src_dev_id,
                                      int32_t dst_dev_id) {
971   if (DeviceRTL.isValidDeviceId(src_dev_id) &&
972       DeviceRTL.isValidDeviceId(dst_dev_id))
973     return 1;
974 
975   return 0;
976 }
977 
978 int32_t __tgt_rtl_init_device(int32_t device_id) {
979   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
980 
981   return DeviceRTL.initDevice(device_id);
982 }
983 
984 __tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
985                                           __tgt_device_image *image) {
986   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
987 
988   return DeviceRTL.loadBinary(device_id, image);
989 }
990 
991 void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *) {
992   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
993 
994   return DeviceRTL.dataAlloc(device_id, size);
995 }
996 
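// The synchronous entry points below are thin wrappers: each calls its
// *_async counterpart with a local __tgt_async_info and then waits for
// completion via __tgt_rtl_synchronize.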
997 int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
998                               int64_t size) {
999   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1000 
1001   __tgt_async_info async_info;
1002   const int32_t rc = __tgt_rtl_data_submit_async(device_id, tgt_ptr, hst_ptr,
1003                                                  size, &async_info);
1004   if (rc != OFFLOAD_SUCCESS)
1005     return OFFLOAD_FAIL;
1006 
1007   return __tgt_rtl_synchronize(device_id, &async_info);
1008 }
1009 
1010 int32_t __tgt_rtl_data_submit_async(int32_t device_id, void *tgt_ptr,
1011                                     void *hst_ptr, int64_t size,
1012                                     __tgt_async_info *async_info_ptr) {
1013   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1014   assert(async_info_ptr && "async_info_ptr is nullptr");
1015 
1016   return DeviceRTL.dataSubmit(device_id, tgt_ptr, hst_ptr, size,
1017                               async_info_ptr);
1018 }
1019 
1020 int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
1021                                 int64_t size) {
1022   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1023 
1024   __tgt_async_info async_info;
1025   const int32_t rc = __tgt_rtl_data_retrieve_async(device_id, hst_ptr, tgt_ptr,
1026                                                    size, &async_info);
1027   if (rc != OFFLOAD_SUCCESS)
1028     return OFFLOAD_FAIL;
1029 
1030   return __tgt_rtl_synchronize(device_id, &async_info);
1031 }
1032 
1033 int32_t __tgt_rtl_data_retrieve_async(int32_t device_id, void *hst_ptr,
1034                                       void *tgt_ptr, int64_t size,
1035                                       __tgt_async_info *async_info_ptr) {
1036   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1037   assert(async_info_ptr && "async_info_ptr is nullptr");
1038 
1039   return DeviceRTL.dataRetrieve(device_id, hst_ptr, tgt_ptr, size,
1040                                 async_info_ptr);
1041 }
1042 
1043 int32_t __tgt_rtl_data_exchange_async(int32_t src_dev_id, void *src_ptr,
                                      int32_t dst_dev_id, void *dst_ptr,
1045                                       int64_t size,
1046                                       __tgt_async_info *async_info_ptr) {
1047   assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
1048   assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");
1049   assert(async_info_ptr && "async_info_ptr is nullptr");
1050 
1051   return DeviceRTL.dataExchange(src_dev_id, src_ptr, dst_dev_id, dst_ptr, size,
1052                                 async_info_ptr);
1053 }
1054 
1055 int32_t __tgt_rtl_data_exchange(int32_t src_dev_id, void *src_ptr,
1056                                 int32_t dst_dev_id, void *dst_ptr,
1057                                 int64_t size) {
1058   assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
1059   assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");
1060 
1061   __tgt_async_info async_info;
1062   const int32_t rc = __tgt_rtl_data_exchange_async(
1063       src_dev_id, src_ptr, dst_dev_id, dst_ptr, size, &async_info);
1064   if (rc != OFFLOAD_SUCCESS)
1065     return OFFLOAD_FAIL;
1066 
1067   return __tgt_rtl_synchronize(src_dev_id, &async_info);
1068 }
1069 
1070 int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
1071   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1072 
1073   return DeviceRTL.dataDelete(device_id, tgt_ptr);
1074 }
1075 
1076 int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
1077                                          void **tgt_args,
1078                                          ptrdiff_t *tgt_offsets,
1079                                          int32_t arg_num, int32_t team_num,
1080                                          int32_t thread_limit,
1081                                          uint64_t loop_tripcount) {
1082   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1083 
1084   __tgt_async_info async_info;
1085   const int32_t rc = __tgt_rtl_run_target_team_region_async(
1086       device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
1087       thread_limit, loop_tripcount, &async_info);
1088   if (rc != OFFLOAD_SUCCESS)
1089     return OFFLOAD_FAIL;
1090 
1091   return __tgt_rtl_synchronize(device_id, &async_info);
1092 }
1093 
1094 int32_t __tgt_rtl_run_target_team_region_async(
1095     int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
1096     ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
1097     int32_t thread_limit, uint64_t loop_tripcount,
1098     __tgt_async_info *async_info_ptr) {
1099   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1100 
1101   return DeviceRTL.runTargetTeamRegion(
1102       device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
1103       thread_limit, loop_tripcount, async_info_ptr);
1104 }
1105 
1106 int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
1107                                     void **tgt_args, ptrdiff_t *tgt_offsets,
1108                                     int32_t arg_num) {
1109   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1110 
1111   __tgt_async_info async_info;
1112   const int32_t rc = __tgt_rtl_run_target_region_async(
1113       device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, &async_info);
1114   if (rc != OFFLOAD_SUCCESS)
1115     return OFFLOAD_FAIL;
1116 
1117   return __tgt_rtl_synchronize(device_id, &async_info);
1118 }
1119 
1120 int32_t __tgt_rtl_run_target_region_async(int32_t device_id,
1121                                           void *tgt_entry_ptr, void **tgt_args,
1122                                           ptrdiff_t *tgt_offsets,
1123                                           int32_t arg_num,
1124                                           __tgt_async_info *async_info_ptr) {
1125   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1126 
1127   return __tgt_rtl_run_target_team_region_async(
1128       device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num,
      /* team num */ 1, /* thread_limit */ 1, /* loop_tripcount */ 0,
1130       async_info_ptr);
1131 }
1132 
1133 int32_t __tgt_rtl_synchronize(int32_t device_id,
1134                               __tgt_async_info *async_info_ptr) {
1135   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1136   assert(async_info_ptr && "async_info_ptr is nullptr");
1137   assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr");
1138 
1139   return DeviceRTL.synchronize(device_id, async_info_ptr);
1140 }
1141 
1142 #ifdef __cplusplus
1143 }
1144 #endif
1145