1 //===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // RTL for CUDA machine
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include <cassert>
14 #include <cstddef>
15 #include <cuda.h>
16 #include <list>
17 #include <memory>
18 #include <mutex>
19 #include <string>
20 #include <unordered_map>
21 #include <vector>
22 
23 #include "Debug.h"
24 #include "omptargetplugin.h"
25 
26 #define TARGET_NAME CUDA
27 #define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"
28 
29 #include "MemoryManager.h"
30 
31 // Utility for retrieving and printing CUDA error string.
32 #ifdef OMPTARGET_DEBUG
33 #define CUDA_ERR_STRING(err)                                                   \
34   do {                                                                         \
35     if (getDebugLevel() > 0) {                                                 \
36       const char *errStr = nullptr;                                            \
37       CUresult errStr_status = cuGetErrorString(err, &errStr);                 \
38       if (errStr_status == CUDA_ERROR_INVALID_VALUE)                           \
39         REPORT("Unrecognized CUDA error code: %d\n", err);                     \
40       else if (errStr_status == CUDA_SUCCESS)                                  \
41         REPORT("CUDA error is: %s\n", errStr);                                 \
42       else {                                                                   \
43         REPORT("Unresolved CUDA error code: %d\n", err);                       \
44         REPORT("Unsuccessful cuGetErrorString return status: %d\n",            \
45                errStr_status);                                                 \
46       }                                                                        \
47     } else {                                                                   \
48       const char *errStr = nullptr;                                            \
49       CUresult errStr_status = cuGetErrorString(err, &errStr);                 \
50       if (errStr_status == CUDA_SUCCESS)                                       \
51         REPORT("%s \n", errStr);                                               \
52     }                                                                          \
53   } while (false)
54 #else // OMPTARGET_DEBUG
55 #define CUDA_ERR_STRING(err)                                                   \
56   do {                                                                         \
57     const char *errStr = nullptr;                                              \
58     CUresult errStr_status = cuGetErrorString(err, &errStr);                   \
59     if (errStr_status == CUDA_SUCCESS)                                         \
60       REPORT("%s \n", errStr);                                                 \
61   } while (false)
62 #endif // OMPTARGET_DEBUG
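
// Illustrative usage sketch (not part of the plugin logic): CUDA_ERR_STRING is
// meant to be used right after a failing driver API call, e.g.
//
//   CUresult Err = cuCtxSetCurrent(Context);
//   if (Err != CUDA_SUCCESS) {
//     REPORT("Error returned from cuCtxSetCurrent\n");
//     CUDA_ERR_STRING(Err);
//   }
//
// This is the pattern wrapped by checkResult() further below.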
63 
64 #include "elf_common.h"
65 
66 /// Keep entries table per device.
67 struct FuncOrGblEntryTy {
68   __tgt_target_table Table;
69   std::vector<__tgt_offload_entry> Entries;
70 };
71 
72 enum ExecutionModeType {
  SPMD,    // constructors, destructors, and
           // combined constructs (`teams distribute parallel for [simd]`)
  GENERIC, // everything else
76   NONE
77 };
78 
79 /// Use a single entity to encode a kernel and a set of flags.
80 struct KernelTy {
81   CUfunction Func;
82 
83   // execution mode of kernel
84   // 0 - SPMD mode (without master warp)
85   // 1 - Generic mode (with master warp)
86   int8_t ExecutionMode;
87 
88   /// Maximal number of threads per block for this kernel.
89   int MaxThreadsPerBlock = 0;
90 
91   KernelTy(CUfunction _Func, int8_t _ExecutionMode)
92       : Func(_Func), ExecutionMode(_ExecutionMode) {}
93 };
94 
95 /// Device environment data
96 /// Manually sync with the deviceRTL side for now, move to a dedicated header
97 /// file later.
98 struct omptarget_device_environmentTy {
99   int32_t debug_level;
100 };
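
// A minimal sketch of the matching device-side definition (assumption for
// illustration; the actual declaration lives in the deviceRTL sources and must
// stay in sync with the struct above). loadBinary() below looks this global up
// by name:
//
//   __device__ omptarget_device_environmentTy omptarget_device_environment;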
101 
102 namespace {
103 bool checkResult(CUresult Err, const char *ErrMsg) {
104   if (Err == CUDA_SUCCESS)
105     return true;
106 
107   REPORT("%s", ErrMsg);
108   CUDA_ERR_STRING(Err);
109   return false;
110 }
111 
112 int memcpyDtoD(const void *SrcPtr, void *DstPtr, int64_t Size,
113                CUstream Stream) {
114   CUresult Err =
115       cuMemcpyDtoDAsync((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, Size, Stream);
116 
117   if (Err != CUDA_SUCCESS) {
118     DP("Error when copying data from device to device. Pointers: src "
119        "= " DPxMOD ", dst = " DPxMOD ", size = %" PRId64 "\n",
120        DPxPTR(SrcPtr), DPxPTR(DstPtr), Size);
121     CUDA_ERR_STRING(Err);
122     return OFFLOAD_FAIL;
123   }
124 
125   return OFFLOAD_SUCCESS;
126 }
127 
128 // Structure contains per-device data
129 struct DeviceDataTy {
130   /// List that contains all the kernels.
131   std::list<KernelTy> KernelsList;
132 
133   std::list<FuncOrGblEntryTy> FuncGblEntries;
134 
135   CUcontext Context = nullptr;
136   // Device properties
137   int ThreadsPerBlock = 0;
138   int BlocksPerGrid = 0;
139   int WarpSize = 0;
140   // OpenMP properties
141   int NumTeams = 0;
142   int NumThreads = 0;
143 };
144 
145 class StreamManagerTy {
146   int NumberOfDevices;
147   // The initial size of stream pool
148   int EnvNumInitialStreams;
149   // Per-device stream mutex
150   std::vector<std::unique_ptr<std::mutex>> StreamMtx;
151   // Per-device stream Id indicates the next available stream in the pool
152   std::vector<int> NextStreamId;
153   // Per-device stream pool
154   std::vector<std::vector<CUstream>> StreamPool;
155   // Reference to per-device data
156   std::vector<DeviceDataTy> &DeviceData;
157 
  // If there is no CUstream left in the pool, resize the pool to allocate
  // more CUstreams. This function should be called with the per-device mutex
  // held, and it never shrinks the pool.
161   void resizeStreamPool(const int DeviceId, const size_t NewSize) {
162     std::vector<CUstream> &Pool = StreamPool[DeviceId];
163     const size_t CurrentSize = Pool.size();
164     assert(NewSize > CurrentSize && "new size is not larger than current size");
165 
166     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
167     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) {
      // Return early if we cannot switch to the right context; otherwise we
      // would create a bunch of streams that do not belong to the right
      // device. The offloading will fail later because the selected CUstream
      // is nullptr.
172       return;
173     }
174 
175     Pool.resize(NewSize, nullptr);
176 
177     for (size_t I = CurrentSize; I < NewSize; ++I) {
178       checkResult(cuStreamCreate(&Pool[I], CU_STREAM_NON_BLOCKING),
179                   "Error returned from cuStreamCreate\n");
180     }
181   }
182 
183 public:
184   StreamManagerTy(const int NumberOfDevices,
185                   std::vector<DeviceDataTy> &DeviceData)
186       : NumberOfDevices(NumberOfDevices), EnvNumInitialStreams(32),
187         DeviceData(DeviceData) {
188     StreamPool.resize(NumberOfDevices);
189     NextStreamId.resize(NumberOfDevices);
190     StreamMtx.resize(NumberOfDevices);
191 
192     if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS"))
193       EnvNumInitialStreams = std::stoi(EnvStr);
194 
195     // Initialize the next stream id
196     std::fill(NextStreamId.begin(), NextStreamId.end(), 0);
197 
198     // Initialize stream mutex
199     for (std::unique_ptr<std::mutex> &Ptr : StreamMtx)
200       Ptr = std::make_unique<std::mutex>();
201   }
202 
203   ~StreamManagerTy() {
204     // Destroy streams
205     for (int I = 0; I < NumberOfDevices; ++I) {
206       checkResult(cuCtxSetCurrent(DeviceData[I].Context),
207                   "Error returned from cuCtxSetCurrent\n");
208 
209       for (CUstream &S : StreamPool[I]) {
210         if (S)
211           checkResult(cuStreamDestroy(S),
212                       "Error returned from cuStreamDestroy\n");
213       }
214     }
215   }
216 
  // Get a CUstream from the pool. The per-device next stream id always points
  // to the next available CUstream. That is, CUstreams [0, id-1] have been
  // assigned, and [id, ...) are still available. If there is no CUstream
  // left, we request more from the CUDA RT. Each time a CUstream is assigned,
  // the id is incremented by one.
222   // xxxxxs+++++++++
223   //      ^
224   //      id
225   // After assignment, the pool becomes the following and s is assigned.
226   // xxxxxs+++++++++
227   //       ^
228   //       id
229   CUstream getStream(const int DeviceId) {
230     const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
231     int &Id = NextStreamId[DeviceId];
232     // No CUstream left in the pool, we need to request from CUDA RT
233     if (Id == static_cast<int>(StreamPool[DeviceId].size())) {
234       // By default we double the stream pool every time
235       resizeStreamPool(DeviceId, Id * 2);
236     }
237     return StreamPool[DeviceId][Id++];
238   }
239 
  // Return a CUstream back to the pool. As mentioned above, the per-device
  // next stream id always points to the next available CUstream, so when we
  // return a CUstream, we first decrease the id and then copy the CUstream
  // back.
  // It is worth noting that streams may be returned in a different order than
  // they were assigned; that is, at some point there may be two identical
  // CUstreams in the pool.
247   // xxax+a+++++
248   //     ^
249   //     id
  // However, this does not matter because they are always on the two sides of
  // id. The one on the left will eventually be overwritten by another
  // CUstream. Therefore, after several executions, the order of the pool may
  // differ from its initial state.
254   void returnStream(const int DeviceId, CUstream Stream) {
255     const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
256     int &Id = NextStreamId[DeviceId];
257     assert(Id > 0 && "Wrong stream ID");
258     StreamPool[DeviceId][--Id] = Stream;
259   }
260 
261   bool initializeDeviceStreamPool(const int DeviceId) {
262     assert(StreamPool[DeviceId].empty() && "stream pool has been initialized");
263 
264     resizeStreamPool(DeviceId, EnvNumInitialStreams);
265 
266     // Check the size of stream pool
267     if (static_cast<int>(StreamPool[DeviceId].size()) != EnvNumInitialStreams)
268       return false;
269 
270     // Check whether each stream is valid
271     for (CUstream &S : StreamPool[DeviceId])
272       if (!S)
273         return false;
274 
275     return true;
276   }
277 };
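
// Illustrative usage sketch (assumed flow, mirroring how DeviceRTLTy below
// drives the pool): a CUstream is borrowed for a chain of asynchronous work
// and returned once that work has been synchronized, e.g.
//
//   CUstream Stream = StreamManager->getStream(DeviceId);
//   cuMemcpyHtoDAsync(DevPtr, HstPtr, Size, Stream);
//   cuStreamSynchronize(Stream);
//   StreamManager->returnStream(DeviceId, Stream);
//
// In the plugin itself the borrowed stream is cached in
// __tgt_async_info::Queue and only returned in DeviceRTLTy::synchronize().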
278 
279 class DeviceRTLTy {
280   int NumberOfDevices;
281   // OpenMP environment properties
282   int EnvNumTeams;
283   int EnvTeamLimit;
284   int EnvTeamThreadLimit;
285   // OpenMP requires flags
286   int64_t RequiresFlags;
287 
288   static constexpr const int HardTeamLimit = 1U << 16U; // 64k
289   static constexpr const int HardThreadLimit = 1024;
290   static constexpr const int DefaultNumTeams = 128;
291   static constexpr const int DefaultNumThreads = 128;
292 
293   std::unique_ptr<StreamManagerTy> StreamManager;
294   std::vector<DeviceDataTy> DeviceData;
295   std::vector<CUmodule> Modules;
296 
297   /// A class responsible for interacting with device native runtime library to
298   /// allocate and free memory.
299   class CUDADeviceAllocatorTy : public DeviceAllocatorTy {
300     const int DeviceId;
301     const std::vector<DeviceDataTy> &DeviceData;
302     std::unordered_map<void *, TargetAllocTy> HostPinnedAllocs;
303 
304   public:
305     CUDADeviceAllocatorTy(int DeviceId, std::vector<DeviceDataTy> &DeviceData)
306         : DeviceId(DeviceId), DeviceData(DeviceData) {}
307 
308     void *allocate(size_t Size, void *, TargetAllocTy Kind) override {
309       if (Size == 0)
310         return nullptr;
311 
312       CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
313       if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
314         return nullptr;
315 
316       void *MemAlloc = nullptr;
317       switch (Kind) {
318       case TARGET_ALLOC_DEFAULT:
319       case TARGET_ALLOC_DEVICE:
320         CUdeviceptr DevicePtr;
321         Err = cuMemAlloc(&DevicePtr, Size);
322         MemAlloc = (void *)DevicePtr;
323         if (!checkResult(Err, "Error returned from cuMemAlloc\n"))
324           return nullptr;
325         break;
326       case TARGET_ALLOC_HOST:
327         void *HostPtr;
328         Err = cuMemAllocHost(&HostPtr, Size);
329         MemAlloc = HostPtr;
330         if (!checkResult(Err, "Error returned from cuMemAllocHost\n"))
331           return nullptr;
332         HostPinnedAllocs[MemAlloc] = Kind;
333         break;
334       case TARGET_ALLOC_SHARED:
335         CUdeviceptr SharedPtr;
336         Err = cuMemAllocManaged(&SharedPtr, Size, CU_MEM_ATTACH_GLOBAL);
337         MemAlloc = (void *)SharedPtr;
338         if (!checkResult(Err, "Error returned from cuMemAllocManaged\n"))
339           return nullptr;
340         break;
341       }
342 
343       return MemAlloc;
344     }
345 
346     int free(void *TgtPtr) override {
347       CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
348       if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
349         return OFFLOAD_FAIL;
350 
351       // Host pinned memory must be freed differently.
352       TargetAllocTy Kind =
353           (HostPinnedAllocs.find(TgtPtr) == HostPinnedAllocs.end())
354               ? TARGET_ALLOC_DEFAULT
355               : TARGET_ALLOC_HOST;
356       switch (Kind) {
357       case TARGET_ALLOC_DEFAULT:
358       case TARGET_ALLOC_DEVICE:
359       case TARGET_ALLOC_SHARED:
360         Err = cuMemFree((CUdeviceptr)TgtPtr);
361         if (!checkResult(Err, "Error returned from cuMemFree\n"))
362           return OFFLOAD_FAIL;
363         break;
364       case TARGET_ALLOC_HOST:
365         Err = cuMemFreeHost(TgtPtr);
366         if (!checkResult(Err, "Error returned from cuMemFreeHost\n"))
367           return OFFLOAD_FAIL;
368         break;
369       }
370 
371       return OFFLOAD_SUCCESS;
372     }
373   };
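
  // Summary of the mapping above (illustrative, derived from the switch
  // statements in allocate()/free()):
  //   TARGET_ALLOC_DEVICE -> cuMemAlloc / cuMemFree         (device memory)
  //   TARGET_ALLOC_HOST   -> cuMemAllocHost / cuMemFreeHost (pinned host)
  //   TARGET_ALLOC_SHARED -> cuMemAllocManaged / cuMemFree  (managed memory)
  // TARGET_ALLOC_DEFAULT is treated like TARGET_ALLOC_DEVICE.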
374 
375   /// A vector of device allocators
376   std::vector<CUDADeviceAllocatorTy> DeviceAllocators;
377 
  /// A vector of memory managers. Since the memory manager is non-copyable and
  /// non-movable, we wrap them into std::unique_ptr.
380   std::vector<std::unique_ptr<MemoryManagerTy>> MemoryManagers;
381 
  /// Whether to use the memory manager
383   bool UseMemoryManager = true;
384 
385   // Record entry point associated with device
386   void addOffloadEntry(const int DeviceId, const __tgt_offload_entry entry) {
387     FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
388     E.Entries.push_back(entry);
389   }
390 
391   // Return a pointer to the entry associated with the pointer
392   const __tgt_offload_entry *getOffloadEntry(const int DeviceId,
393                                              const void *Addr) const {
394     for (const __tgt_offload_entry &Itr :
395          DeviceData[DeviceId].FuncGblEntries.back().Entries)
396       if (Itr.addr == Addr)
397         return &Itr;
398 
399     return nullptr;
400   }
401 
402   // Return the pointer to the target entries table
403   __tgt_target_table *getOffloadEntriesTable(const int DeviceId) {
404     FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
405 
406     if (E.Entries.empty())
407       return nullptr;
408 
409     // Update table info according to the entries and return the pointer
410     E.Table.EntriesBegin = E.Entries.data();
411     E.Table.EntriesEnd = E.Entries.data() + E.Entries.size();
412 
413     return &E.Table;
414   }
415 
416   // Clear entries table for a device
417   void clearOffloadEntriesTable(const int DeviceId) {
418     DeviceData[DeviceId].FuncGblEntries.emplace_back();
419     FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
420     E.Entries.clear();
421     E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr;
422   }
423 
424   CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfo) const {
425     assert(AsyncInfo && "AsyncInfo is nullptr");
426 
427     if (!AsyncInfo->Queue)
428       AsyncInfo->Queue = StreamManager->getStream(DeviceId);
429 
430     return reinterpret_cast<CUstream>(AsyncInfo->Queue);
431   }
432 
433 public:
  // This class should not be copied or moved
435   DeviceRTLTy(const DeviceRTLTy &) = delete;
436   DeviceRTLTy(DeviceRTLTy &&) = delete;
437 
438   DeviceRTLTy()
439       : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1),
440         EnvTeamThreadLimit(-1), RequiresFlags(OMP_REQ_UNDEFINED) {
441 
442     DP("Start initializing CUDA\n");
443 
444     CUresult Err = cuInit(0);
445     if (Err == CUDA_ERROR_INVALID_HANDLE) {
446       // Can't call cuGetErrorString if dlsym failed
447       DP("Failed to load CUDA shared library\n");
448       return;
449     }
450     if (!checkResult(Err, "Error returned from cuInit\n")) {
451       return;
452     }
453 
454     Err = cuDeviceGetCount(&NumberOfDevices);
455     if (!checkResult(Err, "Error returned from cuDeviceGetCount\n"))
456       return;
457 
458     if (NumberOfDevices == 0) {
459       DP("There are no devices supporting CUDA.\n");
460       return;
461     }
462 
463     DeviceData.resize(NumberOfDevices);
464 
465     // Get environment variables regarding teams
466     if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) {
467       // OMP_TEAM_LIMIT has been set
468       EnvTeamLimit = std::stoi(EnvStr);
469       DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
470     }
471     if (const char *EnvStr = getenv("OMP_TEAMS_THREAD_LIMIT")) {
472       // OMP_TEAMS_THREAD_LIMIT has been set
473       EnvTeamThreadLimit = std::stoi(EnvStr);
474       DP("Parsed OMP_TEAMS_THREAD_LIMIT=%d\n", EnvTeamThreadLimit);
475     }
476     if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) {
477       // OMP_NUM_TEAMS has been set
478       EnvNumTeams = std::stoi(EnvStr);
479       DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
480     }
481 
482     StreamManager =
483         std::make_unique<StreamManagerTy>(NumberOfDevices, DeviceData);
484 
485     for (int I = 0; I < NumberOfDevices; ++I)
486       DeviceAllocators.emplace_back(I, DeviceData);
487 
488     // Get the size threshold from environment variable
489     std::pair<size_t, bool> Res = MemoryManagerTy::getSizeThresholdFromEnv();
490     UseMemoryManager = Res.second;
491     size_t MemoryManagerThreshold = Res.first;
492 
493     if (UseMemoryManager)
494       for (int I = 0; I < NumberOfDevices; ++I)
495         MemoryManagers.emplace_back(std::make_unique<MemoryManagerTy>(
496             DeviceAllocators[I], MemoryManagerThreshold));
497   }
498 
499   ~DeviceRTLTy() {
    // We destruct the memory managers first in case their dependent data are
    // destroyed before them.
502     for (auto &M : MemoryManagers)
503       M.release();
504 
505     StreamManager = nullptr;
506 
507     for (CUmodule &M : Modules)
508       // Close module
509       if (M)
510         checkResult(cuModuleUnload(M), "Error returned from cuModuleUnload\n");
511 
512     for (DeviceDataTy &D : DeviceData) {
513       // Destroy context
514       if (D.Context) {
515         checkResult(cuCtxSetCurrent(D.Context),
516                     "Error returned from cuCtxSetCurrent\n");
517         CUdevice Device;
518         checkResult(cuCtxGetDevice(&Device),
519                     "Error returned from cuCtxGetDevice\n");
520         checkResult(cuDevicePrimaryCtxRelease(Device),
521                     "Error returned from cuDevicePrimaryCtxRelease\n");
522       }
523     }
524   }
525 
526   // Check whether a given DeviceId is valid
527   bool isValidDeviceId(const int DeviceId) const {
528     return DeviceId >= 0 && DeviceId < NumberOfDevices;
529   }
530 
531   int getNumOfDevices() const { return NumberOfDevices; }
532 
533   void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; }
534 
535   int initDevice(const int DeviceId) {
536     CUdevice Device;
537 
538     DP("Getting device %d\n", DeviceId);
539     CUresult Err = cuDeviceGet(&Device, DeviceId);
540     if (!checkResult(Err, "Error returned from cuDeviceGet\n"))
541       return OFFLOAD_FAIL;
542 
543     // Query the current flags of the primary context and set its flags if
544     // it is inactive
545     unsigned int FormerPrimaryCtxFlags = 0;
546     int FormerPrimaryCtxIsActive = 0;
547     Err = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags,
548                                      &FormerPrimaryCtxIsActive);
549     if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxGetState\n"))
550       return OFFLOAD_FAIL;
551 
552     if (FormerPrimaryCtxIsActive) {
553       DP("The primary context is active, no change to its flags\n");
554       if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) !=
555           CU_CTX_SCHED_BLOCKING_SYNC)
        DP("Warning: the current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n");
557     } else {
558       DP("The primary context is inactive, set its flags to "
559          "CU_CTX_SCHED_BLOCKING_SYNC\n");
560       Err = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC);
561       if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxSetFlags\n"))
562         return OFFLOAD_FAIL;
563     }
564 
565     // Retain the per device primary context and save it to use whenever this
566     // device is selected.
567     Err = cuDevicePrimaryCtxRetain(&DeviceData[DeviceId].Context, Device);
568     if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxRetain\n"))
569       return OFFLOAD_FAIL;
570 
571     Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
572     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
573       return OFFLOAD_FAIL;
574 
575     // Initialize stream pool
576     if (!StreamManager->initializeDeviceStreamPool(DeviceId))
577       return OFFLOAD_FAIL;
578 
579     // Query attributes to determine number of threads/block and blocks/grid.
580     int MaxGridDimX;
581     Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
582                                Device);
583     if (Err != CUDA_SUCCESS) {
584       DP("Error getting max grid dimension, use default value %d\n",
585          DeviceRTLTy::DefaultNumTeams);
586       DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::DefaultNumTeams;
587     } else if (MaxGridDimX <= DeviceRTLTy::HardTeamLimit) {
588       DP("Using %d CUDA blocks per grid\n", MaxGridDimX);
589       DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX;
590     } else {
591       DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping "
592          "at the hard limit\n",
593          MaxGridDimX, DeviceRTLTy::HardTeamLimit);
594       DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::HardTeamLimit;
595     }
596 
597     // We are only exploiting threads along the x axis.
598     int MaxBlockDimX;
599     Err = cuDeviceGetAttribute(&MaxBlockDimX,
600                                CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device);
601     if (Err != CUDA_SUCCESS) {
602       DP("Error getting max block dimension, use default value %d\n",
603          DeviceRTLTy::DefaultNumThreads);
604       DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads;
605     } else {
606       DP("Using %d CUDA threads per block\n", MaxBlockDimX);
607       DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX;
608 
609       if (EnvTeamThreadLimit > 0 &&
610           DeviceData[DeviceId].ThreadsPerBlock > EnvTeamThreadLimit) {
611         DP("Max CUDA threads per block %d exceeds the thread limit %d set by "
612            "OMP_TEAMS_THREAD_LIMIT, capping at the limit\n",
613            DeviceData[DeviceId].ThreadsPerBlock, EnvTeamThreadLimit);
614         DeviceData[DeviceId].ThreadsPerBlock = EnvTeamThreadLimit;
615       }
616       if (DeviceData[DeviceId].ThreadsPerBlock > DeviceRTLTy::HardThreadLimit) {
617         DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
618            "capping at the hard limit\n",
619            DeviceData[DeviceId].ThreadsPerBlock, DeviceRTLTy::HardThreadLimit);
620         DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit;
621       }
622     }
623 
624     // Get and set warp size
625     int WarpSize;
626     Err =
627         cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device);
628     if (Err != CUDA_SUCCESS) {
629       DP("Error getting warp size, assume default value 32\n");
630       DeviceData[DeviceId].WarpSize = 32;
631     } else {
632       DP("Using warp size %d\n", WarpSize);
633       DeviceData[DeviceId].WarpSize = WarpSize;
634     }
635 
636     // Adjust teams to the env variables
637     if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) {
638       DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
639          EnvTeamLimit);
640       DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit;
641     }
642 
643     INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
644          "Device supports up to %d CUDA blocks and %d threads with a "
645          "warp size of %d\n",
646          DeviceData[DeviceId].BlocksPerGrid,
647          DeviceData[DeviceId].ThreadsPerBlock, DeviceData[DeviceId].WarpSize);
648 
649     // Set default number of teams
650     if (EnvNumTeams > 0) {
651       DP("Default number of teams set according to environment %d\n",
652          EnvNumTeams);
653       DeviceData[DeviceId].NumTeams = EnvNumTeams;
654     } else {
655       DeviceData[DeviceId].NumTeams = DeviceRTLTy::DefaultNumTeams;
656       DP("Default number of teams set according to library's default %d\n",
657          DeviceRTLTy::DefaultNumTeams);
658     }
659 
660     if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) {
661       DP("Default number of teams exceeds device limit, capping at %d\n",
662          DeviceData[DeviceId].BlocksPerGrid);
663       DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid;
664     }
665 
666     // Set default number of threads
667     DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads;
668     DP("Default number of threads set according to library's default %d\n",
669        DeviceRTLTy::DefaultNumThreads);
670     if (DeviceData[DeviceId].NumThreads >
671         DeviceData[DeviceId].ThreadsPerBlock) {
672       DP("Default number of threads exceeds device limit, capping at %d\n",
673          DeviceData[DeviceId].ThreadsPerBlock);
      DeviceData[DeviceId].NumThreads = DeviceData[DeviceId].ThreadsPerBlock;
675     }
676 
677     return OFFLOAD_SUCCESS;
678   }
679 
680   __tgt_target_table *loadBinary(const int DeviceId,
681                                  const __tgt_device_image *Image) {
682     // Set the context we are using
683     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
684     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
685       return nullptr;
686 
687     // Clear the offload table as we are going to create a new one.
688     clearOffloadEntriesTable(DeviceId);
689 
690     // Create the module and extract the function pointers.
691     CUmodule Module;
692     DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart));
693     Err = cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr);
694     if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n"))
695       return nullptr;
696 
697     DP("CUDA module successfully loaded!\n");
698 
699     Modules.push_back(Module);
700 
701     // Find the symbols in the module by name.
702     const __tgt_offload_entry *HostBegin = Image->EntriesBegin;
703     const __tgt_offload_entry *HostEnd = Image->EntriesEnd;
704 
705     std::list<KernelTy> &KernelsList = DeviceData[DeviceId].KernelsList;
706     for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) {
707       if (!E->addr) {
        // We return nullptr when something like this happens; the host should
        // always have something in the address field to uniquely identify the
        // target region.
711         DP("Invalid binary: host entry '<null>' (size = %zd)...\n", E->size);
712         return nullptr;
713       }
714 
715       if (E->size) {
716         __tgt_offload_entry Entry = *E;
717         CUdeviceptr CUPtr;
718         size_t CUSize;
719         Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name);
720         // We keep this style here because we need the name
721         if (Err != CUDA_SUCCESS) {
722           REPORT("Loading global '%s' Failed\n", E->name);
723           CUDA_ERR_STRING(Err);
724           return nullptr;
725         }
726 
727         if (CUSize != E->size) {
728           DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name,
729              CUSize, E->size);
730           return nullptr;
731         }
732 
733         DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
734            DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr));
735 
736         Entry.addr = (void *)(CUPtr);
737 
738         // Note: In the current implementation declare target variables
739         // can either be link or to. This means that once unified
740         // memory is activated via the requires directive, the variable
741         // can be used directly from the host in both cases.
        // TODO: when variable types other than 'to' or 'link' are added, the
        // condition below should be changed to explicitly check for the 'to'
        // and 'link' variable types:
745         // (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags &
746         // OMP_DECLARE_TARGET_LINK || e->flags == OMP_DECLARE_TARGET_TO))
747         if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
          // If unified memory is present, any target 'link' or 'to' variables
          // can access host addresses directly; there is no longer a need for
          // device copies.
751           cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *));
752           DP("Copy linked variable host address (" DPxMOD
753              ") to device address (" DPxMOD ")\n",
754              DPxPTR(*((void **)E->addr)), DPxPTR(CUPtr));
755         }
756 
757         addOffloadEntry(DeviceId, Entry);
758 
759         continue;
760       }
761 
762       CUfunction Func;
763       Err = cuModuleGetFunction(&Func, Module, E->name);
764       // We keep this style here because we need the name
765       if (Err != CUDA_SUCCESS) {
766         REPORT("Loading '%s' Failed\n", E->name);
767         CUDA_ERR_STRING(Err);
768         return nullptr;
769       }
770 
771       DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
772          DPxPTR(E - HostBegin), E->name, DPxPTR(Func));
773 
774       // default value GENERIC (in case symbol is missing from cubin file)
775       int8_t ExecModeVal = ExecutionModeType::GENERIC;
776       std::string ExecModeNameStr(E->name);
777       ExecModeNameStr += "_exec_mode";
778       const char *ExecModeName = ExecModeNameStr.c_str();
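      // For example (hypothetical entry name), a kernel entry called
      // "__omp_offloading_fd00_1234_foo_l10" is expected to be paired with a
      // global "__omp_offloading_fd00_1234_foo_l10_exec_mode" that holds the
      // kernel's int8_t execution mode.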
779 
780       CUdeviceptr ExecModePtr;
781       size_t CUSize;
782       Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName);
783       if (Err == CUDA_SUCCESS) {
784         if (CUSize != sizeof(int8_t)) {
785           DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
786              ExecModeName, CUSize, sizeof(int8_t));
787           return nullptr;
788         }
789 
790         Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize);
791         if (Err != CUDA_SUCCESS) {
792           REPORT("Error when copying data from device to host. Pointers: "
793                  "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
794                  DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize);
795           CUDA_ERR_STRING(Err);
796           return nullptr;
797         }
798 
799         if (ExecModeVal < 0 || ExecModeVal > 1) {
800           DP("Error wrong exec_mode value specified in cubin file: %d\n",
801              ExecModeVal);
802           return nullptr;
803         }
804       } else {
805         REPORT("Loading global exec_mode '%s' - symbol missing, using default "
806                "value GENERIC (1)\n",
807                ExecModeName);
808         CUDA_ERR_STRING(Err);
809       }
810 
811       KernelsList.emplace_back(Func, ExecModeVal);
812 
813       __tgt_offload_entry Entry = *E;
814       Entry.addr = &KernelsList.back();
815       addOffloadEntry(DeviceId, Entry);
816     }
817 
818     // send device environment data to the device
819     {
820       omptarget_device_environmentTy DeviceEnv{0};
821 
822 #ifdef OMPTARGET_DEBUG
823       if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG"))
824         DeviceEnv.debug_level = std::stoi(EnvStr);
825 #endif
826 
827       const char *DeviceEnvName = "omptarget_device_environment";
828       CUdeviceptr DeviceEnvPtr;
829       size_t CUSize;
830 
831       Err = cuModuleGetGlobal(&DeviceEnvPtr, &CUSize, Module, DeviceEnvName);
832       if (Err == CUDA_SUCCESS) {
833         if (CUSize != sizeof(DeviceEnv)) {
          REPORT(
              "Global device_environment '%s' - size mismatch (%zu != %zu)\n",
              DeviceEnvName, CUSize, sizeof(DeviceEnv));
838           return nullptr;
839         }
840 
841         Err = cuMemcpyHtoD(DeviceEnvPtr, &DeviceEnv, CUSize);
842         if (Err != CUDA_SUCCESS) {
843           REPORT("Error when copying data from host to device. Pointers: "
844                  "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
845                  DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize);
846           CUDA_ERR_STRING(Err);
847           return nullptr;
848         }
849 
850         DP("Sending global device environment data %zu bytes\n", CUSize);
851       } else {
852         DP("Finding global device environment '%s' - symbol missing.\n",
853            DeviceEnvName);
        DP("Continuing, assuming this is a device RTL which does not accept "
           "an environment setting.\n");
856       }
857     }
858 
859     return getOffloadEntriesTable(DeviceId);
860   }
861 
862   void *dataAlloc(const int DeviceId, const int64_t Size,
863                   const TargetAllocTy Kind) {
864     switch (Kind) {
865     case TARGET_ALLOC_DEFAULT:
866     case TARGET_ALLOC_DEVICE:
867       if (UseMemoryManager)
868         return MemoryManagers[DeviceId]->allocate(Size, nullptr);
869       else
870         return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
871     case TARGET_ALLOC_HOST:
872     case TARGET_ALLOC_SHARED:
873       return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
874     }
875 
876     REPORT("Invalid target data allocation kind or requested allocator not "
877            "implemented yet\n");
878 
879     return nullptr;
880   }
881 
882   int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr,
883                  const int64_t Size, __tgt_async_info *AsyncInfo) const {
884     assert(AsyncInfo && "AsyncInfo is nullptr");
885 
886     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
887     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
888       return OFFLOAD_FAIL;
889 
890     CUstream Stream = getStream(DeviceId, AsyncInfo);
891 
892     Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream);
893     if (Err != CUDA_SUCCESS) {
894       DP("Error when copying data from host to device. Pointers: host "
895          "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
896          DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
897       CUDA_ERR_STRING(Err);
898       return OFFLOAD_FAIL;
899     }
900 
901     return OFFLOAD_SUCCESS;
902   }
903 
904   int dataRetrieve(const int DeviceId, void *HstPtr, const void *TgtPtr,
905                    const int64_t Size, __tgt_async_info *AsyncInfo) const {
906     assert(AsyncInfo && "AsyncInfo is nullptr");
907 
908     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
909     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
910       return OFFLOAD_FAIL;
911 
912     CUstream Stream = getStream(DeviceId, AsyncInfo);
913 
914     Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
915     if (Err != CUDA_SUCCESS) {
916       DP("Error when copying data from device to host. Pointers: host "
917          "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
918          DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
919       CUDA_ERR_STRING(Err);
920       return OFFLOAD_FAIL;
921     }
922 
923     return OFFLOAD_SUCCESS;
924   }
925 
926   int dataExchange(int SrcDevId, const void *SrcPtr, int DstDevId, void *DstPtr,
927                    int64_t Size, __tgt_async_info *AsyncInfo) const {
928     assert(AsyncInfo && "AsyncInfo is nullptr");
929 
930     CUresult Err = cuCtxSetCurrent(DeviceData[SrcDevId].Context);
931     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
932       return OFFLOAD_FAIL;
933 
934     CUstream Stream = getStream(SrcDevId, AsyncInfo);
935 
    // If the two devices differ, we try a peer-to-peer copy first
937     if (SrcDevId != DstDevId) {
938       int CanAccessPeer = 0;
939       Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId);
940       if (Err != CUDA_SUCCESS) {
941         REPORT("Error returned from cuDeviceCanAccessPeer. src = %" PRId32
942                ", dst = %" PRId32 "\n",
943                SrcDevId, DstDevId);
944         CUDA_ERR_STRING(Err);
945         return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
946       }
947 
948       if (!CanAccessPeer) {
        DP("P2P memcpy not supported so falling back to D2D memcpy\n");
950         return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
951       }
952 
953       Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0);
954       if (Err != CUDA_SUCCESS) {
955         REPORT("Error returned from cuCtxEnablePeerAccess. src = %" PRId32
956                ", dst = %" PRId32 "\n",
957                SrcDevId, DstDevId);
958         CUDA_ERR_STRING(Err);
959         return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
960       }
961 
962       Err = cuMemcpyPeerAsync((CUdeviceptr)DstPtr, DeviceData[DstDevId].Context,
963                               (CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context,
964                               Size, Stream);
965       if (Err == CUDA_SUCCESS)
966         return OFFLOAD_SUCCESS;
967 
968       DP("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD
969          ", src_id =%" PRId32 ", dst_ptr = " DPxMOD ", dst_id =%" PRId32 "\n",
970          DPxPTR(SrcPtr), SrcDevId, DPxPTR(DstPtr), DstDevId);
971       CUDA_ERR_STRING(Err);
972     }
973 
974     return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
975   }
976 
977   int dataDelete(const int DeviceId, void *TgtPtr) {
978     if (UseMemoryManager)
979       return MemoryManagers[DeviceId]->free(TgtPtr);
980 
981     return DeviceAllocators[DeviceId].free(TgtPtr);
982   }
983 
984   int runTargetTeamRegion(const int DeviceId, void *TgtEntryPtr, void **TgtArgs,
985                           ptrdiff_t *TgtOffsets, const int ArgNum,
986                           const int TeamNum, const int ThreadLimit,
987                           const unsigned int LoopTripCount,
988                           __tgt_async_info *AsyncInfo) const {
989     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
990     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
991       return OFFLOAD_FAIL;
992 
993     // All args are references.
994     std::vector<void *> Args(ArgNum);
995     std::vector<void *> Ptrs(ArgNum);
996 
997     for (int I = 0; I < ArgNum; ++I) {
998       Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
999       Args[I] = &Ptrs[I];
1000     }
1001 
1002     KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr);
1003 
1004     int CudaThreadsPerBlock;
1005     if (ThreadLimit > 0) {
1006       DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
1007       CudaThreadsPerBlock = ThreadLimit;
1008       // Add master warp if necessary
1009       if (KernelInfo->ExecutionMode == GENERIC) {
1010         DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize);
1011         CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
1012       }
1013     } else {
1014       DP("Setting CUDA threads per block to default %d\n",
1015          DeviceData[DeviceId].NumThreads);
1016       CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads;
1017     }
1018 
1019     if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) {
1020       DP("Threads per block capped at device limit %d\n",
1021          DeviceData[DeviceId].ThreadsPerBlock);
1022       CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock;
1023     }
1024 
1025     if (!KernelInfo->MaxThreadsPerBlock) {
1026       Err = cuFuncGetAttribute(&KernelInfo->MaxThreadsPerBlock,
1027                                CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
1028                                KernelInfo->Func);
1029       if (!checkResult(Err, "Error returned from cuFuncGetAttribute\n"))
1030         return OFFLOAD_FAIL;
1031     }
1032 
1033     if (KernelInfo->MaxThreadsPerBlock < CudaThreadsPerBlock) {
1034       DP("Threads per block capped at kernel limit %d\n",
1035          KernelInfo->MaxThreadsPerBlock);
1036       CudaThreadsPerBlock = KernelInfo->MaxThreadsPerBlock;
1037     }
1038 
1039     unsigned int CudaBlocksPerGrid;
1040     if (TeamNum <= 0) {
1041       if (LoopTripCount > 0 && EnvNumTeams < 0) {
1042         if (KernelInfo->ExecutionMode == SPMD) {
          // We have a combined construct, i.e. `target teams distribute
          // parallel for [simd]`. We launch enough teams so that each thread
          // executes one iteration of the loop; round up to the nearest
          // integer.
1047           CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1;
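          // For example, with LoopTripCount = 1000 and CudaThreadsPerBlock =
          // 128 this yields ((1000 - 1) / 128) + 1 = 8 blocks, i.e. 8 * 128 =
          // 1024 threads covering the 1000 iterations.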
1048         } else {
1049           // If we reach this point, then we have a non-combined construct, i.e.
1050           // `teams distribute` with a nested `parallel for` and each team is
1051           // assigned one iteration of the `distribute` loop. E.g.:
1052           //
1053           // #pragma omp target teams distribute
1054           // for(...loop_tripcount...) {
1055           //   #pragma omp parallel for
1056           //   for(...) {}
1057           // }
1058           //
1059           // Threads within a team will execute the iterations of the `parallel`
1060           // loop.
1061           CudaBlocksPerGrid = LoopTripCount;
1062         }
1063         DP("Using %d teams due to loop trip count %" PRIu32
1064            " and number of threads per block %d\n",
1065            CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock);
1066       } else {
1067         DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams);
1068         CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams;
1069       }
1070     } else if (TeamNum > DeviceData[DeviceId].BlocksPerGrid) {
1071       DP("Capping number of teams to team limit %d\n",
1072          DeviceData[DeviceId].BlocksPerGrid);
1073       CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid;
1074     } else {
1075       DP("Using requested number of teams %d\n", TeamNum);
1076       CudaBlocksPerGrid = TeamNum;
1077     }
1078 
1079     INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
1080          "Launching kernel %s with %d blocks and %d threads in %s "
1081          "mode\n",
1082          (getOffloadEntry(DeviceId, TgtEntryPtr))
1083              ? getOffloadEntry(DeviceId, TgtEntryPtr)->name
1084              : "(null)",
1085          CudaBlocksPerGrid, CudaThreadsPerBlock,
1086          (KernelInfo->ExecutionMode == SPMD) ? "SPMD" : "Generic");
1087 
1088     CUstream Stream = getStream(DeviceId, AsyncInfo);
1089     Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1,
1090                          /* gridDimZ */ 1, CudaThreadsPerBlock,
1091                          /* blockDimY */ 1, /* blockDimZ */ 1,
1092                          /* sharedMemBytes */ 0, Stream, &Args[0], nullptr);
1093     if (!checkResult(Err, "Error returned from cuLaunchKernel\n"))
1094       return OFFLOAD_FAIL;
1095 
1096     DP("Launch of entry point at " DPxMOD " successful!\n",
1097        DPxPTR(TgtEntryPtr));
1098 
1099     return OFFLOAD_SUCCESS;
1100   }
1101 
1102   int synchronize(const int DeviceId, __tgt_async_info *AsyncInfo) const {
1103     CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue);
1104     CUresult Err = cuStreamSynchronize(Stream);
1105 
    // Once the stream is synchronized, return it to the stream pool and reset
    // AsyncInfo. This makes sure the synchronization only covers its own
    // tasks.
1109     StreamManager->returnStream(DeviceId,
1110                                 reinterpret_cast<CUstream>(AsyncInfo->Queue));
1111     AsyncInfo->Queue = nullptr;
1112 
1113     if (Err != CUDA_SUCCESS) {
1114       DP("Error when synchronizing stream. stream = " DPxMOD
1115          ", async info ptr = " DPxMOD "\n",
1116          DPxPTR(Stream), DPxPTR(AsyncInfo));
1117       CUDA_ERR_STRING(Err);
1118     }
1119     return (Err == CUDA_SUCCESS) ? OFFLOAD_SUCCESS : OFFLOAD_FAIL;
1120   }
1121 };
1122 
1123 DeviceRTLTy DeviceRTL;
1124 } // namespace
1125 
1126 // Exposed library API function
1127 #ifdef __cplusplus
1128 extern "C" {
1129 #endif
1130 
1131 int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
1132   return elf_check_machine(image, /* EM_CUDA */ 190);
1133 }
1134 
1135 int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); }
1136 
1137 int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
1138   DP("Init requires flags to %" PRId64 "\n", RequiresFlags);
1139   DeviceRTL.setRequiresFlag(RequiresFlags);
1140   return RequiresFlags;
1141 }
1142 
int32_t __tgt_rtl_is_data_exchangable(int32_t src_dev_id, int32_t dst_dev_id) {
1144   if (DeviceRTL.isValidDeviceId(src_dev_id) &&
1145       DeviceRTL.isValidDeviceId(dst_dev_id))
1146     return 1;
1147 
1148   return 0;
1149 }
1150 
1151 int32_t __tgt_rtl_init_device(int32_t device_id) {
1152   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1153 
1154   return DeviceRTL.initDevice(device_id);
1155 }
1156 
1157 __tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
1158                                           __tgt_device_image *image) {
1159   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1160 
1161   return DeviceRTL.loadBinary(device_id, image);
1162 }
1163 
1164 void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *,
1165                            int32_t kind) {
1166   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1167 
1168   return DeviceRTL.dataAlloc(device_id, size, (TargetAllocTy)kind);
1169 }
1170 
1171 int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
1172                               int64_t size) {
1173   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1174 
1175   __tgt_async_info AsyncInfo;
1176   const int32_t rc = __tgt_rtl_data_submit_async(device_id, tgt_ptr, hst_ptr,
1177                                                  size, &AsyncInfo);
1178   if (rc != OFFLOAD_SUCCESS)
1179     return OFFLOAD_FAIL;
1180 
1181   return __tgt_rtl_synchronize(device_id, &AsyncInfo);
1182 }
1183 
1184 int32_t __tgt_rtl_data_submit_async(int32_t device_id, void *tgt_ptr,
1185                                     void *hst_ptr, int64_t size,
1186                                     __tgt_async_info *async_info_ptr) {
1187   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1188   assert(async_info_ptr && "async_info_ptr is nullptr");
1189 
1190   return DeviceRTL.dataSubmit(device_id, tgt_ptr, hst_ptr, size,
1191                               async_info_ptr);
1192 }
1193 
1194 int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
1195                                 int64_t size) {
1196   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1197 
1198   __tgt_async_info AsyncInfo;
1199   const int32_t rc = __tgt_rtl_data_retrieve_async(device_id, hst_ptr, tgt_ptr,
1200                                                    size, &AsyncInfo);
1201   if (rc != OFFLOAD_SUCCESS)
1202     return OFFLOAD_FAIL;
1203 
1204   return __tgt_rtl_synchronize(device_id, &AsyncInfo);
1205 }
1206 
1207 int32_t __tgt_rtl_data_retrieve_async(int32_t device_id, void *hst_ptr,
1208                                       void *tgt_ptr, int64_t size,
1209                                       __tgt_async_info *async_info_ptr) {
1210   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1211   assert(async_info_ptr && "async_info_ptr is nullptr");
1212 
1213   return DeviceRTL.dataRetrieve(device_id, hst_ptr, tgt_ptr, size,
1214                                 async_info_ptr);
1215 }
1216 
int32_t __tgt_rtl_data_exchange_async(int32_t src_dev_id, void *src_ptr,
                                      int32_t dst_dev_id, void *dst_ptr,
                                      int64_t size,
                                      __tgt_async_info *AsyncInfo) {
1221   assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
1222   assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");
1223   assert(AsyncInfo && "AsyncInfo is nullptr");
1224 
1225   return DeviceRTL.dataExchange(src_dev_id, src_ptr, dst_dev_id, dst_ptr, size,
1226                                 AsyncInfo);
1227 }
1228 
1229 int32_t __tgt_rtl_data_exchange(int32_t src_dev_id, void *src_ptr,
1230                                 int32_t dst_dev_id, void *dst_ptr,
1231                                 int64_t size) {
1232   assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid");
1233   assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid");
1234 
1235   __tgt_async_info AsyncInfo;
1236   const int32_t rc = __tgt_rtl_data_exchange_async(
1237       src_dev_id, src_ptr, dst_dev_id, dst_ptr, size, &AsyncInfo);
1238   if (rc != OFFLOAD_SUCCESS)
1239     return OFFLOAD_FAIL;
1240 
1241   return __tgt_rtl_synchronize(src_dev_id, &AsyncInfo);
1242 }
1243 
1244 int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
1245   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1246 
1247   return DeviceRTL.dataDelete(device_id, tgt_ptr);
1248 }
1249 
1250 int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
1251                                          void **tgt_args,
1252                                          ptrdiff_t *tgt_offsets,
1253                                          int32_t arg_num, int32_t team_num,
1254                                          int32_t thread_limit,
1255                                          uint64_t loop_tripcount) {
1256   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1257 
1258   __tgt_async_info AsyncInfo;
1259   const int32_t rc = __tgt_rtl_run_target_team_region_async(
1260       device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
1261       thread_limit, loop_tripcount, &AsyncInfo);
1262   if (rc != OFFLOAD_SUCCESS)
1263     return OFFLOAD_FAIL;
1264 
1265   return __tgt_rtl_synchronize(device_id, &AsyncInfo);
1266 }
1267 
1268 int32_t __tgt_rtl_run_target_team_region_async(
1269     int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
1270     ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
1271     int32_t thread_limit, uint64_t loop_tripcount,
1272     __tgt_async_info *async_info_ptr) {
1273   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1274 
1275   return DeviceRTL.runTargetTeamRegion(
1276       device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
1277       thread_limit, loop_tripcount, async_info_ptr);
1278 }
1279 
1280 int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
1281                                     void **tgt_args, ptrdiff_t *tgt_offsets,
1282                                     int32_t arg_num) {
1283   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1284 
1285   __tgt_async_info AsyncInfo;
1286   const int32_t rc = __tgt_rtl_run_target_region_async(
1287       device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, &AsyncInfo);
1288   if (rc != OFFLOAD_SUCCESS)
1289     return OFFLOAD_FAIL;
1290 
1291   return __tgt_rtl_synchronize(device_id, &AsyncInfo);
1292 }
1293 
1294 int32_t __tgt_rtl_run_target_region_async(int32_t device_id,
1295                                           void *tgt_entry_ptr, void **tgt_args,
1296                                           ptrdiff_t *tgt_offsets,
1297                                           int32_t arg_num,
1298                                           __tgt_async_info *async_info_ptr) {
1299   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1300 
1301   return __tgt_rtl_run_target_team_region_async(
1302       device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num,
1303       /* team num*/ 1, /* thread_limit */ 1, /* loop_tripcount */ 0,
1304       async_info_ptr);
1305 }
1306 
1307 int32_t __tgt_rtl_synchronize(int32_t device_id,
1308                               __tgt_async_info *async_info_ptr) {
1309   assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
1310   assert(async_info_ptr && "async_info_ptr is nullptr");
1311   assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr");
1312 
1313   return DeviceRTL.synchronize(device_id, async_info_ptr);
1314 }
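
// Illustrative sketch of the call sequence (assumption for illustration; in
// practice libomptarget itself drives these entry points). A single offload
// region roughly maps to:
//
//   __tgt_rtl_init_device(DeviceId);
//   __tgt_target_table *Table = __tgt_rtl_load_binary(DeviceId, Image);
//   void *TgtPtr = __tgt_rtl_data_alloc(DeviceId, Size, nullptr,
//                                       TARGET_ALLOC_DEFAULT);
//   __tgt_rtl_data_submit(DeviceId, TgtPtr, HstPtr, Size);
//   __tgt_rtl_run_target_region(DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets,
//                               ArgNum);
//   __tgt_rtl_data_retrieve(DeviceId, HstPtr, TgtPtr, Size);
//   __tgt_rtl_data_delete(DeviceId, TgtPtr);
//
// where TgtEntryPtr is an address taken from the offload table returned by
// __tgt_rtl_load_binary().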
1315 
1316 void __tgt_rtl_set_info_flag(uint32_t NewInfoLevel) {
1317   std::atomic<uint32_t> &InfoLevel = getInfoLevelInternal();
1318   InfoLevel.store(NewInfoLevel);
1319 }
1320 
1321 #ifdef __cplusplus
1322 }
1323 #endif
1324