1 //===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // RTL for CUDA machine
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "llvm/ADT/StringRef.h"
14 
15 #include <algorithm>
16 #include <cassert>
17 #include <cstddef>
18 #include <cuda.h>
19 #include <list>
20 #include <memory>
21 #include <mutex>
22 #include <string>
23 #include <unordered_map>
24 #include <vector>
25 
26 #include "Debug.h"
27 #include "DeviceEnvironment.h"
28 #include "omptarget.h"
29 #include "omptargetplugin.h"
30 
31 #define TARGET_NAME CUDA
32 #define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"
33 
34 #include "MemoryManager.h"
35 
36 #include "llvm/Frontend/OpenMP/OMPConstants.h"
37 
38 using namespace llvm;
39 
40 // Utility for retrieving and printing CUDA error string.
41 #ifdef OMPTARGET_DEBUG
42 #define CUDA_ERR_STRING(err)                                                   \
43   do {                                                                         \
44     if (getDebugLevel() > 0) {                                                 \
45       const char *errStr = nullptr;                                            \
46       CUresult errStr_status = cuGetErrorString(err, &errStr);                 \
47       if (errStr_status == CUDA_ERROR_INVALID_VALUE)                           \
48         REPORT("Unrecognized CUDA error code: %d\n", err);                     \
49       else if (errStr_status == CUDA_SUCCESS)                                  \
50         REPORT("CUDA error is: %s\n", errStr);                                 \
51       else {                                                                   \
52         REPORT("Unresolved CUDA error code: %d\n", err);                       \
53         REPORT("Unsuccessful cuGetErrorString return status: %d\n",            \
54                errStr_status);                                                 \
55       }                                                                        \
56     } else {                                                                   \
57       const char *errStr = nullptr;                                            \
58       CUresult errStr_status = cuGetErrorString(err, &errStr);                 \
59       if (errStr_status == CUDA_SUCCESS)                                       \
60         REPORT("%s \n", errStr);                                               \
61     }                                                                          \
62   } while (false)
63 #else // OMPTARGET_DEBUG
64 #define CUDA_ERR_STRING(err)                                                   \
65   do {                                                                         \
66     const char *errStr = nullptr;                                              \
67     CUresult errStr_status = cuGetErrorString(err, &errStr);                   \
68     if (errStr_status == CUDA_SUCCESS)                                         \
69       REPORT("%s \n", errStr);                                                 \
70   } while (false)
71 #endif // OMPTARGET_DEBUG
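// For illustration, a typical call site in this plugin either wraps the driver
// API call with checkResult() (defined below) or reports the failure directly,
// roughly as in this minimal sketch (not an additional entry point of the RTL):
//
//   CUresult Err = cuCtxSetCurrent(Context);
//   if (Err != CUDA_SUCCESS) {
//     REPORT("Error when setting the CUDA context\n");
//     CUDA_ERR_STRING(Err);
//   }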
72 
73 #define BOOL2TEXT(b) ((b) ? "Yes" : "No")
74 
75 #include "elf_common.h"
76 
77 /// Keep entries table per device.
78 struct FuncOrGblEntryTy {
79   __tgt_target_table Table;
80   std::vector<__tgt_offload_entry> Entries;
81 };
82 
83 /// Use a single entity to encode a kernel and a set of flags.
84 struct KernelTy {
85   CUfunction Func;
86 
  /// Execution mode of the kernel.
88   llvm::omp::OMPTgtExecModeFlags ExecutionMode;
89 
90   /// Maximal number of threads per block for this kernel.
91   int MaxThreadsPerBlock = 0;
92 
  KernelTy(CUfunction Func, llvm::omp::OMPTgtExecModeFlags ExecutionMode)
94       : Func(Func), ExecutionMode(ExecutionMode) {}
95 };
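// Note: pointers to KernelTy objects are what loadBinary() below stores as the
// target addresses of kernel offload entries, and runTargetTeamRegion() casts
// the incoming entry pointer back to KernelTy * to recover the CUfunction and
// execution mode.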
96 
97 namespace {
bool checkResult(CUresult Err, const char *ErrMsg) {
99   if (Err == CUDA_SUCCESS)
100     return true;
101 
102   REPORT("%s", ErrMsg);
103   CUDA_ERR_STRING(Err);
104   return false;
105 }
106 
int memcpyDtoD(const void *SrcPtr, void *DstPtr, int64_t Size,
108                CUstream Stream) {
109   CUresult Err =
110       cuMemcpyDtoDAsync((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, Size, Stream);
111 
112   if (Err != CUDA_SUCCESS) {
113     DP("Error when copying data from device to device. Pointers: src "
114        "= " DPxMOD ", dst = " DPxMOD ", size = %" PRId64 "\n",
115        DPxPTR(SrcPtr), DPxPTR(DstPtr), Size);
116     CUDA_ERR_STRING(Err);
117     return OFFLOAD_FAIL;
118   }
119 
120   return OFFLOAD_SUCCESS;
121 }
122 
int recordEvent(void *EventPtr, __tgt_async_info *AsyncInfo) {
124   CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue);
125   CUevent Event = reinterpret_cast<CUevent>(EventPtr);
126 
127   CUresult Err = cuEventRecord(Event, Stream);
128   if (Err != CUDA_SUCCESS) {
129     DP("Error when recording event. stream = " DPxMOD ", event = " DPxMOD "\n",
130        DPxPTR(Stream), DPxPTR(Event));
131     CUDA_ERR_STRING(Err);
132     return OFFLOAD_FAIL;
133   }
134 
135   return OFFLOAD_SUCCESS;
136 }
137 
int syncEvent(void *EventPtr) {
139   CUevent Event = reinterpret_cast<CUevent>(EventPtr);
140 
141   CUresult Err = cuEventSynchronize(Event);
142   if (Err != CUDA_SUCCESS) {
143     DP("Error when syncing event = " DPxMOD "\n", DPxPTR(Event));
144     CUDA_ERR_STRING(Err);
145     return OFFLOAD_FAIL;
146   }
147 
148   return OFFLOAD_SUCCESS;
149 }
150 
151 namespace {
152 
// Structure containing per-device data.
154 struct DeviceDataTy {
155   /// List that contains all the kernels.
156   std::list<KernelTy> KernelsList;
157 
158   std::list<FuncOrGblEntryTy> FuncGblEntries;
159 
160   CUcontext Context = nullptr;
161   // Device properties
162   unsigned int ThreadsPerBlock = 0;
163   unsigned int BlocksPerGrid = 0;
164   unsigned int WarpSize = 0;
165   // OpenMP properties
166   int NumTeams = 0;
167   int NumThreads = 0;
168 };
169 
/// Resource allocator where \p T is the resource type.
/// Functions \p create and \p destroy return OFFLOAD_SUCCESS on success and
/// OFFLOAD_FAIL on failure. The implementation should not raise any exception.
173 template <typename T> struct AllocatorTy {
174   using ElementTy = T;
  virtual ~AllocatorTy() {}
176 
177   /// Create a resource and assign to R.
178   virtual int create(T &R) noexcept = 0;
179   /// Destroy the resource.
180   virtual int destroy(T) noexcept = 0;
181 };
182 
183 /// Allocator for CUstream.
184 struct StreamAllocatorTy final : public AllocatorTy<CUstream> {
185   /// See AllocatorTy<T>::create.
  int create(CUstream &Stream) noexcept override {
187     if (!checkResult(cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING),
188                      "Error returned from cuStreamCreate\n"))
189       return OFFLOAD_FAIL;
190 
191     return OFFLOAD_SUCCESS;
192   }
193 
194   /// See AllocatorTy<T>::destroy.
  int destroy(CUstream Stream) noexcept override {
196     if (!checkResult(cuStreamDestroy(Stream),
197                      "Error returned from cuStreamDestroy\n"))
198       return OFFLOAD_FAIL;
199 
200     return OFFLOAD_SUCCESS;
201   }
202 };
203 
204 /// Allocator for CUevent.
205 struct EventAllocatorTy final : public AllocatorTy<CUevent> {
206   /// See AllocatorTy<T>::create.
  int create(CUevent &Event) noexcept override {
208     if (!checkResult(cuEventCreate(&Event, CU_EVENT_DEFAULT),
209                      "Error returned from cuEventCreate\n"))
210       return OFFLOAD_FAIL;
211 
212     return OFFLOAD_SUCCESS;
213   }
214 
215   /// See AllocatorTy<T>::destroy.
  int destroy(CUevent Event) noexcept override {
217     if (!checkResult(cuEventDestroy(Event),
218                      "Error returned from cuEventDestroy\n"))
219       return OFFLOAD_FAIL;
220 
221     return OFFLOAD_SUCCESS;
222   }
223 };
224 
/// A generic pool of resources where the resource type is defined by the
/// allocator type \p AllocTy. The resource type should be copyable as the
/// objects are stored in \p std::vector .
227 template <typename AllocTy> class ResourcePoolTy {
228   using ElementTy = typename AllocTy::ElementTy;
229   /// Index of the next available resource.
230   size_t Next = 0;
231   /// Mutex to guard the pool.
232   std::mutex Mutex;
  /// Pool of resources. The difference between \p Resources and \p Pool is
  /// that acquiring and releasing a resource only touches \p Resources, while
  /// newly created resources are added to both \p Resources and \p Pool. The
  /// reason for this split is that \p Resources could end up holding redundant
  /// elements when resources are not released, which could cause a double
  /// free. Keeping \p Pool makes sure that it always holds every resource
  /// allocated from the device.
240   std::vector<ElementTy> Resources;
241   std::vector<ElementTy> Pool;
242   /// A reference to the corresponding allocator.
243   AllocTy Allocator;
244 
  /// If `Resources` is used up, we will fill in more resources. This assumes
  /// that the new size \p Size is always larger than the current size.
  bool resize(size_t Size) {
248     assert(Resources.size() == Pool.size() && "size mismatch");
249     auto CurSize = Resources.size();
250     assert(Size > CurSize && "Unexpected smaller size");
251     Pool.reserve(Size);
252     Resources.reserve(Size);
253     for (auto I = CurSize; I < Size; ++I) {
254       ElementTy NewItem;
255       int Ret = Allocator.create(NewItem);
256       if (Ret != OFFLOAD_SUCCESS)
257         return false;
258       Pool.push_back(NewItem);
259       Resources.push_back(NewItem);
260     }
261     return true;
262   }
263 
264 public:
  ResourcePoolTy(AllocTy &&A, size_t Size = 0) noexcept
266       : Allocator(std::move(A)) {
267     if (Size)
268       (void)resize(Size);
269   }
270 
  ~ResourcePoolTy() noexcept { clear(); }
272 
  /// Get a resource from the pool. `Next` always points to the next available
  /// resource. That means `[0, Next-1]` have been assigned, and `[Next, ...]`
  /// are still available. If there is no resource left, we will ask for more.
  /// Each time a resource is assigned, `Next` is incremented by one.
277   /// xxxxxs+++++++++
278   ///      ^
279   ///      Next
280   /// After assignment, the pool becomes the following and s is assigned.
281   /// xxxxxs+++++++++
282   ///       ^
283   ///       Next
  int acquire(ElementTy &R) noexcept {
285     std::lock_guard<std::mutex> LG(Mutex);
286     if (Next == Resources.size()) {
287       auto NewSize = Resources.size() ? Resources.size() * 2 : 1;
288       if (!resize(NewSize))
289         return OFFLOAD_FAIL;
290     }
291 
292     assert(Next < Resources.size());
293 
294     R = Resources[Next++];
295 
296     return OFFLOAD_SUCCESS;
297   }
298 
  /// Return the resource back to the pool. When we return a resource, we need
  /// to first decrease `Next`, and then copy the resource back. It is worth
  /// noting that the order in which resources are returned might differ from
  /// the order in which they were assigned; that is, at some point there might
  /// be two identical resources in `Resources`:
  /// xxax+a+++++
  ///     ^
  ///     Next
  /// However, this does not matter because the duplicates are always on the
  /// two sides of `Next`; the one on the left will eventually be overwritten
  /// by another returned resource. Therefore, after several executions, the
  /// order of the pool might differ from its initial state.
  void release(ElementTy R) noexcept {
312     std::lock_guard<std::mutex> LG(Mutex);
313     Resources[--Next] = R;
314   }
315 
  /// Release all stored resources and clear the pool.
317   /// Note: This function is not thread safe. Be sure to guard it if necessary.
  void clear() noexcept {
319     for (auto &R : Pool)
320       (void)Allocator.destroy(R);
321     Pool.clear();
322     Resources.clear();
323   }
324 };
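// For illustration, a pool of streams built from the allocator above is used
// roughly as follows (a minimal sketch; DeviceRTLTy below keeps one such pool
// per device, sized by NumInitialStreams):
//
//   ResourcePoolTy<StreamAllocatorTy> Streams(StreamAllocatorTy(), /*Size=*/32);
//   CUstream S;
//   if (Streams.acquire(S) == OFFLOAD_SUCCESS) {
//     // ... enqueue asynchronous work on S ...
//     Streams.release(S);
//   }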
325 
326 } // namespace
327 
328 class DeviceRTLTy {
329   int NumberOfDevices;
330   // OpenMP environment properties
331   int EnvNumTeams;
332   int EnvTeamLimit;
333   int EnvTeamThreadLimit;
334   // OpenMP requires flags
335   int64_t RequiresFlags;
336   // Amount of dynamic shared memory to use at launch.
337   uint64_t DynamicMemorySize;
338 
339   /// Number of initial streams for each device.
340   int NumInitialStreams = 32;
341 
342   /// Number of initial events for each device.
343   int NumInitialEvents = 8;
344 
345   static constexpr const int32_t HardThreadLimit = 1024;
346   static constexpr const int32_t DefaultNumTeams = 128;
347   static constexpr const int32_t DefaultNumThreads = 128;
348 
349   using StreamPoolTy = ResourcePoolTy<StreamAllocatorTy>;
350   std::vector<std::unique_ptr<StreamPoolTy>> StreamPool;
351 
352   using EventPoolTy = ResourcePoolTy<EventAllocatorTy>;
353   std::vector<std::unique_ptr<EventPoolTy>> EventPool;
354 
355   std::vector<DeviceDataTy> DeviceData;
356   std::vector<std::vector<CUmodule>> Modules;
357 
  /// Vector of flags indicating the initialization status of all associated
  /// devices.
360   std::vector<bool> InitializedFlags;
361 
362   enum class PeerAccessState : uint8_t { Unkown, Yes, No };
363   std::vector<std::vector<PeerAccessState>> PeerAccessMatrix;
364   std::mutex PeerAccessMatrixLock;
365 
366   /// A class responsible for interacting with device native runtime library to
367   /// allocate and free memory.
368   class CUDADeviceAllocatorTy : public DeviceAllocatorTy {
369     std::unordered_map<void *, TargetAllocTy> HostPinnedAllocs;
370 
371   public:
    void *allocate(size_t Size, void *, TargetAllocTy Kind) override {
373       if (Size == 0)
374         return nullptr;
375 
376       void *MemAlloc = nullptr;
377       CUresult Err;
378       switch (Kind) {
379       case TARGET_ALLOC_DEFAULT:
380       case TARGET_ALLOC_DEVICE:
381         CUdeviceptr DevicePtr;
382         Err = cuMemAlloc(&DevicePtr, Size);
383         MemAlloc = (void *)DevicePtr;
384         if (!checkResult(Err, "Error returned from cuMemAlloc\n"))
385           return nullptr;
386         break;
387       case TARGET_ALLOC_HOST:
388         void *HostPtr;
389         Err = cuMemAllocHost(&HostPtr, Size);
390         MemAlloc = HostPtr;
391         if (!checkResult(Err, "Error returned from cuMemAllocHost\n"))
392           return nullptr;
393         HostPinnedAllocs[MemAlloc] = Kind;
394         break;
395       case TARGET_ALLOC_SHARED:
396         CUdeviceptr SharedPtr;
397         Err = cuMemAllocManaged(&SharedPtr, Size, CU_MEM_ATTACH_GLOBAL);
398         MemAlloc = (void *)SharedPtr;
399         if (!checkResult(Err, "Error returned from cuMemAllocManaged\n"))
400           return nullptr;
401         break;
402       }
403 
404       return MemAlloc;
405     }
406 
    int free(void *TgtPtr) override {
408       CUresult Err;
409       // Host pinned memory must be freed differently.
410       TargetAllocTy Kind =
411           (HostPinnedAllocs.find(TgtPtr) == HostPinnedAllocs.end())
412               ? TARGET_ALLOC_DEFAULT
413               : TARGET_ALLOC_HOST;
414       switch (Kind) {
415       case TARGET_ALLOC_DEFAULT:
416       case TARGET_ALLOC_DEVICE:
417       case TARGET_ALLOC_SHARED:
418         Err = cuMemFree((CUdeviceptr)TgtPtr);
419         if (!checkResult(Err, "Error returned from cuMemFree\n"))
420           return OFFLOAD_FAIL;
421         break;
422       case TARGET_ALLOC_HOST:
423         Err = cuMemFreeHost(TgtPtr);
424         if (!checkResult(Err, "Error returned from cuMemFreeHost\n"))
425           return OFFLOAD_FAIL;
426         break;
427       }
428 
429       return OFFLOAD_SUCCESS;
430     }
431   };
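  // For illustration, pinned host memory is obtained and released through this
  // allocator roughly as follows (a minimal sketch, not an RTL entry point; the
  // second argument to allocate() is unused here):
  //
  //   CUDADeviceAllocatorTy Allocator;
  //   void *Buf = Allocator.allocate(/*Size=*/1024, nullptr, TARGET_ALLOC_HOST);
  //   if (Buf)
  //     Allocator.free(Buf);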
432 
433   /// A vector of device allocators
434   std::vector<CUDADeviceAllocatorTy> DeviceAllocators;
435 
  /// A vector of memory managers. Since the memory manager is non-copyable and
  /// non-movable, we wrap them into std::unique_ptr.
438   std::vector<std::unique_ptr<MemoryManagerTy>> MemoryManagers;
439 
  /// Whether to use the memory manager.
441   bool UseMemoryManager = true;
442 
443   // Record entry point associated with device
  void addOffloadEntry(const int DeviceId, const __tgt_offload_entry Entry) {
445     FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
446     E.Entries.push_back(Entry);
447   }
448 
449   // Return a pointer to the entry associated with the pointer
  const __tgt_offload_entry *getOffloadEntry(const int DeviceId,
451                                              const void *Addr) const {
452     for (const __tgt_offload_entry &Itr :
453          DeviceData[DeviceId].FuncGblEntries.back().Entries)
454       if (Itr.addr == Addr)
455         return &Itr;
456 
457     return nullptr;
458   }
459 
460   // Return the pointer to the target entries table
  __tgt_target_table *getOffloadEntriesTable(const int DeviceId) {
462     FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
463 
464     if (E.Entries.empty())
465       return nullptr;
466 
467     // Update table info according to the entries and return the pointer
468     E.Table.EntriesBegin = E.Entries.data();
469     E.Table.EntriesEnd = E.Entries.data() + E.Entries.size();
470 
471     return &E.Table;
472   }
473 
474   // Clear entries table for a device
  void clearOffloadEntriesTable(const int DeviceId) {
476     DeviceData[DeviceId].FuncGblEntries.emplace_back();
477     FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
478     E.Entries.clear();
479     E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr;
480   }
481 
482 public:
  CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfo) const {
484     assert(AsyncInfo && "AsyncInfo is nullptr");
485 
486     if (!AsyncInfo->Queue) {
487       CUstream S;
488       if (StreamPool[DeviceId]->acquire(S) != OFFLOAD_SUCCESS)
489         return nullptr;
490 
491       AsyncInfo->Queue = S;
492     }
493 
494     return reinterpret_cast<CUstream>(AsyncInfo->Queue);
495   }
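  // Note: a stream acquired here remains attached to the __tgt_async_info; it
  // is returned to the pool and the queue reset in synchronize() below, once
  // the queued work has completed.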
496 
497   // This class should not be copied
498   DeviceRTLTy(const DeviceRTLTy &) = delete;
499   DeviceRTLTy(DeviceRTLTy &&) = delete;
500 
  DeviceRTLTy()
502       : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1),
503         EnvTeamThreadLimit(-1), RequiresFlags(OMP_REQ_UNDEFINED),
504         DynamicMemorySize(0) {
505 
506     DP("Start initializing CUDA\n");
507 
508     CUresult Err = cuInit(0);
509     if (Err == CUDA_ERROR_INVALID_HANDLE) {
510       // Can't call cuGetErrorString if dlsym failed
511       DP("Failed to load CUDA shared library\n");
512       return;
513     }
514     if (Err == CUDA_ERROR_NO_DEVICE) {
515       DP("There are no devices supporting CUDA.\n");
516       return;
517     }
518     if (!checkResult(Err, "Error returned from cuInit\n")) {
519       return;
520     }
521 
522     Err = cuDeviceGetCount(&NumberOfDevices);
523     if (!checkResult(Err, "Error returned from cuDeviceGetCount\n"))
524       return;
525 
526     if (NumberOfDevices == 0) {
527       DP("There are no devices supporting CUDA.\n");
528       return;
529     }
530 
531     DeviceData.resize(NumberOfDevices);
532     Modules.resize(NumberOfDevices);
533     StreamPool.resize(NumberOfDevices);
534     EventPool.resize(NumberOfDevices);
535     PeerAccessMatrix.resize(NumberOfDevices);
536     for (auto &V : PeerAccessMatrix)
537       V.resize(NumberOfDevices, PeerAccessState::Unkown);
538 
539     // Get environment variables regarding teams
540     if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) {
541       // OMP_TEAM_LIMIT has been set
542       EnvTeamLimit = std::stoi(EnvStr);
543       DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
544     }
545     if (const char *EnvStr = getenv("OMP_TEAMS_THREAD_LIMIT")) {
546       // OMP_TEAMS_THREAD_LIMIT has been set
547       EnvTeamThreadLimit = std::stoi(EnvStr);
548       DP("Parsed OMP_TEAMS_THREAD_LIMIT=%d\n", EnvTeamThreadLimit);
549     }
550     if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) {
551       // OMP_NUM_TEAMS has been set
552       EnvNumTeams = std::stoi(EnvStr);
553       DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
554     }
555     if (const char *EnvStr = getenv("LIBOMPTARGET_SHARED_MEMORY_SIZE")) {
556       // LIBOMPTARGET_SHARED_MEMORY_SIZE has been set
557       DynamicMemorySize = std::stoi(EnvStr);
558       DP("Parsed LIBOMPTARGET_SHARED_MEMORY_SIZE = %" PRIu64 "\n",
559          DynamicMemorySize);
560     }
561     if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS")) {
562       // LIBOMPTARGET_NUM_INITIAL_STREAMS has been set
563       NumInitialStreams = std::stoi(EnvStr);
564       DP("Parsed LIBOMPTARGET_NUM_INITIAL_STREAMS=%d\n", NumInitialStreams);
565     }
566 
567     for (int I = 0; I < NumberOfDevices; ++I)
568       DeviceAllocators.emplace_back();
569 
570     // Get the size threshold from environment variable
571     std::pair<size_t, bool> Res = MemoryManagerTy::getSizeThresholdFromEnv();
572     UseMemoryManager = Res.second;
573     size_t MemoryManagerThreshold = Res.first;
574 
575     if (UseMemoryManager)
576       for (int I = 0; I < NumberOfDevices; ++I)
577         MemoryManagers.emplace_back(std::make_unique<MemoryManagerTy>(
578             DeviceAllocators[I], MemoryManagerThreshold));
579 
580     // We lazily initialize all devices later.
581     InitializedFlags.assign(NumberOfDevices, false);
582   }
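  // For illustration, the environment variables parsed above are set when
  // launching an offloading application, e.g. (values are examples only):
  //
  //   OMP_NUM_TEAMS=256 OMP_TEAMS_THREAD_LIMIT=512 ./app
  //   LIBOMPTARGET_SHARED_MEMORY_SIZE=4096 ./app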
583 
  ~DeviceRTLTy() {
585     for (int DeviceId = 0; DeviceId < NumberOfDevices; ++DeviceId)
586       deinitDevice(DeviceId);
587   }
588 
589   // Check whether a given DeviceId is valid
  bool isValidDeviceId(const int DeviceId) const {
591     return DeviceId >= 0 && DeviceId < NumberOfDevices;
592   }
593 
  int getNumOfDevices() const { return NumberOfDevices; }
595 
  void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; }
597 
  int initDevice(const int DeviceId) {
599     CUdevice Device;
600 
601     DP("Getting device %d\n", DeviceId);
602     CUresult Err = cuDeviceGet(&Device, DeviceId);
603     if (!checkResult(Err, "Error returned from cuDeviceGet\n"))
604       return OFFLOAD_FAIL;
605 
606     assert(InitializedFlags[DeviceId] == false && "Reinitializing device!");
607     InitializedFlags[DeviceId] = true;
608 
609     // Query the current flags of the primary context and set its flags if
610     // it is inactive
611     unsigned int FormerPrimaryCtxFlags = 0;
612     int FormerPrimaryCtxIsActive = 0;
613     Err = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags,
614                                      &FormerPrimaryCtxIsActive);
615     if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxGetState\n"))
616       return OFFLOAD_FAIL;
617 
618     if (FormerPrimaryCtxIsActive) {
619       DP("The primary context is active, no change to its flags\n");
620       if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) !=
621           CU_CTX_SCHED_BLOCKING_SYNC)
        DP("Warning: the current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n");
623     } else {
624       DP("The primary context is inactive, set its flags to "
625          "CU_CTX_SCHED_BLOCKING_SYNC\n");
626       Err = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC);
627       if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxSetFlags\n"))
628         return OFFLOAD_FAIL;
629     }
630 
631     // Retain the per device primary context and save it to use whenever this
632     // device is selected.
633     Err = cuDevicePrimaryCtxRetain(&DeviceData[DeviceId].Context, Device);
634     if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxRetain\n"))
635       return OFFLOAD_FAIL;
636 
637     Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
638     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
639       return OFFLOAD_FAIL;
640 
641     // Initialize the stream pool.
642     if (!StreamPool[DeviceId])
643       StreamPool[DeviceId] = std::make_unique<StreamPoolTy>(StreamAllocatorTy(),
644                                                             NumInitialStreams);
645 
646     // Initialize the event pool.
647     if (!EventPool[DeviceId])
648       EventPool[DeviceId] =
649           std::make_unique<EventPoolTy>(EventAllocatorTy(), NumInitialEvents);
650 
651     // Query attributes to determine number of threads/block and blocks/grid.
652     int MaxGridDimX;
653     Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
654                                Device);
655     if (Err != CUDA_SUCCESS) {
656       DP("Error getting max grid dimension, use default value %d\n",
657          DeviceRTLTy::DefaultNumTeams);
658       DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::DefaultNumTeams;
659     } else {
660       DP("Using %d CUDA blocks per grid\n", MaxGridDimX);
661       DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX;
662     }
663 
664     // We are only exploiting threads along the x axis.
665     int MaxBlockDimX;
666     Err = cuDeviceGetAttribute(&MaxBlockDimX,
667                                CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device);
668     if (Err != CUDA_SUCCESS) {
669       DP("Error getting max block dimension, use default value %d\n",
670          DeviceRTLTy::DefaultNumThreads);
671       DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads;
672     } else {
673       DP("Using %d CUDA threads per block\n", MaxBlockDimX);
674       DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX;
675 
676       if (EnvTeamThreadLimit > 0 &&
677           DeviceData[DeviceId].ThreadsPerBlock > EnvTeamThreadLimit) {
678         DP("Max CUDA threads per block %d exceeds the thread limit %d set by "
679            "OMP_TEAMS_THREAD_LIMIT, capping at the limit\n",
680            DeviceData[DeviceId].ThreadsPerBlock, EnvTeamThreadLimit);
681         DeviceData[DeviceId].ThreadsPerBlock = EnvTeamThreadLimit;
682       }
683       if (DeviceData[DeviceId].ThreadsPerBlock > DeviceRTLTy::HardThreadLimit) {
684         DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
685            "capping at the hard limit\n",
686            DeviceData[DeviceId].ThreadsPerBlock, DeviceRTLTy::HardThreadLimit);
687         DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit;
688       }
689     }
690 
691     // Get and set warp size
692     int WarpSize;
693     Err =
694         cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device);
695     if (Err != CUDA_SUCCESS) {
696       DP("Error getting warp size, assume default value 32\n");
697       DeviceData[DeviceId].WarpSize = 32;
698     } else {
699       DP("Using warp size %d\n", WarpSize);
700       DeviceData[DeviceId].WarpSize = WarpSize;
701     }
702 
703     // Adjust teams to the env variables
704     if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) {
705       DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
706          EnvTeamLimit);
707       DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit;
708     }
709 
710     size_t StackLimit;
711     size_t HeapLimit;
712     if (const char *EnvStr = getenv("LIBOMPTARGET_STACK_SIZE")) {
713       StackLimit = std::stol(EnvStr);
714       if (cuCtxSetLimit(CU_LIMIT_STACK_SIZE, StackLimit) != CUDA_SUCCESS)
715         return OFFLOAD_FAIL;
716     } else {
717       if (cuCtxGetLimit(&StackLimit, CU_LIMIT_STACK_SIZE) != CUDA_SUCCESS)
718         return OFFLOAD_FAIL;
719     }
720     if (const char *EnvStr = getenv("LIBOMPTARGET_HEAP_SIZE")) {
721       HeapLimit = std::stol(EnvStr);
722       if (cuCtxSetLimit(CU_LIMIT_MALLOC_HEAP_SIZE, HeapLimit) != CUDA_SUCCESS)
723         return OFFLOAD_FAIL;
724     } else {
725       if (cuCtxGetLimit(&HeapLimit, CU_LIMIT_MALLOC_HEAP_SIZE) != CUDA_SUCCESS)
726         return OFFLOAD_FAIL;
727     }
728 
729     INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
730          "Device supports up to %d CUDA blocks and %d threads with a "
731          "warp size of %d\n",
732          DeviceData[DeviceId].BlocksPerGrid,
733          DeviceData[DeviceId].ThreadsPerBlock, DeviceData[DeviceId].WarpSize);
734     INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
735          "Device heap size is %d Bytes, device stack size is %d Bytes per "
736          "thread\n",
737          (int)HeapLimit, (int)StackLimit);
738 
739     // Set default number of teams
740     if (EnvNumTeams > 0) {
741       DP("Default number of teams set according to environment %d\n",
742          EnvNumTeams);
743       DeviceData[DeviceId].NumTeams = EnvNumTeams;
744     } else {
745       DeviceData[DeviceId].NumTeams = DeviceRTLTy::DefaultNumTeams;
746       DP("Default number of teams set according to library's default %d\n",
747          DeviceRTLTy::DefaultNumTeams);
748     }
749 
750     if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) {
751       DP("Default number of teams exceeds device limit, capping at %d\n",
752          DeviceData[DeviceId].BlocksPerGrid);
753       DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid;
754     }
755 
756     // Set default number of threads
757     DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads;
758     DP("Default number of threads set according to library's default %d\n",
759        DeviceRTLTy::DefaultNumThreads);
760     if (DeviceData[DeviceId].NumThreads >
761         DeviceData[DeviceId].ThreadsPerBlock) {
762       DP("Default number of threads exceeds device limit, capping at %d\n",
763          DeviceData[DeviceId].ThreadsPerBlock);
764       DeviceData[DeviceId].NumThreads = DeviceData[DeviceId].ThreadsPerBlock;
765     }
766 
767     return OFFLOAD_SUCCESS;
768   }
769 
  int deinitDevice(const int DeviceId) {
771     auto IsInitialized = InitializedFlags[DeviceId];
772     if (!IsInitialized)
773       return OFFLOAD_SUCCESS;
774     InitializedFlags[DeviceId] = false;
775 
776     if (UseMemoryManager)
777       MemoryManagers[DeviceId].release();
778 
779     StreamPool[DeviceId].reset();
780     EventPool[DeviceId].reset();
781 
782     DeviceDataTy &D = DeviceData[DeviceId];
783     if (!checkResult(cuCtxSetCurrent(D.Context),
784                      "Error returned from cuCtxSetCurrent\n"))
785       return OFFLOAD_FAIL;
786 
787     // Unload all modules.
788     for (auto &M : Modules[DeviceId])
789       if (!checkResult(cuModuleUnload(M),
790                        "Error returned from cuModuleUnload\n"))
791         return OFFLOAD_FAIL;
792 
793     // Destroy context.
794     CUdevice Device;
795     if (!checkResult(cuCtxGetDevice(&Device),
796                      "Error returned from cuCtxGetDevice\n"))
797       return OFFLOAD_FAIL;
798 
799     if (!checkResult(cuDevicePrimaryCtxRelease(Device),
800                      "Error returned from cuDevicePrimaryCtxRelease\n"))
801       return OFFLOAD_FAIL;
802 
803     return OFFLOAD_SUCCESS;
804   }
805 
  __tgt_target_table *loadBinary(const int DeviceId,
807                                  const __tgt_device_image *Image) {
808     // Clear the offload table as we are going to create a new one.
809     clearOffloadEntriesTable(DeviceId);
810 
811     // Create the module and extract the function pointers.
812     CUmodule Module;
813     DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart));
814     CUresult Err =
815         cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr);
816     if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n"))
817       return nullptr;
818 
819     DP("CUDA module successfully loaded!\n");
820 
821     Modules[DeviceId].push_back(Module);
822 
823     // Find the symbols in the module by name.
824     const __tgt_offload_entry *HostBegin = Image->EntriesBegin;
825     const __tgt_offload_entry *HostEnd = Image->EntriesEnd;
826 
827     std::list<KernelTy> &KernelsList = DeviceData[DeviceId].KernelsList;
828     for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) {
829       if (!E->addr) {
        // We return nullptr when something like this happens; the host should
        // always have something in the address to uniquely identify the target
        // region.
833         DP("Invalid binary: host entry '<null>' (size = %zd)...\n", E->size);
834         return nullptr;
835       }
836 
837       if (E->size) {
838         __tgt_offload_entry Entry = *E;
839         CUdeviceptr CUPtr;
840         size_t CUSize;
841         Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name);
842         // We keep this style here because we need the name
843         if (Err != CUDA_SUCCESS) {
844           REPORT("Loading global '%s' Failed\n", E->name);
845           CUDA_ERR_STRING(Err);
846           return nullptr;
847         }
848 
849         if (CUSize != E->size) {
850           DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name,
851              CUSize, E->size);
852           return nullptr;
853         }
854 
855         DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
856            DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr));
857 
858         Entry.addr = (void *)(CUPtr);
859 
860         // Note: In the current implementation declare target variables
861         // can either be link or to. This means that once unified
862         // memory is activated via the requires directive, the variable
863         // can be used directly from the host in both cases.
864         // TODO: when variables types other than to or link are added,
865         // the below condition should be changed to explicitly
866         // check for to and link variables types:
867         // (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags &
868         // OMP_DECLARE_TARGET_LINK || e->flags == OMP_DECLARE_TARGET_TO))
869         if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
870           // If unified memory is present any target link or to variables
871           // can access host addresses directly. There is no longer a
872           // need for device copies.
873           cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *));
874           DP("Copy linked variable host address (" DPxMOD
875              ") to device address (" DPxMOD ")\n",
876              DPxPTR(*((void **)E->addr)), DPxPTR(CUPtr));
877         }
878 
879         addOffloadEntry(DeviceId, Entry);
880 
881         continue;
882       }
883 
884       CUfunction Func;
885       Err = cuModuleGetFunction(&Func, Module, E->name);
886       // We keep this style here because we need the name
887       if (Err != CUDA_SUCCESS) {
888         REPORT("Loading '%s' Failed\n", E->name);
889         CUDA_ERR_STRING(Err);
890         return nullptr;
891       }
892 
893       DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
894          DPxPTR(E - HostBegin), E->name, DPxPTR(Func));
895 
      // Default value GENERIC (in case the symbol is missing from the cubin).
      llvm::omp::OMPTgtExecModeFlags ExecModeVal =
          llvm::omp::OMP_TGT_EXEC_MODE_GENERIC;
898       std::string ExecModeNameStr(E->name);
899       ExecModeNameStr += "_exec_mode";
900       const char *ExecModeName = ExecModeNameStr.c_str();
901 
902       CUdeviceptr ExecModePtr;
903       size_t CUSize;
904       Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName);
905       if (Err == CUDA_SUCCESS) {
906         if (CUSize != sizeof(llvm::omp::OMPTgtExecModeFlags)) {
907           DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
908              ExecModeName, CUSize, sizeof(llvm::omp::OMPTgtExecModeFlags));
909           return nullptr;
910         }
911 
912         Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize);
913         if (Err != CUDA_SUCCESS) {
914           REPORT("Error when copying data from device to host. Pointers: "
915                  "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
916                  DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize);
917           CUDA_ERR_STRING(Err);
918           return nullptr;
919         }
920       } else {
921         DP("Loading global exec_mode '%s' - symbol missing, using default "
922            "value GENERIC (1)\n",
923            ExecModeName);
924       }
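      // For illustration, the device image carries one such global per kernel,
      // schematically (the symbol name is made up; the value is an
      // llvm::omp::OMPTgtExecModeFlags where 1 corresponds to GENERIC):
      //
      //   __device__ int8_t __omp_offloading_xx_foo_l1_exec_mode = 1;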
925 
926       KernelsList.emplace_back(Func, ExecModeVal);
927 
928       __tgt_offload_entry Entry = *E;
929       Entry.addr = &KernelsList.back();
930       addOffloadEntry(DeviceId, Entry);
931     }
932 
933     // send device environment data to the device
934     {
935       // TODO: The device ID used here is not the real device ID used by OpenMP.
936       DeviceEnvironmentTy DeviceEnv{0, static_cast<uint32_t>(NumberOfDevices),
937                                     static_cast<uint32_t>(DeviceId),
938                                     static_cast<uint32_t>(DynamicMemorySize)};
939 
940       if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG"))
941         DeviceEnv.DebugKind = std::stoi(EnvStr);
942 
943       const char *DeviceEnvName = "omptarget_device_environment";
944       CUdeviceptr DeviceEnvPtr;
945       size_t CUSize;
946 
947       Err = cuModuleGetGlobal(&DeviceEnvPtr, &CUSize, Module, DeviceEnvName);
948       if (Err == CUDA_SUCCESS) {
949         if (CUSize != sizeof(DeviceEnv)) {
          REPORT(
              "Global device_environment '%s' - size mismatch (%zu != %zu)\n",
              DeviceEnvName, CUSize, sizeof(DeviceEnv));
953           CUDA_ERR_STRING(Err);
954           return nullptr;
955         }
956 
957         Err = cuMemcpyHtoD(DeviceEnvPtr, &DeviceEnv, CUSize);
958         if (Err != CUDA_SUCCESS) {
959           REPORT("Error when copying data from host to device. Pointers: "
960                  "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
961                  DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize);
962           CUDA_ERR_STRING(Err);
963           return nullptr;
964         }
965 
966         DP("Sending global device environment data %zu bytes\n", CUSize);
967       } else {
968         DP("Finding global device environment '%s' - symbol missing.\n",
969            DeviceEnvName);
970         DP("Continue, considering this is a device RTL which does not accept "
971            "environment setting.\n");
972       }
973     }
974 
975     return getOffloadEntriesTable(DeviceId);
976   }
977 
  void *dataAlloc(const int DeviceId, const int64_t Size,
979                   const TargetAllocTy Kind) {
980     switch (Kind) {
981     case TARGET_ALLOC_DEFAULT:
982     case TARGET_ALLOC_DEVICE:
983       if (UseMemoryManager)
984         return MemoryManagers[DeviceId]->allocate(Size, nullptr);
985       else
986         return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
987     case TARGET_ALLOC_HOST:
988     case TARGET_ALLOC_SHARED:
989       return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
990     }
991 
992     REPORT("Invalid target data allocation kind or requested allocator not "
993            "implemented yet\n");
994 
995     return nullptr;
996   }
997 
  int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr,
999                  const int64_t Size, __tgt_async_info *AsyncInfo) const {
1000     assert(AsyncInfo && "AsyncInfo is nullptr");
1001 
1002     CUstream Stream = getStream(DeviceId, AsyncInfo);
1003     CUresult Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream);
1004     if (Err != CUDA_SUCCESS) {
1005       DP("Error when copying data from host to device. Pointers: host "
1006          "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
1007          DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
1008       CUDA_ERR_STRING(Err);
1009       return OFFLOAD_FAIL;
1010     }
1011 
1012     return OFFLOAD_SUCCESS;
1013   }
1014 
  int dataRetrieve(const int DeviceId, void *HstPtr, const void *TgtPtr,
1016                    const int64_t Size, __tgt_async_info *AsyncInfo) const {
1017     assert(AsyncInfo && "AsyncInfo is nullptr");
1018 
1019     CUstream Stream = getStream(DeviceId, AsyncInfo);
1020     CUresult Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
1021     if (Err != CUDA_SUCCESS) {
1022       DP("Error when copying data from device to host. Pointers: host "
1023          "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
1024          DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
1025       CUDA_ERR_STRING(Err);
1026       return OFFLOAD_FAIL;
1027     }
1028 
1029     return OFFLOAD_SUCCESS;
1030   }
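  // Note: dataSubmit() and dataRetrieve() only enqueue the copy on the stream
  // associated with the __tgt_async_info; the transfer is only guaranteed to
  // have completed after synchronize() is called with the same AsyncInfo.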
1031 
  int dataExchange(int SrcDevId, const void *SrcPtr, int DstDevId, void *DstPtr,
1033                    int64_t Size, __tgt_async_info *AsyncInfo) {
1034     assert(AsyncInfo && "AsyncInfo is nullptr");
1035 
1036     CUresult Err;
1037     CUstream Stream = getStream(SrcDevId, AsyncInfo);
1038 
    // If these are two different devices, we try a peer-to-peer copy first.
1040     if (SrcDevId != DstDevId) {
1041       std::lock_guard<std::mutex> LG(PeerAccessMatrixLock);
1042 
1043       switch (PeerAccessMatrix[SrcDevId][DstDevId]) {
1044       case PeerAccessState::No: {
1045         REPORT("Peer access from %" PRId32 " to %" PRId32
1046                " is not supported. Fall back to D2D memcpy.\n",
1047                SrcDevId, DstDevId);
1048         return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
1049       }
1050       case PeerAccessState::Unkown: {
1051         int CanAccessPeer = 0;
1052         Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId);
1053         if (Err != CUDA_SUCCESS) {
1054           REPORT("Error returned from cuDeviceCanAccessPeer. src = %" PRId32
1055                  ", dst = %" PRId32 ". Fall back to D2D memcpy.\n",
1056                  SrcDevId, DstDevId);
1057           CUDA_ERR_STRING(Err);
1058           PeerAccessMatrix[SrcDevId][DstDevId] = PeerAccessState::No;
1059           return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
1060         }
1061 
1062         if (!CanAccessPeer) {
1063           REPORT("P2P access from %d to %d is not supported. Fall back to D2D "
1064                  "memcpy.\n",
1065                  SrcDevId, DstDevId);
1066           PeerAccessMatrix[SrcDevId][DstDevId] = PeerAccessState::No;
1067           return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
1068         }
1069 
1070         Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0);
1071         if (Err != CUDA_SUCCESS) {
1072           REPORT("Error returned from cuCtxEnablePeerAccess. src = %" PRId32
1073                  ", dst = %" PRId32 ". Fall back to D2D memcpy.\n",
1074                  SrcDevId, DstDevId);
1075           CUDA_ERR_STRING(Err);
1076           PeerAccessMatrix[SrcDevId][DstDevId] = PeerAccessState::No;
1077           return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
1078         }
1079 
1080         PeerAccessMatrix[SrcDevId][DstDevId] = PeerAccessState::Yes;
1081 
1082         LLVM_FALLTHROUGH;
1083       }
1084       case PeerAccessState::Yes: {
1085         Err = cuMemcpyPeerAsync(
1086             (CUdeviceptr)DstPtr, DeviceData[DstDevId].Context,
1087             (CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context, Size, Stream);
1088         if (Err == CUDA_SUCCESS)
1089           return OFFLOAD_SUCCESS;
1090 
1091         DP("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD
1092            ", src_id =%" PRId32 ", dst_ptr = " DPxMOD ", dst_id =%" PRId32
1093            ". Fall back to D2D memcpy.\n",
1094            DPxPTR(SrcPtr), SrcDevId, DPxPTR(DstPtr), DstDevId);
1095         CUDA_ERR_STRING(Err);
1096 
1097         return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
1098       }
1099       }
1100     }
1101 
1102     return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
1103   }
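  // Note: the result of the peer-access query is cached in PeerAccessMatrix,
  // so cuDeviceCanAccessPeer()/cuCtxEnablePeerAccess() are issued at most once
  // per (source, destination) device pair.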
1104 
  int dataDelete(const int DeviceId, void *TgtPtr) {
1106     if (UseMemoryManager)
1107       return MemoryManagers[DeviceId]->free(TgtPtr);
1108 
1109     return DeviceAllocators[DeviceId].free(TgtPtr);
1110   }
1111 
  int runTargetTeamRegion(const int DeviceId, void *TgtEntryPtr, void **TgtArgs,
1113                           ptrdiff_t *TgtOffsets, const int ArgNum,
1114                           const int TeamNum, const int ThreadLimit,
1115                           const unsigned int LoopTripCount,
1116                           __tgt_async_info *AsyncInfo) const {
1117     // All args are references.
1118     std::vector<void *> Args(ArgNum);
1119     std::vector<void *> Ptrs(ArgNum);
1120 
1121     for (int I = 0; I < ArgNum; ++I) {
1122       Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
1123       Args[I] = &Ptrs[I];
1124     }
1125 
1126     KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr);
1127 
1128     const bool IsSPMDGenericMode =
1129         KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD;
1130     const bool IsSPMDMode =
1131         KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_SPMD;
1132     const bool IsGenericMode =
1133         KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC;
1134 
1135     int CudaThreadsPerBlock;
1136     if (ThreadLimit > 0) {
1137       DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
1138       CudaThreadsPerBlock = ThreadLimit;
1139       // Add master warp if necessary
1140       if (IsGenericMode) {
1141         DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize);
1142         CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
1143       }
1144     } else {
1145       DP("Setting CUDA threads per block to default %d\n",
1146          DeviceData[DeviceId].NumThreads);
1147       CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads;
1148     }
1149 
1150     if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) {
1151       DP("Threads per block capped at device limit %d\n",
1152          DeviceData[DeviceId].ThreadsPerBlock);
1153       CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock;
1154     }
1155 
1156     CUresult Err;
1157     if (!KernelInfo->MaxThreadsPerBlock) {
1158       Err = cuFuncGetAttribute(&KernelInfo->MaxThreadsPerBlock,
1159                                CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
1160                                KernelInfo->Func);
1161       if (!checkResult(Err, "Error returned from cuFuncGetAttribute\n"))
1162         return OFFLOAD_FAIL;
1163     }
1164 
1165     if (KernelInfo->MaxThreadsPerBlock < CudaThreadsPerBlock) {
1166       DP("Threads per block capped at kernel limit %d\n",
1167          KernelInfo->MaxThreadsPerBlock);
1168       CudaThreadsPerBlock = KernelInfo->MaxThreadsPerBlock;
1169     }
1170 
1171     unsigned int CudaBlocksPerGrid;
1172     if (TeamNum <= 0) {
1173       if (LoopTripCount > 0 && EnvNumTeams < 0) {
1174         if (IsSPMDGenericMode) {
1175           // If we reach this point, then we are executing a kernel that was
1176           // transformed from Generic-mode to SPMD-mode. This kernel has
1177           // SPMD-mode execution, but needs its blocks to be scheduled
1178           // differently because the current loop trip count only applies to the
          // `teams distribute` region and will create far too few blocks using
1180           // the regular SPMD-mode method.
1181           CudaBlocksPerGrid = LoopTripCount;
1182         } else if (IsSPMDMode) {
1183           // We have a combined construct, i.e. `target teams distribute
          // parallel for [simd]`. We launch as many teams as needed so that
          // each thread will execute one iteration of the loop. Round up to
          // the nearest integer.
1187           CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1;
1188         } else if (IsGenericMode) {
1189           // If we reach this point, then we have a non-combined construct, i.e.
1190           // `teams distribute` with a nested `parallel for` and each team is
1191           // assigned one iteration of the `distribute` loop. E.g.:
1192           //
1193           // #pragma omp target teams distribute
1194           // for(...loop_tripcount...) {
1195           //   #pragma omp parallel for
1196           //   for(...) {}
1197           // }
1198           //
1199           // Threads within a team will execute the iterations of the `parallel`
1200           // loop.
1201           CudaBlocksPerGrid = LoopTripCount;
1202         } else {
1203           REPORT("Unknown execution mode: %d\n",
1204                  static_cast<int8_t>(KernelInfo->ExecutionMode));
1205           return OFFLOAD_FAIL;
1206         }
1207         DP("Using %d teams due to loop trip count %" PRIu32
1208            " and number of threads per block %d\n",
1209            CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock);
1210       } else {
1211         DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams);
1212         CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams;
1213       }
1214     } else {
1215       DP("Using requested number of teams %d\n", TeamNum);
1216       CudaBlocksPerGrid = TeamNum;
1217     }
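    // For illustration: in SPMD mode with no requested team count,
    // LoopTripCount = 10000 and CudaThreadsPerBlock = 128, the computation
    // above requests ((10000 - 1) / 128) + 1 = 79 blocks, i.e. one loop
    // iteration per thread.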
1218 
1219     if (CudaBlocksPerGrid > DeviceData[DeviceId].BlocksPerGrid) {
1220       DP("Capping number of teams to team limit %d\n",
1221          DeviceData[DeviceId].BlocksPerGrid);
1222       CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid;
1223     }
1224 
1225     INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
1226          "Launching kernel %s with %d blocks and %d threads in %s mode\n",
1227          (getOffloadEntry(DeviceId, TgtEntryPtr))
1228              ? getOffloadEntry(DeviceId, TgtEntryPtr)->name
1229              : "(null)",
1230          CudaBlocksPerGrid, CudaThreadsPerBlock,
1231          (!IsSPMDMode ? (IsGenericMode ? "Generic" : "SPMD-Generic") : "SPMD"));
1232 
1233     CUstream Stream = getStream(DeviceId, AsyncInfo);
1234     Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1,
1235                          /* gridDimZ */ 1, CudaThreadsPerBlock,
1236                          /* blockDimY */ 1, /* blockDimZ */ 1,
1237                          DynamicMemorySize, Stream, &Args[0], nullptr);
1238     if (!checkResult(Err, "Error returned from cuLaunchKernel\n"))
1239       return OFFLOAD_FAIL;
1240 
1241     DP("Launch of entry point at " DPxMOD " successful!\n",
1242        DPxPTR(TgtEntryPtr));
1243 
1244     return OFFLOAD_SUCCESS;
1245   }
1246 
  int synchronize(const int DeviceId, __tgt_async_info *AsyncInfo) const {
1248     CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue);
1249     CUresult Err = cuStreamSynchronize(Stream);
1250 
1251     // Once the stream is synchronized, return it to stream pool and reset
1252     // AsyncInfo. This is to make sure the synchronization only works for its
1253     // own tasks.
1254     StreamPool[DeviceId]->release(reinterpret_cast<CUstream>(AsyncInfo->Queue));
1255     AsyncInfo->Queue = nullptr;
1256 
1257     if (Err != CUDA_SUCCESS) {
1258       DP("Error when synchronizing stream. stream = " DPxMOD
1259          ", async info ptr = " DPxMOD "\n",
1260          DPxPTR(Stream), DPxPTR(AsyncInfo));
1261       CUDA_ERR_STRING(Err);
1262     }
1263     return (Err == CUDA_SUCCESS) ? OFFLOAD_SUCCESS : OFFLOAD_FAIL;
1264   }
1265 
  void printDeviceInfo(int32_t DeviceId) {
1267     char TmpChar[1000];
1268     std::string TmpStr;
1269     size_t TmpSt;
1270     int TmpInt, TmpInt2, TmpInt3;
1271 
1272     CUdevice Device;
    checkResult(cuDeviceGet(&Device, DeviceId),
                "Error returned from cuDeviceGet\n");
1275 
1276     cuDriverGetVersion(&TmpInt);
1277     printf("    CUDA Driver Version: \t\t%d \n", TmpInt);
1278     printf("    CUDA Device Number: \t\t%d \n", DeviceId);
1279     checkResult(cuDeviceGetName(TmpChar, 1000, Device),
1280                 "Error returned from cuDeviceGetName\n");
1281     printf("    Device Name: \t\t\t%s \n", TmpChar);
1282     checkResult(cuDeviceTotalMem(&TmpSt, Device),
1283                 "Error returned from cuDeviceTotalMem\n");
1284     printf("    Global Memory Size: \t\t%zu bytes \n", TmpSt);
1285     checkResult(cuDeviceGetAttribute(
1286                     &TmpInt, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, Device),
1287                 "Error returned from cuDeviceGetAttribute\n");
1288     printf("    Number of Multiprocessors: \t\t%d \n", TmpInt);
1289     checkResult(
1290         cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, Device),
1291         "Error returned from cuDeviceGetAttribute\n");
1292     printf("    Concurrent Copy and Execution: \t%s \n", BOOL2TEXT(TmpInt));
1293     checkResult(cuDeviceGetAttribute(
1294                     &TmpInt, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, Device),
1295                 "Error returned from cuDeviceGetAttribute\n");
1296     printf("    Total Constant Memory: \t\t%d bytes\n", TmpInt);
1297     checkResult(
1298         cuDeviceGetAttribute(
1299             &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, Device),
1300         "Error returned from cuDeviceGetAttribute\n");
1301     printf("    Max Shared Memory per Block: \t%d bytes \n", TmpInt);
1302     checkResult(
1303         cuDeviceGetAttribute(
1304             &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, Device),
1305         "Error returned from cuDeviceGetAttribute\n");
1306     printf("    Registers per Block: \t\t%d \n", TmpInt);
1307     checkResult(
1308         cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device),
1309         "Error returned from cuDeviceGetAttribute\n");
1310     printf("    Warp Size: \t\t\t\t%d Threads \n", TmpInt);
1311     checkResult(cuDeviceGetAttribute(
1312                     &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, Device),
1313                 "Error returned from cuDeviceGetAttribute\n");
1314     printf("    Maximum Threads per Block: \t\t%d \n", TmpInt);
1315     checkResult(cuDeviceGetAttribute(
1316                     &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device),
1317                 "Error returned from cuDeviceGetAttribute\n");
1318     checkResult(cuDeviceGetAttribute(
1319                     &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, Device),
1320                 "Error returned from cuDeviceGetAttribute\n");
1321     checkResult(cuDeviceGetAttribute(
1322                     &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, Device),
1323                 "Error returned from cuDeviceGetAttribute\n");
1324     printf("    Maximum Block Dimensions: \t\t%d, %d, %d \n", TmpInt, TmpInt2,
1325            TmpInt3);
1326     checkResult(cuDeviceGetAttribute(
1327                     &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, Device),
1328                 "Error returned from cuDeviceGetAttribute\n");
1329     checkResult(cuDeviceGetAttribute(
1330                     &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, Device),
1331                 "Error returned from cuDeviceGetAttribute\n");
1332     checkResult(cuDeviceGetAttribute(
1333                     &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, Device),
1334                 "Error returned from cuDeviceGetAttribute\n");
1335     printf("    Maximum Grid Dimensions: \t\t%d x %d x %d \n", TmpInt, TmpInt2,
1336            TmpInt3);
1337     checkResult(
1338         cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_PITCH, Device),
1339         "Error returned from cuDeviceGetAttribute\n");
1340     printf("    Maximum Memory Pitch: \t\t%d bytes \n", TmpInt);
1341     checkResult(cuDeviceGetAttribute(
1342                     &TmpInt, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, Device),
1343                 "Error returned from cuDeviceGetAttribute\n");
1344     printf("    Texture Alignment: \t\t\t%d bytes \n", TmpInt);
1345     checkResult(
1346         cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, Device),
1347         "Error returned from cuDeviceGetAttribute\n");
1348     printf("    Clock Rate: \t\t\t%d kHz\n", TmpInt);
1349     checkResult(cuDeviceGetAttribute(
1350                     &TmpInt, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, Device),
1351                 "Error returned from cuDeviceGetAttribute\n");
1352     printf("    Execution Timeout: \t\t\t%s \n", BOOL2TEXT(TmpInt));
1353     checkResult(
1354         cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_INTEGRATED, Device),
1355         "Error returned from cuDeviceGetAttribute\n");
1356     printf("    Integrated Device: \t\t\t%s \n", BOOL2TEXT(TmpInt));
1357     checkResult(cuDeviceGetAttribute(
1358                     &TmpInt, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, Device),
1359                 "Error returned from cuDeviceGetAttribute\n");
1360     printf("    Can Map Host Memory: \t\t%s \n", BOOL2TEXT(TmpInt));
1361     checkResult(
1362         cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, Device),
1363         "Error returned from cuDeviceGetAttribute\n");
1364     if (TmpInt == CU_COMPUTEMODE_DEFAULT)
1365       TmpStr = "DEFAULT";
1366     else if (TmpInt == CU_COMPUTEMODE_PROHIBITED)
1367       TmpStr = "PROHIBITED";
1368     else if (TmpInt == CU_COMPUTEMODE_EXCLUSIVE_PROCESS)
1369       TmpStr = "EXCLUSIVE PROCESS";
1370     else
1371       TmpStr = "unknown";
1372     printf("    Compute Mode: \t\t\t%s \n", TmpStr.c_str());
1373     checkResult(cuDeviceGetAttribute(
1374                     &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, Device),
1375                 "Error returned from cuDeviceGetAttribute\n");
1376     printf("    Concurrent Kernels: \t\t%s \n", BOOL2TEXT(TmpInt));
1377     checkResult(
1378         cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, Device),
1379         "Error returned from cuDeviceGetAttribute\n");
1380     printf("    ECC Enabled: \t\t\t%s \n", BOOL2TEXT(TmpInt));
1381     checkResult(cuDeviceGetAttribute(
1382                     &TmpInt, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, Device),
1383                 "Error returned from cuDeviceGetAttribute\n");
1384     printf("    Memory Clock Rate: \t\t\t%d kHz\n", TmpInt);
1385     checkResult(
1386         cuDeviceGetAttribute(
1387             &TmpInt, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, Device),
1388         "Error returned from cuDeviceGetAttribute\n");
1389     printf("    Memory Bus Width: \t\t\t%d bits\n", TmpInt);
1390     checkResult(cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE,
1391                                      Device),
1392                 "Error returned from cuDeviceGetAttribute\n");
1393     printf("    L2 Cache Size: \t\t\t%d bytes \n", TmpInt);
1394     checkResult(cuDeviceGetAttribute(
1395                     &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
1396                     Device),
1397                 "Error returned from cuDeviceGetAttribute\n");
1398     printf("    Max Threads Per SMP: \t\t%d \n", TmpInt);
1399     checkResult(cuDeviceGetAttribute(
1400                     &TmpInt, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, Device),
1401                 "Error returned from cuDeviceGetAttribute\n");
1402     printf("    Async Engines: \t\t\t%s (%d) \n", BOOL2TEXT(TmpInt), TmpInt);
1403     checkResult(cuDeviceGetAttribute(
1404                     &TmpInt, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, Device),
1405                 "Error returned from cuDeviceGetAttribute\n");
1406     printf("    Unified Addressing: \t\t%s \n", BOOL2TEXT(TmpInt));
1407     checkResult(cuDeviceGetAttribute(
1408                     &TmpInt, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, Device),
1409                 "Error returned from cuDeviceGetAttribute\n");
1410     printf("    Managed Memory: \t\t\t%s \n", BOOL2TEXT(TmpInt));
1411     checkResult(
1412         cuDeviceGetAttribute(
1413             &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, Device),
1414         "Error returned from cuDeviceGetAttribute\n");
1415     printf("    Concurrent Managed Memory: \t\t%s \n", BOOL2TEXT(TmpInt));
1416     checkResult(
1417         cuDeviceGetAttribute(
1418             &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, Device),
1419         "Error returned from cuDeviceGetAttribute\n");
1420     printf("    Preemption Supported: \t\t%s \n", BOOL2TEXT(TmpInt));
1421     checkResult(cuDeviceGetAttribute(
1422                     &TmpInt, CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, Device),
1423                 "Error returned from cuDeviceGetAttribute\n");
1424     printf("    Cooperative Launch: \t\t%s \n", BOOL2TEXT(TmpInt));
1425     checkResult(cuDeviceGetAttribute(
1426                     &TmpInt, CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD, Device),
1427                 "Error returned from cuDeviceGetAttribute\n");
1428     printf("    Multi-Device Boards: \t\t%s \n", BOOL2TEXT(TmpInt));
1429     checkResult(
1430         cuDeviceGetAttribute(
1431             &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, Device),
1432         "Error returned from cuDeviceGetAttribute\n");
1433     checkResult(
1434         cuDeviceGetAttribute(
1435             &TmpInt2, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, Device),
1436         "Error returned from cuDeviceGetAttribute\n");
1437     printf("    Compute Capabilities: \t\t%d%d \n", TmpInt, TmpInt2);
1438   }
1439 
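  // Events are pooled per device: createEvent and destroyEvent acquire and
  // release CUevent handles from EventPool instead of creating and destroying
  // them on every call.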
1440   int createEvent(int DeviceId, void **P) {
1441     CUevent Event = nullptr;
1442     if (EventPool[DeviceId]->acquire(Event) != OFFLOAD_SUCCESS)
1443       return OFFLOAD_FAIL;
1444     *P = Event;
1445     return OFFLOAD_SUCCESS;
1446   }
1447 
1448   int destroyEvent(int DeviceId, void *EventPtr) {
1449     EventPool[DeviceId]->release(reinterpret_cast<CUevent>(EventPtr));
1450     return OFFLOAD_SUCCESS;
1451   }
1452 
1453   int waitEvent(const int DeviceId, __tgt_async_info *AsyncInfo,
1454                 void *EventPtr) const {
1455     CUstream Stream = getStream(DeviceId, AsyncInfo);
1456     CUevent Event = reinterpret_cast<CUevent>(EventPtr);
1457 
1458     // We don't use CU_EVENT_WAIT_DEFAULT here because it is only available
1459     // from a specific CUDA version onward, and it is defined as 0x0. In earlier
1460     // versions, per the CUDA API documentation, that argument has to be 0x0.
1461     CUresult Err = cuStreamWaitEvent(Stream, Event, 0);
1462     if (Err != CUDA_SUCCESS) {
1463       DP("Error when waiting event. stream = " DPxMOD ", event = " DPxMOD "\n",
1464          DPxPTR(Stream), DPxPTR(Event));
1465       CUDA_ERR_STRING(Err);
1466       return OFFLOAD_FAIL;
1467     }
1468 
1469     return OFFLOAD_SUCCESS;
1470   }
1471 
1472   int releaseAsyncInfo(int DeviceId, __tgt_async_info *AsyncInfo) const {
1473     if (AsyncInfo->Queue) {
1474       StreamPool[DeviceId]->release(
1475           reinterpret_cast<CUstream>(AsyncInfo->Queue));
1476       AsyncInfo->Queue = nullptr;
1477     }
1478 
1479     return OFFLOAD_SUCCESS;
1480   }
1481 
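  // Allocate a fresh __tgt_async_info and eagerly bind a stream to it so that
  // subsequent asynchronous operations reuse the same queue.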
1482   int initAsyncInfo(int DeviceId, __tgt_async_info **AsyncInfo) const {
1483     *AsyncInfo = new __tgt_async_info;
1484     getStream(DeviceId, *AsyncInfo);
1485     return OFFLOAD_SUCCESS;
1486   }
1487 
1488   int initDeviceInfo(int DeviceId, __tgt_device_info *DeviceInfo,
1489                      const char **ErrStr) const {
1490     assert(DeviceInfo && "DeviceInfo is nullptr");
1491 
1492     if (!DeviceInfo->Context)
1493       DeviceInfo->Context = DeviceData[DeviceId].Context;
1494     if (!DeviceInfo->Device) {
1495       CUdevice Dev;
1496       CUresult Err = cuDeviceGet(&Dev, DeviceId);
1497       if (Err == CUDA_SUCCESS) {
1498         DeviceInfo->Device = reinterpret_cast<void *>(Dev);
1499       } else {
1500         cuGetErrorString(Err, ErrStr);
1501         return OFFLOAD_FAIL;
1502       }
1503     }
1504     return OFFLOAD_SUCCESS;
1505   }
1506 
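  // Make the device's context current on the calling thread. Most entry points
  // below call this before performing CUDA driver operations.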
1507   int setContext(int DeviceId) {
1508     assert(InitializedFlags[DeviceId] && "Device is not initialized");
1509 
1510     CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
1511     if (!checkResult(Err, "error returned from cuCtxSetCurrent"))
1512       return OFFLOAD_FAIL;
1513 
1514     return OFFLOAD_SUCCESS;
1515   }
1516 };
1517 
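// Single global plugin instance; the extern "C" entry points below forward to
// its member functions.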
1518 DeviceRTLTy DeviceRTL;
1519 } // namespace
1520 
1521 // Exposed library API function
1522 #ifdef __cplusplus
1523 extern "C" {
1524 #endif
1525 
1526 int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image) {
1527   return elf_check_machine(Image, /* EM_CUDA */ 190);
1528 }
1529 
1530 int32_t __tgt_rtl_is_valid_binary_info(__tgt_device_image *image,
1531                                        __tgt_image_info *info) {
1532   if (!__tgt_rtl_is_valid_binary(image))
1533     return false;
1534 
1535   // A subarchitecture was not specified. Assume it is compatible.
1536   if (!info || !info->Arch)
1537     return true;
1538 
1539   int32_t NumberOfDevices = 0;
1540   if (cuDeviceGetCount(&NumberOfDevices) != CUDA_SUCCESS)
1541     return false;
1542 
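  // Strip the "sm_" prefix; e.g. an Arch string of "sm_70" leaves "70"
  // (major 7, minor 0).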
1543   StringRef ArchStr = StringRef(info->Arch).drop_front(sizeof("sm_") - 1);
1544   for (int32_t DeviceId = 0; DeviceId < NumberOfDevices; ++DeviceId) {
1545     CUdevice Device;
1546     if (cuDeviceGet(&Device, DeviceId) != CUDA_SUCCESS)
1547       return false;
1548 
1549     int32_t Major, Minor;
1550     if (cuDeviceGetAttribute(&Major,
1551                              CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
1552                              Device) != CUDA_SUCCESS)
1553       return false;
1554     if (cuDeviceGetAttribute(&Minor,
1555                              CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
1556                              Device) != CUDA_SUCCESS)
1557       return false;
1558 
1559     // A cubin generated for a certain compute capability can run on any GPU
1560     // with the same major revision and the same or a higher minor revision.
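    // For example, an image built for sm_70 runs on an sm_75 device but not on
    // sm_60 or sm_80.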
1561     int32_t ImageMajor = ArchStr[0] - '0';
1562     int32_t ImageMinor = ArchStr[1] - '0';
1563     if (Major != ImageMajor || Minor < ImageMinor)
1564       return false;
1565   }
1566 
1567   DP("Image has compatible compute capability: %s\n", info->Arch);
1568   return true;
1569 }
1570 
1571 int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); }
1572 
1573 int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
1574   DP("Init requires flags to %" PRId64 "\n", RequiresFlags);
1575   DeviceRTL.setRequiresFlag(RequiresFlags);
1576   return RequiresFlags;
1577 }
1578 
1579 int32_t __tgt_rtl_is_data_exchangable(int32_t SrcDevId, int DstDevId) {
1580   if (DeviceRTL.isValidDeviceId(SrcDevId) &&
1581       DeviceRTL.isValidDeviceId(DstDevId))
1582     return 1;
1583 
1584   return 0;
1585 }
1586 
1587 int32_t __tgt_rtl_init_device(int32_t DeviceId) {
1588   assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1589   // Context is set when initializing the device.
1590 
1591   return DeviceRTL.initDevice(DeviceId);
1592 }
1593 
1594 int32_t __tgt_rtl_deinit_device(int32_t DeviceId) {
1595   assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1596   // Context is set when deinitializing the device.
1597 
1598   return DeviceRTL.deinitDevice(DeviceId);
1599 }
1600 
1601 __tgt_target_table *__tgt_rtl_load_binary(int32_t DeviceId,
1602                                           __tgt_device_image *Image) {
1603   assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1604 
1605   if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
1606     return nullptr;
1607 
1608   return DeviceRTL.loadBinary(DeviceId, Image);
1609 }
1610 
1611 void *__tgt_rtl_data_alloc(int32_t DeviceId, int64_t Size, void *,
1612                            int32_t Kind) {
1613   assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1614 
1615   if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
1616     return nullptr;
1617 
1618   return DeviceRTL.dataAlloc(DeviceId, Size, (TargetAllocTy)Kind);
1619 }
1620 
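// The synchronous entry points below (data submit/retrieve/exchange and kernel
// launch) are implemented by issuing the corresponding *_async variant on a
// local __tgt_async_info and then synchronizing its stream.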
1621 int32_t __tgt_rtl_data_submit(int32_t DeviceId, void *TgtPtr, void *HstPtr,
1622                               int64_t Size) {
1623   assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1624   // Context is set in __tgt_rtl_data_submit_async.
1625 
1626   __tgt_async_info AsyncInfo;
1627   const int32_t Rc =
1628       __tgt_rtl_data_submit_async(DeviceId, TgtPtr, HstPtr, Size, &AsyncInfo);
1629   if (Rc != OFFLOAD_SUCCESS)
1630     return OFFLOAD_FAIL;
1631 
1632   return __tgt_rtl_synchronize(DeviceId, &AsyncInfo);
1633 }
1634 
1635 int32_t __tgt_rtl_data_submit_async(int32_t DeviceId, void *TgtPtr,
1636                                     void *HstPtr, int64_t Size,
1637                                     __tgt_async_info *AsyncInfoPtr) {
1638   assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1639   assert(AsyncInfoPtr && "async_info_ptr is nullptr");
1640 
1641   if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
1642     return OFFLOAD_FAIL;
1643 
1644   return DeviceRTL.dataSubmit(DeviceId, TgtPtr, HstPtr, Size, AsyncInfoPtr);
1645 }
1646 
1647 int32_t __tgt_rtl_data_retrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr,
1648                                 int64_t Size) {
1649   assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1650   // Context is set in __tgt_rtl_data_retrieve_async.
1651 
1652   __tgt_async_info AsyncInfo;
1653   const int32_t Rc =
1654       __tgt_rtl_data_retrieve_async(DeviceId, HstPtr, TgtPtr, Size, &AsyncInfo);
1655   if (Rc != OFFLOAD_SUCCESS)
1656     return OFFLOAD_FAIL;
1657 
1658   return __tgt_rtl_synchronize(DeviceId, &AsyncInfo);
1659 }
1660 
1661 int32_t __tgt_rtl_data_retrieve_async(int32_t DeviceId, void *HstPtr,
1662                                       void *TgtPtr, int64_t Size,
1663                                       __tgt_async_info *AsyncInfoPtr) {
1664   assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1665   assert(AsyncInfoPtr && "async_info_ptr is nullptr");
1666 
1667   if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
1668     return OFFLOAD_FAIL;
1669 
1670   return DeviceRTL.dataRetrieve(DeviceId, HstPtr, TgtPtr, Size, AsyncInfoPtr);
1671 }
1672 
1673 int32_t __tgt_rtl_data_exchange_async(int32_t SrcDevId, void *SrcPtr,
1674                                       int DstDevId, void *DstPtr, int64_t Size,
1675                                       __tgt_async_info *AsyncInfo) {
1676   assert(DeviceRTL.isValidDeviceId(SrcDevId) && "src_dev_id is invalid");
1677   assert(DeviceRTL.isValidDeviceId(DstDevId) && "dst_dev_id is invalid");
1678   assert(AsyncInfo && "AsyncInfo is nullptr");
1679 
1680   if (DeviceRTL.setContext(SrcDevId) != OFFLOAD_SUCCESS)
1681     return OFFLOAD_FAIL;
1682 
1683   return DeviceRTL.dataExchange(SrcDevId, SrcPtr, DstDevId, DstPtr, Size,
1684                                 AsyncInfo);
1685 }
1686 
1687 int32_t __tgt_rtl_data_exchange(int32_t SrcDevId, void *SrcPtr,
1688                                 int32_t DstDevId, void *DstPtr, int64_t Size) {
1689   assert(DeviceRTL.isValidDeviceId(SrcDevId) && "src_dev_id is invalid");
1690   assert(DeviceRTL.isValidDeviceId(DstDevId) && "dst_dev_id is invalid");
1691   // Context is set in __tgt_rtl_data_exchange_async.
1692 
1693   __tgt_async_info AsyncInfo;
1694   const int32_t Rc = __tgt_rtl_data_exchange_async(SrcDevId, SrcPtr, DstDevId,
1695                                                    DstPtr, Size, &AsyncInfo);
1696   if (Rc != OFFLOAD_SUCCESS)
1697     return OFFLOAD_FAIL;
1698 
1699   return __tgt_rtl_synchronize(SrcDevId, &AsyncInfo);
1700 }
1701 
1702 int32_t __tgt_rtl_data_delete(int32_t DeviceId, void *TgtPtr) {
1703   assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1704 
1705   if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
1706     return OFFLOAD_FAIL;
1707 
1708   return DeviceRTL.dataDelete(DeviceId, TgtPtr);
1709 }
1710 
1711 int32_t __tgt_rtl_run_target_team_region(int32_t DeviceId, void *TgtEntryPtr,
1712                                          void **TgtArgs, ptrdiff_t *TgtOffsets,
1713                                          int32_t ArgNum, int32_t TeamNum,
1714                                          int32_t ThreadLimit,
1715                                          uint64_t LoopTripcount) {
1716   assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1717   // Context is set in __tgt_rtl_run_target_team_region_async.
1718 
1719   __tgt_async_info AsyncInfo;
1720   const int32_t Rc = __tgt_rtl_run_target_team_region_async(
1721       DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets, ArgNum, TeamNum, ThreadLimit,
1722       LoopTripcount, &AsyncInfo);
1723   if (Rc != OFFLOAD_SUCCESS)
1724     return OFFLOAD_FAIL;
1725 
1726   return __tgt_rtl_synchronize(DeviceId, &AsyncInfo);
1727 }
1728 
1729 int32_t __tgt_rtl_run_target_team_region_async(
1730     int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs, ptrdiff_t *TgtOffsets,
1731     int32_t ArgNum, int32_t TeamNum, int32_t ThreadLimit,
1732     uint64_t LoopTripcount, __tgt_async_info *AsyncInfoPtr) {
1733   assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1734 
1735   if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
1736     return OFFLOAD_FAIL;
1737 
1738   return DeviceRTL.runTargetTeamRegion(DeviceId, TgtEntryPtr, TgtArgs,
1739                                        TgtOffsets, ArgNum, TeamNum, ThreadLimit,
1740                                        LoopTripcount, AsyncInfoPtr);
1741 }
1742 
1743 int32_t __tgt_rtl_run_target_region(int32_t DeviceId, void *TgtEntryPtr,
1744                                     void **TgtArgs, ptrdiff_t *TgtOffsets,
1745                                     int32_t ArgNum) {
1746   assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1747   // Context is set in __tgt_rtl_run_target_region_async.
1748 
1749   __tgt_async_info AsyncInfo;
1750   const int32_t Rc = __tgt_rtl_run_target_region_async(
1751       DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets, ArgNum, &AsyncInfo);
1752   if (Rc != OFFLOAD_SUCCESS)
1753     return OFFLOAD_FAIL;
1754 
1755   return __tgt_rtl_synchronize(DeviceId, &AsyncInfo);
1756 }
1757 
1758 int32_t __tgt_rtl_run_target_region_async(int32_t DeviceId, void *TgtEntryPtr,
1759                                           void **TgtArgs, ptrdiff_t *TgtOffsets,
1760                                           int32_t ArgNum,
1761                                           __tgt_async_info *AsyncInfoPtr) {
1762   assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1763   // Context is set in __tgt_rtl_run_target_team_region_async.
1764   return __tgt_rtl_run_target_team_region_async(
1765       DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets, ArgNum,
1766       /* team num */ 1, /* thread_limit */ 1, /* loop_tripcount */ 0,
1767       AsyncInfoPtr);
1768 }
1769 
1770 int32_t __tgt_rtl_synchronize(int32_t DeviceId,
1771                               __tgt_async_info *AsyncInfoPtr) {
1772   assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1773   assert(AsyncInfoPtr && "async_info_ptr is nullptr");
1774   assert(AsyncInfoPtr->Queue && "async_info_ptr->Queue is nullptr");
1775   // NOTE: We don't need to set context for stream sync.
1776   return DeviceRTL.synchronize(DeviceId, AsyncInfoPtr);
1777 }
1778 
1779 void __tgt_rtl_set_info_flag(uint32_t NewInfoLevel) {
1780   std::atomic<uint32_t> &InfoLevel = getInfoLevelInternal();
1781   InfoLevel.store(NewInfoLevel);
1782 }
1783 
1784 void __tgt_rtl_print_device_info(int32_t DeviceId) {
1785   assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1786   // NOTE: We don't need to set context for print device info.
1787   DeviceRTL.printDeviceInfo(DeviceId);
1788 }
1789 
1790 int32_t __tgt_rtl_create_event(int32_t DeviceId, void **Event) {
1791   assert(Event && "event is nullptr");
1792 
1793   if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
1794     return OFFLOAD_FAIL;
1795 
1796   return DeviceRTL.createEvent(DeviceId, Event);
1797 }
1798 
1799 int32_t __tgt_rtl_record_event(int32_t DeviceId, void *EventPtr,
1800                                __tgt_async_info *AsyncInfoPtr) {
1801   assert(AsyncInfoPtr && "async_info_ptr is nullptr");
1802   assert(AsyncInfoPtr->Queue && "async_info_ptr->Queue is nullptr");
1803   assert(EventPtr && "event_ptr is nullptr");
1804   // NOTE: We might not need to set context for event record.
1805   return recordEvent(EventPtr, AsyncInfoPtr);
1806 }
1807 
1808 int32_t __tgt_rtl_wait_event(int32_t DeviceId, void *EventPtr,
1809                              __tgt_async_info *AsyncInfoPtr) {
1810   assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1811   assert(AsyncInfoPtr && "async_info_ptr is nullptr");
1812   assert(EventPtr && "event is nullptr");
1813   // If we don't have a queue we need to set the context.
1814   if (!AsyncInfoPtr->Queue && DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
1815     return OFFLOAD_FAIL;
1816   return DeviceRTL.waitEvent(DeviceId, AsyncInfoPtr, EventPtr);
1817 }
1818 
1819 int32_t __tgt_rtl_sync_event(int32_t DeviceId, void *EventPtr) {
1820   assert(EventPtr && "event is nullptr");
1821   // NOTE: We might not need to set context for event sync.
1822   return syncEvent(EventPtr);
1823 }
1824 
1825 int32_t __tgt_rtl_destroy_event(int32_t DeviceId, void *EventPtr) {
1826   assert(EventPtr && "event is nullptr");
1827 
1828   if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
1829     return OFFLOAD_FAIL;
1830 
1831   return DeviceRTL.destroyEvent(DeviceId, EventPtr);
1832 }
1833 
1834 int32_t __tgt_rtl_release_async_info(int32_t DeviceId,
1835                                      __tgt_async_info *AsyncInfo) {
1836   assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1837   assert(AsyncInfo && "async_info is nullptr");
1838 
1839   if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
1840     return OFFLOAD_FAIL;
1841 
1842   return DeviceRTL.releaseAsyncInfo(DeviceId, AsyncInfo);
1843 }
1844 
1845 int32_t __tgt_rtl_init_async_info(int32_t DeviceId,
1846                                   __tgt_async_info **AsyncInfo) {
1847   assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1848   assert(AsyncInfo && "async_info is nullptr");
1849 
1850   if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
1851     return OFFLOAD_FAIL;
1852 
1853   return DeviceRTL.initAsyncInfo(DeviceId, AsyncInfo);
1854 }
1855 
1856 int32_t __tgt_rtl_init_device_info(int32_t DeviceId,
1857                                    __tgt_device_info *DeviceInfoPtr,
1858                                    const char **ErrStr) {
1859   assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1860   assert(DeviceInfoPtr && "device_info_ptr is nullptr");
1861 
1862   if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
1863     return OFFLOAD_FAIL;
1864 
1865   return DeviceRTL.initDeviceInfo(DeviceId, DeviceInfoPtr, ErrStr);
1866 }
1867 
1868 #ifdef __cplusplus
1869 }
1870 #endif
1871