//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// RTL for CUDA machine
//
//===----------------------------------------------------------------------===//

#include <algorithm>
#include <cassert>
#include <cinttypes>
#include <cstddef>
#include <cstdlib>
#include <cuda.h>
#include <list>
#include <memory>
#include <mutex>
#include <string>
#include <vector>

#include "omptargetplugin.h"

#ifndef TARGET_NAME
#define TARGET_NAME CUDA
#endif

#ifdef OMPTARGET_DEBUG
static int DebugLevel = 0;

#define GETNAME2(name) #name
#define GETNAME(name) GETNAME2(name)
#define DP(...) \
  do { \
    if (DebugLevel > 0) { \
      DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", __VA_ARGS__); \
    } \
  } while (false)

// Utility for retrieving and printing CUDA error string.
#define CUDA_ERR_STRING(err) \
  do { \
    if (DebugLevel > 0) { \
      const char *errStr; \
      cuGetErrorString(err, &errStr); \
      DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", "CUDA error is: %s\n", \
             errStr); \
    } \
  } while (false)
#else // OMPTARGET_DEBUG
#define DP(...) {}
#define CUDA_ERR_STRING(err) {}
#endif // OMPTARGET_DEBUG
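// Note: DP(...) and CUDA_ERR_STRING(...) emit output only when the plugin is
// built with OMPTARGET_DEBUG and the LIBOMPTARGET_DEBUG environment variable
// is set to a positive value (parsed in the DeviceRTLTy constructor below);
// messages are tagged "Target CUDA RTL".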

#include "../../common/elf_common.c"

/// Keep entries table per device.
struct FuncOrGblEntryTy {
  __tgt_target_table Table;
  std::vector<__tgt_offload_entry> Entries;
};

enum ExecutionModeType {
  SPMD,    // constructors, destructors and combined constructs
           // (`teams distribute parallel for [simd]`)
  GENERIC, // everything else
  NONE
};

/// Use a single entity to encode a kernel and a set of flags.
struct KernelTy {
  CUfunction Func;

  // Execution mode of the kernel:
  // 0 - SPMD mode (without master warp)
  // 1 - Generic mode (with master warp)
  int8_t ExecutionMode;

  KernelTy(CUfunction _Func, int8_t _ExecutionMode)
      : Func(_Func), ExecutionMode(_ExecutionMode) {}
};

/// Device environment data
/// Manually sync with the deviceRTL side for now, move to a dedicated header
/// file later.
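/// Note: loadBinary() copies an instance of this struct into the device
/// global named "omptarget_device_environment" that the device RTL is
/// expected to provide.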
struct omptarget_device_environmentTy {
  int32_t debug_level;
};

/// List that contains all the kernels.
/// FIXME: we may need this to be per device and per library.
std::list<KernelTy> KernelsList;

namespace {
bool checkResult(CUresult Err, const char *ErrMsg) {
  if (Err == CUDA_SUCCESS)
    return true;

  DP("%s", ErrMsg);
  CUDA_ERR_STRING(Err);
  return false;
}

// Structure containing per-device data.
struct DeviceDataTy {
  std::list<FuncOrGblEntryTy> FuncGblEntries;
  CUcontext Context = nullptr;
  // Device properties
  int ThreadsPerBlock = 0;
  int BlocksPerGrid = 0;
  int WarpSize = 0;
  // OpenMP properties
  int NumTeams = 0;
  int NumThreads = 0;
};

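// Manager for the per-device pools of CUDA streams. Streams are created with
// CU_STREAM_NON_BLOCKING (no implicit synchronization with the default
// stream), handed out by getStream() and recycled by returnStream().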
class StreamManagerTy {
  int NumberOfDevices;
  // The initial size of each stream pool
  int EnvNumInitialStreams;
  // Per-device stream mutex
  std::vector<std::unique_ptr<std::mutex>> StreamMtx;
  // Per-device stream id indicating the next available stream in the pool
  std::vector<int> NextStreamId;
  // Per-device stream pool
  std::vector<std::vector<CUstream>> StreamPool;
  // Reference to per-device data
  std::vector<DeviceDataTy> &DeviceData;

  // If there is no CUstream left in the pool, we resize the pool to allocate
  // more CUstreams. This function must be called with the device mutex held,
  // and it never shrinks the pool.
  void resizeStreamPool(const int DeviceId, const size_t NewSize) {
    std::vector<CUstream> &Pool = StreamPool[DeviceId];
    const size_t CurrentSize = Pool.size();
    assert(NewSize > CurrentSize && "new size is not larger than current size");

    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) {
      // Return early if we cannot switch to the right context, to avoid
      // creating a batch of streams that do not belong to the right device.
      // Offloading will fail later because the selected CUstream is nullptr.
      return;
    }

    Pool.resize(NewSize, nullptr);

    for (size_t I = CurrentSize; I < NewSize; ++I) {
      checkResult(cuStreamCreate(&Pool[I], CU_STREAM_NON_BLOCKING),
                  "Error returned from cuStreamCreate\n");
    }
  }

public:
  StreamManagerTy(const int NumberOfDevices,
                  std::vector<DeviceDataTy> &DeviceData)
      : NumberOfDevices(NumberOfDevices), EnvNumInitialStreams(32),
        DeviceData(DeviceData) {
    StreamPool.resize(NumberOfDevices);
    NextStreamId.resize(NumberOfDevices);
    StreamMtx.resize(NumberOfDevices);

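    // The default of 32 initial streams can be overridden via the
    // environment, e.g. LIBOMPTARGET_NUM_INITIAL_STREAMS=64 pre-creates 64
    // streams per device when each device's stream pool is initialized.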
    if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS"))
      EnvNumInitialStreams = std::stoi(EnvStr);

    // Initialize the next stream id
    std::fill(NextStreamId.begin(), NextStreamId.end(), 0);

    // Initialize stream mutex
    for (std::unique_ptr<std::mutex> &Ptr : StreamMtx)
      Ptr = std::make_unique<std::mutex>();
  }

  ~StreamManagerTy() {
    // Destroy streams
    for (int I = 0; I < NumberOfDevices; ++I) {
      checkResult(cuCtxSetCurrent(DeviceData[I].Context),
                  "Error returned from cuCtxSetCurrent\n");

      for (CUstream &S : StreamPool[I]) {
        if (S)
          checkResult(cuStreamDestroy(S),
                      "Error returned from cuStreamDestroy\n");
      }
    }
  }

  // Get a CUstream from the pool. The per-device next stream id always points
  // to the next available CUstream; that is, CUstreams [0, id-1] have already
  // been assigned and [id, ...) are still available. If there is no CUstream
  // left, we request more CUstreams from the CUDA RT. Each time a CUstream is
  // assigned, the id is incremented by one.
  // xxxxxs+++++++++
  //      ^
  //      id
  // After assignment, the pool becomes the following and s is assigned.
  // xxxxxs+++++++++
  //       ^
  //       id
  CUstream getStream(const int DeviceId) {
    const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
    int &Id = NextStreamId[DeviceId];
    // No CUstream left in the pool; request more from the CUDA RT
    if (Id == StreamPool[DeviceId].size()) {
      // By default we double the stream pool every time
      resizeStreamPool(DeviceId, Id * 2);
    }
    return StreamPool[DeviceId][Id++];
  }

  // Return a CUstream to the pool. As mentioned above, the per-device next
  // stream id always points to the next available CUstream, so when we return
  // a CUstream we first decrement the id and then copy the CUstream back.
  // Note that streams may be returned in a different order than they were
  // assigned, so at some point the pool may contain two identical CUstreams:
  // xxax+a+++++
  //     ^
  //     id
  // However, this does not matter because they are always on opposite sides
  // of id: the one on the left will eventually be overwritten by another
  // returned CUstream. Therefore, after several executions, the order of the
  // pool may differ from its initial state.
  void returnStream(const int DeviceId, CUstream Stream) {
    const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
    int &Id = NextStreamId[DeviceId];
    assert(Id > 0 && "Wrong stream ID");
    StreamPool[DeviceId][--Id] = Stream;
  }

  bool initializeDeviceStreamPool(const int DeviceId) {
    assert(StreamPool[DeviceId].empty() && "stream pool has been initialized");

    resizeStreamPool(DeviceId, EnvNumInitialStreams);

    // Check the size of stream pool
    if (StreamPool[DeviceId].size() != EnvNumInitialStreams)
      return false;

    // Check whether each stream is valid
    for (CUstream &S : StreamPool[DeviceId])
      if (!S)
        return false;

    return true;
  }
};

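// Plugin state: a single instance (DeviceRTL below) owns the per-device
// contexts, modules, offload entry tables, and the stream manager, and
// implements the operations behind the __tgt_rtl_* entry points.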
class DeviceRTLTy {
  int NumberOfDevices;
  // OpenMP environment properties
  int EnvNumTeams;
  int EnvTeamLimit;
  // OpenMP requires flags
  int64_t RequiresFlags;

  static constexpr const int HardTeamLimit = 1U << 16U; // 64k
  static constexpr const int HardThreadLimit = 1024;
  static constexpr const int DefaultNumTeams = 128;
  static constexpr const int DefaultNumThreads = 128;

  std::unique_ptr<StreamManagerTy> StreamManager;
  std::vector<DeviceDataTy> DeviceData;
  std::vector<CUmodule> Modules;

  // Record an entry point associated with the device
  void addOffloadEntry(const int DeviceId, const __tgt_offload_entry entry) {
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
    E.Entries.push_back(entry);
  }

  // Return true if the entry is associated with the device
  bool findOffloadEntry(const int DeviceId, const void *Addr) const {
    for (const __tgt_offload_entry &Itr :
         DeviceData[DeviceId].FuncGblEntries.back().Entries)
      if (Itr.addr == Addr)
        return true;

    return false;
  }

  // Return the pointer to the target entries table
  __tgt_target_table *getOffloadEntriesTable(const int DeviceId) {
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();

    if (E.Entries.empty())
      return nullptr;

    // Update table info according to the entries and return the pointer
    E.Table.EntriesBegin = E.Entries.data();
    E.Table.EntriesEnd = E.Entries.data() + E.Entries.size();

    return &E.Table;
  }

  // Clear entries table for a device
  void clearOffloadEntriesTable(const int DeviceId) {
    DeviceData[DeviceId].FuncGblEntries.emplace_back();
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
    E.Entries.clear();
    E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr;
  }

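  // Get the CUstream used by an asynchronous region: lazily take one from the
  // pool on first use and cache it in the __tgt_async_info so that all
  // operations of the region are enqueued on the same stream; synchronize()
  // returns the stream to the pool.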
  CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfoPtr) const {
    assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");

    if (!AsyncInfoPtr->Queue)
      AsyncInfoPtr->Queue = StreamManager->getStream(DeviceId);

    return reinterpret_cast<CUstream>(AsyncInfoPtr->Queue);
  }

public:
  // This class should not be copied
  DeviceRTLTy(const DeviceRTLTy &) = delete;
  DeviceRTLTy(DeviceRTLTy &&) = delete;

  DeviceRTLTy()
      : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1),
        RequiresFlags(OMP_REQ_UNDEFINED) {
#ifdef OMPTARGET_DEBUG
    if (const char *EnvStr = getenv("LIBOMPTARGET_DEBUG"))
      DebugLevel = std::stoi(EnvStr);
#endif // OMPTARGET_DEBUG

    DP("Start initializing CUDA\n");

    CUresult Err = cuInit(0);
    if (!checkResult(Err, "Error returned from cuInit\n")) {
      return;
    }

    Err = cuDeviceGetCount(&NumberOfDevices);
    if (!checkResult(Err, "Error returned from cuDeviceGetCount\n"))
      return;

    if (NumberOfDevices == 0) {
      DP("There are no devices supporting CUDA.\n");
      return;
    }

    DeviceData.resize(NumberOfDevices);

    // Get environment variables regarding teams
    if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) {
      // OMP_TEAM_LIMIT has been set
      EnvTeamLimit = std::stoi(EnvStr);
      DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
    }
    if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) {
      // OMP_NUM_TEAMS has been set
      EnvNumTeams = std::stoi(EnvStr);
      DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
    }

    StreamManager =
        std::make_unique<StreamManagerTy>(NumberOfDevices, DeviceData);
  }

  ~DeviceRTLTy() {
    // Destroy the stream manager first so that its streams are released
    // before the contexts they belong to are destroyed below.
    StreamManager = nullptr;

    for (CUmodule &M : Modules)
      // Close module
      if (M)
        checkResult(cuModuleUnload(M), "Error returned from cuModuleUnload\n");

    for (DeviceDataTy &D : DeviceData) {
      // Destroy context
      if (D.Context)
        checkResult(cuCtxDestroy(D.Context),
                    "Error returned from cuCtxDestroy\n");
    }
  }

  // Check whether a given DeviceId is valid
  bool isValidDeviceId(const int DeviceId) const {
    return DeviceId >= 0 && DeviceId < NumberOfDevices;
  }

  int getNumOfDevices() const { return NumberOfDevices; }

  void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; }

  int initDevice(const int DeviceId) {
    CUdevice Device;

    DP("Getting device %d\n", DeviceId);
    CUresult Err = cuDeviceGet(&Device, DeviceId);
    if (!checkResult(Err, "Error returned from cuDeviceGet\n"))
      return OFFLOAD_FAIL;

    // Create the context and save it to use whenever this device is selected.
    Err = cuCtxCreate(&DeviceData[DeviceId].Context, CU_CTX_SCHED_BLOCKING_SYNC,
                      Device);
    if (!checkResult(Err, "Error returned from cuCtxCreate\n"))
      return OFFLOAD_FAIL;

    Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    // Initialize stream pool
    if (!StreamManager->initializeDeviceStreamPool(DeviceId))
      return OFFLOAD_FAIL;

    // Query attributes to determine number of threads/block and blocks/grid.
    int MaxGridDimX;
    Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
                               Device);
    if (Err != CUDA_SUCCESS) {
      DP("Error getting max grid dimension, using default value %d\n",
         DeviceRTLTy::DefaultNumTeams);
      DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::DefaultNumTeams;
    } else if (MaxGridDimX <= DeviceRTLTy::HardTeamLimit) {
      DP("Using %d CUDA blocks per grid\n", MaxGridDimX);
      DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX;
    } else {
      DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping "
         "at the hard limit\n",
         MaxGridDimX, DeviceRTLTy::HardTeamLimit);
      DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::HardTeamLimit;
    }

    // We are only exploiting threads along the x axis.
    int MaxBlockDimX;
    Err = cuDeviceGetAttribute(&MaxBlockDimX,
                               CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device);
    if (Err != CUDA_SUCCESS) {
      DP("Error getting max block dimension, using default value %d\n",
         DeviceRTLTy::DefaultNumThreads);
      DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads;
    } else if (MaxBlockDimX <= DeviceRTLTy::HardThreadLimit) {
      DP("Using %d CUDA threads per block\n", MaxBlockDimX);
      DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX;
    } else {
      DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
         "capping at the hard limit\n",
         MaxBlockDimX, DeviceRTLTy::HardThreadLimit);
      DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit;
    }

    // Get and set the warp size
    int WarpSize;
    Err =
        cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device);
    if (Err != CUDA_SUCCESS) {
      DP("Error getting warp size, assuming default value 32\n");
      DeviceData[DeviceId].WarpSize = 32;
    } else {
      DP("Using warp size %d\n", WarpSize);
      DeviceData[DeviceId].WarpSize = WarpSize;
    }

    // Adjust teams to the env variables
    if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) {
      DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
         EnvTeamLimit);
      DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit;
    }

    DP("Max number of CUDA blocks %d, threads %d & warp size %d\n",
       DeviceData[DeviceId].BlocksPerGrid, DeviceData[DeviceId].ThreadsPerBlock,
       DeviceData[DeviceId].WarpSize);

    // Set default number of teams
    if (EnvNumTeams > 0) {
      DP("Default number of teams set according to environment %d\n",
         EnvNumTeams);
      DeviceData[DeviceId].NumTeams = EnvNumTeams;
    } else {
      DeviceData[DeviceId].NumTeams = DeviceRTLTy::DefaultNumTeams;
      DP("Default number of teams set according to library's default %d\n",
         DeviceRTLTy::DefaultNumTeams);
    }

    if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) {
      DP("Default number of teams exceeds device limit, capping at %d\n",
         DeviceData[DeviceId].BlocksPerGrid);
      DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid;
    }

    // Set default number of threads
    DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads;
    DP("Default number of threads set according to library's default %d\n",
       DeviceRTLTy::DefaultNumThreads);
    if (DeviceData[DeviceId].NumThreads >
        DeviceData[DeviceId].ThreadsPerBlock) {
      DP("Default number of threads exceeds device limit, capping at %d\n",
         DeviceData[DeviceId].ThreadsPerBlock);
      DeviceData[DeviceId].NumThreads = DeviceData[DeviceId].ThreadsPerBlock;
    }

    return OFFLOAD_SUCCESS;
  }

  __tgt_target_table *loadBinary(const int DeviceId,
                                 const __tgt_device_image *Image) {
    // Set the context we are using
    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return nullptr;

    // Clear the offload table as we are going to create a new one.
    clearOffloadEntriesTable(DeviceId);

    // Create the module and extract the function pointers.
    CUmodule Module;
    DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart));
    Err = cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr);
    if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n"))
      return nullptr;

    DP("CUDA module successfully loaded!\n");

    Modules.push_back(Module);

    // Find the symbols in the module by name.
    const __tgt_offload_entry *HostBegin = Image->EntriesBegin;
    const __tgt_offload_entry *HostEnd = Image->EntriesEnd;

    for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) {
      if (!E->addr) {
        // We return nullptr when something like this happens; the host should
        // always have something in the address field to uniquely identify the
        // target region.
        DP("Invalid binary: host entry '<null>' (size = %zd)...\n", E->size);
        return nullptr;
      }

      if (E->size) {
        __tgt_offload_entry Entry = *E;
        CUdeviceptr CUPtr;
        size_t CUSize;
        Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name);
        // We keep this style here because we need the name
        if (Err != CUDA_SUCCESS) {
          DP("Loading global '%s' (Failed)\n", E->name);
          CUDA_ERR_STRING(Err);
          return nullptr;
        }

        if (CUSize != E->size) {
          DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name,
             CUSize, E->size);
          return nullptr;
        }

        DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
           DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr));

        Entry.addr = (void *)(CUPtr);

        // Note: In the current implementation declare target variables
        // can either be link or to. This means that once unified
        // memory is activated via the requires directive, the variable
        // can be used directly from the host in both cases.
        // TODO: when variable types other than to or link are added,
        // the below condition should be changed to explicitly
        // check for to and link variable types:
        // (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags &
        // OMP_DECLARE_TARGET_LINK || e->flags == OMP_DECLARE_TARGET_TO))
        if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
          // If unified memory is present any target link or to variables
          // can access host addresses directly. There is no longer a
          // need for device copies.
          cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *));
          DP("Copy linked variable host address (" DPxMOD
             ") to device address (" DPxMOD ")\n",
             DPxPTR(*((void **)E->addr)), DPxPTR(CUPtr));
        }

        addOffloadEntry(DeviceId, Entry);

        continue;
      }

      CUfunction Func;
      Err = cuModuleGetFunction(&Func, Module, E->name);
      // We keep this style here because we need the name
      if (Err != CUDA_SUCCESS) {
        DP("Loading '%s' (Failed)\n", E->name);
        CUDA_ERR_STRING(Err);
        return nullptr;
      }

      DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
         DPxPTR(E - HostBegin), E->name, DPxPTR(Func));

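      // The device RTL is expected to emit, for each kernel, an int8_t global
      // named "<kernel name>_exec_mode" holding 0 (SPMD) or 1 (Generic); it
      // determines whether a master warp is added at launch time.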
      // Default to GENERIC if the symbol is missing from the cubin file
      int8_t ExecModeVal = ExecutionModeType::GENERIC;
      std::string ExecModeNameStr(E->name);
      ExecModeNameStr += "_exec_mode";
      const char *ExecModeName = ExecModeNameStr.c_str();

      CUdeviceptr ExecModePtr;
      size_t CUSize;
      Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName);
      if (Err == CUDA_SUCCESS) {
        if (CUSize != sizeof(int8_t)) {
          DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
             ExecModeName, CUSize, sizeof(int8_t));
          return nullptr;
        }

        Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize);
        if (Err != CUDA_SUCCESS) {
          DP("Error when copying data from device to host. Pointers: "
             "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
             DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize);
          CUDA_ERR_STRING(Err);
          return nullptr;
        }

        if (ExecModeVal < 0 || ExecModeVal > 1) {
          DP("Error: wrong exec_mode value specified in cubin file: %d\n",
             ExecModeVal);
          return nullptr;
        }
      } else {
        DP("Loading global exec_mode '%s' - symbol missing, using default "
           "value GENERIC (1)\n",
           ExecModeName);
        CUDA_ERR_STRING(Err);
      }

      KernelsList.emplace_back(Func, ExecModeVal);

      __tgt_offload_entry Entry = *E;
      Entry.addr = &KernelsList.back();
      addOffloadEntry(DeviceId, Entry);
    }

    // Send the device environment data to the device
    {
      omptarget_device_environmentTy DeviceEnv{0};

#ifdef OMPTARGET_DEBUG
      if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG"))
        DeviceEnv.debug_level = std::stoi(EnvStr);
#endif

      const char *DeviceEnvName = "omptarget_device_environment";
      CUdeviceptr DeviceEnvPtr;
      size_t CUSize;

      Err = cuModuleGetGlobal(&DeviceEnvPtr, &CUSize, Module, DeviceEnvName);
      if (Err == CUDA_SUCCESS) {
        if (CUSize != sizeof(DeviceEnv)) {
          DP("Global device_environment '%s' - size mismatch (%zu != %zu)\n",
             DeviceEnvName, CUSize, sizeof(DeviceEnv));
          CUDA_ERR_STRING(Err);
          return nullptr;
        }

        Err = cuMemcpyHtoD(DeviceEnvPtr, &DeviceEnv, CUSize);
        if (Err != CUDA_SUCCESS) {
          DP("Error when copying data from host to device. Pointers: "
             "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
             DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize);
          CUDA_ERR_STRING(Err);
          return nullptr;
        }

        DP("Sending global device environment data %zu bytes\n", CUSize);
      } else {
        DP("Finding global device environment '%s' - symbol missing.\n",
           DeviceEnvName);
        DP("Continuing, assuming this is a device RTL that does not accept "
           "environment settings.\n");
      }
    }

    return getOffloadEntriesTable(DeviceId);
  }

  void *dataAlloc(const int DeviceId, const int64_t Size) const {
    if (Size == 0)
      return nullptr;

    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return nullptr;

    CUdeviceptr DevicePtr;
    Err = cuMemAlloc(&DevicePtr, Size);
    if (!checkResult(Err, "Error returned from cuMemAlloc\n"))
      return nullptr;

    return (void *)DevicePtr;
  }

  int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr,
                 const int64_t Size, __tgt_async_info *AsyncInfoPtr) const {
    assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");

    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    CUstream Stream = getStream(DeviceId, AsyncInfoPtr);

    Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream);
    if (Err != CUDA_SUCCESS) {
      DP("Error when copying data from host to device. Pointers: host = " DPxMOD
         ", device = " DPxMOD ", size = %" PRId64 "\n",
         DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
      CUDA_ERR_STRING(Err);
      return OFFLOAD_FAIL;
    }

    return OFFLOAD_SUCCESS;
  }

  int dataRetrieve(const int DeviceId, void *HstPtr, const void *TgtPtr,
                   const int64_t Size, __tgt_async_info *AsyncInfoPtr) const {
    assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");

    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    CUstream Stream = getStream(DeviceId, AsyncInfoPtr);

    Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
    if (Err != CUDA_SUCCESS) {
      DP("Error when copying data from device to host. Pointers: host = " DPxMOD
         ", device = " DPxMOD ", size = %" PRId64 "\n",
         DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
      CUDA_ERR_STRING(Err);
      return OFFLOAD_FAIL;
    }

    return OFFLOAD_SUCCESS;
  }

  int dataDelete(const int DeviceId, void *TgtPtr) const {
    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    Err = cuMemFree((CUdeviceptr)TgtPtr);
    if (!checkResult(Err, "Error returned from cuMemFree\n"))
      return OFFLOAD_FAIL;

    return OFFLOAD_SUCCESS;
  }

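  // Launch a target team region: pack the kernel arguments, derive the CUDA
  // grid/block sizes from the requested team number and thread limit (or from
  // the loop trip count and device defaults when they are not given), and
  // enqueue the kernel on the stream attached to AsyncInfo.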
  int runTargetTeamRegion(const int DeviceId, const void *TgtEntryPtr,
                          void **TgtArgs, ptrdiff_t *TgtOffsets,
                          const int ArgNum, const int TeamNum,
                          const int ThreadLimit,
                          const unsigned int LoopTripCount,
                          __tgt_async_info *AsyncInfo) const {
    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    // All args are passed by reference: cuLaunchKernel expects an array of
    // pointers to the argument values, so each Args[I] points to the
    // offset-adjusted device pointer held in Ptrs[I].
    std::vector<void *> Args(ArgNum);
    std::vector<void *> Ptrs(ArgNum);

    for (int I = 0; I < ArgNum; ++I) {
      Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
      Args[I] = &Ptrs[I];
    }

    const KernelTy *KernelInfo =
        reinterpret_cast<const KernelTy *>(TgtEntryPtr);

    unsigned int CudaThreadsPerBlock;
    if (ThreadLimit > 0) {
      DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
      CudaThreadsPerBlock = ThreadLimit;
      // Add master warp if necessary
      if (KernelInfo->ExecutionMode == GENERIC) {
        DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize);
        CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
      }
    } else {
      DP("Setting CUDA threads per block to default %d\n",
         DeviceData[DeviceId].NumThreads);
      CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads;
    }

    if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) {
      DP("Threads per block capped at device limit %d\n",
         DeviceData[DeviceId].ThreadsPerBlock);
      CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock;
    }

    int KernelLimit;
    Err = cuFuncGetAttribute(&KernelLimit,
                             CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                             KernelInfo->Func);
    if (Err == CUDA_SUCCESS && KernelLimit < CudaThreadsPerBlock) {
      DP("Threads per block capped at kernel limit %d\n", KernelLimit);
      CudaThreadsPerBlock = KernelLimit;
    }

    unsigned int CudaBlocksPerGrid;
    if (TeamNum <= 0) {
      if (LoopTripCount > 0 && EnvNumTeams < 0) {
        if (KernelInfo->ExecutionMode == SPMD) {
          // We have a combined construct, i.e. `target teams distribute
          // parallel for [simd]`. We launch so many teams that each thread
          // will execute one iteration of the loop, rounded up to the nearest
          // integer.
          CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1;
        } else {
          // If we reach this point, then we have a non-combined construct, i.e.
          // `teams distribute` with a nested `parallel for` and each team is
          // assigned one iteration of the `distribute` loop. E.g.:
          //
          // #pragma omp target teams distribute
          // for(...loop_tripcount...) {
          //   #pragma omp parallel for
          //   for(...) {}
          // }
          //
          // Threads within a team will execute the iterations of the `parallel`
          // loop.
          CudaBlocksPerGrid = LoopTripCount;
        }
        DP("Using %d teams due to loop trip count %u and number of threads "
           "per block %d\n",
           CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock);
      } else {
        DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams);
        CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams;
      }
    } else if (TeamNum > DeviceData[DeviceId].BlocksPerGrid) {
      DP("Capping number of teams to team limit %d\n",
         DeviceData[DeviceId].BlocksPerGrid);
      CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid;
    } else {
      DP("Using requested number of teams %d\n", TeamNum);
      CudaBlocksPerGrid = TeamNum;
    }

    // Run on the device.
    DP("Launch kernel with %d blocks and %d threads\n", CudaBlocksPerGrid,
       CudaThreadsPerBlock);

    CUstream Stream = getStream(DeviceId, AsyncInfo);
    Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1,
                         /* gridDimZ */ 1, CudaThreadsPerBlock,
                         /* blockDimY */ 1, /* blockDimZ */ 1,
                         /* sharedMemBytes */ 0, Stream, &Args[0], nullptr);
    if (!checkResult(Err, "Error returned from cuLaunchKernel\n"))
      return OFFLOAD_FAIL;

    DP("Launch of entry point at " DPxMOD " successful!\n",
       DPxPTR(TgtEntryPtr));

    return OFFLOAD_SUCCESS;
  }

  int synchronize(const int DeviceId, __tgt_async_info *AsyncInfoPtr) const {
    CUstream Stream = reinterpret_cast<CUstream>(AsyncInfoPtr->Queue);
    CUresult Err = cuStreamSynchronize(Stream);
    if (Err != CUDA_SUCCESS) {
      DP("Error when synchronizing stream. stream = " DPxMOD
         ", async info ptr = " DPxMOD "\n",
         DPxPTR(Stream), DPxPTR(AsyncInfoPtr));
      CUDA_ERR_STRING(Err);
      return OFFLOAD_FAIL;
    }

    // Once the stream is synchronized, return it to the pool and reset the
    // async info. This ensures that a later synchronization on the same async
    // info only waits for its own tasks.
    StreamManager->returnStream(
        DeviceId, reinterpret_cast<CUstream>(AsyncInfoPtr->Queue));
    AsyncInfoPtr->Queue = nullptr;

    return OFFLOAD_SUCCESS;
  }
};

DeviceRTLTy DeviceRTL;
} // namespace

// Exposed library API functions
#ifdef __cplusplus
extern "C" {
#endif

int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
  return elf_check_machine(image, /* EM_CUDA */ 190);
}

int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); }

int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
  DP("Init requires flags to %" PRId64 "\n", RequiresFlags);
  DeviceRTL.setRequiresFlag(RequiresFlags);
  return RequiresFlags;
}

int32_t __tgt_rtl_init_device(int32_t device_id) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.initDevice(device_id);
}

__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
                                          __tgt_device_image *image) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.loadBinary(device_id, image);
}

void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.dataAlloc(device_id, size);
}

int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
                              int64_t size) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info async_info;
  const int32_t rc = __tgt_rtl_data_submit_async(device_id, tgt_ptr, hst_ptr,
                                                 size, &async_info);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &async_info);
}

int32_t __tgt_rtl_data_submit_async(int32_t device_id, void *tgt_ptr,
                                    void *hst_ptr, int64_t size,
                                    __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");

  return DeviceRTL.dataSubmit(device_id, tgt_ptr, hst_ptr, size,
                              async_info_ptr);
}

int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
                                int64_t size) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info async_info;
  const int32_t rc = __tgt_rtl_data_retrieve_async(device_id, hst_ptr, tgt_ptr,
                                                   size, &async_info);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &async_info);
}

int32_t __tgt_rtl_data_retrieve_async(int32_t device_id, void *hst_ptr,
                                      void *tgt_ptr, int64_t size,
                                      __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");

  return DeviceRTL.dataRetrieve(device_id, hst_ptr, tgt_ptr, size,
                                async_info_ptr);
}

int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.dataDelete(device_id, tgt_ptr);
}

int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
                                         void **tgt_args,
                                         ptrdiff_t *tgt_offsets,
                                         int32_t arg_num, int32_t team_num,
                                         int32_t thread_limit,
                                         uint64_t loop_tripcount) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info async_info;
  const int32_t rc = __tgt_rtl_run_target_team_region_async(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
      thread_limit, loop_tripcount, &async_info);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &async_info);
}

int32_t __tgt_rtl_run_target_team_region_async(
    int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
    ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
    int32_t thread_limit, uint64_t loop_tripcount,
    __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return DeviceRTL.runTargetTeamRegion(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
      thread_limit, loop_tripcount, async_info_ptr);
}

int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
                                    void **tgt_args, ptrdiff_t *tgt_offsets,
                                    int32_t arg_num) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  __tgt_async_info async_info;
  const int32_t rc = __tgt_rtl_run_target_region_async(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, &async_info);
  if (rc != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  return __tgt_rtl_synchronize(device_id, &async_info);
}

int32_t __tgt_rtl_run_target_region_async(int32_t device_id,
                                          void *tgt_entry_ptr, void **tgt_args,
                                          ptrdiff_t *tgt_offsets,
                                          int32_t arg_num,
                                          __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");

  return __tgt_rtl_run_target_team_region_async(
      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num,
      /* team num */ 1, /* thread_limit */ 1, /* loop_tripcount */ 0,
      async_info_ptr);
}

int32_t __tgt_rtl_synchronize(int32_t device_id,
                              __tgt_async_info *async_info_ptr) {
  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
  assert(async_info_ptr && "async_info_ptr is nullptr");
  assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr");

  return DeviceRTL.synchronize(device_id, async_info_ptr);
}

#ifdef __cplusplus
}
#endif