//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// RTL for CUDA machine
//
//===----------------------------------------------------------------------===//

#include <atomic>
#include <cassert>
#include <cstddef>
#include <cuda.h>
#include <list>
#include <memory>
#include <string>
#include <vector>

#include "omptargetplugin.h"

#ifndef TARGET_NAME
#define TARGET_NAME CUDA
#endif

#ifdef OMPTARGET_DEBUG
static int DebugLevel = 0;

#define GETNAME2(name) #name
#define GETNAME(name) GETNAME2(name)
#define DP(...) \
  do { \
    if (DebugLevel > 0) { \
      DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", __VA_ARGS__); \
    } \
  } while (false)

// Utility for retrieving and printing CUDA error string.
#define CUDA_ERR_STRING(err) \
  do { \
    if (DebugLevel > 0) { \
      const char *errStr; \
      cuGetErrorString(err, &errStr); \
      DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", "CUDA error is: %s\n", \
             errStr); \
    } \
  } while (false)
#else // OMPTARGET_DEBUG
#define DP(...) {}
#define CUDA_ERR_STRING(err) {}
#endif // OMPTARGET_DEBUG

#include "../../common/elf_common.c"

/// Keep entries table per device.
struct FuncOrGblEntryTy {
  __tgt_target_table Table;
  std::vector<__tgt_offload_entry> Entries;
};

enum ExecutionModeType {
  SPMD, // constructors, destructors,
        // combined constructs (`teams distribute parallel for [simd]`)
  GENERIC, // everything else
  NONE
};

/// Use a single entity to encode a kernel and a set of flags.
struct KernelTy {
  CUfunction Func;

  // execution mode of kernel
  // 0 - SPMD mode (without master warp)
  // 1 - Generic mode (with master warp)
  int8_t ExecutionMode;

  KernelTy(CUfunction _Func, int8_t _ExecutionMode)
      : Func(_Func), ExecutionMode(_ExecutionMode) {}
};

/// Device environment data
/// Manually sync with the deviceRTL side for now, move to a dedicated header
/// file later.
struct omptarget_device_environmentTy {
  int32_t debug_level;
};

/// List that contains all the kernels.
/// FIXME: we may need this to be per device and per library.
std::list<KernelTy> KernelsList;

/// Class containing all the device information.
class RTLDeviceInfoTy {
  std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries;
  std::vector<std::unique_ptr<std::atomic_uint>> NextStreamId;

public:
  int NumberOfDevices;
  std::vector<CUmodule> Modules;
  std::vector<CUcontext> Contexts;
  std::vector<std::vector<CUstream>> Streams;

  // Device properties
  std::vector<int> ThreadsPerBlock;
  std::vector<int> BlocksPerGrid;
  std::vector<int> WarpSize;

  // OpenMP properties
  std::vector<int> NumTeams;
  std::vector<int> NumThreads;

  // OpenMP Environment properties
  int EnvNumTeams;
  int EnvTeamLimit;
  int EnvNumStreams;

  // OpenMP Requires Flags
  int64_t RequiresFlags;

  //static int EnvNumThreads;
  static const int HardTeamLimit = 1 << 16; // 64k
  static const int HardThreadLimit = 1024;
  static const int DefaultNumTeams = 128;
  static const int DefaultNumThreads = 128;

  // Record entry point associated with device
  void addOffloadEntry(int32_t device_id, __tgt_offload_entry entry) {
    assert(device_id < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();

    E.Entries.push_back(entry);
  }

  // Return true if the entry is associated with device
  bool findOffloadEntry(int32_t device_id, void *addr) {
    assert(device_id < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();

    for (auto &it : E.Entries) {
      if (it.addr == addr)
        return true;
    }

    return false;
  }

  // Return the pointer to the target entries table
  __tgt_target_table *getOffloadEntriesTable(int32_t device_id) {
    assert(device_id < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();

    int32_t size = E.Entries.size();

    // Table is empty
    if (!size)
      return 0;

    __tgt_offload_entry *begin = &E.Entries[0];
    __tgt_offload_entry *end = &E.Entries[size - 1];

    // Update table info according to the entries and return the pointer
    E.Table.EntriesBegin = begin;
    E.Table.EntriesEnd = ++end;

    return &E.Table;
  }

  // Clear entries table for a device
  void clearOffloadEntriesTable(int32_t device_id) {
    assert(device_id < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncGblEntries[device_id].emplace_back();
    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
    E.Entries.clear();
    E.Table.EntriesBegin = E.Table.EntriesEnd = 0;
  }

  // Get the next stream on a given device in a round robin manner
  CUstream &getNextStream(const int DeviceId) {
    assert(DeviceId >= 0 &&
           static_cast<size_t>(DeviceId) < NextStreamId.size() &&
           "Unexpected device id!");
    const unsigned int Id = NextStreamId[DeviceId]->fetch_add(1);
    return Streams[DeviceId][Id % EnvNumStreams];
  }

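  // Discover the available CUDA devices, read the relevant environment
  // variables, and size the per-device bookkeeping structures.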
  RTLDeviceInfoTy() {
#ifdef OMPTARGET_DEBUG
    if (char *envStr = getenv("LIBOMPTARGET_DEBUG")) {
      DebugLevel = std::stoi(envStr);
    }
#endif // OMPTARGET_DEBUG

    DP("Start initializing CUDA\n");

    CUresult err = cuInit(0);
    if (err != CUDA_SUCCESS) {
      DP("Error when initializing CUDA\n");
      CUDA_ERR_STRING(err);
      return;
    }

    NumberOfDevices = 0;

    err = cuDeviceGetCount(&NumberOfDevices);
    if (err != CUDA_SUCCESS) {
      DP("Error when getting CUDA device count\n");
      CUDA_ERR_STRING(err);
      return;
    }

    if (NumberOfDevices == 0) {
      DP("There are no devices supporting CUDA.\n");
      return;
    }

    FuncGblEntries.resize(NumberOfDevices);
    Contexts.resize(NumberOfDevices);
    Streams.resize(NumberOfDevices);
    NextStreamId.resize(NumberOfDevices);
    ThreadsPerBlock.resize(NumberOfDevices);
    BlocksPerGrid.resize(NumberOfDevices);
    WarpSize.resize(NumberOfDevices);
    NumTeams.resize(NumberOfDevices);
    NumThreads.resize(NumberOfDevices);

    // Get environment variables regarding teams
    char *envStr = getenv("OMP_TEAM_LIMIT");
    if (envStr) {
      // OMP_TEAM_LIMIT has been set
      EnvTeamLimit = std::stoi(envStr);
      DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
    } else {
      EnvTeamLimit = -1;
    }
    envStr = getenv("OMP_NUM_TEAMS");
    if (envStr) {
      // OMP_NUM_TEAMS has been set
      EnvNumTeams = std::stoi(envStr);
      DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
    } else {
      EnvNumTeams = -1;
    }

    // By default let's create 256 streams per device
    EnvNumStreams = 256;
    envStr = getenv("LIBOMPTARGET_NUM_STREAMS");
    if (envStr) {
      EnvNumStreams = std::stoi(envStr);
    }

    // Initialize streams for each device
    for (std::vector<CUstream> &S : Streams) {
      S.resize(EnvNumStreams);
    }

    // Initialize the next stream id
    for (std::unique_ptr<std::atomic_uint> &Ptr : NextStreamId) {
      Ptr = std::make_unique<std::atomic_uint>(0);
    }

    // Default state.
    RequiresFlags = OMP_REQ_UNDEFINED;
  }

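  // Tear down in reverse order of creation: unload modules, destroy each
  // device's streams, then destroy the contexts.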
  ~RTLDeviceInfoTy() {
    // Close modules
    for (auto &module : Modules)
      if (module) {
        CUresult err = cuModuleUnload(module);
        if (err != CUDA_SUCCESS) {
          DP("Error when unloading CUDA module\n");
          CUDA_ERR_STRING(err);
        }
      }

    // Destroy streams before contexts
    for (int I = 0; I < NumberOfDevices; ++I) {
      CUresult err = cuCtxSetCurrent(Contexts[I]);
      if (err != CUDA_SUCCESS) {
        DP("Error when setting current CUDA context\n");
        CUDA_ERR_STRING(err);
      }

      for (auto &S : Streams[I])
        if (S) {
          err = cuStreamDestroy(S);
          if (err != CUDA_SUCCESS) {
            DP("Error when destroying CUDA stream\n");
            CUDA_ERR_STRING(err);
          }
        }
    }

    // Destroy contexts
    for (auto &ctx : Contexts)
      if (ctx) {
        CUresult err = cuCtxDestroy(ctx);
        if (err != CUDA_SUCCESS) {
          DP("Error when destroying CUDA context\n");
          CUDA_ERR_STRING(err);
        }
      }
  }
};

static RTLDeviceInfoTy DeviceInfo;

#ifdef __cplusplus
extern "C" {
#endif

int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
  return elf_check_machine(image, 190); // EM_CUDA = 190.
}

int32_t __tgt_rtl_number_of_devices() { return DeviceInfo.NumberOfDevices; }

int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
  DP("Init requires flags to %ld\n", RequiresFlags);
  DeviceInfo.RequiresFlags = RequiresFlags;
  return RequiresFlags;
}

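// Initialize the given device: create its context and streams, and query the
// launch limits (blocks per grid, threads per block, warp size).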
int32_t __tgt_rtl_init_device(int32_t device_id) {

  CUdevice cuDevice;
  DP("Getting device %d\n", device_id);
  CUresult err = cuDeviceGet(&cuDevice, device_id);
  if (err != CUDA_SUCCESS) {
    DP("Error when getting CUDA device with id = %d\n", device_id);
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  // Create the context and save it to use whenever this device is selected.
  err = cuCtxCreate(&DeviceInfo.Contexts[device_id], CU_CTX_SCHED_BLOCKING_SYNC,
                    cuDevice);
  if (err != CUDA_SUCCESS) {
    DP("Error when creating a CUDA context\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error when setting current CUDA context\n");
    CUDA_ERR_STRING(err);
  }

  for (CUstream &Stream : DeviceInfo.Streams[device_id]) {
    err = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING);
    if (err != CUDA_SUCCESS) {
      DP("Error when creating CUDA stream\n");
      CUDA_ERR_STRING(err);
    }
  }

  // Query attributes to determine number of threads/block and blocks/grid.
  int maxGridDimX;
  err = cuDeviceGetAttribute(&maxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
                             cuDevice);
  if (err != CUDA_SUCCESS) {
    DP("Error getting max grid dimension, use default\n");
    DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::DefaultNumTeams;
  } else if (maxGridDimX <= RTLDeviceInfoTy::HardTeamLimit) {
    DeviceInfo.BlocksPerGrid[device_id] = maxGridDimX;
    DP("Using %d CUDA blocks per grid\n", maxGridDimX);
  } else {
    DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::HardTeamLimit;
    DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping "
       "at the hard limit\n",
       maxGridDimX, RTLDeviceInfoTy::HardTeamLimit);
  }

  // We are only exploiting threads along the x axis.
  int maxBlockDimX;
  err = cuDeviceGetAttribute(&maxBlockDimX, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X,
                             cuDevice);
  if (err != CUDA_SUCCESS) {
    DP("Error getting max block dimension, use default\n");
    DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::DefaultNumThreads;
  } else if (maxBlockDimX <= RTLDeviceInfoTy::HardThreadLimit) {
    DeviceInfo.ThreadsPerBlock[device_id] = maxBlockDimX;
    DP("Using %d CUDA threads per block\n", maxBlockDimX);
  } else {
    DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::HardThreadLimit;
    DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
       "capping at the hard limit\n",
       maxBlockDimX, RTLDeviceInfoTy::HardThreadLimit);
  }

  int warpSize;
  err =
      cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, cuDevice);
  if (err != CUDA_SUCCESS) {
    DP("Error getting warp size, assume default\n");
    DeviceInfo.WarpSize[device_id] = 32;
  } else {
    DeviceInfo.WarpSize[device_id] = warpSize;
  }

  // Adjust teams to the env variables
  if (DeviceInfo.EnvTeamLimit > 0 &&
      DeviceInfo.BlocksPerGrid[device_id] > DeviceInfo.EnvTeamLimit) {
    DeviceInfo.BlocksPerGrid[device_id] = DeviceInfo.EnvTeamLimit;
    DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
       DeviceInfo.EnvTeamLimit);
  }

  DP("Max number of CUDA blocks %d, threads %d & warp size %d\n",
     DeviceInfo.BlocksPerGrid[device_id], DeviceInfo.ThreadsPerBlock[device_id],
     DeviceInfo.WarpSize[device_id]);

  // Set default number of teams
  if (DeviceInfo.EnvNumTeams > 0) {
    DeviceInfo.NumTeams[device_id] = DeviceInfo.EnvNumTeams;
    DP("Default number of teams set according to environment %d\n",
       DeviceInfo.EnvNumTeams);
  } else {
    DeviceInfo.NumTeams[device_id] = RTLDeviceInfoTy::DefaultNumTeams;
    DP("Default number of teams set according to library's default %d\n",
       RTLDeviceInfoTy::DefaultNumTeams);
  }
  if (DeviceInfo.NumTeams[device_id] > DeviceInfo.BlocksPerGrid[device_id]) {
    DeviceInfo.NumTeams[device_id] = DeviceInfo.BlocksPerGrid[device_id];
    DP("Default number of teams exceeds device limit, capping at %d\n",
       DeviceInfo.BlocksPerGrid[device_id]);
  }

  // Set default number of threads
  DeviceInfo.NumThreads[device_id] = RTLDeviceInfoTy::DefaultNumThreads;
  DP("Default number of threads set according to library's default %d\n",
     RTLDeviceInfoTy::DefaultNumThreads);
  if (DeviceInfo.NumThreads[device_id] >
      DeviceInfo.ThreadsPerBlock[device_id]) {
    DeviceInfo.NumThreads[device_id] = DeviceInfo.ThreadsPerBlock[device_id];
    DP("Default number of threads exceeds device limit, capping at %d\n",
       DeviceInfo.ThreadsPerBlock[device_id]);
  }

  return OFFLOAD_SUCCESS;
}

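// Load the provided device image (cubin) onto the device: resolve global
// variables and kernel entry points by name and build the offload entries
// table that libomptarget will use.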
__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
    __tgt_device_image *image) {

  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error when setting a CUDA context for device %d\n", device_id);
    CUDA_ERR_STRING(err);
    return NULL;
  }

  // Clear the offload table as we are going to create a new one.
  DeviceInfo.clearOffloadEntriesTable(device_id);

  // Create the module and extract the function pointers.

  CUmodule cumod;
  DP("Load data from image " DPxMOD "\n", DPxPTR(image->ImageStart));
  err = cuModuleLoadDataEx(&cumod, image->ImageStart, 0, NULL, NULL);
  if (err != CUDA_SUCCESS) {
    DP("Error when loading CUDA module\n");
    CUDA_ERR_STRING(err);
    return NULL;
  }

  DP("CUDA module successfully loaded!\n");
  DeviceInfo.Modules.push_back(cumod);

  // Find the symbols in the module by name.
  __tgt_offload_entry *HostBegin = image->EntriesBegin;
  __tgt_offload_entry *HostEnd = image->EntriesEnd;

  for (__tgt_offload_entry *e = HostBegin; e != HostEnd; ++e) {

    if (!e->addr) {
      // We return NULL when something like this happens; the host should
      // always have something in the address to uniquely identify the target
      // region.
      DP("Invalid binary: host entry '<null>' (size = %zd)...\n", e->size);

      return NULL;
    }

    if (e->size) {
      __tgt_offload_entry entry = *e;

      CUdeviceptr cuptr;
      size_t cusize;
      err = cuModuleGetGlobal(&cuptr, &cusize, cumod, e->name);

      if (err != CUDA_SUCCESS) {
        DP("Loading global '%s' (Failed)\n", e->name);
        CUDA_ERR_STRING(err);
        return NULL;
      }

      if (cusize != e->size) {
        DP("Loading global '%s' - size mismatch (%zd != %zd)\n", e->name,
           cusize, e->size);
        CUDA_ERR_STRING(err);
        return NULL;
      }

      DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
         DPxPTR(e - HostBegin), e->name, DPxPTR(cuptr));
      entry.addr = (void *)cuptr;

      // Note: In the current implementation declare target variables
      // can either be link or to. This means that once unified
      // memory is activated via the requires directive, the variable
      // can be used directly from the host in both cases.
      // TODO: when variable types other than to or link are added,
      // the below condition should be changed to explicitly
      // check for to and link variable types:
      //  (DeviceInfo.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
      //   (e->flags & OMP_DECLARE_TARGET_LINK ||
      //    e->flags == OMP_DECLARE_TARGET_TO))
      if (DeviceInfo.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
        // If unified memory is present, any target link or to variables
        // can access host addresses directly. There is no longer a
        // need for device copies.
        cuMemcpyHtoD(cuptr, e->addr, sizeof(void *));
        DP("Copy linked variable host address (" DPxMOD ") "
           "to device address (" DPxMOD ")\n",
           DPxPTR(*((void **)e->addr)), DPxPTR(cuptr));
      }

      DeviceInfo.addOffloadEntry(device_id, entry);

      continue;
    }

    CUfunction fun;
    err = cuModuleGetFunction(&fun, cumod, e->name);

    if (err != CUDA_SUCCESS) {
      DP("Loading '%s' (Failed)\n", e->name);
      CUDA_ERR_STRING(err);
      return NULL;
    }

    DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
       DPxPTR(e - HostBegin), e->name, DPxPTR(fun));

    // Default value GENERIC (in case the symbol is missing from the cubin
    // file).
    int8_t ExecModeVal = ExecutionModeType::GENERIC;
    std::string ExecModeNameStr(e->name);
    ExecModeNameStr += "_exec_mode";
    const char *ExecModeName = ExecModeNameStr.c_str();

    CUdeviceptr ExecModePtr;
    size_t cusize;
    err = cuModuleGetGlobal(&ExecModePtr, &cusize, cumod, ExecModeName);
    if (err == CUDA_SUCCESS) {
      if ((size_t)cusize != sizeof(int8_t)) {
        DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
           ExecModeName, cusize, sizeof(int8_t));
        CUDA_ERR_STRING(err);
        return NULL;
      }

      err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, cusize);
      if (err != CUDA_SUCCESS) {
        DP("Error when copying data from device to host. Pointers: "
           "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
           DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), cusize);
        CUDA_ERR_STRING(err);
        return NULL;
      }

      if (ExecModeVal < 0 || ExecModeVal > 1) {
        DP("Error wrong exec_mode value specified in cubin file: %d\n",
           ExecModeVal);
        return NULL;
      }
    } else {
      DP("Loading global exec_mode '%s' - symbol missing, using default value "
         "GENERIC (1)\n", ExecModeName);
      CUDA_ERR_STRING(err);
    }

    KernelsList.push_back(KernelTy(fun, ExecModeVal));

    __tgt_offload_entry entry = *e;
    entry.addr = (void *)&KernelsList.back();
    DeviceInfo.addOffloadEntry(device_id, entry);
  }

  // Send the device environment data to the device.
  {
    omptarget_device_environmentTy device_env;

    device_env.debug_level = 0;

#ifdef OMPTARGET_DEBUG
    if (char *envStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) {
      device_env.debug_level = std::stoi(envStr);
    }
#endif

    const char *device_env_Name = "omptarget_device_environment";
    CUdeviceptr device_env_Ptr;
    size_t cusize;

    err = cuModuleGetGlobal(&device_env_Ptr, &cusize, cumod, device_env_Name);

    if (err == CUDA_SUCCESS) {
      if ((size_t)cusize != sizeof(device_env)) {
        DP("Global device_environment '%s' - size mismatch (%zu != %zu)\n",
           device_env_Name, cusize, sizeof(device_env));
        CUDA_ERR_STRING(err);
        return NULL;
      }

      err = cuMemcpyHtoD(device_env_Ptr, &device_env, cusize);
      if (err != CUDA_SUCCESS) {
        DP("Error when copying data from host to device. Pointers: "
           "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
           DPxPTR(&device_env), DPxPTR(device_env_Ptr), cusize);
        CUDA_ERR_STRING(err);
        return NULL;
      }

      DP("Sending global device environment data %zu bytes\n", (size_t)cusize);
    } else {
      DP("Finding global device environment '%s' - symbol missing.\n",
         device_env_Name);
      DP("Continue, considering this is a device RTL which does not accept "
         "environment setting.\n");
    }
  }

  return DeviceInfo.getOffloadEntriesTable(device_id);
}

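// Allocate 'size' bytes of device memory and return the device pointer, or
// NULL on failure (or when size is 0). 'hst_ptr' is not used here.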
void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *hst_ptr) {
  if (size == 0) {
    return NULL;
  }

  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error while trying to set CUDA current context\n");
    CUDA_ERR_STRING(err);
    return NULL;
  }

  CUdeviceptr ptr;
  err = cuMemAlloc(&ptr, size);
  if (err != CUDA_SUCCESS) {
    DP("Error while trying to allocate %d\n", err);
    CUDA_ERR_STRING(err);
    return NULL;
  }

  void *vptr = (void *)ptr;
  return vptr;
}

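// Copy 'size' bytes from the host buffer to the device buffer. The copy is
// issued asynchronously on one of the device's streams and then synchronized,
// so the transfer is complete on return.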
int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
    int64_t size) {
  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error when setting CUDA context\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  CUstream &Stream = DeviceInfo.getNextStream(device_id);

  err = cuMemcpyHtoDAsync((CUdeviceptr)tgt_ptr, hst_ptr, size, Stream);
  if (err != CUDA_SUCCESS) {
    DP("Error when copying data from host to device. Pointers: host = " DPxMOD
       ", device = " DPxMOD ", size = %" PRId64 "\n",
       DPxPTR(hst_ptr), DPxPTR(tgt_ptr), size);
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  err = cuStreamSynchronize(Stream);
  if (err != CUDA_SUCCESS) {
    DP("Error when synchronizing async data transfer from host to device. "
       "Pointers: host = " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
       DPxPTR(hst_ptr), DPxPTR(tgt_ptr), size);
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

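// Copy 'size' bytes from the device buffer back to the host buffer, again via
// an asynchronous copy on a round-robin stream followed by a synchronize.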
int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
    int64_t size) {
  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error when setting CUDA context\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  CUstream &Stream = DeviceInfo.getNextStream(device_id);

  err = cuMemcpyDtoHAsync(hst_ptr, (CUdeviceptr)tgt_ptr, size, Stream);
  if (err != CUDA_SUCCESS) {
    DP("Error when copying data from device to host. Pointers: host = " DPxMOD
       ", device = " DPxMOD ", size = %" PRId64 "\n",
       DPxPTR(hst_ptr), DPxPTR(tgt_ptr), size);
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  err = cuStreamSynchronize(Stream);
  if (err != CUDA_SUCCESS) {
    DP("Error when synchronizing async data transfer from device to host. "
       "Pointers: host = " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
       DPxPTR(hst_ptr), DPxPTR(tgt_ptr), size);
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

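// Free device memory previously allocated by __tgt_rtl_data_alloc.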
int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error when setting CUDA context\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  err = cuMemFree((CUdeviceptr)tgt_ptr);
  if (err != CUDA_SUCCESS) {
    DP("Error when freeing CUDA memory\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }
  return OFFLOAD_SUCCESS;
}

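// Launch the kernel identified by 'tgt_entry_ptr'. The threads-per-block and
// blocks-per-grid values are derived from 'thread_limit', 'team_num', the loop
// trip count, the kernel's execution mode, and the device and kernel limits.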
int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
    void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
    int32_t thread_limit, uint64_t loop_tripcount) {
  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error when setting CUDA context\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  // All args are references.
  std::vector<void *> args(arg_num);
  std::vector<void *> ptrs(arg_num);

  for (int32_t i = 0; i < arg_num; ++i) {
    ptrs[i] = (void *)((intptr_t)tgt_args[i] + tgt_offsets[i]);
    args[i] = &ptrs[i];
  }

  KernelTy *KernelInfo = (KernelTy *)tgt_entry_ptr;

  int cudaThreadsPerBlock;

  if (thread_limit > 0) {
    cudaThreadsPerBlock = thread_limit;
    DP("Setting CUDA threads per block to requested %d\n", thread_limit);
    // Add master warp if necessary
    if (KernelInfo->ExecutionMode == GENERIC) {
      cudaThreadsPerBlock += DeviceInfo.WarpSize[device_id];
      DP("Adding master warp: +%d threads\n", DeviceInfo.WarpSize[device_id]);
    }
  } else {
    cudaThreadsPerBlock = DeviceInfo.NumThreads[device_id];
    DP("Setting CUDA threads per block to default %d\n",
       DeviceInfo.NumThreads[device_id]);
  }

  if (cudaThreadsPerBlock > DeviceInfo.ThreadsPerBlock[device_id]) {
    cudaThreadsPerBlock = DeviceInfo.ThreadsPerBlock[device_id];
    DP("Threads per block capped at device limit %d\n",
       DeviceInfo.ThreadsPerBlock[device_id]);
  }

  int kernel_limit;
  err = cuFuncGetAttribute(&kernel_limit,
      CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, KernelInfo->Func);
  if (err == CUDA_SUCCESS) {
    if (kernel_limit < cudaThreadsPerBlock) {
      cudaThreadsPerBlock = kernel_limit;
      DP("Threads per block capped at kernel limit %d\n", kernel_limit);
    }
  }

  int cudaBlocksPerGrid;
  if (team_num <= 0) {
    if (loop_tripcount > 0 && DeviceInfo.EnvNumTeams < 0) {
      if (KernelInfo->ExecutionMode == SPMD) {
        // We have a combined construct, i.e. `target teams distribute parallel
        // for [simd]`. We launch enough teams so that each thread executes one
        // iteration of the loop; round up to the nearest integer.
        cudaBlocksPerGrid = ((loop_tripcount - 1) / cudaThreadsPerBlock) + 1;
      } else {
        // If we reach this point, then we have a non-combined construct, i.e.
        // `teams distribute` with a nested `parallel for` and each team is
        // assigned one iteration of the `distribute` loop. E.g.:
        //
        // #pragma omp target teams distribute
        // for(...loop_tripcount...) {
        //   #pragma omp parallel for
        //   for(...) {}
        // }
        //
        // Threads within a team will execute the iterations of the `parallel`
        // loop.
        cudaBlocksPerGrid = loop_tripcount;
      }
      DP("Using %d teams due to loop trip count %" PRIu64 " and number of "
         "threads per block %d\n", cudaBlocksPerGrid, loop_tripcount,
         cudaThreadsPerBlock);
    } else {
      cudaBlocksPerGrid = DeviceInfo.NumTeams[device_id];
      DP("Using default number of teams %d\n", DeviceInfo.NumTeams[device_id]);
    }
  } else if (team_num > DeviceInfo.BlocksPerGrid[device_id]) {
    cudaBlocksPerGrid = DeviceInfo.BlocksPerGrid[device_id];
    DP("Capping number of teams to team limit %d\n",
       DeviceInfo.BlocksPerGrid[device_id]);
  } else {
    cudaBlocksPerGrid = team_num;
    DP("Using requested number of teams %d\n", team_num);
  }

  // Run on the device.
  DP("Launch kernel with %d blocks and %d threads\n", cudaBlocksPerGrid,
     cudaThreadsPerBlock);

  CUstream &Stream = DeviceInfo.getNextStream(device_id);

  err = cuLaunchKernel(KernelInfo->Func, cudaBlocksPerGrid, 1, 1,
                       cudaThreadsPerBlock, 1, 1, 0 /*bytes of shared memory*/,
                       Stream, &args[0], 0);
  if (err != CUDA_SUCCESS) {
    DP("Device kernel launch failed!\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  DP("Launch of entry point at " DPxMOD " successful!\n",
     DPxPTR(tgt_entry_ptr));

  CUresult sync_err = cuStreamSynchronize(Stream);
  if (sync_err != CUDA_SUCCESS) {
    DP("Kernel execution error at " DPxMOD "!\n", DPxPTR(tgt_entry_ptr));
    CUDA_ERR_STRING(sync_err);
    return OFFLOAD_FAIL;
  } else {
    DP("Kernel execution at " DPxMOD " successful!\n", DPxPTR(tgt_entry_ptr));
  }

  return OFFLOAD_SUCCESS;
}

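// Run a non-teams target region: a single team with the default thread count.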
int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
    void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num) {
  // Use one team and the default number of threads.
  const int32_t team_num = 1;
  const int32_t thread_limit = 0;
  return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args,
      tgt_offsets, arg_num, team_num, thread_limit, 0);
}

#ifdef __cplusplus
}
#endif