//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//
//
// RTL for CUDA machine
//
//===----------------------------------------------------------------------===//

#include <cassert>
#include <cinttypes>
#include <cstddef>
#include <cuda.h>
#include <list>
#include <string>
#include <vector>

#include "omptargetplugin.h"

#ifndef TARGET_NAME
#define TARGET_NAME CUDA
#endif

#ifdef OMPTARGET_DEBUG
static int DebugLevel = 0;

#define GETNAME2(name) #name
#define GETNAME(name) GETNAME2(name)
#define DP(...) \
  do { \
    if (DebugLevel > 0) { \
      DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", __VA_ARGS__); \
    } \
  } while (false)
#else // OMPTARGET_DEBUG
#define DP(...) {}
#endif // OMPTARGET_DEBUG
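
// DP() emits debug output only when DebugLevel > 0; DebugLevel is parsed from
// the LIBOMPTARGET_DEBUG environment variable in RTLDeviceInfoTy's constructor
// below, so debug messages are available only in OMPTARGET_DEBUG builds.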

#include "../../common/elf_common.c"

// Utility for retrieving and printing the CUDA error string.
#ifdef CUDA_ERROR_REPORT
#define CUDA_ERR_STRING(err)                                                   \
  do {                                                                         \
    const char *errStr;                                                        \
    cuGetErrorString(err, &errStr);                                            \
    DP("CUDA error is: %s\n", errStr);                                         \
  } while (0)
#else
#define CUDA_ERR_STRING(err)                                                   \
  {}
#endif

/// Keep entries table per device.
struct FuncOrGblEntryTy {
  __tgt_target_table Table;
  std::vector<__tgt_offload_entry> Entries;
};

enum ExecutionModeType {
  SPMD,    // constructors, destructors,
           // combined constructs (`teams distribute parallel for [simd]`)
  GENERIC, // everything else
  NONE
};

/// Use a single entity to encode a kernel and a set of flags.
struct KernelTy {
  CUfunction Func;

  // Execution mode of the kernel:
  // 0 - SPMD mode (without master warp)
  // 1 - Generic mode (with master warp)
  int8_t ExecutionMode;

  KernelTy(CUfunction _Func, int8_t _ExecutionMode)
      : Func(_Func), ExecutionMode(_ExecutionMode) {}
};

/// Device environment data.
/// Manually sync with the deviceRTL side for now; move to a dedicated header
/// file later.
struct omptarget_device_environmentTy {
  int32_t debug_level;
};

/// List that contains all the kernels.
/// FIXME: we may need this to be per device and per library.
std::list<KernelTy> KernelsList;

/// Class containing all the device information.
class RTLDeviceInfoTy {
  std::vector<FuncOrGblEntryTy> FuncGblEntries;

public:
  int NumberOfDevices;
  std::vector<CUmodule> Modules;
  std::vector<CUcontext> Contexts;

  // Device properties
  std::vector<int> ThreadsPerBlock;
  std::vector<int> BlocksPerGrid;
  std::vector<int> WarpSize;

  // OpenMP properties
  std::vector<int> NumTeams;
  std::vector<int> NumThreads;

  // OpenMP environment properties
  int EnvNumTeams;
  int EnvTeamLimit;

  //static int EnvNumThreads;
  static const int HardTeamLimit = 1 << 16; // 64k
  static const int HardThreadLimit = 1024;
  static const int DefaultNumTeams = 128;
  static const int DefaultNumThreads = 128;

  // Record an entry point associated with the device.
  void addOffloadEntry(int32_t device_id, __tgt_offload_entry entry) {
    assert(device_id < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncOrGblEntryTy &E = FuncGblEntries[device_id];

    E.Entries.push_back(entry);
  }

  // Return true if the entry is associated with the device.
  bool findOffloadEntry(int32_t device_id, void *addr) {
    assert(device_id < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncOrGblEntryTy &E = FuncGblEntries[device_id];

    for (auto &it : E.Entries) {
      if (it.addr == addr)
        return true;
    }

    return false;
  }

  // Return the pointer to the target entries table.
  __tgt_target_table *getOffloadEntriesTable(int32_t device_id) {
    assert(device_id < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncOrGblEntryTy &E = FuncGblEntries[device_id];

    int32_t size = E.Entries.size();

    // Table is empty.
    if (!size)
      return NULL;

    __tgt_offload_entry *begin = &E.Entries[0];
    __tgt_offload_entry *end = &E.Entries[size - 1];

    // Update table info according to the entries and return the pointer.
    E.Table.EntriesBegin = begin;
    E.Table.EntriesEnd = ++end;

    return &E.Table;
  }

  // Clear the entries table for a device.
  void clearOffloadEntriesTable(int32_t device_id) {
    assert(device_id < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncOrGblEntryTy &E = FuncGblEntries[device_id];
    E.Entries.clear();
    E.Table.EntriesBegin = E.Table.EntriesEnd = NULL;
  }

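  // The constructor below parses the debug and team environment variables once
  // at library load time; for example (in a debug build), a run such as
  //   LIBOMPTARGET_DEBUG=1 OMP_TEAM_LIMIT=16 OMP_NUM_TEAMS=8 <application>
  // caps the usable teams per device at 16 and sets the default team count
  // to 8.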
  RTLDeviceInfoTy() {
#ifdef OMPTARGET_DEBUG
    if (char *envStr = getenv("LIBOMPTARGET_DEBUG")) {
      DebugLevel = std::stoi(envStr);
    }
#endif // OMPTARGET_DEBUG

    DP("Start initializing CUDA\n");

    CUresult err = cuInit(0);
    if (err != CUDA_SUCCESS) {
      DP("Error when initializing CUDA\n");
      CUDA_ERR_STRING(err);
      return;
    }

    NumberOfDevices = 0;

    err = cuDeviceGetCount(&NumberOfDevices);
    if (err != CUDA_SUCCESS) {
      DP("Error when getting CUDA device count\n");
      CUDA_ERR_STRING(err);
      return;
    }

    if (NumberOfDevices == 0) {
      DP("There are no devices supporting CUDA.\n");
      return;
    }

    FuncGblEntries.resize(NumberOfDevices);
    Contexts.resize(NumberOfDevices);
    ThreadsPerBlock.resize(NumberOfDevices);
    BlocksPerGrid.resize(NumberOfDevices);
    WarpSize.resize(NumberOfDevices);
    NumTeams.resize(NumberOfDevices);
    NumThreads.resize(NumberOfDevices);

    // Get environment variables regarding teams.
    char *envStr = getenv("OMP_TEAM_LIMIT");
    if (envStr) {
      // OMP_TEAM_LIMIT has been set.
      EnvTeamLimit = std::stoi(envStr);
      DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
    } else {
      EnvTeamLimit = -1;
    }
    envStr = getenv("OMP_NUM_TEAMS");
    if (envStr) {
      // OMP_NUM_TEAMS has been set.
      EnvNumTeams = std::stoi(envStr);
      DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
    } else {
      EnvNumTeams = -1;
    }
  }

  ~RTLDeviceInfoTy() {
    // Close modules.
    for (auto &module : Modules)
      if (module) {
        CUresult err = cuModuleUnload(module);
        if (err != CUDA_SUCCESS) {
          DP("Error when unloading CUDA module\n");
          CUDA_ERR_STRING(err);
        }
      }

    // Destroy contexts.
    for (auto &ctx : Contexts)
      if (ctx) {
        CUresult err = cuCtxDestroy(ctx);
        if (err != CUDA_SUCCESS) {
          DP("Error when destroying CUDA context\n");
          CUDA_ERR_STRING(err);
        }
      }
  }
};

static RTLDeviceInfoTy DeviceInfo;

#ifdef __cplusplus
extern "C" {
#endif

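// Return a non-zero value if the provided device image is a valid CUDA
// (EM_CUDA) ELF binary that this RTL can handle.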
int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
  return elf_check_machine(image, 190); // EM_CUDA = 190.
}

int32_t __tgt_rtl_number_of_devices() { return DeviceInfo.NumberOfDevices; }

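// Initialize the device: create its CUDA context and derive the block/grid
// limits and the default team/thread counts from the device properties and
// the OMP_TEAM_LIMIT / OMP_NUM_TEAMS environment variables.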
int32_t __tgt_rtl_init_device(int32_t device_id) {
  CUdevice cuDevice;
  DP("Getting device %d\n", device_id);
  CUresult err = cuDeviceGet(&cuDevice, device_id);
  if (err != CUDA_SUCCESS) {
    DP("Error when getting CUDA device with id = %d\n", device_id);
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  // Create the context and save it to use whenever this device is selected.
  err = cuCtxCreate(&DeviceInfo.Contexts[device_id], CU_CTX_SCHED_BLOCKING_SYNC,
                    cuDevice);
  if (err != CUDA_SUCCESS) {
    DP("Error when creating a CUDA context\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  // Scan properties to determine the number of threads/block and blocks/grid.
  CUdevprop Properties;
  err = cuDeviceGetProperties(&Properties, cuDevice);
  if (err != CUDA_SUCCESS) {
    DP("Error getting device properties, using defaults\n");
    DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::DefaultNumTeams;
    DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::DefaultNumThreads;
    DeviceInfo.WarpSize[device_id] = 32;
  } else {
    // Get blocks per grid.
    if (Properties.maxGridSize[0] <= RTLDeviceInfoTy::HardTeamLimit) {
      DeviceInfo.BlocksPerGrid[device_id] = Properties.maxGridSize[0];
      DP("Using %d CUDA blocks per grid\n", Properties.maxGridSize[0]);
    } else {
      DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::HardTeamLimit;
      DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping "
         "at the hard limit\n", Properties.maxGridSize[0],
         RTLDeviceInfoTy::HardTeamLimit);
    }

    // Get threads per block; exploit threads only along the x axis.
    if (Properties.maxThreadsDim[0] <= RTLDeviceInfoTy::HardThreadLimit) {
      DeviceInfo.ThreadsPerBlock[device_id] = Properties.maxThreadsDim[0];
      DP("Using %d CUDA threads per block\n", Properties.maxThreadsDim[0]);
      if (Properties.maxThreadsDim[0] < Properties.maxThreadsPerBlock) {
        DP("(fewer than max per block along all xyz dims %d)\n",
           Properties.maxThreadsPerBlock);
      }
    } else {
      DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::HardThreadLimit;
      DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
         "capping at the hard limit\n", Properties.maxThreadsDim[0],
         RTLDeviceInfoTy::HardThreadLimit);
    }

    // According to the documentation, SIMDWidth is "Warp size in threads".
    DeviceInfo.WarpSize[device_id] = Properties.SIMDWidth;
  }

  // Adjust teams to the environment variables.
  if (DeviceInfo.EnvTeamLimit > 0 &&
      DeviceInfo.BlocksPerGrid[device_id] > DeviceInfo.EnvTeamLimit) {
    DeviceInfo.BlocksPerGrid[device_id] = DeviceInfo.EnvTeamLimit;
    DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
       DeviceInfo.EnvTeamLimit);
  }

  DP("Max number of CUDA blocks %d, threads %d & warp size %d\n",
     DeviceInfo.BlocksPerGrid[device_id], DeviceInfo.ThreadsPerBlock[device_id],
     DeviceInfo.WarpSize[device_id]);

  // Set the default number of teams.
  if (DeviceInfo.EnvNumTeams > 0) {
    DeviceInfo.NumTeams[device_id] = DeviceInfo.EnvNumTeams;
    DP("Default number of teams set according to environment %d\n",
       DeviceInfo.EnvNumTeams);
  } else {
    DeviceInfo.NumTeams[device_id] = RTLDeviceInfoTy::DefaultNumTeams;
    DP("Default number of teams set according to library's default %d\n",
       RTLDeviceInfoTy::DefaultNumTeams);
  }
  if (DeviceInfo.NumTeams[device_id] > DeviceInfo.BlocksPerGrid[device_id]) {
    DeviceInfo.NumTeams[device_id] = DeviceInfo.BlocksPerGrid[device_id];
    DP("Default number of teams exceeds device limit, capping at %d\n",
       DeviceInfo.BlocksPerGrid[device_id]);
  }

  // Set the default number of threads.
  DeviceInfo.NumThreads[device_id] = RTLDeviceInfoTy::DefaultNumThreads;
  DP("Default number of threads set according to library's default %d\n",
     RTLDeviceInfoTy::DefaultNumThreads);
  if (DeviceInfo.NumThreads[device_id] >
      DeviceInfo.ThreadsPerBlock[device_id]) {
    DeviceInfo.NumThreads[device_id] = DeviceInfo.ThreadsPerBlock[device_id];
    DP("Default number of threads exceeds device limit, capping at %d\n",
       DeviceInfo.ThreadsPerBlock[device_id]);
  }

  return OFFLOAD_SUCCESS;
}

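// Load a device image: create a CUDA module from it, resolve every host entry
// to a device global or kernel, read each kernel's execution mode, and send
// the device environment (debug level) to the device if it accepts one.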
__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
    __tgt_device_image *image) {

  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error when setting a CUDA context for device %d\n", device_id);
    CUDA_ERR_STRING(err);
    return NULL;
  }

  // Clear the offload table as we are going to create a new one.
  DeviceInfo.clearOffloadEntriesTable(device_id);

  // Create the module and extract the function pointers.
  CUmodule cumod;
  DP("Load data from image " DPxMOD "\n", DPxPTR(image->ImageStart));
  err = cuModuleLoadDataEx(&cumod, image->ImageStart, 0, NULL, NULL);
  if (err != CUDA_SUCCESS) {
    DP("Error when loading CUDA module\n");
    CUDA_ERR_STRING(err);
    return NULL;
  }

  DP("CUDA module successfully loaded!\n");
  DeviceInfo.Modules.push_back(cumod);

  // Find the symbols in the module by name.
  __tgt_offload_entry *HostBegin = image->EntriesBegin;
  __tgt_offload_entry *HostEnd = image->EntriesEnd;

  for (__tgt_offload_entry *e = HostBegin; e != HostEnd; ++e) {
    if (!e->addr) {
      // We return NULL when something like this happens; the host should
      // always have something in the address to uniquely identify the target
      // region.
      DP("Invalid binary: host entry '<null>' (size = %zu)...\n", e->size);
      return NULL;
    }

    if (e->size) {
      // A non-zero size means this entry is a global variable, not a kernel.
      __tgt_offload_entry entry = *e;

      CUdeviceptr cuptr;
      size_t cusize;
      err = cuModuleGetGlobal(&cuptr, &cusize, cumod, e->name);

      if (err != CUDA_SUCCESS) {
        DP("Loading global '%s' (Failed)\n", e->name);
        CUDA_ERR_STRING(err);
        return NULL;
      }

      if (cusize != e->size) {
        DP("Loading global '%s' - size mismatch (%zu != %zu)\n", e->name,
           cusize, e->size);
        return NULL;
      }

      DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
         DPxPTR(e - HostBegin), e->name, DPxPTR(cuptr));
      entry.addr = (void *)cuptr;

      DeviceInfo.addOffloadEntry(device_id, entry);

      continue;
    }

    CUfunction fun;
    err = cuModuleGetFunction(&fun, cumod, e->name);

    if (err != CUDA_SUCCESS) {
      DP("Loading '%s' (Failed)\n", e->name);
      CUDA_ERR_STRING(err);
      return NULL;
    }

    DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
       DPxPTR(e - HostBegin), e->name, DPxPTR(fun));

    // Default value GENERIC (in case the symbol is missing from the cubin
    // file).
    int8_t ExecModeVal = ExecutionModeType::GENERIC;
    std::string ExecModeNameStr(e->name);
    ExecModeNameStr += "_exec_mode";
    const char *ExecModeName = ExecModeNameStr.c_str();
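    // E.g. a kernel named `__omp_offloading_foo` is paired with an int8_t
    // global `__omp_offloading_foo_exec_mode` in the cubin (the name is
    // illustrative; the `_exec_mode` suffix is the convention relied on here).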

    CUdeviceptr ExecModePtr;
    size_t cusize;
    err = cuModuleGetGlobal(&ExecModePtr, &cusize, cumod, ExecModeName);
    if (err == CUDA_SUCCESS) {
      if (cusize != sizeof(int8_t)) {
        DP("Loading global exec_mode '%s' - size mismatch (%zu != %zu)\n",
           ExecModeName, cusize, sizeof(int8_t));
        return NULL;
      }

      err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, cusize);
      if (err != CUDA_SUCCESS) {
        DP("Error when copying data from device to host. Pointers: "
           "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
           DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), cusize);
        CUDA_ERR_STRING(err);
        return NULL;
      }

      if (ExecModeVal < 0 || ExecModeVal > 1) {
        DP("Error: wrong exec_mode value specified in the cubin file: %d\n",
           ExecModeVal);
        return NULL;
      }
    } else {
      DP("Loading global exec_mode '%s' - symbol missing, using default value "
         "GENERIC (1)\n", ExecModeName);
      CUDA_ERR_STRING(err);
    }

    KernelsList.push_back(KernelTy(fun, ExecModeVal));

    // The entry the host sees for a kernel points at our KernelTy record, so
    // __tgt_rtl_run_target_team_region() can recover the CUfunction and the
    // execution mode from tgt_entry_ptr.
    __tgt_offload_entry entry = *e;
    entry.addr = (void *)&KernelsList.back();
    DeviceInfo.addOffloadEntry(device_id, entry);
  }

  // Send the device environment data to the device.
  {
    omptarget_device_environmentTy device_env;

    device_env.debug_level = 0;

#ifdef OMPTARGET_DEBUG
    if (char *envStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) {
      device_env.debug_level = std::stoi(envStr);
    }
#endif

    const char *device_env_Name = "omptarget_device_environment";
    CUdeviceptr device_env_Ptr;
    size_t cusize;

    err = cuModuleGetGlobal(&device_env_Ptr, &cusize, cumod, device_env_Name);

    if (err == CUDA_SUCCESS) {
      if (cusize != sizeof(device_env)) {
        DP("Global device_environment '%s' - size mismatch (%zu != %zu)\n",
           device_env_Name, cusize, sizeof(device_env));
        return NULL;
      }

      err = cuMemcpyHtoD(device_env_Ptr, &device_env, cusize);
      if (err != CUDA_SUCCESS) {
        DP("Error when copying data from host to device. Pointers: "
           "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
           DPxPTR(&device_env), DPxPTR(device_env_Ptr), cusize);
        CUDA_ERR_STRING(err);
        return NULL;
      }

      DP("Sending global device environment data %zu bytes\n", cusize);
    } else {
      DP("Finding global device environment '%s' - symbol missing.\n",
         device_env_Name);
      DP("Continuing, assuming this is a device RTL that does not accept "
         "environment settings.\n");
    }
  }

  return DeviceInfo.getOffloadEntriesTable(device_id);
}

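// Allocate 'size' bytes of device memory in the device's context and return
// the device pointer, or NULL on failure (or when size is 0).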
void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *hst_ptr) {
  if (size == 0) {
    return NULL;
  }

  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error while trying to set CUDA current context\n");
    CUDA_ERR_STRING(err);
    return NULL;
  }

  CUdeviceptr ptr;
  err = cuMemAlloc(&ptr, size);
  if (err != CUDA_SUCCESS) {
    DP("Error while trying to allocate memory (CUDA error %d)\n", err);
    CUDA_ERR_STRING(err);
    return NULL;
  }

  return (void *)ptr;
}

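// Copy 'size' bytes from host memory to previously allocated device memory.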
int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
    int64_t size) {
  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error when setting CUDA context\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  err = cuMemcpyHtoD((CUdeviceptr)tgt_ptr, hst_ptr, size);
  if (err != CUDA_SUCCESS) {
    DP("Error when copying data from host to device. Pointers: host = " DPxMOD
       ", device = " DPxMOD ", size = %" PRId64 "\n", DPxPTR(hst_ptr),
       DPxPTR(tgt_ptr), size);
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }
  return OFFLOAD_SUCCESS;
}

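// Copy 'size' bytes from device memory back to host memory.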
int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
    int64_t size) {
  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error when setting CUDA context\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  err = cuMemcpyDtoH(hst_ptr, (CUdeviceptr)tgt_ptr, size);
  if (err != CUDA_SUCCESS) {
    DP("Error when copying data from device to host. Pointers: host = " DPxMOD
       ", device = " DPxMOD ", size = %" PRId64 "\n", DPxPTR(hst_ptr),
       DPxPTR(tgt_ptr), size);
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }
  return OFFLOAD_SUCCESS;
}

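// Free device memory that was allocated via __tgt_rtl_data_alloc.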
int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error when setting CUDA context\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  err = cuMemFree((CUdeviceptr)tgt_ptr);
  if (err != CUDA_SUCCESS) {
    DP("Error when freeing CUDA memory\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }
  return OFFLOAD_SUCCESS;
}

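// Launch the kernel that implements a target region: marshal the arguments,
// pick the number of threads per block and blocks per grid (honoring device,
// kernel, and environment limits), launch, and synchronize.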
int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
    void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
    int32_t thread_limit, uint64_t loop_tripcount) {
  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error when setting CUDA context\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  // cuLaunchKernel expects an array of pointers to the kernel arguments, so
  // materialize the offset-adjusted argument values in ptrs and pass their
  // addresses in args.
  std::vector<void *> args(arg_num);
  std::vector<void *> ptrs(arg_num);

  for (int32_t i = 0; i < arg_num; ++i) {
    ptrs[i] = (void *)((intptr_t)tgt_args[i] + tgt_offsets[i]);
    args[i] = &ptrs[i];
  }

  KernelTy *KernelInfo = (KernelTy *)tgt_entry_ptr;

  int cudaThreadsPerBlock;

  if (thread_limit > 0) {
    cudaThreadsPerBlock = thread_limit;
    DP("Setting CUDA threads per block to requested %d\n", thread_limit);
    // Add the master warp if necessary.
    if (KernelInfo->ExecutionMode == GENERIC) {
      cudaThreadsPerBlock += DeviceInfo.WarpSize[device_id];
      DP("Adding master warp: +%d threads\n", DeviceInfo.WarpSize[device_id]);
    }
  } else {
    cudaThreadsPerBlock = DeviceInfo.NumThreads[device_id];
    DP("Setting CUDA threads per block to default %d\n",
       DeviceInfo.NumThreads[device_id]);
  }

  if (cudaThreadsPerBlock > DeviceInfo.ThreadsPerBlock[device_id]) {
    cudaThreadsPerBlock = DeviceInfo.ThreadsPerBlock[device_id];
    DP("Threads per block capped at device limit %d\n",
       DeviceInfo.ThreadsPerBlock[device_id]);
  }

  int kernel_limit;
  err = cuFuncGetAttribute(&kernel_limit,
      CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, KernelInfo->Func);
  if (err == CUDA_SUCCESS) {
    if (kernel_limit < cudaThreadsPerBlock) {
      cudaThreadsPerBlock = kernel_limit;
      DP("Threads per block capped at kernel limit %d\n", kernel_limit);
    }
  }

  int cudaBlocksPerGrid;
  if (team_num <= 0) {
    if (loop_tripcount > 0 && DeviceInfo.EnvNumTeams < 0) {
      if (KernelInfo->ExecutionMode == SPMD) {
        // We have a combined construct, i.e. `target teams distribute
        // parallel for [simd]`. We launch enough teams so that each thread
        // executes one iteration of the loop; round up to the nearest
        // integer.
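        // E.g. loop_tripcount = 1000 with 128 threads per block yields
        // ((1000 - 1) / 128) + 1 = 8 blocks (8 * 128 = 1024 >= 1000).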
        cudaBlocksPerGrid = ((loop_tripcount - 1) / cudaThreadsPerBlock) + 1;
      } else {
        // If we reach this point, then we have a non-combined construct, i.e.
        // `teams distribute` with a nested `parallel for` and each team is
        // assigned one iteration of the `distribute` loop. E.g.:
        //
        // #pragma omp target teams distribute
        // for(...loop_tripcount...) {
        //   #pragma omp parallel for
        //   for(...) {}
        // }
        //
        // Threads within a team will execute the iterations of the `parallel`
        // loop.
        cudaBlocksPerGrid = loop_tripcount;
      }
      DP("Using %d teams due to loop trip count %" PRIu64 " and number of "
         "threads per block %d\n", cudaBlocksPerGrid, loop_tripcount,
         cudaThreadsPerBlock);
    } else {
      cudaBlocksPerGrid = DeviceInfo.NumTeams[device_id];
      DP("Using default number of teams %d\n", DeviceInfo.NumTeams[device_id]);
    }
  } else if (team_num > DeviceInfo.BlocksPerGrid[device_id]) {
    cudaBlocksPerGrid = DeviceInfo.BlocksPerGrid[device_id];
    DP("Capping number of teams to team limit %d\n",
       DeviceInfo.BlocksPerGrid[device_id]);
  } else {
    cudaBlocksPerGrid = team_num;
    DP("Using requested number of teams %d\n", team_num);
  }

  // Run on the device.
  DP("Launch kernel with %d blocks and %d threads\n", cudaBlocksPerGrid,
     cudaThreadsPerBlock);

  err = cuLaunchKernel(KernelInfo->Func, cudaBlocksPerGrid, 1, 1,
      cudaThreadsPerBlock, 1, 1, 0 /*bytes of shared memory*/,
      0 /*stream*/, &args[0], 0 /*extra*/);
  if (err != CUDA_SUCCESS) {
    DP("Device kernel launch failed!\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  DP("Launch of entry point at " DPxMOD " successful!\n",
     DPxPTR(tgt_entry_ptr));

  CUresult sync_err = cuCtxSynchronize();
  if (sync_err != CUDA_SUCCESS) {
    DP("Kernel execution error at " DPxMOD "!\n", DPxPTR(tgt_entry_ptr));
    CUDA_ERR_STRING(sync_err);
    return OFFLOAD_FAIL;
  } else {
    DP("Kernel execution at " DPxMOD " successful!\n", DPxPTR(tgt_entry_ptr));
  }

  return OFFLOAD_SUCCESS;
}

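// Launch a non-teams target region: a single team with the default number of
// threads.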
int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
    void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num) {
  // Use one team and the default number of threads.
  const int32_t team_num = 1;
  const int32_t thread_limit = 0;
  return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args,
      tgt_offsets, arg_num, team_num, thread_limit, 0);
}

#ifdef __cplusplus
}
#endif
758