1 //===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is dual licensed under the MIT and the University of Illinois Open
6 // Source Licenses. See LICENSE.txt for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // RTL for CUDA machine
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include <cassert>
15 #include <cstddef>
16 #include <cuda.h>
17 #include <list>
18 #include <string>
19 #include <vector>
20 
21 #include "omptargetplugin.h"
22 
// Name of this RTL as it appears in debug output; may be overridden at
// build time.
#ifndef TARGET_NAME
#define TARGET_NAME CUDA
#endif

#ifdef OMPTARGET_DEBUG
// Debug verbosity, set from the LIBOMPTARGET_DEBUG environment variable in
// the RTLDeviceInfoTy constructor; DP() prints only when it is positive.
static int DebugLevel = 0;

#define GETNAME2(name) #name
#define GETNAME(name) GETNAME2(name)
#define DP(...) \
  do { \
    if (DebugLevel > 0) { \
      DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", __VA_ARGS__); \
    } \
  } while (false)
#else // OMPTARGET_DEBUG
// Debug printing compiles away entirely in non-debug builds.
#define DP(...) {}
#endif // OMPTARGET_DEBUG

#include "../../common/elf_common.c"

// Utility for retrieving and printing CUDA error string.
// Only active when the plugin is built with CUDA_ERROR_REPORT defined.
#ifdef CUDA_ERROR_REPORT
#define CUDA_ERR_STRING(err)                                                   \
  do {                                                                         \
    const char *errStr;                                                        \
    cuGetErrorString(err, &errStr);                                            \
    DP("CUDA error is: %s\n", errStr);                                         \
  } while (0)
#else
#define CUDA_ERR_STRING(err)                                                   \
  {}
#endif
56 
/// Keep entries table per device.
/// Entries owns the storage for the offload entries registered for one
/// loaded image; Table points into Entries (its begin/end pointers are
/// refreshed by getOffloadEntriesTable).
struct FuncOrGblEntryTy {
  __tgt_target_table Table;
  std::vector<__tgt_offload_entry> Entries;
};
62 
/// Kernel execution modes. The numeric values (SPMD = 0, GENERIC = 1) must
/// match the int8_t "<kernel>_exec_mode" globals emitted in the device image
/// (see __tgt_rtl_load_binary), so do not reorder the enumerators.
enum ExecutionModeType {
  SPMD, // constructors, destructors,
        // combined constructs (`teams distribute parallel for [simd]`)
  GENERIC, // everything else
  NONE
};
69 
/// Use a single entity to encode a kernel and a set of flags.
/// Offload entry tables store pointers to KernelTy objects (see
/// __tgt_rtl_load_binary), so instances must have stable addresses.
struct KernelTy {
  CUfunction Func;

  // execution mode of kernel
  // 0 - SPMD mode (without master warp)
  // 1 - Generic mode (with master warp)
  int8_t ExecutionMode;

  KernelTy(CUfunction _Func, int8_t _ExecutionMode)
      : Func(_Func), ExecutionMode(_ExecutionMode) {}
};
82 
/// Device environment data.
/// Manually sync with the deviceRTL side for now, move to a dedicated header
/// file later. Copied to the device's "omptarget_device_environment" global
/// in __tgt_rtl_load_binary.
struct omptarget_device_environmentTy {
  // Device-side debug verbosity, taken from LIBOMPTARGET_DEVICE_RTL_DEBUG.
  int32_t debug_level;
};
88 
/// List that contains all the kernels.
/// FIXME: we may need this to be per device and per library.
/// A std::list is used so element addresses stay stable: offload entries
/// store raw pointers to elements (entry.addr = &KernelsList.back()).
std::list<KernelTy> KernelsList;
92 
/// Class containing all the device information.
class RTLDeviceInfoTy {
  // One list of entry tables per device; a new element is appended for each
  // image loaded onto that device (see clearOffloadEntriesTable), and the
  // accessors below always operate on the most recent one (.back()).
  std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries;

public:
  int NumberOfDevices;
  // Modules created by __tgt_rtl_load_binary; unloaded in the destructor.
  std::vector<CUmodule> Modules;
  // One context per device, created in __tgt_rtl_init_device and destroyed
  // in the destructor.
  std::vector<CUcontext> Contexts;

  // Device properties
  std::vector<int> ThreadsPerBlock;
  std::vector<int> BlocksPerGrid;
  std::vector<int> WarpSize;

  // OpenMP properties
  std::vector<int> NumTeams;
  std::vector<int> NumThreads;

  // OpenMP Environment properties
  int EnvNumTeams;  // OMP_NUM_TEAMS, or -1 when unset
  int EnvTeamLimit; // OMP_TEAM_LIMIT, or -1 when unset

  //static int EnvNumThreads;
  static const int HardTeamLimit = 1<<16; // 64k
  static const int HardThreadLimit = 1024;
  static const int DefaultNumTeams = 128;
  static const int DefaultNumThreads = 128;

  // Record entry point associated with device.
  // Appends 'entry' to the current (most recently started) table.
  void addOffloadEntry(int32_t device_id, __tgt_offload_entry entry) {
    assert(device_id < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();

    E.Entries.push_back(entry);
  }

  // Return true if the entry is associated with device.
  // Linear scan of the current table's entries by device address.
  bool findOffloadEntry(int32_t device_id, void *addr) {
    assert(device_id < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();

    for (auto &it : E.Entries) {
      if (it.addr == addr)
        return true;
    }

    return false;
  }

  // Return the pointer to the target entries table.
  // Refreshes Table's begin/end pointers from the Entries vector; returns
  // null when no entries were registered.
  __tgt_target_table *getOffloadEntriesTable(int32_t device_id) {
    assert(device_id < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();

    int32_t size = E.Entries.size();

    // Table is empty
    if (!size)
      return 0;

    __tgt_offload_entry *begin = &E.Entries[0];
    __tgt_offload_entry *end = &E.Entries[size - 1];

    // Update table info according to the entries and return the pointer
    E.Table.EntriesBegin = begin;
    E.Table.EntriesEnd = ++end; // one past the last entry

    return &E.Table;
  }

  // Clear entries table for a device.
  // Starts a fresh table for the next image rather than reusing the previous
  // one; std::list keeps earlier elements' addresses stable, so tables
  // returned for previously loaded images are not disturbed.
  void clearOffloadEntriesTable(int32_t device_id) {
    assert(device_id < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncGblEntries[device_id].emplace_back();
    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
    E.Entries.clear();
    E.Table.EntriesBegin = E.Table.EntriesEnd = 0;
  }

  // Runs at plugin load (this type has a global instance): initializes the
  // CUDA driver, discovers devices, sizes the per-device containers, and
  // parses the team-related environment variables.
  RTLDeviceInfoTy() {
#ifdef OMPTARGET_DEBUG
    // Host-side debug verbosity for the DP() macro.
    if (char *envStr = getenv("LIBOMPTARGET_DEBUG")) {
      DebugLevel = std::stoi(envStr);
    }
#endif // OMPTARGET_DEBUG

    DP("Start initializing CUDA\n");

    CUresult err = cuInit(0);
    if (err != CUDA_SUCCESS) {
      DP("Error when initializing CUDA\n");
      CUDA_ERR_STRING(err);
      return;
    }

    NumberOfDevices = 0;

    err = cuDeviceGetCount(&NumberOfDevices);
    if (err != CUDA_SUCCESS) {
      DP("Error when getting CUDA device count\n");
      CUDA_ERR_STRING(err);
      return;
    }

    if (NumberOfDevices == 0) {
      DP("There are no devices supporting CUDA.\n");
      return;
    }

    // Size every per-device container now that the device count is known.
    FuncGblEntries.resize(NumberOfDevices);
    Contexts.resize(NumberOfDevices);
    ThreadsPerBlock.resize(NumberOfDevices);
    BlocksPerGrid.resize(NumberOfDevices);
    WarpSize.resize(NumberOfDevices);
    NumTeams.resize(NumberOfDevices);
    NumThreads.resize(NumberOfDevices);

    // Get environment variables regarding teams
    char *envStr = getenv("OMP_TEAM_LIMIT");
    if (envStr) {
      // OMP_TEAM_LIMIT has been set
      EnvTeamLimit = std::stoi(envStr);
      DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
    } else {
      EnvTeamLimit = -1;
    }
    envStr = getenv("OMP_NUM_TEAMS");
    if (envStr) {
      // OMP_NUM_TEAMS has been set
      EnvNumTeams = std::stoi(envStr);
      DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
    } else {
      EnvNumTeams = -1;
    }
  }

  // Releases every module and context acquired during the plugin's lifetime.
  ~RTLDeviceInfoTy() {
    // Close modules
    for (auto &module : Modules)
      if (module) {
        CUresult err = cuModuleUnload(module);
        if (err != CUDA_SUCCESS) {
          DP("Error when unloading CUDA module\n");
          CUDA_ERR_STRING(err);
        }
      }

    // Destroy contexts
    for (auto &ctx : Contexts)
      if (ctx) {
        CUresult err = cuCtxDestroy(ctx);
        if (err != CUDA_SUCCESS) {
          DP("Error when destroying CUDA context\n");
          CUDA_ERR_STRING(err);
        }
      }
  }
};
255 
// Global plugin state; constructed when the plugin is loaded (its
// constructor performs cuInit and device discovery).
static RTLDeviceInfoTy DeviceInfo;
257 
258 #ifdef __cplusplus
259 extern "C" {
260 #endif
261 
262 int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
263   return elf_check_machine(image, 190); // EM_CUDA = 190.
264 }
265 
266 int32_t __tgt_rtl_number_of_devices() { return DeviceInfo.NumberOfDevices; }
267 
268 int32_t __tgt_rtl_init_device(int32_t device_id) {
269 
270   CUdevice cuDevice;
271   DP("Getting device %d\n", device_id);
272   CUresult err = cuDeviceGet(&cuDevice, device_id);
273   if (err != CUDA_SUCCESS) {
274     DP("Error when getting CUDA device with id = %d\n", device_id);
275     CUDA_ERR_STRING(err);
276     return OFFLOAD_FAIL;
277   }
278 
279   // Create the context and save it to use whenever this device is selected.
280   err = cuCtxCreate(&DeviceInfo.Contexts[device_id], CU_CTX_SCHED_BLOCKING_SYNC,
281                     cuDevice);
282   if (err != CUDA_SUCCESS) {
283     DP("Error when creating a CUDA context\n");
284     CUDA_ERR_STRING(err);
285     return OFFLOAD_FAIL;
286   }
287 
288   // scan properties to determine number of threads/block and blocks/grid.
289   CUdevprop Properties;
290   err = cuDeviceGetProperties(&Properties, cuDevice);
291   if (err != CUDA_SUCCESS) {
292     DP("Error getting device Properties, use defaults\n");
293     DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::DefaultNumTeams;
294     DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::DefaultNumThreads;
295     DeviceInfo.WarpSize[device_id] = 32;
296   } else {
297     // Get blocks per grid
298     if (Properties.maxGridSize[0] <= RTLDeviceInfoTy::HardTeamLimit) {
299       DeviceInfo.BlocksPerGrid[device_id] = Properties.maxGridSize[0];
300       DP("Using %d CUDA blocks per grid\n", Properties.maxGridSize[0]);
301     } else {
302       DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::HardTeamLimit;
303       DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping "
304           "at the hard limit\n", Properties.maxGridSize[0],
305           RTLDeviceInfoTy::HardTeamLimit);
306     }
307 
308     // Get threads per block, exploit threads only along x axis
309     if (Properties.maxThreadsDim[0] <= RTLDeviceInfoTy::HardThreadLimit) {
310       DeviceInfo.ThreadsPerBlock[device_id] = Properties.maxThreadsDim[0];
311       DP("Using %d CUDA threads per block\n", Properties.maxThreadsDim[0]);
312       if (Properties.maxThreadsDim[0] < Properties.maxThreadsPerBlock) {
313         DP("(fewer than max per block along all xyz dims %d)\n",
314             Properties.maxThreadsPerBlock);
315       }
316     } else {
317       DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::HardThreadLimit;
318       DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
319           "capping at the hard limit\n", Properties.maxThreadsDim[0],
320           RTLDeviceInfoTy::HardThreadLimit);
321     }
322 
323     // According to the documentation, SIMDWidth is "Warp size in threads".
324     DeviceInfo.WarpSize[device_id] = Properties.SIMDWidth;
325   }
326 
327   // Adjust teams to the env variables
328   if (DeviceInfo.EnvTeamLimit > 0 &&
329       DeviceInfo.BlocksPerGrid[device_id] > DeviceInfo.EnvTeamLimit) {
330     DeviceInfo.BlocksPerGrid[device_id] = DeviceInfo.EnvTeamLimit;
331     DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
332         DeviceInfo.EnvTeamLimit);
333   }
334 
335   DP("Max number of CUDA blocks %d, threads %d & warp size %d\n",
336      DeviceInfo.BlocksPerGrid[device_id], DeviceInfo.ThreadsPerBlock[device_id],
337      DeviceInfo.WarpSize[device_id]);
338 
339   // Set default number of teams
340   if (DeviceInfo.EnvNumTeams > 0) {
341     DeviceInfo.NumTeams[device_id] = DeviceInfo.EnvNumTeams;
342     DP("Default number of teams set according to environment %d\n",
343         DeviceInfo.EnvNumTeams);
344   } else {
345     DeviceInfo.NumTeams[device_id] = RTLDeviceInfoTy::DefaultNumTeams;
346     DP("Default number of teams set according to library's default %d\n",
347         RTLDeviceInfoTy::DefaultNumTeams);
348   }
349   if (DeviceInfo.NumTeams[device_id] > DeviceInfo.BlocksPerGrid[device_id]) {
350     DeviceInfo.NumTeams[device_id] = DeviceInfo.BlocksPerGrid[device_id];
351     DP("Default number of teams exceeds device limit, capping at %d\n",
352         DeviceInfo.BlocksPerGrid[device_id]);
353   }
354 
355   // Set default number of threads
356   DeviceInfo.NumThreads[device_id] = RTLDeviceInfoTy::DefaultNumThreads;
357   DP("Default number of threads set according to library's default %d\n",
358           RTLDeviceInfoTy::DefaultNumThreads);
359   if (DeviceInfo.NumThreads[device_id] >
360       DeviceInfo.ThreadsPerBlock[device_id]) {
361     DeviceInfo.NumTeams[device_id] = DeviceInfo.ThreadsPerBlock[device_id];
362     DP("Default number of threads exceeds device limit, capping at %d\n",
363         DeviceInfo.ThreadsPerBlock[device_id]);
364   }
365 
366   return OFFLOAD_SUCCESS;
367 }
368 
// Load a device image onto 'device_id' and build its offload entry table.
// Steps: (1) make the device context current, (2) create a CUmodule from the
// image bits, (3) resolve every host entry — a global variable when
// e->size != 0, otherwise a kernel whose companion "<name>_exec_mode" global
// selects SPMD/GENERIC — and (4) copy the device environment struct to the
// module's "omptarget_device_environment" global when it exists.
// Returns the populated entries table, or NULL on any failure.
__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
    __tgt_device_image *image) {

  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error when setting a CUDA context for device %d\n", device_id);
    CUDA_ERR_STRING(err);
    return NULL;
  }

  // Clear the offload table as we are going to create a new one.
  DeviceInfo.clearOffloadEntriesTable(device_id);

  // Create the module and extract the function pointers.

  CUmodule cumod;
  DP("Load data from image " DPxMOD "\n", DPxPTR(image->ImageStart));
  err = cuModuleLoadDataEx(&cumod, image->ImageStart, 0, NULL, NULL);
  if (err != CUDA_SUCCESS) {
    DP("Error when loading CUDA module\n");
    CUDA_ERR_STRING(err);
    return NULL;
  }

  DP("CUDA module successfully loaded!\n");
  // Remember the module so the plugin destructor can unload it.
  DeviceInfo.Modules.push_back(cumod);

  // Find the symbols in the module by name.
  __tgt_offload_entry *HostBegin = image->EntriesBegin;
  __tgt_offload_entry *HostEnd = image->EntriesEnd;

  for (__tgt_offload_entry *e = HostBegin; e != HostEnd; ++e) {

    if (!e->addr) {
      // We return NULL when something like this happens, the host should have
      // always something in the address to uniquely identify the target region.
      DP("Invalid binary: host entry '<null>' (size = %zd)...\n", e->size);

      return NULL;
    }

    if (e->size) {
      // Non-zero size marks a global-variable entry: look up its device
      // address, verify the size matches the host's, and record it.
      __tgt_offload_entry entry = *e;

      CUdeviceptr cuptr;
      size_t cusize;
      err = cuModuleGetGlobal(&cuptr, &cusize, cumod, e->name);

      if (err != CUDA_SUCCESS) {
        DP("Loading global '%s' (Failed)\n", e->name);
        CUDA_ERR_STRING(err);
        return NULL;
      }

      if (cusize != e->size) {
        DP("Loading global '%s' - size mismatch (%zd != %zd)\n", e->name,
            cusize, e->size);
        CUDA_ERR_STRING(err);
        return NULL;
      }

      DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
          DPxPTR(e - HostBegin), e->name, DPxPTR(cuptr));
      entry.addr = (void *)cuptr;

      DeviceInfo.addOffloadEntry(device_id, entry);

      continue;
    }

    // Size zero: this entry is a kernel (target region entry point).
    CUfunction fun;
    err = cuModuleGetFunction(&fun, cumod, e->name);

    if (err != CUDA_SUCCESS) {
      DP("Loading '%s' (Failed)\n", e->name);
      CUDA_ERR_STRING(err);
      return NULL;
    }

    DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
        DPxPTR(e - HostBegin), e->name, DPxPTR(fun));

    // default value GENERIC (in case symbol is missing from cubin file)
    int8_t ExecModeVal = ExecutionModeType::GENERIC;
    std::string ExecModeNameStr (e->name);
    ExecModeNameStr += "_exec_mode";
    const char *ExecModeName = ExecModeNameStr.c_str();

    // The image may carry an int8_t global "<kernel>_exec_mode" holding the
    // kernel's execution mode (0 = SPMD, 1 = GENERIC).
    CUdeviceptr ExecModePtr;
    size_t cusize;
    err = cuModuleGetGlobal(&ExecModePtr, &cusize, cumod, ExecModeName);
    if (err == CUDA_SUCCESS) {
      if ((size_t)cusize != sizeof(int8_t)) {
        DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
           ExecModeName, cusize, sizeof(int8_t));
        CUDA_ERR_STRING(err);
        return NULL;
      }

      err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, cusize);
      if (err != CUDA_SUCCESS) {
        DP("Error when copying data from device to host. Pointers: "
           "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
           DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), cusize);
        CUDA_ERR_STRING(err);
        return NULL;
      }

      // Only SPMD (0) and GENERIC (1) are valid modes.
      if (ExecModeVal < 0 || ExecModeVal > 1) {
        DP("Error wrong exec_mode value specified in cubin file: %d\n",
           ExecModeVal);
        return NULL;
      }
    } else {
      DP("Loading global exec_mode '%s' - symbol missing, using default value "
          "GENERIC (1)\n", ExecModeName);
      CUDA_ERR_STRING(err);
    }

    // Store the kernel descriptor in the global list; the offload entry
    // points at the list element (std::list keeps addresses stable).
    KernelsList.push_back(KernelTy(fun, ExecModeVal));

    __tgt_offload_entry entry = *e;
    entry.addr = (void *)&KernelsList.back();
    DeviceInfo.addOffloadEntry(device_id, entry);
  }

  // send device environment data to the device
  {
    omptarget_device_environmentTy device_env;

    device_env.debug_level = 0;

#ifdef OMPTARGET_DEBUG
    // Device-side debug verbosity is controlled separately from the host's.
    if (char *envStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) {
      device_env.debug_level = std::stoi(envStr);
    }
#endif

    const char * device_env_Name="omptarget_device_environment";
    CUdeviceptr device_env_Ptr;
    size_t cusize;

    err = cuModuleGetGlobal(&device_env_Ptr, &cusize, cumod, device_env_Name);

    if (err == CUDA_SUCCESS) {
      if ((size_t)cusize != sizeof(device_env)) {
        DP("Global device_environment '%s' - size mismatch (%zu != %zu)\n",
            device_env_Name, cusize, sizeof(int32_t));
        CUDA_ERR_STRING(err);
        return NULL;
      }

      err = cuMemcpyHtoD(device_env_Ptr, &device_env, cusize);
      if (err != CUDA_SUCCESS) {
        DP("Error when copying data from host to device. Pointers: "
            "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
            DPxPTR(&device_env), DPxPTR(device_env_Ptr), cusize);
        CUDA_ERR_STRING(err);
        return NULL;
      }

      DP("Sending global device environment data %zu bytes\n", (size_t)cusize);
    } else {
      // A device RTL without the symbol is tolerated; this is not an error.
      DP("Finding global device environment '%s' - symbol missing.\n", device_env_Name);
      DP("Continue, considering this is a device RTL which does not accept envrionment setting.\n");
    }
  }

  return DeviceInfo.getOffloadEntriesTable(device_id);
}
540 
541 void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *hst_ptr) {
542   if (size == 0) {
543     return NULL;
544   }
545 
546   // Set the context we are using.
547   CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
548   if (err != CUDA_SUCCESS) {
549     DP("Error while trying to set CUDA current context\n");
550     CUDA_ERR_STRING(err);
551     return NULL;
552   }
553 
554   CUdeviceptr ptr;
555   err = cuMemAlloc(&ptr, size);
556   if (err != CUDA_SUCCESS) {
557     DP("Error while trying to allocate %d\n", err);
558     CUDA_ERR_STRING(err);
559     return NULL;
560   }
561 
562   void *vptr = (void *)ptr;
563   return vptr;
564 }
565 
566 int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
567     int64_t size) {
568   // Set the context we are using.
569   CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
570   if (err != CUDA_SUCCESS) {
571     DP("Error when setting CUDA context\n");
572     CUDA_ERR_STRING(err);
573     return OFFLOAD_FAIL;
574   }
575 
576   err = cuMemcpyHtoD((CUdeviceptr)tgt_ptr, hst_ptr, size);
577   if (err != CUDA_SUCCESS) {
578     DP("Error when copying data from host to device. Pointers: host = " DPxMOD
579        ", device = " DPxMOD ", size = %" PRId64 "\n", DPxPTR(hst_ptr),
580        DPxPTR(tgt_ptr), size);
581     CUDA_ERR_STRING(err);
582     return OFFLOAD_FAIL;
583   }
584   return OFFLOAD_SUCCESS;
585 }
586 
587 int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
588     int64_t size) {
589   // Set the context we are using.
590   CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
591   if (err != CUDA_SUCCESS) {
592     DP("Error when setting CUDA context\n");
593     CUDA_ERR_STRING(err);
594     return OFFLOAD_FAIL;
595   }
596 
597   err = cuMemcpyDtoH(hst_ptr, (CUdeviceptr)tgt_ptr, size);
598   if (err != CUDA_SUCCESS) {
599     DP("Error when copying data from device to host. Pointers: host = " DPxMOD
600         ", device = " DPxMOD ", size = %" PRId64 "\n", DPxPTR(hst_ptr),
601         DPxPTR(tgt_ptr), size);
602     CUDA_ERR_STRING(err);
603     return OFFLOAD_FAIL;
604   }
605   return OFFLOAD_SUCCESS;
606 }
607 
608 int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
609   // Set the context we are using.
610   CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
611   if (err != CUDA_SUCCESS) {
612     DP("Error when setting CUDA context\n");
613     CUDA_ERR_STRING(err);
614     return OFFLOAD_FAIL;
615   }
616 
617   err = cuMemFree((CUdeviceptr)tgt_ptr);
618   if (err != CUDA_SUCCESS) {
619     DP("Error when freeing CUDA memory\n");
620     CUDA_ERR_STRING(err);
621     return OFFLOAD_FAIL;
622   }
623   return OFFLOAD_SUCCESS;
624 }
625 
// Launch the kernel identified by 'tgt_entry_ptr' (a KernelTy*) on device
// 'device_id' with 'arg_num' pointer arguments (tgt_args adjusted by
// tgt_offsets). team_num/thread_limit of zero or less mean "use the defaults
// computed in __tgt_rtl_init_device"; a nonzero loop_tripcount can drive the
// team count when no team number was requested. Synchronizes before
// returning, so execution errors are reported here.
// Returns OFFLOAD_SUCCESS or OFFLOAD_FAIL.
int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
    void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
    int32_t thread_limit, uint64_t loop_tripcount) {
  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error when setting CUDA context\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  // All args are references.
  std::vector<void *> args(arg_num);
  std::vector<void *> ptrs(arg_num);

  // Apply the per-argument offsets; cuLaunchKernel takes an array of
  // pointers to the argument values, hence the extra indirection.
  for (int32_t i = 0; i < arg_num; ++i) {
    ptrs[i] = (void *)((intptr_t)tgt_args[i] + tgt_offsets[i]);
    args[i] = &ptrs[i];
  }

  KernelTy *KernelInfo = (KernelTy *)tgt_entry_ptr;

  int cudaThreadsPerBlock;

  if (thread_limit > 0) {
    cudaThreadsPerBlock = thread_limit;
    DP("Setting CUDA threads per block to requested %d\n", thread_limit);
    // Add master warp if necessary
    if (KernelInfo->ExecutionMode == GENERIC) {
      cudaThreadsPerBlock += DeviceInfo.WarpSize[device_id];
      DP("Adding master warp: +%d threads\n", DeviceInfo.WarpSize[device_id]);
    }
  } else {
    cudaThreadsPerBlock = DeviceInfo.NumThreads[device_id];
    DP("Setting CUDA threads per block to default %d\n",
        DeviceInfo.NumThreads[device_id]);
  }

  // Respect the device-wide threads-per-block limit.
  if (cudaThreadsPerBlock > DeviceInfo.ThreadsPerBlock[device_id]) {
    cudaThreadsPerBlock = DeviceInfo.ThreadsPerBlock[device_id];
    DP("Threads per block capped at device limit %d\n",
        DeviceInfo.ThreadsPerBlock[device_id]);
  }

  // The kernel itself may have a tighter per-kernel limit; query and
  // respect it too. A failed query is silently ignored.
  int kernel_limit;
  err = cuFuncGetAttribute(&kernel_limit,
      CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, KernelInfo->Func);
  if (err == CUDA_SUCCESS) {
    if (kernel_limit < cudaThreadsPerBlock) {
      cudaThreadsPerBlock = kernel_limit;
      DP("Threads per block capped at kernel limit %d\n", kernel_limit);
    }
  }

  int cudaBlocksPerGrid;
  if (team_num <= 0) {
    // No team count requested: derive one from the trip count unless
    // OMP_NUM_TEAMS was set (EnvNumTeams >= 0 takes precedence).
    if (loop_tripcount > 0 && DeviceInfo.EnvNumTeams < 0) {
      if (KernelInfo->ExecutionMode == SPMD) {
        // We have a combined construct, i.e. `target teams distribute parallel
        // for [simd]`. We launch so many teams so that each thread will
        // execute one iteration of the loop.
        // round up to the nearest integer
        cudaBlocksPerGrid = ((loop_tripcount - 1) / cudaThreadsPerBlock) + 1;
      } else {
        // If we reach this point, then we have a non-combined construct, i.e.
        // `teams distribute` with a nested `parallel for` and each team is
        // assigned one iteration of the `distribute` loop. E.g.:
        //
        // #pragma omp target teams distribute
        // for(...loop_tripcount...) {
        //   #pragma omp parallel for
        //   for(...) {}
        // }
        //
        // Threads within a team will execute the iterations of the `parallel`
        // loop.
        cudaBlocksPerGrid = loop_tripcount;
      }
      DP("Using %d teams due to loop trip count %" PRIu64 " and number of "
          "threads per block %d\n", cudaBlocksPerGrid, loop_tripcount,
          cudaThreadsPerBlock);
    } else {
      cudaBlocksPerGrid = DeviceInfo.NumTeams[device_id];
      DP("Using default number of teams %d\n", DeviceInfo.NumTeams[device_id]);
    }
  } else if (team_num > DeviceInfo.BlocksPerGrid[device_id]) {
    cudaBlocksPerGrid = DeviceInfo.BlocksPerGrid[device_id];
    DP("Capping number of teams to team limit %d\n",
        DeviceInfo.BlocksPerGrid[device_id]);
  } else {
    cudaBlocksPerGrid = team_num;
    DP("Using requested number of teams %d\n", team_num);
  }

  // Run on the device.
  DP("Launch kernel with %d blocks and %d threads\n", cudaBlocksPerGrid,
     cudaThreadsPerBlock);

  // NOTE(review): when arg_num == 0, &args[0] indexes an empty vector;
  // confirm callers never launch with zero arguments, or use args.data().
  err = cuLaunchKernel(KernelInfo->Func, cudaBlocksPerGrid, 1, 1,
      cudaThreadsPerBlock, 1, 1, 0 /*bytes of shared memory*/, 0, &args[0], 0);
  if (err != CUDA_SUCCESS) {
    DP("Device kernel launch failed!\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  DP("Launch of entry point at " DPxMOD " successful!\n",
      DPxPTR(tgt_entry_ptr));

  // The launch is asynchronous; block until it finishes so any execution
  // error surfaces to this call's caller.
  CUresult sync_err = cuCtxSynchronize();
  if (sync_err != CUDA_SUCCESS) {
    DP("Kernel execution error at " DPxMOD "!\n", DPxPTR(tgt_entry_ptr));
    CUDA_ERR_STRING(sync_err);
    return OFFLOAD_FAIL;
  } else {
    DP("Kernel execution at " DPxMOD " successful!\n", DPxPTR(tgt_entry_ptr));
  }

  return OFFLOAD_SUCCESS;
}
746 
747 int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
748     void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num) {
749   // use one team and the default number of threads.
750   const int32_t team_num = 1;
751   const int32_t thread_limit = 0;
752   return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args,
753       tgt_offsets, arg_num, team_num, thread_limit, 0);
754 }
755 
756 #ifdef __cplusplus
757 }
758 #endif
759