//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// RTL for CUDA machine
//
//===----------------------------------------------------------------------===//

#include <cassert>
#include <cstddef>
#include <cuda.h>
#include <list>
#include <string>
#include <vector>

#include "omptargetplugin.h"

#ifndef TARGET_NAME
#define TARGET_NAME CUDA
#endif

#ifdef OMPTARGET_DEBUG
static int DebugLevel = 0;

#define GETNAME2(name) #name
#define GETNAME(name) GETNAME2(name)
#define DP(...) \
  do { \
    if (DebugLevel > 0) { \
      DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", __VA_ARGS__); \
    } \
  } while (false)
#else // OMPTARGET_DEBUG
#define DP(...) {}
#endif // OMPTARGET_DEBUG

#include "../../common/elf_common.c"

// Utility for retrieving and printing CUDA error string.
#ifdef CUDA_ERROR_REPORT
#define CUDA_ERR_STRING(err)                                                   \
  do {                                                                         \
    const char *errStr;                                                        \
    cuGetErrorString(err, &errStr);                                            \
    DP("CUDA error is: %s\n", errStr);                                         \
  } while (0)
#else
#define CUDA_ERR_STRING(err)                                                   \
  {}
#endif

/// Keep entries table per device.
struct FuncOrGblEntryTy {
  __tgt_target_table Table;
  std::vector<__tgt_offload_entry> Entries;
};

enum ExecutionModeType {
  SPMD, // constructors, destructors,
        // combined constructs (`teams distribute parallel for [simd]`)
  GENERIC, // everything else
  NONE
};

/// Use a single entity to encode a kernel and a set of flags
struct KernelTy {
  CUfunction Func;

  // execution mode of kernel
  // 0 - SPMD mode (without master warp)
  // 1 - Generic mode (with master warp)
  int8_t ExecutionMode;

  KernelTy(CUfunction _Func, int8_t _ExecutionMode)
      : Func(_Func), ExecutionMode(_ExecutionMode) {}
};
/// Device environment data
/// Keep this in sync with the device RTL manually for now; move it to a
/// dedicated header file later.
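/// An instance of this struct is written to the image's
/// 'omptarget_device_environment' global (if present) when a binary is
/// loaded; see __tgt_rtl_load_binary.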
struct omptarget_device_environmentTy {
  int32_t debug_level;
};

/// List that contains all the kernels.
/// FIXME: we may need this to be per device and per library.
std::list<KernelTy> KernelsList;

/// Class containing all the device information.
class RTLDeviceInfoTy {
  std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries;

public:
  int NumberOfDevices;
  std::vector<CUmodule> Modules;
  std::vector<CUcontext> Contexts;

  // Device properties
  std::vector<int> ThreadsPerBlock;
  std::vector<int> BlocksPerGrid;
  std::vector<int> WarpSize;

  // OpenMP properties
  std::vector<int> NumTeams;
  std::vector<int> NumThreads;

  // OpenMP Environment properties
  int EnvNumTeams;
  int EnvTeamLimit;

  //static int EnvNumThreads;
  static const int HardTeamLimit = 1 << 16; // 64k
  static const int HardThreadLimit = 1024;
  static const int DefaultNumTeams = 128;
  static const int DefaultNumThreads = 128;

  // Record entry point associated with device
  void addOffloadEntry(int32_t device_id, __tgt_offload_entry entry) {
    assert(device_id < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();

    E.Entries.push_back(entry);
  }

  // Return true if the entry is associated with device
  bool findOffloadEntry(int32_t device_id, void *addr) {
    assert(device_id < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();

    for (auto &it : E.Entries) {
      if (it.addr == addr)
        return true;
    }

    return false;
  }

  // Return the pointer to the target entries table
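  // Note: the returned table points into E.Entries, so it stays valid only as
  // long as no further entries are added (a vector reallocation would
  // invalidate the pointers). In practice all entries are registered before
  // the table is requested in __tgt_rtl_load_binary.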
  __tgt_target_table *getOffloadEntriesTable(int32_t device_id) {
    assert(device_id < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();

    int32_t size = E.Entries.size();

    // Table is empty
    if (!size)
      return 0;

    __tgt_offload_entry *begin = &E.Entries[0];
    __tgt_offload_entry *end = &E.Entries[size - 1];

    // Update table info according to the entries and return the pointer
    E.Table.EntriesBegin = begin;
    E.Table.EntriesEnd = ++end;

    return &E.Table;
  }

  // Clear entries table for a device
  void clearOffloadEntriesTable(int32_t device_id) {
    assert(device_id < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncGblEntries[device_id].emplace_back();
    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
    E.Entries.clear();
    E.Table.EntriesBegin = E.Table.EntriesEnd = 0;
  }

  RTLDeviceInfoTy() {
#ifdef OMPTARGET_DEBUG
    if (char *envStr = getenv("LIBOMPTARGET_DEBUG")) {
      DebugLevel = std::stoi(envStr);
    }
#endif // OMPTARGET_DEBUG

    DP("Start initializing CUDA\n");

    CUresult err = cuInit(0);
    if (err != CUDA_SUCCESS) {
      DP("Error when initializing CUDA\n");
      CUDA_ERR_STRING(err);
      return;
    }

    NumberOfDevices = 0;

    err = cuDeviceGetCount(&NumberOfDevices);
    if (err != CUDA_SUCCESS) {
      DP("Error when getting CUDA device count\n");
      CUDA_ERR_STRING(err);
      return;
    }

    if (NumberOfDevices == 0) {
      DP("There are no devices supporting CUDA.\n");
      return;
    }

    FuncGblEntries.resize(NumberOfDevices);
    Contexts.resize(NumberOfDevices);
    ThreadsPerBlock.resize(NumberOfDevices);
    BlocksPerGrid.resize(NumberOfDevices);
    WarpSize.resize(NumberOfDevices);
    NumTeams.resize(NumberOfDevices);
    NumThreads.resize(NumberOfDevices);

    // Get environment variables regarding teams
    char *envStr = getenv("OMP_TEAM_LIMIT");
    if (envStr) {
      // OMP_TEAM_LIMIT has been set
      EnvTeamLimit = std::stoi(envStr);
      DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
    } else {
      EnvTeamLimit = -1;
    }
    envStr = getenv("OMP_NUM_TEAMS");
    if (envStr) {
      // OMP_NUM_TEAMS has been set
      EnvNumTeams = std::stoi(envStr);
      DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
    } else {
      EnvNumTeams = -1;
    }
  }

  ~RTLDeviceInfoTy() {
    // Close modules
    for (auto &module : Modules)
      if (module) {
        CUresult err = cuModuleUnload(module);
        if (err != CUDA_SUCCESS) {
          DP("Error when unloading CUDA module\n");
          CUDA_ERR_STRING(err);
        }
      }

    // Destroy contexts
    for (auto &ctx : Contexts)
      if (ctx) {
        CUresult err = cuCtxDestroy(ctx);
        if (err != CUDA_SUCCESS) {
          DP("Error when destroying CUDA context\n");
          CUDA_ERR_STRING(err);
        }
      }
  }
};

static RTLDeviceInfoTy DeviceInfo;

#ifdef __cplusplus
extern "C" {
#endif

int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
  return elf_check_machine(image, 190); // EM_CUDA = 190.
}

int32_t __tgt_rtl_number_of_devices() { return DeviceInfo.NumberOfDevices; }

int32_t __tgt_rtl_init_device(int32_t device_id) {

  CUdevice cuDevice;
  DP("Getting device %d\n", device_id);
  CUresult err = cuDeviceGet(&cuDevice, device_id);
  if (err != CUDA_SUCCESS) {
    DP("Error when getting CUDA device with id = %d\n", device_id);
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  // Create the context and save it to use whenever this device is selected.
  err = cuCtxCreate(&DeviceInfo.Contexts[device_id], CU_CTX_SCHED_BLOCKING_SYNC,
                    cuDevice);
  if (err != CUDA_SUCCESS) {
    DP("Error when creating a CUDA context\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  // Query attributes to determine number of threads/block and blocks/grid.
  int maxGridDimX;
  err = cuDeviceGetAttribute(&maxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
                             cuDevice);
  if (err != CUDA_SUCCESS) {
    DP("Error getting max grid dimension, use default\n");
    DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::DefaultNumTeams;
  } else if (maxGridDimX <= RTLDeviceInfoTy::HardTeamLimit) {
    DeviceInfo.BlocksPerGrid[device_id] = maxGridDimX;
    DP("Using %d CUDA blocks per grid\n", maxGridDimX);
  } else {
    DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::HardTeamLimit;
    DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping "
       "at the hard limit\n",
       maxGridDimX, RTLDeviceInfoTy::HardTeamLimit);
  }

  // We are only exploiting threads along the x axis.
  int maxBlockDimX;
  err = cuDeviceGetAttribute(&maxBlockDimX, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X,
                             cuDevice);
  if (err != CUDA_SUCCESS) {
    DP("Error getting max block dimension, use default\n");
    DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::DefaultNumThreads;
  } else if (maxBlockDimX <= RTLDeviceInfoTy::HardThreadLimit) {
    DeviceInfo.ThreadsPerBlock[device_id] = maxBlockDimX;
    DP("Using %d CUDA threads per block\n", maxBlockDimX);
  } else {
    DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::HardThreadLimit;
    DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
       "capping at the hard limit\n",
       maxBlockDimX, RTLDeviceInfoTy::HardThreadLimit);
  }

  int warpSize;
  err =
      cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, cuDevice);
  if (err != CUDA_SUCCESS) {
    DP("Error getting warp size, assume default\n");
    DeviceInfo.WarpSize[device_id] = 32;
  } else {
    DeviceInfo.WarpSize[device_id] = warpSize;
  }

  // Adjust teams to the env variables
  if (DeviceInfo.EnvTeamLimit > 0 &&
      DeviceInfo.BlocksPerGrid[device_id] > DeviceInfo.EnvTeamLimit) {
    DeviceInfo.BlocksPerGrid[device_id] = DeviceInfo.EnvTeamLimit;
    DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
        DeviceInfo.EnvTeamLimit);
  }

  DP("Max number of CUDA blocks %d, threads %d & warp size %d\n",
     DeviceInfo.BlocksPerGrid[device_id], DeviceInfo.ThreadsPerBlock[device_id],
     DeviceInfo.WarpSize[device_id]);

  // Set default number of teams
  if (DeviceInfo.EnvNumTeams > 0) {
    DeviceInfo.NumTeams[device_id] = DeviceInfo.EnvNumTeams;
    DP("Default number of teams set according to environment %d\n",
        DeviceInfo.EnvNumTeams);
  } else {
    DeviceInfo.NumTeams[device_id] = RTLDeviceInfoTy::DefaultNumTeams;
    DP("Default number of teams set according to library's default %d\n",
        RTLDeviceInfoTy::DefaultNumTeams);
  }
  if (DeviceInfo.NumTeams[device_id] > DeviceInfo.BlocksPerGrid[device_id]) {
    DeviceInfo.NumTeams[device_id] = DeviceInfo.BlocksPerGrid[device_id];
    DP("Default number of teams exceeds device limit, capping at %d\n",
        DeviceInfo.BlocksPerGrid[device_id]);
  }

  // Set default number of threads
  DeviceInfo.NumThreads[device_id] = RTLDeviceInfoTy::DefaultNumThreads;
  DP("Default number of threads set according to library's default %d\n",
      RTLDeviceInfoTy::DefaultNumThreads);
  if (DeviceInfo.NumThreads[device_id] >
      DeviceInfo.ThreadsPerBlock[device_id]) {
    DeviceInfo.NumThreads[device_id] = DeviceInfo.ThreadsPerBlock[device_id];
    DP("Default number of threads exceeds device limit, capping at %d\n",
        DeviceInfo.ThreadsPerBlock[device_id]);
  }

  return OFFLOAD_SUCCESS;
}

__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
    __tgt_device_image *image) {

  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error when setting a CUDA context for device %d\n", device_id);
    CUDA_ERR_STRING(err);
    return NULL;
  }

  // Clear the offload table as we are going to create a new one.
  DeviceInfo.clearOffloadEntriesTable(device_id);

  // Create the module and extract the function pointers.

  CUmodule cumod;
  DP("Load data from image " DPxMOD "\n", DPxPTR(image->ImageStart));
  err = cuModuleLoadDataEx(&cumod, image->ImageStart, 0, NULL, NULL);
  if (err != CUDA_SUCCESS) {
    DP("Error when loading CUDA module\n");
    CUDA_ERR_STRING(err);
    return NULL;
  }

  DP("CUDA module successfully loaded!\n");
  DeviceInfo.Modules.push_back(cumod);

  // Find the symbols in the module by name.
  __tgt_offload_entry *HostBegin = image->EntriesBegin;
  __tgt_offload_entry *HostEnd = image->EntriesEnd;

  for (__tgt_offload_entry *e = HostBegin; e != HostEnd; ++e) {

    if (!e->addr) {
      // Return NULL in this case; the host is expected to provide a non-null
      // address that uniquely identifies each target region.
      DP("Invalid binary: host entry '<null>' (size = %zd)...\n", e->size);

      return NULL;
    }

    if (e->size) {
      __tgt_offload_entry entry = *e;

      CUdeviceptr cuptr;
      size_t cusize;
      err = cuModuleGetGlobal(&cuptr, &cusize, cumod, e->name);

      if (err != CUDA_SUCCESS) {
        DP("Loading global '%s' (Failed)\n", e->name);
        CUDA_ERR_STRING(err);
        return NULL;
      }

      if (cusize != e->size) {
        DP("Loading global '%s' - size mismatch (%zd != %zd)\n", e->name,
            cusize, e->size);
        CUDA_ERR_STRING(err);
        return NULL;
      }

      DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
          DPxPTR(e - HostBegin), e->name, DPxPTR(cuptr));
      entry.addr = (void *)cuptr;

      DeviceInfo.addOffloadEntry(device_id, entry);

      continue;
    }

    CUfunction fun;
    err = cuModuleGetFunction(&fun, cumod, e->name);

    if (err != CUDA_SUCCESS) {
      DP("Loading '%s' (Failed)\n", e->name);
      CUDA_ERR_STRING(err);
      return NULL;
    }

    DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
        DPxPTR(e - HostBegin), e->name, DPxPTR(fun));

    // default value GENERIC (in case symbol is missing from cubin file)
    int8_t ExecModeVal = ExecutionModeType::GENERIC;
    std::string ExecModeNameStr(e->name);
    ExecModeNameStr += "_exec_mode";
    const char *ExecModeName = ExecModeNameStr.c_str();

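    // The device image is expected to provide an int8_t global named
    // '<kernel name>_exec_mode' per kernel; read it back to learn whether the
    // kernel was compiled for SPMD (0) or generic (1) execution.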
    CUdeviceptr ExecModePtr;
    size_t cusize;
    err = cuModuleGetGlobal(&ExecModePtr, &cusize, cumod, ExecModeName);
    if (err == CUDA_SUCCESS) {
      if ((size_t)cusize != sizeof(int8_t)) {
        DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
           ExecModeName, cusize, sizeof(int8_t));
        CUDA_ERR_STRING(err);
        return NULL;
      }

      err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, cusize);
      if (err != CUDA_SUCCESS) {
        DP("Error when copying data from device to host. Pointers: "
           "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
           DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), cusize);
        CUDA_ERR_STRING(err);
        return NULL;
      }

      if (ExecModeVal < 0 || ExecModeVal > 1) {
        DP("Error wrong exec_mode value specified in cubin file: %d\n",
           ExecModeVal);
        return NULL;
      }
    } else {
      DP("Loading global exec_mode '%s' - symbol missing, using default value "
          "GENERIC (1)\n", ExecModeName);
      CUDA_ERR_STRING(err);
    }

    KernelsList.push_back(KernelTy(fun, ExecModeVal));

    __tgt_offload_entry entry = *e;
    entry.addr = (void *)&KernelsList.back();
    DeviceInfo.addOffloadEntry(device_id, entry);
  }

  // send device environment data to the device
  {
    omptarget_device_environmentTy device_env;

    device_env.debug_level = 0;

#ifdef OMPTARGET_DEBUG
    if (char *envStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) {
      device_env.debug_level = std::stoi(envStr);
    }
#endif

    const char *device_env_Name = "omptarget_device_environment";
    CUdeviceptr device_env_Ptr;
    size_t cusize;

    err = cuModuleGetGlobal(&device_env_Ptr, &cusize, cumod, device_env_Name);

    if (err == CUDA_SUCCESS) {
      if ((size_t)cusize != sizeof(device_env)) {
        DP("Global device_environment '%s' - size mismatch (%zu != %zu)\n",
            device_env_Name, cusize, sizeof(device_env));
        CUDA_ERR_STRING(err);
        return NULL;
      }

      err = cuMemcpyHtoD(device_env_Ptr, &device_env, cusize);
      if (err != CUDA_SUCCESS) {
        DP("Error when copying data from host to device. Pointers: "
            "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
            DPxPTR(&device_env), DPxPTR(device_env_Ptr), cusize);
        CUDA_ERR_STRING(err);
        return NULL;
      }

      DP("Sending global device environment data %zu bytes\n", (size_t)cusize);
    } else {
      DP("Finding global device environment '%s' - symbol missing.\n",
         device_env_Name);
      DP("Continuing; assuming this is a device RTL which does not accept "
         "environment settings.\n");
    }
  }

  return DeviceInfo.getOffloadEntriesTable(device_id);
}

void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *hst_ptr) {
  if (size == 0) {
    return NULL;
  }

  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error while trying to set CUDA current context\n");
    CUDA_ERR_STRING(err);
    return NULL;
  }

  CUdeviceptr ptr;
  err = cuMemAlloc(&ptr, size);
  if (err != CUDA_SUCCESS) {
    DP("Error while trying to allocate %d\n", err);
    CUDA_ERR_STRING(err);
    return NULL;
  }

  void *vptr = (void *)ptr;
  return vptr;
}

int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
    int64_t size) {
  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error when setting CUDA context\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  err = cuMemcpyHtoD((CUdeviceptr)tgt_ptr, hst_ptr, size);
  if (err != CUDA_SUCCESS) {
    DP("Error when copying data from host to device. Pointers: host = " DPxMOD
       ", device = " DPxMOD ", size = %" PRId64 "\n", DPxPTR(hst_ptr),
       DPxPTR(tgt_ptr), size);
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }
  return OFFLOAD_SUCCESS;
}

int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
    int64_t size) {
  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error when setting CUDA context\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  err = cuMemcpyDtoH(hst_ptr, (CUdeviceptr)tgt_ptr, size);
  if (err != CUDA_SUCCESS) {
    DP("Error when copying data from device to host. Pointers: host = " DPxMOD
        ", device = " DPxMOD ", size = %" PRId64 "\n", DPxPTR(hst_ptr),
        DPxPTR(tgt_ptr), size);
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }
  return OFFLOAD_SUCCESS;
}

int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error when setting CUDA context\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  err = cuMemFree((CUdeviceptr)tgt_ptr);
  if (err != CUDA_SUCCESS) {
    DP("Error when freeing CUDA memory\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }
  return OFFLOAD_SUCCESS;
}

int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
    void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
    int32_t thread_limit, uint64_t loop_tripcount) {
  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error when setting CUDA context\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  // All args are references: cuLaunchKernel expects an array of pointers to
  // the argument values, so store each device pointer (base + offset) in ptrs
  // and pass the address of that slot in args.
  std::vector<void *> args(arg_num);
  std::vector<void *> ptrs(arg_num);

  for (int32_t i = 0; i < arg_num; ++i) {
    ptrs[i] = (void *)((intptr_t)tgt_args[i] + tgt_offsets[i]);
    args[i] = &ptrs[i];
  }

  KernelTy *KernelInfo = (KernelTy *)tgt_entry_ptr;

  int cudaThreadsPerBlock;

  if (thread_limit > 0) {
    cudaThreadsPerBlock = thread_limit;
    DP("Setting CUDA threads per block to requested %d\n", thread_limit);
    // Add master warp if necessary
    if (KernelInfo->ExecutionMode == GENERIC) {
      cudaThreadsPerBlock += DeviceInfo.WarpSize[device_id];
      DP("Adding master warp: +%d threads\n", DeviceInfo.WarpSize[device_id]);
    }
  } else {
    cudaThreadsPerBlock = DeviceInfo.NumThreads[device_id];
    DP("Setting CUDA threads per block to default %d\n",
        DeviceInfo.NumThreads[device_id]);
  }

  if (cudaThreadsPerBlock > DeviceInfo.ThreadsPerBlock[device_id]) {
    cudaThreadsPerBlock = DeviceInfo.ThreadsPerBlock[device_id];
    DP("Threads per block capped at device limit %d\n",
        DeviceInfo.ThreadsPerBlock[device_id]);
  }

  int kernel_limit;
  err = cuFuncGetAttribute(&kernel_limit,
      CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, KernelInfo->Func);
  if (err == CUDA_SUCCESS) {
    if (kernel_limit < cudaThreadsPerBlock) {
      cudaThreadsPerBlock = kernel_limit;
      DP("Threads per block capped at kernel limit %d\n", kernel_limit);
    }
  }

  int cudaBlocksPerGrid;
  if (team_num <= 0) {
    if (loop_tripcount > 0 && DeviceInfo.EnvNumTeams < 0) {
      if (KernelInfo->ExecutionMode == SPMD) {
        // We have a combined construct, i.e. `target teams distribute parallel
        // for [simd]`. Launch enough teams that each thread executes one
        // iteration of the loop; round up to the nearest integer.
        cudaBlocksPerGrid = ((loop_tripcount - 1) / cudaThreadsPerBlock) + 1;
      } else {
        // If we reach this point, then we have a non-combined construct, i.e.
        // `teams distribute` with a nested `parallel for` and each team is
        // assigned one iteration of the `distribute` loop. E.g.:
        //
        // #pragma omp target teams distribute
        // for(...loop_tripcount...) {
        //   #pragma omp parallel for
        //   for(...) {}
        // }
        //
        // Threads within a team will execute the iterations of the `parallel`
        // loop.
        cudaBlocksPerGrid = loop_tripcount;
      }
      DP("Using %d teams due to loop trip count %" PRIu64 " and number of "
          "threads per block %d\n", cudaBlocksPerGrid, loop_tripcount,
          cudaThreadsPerBlock);
    } else {
      cudaBlocksPerGrid = DeviceInfo.NumTeams[device_id];
      DP("Using default number of teams %d\n", DeviceInfo.NumTeams[device_id]);
    }
  } else if (team_num > DeviceInfo.BlocksPerGrid[device_id]) {
    cudaBlocksPerGrid = DeviceInfo.BlocksPerGrid[device_id];
    DP("Capping number of teams to team limit %d\n",
        DeviceInfo.BlocksPerGrid[device_id]);
  } else {
    cudaBlocksPerGrid = team_num;
    DP("Using requested number of teams %d\n", team_num);
  }

  // Run on the device.
  DP("Launch kernel with %d blocks and %d threads\n", cudaBlocksPerGrid,
     cudaThreadsPerBlock);

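  // Launch on the default stream (hStream = 0) with no dynamic shared memory;
  // completion is enforced by the cuCtxSynchronize call below.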
  err = cuLaunchKernel(KernelInfo->Func, cudaBlocksPerGrid, 1, 1,
      cudaThreadsPerBlock, 1, 1, 0 /*bytes of shared memory*/, 0, &args[0], 0);
  if (err != CUDA_SUCCESS) {
    DP("Device kernel launch failed!\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  DP("Launch of entry point at " DPxMOD " successful!\n",
      DPxPTR(tgt_entry_ptr));

  CUresult sync_err = cuCtxSynchronize();
  if (sync_err != CUDA_SUCCESS) {
    DP("Kernel execution error at " DPxMOD "!\n", DPxPTR(tgt_entry_ptr));
    CUDA_ERR_STRING(sync_err);
    return OFFLOAD_FAIL;
  } else {
    DP("Kernel execution at " DPxMOD " successful!\n", DPxPTR(tgt_entry_ptr));
  }

  return OFFLOAD_SUCCESS;
}

int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
    void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num) {
  // use one team and the default number of threads.
  const int32_t team_num = 1;
  const int32_t thread_limit = 0;
  return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args,
      tgt_offsets, arg_num, team_num, thread_limit, 0);
}

#ifdef __cplusplus
}
#endif