//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//
//
// RTL for CUDA machine
//
//===----------------------------------------------------------------------===//

#include <cassert>
#include <cstddef>
#include <cuda.h>
#include <list>
#include <string>
#include <vector>

#include "omptargetplugin.h"

#ifndef TARGET_NAME
#define TARGET_NAME CUDA
#endif

#ifdef OMPTARGET_DEBUG
static int DebugLevel = 0;

#define GETNAME2(name) #name
#define GETNAME(name) GETNAME2(name)
#define DP(...) \
  do { \
    if (DebugLevel > 0) { \
      DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", __VA_ARGS__); \
    } \
  } while (false)
#else // OMPTARGET_DEBUG
#define DP(...) {}
#endif // OMPTARGET_DEBUG

#include "../../common/elf_common.c"

// Utility for retrieving and printing CUDA error string.
#ifdef CUDA_ERROR_REPORT
#define CUDA_ERR_STRING(err)                                                   \
  do {                                                                         \
    const char *errStr;                                                        \
    cuGetErrorString(err, &errStr);                                            \
    DP("CUDA error is: %s\n", errStr);                                         \
  } while (0)
#else
#define CUDA_ERR_STRING(err)                                                   \
  {}
#endif

/// Keep entries table per device.
struct FuncOrGblEntryTy {
  __tgt_target_table Table;
  std::vector<__tgt_offload_entry> Entries;
};

enum ExecutionModeType {
  SPMD, // constructors, destructors,
        // combined constructs (`teams distribute parallel for [simd]`)
  GENERIC, // everything else
  NONE
};

/// Use a single entity to encode a kernel and a set of flags
struct KernelTy {
  CUfunction Func;

  // execution mode of kernel
  // 0 - SPMD mode (without master warp)
  // 1 - Generic mode (with master warp)
  int8_t ExecutionMode;

  KernelTy(CUfunction _Func, int8_t _ExecutionMode)
      : Func(_Func), ExecutionMode(_ExecutionMode) {}
};

/// Device environment data.
/// Manually sync with the deviceRTL side for now; move to a dedicated header
/// file later.
struct omptarget_device_environmentTy {
  int32_t debug_level;
};

/// List that contains all the kernels.
/// FIXME: we may need this to be per device and per library.
std::list<KernelTy> KernelsList;

/// Class containing all the device information.
class RTLDeviceInfoTy {
  std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries;

public:
  int NumberOfDevices;
  std::vector<CUmodule> Modules;
  std::vector<CUcontext> Contexts;

  // Device properties
  std::vector<int> ThreadsPerBlock;
  std::vector<int> BlocksPerGrid;
  std::vector<int> WarpSize;

  // OpenMP properties
  std::vector<int> NumTeams;
  std::vector<int> NumThreads;

  // OpenMP Environment properties
  int EnvNumTeams;
  int EnvTeamLimit;

  //static int EnvNumThreads;
  static const int HardTeamLimit = 1 << 16; // 64k
  static const int HardThreadLimit = 1024;
  static const int DefaultNumTeams = 128;
  static const int DefaultNumThreads = 128;

  // Record entry point associated with device
  void addOffloadEntry(int32_t device_id, __tgt_offload_entry entry) {
    assert(device_id < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();

    E.Entries.push_back(entry);
  }

  // Return true if the entry is associated with device
  bool findOffloadEntry(int32_t device_id, void *addr) {
    assert(device_id < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();

    for (auto &it : E.Entries) {
      if (it.addr == addr)
        return true;
    }

    return false;
  }

  // Return the pointer to the target entries table
  __tgt_target_table *getOffloadEntriesTable(int32_t device_id) {
    assert(device_id < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();

    int32_t size = E.Entries.size();

    // Table is empty
    if (!size)
      return 0;

    __tgt_offload_entry *begin = &E.Entries[0];
    __tgt_offload_entry *end = &E.Entries[size - 1];

    // Update table info according to the entries and return the pointer
    E.Table.EntriesBegin = begin;
    E.Table.EntriesEnd = ++end;

    return &E.Table;
  }

  // Clear entries table for a device
  void clearOffloadEntriesTable(int32_t device_id) {
    assert(device_id < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncGblEntries[device_id].emplace_back();
    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
    E.Entries.clear();
    E.Table.EntriesBegin = E.Table.EntriesEnd = 0;
  }

  RTLDeviceInfoTy() {
#ifdef OMPTARGET_DEBUG
    if (char *envStr = getenv("LIBOMPTARGET_DEBUG")) {
      DebugLevel = std::stoi(envStr);
    }
#endif // OMPTARGET_DEBUG

    DP("Start initializing CUDA\n");

    CUresult err = cuInit(0);
    if (err != CUDA_SUCCESS) {
      DP("Error when initializing CUDA\n");
      CUDA_ERR_STRING(err);
      return;
    }

    NumberOfDevices = 0;

    err = cuDeviceGetCount(&NumberOfDevices);
    if (err != CUDA_SUCCESS) {
      DP("Error when getting CUDA device count\n");
      CUDA_ERR_STRING(err);
      return;
    }

    if (NumberOfDevices == 0) {
      DP("There are no devices supporting CUDA.\n");
      return;
    }

    FuncGblEntries.resize(NumberOfDevices);
    Contexts.resize(NumberOfDevices);
    ThreadsPerBlock.resize(NumberOfDevices);
    BlocksPerGrid.resize(NumberOfDevices);
    WarpSize.resize(NumberOfDevices);
    NumTeams.resize(NumberOfDevices);
    NumThreads.resize(NumberOfDevices);

    // Get environment variables regarding teams
    char *envStr = getenv("OMP_TEAM_LIMIT");
    if (envStr) {
      // OMP_TEAM_LIMIT has been set
      EnvTeamLimit = std::stoi(envStr);
      DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
    } else {
      EnvTeamLimit = -1;
    }
    envStr = getenv("OMP_NUM_TEAMS");
    if (envStr) {
      // OMP_NUM_TEAMS has been set
      EnvNumTeams = std::stoi(envStr);
      DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
    } else {
      EnvNumTeams = -1;
    }
  }

  ~RTLDeviceInfoTy() {
    // Close modules
    for (auto &module : Modules)
      if (module) {
        CUresult err = cuModuleUnload(module);
        if (err != CUDA_SUCCESS) {
          DP("Error when unloading CUDA module\n");
          CUDA_ERR_STRING(err);
        }
      }

    // Destroy contexts
    for (auto &ctx : Contexts)
      if (ctx) {
        CUresult err = cuCtxDestroy(ctx);
        if (err != CUDA_SUCCESS) {
          DP("Error when destroying CUDA context\n");
          CUDA_ERR_STRING(err);
        }
      }
  }
};

static RTLDeviceInfoTy DeviceInfo;

#ifdef __cplusplus
extern "C" {
#endif

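// Return non-zero if the provided device image is a valid CUDA (cubin) image,
// determined by checking the ELF machine field against EM_CUDA.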
int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
  return elf_check_machine(image, 190); // EM_CUDA = 190.
}

int32_t __tgt_rtl_number_of_devices() { return DeviceInfo.NumberOfDevices; }

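// Initialize the device: create a CUDA context for it and query the
// block/thread/warp limits that later kernel launches are clamped to.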
int32_t __tgt_rtl_init_device(int32_t device_id) {

  CUdevice cuDevice;
  DP("Getting device %d\n", device_id);
  CUresult err = cuDeviceGet(&cuDevice, device_id);
  if (err != CUDA_SUCCESS) {
    DP("Error when getting CUDA device with id = %d\n", device_id);
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  // Create the context and save it to use whenever this device is selected.
  err = cuCtxCreate(&DeviceInfo.Contexts[device_id], CU_CTX_SCHED_BLOCKING_SYNC,
                    cuDevice);
  if (err != CUDA_SUCCESS) {
    DP("Error when creating a CUDA context\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  // Query attributes to determine number of threads/block and blocks/grid.
  int maxGridDimX;
  err = cuDeviceGetAttribute(&maxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
                             cuDevice);
  if (err != CUDA_SUCCESS) {
    DP("Error getting max grid dimension, use default\n");
    DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::DefaultNumTeams;
  } else if (maxGridDimX <= RTLDeviceInfoTy::HardTeamLimit) {
    DeviceInfo.BlocksPerGrid[device_id] = maxGridDimX;
    DP("Using %d CUDA blocks per grid\n", maxGridDimX);
  } else {
    DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::HardTeamLimit;
    DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping "
       "at the hard limit\n",
       maxGridDimX, RTLDeviceInfoTy::HardTeamLimit);
  }

  // We are only exploiting threads along the x axis.
  int maxBlockDimX;
  err = cuDeviceGetAttribute(&maxBlockDimX, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X,
                             cuDevice);
  if (err != CUDA_SUCCESS) {
    DP("Error getting max block dimension, use default\n");
    DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::DefaultNumThreads;
  } else if (maxBlockDimX <= RTLDeviceInfoTy::HardThreadLimit) {
    DeviceInfo.ThreadsPerBlock[device_id] = maxBlockDimX;
    DP("Using %d CUDA threads per block\n", maxBlockDimX);
  } else {
    DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::HardThreadLimit;
317     DP("Max CUDA threads per block %d exceeds the hard thread limit %d, capping"
318        "at the hard limit\n",
319        maxBlockDimX, RTLDeviceInfoTy::HardThreadLimit);
  }

  int warpSize;
  err =
      cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, cuDevice);
  if (err != CUDA_SUCCESS) {
    DP("Error getting warp size, assume default\n");
    DeviceInfo.WarpSize[device_id] = 32;
  } else {
    DeviceInfo.WarpSize[device_id] = warpSize;
  }

  // Adjust teams to the env variables
  if (DeviceInfo.EnvTeamLimit > 0 &&
      DeviceInfo.BlocksPerGrid[device_id] > DeviceInfo.EnvTeamLimit) {
    DeviceInfo.BlocksPerGrid[device_id] = DeviceInfo.EnvTeamLimit;
    DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
        DeviceInfo.EnvTeamLimit);
  }

  DP("Max number of CUDA blocks %d, threads %d & warp size %d\n",
     DeviceInfo.BlocksPerGrid[device_id], DeviceInfo.ThreadsPerBlock[device_id],
     DeviceInfo.WarpSize[device_id]);

  // Set default number of teams
  if (DeviceInfo.EnvNumTeams > 0) {
    DeviceInfo.NumTeams[device_id] = DeviceInfo.EnvNumTeams;
    DP("Default number of teams set according to environment %d\n",
        DeviceInfo.EnvNumTeams);
  } else {
    DeviceInfo.NumTeams[device_id] = RTLDeviceInfoTy::DefaultNumTeams;
    DP("Default number of teams set according to library's default %d\n",
        RTLDeviceInfoTy::DefaultNumTeams);
  }
  if (DeviceInfo.NumTeams[device_id] > DeviceInfo.BlocksPerGrid[device_id]) {
    DeviceInfo.NumTeams[device_id] = DeviceInfo.BlocksPerGrid[device_id];
    DP("Default number of teams exceeds device limit, capping at %d\n",
        DeviceInfo.BlocksPerGrid[device_id]);
  }

  // Set default number of threads
  DeviceInfo.NumThreads[device_id] = RTLDeviceInfoTy::DefaultNumThreads;
  DP("Default number of threads set according to library's default %d\n",
      RTLDeviceInfoTy::DefaultNumThreads);
  if (DeviceInfo.NumThreads[device_id] >
      DeviceInfo.ThreadsPerBlock[device_id]) {
    DeviceInfo.NumThreads[device_id] = DeviceInfo.ThreadsPerBlock[device_id];
    DP("Default number of threads exceeds device limit, capping at %d\n",
        DeviceInfo.ThreadsPerBlock[device_id]);
  }

  return OFFLOAD_SUCCESS;
}

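// Load the device image into a CUDA module and build the offload entries
// table: globals are resolved with cuModuleGetGlobal, kernels with
// cuModuleGetFunction (plus their "<name>_exec_mode" flag), and the device
// environment is written to the module if the corresponding symbol exists.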
__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
    __tgt_device_image *image) {

  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error when setting a CUDA context for device %d\n", device_id);
    CUDA_ERR_STRING(err);
    return NULL;
  }

  // Clear the offload table as we are going to create a new one.
  DeviceInfo.clearOffloadEntriesTable(device_id);

  // Create the module and extract the function pointers.

  CUmodule cumod;
  DP("Load data from image " DPxMOD "\n", DPxPTR(image->ImageStart));
  err = cuModuleLoadDataEx(&cumod, image->ImageStart, 0, NULL, NULL);
  if (err != CUDA_SUCCESS) {
    DP("Error when loading CUDA module\n");
    CUDA_ERR_STRING(err);
    return NULL;
  }

  DP("CUDA module successfully loaded!\n");
  DeviceInfo.Modules.push_back(cumod);

  // Find the symbols in the module by name.
  __tgt_offload_entry *HostBegin = image->EntriesBegin;
  __tgt_offload_entry *HostEnd = image->EntriesEnd;

  for (__tgt_offload_entry *e = HostBegin; e != HostEnd; ++e) {

    if (!e->addr) {
      // We return NULL when something like this happens: the host should
      // always have something in the address to uniquely identify the target
      // region.
      DP("Invalid binary: host entry '<null>' (size = %zd)...\n", e->size);

      return NULL;
    }

    if (e->size) {
      __tgt_offload_entry entry = *e;

      CUdeviceptr cuptr;
      size_t cusize;
      err = cuModuleGetGlobal(&cuptr, &cusize, cumod, e->name);

      if (err != CUDA_SUCCESS) {
        DP("Loading global '%s' (Failed)\n", e->name);
        CUDA_ERR_STRING(err);
        return NULL;
      }

      if (cusize != e->size) {
        DP("Loading global '%s' - size mismatch (%zd != %zd)\n", e->name,
            cusize, e->size);
        CUDA_ERR_STRING(err);
        return NULL;
      }

      DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
          DPxPTR(e - HostBegin), e->name, DPxPTR(cuptr));
      entry.addr = (void *)cuptr;

      DeviceInfo.addOffloadEntry(device_id, entry);

      continue;
    }

    CUfunction fun;
    err = cuModuleGetFunction(&fun, cumod, e->name);

    if (err != CUDA_SUCCESS) {
      DP("Loading '%s' (Failed)\n", e->name);
      CUDA_ERR_STRING(err);
      return NULL;
    }

    DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
        DPxPTR(e - HostBegin), e->name, DPxPTR(fun));

    // default value GENERIC (in case symbol is missing from cubin file)
    int8_t ExecModeVal = ExecutionModeType::GENERIC;
    std::string ExecModeNameStr(e->name);
    ExecModeNameStr += "_exec_mode";
    const char *ExecModeName = ExecModeNameStr.c_str();

    CUdeviceptr ExecModePtr;
    size_t cusize;
    err = cuModuleGetGlobal(&ExecModePtr, &cusize, cumod, ExecModeName);
    if (err == CUDA_SUCCESS) {
      if ((size_t)cusize != sizeof(int8_t)) {
        DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
           ExecModeName, cusize, sizeof(int8_t));
        CUDA_ERR_STRING(err);
        return NULL;
      }

      err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, cusize);
      if (err != CUDA_SUCCESS) {
        DP("Error when copying data from device to host. Pointers: "
           "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
           DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), cusize);
        CUDA_ERR_STRING(err);
        return NULL;
      }

      if (ExecModeVal < 0 || ExecModeVal > 1) {
        DP("Error wrong exec_mode value specified in cubin file: %d\n",
           ExecModeVal);
        return NULL;
      }
    } else {
      DP("Loading global exec_mode '%s' - symbol missing, using default value "
          "GENERIC (1)\n", ExecModeName);
      CUDA_ERR_STRING(err);
    }

    KernelsList.push_back(KernelTy(fun, ExecModeVal));

    __tgt_offload_entry entry = *e;
    entry.addr = (void *)&KernelsList.back();
    DeviceInfo.addOffloadEntry(device_id, entry);
  }

  // send device environment data to the device
  {
    omptarget_device_environmentTy device_env;

    device_env.debug_level = 0;

#ifdef OMPTARGET_DEBUG
    if (char *envStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) {
      device_env.debug_level = std::stoi(envStr);
    }
#endif

    const char *device_env_Name = "omptarget_device_environment";
    CUdeviceptr device_env_Ptr;
    size_t cusize;

    err = cuModuleGetGlobal(&device_env_Ptr, &cusize, cumod, device_env_Name);

    if (err == CUDA_SUCCESS) {
      if ((size_t)cusize != sizeof(device_env)) {
521         DP("Global device_environment '%s' - size mismatch (%zu != %zu)\n",
522             device_env_Name, cusize, sizeof(int32_t));
        CUDA_ERR_STRING(err);
        return NULL;
      }

      err = cuMemcpyHtoD(device_env_Ptr, &device_env, cusize);
      if (err != CUDA_SUCCESS) {
        DP("Error when copying data from host to device. Pointers: "
            "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
            DPxPTR(&device_env), DPxPTR(device_env_Ptr), cusize);
        CUDA_ERR_STRING(err);
        return NULL;
      }

      DP("Sending global device environment data %zu bytes\n",
         (size_t)cusize);
    } else {
538       DP("Finding global device environment '%s' - symbol missing.\n", device_env_Name);
539       DP("Continue, considering this is a device RTL which does not accept envrionment setting.\n");
540     }
  }

  return DeviceInfo.getOffloadEntriesTable(device_id);
}

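// Allocate 'size' bytes of device memory and return the device pointer, or
// NULL on failure. The host pointer argument is unused by this plugin.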
void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *hst_ptr) {
  if (size == 0) {
    return NULL;
  }

  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error while trying to set CUDA current context\n");
    CUDA_ERR_STRING(err);
    return NULL;
  }

  CUdeviceptr ptr;
  err = cuMemAlloc(&ptr, size);
  if (err != CUDA_SUCCESS) {
    DP("Error while trying to allocate %d\n", err);
    CUDA_ERR_STRING(err);
    return NULL;
  }

  void *vptr = (void *)ptr;
  return vptr;
}

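// Copy 'size' bytes from host memory to previously allocated device memory.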
int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
    int64_t size) {
  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error when setting CUDA context\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  err = cuMemcpyHtoD((CUdeviceptr)tgt_ptr, hst_ptr, size);
  if (err != CUDA_SUCCESS) {
    DP("Error when copying data from host to device. Pointers: host = " DPxMOD
       ", device = " DPxMOD ", size = %" PRId64 "\n", DPxPTR(hst_ptr),
       DPxPTR(tgt_ptr), size);
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }
  return OFFLOAD_SUCCESS;
}

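// Copy 'size' bytes from device memory back to host memory.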
int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
    int64_t size) {
  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error when setting CUDA context\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  err = cuMemcpyDtoH(hst_ptr, (CUdeviceptr)tgt_ptr, size);
  if (err != CUDA_SUCCESS) {
    DP("Error when copying data from device to host. Pointers: host = " DPxMOD
        ", device = " DPxMOD ", size = %" PRId64 "\n", DPxPTR(hst_ptr),
        DPxPTR(tgt_ptr), size);
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }
  return OFFLOAD_SUCCESS;
}

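// Free device memory previously allocated with __tgt_rtl_data_alloc.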
int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error when setting CUDA context\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  err = cuMemFree((CUdeviceptr)tgt_ptr);
  if (err != CUDA_SUCCESS) {
    DP("Error when freeing CUDA memory\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }
  return OFFLOAD_SUCCESS;
}

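// Launch the kernel identified by 'tgt_entry_ptr'. The grid and block sizes
// are derived from the requested teams/thread_limit, the loop trip count and
// the device/kernel limits; the call synchronizes on kernel completion.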
int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
    void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
    int32_t thread_limit, uint64_t loop_tripcount) {
  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error when setting CUDA context\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  // All args are references.
  std::vector<void *> args(arg_num);
  std::vector<void *> ptrs(arg_num);

  for (int32_t i = 0; i < arg_num; ++i) {
    ptrs[i] = (void *)((intptr_t)tgt_args[i] + tgt_offsets[i]);
    args[i] = &ptrs[i];
  }

  KernelTy *KernelInfo = (KernelTy *)tgt_entry_ptr;

  int cudaThreadsPerBlock;

  if (thread_limit > 0) {
    cudaThreadsPerBlock = thread_limit;
    DP("Setting CUDA threads per block to requested %d\n", thread_limit);
    // Add master warp if necessary
    if (KernelInfo->ExecutionMode == GENERIC) {
      cudaThreadsPerBlock += DeviceInfo.WarpSize[device_id];
      DP("Adding master warp: +%d threads\n", DeviceInfo.WarpSize[device_id]);
    }
  } else {
    cudaThreadsPerBlock = DeviceInfo.NumThreads[device_id];
    DP("Setting CUDA threads per block to default %d\n",
        DeviceInfo.NumThreads[device_id]);
  }

  if (cudaThreadsPerBlock > DeviceInfo.ThreadsPerBlock[device_id]) {
    cudaThreadsPerBlock = DeviceInfo.ThreadsPerBlock[device_id];
    DP("Threads per block capped at device limit %d\n",
        DeviceInfo.ThreadsPerBlock[device_id]);
  }

  int kernel_limit;
  err = cuFuncGetAttribute(&kernel_limit,
      CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, KernelInfo->Func);
  if (err == CUDA_SUCCESS) {
    if (kernel_limit < cudaThreadsPerBlock) {
      cudaThreadsPerBlock = kernel_limit;
      DP("Threads per block capped at kernel limit %d\n", kernel_limit);
    }
  }

  int cudaBlocksPerGrid;
  if (team_num <= 0) {
    if (loop_tripcount > 0 && DeviceInfo.EnvNumTeams < 0) {
      if (KernelInfo->ExecutionMode == SPMD) {
        // We have a combined construct, i.e. `target teams distribute parallel
        // for [simd]`. We launch as many teams as needed so that each thread
        // executes one iteration of the loop.
        // Round up to the nearest integer.
        cudaBlocksPerGrid = ((loop_tripcount - 1) / cudaThreadsPerBlock) + 1;
      } else {
        // If we reach this point, then we have a non-combined construct, i.e.
        // `teams distribute` with a nested `parallel for` and each team is
        // assigned one iteration of the `distribute` loop. E.g.:
        //
        // #pragma omp target teams distribute
        // for(...loop_tripcount...) {
        //   #pragma omp parallel for
        //   for(...) {}
        // }
        //
        // Threads within a team will execute the iterations of the `parallel`
        // loop.
        cudaBlocksPerGrid = loop_tripcount;
      }
      DP("Using %d teams due to loop trip count %" PRIu64 " and number of "
          "threads per block %d\n", cudaBlocksPerGrid, loop_tripcount,
          cudaThreadsPerBlock);
    } else {
      cudaBlocksPerGrid = DeviceInfo.NumTeams[device_id];
      DP("Using default number of teams %d\n", DeviceInfo.NumTeams[device_id]);
    }
  } else if (team_num > DeviceInfo.BlocksPerGrid[device_id]) {
    cudaBlocksPerGrid = DeviceInfo.BlocksPerGrid[device_id];
    DP("Capping number of teams to team limit %d\n",
        DeviceInfo.BlocksPerGrid[device_id]);
  } else {
    cudaBlocksPerGrid = team_num;
    DP("Using requested number of teams %d\n", team_num);
  }

  // Run on the device.
  DP("Launch kernel with %d blocks and %d threads\n", cudaBlocksPerGrid,
     cudaThreadsPerBlock);

  err = cuLaunchKernel(KernelInfo->Func, cudaBlocksPerGrid, 1, 1,
      cudaThreadsPerBlock, 1, 1, 0 /*bytes of shared memory*/, 0, &args[0], 0);
  if (err != CUDA_SUCCESS) {
    DP("Device kernel launch failed!\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  DP("Launch of entry point at " DPxMOD " successful!\n",
      DPxPTR(tgt_entry_ptr));

  CUresult sync_err = cuCtxSynchronize();
  if (sync_err != CUDA_SUCCESS) {
    DP("Kernel execution error at " DPxMOD "!\n", DPxPTR(tgt_entry_ptr));
    CUDA_ERR_STRING(sync_err);
    return OFFLOAD_FAIL;
  } else {
    DP("Kernel execution at " DPxMOD " successful!\n", DPxPTR(tgt_entry_ptr));
  }

  return OFFLOAD_SUCCESS;
}

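// Launch a target region that has no teams clause: one team with the default
// number of threads.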
int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
    void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num) {
  // use one team and the default number of threads.
  const int32_t team_num = 1;
  const int32_t thread_limit = 0;
  return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args,
      tgt_offsets, arg_num, team_num, thread_limit, 0);
}

#ifdef __cplusplus
}
#endif