//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//
//
// RTL for CUDA machine
//
//===----------------------------------------------------------------------===//

#include <cassert>
#include <cstddef>
#include <cuda.h>
#include <list>
#include <string>
#include <vector>

#include "omptargetplugin.h"

#ifndef TARGET_NAME
#define TARGET_NAME CUDA
#endif

#ifdef OMPTARGET_DEBUG
static int DebugLevel = 0;

#define GETNAME2(name) #name
#define GETNAME(name) GETNAME2(name)
#define DP(...) \
  do { \
    if (DebugLevel > 0) { \
      DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", __VA_ARGS__); \
    } \
  } while (false)
#else // OMPTARGET_DEBUG
#define DP(...) {}
#endif // OMPTARGET_DEBUG
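
// Debug output is enabled at run time through the LIBOMPTARGET_DEBUG
// environment variable, which is read once in the RTLDeviceInfoTy constructor
// below (e.g. LIBOMPTARGET_DEBUG=1 turns the DP() messages on in debug builds).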

#include "../../common/elf_common.c"

// Utility for retrieving and printing CUDA error string.
#ifdef CUDA_ERROR_REPORT
#define CUDA_ERR_STRING(err)                                                   \
  do {                                                                         \
    const char *errStr;                                                        \
    cuGetErrorString(err, &errStr);                                            \
    DP("CUDA error is: %s\n", errStr);                                         \
  } while (0)
#else
#define CUDA_ERR_STRING(err)                                                   \
  {}
#endif
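// Most CUDA driver API calls below follow the same error-handling pattern:
// compare the CUresult against CUDA_SUCCESS, emit a DP() message, print the
// driver's error string via CUDA_ERR_STRING(), and return OFFLOAD_FAIL (or
// NULL). A sketch of the pattern as it is used throughout this file:
//
//   CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
//   if (err != CUDA_SUCCESS) {
//     DP("Error when setting CUDA context\n");
//     CUDA_ERR_STRING(err);
//     return OFFLOAD_FAIL;
//   }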

/// Keep entries table per device.
struct FuncOrGblEntryTy {
  __tgt_target_table Table;
  std::vector<__tgt_offload_entry> Entries;
};

enum ExecutionModeType {
  SPMD, // constructors, destructors,
        // combined constructs (`teams distribute parallel for [simd]`)
  GENERIC, // everything else
  NONE
};

/// Use a single entity to encode a kernel and a set of flags
struct KernelTy {
  CUfunction Func;

  // execution mode of kernel
  // 0 - SPMD mode (without master warp)
  // 1 - Generic mode (with master warp)
  int8_t ExecutionMode;

  KernelTy(CUfunction _Func, int8_t _ExecutionMode)
      : Func(_Func), ExecutionMode(_ExecutionMode) {}
};

/// List that contains all the kernels.
/// FIXME: we may need this to be per device and per library.
std::list<KernelTy> KernelsList;
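// Note: __tgt_rtl_load_binary stores the address of each KernelTy element of
// this list in the corresponding offload entry's addr field; libomptarget
// later hands that address back as tgt_entry_ptr when launching, and std::list
// keeps element addresses stable as more kernels are appended.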

/// Class containing all the device information.
class RTLDeviceInfoTy {
  std::vector<FuncOrGblEntryTy> FuncGblEntries;

public:
  int NumberOfDevices;
  std::vector<CUmodule> Modules;
  std::vector<CUcontext> Contexts;

  // Device properties
  std::vector<int> ThreadsPerBlock;
  std::vector<int> BlocksPerGrid;
  std::vector<int> WarpSize;

  // OpenMP properties
  std::vector<int> NumTeams;
  std::vector<int> NumThreads;

  // OpenMP Environment properties
  int EnvNumTeams;
  int EnvTeamLimit;

  //static int EnvNumThreads;
  static const int HardTeamLimit = 1<<16; // 64k
  static const int HardThreadLimit = 1024;
  static const int DefaultNumTeams = 128;
  static const int DefaultNumThreads = 128;

  // Record entry point associated with device
  void addOffloadEntry(int32_t device_id, __tgt_offload_entry entry) {
    assert(device_id < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncOrGblEntryTy &E = FuncGblEntries[device_id];

    E.Entries.push_back(entry);
  }

  // Return true if the entry is associated with the device
  bool findOffloadEntry(int32_t device_id, void *addr) {
    assert(device_id < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncOrGblEntryTy &E = FuncGblEntries[device_id];

    for (auto &it : E.Entries) {
      if (it.addr == addr)
        return true;
    }

    return false;
  }

  // Return the pointer to the target entries table
  __tgt_target_table *getOffloadEntriesTable(int32_t device_id) {
    assert(device_id < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncOrGblEntryTy &E = FuncGblEntries[device_id];

    int32_t size = E.Entries.size();

    // Table is empty
    if (!size)
      return 0;

    __tgt_offload_entry *begin = &E.Entries[0];
    __tgt_offload_entry *end = &E.Entries[size - 1];

    // Update table info according to the entries and return the pointer
    E.Table.EntriesBegin = begin;
    E.Table.EntriesEnd = ++end;

    return &E.Table;
  }
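  // The begin/end pointers stored in Table are recomputed on every call above:
  // Entries may reallocate while addOffloadEntry is still being called, so the
  // table is only fetched once an image has been fully processed.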

  // Clear entries table for a device
  void clearOffloadEntriesTable(int32_t device_id) {
    assert(device_id < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncOrGblEntryTy &E = FuncGblEntries[device_id];
    E.Entries.clear();
    E.Table.EntriesBegin = E.Table.EntriesEnd = 0;
  }

  RTLDeviceInfoTy() {
#ifdef OMPTARGET_DEBUG
    if (char *envStr = getenv("LIBOMPTARGET_DEBUG")) {
      DebugLevel = std::stoi(envStr);
    }
#endif // OMPTARGET_DEBUG

    DP("Start initializing CUDA\n");

    CUresult err = cuInit(0);
    if (err != CUDA_SUCCESS) {
      DP("Error when initializing CUDA\n");
      CUDA_ERR_STRING(err);
      return;
    }

    NumberOfDevices = 0;

    err = cuDeviceGetCount(&NumberOfDevices);
    if (err != CUDA_SUCCESS) {
      DP("Error when getting CUDA device count\n");
      CUDA_ERR_STRING(err);
      return;
    }

    if (NumberOfDevices == 0) {
      DP("There are no devices supporting CUDA.\n");
      return;
    }

    FuncGblEntries.resize(NumberOfDevices);
    Contexts.resize(NumberOfDevices);
    ThreadsPerBlock.resize(NumberOfDevices);
    BlocksPerGrid.resize(NumberOfDevices);
    WarpSize.resize(NumberOfDevices);
    NumTeams.resize(NumberOfDevices);
    NumThreads.resize(NumberOfDevices);

    // Get environment variables regarding teams
    char *envStr = getenv("OMP_TEAM_LIMIT");
    if (envStr) {
      // OMP_TEAM_LIMIT has been set
      EnvTeamLimit = std::stoi(envStr);
      DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
    } else {
      EnvTeamLimit = -1;
    }
    envStr = getenv("OMP_NUM_TEAMS");
    if (envStr) {
      // OMP_NUM_TEAMS has been set
      EnvNumTeams = std::stoi(envStr);
      DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
    } else {
      EnvNumTeams = -1;
    }
  }

  ~RTLDeviceInfoTy() {
    // Close modules
    for (auto &module : Modules)
      if (module) {
        CUresult err = cuModuleUnload(module);
        if (err != CUDA_SUCCESS) {
          DP("Error when unloading CUDA module\n");
          CUDA_ERR_STRING(err);
        }
      }

    // Destroy contexts
    for (auto &ctx : Contexts)
      if (ctx) {
        CUresult err = cuCtxDestroy(ctx);
        if (err != CUDA_SUCCESS) {
          DP("Error when destroying CUDA context\n");
          CUDA_ERR_STRING(err);
        }
      }
  }
};

static RTLDeviceInfoTy DeviceInfo;

#ifdef __cplusplus
extern "C" {
#endif

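// elf_check_machine (defined in elf_common.c) checks that the image is an ELF
// object whose e_machine field matches the requested value; 190 is the machine
// value used for CUDA (cubin) images.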
int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
  return elf_check_machine(image, 190); // EM_CUDA = 190.
}

int32_t __tgt_rtl_number_of_devices() { return DeviceInfo.NumberOfDevices; }

int32_t __tgt_rtl_init_device(int32_t device_id) {

  CUdevice cuDevice;
  DP("Getting device %d\n", device_id);
  CUresult err = cuDeviceGet(&cuDevice, device_id);
  if (err != CUDA_SUCCESS) {
    DP("Error when getting CUDA device with id = %d\n", device_id);
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  // Create the context and save it to use whenever this device is selected.
  err = cuCtxCreate(&DeviceInfo.Contexts[device_id], CU_CTX_SCHED_BLOCKING_SYNC,
                    cuDevice);
  if (err != CUDA_SUCCESS) {
    DP("Error when creating a CUDA context\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  // Scan properties to determine the number of threads/block and blocks/grid.
  CUdevprop Properties;
  err = cuDeviceGetProperties(&Properties, cuDevice);
  if (err != CUDA_SUCCESS) {
    DP("Error getting device properties, using defaults\n");
    DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::DefaultNumTeams;
    DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::DefaultNumThreads;
    DeviceInfo.WarpSize[device_id] = 32;
  } else {
    // Get blocks per grid
    if (Properties.maxGridSize[0] <= RTLDeviceInfoTy::HardTeamLimit) {
      DeviceInfo.BlocksPerGrid[device_id] = Properties.maxGridSize[0];
      DP("Using %d CUDA blocks per grid\n", Properties.maxGridSize[0]);
    } else {
      DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::HardTeamLimit;
      DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping "
          "at the hard limit\n", Properties.maxGridSize[0],
          RTLDeviceInfoTy::HardTeamLimit);
    }

    // Get threads per block, exploit threads only along x axis
    if (Properties.maxThreadsDim[0] <= RTLDeviceInfoTy::HardThreadLimit) {
      DeviceInfo.ThreadsPerBlock[device_id] = Properties.maxThreadsDim[0];
      DP("Using %d CUDA threads per block\n", Properties.maxThreadsDim[0]);
      if (Properties.maxThreadsDim[0] < Properties.maxThreadsPerBlock) {
        DP("(fewer than max per block along all xyz dims %d)\n",
            Properties.maxThreadsPerBlock);
      }
    } else {
      DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::HardThreadLimit;
      DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
          "capping at the hard limit\n", Properties.maxThreadsDim[0],
          RTLDeviceInfoTy::HardThreadLimit);
    }

    // According to the documentation, SIMDWidth is "Warp size in threads".
    DeviceInfo.WarpSize[device_id] = Properties.SIMDWidth;
  }

  // Adjust teams to the env variables
  if (DeviceInfo.EnvTeamLimit > 0 &&
      DeviceInfo.BlocksPerGrid[device_id] > DeviceInfo.EnvTeamLimit) {
    DeviceInfo.BlocksPerGrid[device_id] = DeviceInfo.EnvTeamLimit;
    DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
        DeviceInfo.EnvTeamLimit);
  }

  DP("Max number of CUDA blocks %d, threads %d & warp size %d\n",
     DeviceInfo.BlocksPerGrid[device_id], DeviceInfo.ThreadsPerBlock[device_id],
     DeviceInfo.WarpSize[device_id]);

  // Set default number of teams
  if (DeviceInfo.EnvNumTeams > 0) {
    DeviceInfo.NumTeams[device_id] = DeviceInfo.EnvNumTeams;
    DP("Default number of teams set according to environment %d\n",
        DeviceInfo.EnvNumTeams);
  } else {
    DeviceInfo.NumTeams[device_id] = RTLDeviceInfoTy::DefaultNumTeams;
    DP("Default number of teams set according to library's default %d\n",
        RTLDeviceInfoTy::DefaultNumTeams);
  }
  if (DeviceInfo.NumTeams[device_id] > DeviceInfo.BlocksPerGrid[device_id]) {
    DeviceInfo.NumTeams[device_id] = DeviceInfo.BlocksPerGrid[device_id];
    DP("Default number of teams exceeds device limit, capping at %d\n",
        DeviceInfo.BlocksPerGrid[device_id]);
  }

  // Set default number of threads
  DeviceInfo.NumThreads[device_id] = RTLDeviceInfoTy::DefaultNumThreads;
  DP("Default number of threads set according to library's default %d\n",
          RTLDeviceInfoTy::DefaultNumThreads);
  if (DeviceInfo.NumThreads[device_id] >
      DeviceInfo.ThreadsPerBlock[device_id]) {
    DeviceInfo.NumThreads[device_id] = DeviceInfo.ThreadsPerBlock[device_id];
    DP("Default number of threads exceeds device limit, capping at %d\n",
        DeviceInfo.ThreadsPerBlock[device_id]);
  }

  return OFFLOAD_SUCCESS;
}

__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
    __tgt_device_image *image) {

  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error when setting a CUDA context for device %d\n", device_id);
    CUDA_ERR_STRING(err);
    return NULL;
  }

  // Clear the offload table as we are going to create a new one.
  DeviceInfo.clearOffloadEntriesTable(device_id);

  // Create the module and extract the function pointers.

  CUmodule cumod;
  DP("Load data from image " DPxMOD "\n", DPxPTR(image->ImageStart));
  err = cuModuleLoadDataEx(&cumod, image->ImageStart, 0, NULL, NULL);
  if (err != CUDA_SUCCESS) {
    DP("Error when loading CUDA module\n");
    CUDA_ERR_STRING(err);
    return NULL;
  }

  DP("CUDA module successfully loaded!\n");
  DeviceInfo.Modules.push_back(cumod);

  // Find the symbols in the module by name.
  __tgt_offload_entry *HostBegin = image->EntriesBegin;
  __tgt_offload_entry *HostEnd = image->EntriesEnd;

  for (__tgt_offload_entry *e = HostBegin; e != HostEnd; ++e) {

    if (!e->addr) {
      // Return NULL in this case; the host should always provide a non-null
      // address that uniquely identifies the target region.
      DP("Invalid binary: host entry '<null>' (size = %zd)...\n", e->size);

      return NULL;
    }

    if (e->size) {
      __tgt_offload_entry entry = *e;

      CUdeviceptr cuptr;
      size_t cusize;
      err = cuModuleGetGlobal(&cuptr, &cusize, cumod, e->name);

      if (err != CUDA_SUCCESS) {
        DP("Loading global '%s' (Failed)\n", e->name);
        CUDA_ERR_STRING(err);
        return NULL;
      }

      if (cusize != e->size) {
        DP("Loading global '%s' - size mismatch (%zd != %zd)\n", e->name,
            cusize, e->size);
        CUDA_ERR_STRING(err);
        return NULL;
      }

      DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
          DPxPTR(e - HostBegin), e->name, DPxPTR(cuptr));
      entry.addr = (void *)cuptr;

      DeviceInfo.addOffloadEntry(device_id, entry);

      continue;
    }

    CUfunction fun;
    err = cuModuleGetFunction(&fun, cumod, e->name);

    if (err != CUDA_SUCCESS) {
      DP("Loading '%s' (Failed)\n", e->name);
      CUDA_ERR_STRING(err);
      return NULL;
    }

    DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
        DPxPTR(e - HostBegin), e->name, DPxPTR(fun));

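    // The device image is expected to contain, next to each kernel, an int8_t
    // global named "<kernel name>_exec_mode" recording whether the kernel was
    // compiled for SPMD (0) or Generic (1) execution; read it here so the
    // launch code can size thread blocks accordingly. For a kernel entry
    // named `foo`, the symbol looked up below is `foo_exec_mode`.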
    // default value GENERIC (in case symbol is missing from cubin file)
    int8_t ExecModeVal = ExecutionModeType::GENERIC;
    std::string ExecModeNameStr (e->name);
    ExecModeNameStr += "_exec_mode";
    const char *ExecModeName = ExecModeNameStr.c_str();

    CUdeviceptr ExecModePtr;
    size_t cusize;
    err = cuModuleGetGlobal(&ExecModePtr, &cusize, cumod, ExecModeName);
    if (err == CUDA_SUCCESS) {
      if ((size_t)cusize != sizeof(int8_t)) {
        DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
           ExecModeName, cusize, sizeof(int8_t));
        CUDA_ERR_STRING(err);
        return NULL;
      }

      err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, cusize);
      if (err != CUDA_SUCCESS) {
        DP("Error when copying data from device to host. Pointers: "
           "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
           DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), cusize);
        CUDA_ERR_STRING(err);
        return NULL;
      }

      if (ExecModeVal < 0 || ExecModeVal > 1) {
        DP("Error wrong exec_mode value specified in cubin file: %d\n",
           ExecModeVal);
        return NULL;
      }
    } else {
      DP("Loading global exec_mode '%s' - symbol missing, using default value "
          "GENERIC (1)\n", ExecModeName);
      CUDA_ERR_STRING(err);
    }

    KernelsList.push_back(KernelTy(fun, ExecModeVal));

    __tgt_offload_entry entry = *e;
    entry.addr = (void *)&KernelsList.back();
    DeviceInfo.addOffloadEntry(device_id, entry);
  }

  return DeviceInfo.getOffloadEntriesTable(device_id);
}

void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *hst_ptr) {
  if (size == 0) {
    return NULL;
  }

  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error while trying to set CUDA current context\n");
    CUDA_ERR_STRING(err);
    return NULL;
  }

  CUdeviceptr ptr;
  err = cuMemAlloc(&ptr, size);
  if (err != CUDA_SUCCESS) {
    DP("Error while trying to allocate %d\n", err);
    CUDA_ERR_STRING(err);
    return NULL;
  }

  void *vptr = (void *)ptr;
  return vptr;
}

int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
    int64_t size) {
  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error when setting CUDA context\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  err = cuMemcpyHtoD((CUdeviceptr)tgt_ptr, hst_ptr, size);
  if (err != CUDA_SUCCESS) {
    DP("Error when copying data from host to device. Pointers: host = " DPxMOD
       ", device = " DPxMOD ", size = %" PRId64 "\n", DPxPTR(hst_ptr),
       DPxPTR(tgt_ptr), size);
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }
  return OFFLOAD_SUCCESS;
}

int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
    int64_t size) {
  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error when setting CUDA context\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  err = cuMemcpyDtoH(hst_ptr, (CUdeviceptr)tgt_ptr, size);
  if (err != CUDA_SUCCESS) {
    DP("Error when copying data from device to host. Pointers: host = " DPxMOD
        ", device = " DPxMOD ", size = %" PRId64 "\n", DPxPTR(hst_ptr),
        DPxPTR(tgt_ptr), size);
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }
  return OFFLOAD_SUCCESS;
}

int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error when setting CUDA context\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  err = cuMemFree((CUdeviceptr)tgt_ptr);
  if (err != CUDA_SUCCESS) {
    DP("Error when freeing CUDA memory\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }
  return OFFLOAD_SUCCESS;
}

int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
    void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
    int32_t thread_limit, uint64_t loop_tripcount) {
  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error when setting CUDA context\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  // All args are references.
  std::vector<void *> args(arg_num);
  std::vector<void *> ptrs(arg_num);

  for (int32_t i = 0; i < arg_num; ++i) {
    ptrs[i] = (void *)((intptr_t)tgt_args[i] + tgt_offsets[i]);
    args[i] = &ptrs[i];
  }
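  // cuLaunchKernel expects an array of pointers to the argument values: ptrs
  // holds each offset-adjusted device pointer, and args holds the address of
  // the corresponding ptrs element, so every argument is passed by reference.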

  KernelTy *KernelInfo = (KernelTy *)tgt_entry_ptr;

  int cudaThreadsPerBlock;

  if (thread_limit > 0) {
    cudaThreadsPerBlock = thread_limit;
    DP("Setting CUDA threads per block to requested %d\n", thread_limit);
    // Add master warp if necessary
    if (KernelInfo->ExecutionMode == GENERIC) {
      cudaThreadsPerBlock += DeviceInfo.WarpSize[device_id];
      DP("Adding master warp: +%d threads\n", DeviceInfo.WarpSize[device_id]);
    }
  } else {
    cudaThreadsPerBlock = DeviceInfo.NumThreads[device_id];
    DP("Setting CUDA threads per block to default %d\n",
        DeviceInfo.NumThreads[device_id]);
  }

  if (cudaThreadsPerBlock > DeviceInfo.ThreadsPerBlock[device_id]) {
    cudaThreadsPerBlock = DeviceInfo.ThreadsPerBlock[device_id];
    DP("Threads per block capped at device limit %d\n",
        DeviceInfo.ThreadsPerBlock[device_id]);
  }

  int kernel_limit;
  err = cuFuncGetAttribute(&kernel_limit,
      CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, KernelInfo->Func);
  if (err == CUDA_SUCCESS) {
    if (kernel_limit < cudaThreadsPerBlock) {
      cudaThreadsPerBlock = kernel_limit;
      DP("Threads per block capped at kernel limit %d\n", kernel_limit);
    }
  }

  int cudaBlocksPerGrid;
  if (team_num <= 0) {
    if (loop_tripcount > 0 && DeviceInfo.EnvNumTeams < 0) {
      if (KernelInfo->ExecutionMode == SPMD) {
        // We have a combined construct, i.e. `target teams distribute parallel
        // for [simd]`. Launch enough teams so that each thread executes one
        // iteration of the loop, rounding up to the nearest integer.
        cudaBlocksPerGrid = ((loop_tripcount - 1) / cudaThreadsPerBlock) + 1;
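        // e.g. a trip count of 10000 with 128 threads per block yields
        // ((10000 - 1) / 128) + 1 = 79 teams.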
      } else {
        // If we reach this point, then we have a non-combined construct, i.e.
        // `teams distribute` with a nested `parallel for` and each team is
        // assigned one iteration of the `distribute` loop. E.g.:
        //
        // #pragma omp target teams distribute
        // for(...loop_tripcount...) {
        //   #pragma omp parallel for
        //   for(...) {}
        // }
        //
        // Threads within a team will execute the iterations of the `parallel`
        // loop.
        cudaBlocksPerGrid = loop_tripcount;
      }
      DP("Using %d teams due to loop trip count %" PRIu64 " and number of "
          "threads per block %d\n", cudaBlocksPerGrid, loop_tripcount,
          cudaThreadsPerBlock);
    } else {
      cudaBlocksPerGrid = DeviceInfo.NumTeams[device_id];
      DP("Using default number of teams %d\n", DeviceInfo.NumTeams[device_id]);
    }
  } else if (team_num > DeviceInfo.BlocksPerGrid[device_id]) {
    cudaBlocksPerGrid = DeviceInfo.BlocksPerGrid[device_id];
    DP("Capping number of teams to team limit %d\n",
        DeviceInfo.BlocksPerGrid[device_id]);
  } else {
    cudaBlocksPerGrid = team_num;
    DP("Using requested number of teams %d\n", team_num);
  }

  // Run on the device.
  DP("Launch kernel with %d blocks and %d threads\n", cudaBlocksPerGrid,
     cudaThreadsPerBlock);

  err = cuLaunchKernel(KernelInfo->Func, cudaBlocksPerGrid, 1, 1,
      cudaThreadsPerBlock, 1, 1, 0 /*bytes of shared memory*/, 0, &args[0], 0);
  if (err != CUDA_SUCCESS) {
    DP("Device kernel launch failed!\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  DP("Launch of entry point at " DPxMOD " successful!\n",
      DPxPTR(tgt_entry_ptr));

  CUresult sync_err = cuCtxSynchronize();
  if (sync_err != CUDA_SUCCESS) {
    DP("Kernel execution error at " DPxMOD "!\n", DPxPTR(tgt_entry_ptr));
    CUDA_ERR_STRING(sync_err);
    return OFFLOAD_FAIL;
  } else {
    DP("Kernel execution at " DPxMOD " successful!\n", DPxPTR(tgt_entry_ptr));
  }

  return OFFLOAD_SUCCESS;
}

int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
    void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num) {
  // Use one team and the default number of threads.
  const int32_t team_num = 1;
  const int32_t thread_limit = 0;
  return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args,
      tgt_offsets, arg_num, team_num, thread_limit, 0);
}

#ifdef __cplusplus
}
#endif