//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//
//
// RTL for CUDA machine
//
//===----------------------------------------------------------------------===//

#include <cassert>
#include <cinttypes>
#include <cstddef>
#include <cstdlib>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <list>
#include <string>
#include <vector>

22 #include "omptarget.h"
23 
24 #ifndef TARGET_NAME
25 #define TARGET_NAME CUDA
26 #endif
27 
28 #define GETNAME2(name) #name
29 #define GETNAME(name) GETNAME2(name)
30 #define DP(...) DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", __VA_ARGS__)
31 
32 #include "../../common/elf_common.c"
33 
// Utility for retrieving and printing CUDA error string.
#ifdef CUDA_ERROR_REPORT
#define CUDA_ERR_STRING(err)                                                   \
  do {                                                                         \
    const char *errStr;                                                        \
    cuGetErrorString(err, &errStr);                                            \
    DP("CUDA error is: %s\n", errStr);                                         \
  } while (0)
#else
// Expand to an empty statement that remains safe in if/else chains.
#define CUDA_ERR_STRING(err)                                                   \
  do {                                                                         \
  } while (0)
#endif

/// Keep entries table per device.
struct FuncOrGblEntryTy {
  __tgt_target_table Table;
  std::vector<__tgt_offload_entry> Entries;
};

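/// Execution modes a kernel can be compiled for; the values mirror the
/// per-kernel "_exec_mode" global read in __tgt_rtl_load_binary
/// (0 = SPMD, 1 = Generic).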
enum ExecutionModeType {
  SPMD,
  GENERIC,
  NONE
};

/// Use a single entity to encode a kernel and a set of flags.
struct KernelTy {
  CUfunction Func;

  // Execution mode of the kernel:
  // 0 - SPMD mode (without master warp)
  // 1 - Generic mode (with master warp)
  int8_t ExecutionMode;

  KernelTy(CUfunction _Func, int8_t _ExecutionMode)
      : Func(_Func), ExecutionMode(_ExecutionMode) {}
};

/// List that contains all the kernels.
/// FIXME: we may need this to be per device and per library.
std::list<KernelTy> KernelsList;

/// Class containing all the device information.
class RTLDeviceInfoTy {
  std::vector<FuncOrGblEntryTy> FuncGblEntries;

public:
  int NumberOfDevices;
  std::vector<CUmodule> Modules;
  std::vector<CUcontext> Contexts;

  // Device properties
  std::vector<int> ThreadsPerBlock;
  std::vector<int> BlocksPerGrid;
  std::vector<int> WarpSize;

  // OpenMP properties
  std::vector<int> NumTeams;
  std::vector<int> NumThreads;

  // OpenMP Environment properties
  int EnvNumTeams;
  int EnvTeamLimit;

  //static int EnvNumThreads;
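  // The hard limits below are conservative CUDA bounds: 64K blocks along the
  // grid x-dimension and 1024 threads per block are available on the compute
  // capabilities this plugin is expected to run on.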
  static const int HardTeamLimit = 1 << 16; // 64k
  static const int HardThreadLimit = 1024;
  static const int DefaultNumTeams = 128;
  static const int DefaultNumThreads = 1024;

  // Record entry point associated with device
  void addOffloadEntry(int32_t device_id, __tgt_offload_entry entry) {
    assert(device_id < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncOrGblEntryTy &E = FuncGblEntries[device_id];

    E.Entries.push_back(entry);
  }

  // Return true if the entry is associated with device
  bool findOffloadEntry(int32_t device_id, void *addr) {
    assert(device_id < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncOrGblEntryTy &E = FuncGblEntries[device_id];

    for (auto &it : E.Entries) {
      if (it.addr == addr)
        return true;
    }

    return false;
  }

  // Return the pointer to the target entries table
  __tgt_target_table *getOffloadEntriesTable(int32_t device_id) {
    assert(device_id < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncOrGblEntryTy &E = FuncGblEntries[device_id];

    int32_t size = E.Entries.size();

    // Table is empty
    if (!size)
      return 0;

    __tgt_offload_entry *begin = &E.Entries[0];
    __tgt_offload_entry *end = begin + size; // one past the last entry

    // Update table info according to the entries and return the pointer
    E.Table.EntriesBegin = begin;
    E.Table.EntriesEnd = end;

    return &E.Table;
  }

  // Clear entries table for a device
  void clearOffloadEntriesTable(int32_t device_id) {
    assert(device_id < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncOrGblEntryTy &E = FuncGblEntries[device_id];
    E.Entries.clear();
    E.Table.EntriesBegin = E.Table.EntriesEnd = 0;
  }

  RTLDeviceInfoTy() {
    DP("Start initializing CUDA\n");

    CUresult err = cuInit(0);
    if (err != CUDA_SUCCESS) {
      DP("Error when initializing CUDA\n");
      CUDA_ERR_STRING(err);
      return;
    }

    NumberOfDevices = 0;

    err = cuDeviceGetCount(&NumberOfDevices);
    if (err != CUDA_SUCCESS) {
      DP("Error when getting CUDA device count\n");
      CUDA_ERR_STRING(err);
      return;
    }

    if (NumberOfDevices == 0) {
      DP("There are no devices supporting CUDA.\n");
      return;
    }

    FuncGblEntries.resize(NumberOfDevices);
    Contexts.resize(NumberOfDevices);
    ThreadsPerBlock.resize(NumberOfDevices);
    BlocksPerGrid.resize(NumberOfDevices);
    WarpSize.resize(NumberOfDevices);
    NumTeams.resize(NumberOfDevices);
    NumThreads.resize(NumberOfDevices);

    // Get environment variables regarding teams
    char *envStr = getenv("OMP_TEAM_LIMIT");
    if (envStr) {
      // OMP_TEAM_LIMIT has been set
      EnvTeamLimit = std::stoi(envStr);
      DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
    } else {
      EnvTeamLimit = -1;
    }
    envStr = getenv("OMP_NUM_TEAMS");
    if (envStr) {
      // OMP_NUM_TEAMS has been set
      EnvNumTeams = std::stoi(envStr);
      DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
    } else {
      EnvNumTeams = -1;
    }
  }

  ~RTLDeviceInfoTy() {
    // Close modules
    for (auto &module : Modules)
      if (module) {
        CUresult err = cuModuleUnload(module);
        if (err != CUDA_SUCCESS) {
          DP("Error when unloading CUDA module\n");
          CUDA_ERR_STRING(err);
        }
      }

    // Destroy contexts
    for (auto &ctx : Contexts)
      if (ctx) {
        CUresult err = cuCtxDestroy(ctx);
        if (err != CUDA_SUCCESS) {
          DP("Error when destroying CUDA context\n");
          CUDA_ERR_STRING(err);
        }
      }
  }
};

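// Single global instance: CUDA is initialized in the constructor when the
// plugin is loaded, and modules/contexts are released in the destructor when
// it is unloaded.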
static RTLDeviceInfoTy DeviceInfo;

#ifdef __cplusplus
extern "C" {
#endif

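// A device image is valid for this RTL if it is an ELF file whose machine
// field is EM_CUDA, i.e. a cubin.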
int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
  return elf_check_machine(image, 190); // EM_CUDA = 190.
}

int32_t __tgt_rtl_number_of_devices() { return DeviceInfo.NumberOfDevices; }

int32_t __tgt_rtl_init_device(int32_t device_id) {

  CUdevice cuDevice;
  DP("Getting device %d\n", device_id);
  CUresult err = cuDeviceGet(&cuDevice, device_id);
  if (err != CUDA_SUCCESS) {
    DP("Error when getting CUDA device with id = %d\n", device_id);
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  // Create the context and save it to use whenever this device is selected.
  err = cuCtxCreate(&DeviceInfo.Contexts[device_id], CU_CTX_SCHED_BLOCKING_SYNC,
                    cuDevice);
  if (err != CUDA_SUCCESS) {
    DP("Error when creating a CUDA context\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  // Scan properties to determine number of threads/block and blocks/grid.
  struct cudaDeviceProp Properties;
  cudaError_t error = cudaGetDeviceProperties(&Properties, device_id);
  if (error != cudaSuccess) {
    DP("Error getting device properties, using defaults\n");
    DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::DefaultNumTeams;
    DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::DefaultNumThreads;
    DeviceInfo.WarpSize[device_id] = 32;
  } else {
    // Get blocks per grid
    if (Properties.maxGridSize[0] <= RTLDeviceInfoTy::HardTeamLimit) {
      DeviceInfo.BlocksPerGrid[device_id] = Properties.maxGridSize[0];
      DP("Using %d CUDA blocks per grid\n", Properties.maxGridSize[0]);
    } else {
      DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::HardTeamLimit;
      DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping "
          "at the hard limit\n", Properties.maxGridSize[0],
          RTLDeviceInfoTy::HardTeamLimit);
    }

    // Get threads per block, exploit threads only along x axis
    if (Properties.maxThreadsDim[0] <= RTLDeviceInfoTy::HardThreadLimit) {
      DeviceInfo.ThreadsPerBlock[device_id] = Properties.maxThreadsDim[0];
      DP("Using %d CUDA threads per block\n", Properties.maxThreadsDim[0]);
      if (Properties.maxThreadsDim[0] < Properties.maxThreadsPerBlock) {
        DP("(fewer than max per block along all xyz dims %d)\n",
            Properties.maxThreadsPerBlock);
      }
    } else {
      DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::HardThreadLimit;
      DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
          "capping at the hard limit\n", Properties.maxThreadsDim[0],
          RTLDeviceInfoTy::HardThreadLimit);
    }

    // Get warp size
    DeviceInfo.WarpSize[device_id] = Properties.warpSize;
  }

  // Adjust teams to the env variables
  if (DeviceInfo.EnvTeamLimit > 0 &&
      DeviceInfo.BlocksPerGrid[device_id] > DeviceInfo.EnvTeamLimit) {
    DeviceInfo.BlocksPerGrid[device_id] = DeviceInfo.EnvTeamLimit;
    DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
        DeviceInfo.EnvTeamLimit);
  }

  DP("Max number of CUDA blocks %d, threads %d & warp size %d\n",
     DeviceInfo.BlocksPerGrid[device_id], DeviceInfo.ThreadsPerBlock[device_id],
     DeviceInfo.WarpSize[device_id]);

  // Set default number of teams
  if (DeviceInfo.EnvNumTeams > 0) {
    DeviceInfo.NumTeams[device_id] = DeviceInfo.EnvNumTeams;
    DP("Default number of teams set according to environment %d\n",
        DeviceInfo.EnvNumTeams);
  } else {
    DeviceInfo.NumTeams[device_id] = RTLDeviceInfoTy::DefaultNumTeams;
    DP("Default number of teams set according to library's default %d\n",
        RTLDeviceInfoTy::DefaultNumTeams);
  }
  if (DeviceInfo.NumTeams[device_id] > DeviceInfo.BlocksPerGrid[device_id]) {
    DeviceInfo.NumTeams[device_id] = DeviceInfo.BlocksPerGrid[device_id];
    DP("Default number of teams exceeds device limit, capping at %d\n",
        DeviceInfo.BlocksPerGrid[device_id]);
  }

  // Set default number of threads
  DeviceInfo.NumThreads[device_id] = RTLDeviceInfoTy::DefaultNumThreads;
  DP("Default number of threads set according to library's default %d\n",
          RTLDeviceInfoTy::DefaultNumThreads);
  if (DeviceInfo.NumThreads[device_id] >
      DeviceInfo.ThreadsPerBlock[device_id]) {
    DeviceInfo.NumThreads[device_id] = DeviceInfo.ThreadsPerBlock[device_id];
    DP("Default number of threads exceeds device limit, capping at %d\n",
        DeviceInfo.ThreadsPerBlock[device_id]);
  }

  return OFFLOAD_SUCCESS;
}

__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
    __tgt_device_image *image) {

  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error when setting a CUDA context for device %d\n", device_id);
    CUDA_ERR_STRING(err);
    return NULL;
  }

  // Clear the offload table as we are going to create a new one.
  DeviceInfo.clearOffloadEntriesTable(device_id);

  // Create the module and extract the function pointers.

  CUmodule cumod;
  DP("Load data from image " DPxMOD "\n", DPxPTR(image->ImageStart));
  err = cuModuleLoadDataEx(&cumod, image->ImageStart, 0, NULL, NULL);
  if (err != CUDA_SUCCESS) {
    DP("Error when loading CUDA module\n");
    CUDA_ERR_STRING(err);
    return NULL;
  }

  DP("CUDA module successfully loaded!\n");
  DeviceInfo.Modules.push_back(cumod);

  // Find the symbols in the module by name.
  __tgt_offload_entry *HostBegin = image->EntriesBegin;
  __tgt_offload_entry *HostEnd = image->EntriesEnd;

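  // Each host entry describes either a global variable (size != 0) or a
  // kernel function (size == 0); resolve both against the loaded module.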
  for (__tgt_offload_entry *e = HostBegin; e != HostEnd; ++e) {

    if (!e->addr) {
      // We return NULL when something like this happens; the host should
      // always have something in the address to uniquely identify the target
      // region.
      DP("Invalid binary: host entry '<null>' (size = %zu)...\n", e->size);

      return NULL;
    }

    if (e->size) {
      __tgt_offload_entry entry = *e;

      CUdeviceptr cuptr;
      size_t cusize;
      err = cuModuleGetGlobal(&cuptr, &cusize, cumod, e->name);

      if (err != CUDA_SUCCESS) {
        DP("Loading global '%s' (Failed)\n", e->name);
        CUDA_ERR_STRING(err);
        return NULL;
      }

      if (cusize != e->size) {
        DP("Loading global '%s' - size mismatch (%zu != %zu)\n", e->name,
            cusize, e->size);
        return NULL;
      }

      DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
          DPxPTR(e - HostBegin), e->name, DPxPTR(cuptr));
      entry.addr = (void *)cuptr;

      DeviceInfo.addOffloadEntry(device_id, entry);

      continue;
    }

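    // A zero-sized entry names a kernel; look it up as a module function.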
    CUfunction fun;
    err = cuModuleGetFunction(&fun, cumod, e->name);

    if (err != CUDA_SUCCESS) {
      DP("Loading '%s' (Failed)\n", e->name);
      CUDA_ERR_STRING(err);
      return NULL;
    }

    DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
        DPxPTR(e - HostBegin), e->name, DPxPTR(fun));

    // Default to GENERIC in case the symbol is missing from the cubin file.
    int8_t ExecModeVal = ExecutionModeType::GENERIC;
    std::string ExecModeNameStr(e->name);
    ExecModeNameStr += "_exec_mode";
    const char *ExecModeName = ExecModeNameStr.c_str();

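    // The compiler emits a per-kernel global "<name>_exec_mode" holding the
    // kernel's execution mode; read it back to the host if it is present.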
    CUdeviceptr ExecModePtr;
    size_t cusize;
    err = cuModuleGetGlobal(&ExecModePtr, &cusize, cumod, ExecModeName);
    if (err == CUDA_SUCCESS) {
      if (cusize != sizeof(int8_t)) {
        DP("Loading global exec_mode '%s' - size mismatch (%zu != %zu)\n",
           ExecModeName, cusize, sizeof(int8_t));
        return NULL;
      }

      err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, cusize);
      if (err != CUDA_SUCCESS) {
        DP("Error when copying data from device to host. Pointers: "
           "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
           DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), cusize);
        CUDA_ERR_STRING(err);
        return NULL;
      }

      if (ExecModeVal < 0 || ExecModeVal > 1) {
        DP("Error: wrong exec_mode value %d specified in cubin file\n",
           ExecModeVal);
        return NULL;
      }
    } else {
      DP("Loading global exec_mode '%s' - symbol missing, using default value "
          "GENERIC (1)\n", ExecModeName);
      CUDA_ERR_STRING(err);
    }

    KernelsList.push_back(KernelTy(fun, ExecModeVal));

    __tgt_offload_entry entry = *e;
    entry.addr = (void *)&KernelsList.back();
    DeviceInfo.addOffloadEntry(device_id, entry);
  }

  return DeviceInfo.getOffloadEntriesTable(device_id);
}

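// Allocate size bytes on the device; returns the raw device pointer, or NULL
// on failure.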
void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size) {
  if (size == 0) {
    return NULL;
  }

  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error while trying to set CUDA current context\n");
    CUDA_ERR_STRING(err);
    return NULL;
  }

  CUdeviceptr ptr;
  err = cuMemAlloc(&ptr, size);
  if (err != CUDA_SUCCESS) {
    DP("Error while trying to allocate memory, CUDA error code %d\n", err);
    CUDA_ERR_STRING(err);
    return NULL;
  }

  void *vptr = (void *)ptr;
  return vptr;
}

int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
    int64_t size) {
  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error when setting CUDA context\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  err = cuMemcpyHtoD((CUdeviceptr)tgt_ptr, hst_ptr, size);
  if (err != CUDA_SUCCESS) {
    DP("Error when copying data from host to device. Pointers: host = " DPxMOD
       ", device = " DPxMOD ", size = %" PRId64 "\n", DPxPTR(hst_ptr),
       DPxPTR(tgt_ptr), size);
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }
  return OFFLOAD_SUCCESS;
}

int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
    int64_t size) {
  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error when setting CUDA context\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  err = cuMemcpyDtoH(hst_ptr, (CUdeviceptr)tgt_ptr, size);
  if (err != CUDA_SUCCESS) {
    DP("Error when copying data from device to host. Pointers: host = " DPxMOD
        ", device = " DPxMOD ", size = %" PRId64 "\n", DPxPTR(hst_ptr),
        DPxPTR(tgt_ptr), size);
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }
  return OFFLOAD_SUCCESS;
}

int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error when setting CUDA context\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  err = cuMemFree((CUdeviceptr)tgt_ptr);
  if (err != CUDA_SUCCESS) {
    DP("Error when freeing CUDA memory\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }
  return OFFLOAD_SUCCESS;
}

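// Launch a target region: pick threads per block and blocks per grid from the
// requested team/thread counts, environment defaults, and device/kernel
// limits, then dispatch through cuLaunchKernel.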
int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
    void **tgt_args, int32_t arg_num, int32_t team_num, int32_t thread_limit,
    uint64_t loop_tripcount) {
  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error when setting CUDA context\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  // All args are references: cuLaunchKernel expects an array of pointers to
  // the actual kernel arguments.
  std::vector<void *> args(arg_num);

  for (int32_t i = 0; i < arg_num; ++i)
    args[i] = &tgt_args[i];

  KernelTy *KernelInfo = (KernelTy *)tgt_entry_ptr;

  int cudaThreadsPerBlock;

  if (thread_limit > 0) {
    cudaThreadsPerBlock = thread_limit;
    DP("Setting CUDA threads per block to requested %d\n", thread_limit);
  } else {
    cudaThreadsPerBlock = DeviceInfo.NumThreads[device_id];
    DP("Setting CUDA threads per block to default %d\n",
        DeviceInfo.NumThreads[device_id]);
  }

  // Add master warp if necessary
  if (KernelInfo->ExecutionMode == GENERIC) {
    cudaThreadsPerBlock += DeviceInfo.WarpSize[device_id];
    DP("Adding master warp: +%d threads\n", DeviceInfo.WarpSize[device_id]);
  }

  if (cudaThreadsPerBlock > DeviceInfo.ThreadsPerBlock[device_id]) {
    cudaThreadsPerBlock = DeviceInfo.ThreadsPerBlock[device_id];
    DP("Threads per block capped at device limit %d\n",
        DeviceInfo.ThreadsPerBlock[device_id]);
  }

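  // The kernel itself may impose a lower thread limit (e.g. due to register
  // pressure); honor CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK if available.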
  int kernel_limit;
  err = cuFuncGetAttribute(&kernel_limit,
      CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, KernelInfo->Func);
  if (err == CUDA_SUCCESS) {
    if (kernel_limit < cudaThreadsPerBlock) {
      cudaThreadsPerBlock = kernel_limit;
      DP("Threads per block capped at kernel limit %d\n", kernel_limit);
    }
  }

  int cudaBlocksPerGrid;
  if (team_num <= 0) {
    if (loop_tripcount > 0 && DeviceInfo.EnvNumTeams < 0) {
      // round up to the nearest integer
      cudaBlocksPerGrid = ((loop_tripcount - 1) / cudaThreadsPerBlock) + 1;
      DP("Using %d teams due to loop trip count %" PRIu64 " and number of "
          "threads per block %d\n", cudaBlocksPerGrid, loop_tripcount,
          cudaThreadsPerBlock);
    } else {
      cudaBlocksPerGrid = DeviceInfo.NumTeams[device_id];
      DP("Using default number of teams %d\n", DeviceInfo.NumTeams[device_id]);
    }
  } else if (team_num > DeviceInfo.BlocksPerGrid[device_id]) {
    cudaBlocksPerGrid = DeviceInfo.BlocksPerGrid[device_id];
    DP("Capping number of teams to team limit %d\n",
        DeviceInfo.BlocksPerGrid[device_id]);
  } else {
    cudaBlocksPerGrid = team_num;
    DP("Using requested number of teams %d\n", team_num);
  }

  // Run on the device.
  DP("Launch kernel with %d blocks and %d threads\n", cudaBlocksPerGrid,
     cudaThreadsPerBlock);

  err = cuLaunchKernel(KernelInfo->Func, cudaBlocksPerGrid, 1, 1,
      cudaThreadsPerBlock, 1, 1, 0 /*bytes of shared memory*/, 0, &args[0], 0);
  if (err != CUDA_SUCCESS) {
    DP("Device kernel launch failed!\n");
    CUDA_ERR_STRING(err);
    assert(err == CUDA_SUCCESS && "Unable to launch target execution!");
    return OFFLOAD_FAIL;
  }

  DP("Launch of entry point at " DPxMOD " successful!\n",
      DPxPTR(tgt_entry_ptr));

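  // Note: cudaDeviceSynchronize is a CUDA runtime API call; the runtime
  // interoperates with the driver API and synchronizes the context made
  // current above.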
  if (cudaDeviceSynchronize() != cudaSuccess) {
    DP("Kernel execution error at " DPxMOD ".\n", DPxPTR(tgt_entry_ptr));
    return OFFLOAD_FAIL;
  } else {
    DP("Kernel execution at " DPxMOD " successful!\n", DPxPTR(tgt_entry_ptr));
  }

  return OFFLOAD_SUCCESS;
}

int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
    void **tgt_args, int32_t arg_num) {
  // Use one team and the default number of threads.
  const int32_t team_num = 1;
  const int32_t thread_limit = 0;
  return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args,
      arg_num, team_num, thread_limit, 0);
}

#ifdef __cplusplus
}
#endif