1 //===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is dual licensed under the MIT and the University of Illinois Open
6 // Source Licenses. See LICENSE.txt for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // RTL for CUDA machine
11 //
12 //===----------------------------------------------------------------------===//
13 
#include <cassert>
#include <cinttypes>
#include <cstddef>
#include <cstdlib>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <list>
#include <string>
#include <vector>

#include "omptargetplugin.h"
23 
24 #ifndef TARGET_NAME
25 #define TARGET_NAME CUDA
26 #endif
27 
28 #define GETNAME2(name) #name
29 #define GETNAME(name) GETNAME2(name)
30 #define DP(...) DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", __VA_ARGS__)
31 
32 #include "../../common/elf_common.c"
33 
34 // Utility for retrieving and printing CUDA error string.
35 #ifdef CUDA_ERROR_REPORT
36 #define CUDA_ERR_STRING(err)                                                   \
37   do {                                                                         \
38     const char *errStr;                                                        \
39     cuGetErrorString(err, &errStr);                                            \
40     DP("CUDA error is: %s\n", errStr);                                         \
41   } while (0)
42 #else
43 #define CUDA_ERR_STRING(err)                                                   \
44   {}
45 #endif
46 
47 /// Keep entries table per device.
/// Keep entries table per device.
struct FuncOrGblEntryTy {
  // Table handed back to libomptarget; its begin/end pointers are refreshed
  // from Entries by getOffloadEntriesTable().
  __tgt_target_table Table;
  // Backing storage for the table's entries.
  std::vector<__tgt_offload_entry> Entries;
};
52 
/// Execution modes a kernel may be compiled for. The numeric values matter:
/// they must match the `<kernel>_exec_mode` global read from the cubin
/// (0 = SPMD, 1 = GENERIC; values outside [0, 1] are rejected at load time).
enum ExecutionModeType {
  SPMD, // constructors, destructors,
        // combined constructs (`teams distribute parallel for [simd]`)
  GENERIC, // everything else
  NONE
};
59 
60 /// Use a single entity to encode a kernel and a set of flags
/// Use a single entity to encode a kernel and a set of flags
struct KernelTy {
  CUfunction Func; // device function handle resolved from the CUDA module

  // execution mode of kernel
  // 0 - SPMD mode (without master warp)
  // 1 - Generic mode (with master warp)
  int8_t ExecutionMode;

  KernelTy(CUfunction _Func, int8_t _ExecutionMode)
      : Func(_Func), ExecutionMode(_ExecutionMode) {}
};
72 
/// List that contains all the kernels.
/// Offload entries created in __tgt_rtl_load_binary store raw pointers into
/// this container (&KernelsList.back()), so it must not relocate elements on
/// insertion — std::list guarantees that; do not change it to std::vector.
/// FIXME: we may need this to be per device and per library.
std::list<KernelTy> KernelsList;
76 
77 /// Class containing all the device information.
78 class RTLDeviceInfoTy {
79   std::vector<FuncOrGblEntryTy> FuncGblEntries;
80 
81 public:
82   int NumberOfDevices;
83   std::vector<CUmodule> Modules;
84   std::vector<CUcontext> Contexts;
85 
86   // Device properties
87   std::vector<int> ThreadsPerBlock;
88   std::vector<int> BlocksPerGrid;
89   std::vector<int> WarpSize;
90 
91   // OpenMP properties
92   std::vector<int> NumTeams;
93   std::vector<int> NumThreads;
94 
95   // OpenMP Environment properties
96   int EnvNumTeams;
97   int EnvTeamLimit;
98 
99   //static int EnvNumThreads;
100   static const int HardTeamLimit = 1<<16; // 64k
101   static const int HardThreadLimit = 1024;
102   static const int DefaultNumTeams = 128;
103   static const int DefaultNumThreads = 128;
104 
105   // Record entry point associated with device
106   void addOffloadEntry(int32_t device_id, __tgt_offload_entry entry) {
107     assert(device_id < (int32_t)FuncGblEntries.size() &&
108            "Unexpected device id!");
109     FuncOrGblEntryTy &E = FuncGblEntries[device_id];
110 
111     E.Entries.push_back(entry);
112   }
113 
114   // Return true if the entry is associated with device
115   bool findOffloadEntry(int32_t device_id, void *addr) {
116     assert(device_id < (int32_t)FuncGblEntries.size() &&
117            "Unexpected device id!");
118     FuncOrGblEntryTy &E = FuncGblEntries[device_id];
119 
120     for (auto &it : E.Entries) {
121       if (it.addr == addr)
122         return true;
123     }
124 
125     return false;
126   }
127 
128   // Return the pointer to the target entries table
129   __tgt_target_table *getOffloadEntriesTable(int32_t device_id) {
130     assert(device_id < (int32_t)FuncGblEntries.size() &&
131            "Unexpected device id!");
132     FuncOrGblEntryTy &E = FuncGblEntries[device_id];
133 
134     int32_t size = E.Entries.size();
135 
136     // Table is empty
137     if (!size)
138       return 0;
139 
140     __tgt_offload_entry *begin = &E.Entries[0];
141     __tgt_offload_entry *end = &E.Entries[size - 1];
142 
143     // Update table info according to the entries and return the pointer
144     E.Table.EntriesBegin = begin;
145     E.Table.EntriesEnd = ++end;
146 
147     return &E.Table;
148   }
149 
150   // Clear entries table for a device
151   void clearOffloadEntriesTable(int32_t device_id) {
152     assert(device_id < (int32_t)FuncGblEntries.size() &&
153            "Unexpected device id!");
154     FuncOrGblEntryTy &E = FuncGblEntries[device_id];
155     E.Entries.clear();
156     E.Table.EntriesBegin = E.Table.EntriesEnd = 0;
157   }
158 
159   RTLDeviceInfoTy() {
160     DP("Start initializing CUDA\n");
161 
162     CUresult err = cuInit(0);
163     if (err != CUDA_SUCCESS) {
164       DP("Error when initializing CUDA\n");
165       CUDA_ERR_STRING(err);
166       return;
167     }
168 
169     NumberOfDevices = 0;
170 
171     err = cuDeviceGetCount(&NumberOfDevices);
172     if (err != CUDA_SUCCESS) {
173       DP("Error when getting CUDA device count\n");
174       CUDA_ERR_STRING(err);
175       return;
176     }
177 
178     if (NumberOfDevices == 0) {
179       DP("There are no devices supporting CUDA.\n");
180       return;
181     }
182 
183     FuncGblEntries.resize(NumberOfDevices);
184     Contexts.resize(NumberOfDevices);
185     ThreadsPerBlock.resize(NumberOfDevices);
186     BlocksPerGrid.resize(NumberOfDevices);
187     WarpSize.resize(NumberOfDevices);
188     NumTeams.resize(NumberOfDevices);
189     NumThreads.resize(NumberOfDevices);
190 
191     // Get environment variables regarding teams
192     char *envStr = getenv("OMP_TEAM_LIMIT");
193     if (envStr) {
194       // OMP_TEAM_LIMIT has been set
195       EnvTeamLimit = std::stoi(envStr);
196       DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
197     } else {
198       EnvTeamLimit = -1;
199     }
200     envStr = getenv("OMP_NUM_TEAMS");
201     if (envStr) {
202       // OMP_NUM_TEAMS has been set
203       EnvNumTeams = std::stoi(envStr);
204       DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
205     } else {
206       EnvNumTeams = -1;
207     }
208   }
209 
210   ~RTLDeviceInfoTy() {
211     // Close modules
212     for (auto &module : Modules)
213       if (module) {
214         CUresult err = cuModuleUnload(module);
215         if (err != CUDA_SUCCESS) {
216           DP("Error when unloading CUDA module\n");
217           CUDA_ERR_STRING(err);
218         }
219       }
220 
221     // Destroy contexts
222     for (auto &ctx : Contexts)
223       if (ctx) {
224         CUresult err = cuCtxDestroy(ctx);
225         if (err != CUDA_SUCCESS) {
226           DP("Error when destroying CUDA context\n");
227           CUDA_ERR_STRING(err);
228         }
229       }
230   }
231 };
232 
233 static RTLDeviceInfoTy DeviceInfo;
234 
235 #ifdef __cplusplus
236 extern "C" {
237 #endif
238 
// Return non-zero if the device image is an ELF object whose machine type
// matches the CUDA target.
int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
  return elf_check_machine(image, 190); // EM_CUDA = 190.
}
242 
243 int32_t __tgt_rtl_number_of_devices() { return DeviceInfo.NumberOfDevices; }
244 
245 int32_t __tgt_rtl_init_device(int32_t device_id) {
246 
247   CUdevice cuDevice;
248   DP("Getting device %d\n", device_id);
249   CUresult err = cuDeviceGet(&cuDevice, device_id);
250   if (err != CUDA_SUCCESS) {
251     DP("Error when getting CUDA device with id = %d\n", device_id);
252     CUDA_ERR_STRING(err);
253     return OFFLOAD_FAIL;
254   }
255 
256   // Create the context and save it to use whenever this device is selected.
257   err = cuCtxCreate(&DeviceInfo.Contexts[device_id], CU_CTX_SCHED_BLOCKING_SYNC,
258                     cuDevice);
259   if (err != CUDA_SUCCESS) {
260     DP("Error when creating a CUDA context\n");
261     CUDA_ERR_STRING(err);
262     return OFFLOAD_FAIL;
263   }
264 
265   // scan properties to determine number of threads/block and blocks/grid.
266   struct cudaDeviceProp Properties;
267   cudaError_t error = cudaGetDeviceProperties(&Properties, device_id);
268   if (error != cudaSuccess) {
269     DP("Error getting device Properties, use defaults\n");
270     DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::DefaultNumTeams;
271     DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::DefaultNumThreads;
272     DeviceInfo.WarpSize[device_id] = 32;
273   } else {
274     // Get blocks per grid
275     if (Properties.maxGridSize[0] <= RTLDeviceInfoTy::HardTeamLimit) {
276       DeviceInfo.BlocksPerGrid[device_id] = Properties.maxGridSize[0];
277       DP("Using %d CUDA blocks per grid\n", Properties.maxGridSize[0]);
278     } else {
279       DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::HardTeamLimit;
280       DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping "
281           "at the hard limit\n", Properties.maxGridSize[0],
282           RTLDeviceInfoTy::HardTeamLimit);
283     }
284 
285     // Get threads per block, exploit threads only along x axis
286     if (Properties.maxThreadsDim[0] <= RTLDeviceInfoTy::HardThreadLimit) {
287       DeviceInfo.ThreadsPerBlock[device_id] = Properties.maxThreadsDim[0];
288       DP("Using %d CUDA threads per block\n", Properties.maxThreadsDim[0]);
289       if (Properties.maxThreadsDim[0] < Properties.maxThreadsPerBlock) {
290         DP("(fewer than max per block along all xyz dims %d)\n",
291             Properties.maxThreadsPerBlock);
292       }
293     } else {
294       DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::HardThreadLimit;
295       DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
296           "capping at the hard limit\n", Properties.maxThreadsDim[0],
297           RTLDeviceInfoTy::HardThreadLimit);
298     }
299 
300     // Get warp size
301     DeviceInfo.WarpSize[device_id] = Properties.warpSize;
302   }
303 
304   // Adjust teams to the env variables
305   if (DeviceInfo.EnvTeamLimit > 0 &&
306       DeviceInfo.BlocksPerGrid[device_id] > DeviceInfo.EnvTeamLimit) {
307     DeviceInfo.BlocksPerGrid[device_id] = DeviceInfo.EnvTeamLimit;
308     DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
309         DeviceInfo.EnvTeamLimit);
310   }
311 
312   DP("Max number of CUDA blocks %d, threads %d & warp size %d\n",
313      DeviceInfo.BlocksPerGrid[device_id], DeviceInfo.ThreadsPerBlock[device_id],
314      DeviceInfo.WarpSize[device_id]);
315 
316   // Set default number of teams
317   if (DeviceInfo.EnvNumTeams > 0) {
318     DeviceInfo.NumTeams[device_id] = DeviceInfo.EnvNumTeams;
319     DP("Default number of teams set according to environment %d\n",
320         DeviceInfo.EnvNumTeams);
321   } else {
322     DeviceInfo.NumTeams[device_id] = RTLDeviceInfoTy::DefaultNumTeams;
323     DP("Default number of teams set according to library's default %d\n",
324         RTLDeviceInfoTy::DefaultNumTeams);
325   }
326   if (DeviceInfo.NumTeams[device_id] > DeviceInfo.BlocksPerGrid[device_id]) {
327     DeviceInfo.NumTeams[device_id] = DeviceInfo.BlocksPerGrid[device_id];
328     DP("Default number of teams exceeds device limit, capping at %d\n",
329         DeviceInfo.BlocksPerGrid[device_id]);
330   }
331 
332   // Set default number of threads
333   DeviceInfo.NumThreads[device_id] = RTLDeviceInfoTy::DefaultNumThreads;
334   DP("Default number of threads set according to library's default %d\n",
335           RTLDeviceInfoTy::DefaultNumThreads);
336   if (DeviceInfo.NumThreads[device_id] >
337       DeviceInfo.ThreadsPerBlock[device_id]) {
338     DeviceInfo.NumTeams[device_id] = DeviceInfo.ThreadsPerBlock[device_id];
339     DP("Default number of threads exceeds device limit, capping at %d\n",
340         DeviceInfo.ThreadsPerBlock[device_id]);
341   }
342 
343   return OFFLOAD_SUCCESS;
344 }
345 
// Load a device image into the given device's context: loads the cubin as a
// CUDA module, resolves every host offload entry to either a device global
// (data entries, e->size != 0) or a CUfunction (kernel entries, e->size == 0),
// and returns the rebuilt device-side entry table. Returns NULL on failure.
__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
    __tgt_device_image *image) {

  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error when setting a CUDA context for device %d\n", device_id);
    CUDA_ERR_STRING(err);
    return NULL;
  }

  // Clear the offload table as we are going to create a new one.
  DeviceInfo.clearOffloadEntriesTable(device_id);

  // Create the module and extract the function pointers.

  CUmodule cumod;
  DP("Load data from image " DPxMOD "\n", DPxPTR(image->ImageStart));
  err = cuModuleLoadDataEx(&cumod, image->ImageStart, 0, NULL, NULL);
  if (err != CUDA_SUCCESS) {
    DP("Error when loading CUDA module\n");
    CUDA_ERR_STRING(err);
    return NULL;
  }

  DP("CUDA module successfully loaded!\n");
  // Keep the module handle so the RTL destructor can unload it.
  DeviceInfo.Modules.push_back(cumod);

  // Find the symbols in the module by name.
  __tgt_offload_entry *HostBegin = image->EntriesBegin;
  __tgt_offload_entry *HostEnd = image->EntriesEnd;

  for (__tgt_offload_entry *e = HostBegin; e != HostEnd; ++e) {

    if (!e->addr) {
      // We return NULL when something like this happens, the host should have
      // always something in the address to uniquely identify the target region.
      DP("Invalid binary: host entry '<null>' (size = %zd)...\n", e->size);

      return NULL;
    }

    // A non-zero size marks a data (global variable) entry; zero size marks
    // a kernel entry.
    if (e->size) {
      __tgt_offload_entry entry = *e;

      CUdeviceptr cuptr;
      size_t cusize;
      err = cuModuleGetGlobal(&cuptr, &cusize, cumod, e->name);

      if (err != CUDA_SUCCESS) {
        DP("Loading global '%s' (Failed)\n", e->name);
        CUDA_ERR_STRING(err);
        return NULL;
      }

      if (cusize != e->size) {
        DP("Loading global '%s' - size mismatch (%zd != %zd)\n", e->name,
            cusize, e->size);
        // NOTE(review): err is CUDA_SUCCESS on this path, so this macro
        // prints the success string rather than a real error.
        CUDA_ERR_STRING(err);
        return NULL;
      }

      DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
          DPxPTR(e - HostBegin), e->name, DPxPTR(cuptr));
      // Publish the device address of the global to libomptarget.
      entry.addr = (void *)cuptr;

      DeviceInfo.addOffloadEntry(device_id, entry);

      continue;
    }

    // Kernel entry: resolve the device function by name.
    CUfunction fun;
    err = cuModuleGetFunction(&fun, cumod, e->name);

    if (err != CUDA_SUCCESS) {
      DP("Loading '%s' (Failed)\n", e->name);
      CUDA_ERR_STRING(err);
      return NULL;
    }

    DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
        DPxPTR(e - HostBegin), e->name, DPxPTR(fun));

    // default value GENERIC (in case symbol is missing from cubin file)
    int8_t ExecModeVal = ExecutionModeType::GENERIC;
    // The image may carry a companion global "<kernel>_exec_mode" holding the
    // kernel's execution mode (0 = SPMD, 1 = GENERIC).
    std::string ExecModeNameStr (e->name);
    ExecModeNameStr += "_exec_mode";
    const char *ExecModeName = ExecModeNameStr.c_str();

    CUdeviceptr ExecModePtr;
    size_t cusize;
    err = cuModuleGetGlobal(&ExecModePtr, &cusize, cumod, ExecModeName);
    if (err == CUDA_SUCCESS) {
      if ((size_t)cusize != sizeof(int8_t)) {
        DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
           ExecModeName, cusize, sizeof(int8_t));
        CUDA_ERR_STRING(err);
        return NULL;
      }

      // Read the one-byte mode value from device memory.
      err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, cusize);
      if (err != CUDA_SUCCESS) {
        DP("Error when copying data from device to host. Pointers: "
           "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
           DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), cusize);
        CUDA_ERR_STRING(err);
        return NULL;
      }

      // Only SPMD (0) and GENERIC (1) are valid modes.
      if (ExecModeVal < 0 || ExecModeVal > 1) {
        DP("Error wrong exec_mode value specified in cubin file: %d\n",
           ExecModeVal);
        return NULL;
      }
    } else {
      DP("Loading global exec_mode '%s' - symbol missing, using default value "
          "GENERIC (1)\n", ExecModeName);
      CUDA_ERR_STRING(err);
    }

    // Store the kernel descriptor; the entry table publishes a pointer into
    // KernelsList (a std::list, so the address stays stable), which comes
    // back later as tgt_entry_ptr in the run_* entry points.
    KernelsList.push_back(KernelTy(fun, ExecModeVal));

    __tgt_offload_entry entry = *e;
    entry.addr = (void *)&KernelsList.back();
    DeviceInfo.addOffloadEntry(device_id, entry);
  }

  return DeviceInfo.getOffloadEntriesTable(device_id);
}
475 
476 void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *hst_ptr) {
477   if (size == 0) {
478     return NULL;
479   }
480 
481   // Set the context we are using.
482   CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
483   if (err != CUDA_SUCCESS) {
484     DP("Error while trying to set CUDA current context\n");
485     CUDA_ERR_STRING(err);
486     return NULL;
487   }
488 
489   CUdeviceptr ptr;
490   err = cuMemAlloc(&ptr, size);
491   if (err != CUDA_SUCCESS) {
492     DP("Error while trying to allocate %d\n", err);
493     CUDA_ERR_STRING(err);
494     return NULL;
495   }
496 
497   void *vptr = (void *)ptr;
498   return vptr;
499 }
500 
501 int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
502     int64_t size) {
503   // Set the context we are using.
504   CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
505   if (err != CUDA_SUCCESS) {
506     DP("Error when setting CUDA context\n");
507     CUDA_ERR_STRING(err);
508     return OFFLOAD_FAIL;
509   }
510 
511   err = cuMemcpyHtoD((CUdeviceptr)tgt_ptr, hst_ptr, size);
512   if (err != CUDA_SUCCESS) {
513     DP("Error when copying data from host to device. Pointers: host = " DPxMOD
514        ", device = " DPxMOD ", size = %" PRId64 "\n", DPxPTR(hst_ptr),
515        DPxPTR(tgt_ptr), size);
516     CUDA_ERR_STRING(err);
517     return OFFLOAD_FAIL;
518   }
519   return OFFLOAD_SUCCESS;
520 }
521 
522 int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
523     int64_t size) {
524   // Set the context we are using.
525   CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
526   if (err != CUDA_SUCCESS) {
527     DP("Error when setting CUDA context\n");
528     CUDA_ERR_STRING(err);
529     return OFFLOAD_FAIL;
530   }
531 
532   err = cuMemcpyDtoH(hst_ptr, (CUdeviceptr)tgt_ptr, size);
533   if (err != CUDA_SUCCESS) {
534     DP("Error when copying data from device to host. Pointers: host = " DPxMOD
535         ", device = " DPxMOD ", size = %" PRId64 "\n", DPxPTR(hst_ptr),
536         DPxPTR(tgt_ptr), size);
537     CUDA_ERR_STRING(err);
538     return OFFLOAD_FAIL;
539   }
540   return OFFLOAD_SUCCESS;
541 }
542 
543 int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
544   // Set the context we are using.
545   CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
546   if (err != CUDA_SUCCESS) {
547     DP("Error when setting CUDA context\n");
548     CUDA_ERR_STRING(err);
549     return OFFLOAD_FAIL;
550   }
551 
552   err = cuMemFree((CUdeviceptr)tgt_ptr);
553   if (err != CUDA_SUCCESS) {
554     DP("Error when freeing CUDA memory\n");
555     CUDA_ERR_STRING(err);
556     return OFFLOAD_FAIL;
557   }
558   return OFFLOAD_SUCCESS;
559 }
560 
// Launch the kernel identified by tgt_entry_ptr (a KernelTy*, produced by
// __tgt_rtl_load_binary) with arg_num pointer arguments. team_num and
// thread_limit <= 0 mean "use the device defaults"; a non-zero loop_tripcount
// is used to size the grid when no explicit team count was requested.
// Blocks until the kernel has finished. Returns OFFLOAD_SUCCESS/OFFLOAD_FAIL.
int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
    void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
    int32_t thread_limit, uint64_t loop_tripcount) {
  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error when setting CUDA context\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  // All args are references.
  std::vector<void *> args(arg_num);
  std::vector<void *> ptrs(arg_num);

  // Materialize each argument value as tgt_args[i] + tgt_offsets[i];
  // cuLaunchKernel expects an array of pointers to the argument values.
  for (int32_t i = 0; i < arg_num; ++i) {
    ptrs[i] = (void *)((intptr_t)tgt_args[i] + tgt_offsets[i]);
    args[i] = &ptrs[i];
  }

  KernelTy *KernelInfo = (KernelTy *)tgt_entry_ptr;

  int cudaThreadsPerBlock;

  if (thread_limit > 0) {
    cudaThreadsPerBlock = thread_limit;
    DP("Setting CUDA threads per block to requested %d\n", thread_limit);
    // Add master warp if necessary
    if (KernelInfo->ExecutionMode == GENERIC) {
      cudaThreadsPerBlock += DeviceInfo.WarpSize[device_id];
      DP("Adding master warp: +%d threads\n", DeviceInfo.WarpSize[device_id]);
    }
  } else {
    cudaThreadsPerBlock = DeviceInfo.NumThreads[device_id];
    DP("Setting CUDA threads per block to default %d\n",
        DeviceInfo.NumThreads[device_id]);
  }

  // Never exceed the device's threads-per-block limit.
  if (cudaThreadsPerBlock > DeviceInfo.ThreadsPerBlock[device_id]) {
    cudaThreadsPerBlock = DeviceInfo.ThreadsPerBlock[device_id];
    DP("Threads per block capped at device limit %d\n",
        DeviceInfo.ThreadsPerBlock[device_id]);
  }

  // The kernel itself may have a tighter per-block limit (e.g. from register
  // usage); query it and cap again. A failed query is silently ignored.
  int kernel_limit;
  err = cuFuncGetAttribute(&kernel_limit,
      CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, KernelInfo->Func);
  if (err == CUDA_SUCCESS) {
    if (kernel_limit < cudaThreadsPerBlock) {
      cudaThreadsPerBlock = kernel_limit;
      DP("Threads per block capped at kernel limit %d\n", kernel_limit);
    }
  }

  int cudaBlocksPerGrid;
  if (team_num <= 0) {
    if (loop_tripcount > 0 && DeviceInfo.EnvNumTeams < 0) {
      if (KernelInfo->ExecutionMode == SPMD) {
        // We have a combined construct, i.e. `target teams distribute parallel
        // for [simd]`. We launch so many teams so that each thread will
        // execute one iteration of the loop.
        // round up to the nearest integer
        cudaBlocksPerGrid = ((loop_tripcount - 1) / cudaThreadsPerBlock) + 1;
      } else {
        // If we reach this point, then we have a non-combined construct, i.e.
        // `teams distribute` with a nested `parallel for` and each team is
        // assigned one iteration of the `distribute` loop. E.g.:
        //
        // #pragma omp target teams distribute
        // for(...loop_tripcount...) {
        //   #pragma omp parallel for
        //   for(...) {}
        // }
        //
        // Threads within a team will execute the iterations of the `parallel`
        // loop.
        cudaBlocksPerGrid = loop_tripcount;
      }
      DP("Using %d teams due to loop trip count %" PRIu64 " and number of "
          "threads per block %d\n", cudaBlocksPerGrid, loop_tripcount,
          cudaThreadsPerBlock);
    } else {
      cudaBlocksPerGrid = DeviceInfo.NumTeams[device_id];
      DP("Using default number of teams %d\n", DeviceInfo.NumTeams[device_id]);
    }
  } else if (team_num > DeviceInfo.BlocksPerGrid[device_id]) {
    cudaBlocksPerGrid = DeviceInfo.BlocksPerGrid[device_id];
    DP("Capping number of teams to team limit %d\n",
        DeviceInfo.BlocksPerGrid[device_id]);
  } else {
    cudaBlocksPerGrid = team_num;
    DP("Using requested number of teams %d\n", team_num);
  }

  // Run on the device.
  DP("Launch kernel with %d blocks and %d threads\n", cudaBlocksPerGrid,
     cudaThreadsPerBlock);

  err = cuLaunchKernel(KernelInfo->Func, cudaBlocksPerGrid, 1, 1,
      cudaThreadsPerBlock, 1, 1, 0 /*bytes of shared memory*/, 0, &args[0], 0);
  if (err != CUDA_SUCCESS) {
    DP("Device kernel launch failed!\n");
    CUDA_ERR_STRING(err);
    // NOTE(review): this assert always fires on this path in debug builds,
    // turning a recoverable OFFLOAD_FAIL into an abort — confirm intended.
    assert(err == CUDA_SUCCESS && "Unable to launch target execution!");
    return OFFLOAD_FAIL;
  }

  DP("Launch of entry point at " DPxMOD " successful!\n",
      DPxPTR(tgt_entry_ptr));

  // Block until the kernel completes so that in-kernel errors surface here.
  cudaError_t sync_error = cudaDeviceSynchronize();
  if (sync_error != cudaSuccess) {
  DP("Kernel execution error at " DPxMOD ", %s.\n", DPxPTR(tgt_entry_ptr),
      cudaGetErrorString(sync_error));
    return OFFLOAD_FAIL;
  } else {
    DP("Kernel execution at " DPxMOD " successful!\n", DPxPTR(tgt_entry_ptr));
  }

  return OFFLOAD_SUCCESS;
}
682 
683 int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
684     void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num) {
685   // use one team and the default number of threads.
686   const int32_t team_num = 1;
687   const int32_t thread_limit = 0;
688   return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args,
689       tgt_offsets, arg_num, team_num, thread_limit, 0);
690 }
691 
692 #ifdef __cplusplus
693 }
694 #endif
695