1 //===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is dual licensed under the MIT and the University of Illinois Open
6 // Source Licenses. See LICENSE.txt for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // RTL for CUDA machine
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include <cassert>
15 #include <cstddef>
16 #include <cuda.h>
17 #include <cuda_runtime_api.h>
18 #include <list>
19 #include <string>
20 #include <vector>
21 
22 #include "omptargetplugin.h"
23 
#ifndef TARGET_NAME
#define TARGET_NAME CUDA
#endif

#ifdef OMPTARGET_DEBUG
// Debug verbosity; read from LIBOMPTARGET_DEBUG in RTLDeviceInfoTy's
// constructor. DP() output is emitted only when this is positive.
static int DebugLevel = 0;

// Two-level stringification so the TARGET_NAME macro argument is expanded
// before being turned into a string literal.
#define GETNAME2(name) #name
#define GETNAME(name) GETNAME2(name)
// Debug print: prefixes every message with this RTL's name.
#define DP(...) \
  do { \
    if (DebugLevel > 0) { \
      DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", __VA_ARGS__); \
    } \
  } while (false)
#else // OMPTARGET_DEBUG
#define DP(...) {}
#endif // OMPTARGET_DEBUG

#include "../../common/elf_common.c"

// Utility for retrieving and printing CUDA error string.
#ifdef CUDA_ERROR_REPORT
// Translate a CUresult into a human-readable message via the driver API and
// print it through DP(). Only meaningful when err != CUDA_SUCCESS.
#define CUDA_ERR_STRING(err)                                                   \
  do {                                                                         \
    const char *errStr;                                                        \
    cuGetErrorString(err, &errStr);                                            \
    DP("CUDA error is: %s\n", errStr);                                         \
  } while (0)
#else
// Error-string reporting compiled out; expands to an empty statement.
#define CUDA_ERR_STRING(err)                                                   \
  {}
#endif
57 
/// Keep entries table per device.
/// Owns the storage for one device's offload entries; Table's begin/end
/// pointers are refreshed from Entries by getOffloadEntriesTable().
struct FuncOrGblEntryTy {
  __tgt_target_table Table;                 // Pointer pair handed to libomptarget.
  std::vector<__tgt_offload_entry> Entries; // Backing storage for the table.
};
63 
/// Execution modes a kernel may be compiled for. The numeric values (SPMD=0,
/// GENERIC=1) match the int8_t stored in the companion "<kernel>_exec_mode"
/// global read from the cubin in __tgt_rtl_load_binary.
enum ExecutionModeType {
  SPMD, // constructors, destructors,
        // combined constructs (`teams distribute parallel for [simd]`)
  GENERIC, // everything else
  NONE
};
70 
71 /// Use a single entity to encode a kernel and a set of flags
72 struct KernelTy {
73   CUfunction Func;
74 
75   // execution mode of kernel
76   // 0 - SPMD mode (without master warp)
77   // 1 - Generic mode (with master warp)
78   int8_t ExecutionMode;
79 
80   KernelTy(CUfunction _Func, int8_t _ExecutionMode)
81       : Func(_Func), ExecutionMode(_ExecutionMode) {}
82 };
83 
/// List that contains all the kernels.
/// A std::list is used (rather than a vector) so element addresses stay
/// stable: __tgt_rtl_load_binary stores &KernelsList.back() in entry.addr.
/// FIXME: we may need this to be per device and per library.
std::list<KernelTy> KernelsList;
87 
/// Class containing all the device information.
class RTLDeviceInfoTy {
  // One entries table per device, indexed by device id.
  std::vector<FuncOrGblEntryTy> FuncGblEntries;

public:
  int NumberOfDevices;              // Result of cuDeviceGetCount().
  std::vector<CUmodule> Modules;    // Every module loaded by load_binary.
  std::vector<CUcontext> Contexts;  // One context per device.

  // Device properties (filled per device by __tgt_rtl_init_device)
  std::vector<int> ThreadsPerBlock;
  std::vector<int> BlocksPerGrid;
  std::vector<int> WarpSize;

  // OpenMP properties: per-device defaults used when the launch does not
  // request explicit values.
  std::vector<int> NumTeams;
  std::vector<int> NumThreads;

  // OpenMP Environment properties; -1 means the variable was not set.
  int EnvNumTeams;   // From OMP_NUM_TEAMS.
  int EnvTeamLimit;  // From OMP_TEAM_LIMIT.

  //static int EnvNumThreads;
  // Absolute caps applied on top of whatever the device reports.
  static const int HardTeamLimit = 1<<16; // 64k
  static const int HardThreadLimit = 1024;
  // Fallback defaults when device properties cannot be queried.
  static const int DefaultNumTeams = 128;
  static const int DefaultNumThreads = 128;

  // Record entry point associated with device
  void addOffloadEntry(int32_t device_id, __tgt_offload_entry entry) {
    assert(device_id < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncOrGblEntryTy &E = FuncGblEntries[device_id];

    E.Entries.push_back(entry);
  }

  // Return true if the entry is associated with device
  bool findOffloadEntry(int32_t device_id, void *addr) {
    assert(device_id < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncOrGblEntryTy &E = FuncGblEntries[device_id];

    // Linear scan; entry counts are expected to be small.
    for (auto &it : E.Entries) {
      if (it.addr == addr)
        return true;
    }

    return false;
  }

  // Return the pointer to the target entries table
  __tgt_target_table *getOffloadEntriesTable(int32_t device_id) {
    assert(device_id < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncOrGblEntryTy &E = FuncGblEntries[device_id];

    int32_t size = E.Entries.size();

    // Table is empty
    if (!size)
      return 0;

    // Point the table at the vector's contiguous storage; EntriesEnd is
    // one-past-the-last entry.
    __tgt_offload_entry *begin = &E.Entries[0];
    __tgt_offload_entry *end = &E.Entries[size - 1];

    // Update table info according to the entries and return the pointer
    E.Table.EntriesBegin = begin;
    E.Table.EntriesEnd = ++end;

    return &E.Table;
  }

  // Clear entries table for a device
  void clearOffloadEntriesTable(int32_t device_id) {
    assert(device_id < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncOrGblEntryTy &E = FuncGblEntries[device_id];
    E.Entries.clear();
    E.Table.EntriesBegin = E.Table.EntriesEnd = 0;
  }

  // Initialize the CUDA driver, count devices, size the per-device vectors,
  // and parse the team-related environment variables. On any driver failure
  // it returns early, leaving NumberOfDevices at 0 so the host sees no
  // offload devices.
  RTLDeviceInfoTy() {
#ifdef OMPTARGET_DEBUG
    if (char *envStr = getenv("LIBOMPTARGET_DEBUG")) {
      DebugLevel = std::stoi(envStr);
    }
#endif // OMPTARGET_DEBUG

    DP("Start initializing CUDA\n");

    CUresult err = cuInit(0);
    if (err != CUDA_SUCCESS) {
      DP("Error when initializing CUDA\n");
      CUDA_ERR_STRING(err);
      return;
    }

    NumberOfDevices = 0;

    err = cuDeviceGetCount(&NumberOfDevices);
    if (err != CUDA_SUCCESS) {
      DP("Error when getting CUDA device count\n");
      CUDA_ERR_STRING(err);
      return;
    }

    if (NumberOfDevices == 0) {
      DP("There are no devices supporting CUDA.\n");
      return;
    }

    FuncGblEntries.resize(NumberOfDevices);
    Contexts.resize(NumberOfDevices);
    ThreadsPerBlock.resize(NumberOfDevices);
    BlocksPerGrid.resize(NumberOfDevices);
    WarpSize.resize(NumberOfDevices);
    NumTeams.resize(NumberOfDevices);
    NumThreads.resize(NumberOfDevices);

    // Get environment variables regarding teams
    char *envStr = getenv("OMP_TEAM_LIMIT");
    if (envStr) {
      // OMP_TEAM_LIMIT has been set
      EnvTeamLimit = std::stoi(envStr);
      DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
    } else {
      EnvTeamLimit = -1;
    }
    envStr = getenv("OMP_NUM_TEAMS");
    if (envStr) {
      // OMP_NUM_TEAMS has been set
      EnvNumTeams = std::stoi(envStr);
      DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
    } else {
      EnvNumTeams = -1;
    }
  }

  // Unload all modules and destroy all contexts. Errors are reported but not
  // propagated, since this runs during RTL teardown.
  ~RTLDeviceInfoTy() {
    // Close modules
    for (auto &module : Modules)
      if (module) {
        CUresult err = cuModuleUnload(module);
        if (err != CUDA_SUCCESS) {
          DP("Error when unloading CUDA module\n");
          CUDA_ERR_STRING(err);
        }
      }

    // Destroy contexts
    for (auto &ctx : Contexts)
      if (ctx) {
        CUresult err = cuCtxDestroy(ctx);
        if (err != CUDA_SUCCESS) {
          DP("Error when destroying CUDA context\n");
          CUDA_ERR_STRING(err);
        }
      }
  }
};
249 
// Global plugin state. Constructed when the RTL is loaded (initializes the
// CUDA driver and reads OMP_* environment variables) and destroyed at unload.
static RTLDeviceInfoTy DeviceInfo;
251 
252 #ifdef __cplusplus
253 extern "C" {
254 #endif
255 
256 int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
257   return elf_check_machine(image, 190); // EM_CUDA = 190.
258 }
259 
260 int32_t __tgt_rtl_number_of_devices() { return DeviceInfo.NumberOfDevices; }
261 
262 int32_t __tgt_rtl_init_device(int32_t device_id) {
263 
264   CUdevice cuDevice;
265   DP("Getting device %d\n", device_id);
266   CUresult err = cuDeviceGet(&cuDevice, device_id);
267   if (err != CUDA_SUCCESS) {
268     DP("Error when getting CUDA device with id = %d\n", device_id);
269     CUDA_ERR_STRING(err);
270     return OFFLOAD_FAIL;
271   }
272 
273   // Create the context and save it to use whenever this device is selected.
274   err = cuCtxCreate(&DeviceInfo.Contexts[device_id], CU_CTX_SCHED_BLOCKING_SYNC,
275                     cuDevice);
276   if (err != CUDA_SUCCESS) {
277     DP("Error when creating a CUDA context\n");
278     CUDA_ERR_STRING(err);
279     return OFFLOAD_FAIL;
280   }
281 
282   // scan properties to determine number of threads/block and blocks/grid.
283   struct cudaDeviceProp Properties;
284   cudaError_t error = cudaGetDeviceProperties(&Properties, device_id);
285   if (error != cudaSuccess) {
286     DP("Error getting device Properties, use defaults\n");
287     DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::DefaultNumTeams;
288     DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::DefaultNumThreads;
289     DeviceInfo.WarpSize[device_id] = 32;
290   } else {
291     // Get blocks per grid
292     if (Properties.maxGridSize[0] <= RTLDeviceInfoTy::HardTeamLimit) {
293       DeviceInfo.BlocksPerGrid[device_id] = Properties.maxGridSize[0];
294       DP("Using %d CUDA blocks per grid\n", Properties.maxGridSize[0]);
295     } else {
296       DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::HardTeamLimit;
297       DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping "
298           "at the hard limit\n", Properties.maxGridSize[0],
299           RTLDeviceInfoTy::HardTeamLimit);
300     }
301 
302     // Get threads per block, exploit threads only along x axis
303     if (Properties.maxThreadsDim[0] <= RTLDeviceInfoTy::HardThreadLimit) {
304       DeviceInfo.ThreadsPerBlock[device_id] = Properties.maxThreadsDim[0];
305       DP("Using %d CUDA threads per block\n", Properties.maxThreadsDim[0]);
306       if (Properties.maxThreadsDim[0] < Properties.maxThreadsPerBlock) {
307         DP("(fewer than max per block along all xyz dims %d)\n",
308             Properties.maxThreadsPerBlock);
309       }
310     } else {
311       DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::HardThreadLimit;
312       DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
313           "capping at the hard limit\n", Properties.maxThreadsDim[0],
314           RTLDeviceInfoTy::HardThreadLimit);
315     }
316 
317     // Get warp size
318     DeviceInfo.WarpSize[device_id] = Properties.warpSize;
319   }
320 
321   // Adjust teams to the env variables
322   if (DeviceInfo.EnvTeamLimit > 0 &&
323       DeviceInfo.BlocksPerGrid[device_id] > DeviceInfo.EnvTeamLimit) {
324     DeviceInfo.BlocksPerGrid[device_id] = DeviceInfo.EnvTeamLimit;
325     DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
326         DeviceInfo.EnvTeamLimit);
327   }
328 
329   DP("Max number of CUDA blocks %d, threads %d & warp size %d\n",
330      DeviceInfo.BlocksPerGrid[device_id], DeviceInfo.ThreadsPerBlock[device_id],
331      DeviceInfo.WarpSize[device_id]);
332 
333   // Set default number of teams
334   if (DeviceInfo.EnvNumTeams > 0) {
335     DeviceInfo.NumTeams[device_id] = DeviceInfo.EnvNumTeams;
336     DP("Default number of teams set according to environment %d\n",
337         DeviceInfo.EnvNumTeams);
338   } else {
339     DeviceInfo.NumTeams[device_id] = RTLDeviceInfoTy::DefaultNumTeams;
340     DP("Default number of teams set according to library's default %d\n",
341         RTLDeviceInfoTy::DefaultNumTeams);
342   }
343   if (DeviceInfo.NumTeams[device_id] > DeviceInfo.BlocksPerGrid[device_id]) {
344     DeviceInfo.NumTeams[device_id] = DeviceInfo.BlocksPerGrid[device_id];
345     DP("Default number of teams exceeds device limit, capping at %d\n",
346         DeviceInfo.BlocksPerGrid[device_id]);
347   }
348 
349   // Set default number of threads
350   DeviceInfo.NumThreads[device_id] = RTLDeviceInfoTy::DefaultNumThreads;
351   DP("Default number of threads set according to library's default %d\n",
352           RTLDeviceInfoTy::DefaultNumThreads);
353   if (DeviceInfo.NumThreads[device_id] >
354       DeviceInfo.ThreadsPerBlock[device_id]) {
355     DeviceInfo.NumTeams[device_id] = DeviceInfo.ThreadsPerBlock[device_id];
356     DP("Default number of threads exceeds device limit, capping at %d\n",
357         DeviceInfo.ThreadsPerBlock[device_id]);
358   }
359 
360   return OFFLOAD_SUCCESS;
361 }
362 
363 __tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
364     __tgt_device_image *image) {
365 
366   // Set the context we are using.
367   CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
368   if (err != CUDA_SUCCESS) {
369     DP("Error when setting a CUDA context for device %d\n", device_id);
370     CUDA_ERR_STRING(err);
371     return NULL;
372   }
373 
374   // Clear the offload table as we are going to create a new one.
375   DeviceInfo.clearOffloadEntriesTable(device_id);
376 
377   // Create the module and extract the function pointers.
378 
379   CUmodule cumod;
380   DP("Load data from image " DPxMOD "\n", DPxPTR(image->ImageStart));
381   err = cuModuleLoadDataEx(&cumod, image->ImageStart, 0, NULL, NULL);
382   if (err != CUDA_SUCCESS) {
383     DP("Error when loading CUDA module\n");
384     CUDA_ERR_STRING(err);
385     return NULL;
386   }
387 
388   DP("CUDA module successfully loaded!\n");
389   DeviceInfo.Modules.push_back(cumod);
390 
391   // Find the symbols in the module by name.
392   __tgt_offload_entry *HostBegin = image->EntriesBegin;
393   __tgt_offload_entry *HostEnd = image->EntriesEnd;
394 
395   for (__tgt_offload_entry *e = HostBegin; e != HostEnd; ++e) {
396 
397     if (!e->addr) {
398       // We return NULL when something like this happens, the host should have
399       // always something in the address to uniquely identify the target region.
400       DP("Invalid binary: host entry '<null>' (size = %zd)...\n", e->size);
401 
402       return NULL;
403     }
404 
405     if (e->size) {
406       __tgt_offload_entry entry = *e;
407 
408       CUdeviceptr cuptr;
409       size_t cusize;
410       err = cuModuleGetGlobal(&cuptr, &cusize, cumod, e->name);
411 
412       if (err != CUDA_SUCCESS) {
413         DP("Loading global '%s' (Failed)\n", e->name);
414         CUDA_ERR_STRING(err);
415         return NULL;
416       }
417 
418       if (cusize != e->size) {
419         DP("Loading global '%s' - size mismatch (%zd != %zd)\n", e->name,
420             cusize, e->size);
421         CUDA_ERR_STRING(err);
422         return NULL;
423       }
424 
425       DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
426           DPxPTR(e - HostBegin), e->name, DPxPTR(cuptr));
427       entry.addr = (void *)cuptr;
428 
429       DeviceInfo.addOffloadEntry(device_id, entry);
430 
431       continue;
432     }
433 
434     CUfunction fun;
435     err = cuModuleGetFunction(&fun, cumod, e->name);
436 
437     if (err != CUDA_SUCCESS) {
438       DP("Loading '%s' (Failed)\n", e->name);
439       CUDA_ERR_STRING(err);
440       return NULL;
441     }
442 
443     DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
444         DPxPTR(e - HostBegin), e->name, DPxPTR(fun));
445 
446     // default value GENERIC (in case symbol is missing from cubin file)
447     int8_t ExecModeVal = ExecutionModeType::GENERIC;
448     std::string ExecModeNameStr (e->name);
449     ExecModeNameStr += "_exec_mode";
450     const char *ExecModeName = ExecModeNameStr.c_str();
451 
452     CUdeviceptr ExecModePtr;
453     size_t cusize;
454     err = cuModuleGetGlobal(&ExecModePtr, &cusize, cumod, ExecModeName);
455     if (err == CUDA_SUCCESS) {
456       if ((size_t)cusize != sizeof(int8_t)) {
457         DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
458            ExecModeName, cusize, sizeof(int8_t));
459         CUDA_ERR_STRING(err);
460         return NULL;
461       }
462 
463       err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, cusize);
464       if (err != CUDA_SUCCESS) {
465         DP("Error when copying data from device to host. Pointers: "
466            "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
467            DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), cusize);
468         CUDA_ERR_STRING(err);
469         return NULL;
470       }
471 
472       if (ExecModeVal < 0 || ExecModeVal > 1) {
473         DP("Error wrong exec_mode value specified in cubin file: %d\n",
474            ExecModeVal);
475         return NULL;
476       }
477     } else {
478       DP("Loading global exec_mode '%s' - symbol missing, using default value "
479           "GENERIC (1)\n", ExecModeName);
480       CUDA_ERR_STRING(err);
481     }
482 
483     KernelsList.push_back(KernelTy(fun, ExecModeVal));
484 
485     __tgt_offload_entry entry = *e;
486     entry.addr = (void *)&KernelsList.back();
487     DeviceInfo.addOffloadEntry(device_id, entry);
488   }
489 
490   return DeviceInfo.getOffloadEntriesTable(device_id);
491 }
492 
493 void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *hst_ptr) {
494   if (size == 0) {
495     return NULL;
496   }
497 
498   // Set the context we are using.
499   CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
500   if (err != CUDA_SUCCESS) {
501     DP("Error while trying to set CUDA current context\n");
502     CUDA_ERR_STRING(err);
503     return NULL;
504   }
505 
506   CUdeviceptr ptr;
507   err = cuMemAlloc(&ptr, size);
508   if (err != CUDA_SUCCESS) {
509     DP("Error while trying to allocate %d\n", err);
510     CUDA_ERR_STRING(err);
511     return NULL;
512   }
513 
514   void *vptr = (void *)ptr;
515   return vptr;
516 }
517 
518 int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
519     int64_t size) {
520   // Set the context we are using.
521   CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
522   if (err != CUDA_SUCCESS) {
523     DP("Error when setting CUDA context\n");
524     CUDA_ERR_STRING(err);
525     return OFFLOAD_FAIL;
526   }
527 
528   err = cuMemcpyHtoD((CUdeviceptr)tgt_ptr, hst_ptr, size);
529   if (err != CUDA_SUCCESS) {
530     DP("Error when copying data from host to device. Pointers: host = " DPxMOD
531        ", device = " DPxMOD ", size = %" PRId64 "\n", DPxPTR(hst_ptr),
532        DPxPTR(tgt_ptr), size);
533     CUDA_ERR_STRING(err);
534     return OFFLOAD_FAIL;
535   }
536   return OFFLOAD_SUCCESS;
537 }
538 
539 int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
540     int64_t size) {
541   // Set the context we are using.
542   CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
543   if (err != CUDA_SUCCESS) {
544     DP("Error when setting CUDA context\n");
545     CUDA_ERR_STRING(err);
546     return OFFLOAD_FAIL;
547   }
548 
549   err = cuMemcpyDtoH(hst_ptr, (CUdeviceptr)tgt_ptr, size);
550   if (err != CUDA_SUCCESS) {
551     DP("Error when copying data from device to host. Pointers: host = " DPxMOD
552         ", device = " DPxMOD ", size = %" PRId64 "\n", DPxPTR(hst_ptr),
553         DPxPTR(tgt_ptr), size);
554     CUDA_ERR_STRING(err);
555     return OFFLOAD_FAIL;
556   }
557   return OFFLOAD_SUCCESS;
558 }
559 
560 int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
561   // Set the context we are using.
562   CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
563   if (err != CUDA_SUCCESS) {
564     DP("Error when setting CUDA context\n");
565     CUDA_ERR_STRING(err);
566     return OFFLOAD_FAIL;
567   }
568 
569   err = cuMemFree((CUdeviceptr)tgt_ptr);
570   if (err != CUDA_SUCCESS) {
571     DP("Error when freeing CUDA memory\n");
572     CUDA_ERR_STRING(err);
573     return OFFLOAD_FAIL;
574   }
575   return OFFLOAD_SUCCESS;
576 }
577 
578 int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
579     void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
580     int32_t thread_limit, uint64_t loop_tripcount) {
581   // Set the context we are using.
582   CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
583   if (err != CUDA_SUCCESS) {
584     DP("Error when setting CUDA context\n");
585     CUDA_ERR_STRING(err);
586     return OFFLOAD_FAIL;
587   }
588 
589   // All args are references.
590   std::vector<void *> args(arg_num);
591   std::vector<void *> ptrs(arg_num);
592 
593   for (int32_t i = 0; i < arg_num; ++i) {
594     ptrs[i] = (void *)((intptr_t)tgt_args[i] + tgt_offsets[i]);
595     args[i] = &ptrs[i];
596   }
597 
598   KernelTy *KernelInfo = (KernelTy *)tgt_entry_ptr;
599 
600   int cudaThreadsPerBlock;
601 
602   if (thread_limit > 0) {
603     cudaThreadsPerBlock = thread_limit;
604     DP("Setting CUDA threads per block to requested %d\n", thread_limit);
605     // Add master warp if necessary
606     if (KernelInfo->ExecutionMode == GENERIC) {
607       cudaThreadsPerBlock += DeviceInfo.WarpSize[device_id];
608       DP("Adding master warp: +%d threads\n", DeviceInfo.WarpSize[device_id]);
609     }
610   } else {
611     cudaThreadsPerBlock = DeviceInfo.NumThreads[device_id];
612     DP("Setting CUDA threads per block to default %d\n",
613         DeviceInfo.NumThreads[device_id]);
614   }
615 
616   if (cudaThreadsPerBlock > DeviceInfo.ThreadsPerBlock[device_id]) {
617     cudaThreadsPerBlock = DeviceInfo.ThreadsPerBlock[device_id];
618     DP("Threads per block capped at device limit %d\n",
619         DeviceInfo.ThreadsPerBlock[device_id]);
620   }
621 
622   int kernel_limit;
623   err = cuFuncGetAttribute(&kernel_limit,
624       CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, KernelInfo->Func);
625   if (err == CUDA_SUCCESS) {
626     if (kernel_limit < cudaThreadsPerBlock) {
627       cudaThreadsPerBlock = kernel_limit;
628       DP("Threads per block capped at kernel limit %d\n", kernel_limit);
629     }
630   }
631 
632   int cudaBlocksPerGrid;
633   if (team_num <= 0) {
634     if (loop_tripcount > 0 && DeviceInfo.EnvNumTeams < 0) {
635       if (KernelInfo->ExecutionMode == SPMD) {
636         // We have a combined construct, i.e. `target teams distribute parallel
637         // for [simd]`. We launch so many teams so that each thread will
638         // execute one iteration of the loop.
639         // round up to the nearest integer
640         cudaBlocksPerGrid = ((loop_tripcount - 1) / cudaThreadsPerBlock) + 1;
641       } else {
642         // If we reach this point, then we have a non-combined construct, i.e.
643         // `teams distribute` with a nested `parallel for` and each team is
644         // assigned one iteration of the `distribute` loop. E.g.:
645         //
646         // #pragma omp target teams distribute
647         // for(...loop_tripcount...) {
648         //   #pragma omp parallel for
649         //   for(...) {}
650         // }
651         //
652         // Threads within a team will execute the iterations of the `parallel`
653         // loop.
654         cudaBlocksPerGrid = loop_tripcount;
655       }
656       DP("Using %d teams due to loop trip count %" PRIu64 " and number of "
657           "threads per block %d\n", cudaBlocksPerGrid, loop_tripcount,
658           cudaThreadsPerBlock);
659     } else {
660       cudaBlocksPerGrid = DeviceInfo.NumTeams[device_id];
661       DP("Using default number of teams %d\n", DeviceInfo.NumTeams[device_id]);
662     }
663   } else if (team_num > DeviceInfo.BlocksPerGrid[device_id]) {
664     cudaBlocksPerGrid = DeviceInfo.BlocksPerGrid[device_id];
665     DP("Capping number of teams to team limit %d\n",
666         DeviceInfo.BlocksPerGrid[device_id]);
667   } else {
668     cudaBlocksPerGrid = team_num;
669     DP("Using requested number of teams %d\n", team_num);
670   }
671 
672   // Run on the device.
673   DP("Launch kernel with %d blocks and %d threads\n", cudaBlocksPerGrid,
674      cudaThreadsPerBlock);
675 
676   err = cuLaunchKernel(KernelInfo->Func, cudaBlocksPerGrid, 1, 1,
677       cudaThreadsPerBlock, 1, 1, 0 /*bytes of shared memory*/, 0, &args[0], 0);
678   if (err != CUDA_SUCCESS) {
679     DP("Device kernel launch failed!\n");
680     CUDA_ERR_STRING(err);
681     assert(err == CUDA_SUCCESS && "Unable to launch target execution!");
682     return OFFLOAD_FAIL;
683   }
684 
685   DP("Launch of entry point at " DPxMOD " successful!\n",
686       DPxPTR(tgt_entry_ptr));
687 
688   cudaError_t sync_error = cudaDeviceSynchronize();
689   if (sync_error != cudaSuccess) {
690   DP("Kernel execution error at " DPxMOD ", %s.\n", DPxPTR(tgt_entry_ptr),
691       cudaGetErrorString(sync_error));
692     return OFFLOAD_FAIL;
693   } else {
694     DP("Kernel execution at " DPxMOD " successful!\n", DPxPTR(tgt_entry_ptr));
695   }
696 
697   return OFFLOAD_SUCCESS;
698 }
699 
700 int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
701     void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num) {
702   // use one team and the default number of threads.
703   const int32_t team_num = 1;
704   const int32_t thread_limit = 0;
705   return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args,
706       tgt_offsets, arg_num, team_num, thread_limit, 0);
707 }
708 
709 #ifdef __cplusplus
710 }
711 #endif
712