//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// RTL for CUDA machine
//
//===----------------------------------------------------------------------===//

#include <cassert>
#include <cstddef>
#include <cuda.h>
#include <list>
#include <string>
#include <vector>

#include "omptargetplugin.h"

#ifndef TARGET_NAME
#define TARGET_NAME CUDA
#endif

#ifdef OMPTARGET_DEBUG
static int DebugLevel = 0;

#define GETNAME2(name) #name
#define GETNAME(name) GETNAME2(name)
#define DP(...) \
  do { \
    if (DebugLevel > 0) { \
      DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", __VA_ARGS__); \
    } \
  } while (false)
#else // OMPTARGET_DEBUG
#define DP(...) {}
#endif // OMPTARGET_DEBUG
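
// Example (illustrative): in a debug build (-DOMPTARGET_DEBUG) with
// LIBOMPTARGET_DEBUG=1 set in the environment, a call such as
//
//   DP("Using %d teams\n", 128);
//
// prints a message of the form
//
//   Target CUDA RTL --> Using 128 teams
//
// with the prefix produced by DEBUGP from the common plugin headers; in
// non-debug builds DP() compiles to nothing.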

#include "../../common/elf_common.c"

// Utility for retrieving and printing CUDA error string.
#ifdef CUDA_ERROR_REPORT
#define CUDA_ERR_STRING(err)                                                   \
  do {                                                                         \
    const char *errStr;                                                        \
    cuGetErrorString(err, &errStr);                                            \
    DP("CUDA error is: %s\n", errStr);                                         \
  } while (0)
#else
#define CUDA_ERR_STRING(err)                                                   \
  {}
#endif
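
// Typical usage pattern for CUDA_ERR_STRING throughout this file (an
// illustrative sketch of the idiom, not new code): every driver API call
// returns a CUresult, which is checked and reported before bailing out.
//
//   CUresult err = cuCtxSetCurrent(ctx);
//   if (err != CUDA_SUCCESS) {
//     DP("Error when setting CUDA context\n");
//     CUDA_ERR_STRING(err); // resolves err to text via cuGetErrorString
//                           // when CUDA_ERROR_REPORT is defined
//     return OFFLOAD_FAIL;
//   }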

/// Keep entries table per device.
struct FuncOrGblEntryTy {
  __tgt_target_table Table;
  std::vector<__tgt_offload_entry> Entries;
};

enum ExecutionModeType {
  SPMD,    // constructors, destructors,
           // combined constructs (`teams distribute parallel for [simd]`)
  GENERIC, // everything else
  NONE
};

/// Use a single entity to encode a kernel and a set of flags.
struct KernelTy {
  CUfunction Func;

  // Execution mode of the kernel:
  // 0 - SPMD mode (without master warp)
  // 1 - Generic mode (with master warp)
  int8_t ExecutionMode;

  KernelTy(CUfunction _Func, int8_t _ExecutionMode)
      : Func(_Func), ExecutionMode(_ExecutionMode) {}
};
/// Device environment data.
/// Manually kept in sync with the deviceRTL side for now; to be moved to a
/// dedicated header file later.
struct omptarget_device_environmentTy {
  int32_t debug_level;
};
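
// For reference, a minimal sketch of the matching deviceRTL-side definition
// this struct must stay in sync with. The global's name is what
// __tgt_rtl_load_binary looks up below; the CUDA-C form here is illustrative:
//
//   struct omptarget_device_environmentTy {
//     int32_t debug_level;
//   };
//   __device__ omptarget_device_environmentTy omptarget_device_environment;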

/// List that contains all the kernels.
/// FIXME: we may need this to be per device and per library.
std::list<KernelTy> KernelsList;

/// Class containing all the device information.
class RTLDeviceInfoTy {
  std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries;

public:
  int NumberOfDevices;
  std::vector<CUmodule> Modules;
  std::vector<CUcontext> Contexts;

  // Device properties
  std::vector<int> ThreadsPerBlock;
  std::vector<int> BlocksPerGrid;
  std::vector<int> WarpSize;

  // OpenMP properties
  std::vector<int> NumTeams;
  std::vector<int> NumThreads;

  // OpenMP environment properties
  int EnvNumTeams;
  int EnvTeamLimit;

  // OpenMP requires flags
  int64_t RequiresFlags;

  //static int EnvNumThreads;
  static const int HardTeamLimit = 1 << 16; // 64k
  static const int HardThreadLimit = 1024;
  static const int DefaultNumTeams = 128;
  static const int DefaultNumThreads = 128;

  // Record an entry point associated with a device.
  void addOffloadEntry(int32_t device_id, __tgt_offload_entry entry) {
    assert(device_id < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();

    E.Entries.push_back(entry);
  }

  // Return true if the entry is associated with the device.
  bool findOffloadEntry(int32_t device_id, void *addr) {
    assert(device_id < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();

    for (auto &it : E.Entries) {
      if (it.addr == addr)
        return true;
    }

    return false;
  }

  // Return the pointer to the target entries table.
  __tgt_target_table *getOffloadEntriesTable(int32_t device_id) {
    assert(device_id < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();

    int32_t size = E.Entries.size();

    // Table is empty.
    if (!size)
      return nullptr;

    __tgt_offload_entry *begin = &E.Entries[0];
    __tgt_offload_entry *end = &E.Entries[size - 1];

    // Update table info according to the entries and return the pointer.
    E.Table.EntriesBegin = begin;
    E.Table.EntriesEnd = ++end;

    return &E.Table;
  }

  // Clear the entries table for a device.
  void clearOffloadEntriesTable(int32_t device_id) {
    assert(device_id < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncGblEntries[device_id].emplace_back();
    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
    E.Entries.clear();
    E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr;
  }
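
  // Illustrative lifecycle of the per-device table above, as driven by
  // __tgt_rtl_load_binary further down (a sketch, not additional API):
  //
  //   DeviceInfo.clearOffloadEntriesTable(device_id);  // start a fresh table
  //   for (/* each host offload entry */) {
  //     __tgt_offload_entry entry = /* entry with resolved device address */;
  //     DeviceInfo.addOffloadEntry(device_id, entry);
  //   }
  //   return DeviceInfo.getOffloadEntriesTable(device_id); // to libomptarget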

  RTLDeviceInfoTy() {
#ifdef OMPTARGET_DEBUG
    if (char *envStr = getenv("LIBOMPTARGET_DEBUG")) {
      DebugLevel = std::stoi(envStr);
    }
#endif // OMPTARGET_DEBUG

    DP("Start initializing CUDA\n");

    CUresult err = cuInit(0);
    if (err != CUDA_SUCCESS) {
      DP("Error when initializing CUDA\n");
      CUDA_ERR_STRING(err);
      return;
    }

    NumberOfDevices = 0;

    err = cuDeviceGetCount(&NumberOfDevices);
    if (err != CUDA_SUCCESS) {
      DP("Error when getting CUDA device count\n");
      CUDA_ERR_STRING(err);
      return;
    }

    if (NumberOfDevices == 0) {
      DP("There are no devices supporting CUDA.\n");
      return;
    }

    FuncGblEntries.resize(NumberOfDevices);
    Contexts.resize(NumberOfDevices);
    ThreadsPerBlock.resize(NumberOfDevices);
    BlocksPerGrid.resize(NumberOfDevices);
    WarpSize.resize(NumberOfDevices);
    NumTeams.resize(NumberOfDevices);
    NumThreads.resize(NumberOfDevices);

    // Get environment variables regarding teams.
    char *envStr = getenv("OMP_TEAM_LIMIT");
    if (envStr) {
      // OMP_TEAM_LIMIT has been set.
      EnvTeamLimit = std::stoi(envStr);
      DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
    } else {
      EnvTeamLimit = -1;
    }
    envStr = getenv("OMP_NUM_TEAMS");
    if (envStr) {
      // OMP_NUM_TEAMS has been set.
      EnvNumTeams = std::stoi(envStr);
      DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
    } else {
      EnvNumTeams = -1;
    }

    // Default state.
    RequiresFlags = OMP_REQ_UNDEFINED;
  }

  ~RTLDeviceInfoTy() {
    // Close modules.
    for (auto &module : Modules)
      if (module) {
        CUresult err = cuModuleUnload(module);
        if (err != CUDA_SUCCESS) {
          DP("Error when unloading CUDA module\n");
          CUDA_ERR_STRING(err);
        }
      }

    // Destroy contexts.
    for (auto &ctx : Contexts)
      if (ctx) {
        CUresult err = cuCtxDestroy(ctx);
        if (err != CUDA_SUCCESS) {
          DP("Error when destroying CUDA context\n");
          CUDA_ERR_STRING(err);
        }
      }
  }
};

static RTLDeviceInfoTy DeviceInfo;

#ifdef __cplusplus
extern "C" {
#endif

int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
  return elf_check_machine(image, 190); // EM_CUDA = 190.
}

int32_t __tgt_rtl_number_of_devices() { return DeviceInfo.NumberOfDevices; }

int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
  DP("Init requires flags to %" PRId64 "\n", RequiresFlags);
  DeviceInfo.RequiresFlags = RequiresFlags;
  return RequiresFlags;
}

int32_t __tgt_rtl_init_device(int32_t device_id) {

  CUdevice cuDevice;
  DP("Getting device %d\n", device_id);
  CUresult err = cuDeviceGet(&cuDevice, device_id);
  if (err != CUDA_SUCCESS) {
    DP("Error when getting CUDA device with id = %d\n", device_id);
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  // Create the context and save it to use whenever this device is selected.
  err = cuCtxCreate(&DeviceInfo.Contexts[device_id], CU_CTX_SCHED_BLOCKING_SYNC,
                    cuDevice);
  if (err != CUDA_SUCCESS) {
    DP("Error when creating a CUDA context\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  // Query attributes to determine number of threads/block and blocks/grid.
  int maxGridDimX;
  err = cuDeviceGetAttribute(&maxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
                             cuDevice);
  if (err != CUDA_SUCCESS) {
    DP("Error getting max grid dimension, use default\n");
    DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::DefaultNumTeams;
  } else if (maxGridDimX <= RTLDeviceInfoTy::HardTeamLimit) {
    DeviceInfo.BlocksPerGrid[device_id] = maxGridDimX;
    DP("Using %d CUDA blocks per grid\n", maxGridDimX);
  } else {
    DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::HardTeamLimit;
    DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping "
       "at the hard limit\n",
       maxGridDimX, RTLDeviceInfoTy::HardTeamLimit);
  }

  // We are only exploiting threads along the x axis.
  int maxBlockDimX;
  err = cuDeviceGetAttribute(&maxBlockDimX, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X,
                             cuDevice);
  if (err != CUDA_SUCCESS) {
    DP("Error getting max block dimension, use default\n");
    DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::DefaultNumThreads;
  } else if (maxBlockDimX <= RTLDeviceInfoTy::HardThreadLimit) {
    DeviceInfo.ThreadsPerBlock[device_id] = maxBlockDimX;
    DP("Using %d CUDA threads per block\n", maxBlockDimX);
  } else {
    DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::HardThreadLimit;
    DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
       "capping at the hard limit\n",
       maxBlockDimX, RTLDeviceInfoTy::HardThreadLimit);
  }

  int warpSize;
  err =
      cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, cuDevice);
  if (err != CUDA_SUCCESS) {
    DP("Error getting warp size, assume default\n");
    DeviceInfo.WarpSize[device_id] = 32;
  } else {
    DeviceInfo.WarpSize[device_id] = warpSize;
  }

  // Adjust teams to the env variables.
  if (DeviceInfo.EnvTeamLimit > 0 &&
      DeviceInfo.BlocksPerGrid[device_id] > DeviceInfo.EnvTeamLimit) {
    DeviceInfo.BlocksPerGrid[device_id] = DeviceInfo.EnvTeamLimit;
    DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
       DeviceInfo.EnvTeamLimit);
  }

  DP("Max number of CUDA blocks %d, threads %d & warp size %d\n",
     DeviceInfo.BlocksPerGrid[device_id], DeviceInfo.ThreadsPerBlock[device_id],
     DeviceInfo.WarpSize[device_id]);

  // Set default number of teams.
  if (DeviceInfo.EnvNumTeams > 0) {
    DeviceInfo.NumTeams[device_id] = DeviceInfo.EnvNumTeams;
    DP("Default number of teams set according to environment %d\n",
       DeviceInfo.EnvNumTeams);
  } else {
    DeviceInfo.NumTeams[device_id] = RTLDeviceInfoTy::DefaultNumTeams;
    DP("Default number of teams set according to library's default %d\n",
       RTLDeviceInfoTy::DefaultNumTeams);
  }
  if (DeviceInfo.NumTeams[device_id] > DeviceInfo.BlocksPerGrid[device_id]) {
    DeviceInfo.NumTeams[device_id] = DeviceInfo.BlocksPerGrid[device_id];
    DP("Default number of teams exceeds device limit, capping at %d\n",
       DeviceInfo.BlocksPerGrid[device_id]);
  }

  // Set default number of threads.
  DeviceInfo.NumThreads[device_id] = RTLDeviceInfoTy::DefaultNumThreads;
  DP("Default number of threads set according to library's default %d\n",
     RTLDeviceInfoTy::DefaultNumThreads);
  if (DeviceInfo.NumThreads[device_id] >
      DeviceInfo.ThreadsPerBlock[device_id]) {
    DeviceInfo.NumThreads[device_id] = DeviceInfo.ThreadsPerBlock[device_id];
    DP("Default number of threads exceeds device limit, capping at %d\n",
       DeviceInfo.ThreadsPerBlock[device_id]);
  }

  return OFFLOAD_SUCCESS;
}
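
// Illustrative environment for the knobs read above (the values are examples
// only, not defaults):
//
//   LIBOMPTARGET_DEBUG=1   # enable DP() output in debug builds
//   OMP_TEAM_LIMIT=1024    # cap the maximum number of blocks per grid
//   OMP_NUM_TEAMS=256      # override the default number of teams
//
// With these set, a target region launched without an explicit num_teams
// clause uses OMP_NUM_TEAMS teams, capped at the (possibly OMP_TEAM_LIMIT
// reduced) device block limit.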

__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
    __tgt_device_image *image) {

  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error when setting a CUDA context for device %d\n", device_id);
    CUDA_ERR_STRING(err);
    return NULL;
  }

  // Clear the offload table as we are going to create a new one.
  DeviceInfo.clearOffloadEntriesTable(device_id);

  // Create the module and extract the function pointers.

  CUmodule cumod;
  DP("Load data from image " DPxMOD "\n", DPxPTR(image->ImageStart));
  err = cuModuleLoadDataEx(&cumod, image->ImageStart, 0, NULL, NULL);
  if (err != CUDA_SUCCESS) {
    DP("Error when loading CUDA module\n");
    CUDA_ERR_STRING(err);
    return NULL;
  }

  DP("CUDA module successfully loaded!\n");
  DeviceInfo.Modules.push_back(cumod);

  // Find the symbols in the module by name.
  __tgt_offload_entry *HostBegin = image->EntriesBegin;
  __tgt_offload_entry *HostEnd = image->EntriesEnd;

  for (__tgt_offload_entry *e = HostBegin; e != HostEnd; ++e) {

    if (!e->addr) {
      // We return NULL when something like this happens; the host should
      // always have something in the address to uniquely identify the target
      // region.
      DP("Invalid binary: host entry '<null>' (size = %zd)...\n", e->size);

      return NULL;
    }

    if (e->size) {
      __tgt_offload_entry entry = *e;

      CUdeviceptr cuptr;
      size_t cusize;
      err = cuModuleGetGlobal(&cuptr, &cusize, cumod, e->name);

      if (err != CUDA_SUCCESS) {
        DP("Loading global '%s' (Failed)\n", e->name);
        CUDA_ERR_STRING(err);
        return NULL;
      }
      if (cusize != e->size) {
        DP("Loading global '%s' - size mismatch (%zd != %zd)\n", e->name,
           cusize, e->size);
        return NULL;
      }

      DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
         DPxPTR(e - HostBegin), e->name, DPxPTR(cuptr));
      entry.addr = (void *)cuptr;

      // Note: In the current implementation declare target variables
      // can either be link or to. This means that once unified
      // memory is activated via the requires directive, the variable
      // can be used directly from the host in both cases.
      // TODO: when variable types other than to or link are added,
      // the below condition should be changed to explicitly
      // check for to and link variable types:
      //  (DeviceInfo.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
      //   (e->flags & OMP_DECLARE_TARGET_LINK ||
      //    e->flags == OMP_DECLARE_TARGET_TO))
      if (DeviceInfo.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
        // If unified memory is present any target link or to variables
        // can access host addresses directly. There is no longer a
        // need for device copies.
        err = cuMemcpyHtoD(cuptr, e->addr, sizeof(void *));
        if (err != CUDA_SUCCESS) {
          DP("Error when copying linked variable host address to device\n");
          CUDA_ERR_STRING(err);
          return NULL;
        }
        DP("Copy linked variable host address (" DPxMOD ") "
           "to device address (" DPxMOD ")\n",
           DPxPTR(*((void **)e->addr)), DPxPTR(cuptr));
      }

      DeviceInfo.addOffloadEntry(device_id, entry);

      continue;
    }

    CUfunction fun;
    err = cuModuleGetFunction(&fun, cumod, e->name);

    if (err != CUDA_SUCCESS) {
      DP("Loading '%s' (Failed)\n", e->name);
      CUDA_ERR_STRING(err);
      return NULL;
    }

    DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
       DPxPTR(e - HostBegin), e->name, DPxPTR(fun));

    // Default value GENERIC (in case the symbol is missing from the cubin
    // file).
    int8_t ExecModeVal = ExecutionModeType::GENERIC;
    std::string ExecModeNameStr(e->name);
    ExecModeNameStr += "_exec_mode";
    const char *ExecModeName = ExecModeNameStr.c_str();

    CUdeviceptr ExecModePtr;
    size_t cusize;
    err = cuModuleGetGlobal(&ExecModePtr, &cusize, cumod, ExecModeName);
    if (err == CUDA_SUCCESS) {
      if ((size_t)cusize != sizeof(int8_t)) {
        DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
           ExecModeName, cusize, sizeof(int8_t));
        return NULL;
      }

      err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, cusize);
      if (err != CUDA_SUCCESS) {
        DP("Error when copying data from device to host. Pointers: "
           "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
           DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), cusize);
        CUDA_ERR_STRING(err);
        return NULL;
      }

      if (ExecModeVal < 0 || ExecModeVal > 1) {
        DP("Error wrong exec_mode value specified in cubin file: %d\n",
           ExecModeVal);
        return NULL;
      }
    } else {
      DP("Loading global exec_mode '%s' - symbol missing, using default value "
         "GENERIC (1)\n", ExecModeName);
      CUDA_ERR_STRING(err);
    }
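
    // For reference, a sketch of the per-kernel control symbol this block
    // reads; it is emitted alongside each target region, and the mangled
    // kernel name below is made up for illustration:
    //
    //   __device__ int8_t __omp_offloading_10301_abcdef_foo_l42_exec_mode = 1;
    //
    // 0 selects SPMD mode, 1 selects Generic mode; when the symbol is absent
    // we fall back to GENERIC above.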

    KernelsList.push_back(KernelTy(fun, ExecModeVal));

    __tgt_offload_entry entry = *e;
    entry.addr = (void *)&KernelsList.back();
    DeviceInfo.addOffloadEntry(device_id, entry);
  }

  // Send device environment data to the device.
  {
    omptarget_device_environmentTy device_env;

    device_env.debug_level = 0;

#ifdef OMPTARGET_DEBUG
    if (char *envStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) {
      device_env.debug_level = std::stoi(envStr);
    }
#endif

    const char *device_env_Name = "omptarget_device_environment";
    CUdeviceptr device_env_Ptr;
    size_t cusize;

    err = cuModuleGetGlobal(&device_env_Ptr, &cusize, cumod, device_env_Name);

    if (err == CUDA_SUCCESS) {
      if ((size_t)cusize != sizeof(device_env)) {
        DP("Global device_environment '%s' - size mismatch (%zu != %zu)\n",
           device_env_Name, cusize, sizeof(device_env));
        return NULL;
      }

      err = cuMemcpyHtoD(device_env_Ptr, &device_env, cusize);
      if (err != CUDA_SUCCESS) {
        DP("Error when copying data from host to device. Pointers: "
           "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
           DPxPTR(&device_env), DPxPTR(device_env_Ptr), cusize);
        CUDA_ERR_STRING(err);
        return NULL;
      }

      DP("Sending global device environment data %zu bytes\n", (size_t)cusize);
    } else {
      DP("Finding global device environment '%s' - symbol missing.\n",
         device_env_Name);
      DP("Continuing; this is presumably a device RTL that does not accept "
         "environment settings.\n");
    }
  }

  return DeviceInfo.getOffloadEntriesTable(device_id);
}

void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *hst_ptr) {
  if (size == 0) {
    return NULL;
  }

  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error while trying to set CUDA current context\n");
    CUDA_ERR_STRING(err);
    return NULL;
  }

  CUdeviceptr ptr;
  err = cuMemAlloc(&ptr, size);
  if (err != CUDA_SUCCESS) {
    DP("Error while trying to allocate %d\n", err);
    CUDA_ERR_STRING(err);
    return NULL;
  }

  void *vptr = (void *)ptr;
  return vptr;
}

int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
    int64_t size) {
  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error when setting CUDA context\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  err = cuMemcpyHtoD((CUdeviceptr)tgt_ptr, hst_ptr, size);
  if (err != CUDA_SUCCESS) {
    DP("Error when copying data from host to device. Pointers: host = " DPxMOD
       ", device = " DPxMOD ", size = %" PRId64 "\n", DPxPTR(hst_ptr),
       DPxPTR(tgt_ptr), size);
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }
  return OFFLOAD_SUCCESS;
}

int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
    int64_t size) {
  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error when setting CUDA context\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  err = cuMemcpyDtoH(hst_ptr, (CUdeviceptr)tgt_ptr, size);
  if (err != CUDA_SUCCESS) {
    DP("Error when copying data from device to host. Pointers: host = " DPxMOD
       ", device = " DPxMOD ", size = %" PRId64 "\n", DPxPTR(hst_ptr),
       DPxPTR(tgt_ptr), size);
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }
  return OFFLOAD_SUCCESS;
}

int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error when setting CUDA context\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  err = cuMemFree((CUdeviceptr)tgt_ptr);
  if (err != CUDA_SUCCESS) {
    DP("Error when freeing CUDA memory\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }
  return OFFLOAD_SUCCESS;
}

int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
    void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
    int32_t thread_limit, uint64_t loop_tripcount) {
  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error when setting CUDA context\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  // cuLaunchKernel expects an array of pointers to the kernel arguments.
  // Compute each argument's effective device address (base + offset) in ptrs,
  // and pass the address of each ptrs[i] in args.
  std::vector<void *> args(arg_num);
  std::vector<void *> ptrs(arg_num);

  for (int32_t i = 0; i < arg_num; ++i) {
    ptrs[i] = (void *)((intptr_t)tgt_args[i] + tgt_offsets[i]);
    args[i] = &ptrs[i];
  }
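
  // Illustration (hypothetical): for a kernel taking (int *a, double *b),
  // libomptarget passes tgt_args = {dev_a, dev_b} with matching tgt_offsets,
  // and the loop above yields
  //
  //   ptrs = { dev_a + off0, dev_b + off1 }
  //   args = { &ptrs[0], &ptrs[1] }
  //
  // so that cuLaunchKernel(..., &args[0], ...) below reads each *args[i] as
  // the corresponding kernel parameter.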

  KernelTy *KernelInfo = (KernelTy *)tgt_entry_ptr;

  int cudaThreadsPerBlock;

  if (thread_limit > 0) {
    cudaThreadsPerBlock = thread_limit;
    DP("Setting CUDA threads per block to requested %d\n", thread_limit);
    // Add master warp if necessary.
    if (KernelInfo->ExecutionMode == GENERIC) {
      cudaThreadsPerBlock += DeviceInfo.WarpSize[device_id];
      DP("Adding master warp: +%d threads\n", DeviceInfo.WarpSize[device_id]);
    }
  } else {
    cudaThreadsPerBlock = DeviceInfo.NumThreads[device_id];
    DP("Setting CUDA threads per block to default %d\n",
       DeviceInfo.NumThreads[device_id]);
  }

  if (cudaThreadsPerBlock > DeviceInfo.ThreadsPerBlock[device_id]) {
    cudaThreadsPerBlock = DeviceInfo.ThreadsPerBlock[device_id];
    DP("Threads per block capped at device limit %d\n",
       DeviceInfo.ThreadsPerBlock[device_id]);
  }

  int kernel_limit;
  err = cuFuncGetAttribute(&kernel_limit,
      CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, KernelInfo->Func);
  if (err == CUDA_SUCCESS) {
    if (kernel_limit < cudaThreadsPerBlock) {
      cudaThreadsPerBlock = kernel_limit;
      DP("Threads per block capped at kernel limit %d\n", kernel_limit);
    }
  }

  int cudaBlocksPerGrid;
  if (team_num <= 0) {
    if (loop_tripcount > 0 && DeviceInfo.EnvNumTeams < 0) {
      if (KernelInfo->ExecutionMode == SPMD) {
        // We have a combined construct, i.e. `target teams distribute
        // parallel for [simd]`. We launch enough teams so that each thread
        // will execute one iteration of the loop; round up to the nearest
        // integer.
        cudaBlocksPerGrid = ((loop_tripcount - 1) / cudaThreadsPerBlock) + 1;
      } else {
        // If we reach this point, then we have a non-combined construct, i.e.
        // `teams distribute` with a nested `parallel for` and each team is
        // assigned one iteration of the `distribute` loop. E.g.:
        //
        // #pragma omp target teams distribute
        // for(...loop_tripcount...) {
        //   #pragma omp parallel for
        //   for(...) {}
        // }
        //
        // Threads within a team will execute the iterations of the `parallel`
        // loop.
        cudaBlocksPerGrid = loop_tripcount;
      }
      DP("Using %d teams due to loop trip count %" PRIu64 " and number of "
         "threads per block %d\n", cudaBlocksPerGrid, loop_tripcount,
         cudaThreadsPerBlock);
    } else {
      cudaBlocksPerGrid = DeviceInfo.NumTeams[device_id];
      DP("Using default number of teams %d\n", DeviceInfo.NumTeams[device_id]);
    }
  } else if (team_num > DeviceInfo.BlocksPerGrid[device_id]) {
    cudaBlocksPerGrid = DeviceInfo.BlocksPerGrid[device_id];
    DP("Capping number of teams to team limit %d\n",
       DeviceInfo.BlocksPerGrid[device_id]);
  } else {
    cudaBlocksPerGrid = team_num;
    DP("Using requested number of teams %d\n", team_num);
  }

  // Run on the device.
  DP("Launch kernel with %d blocks and %d threads\n", cudaBlocksPerGrid,
     cudaThreadsPerBlock);

  err = cuLaunchKernel(KernelInfo->Func, cudaBlocksPerGrid, 1, 1,
      cudaThreadsPerBlock, 1, 1, 0 /*bytes of shared memory*/, 0, &args[0], 0);
  if (err != CUDA_SUCCESS) {
    DP("Device kernel launch failed!\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  DP("Launch of entry point at " DPxMOD " successful!\n",
     DPxPTR(tgt_entry_ptr));

  CUresult sync_err = cuCtxSynchronize();
  if (sync_err != CUDA_SUCCESS) {
    DP("Kernel execution error at " DPxMOD "!\n", DPxPTR(tgt_entry_ptr));
    CUDA_ERR_STRING(sync_err);
    return OFFLOAD_FAIL;
  } else {
    DP("Kernel execution at " DPxMOD " successful!\n", DPxPTR(tgt_entry_ptr));
  }

  return OFFLOAD_SUCCESS;
}

int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
    void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num) {
  // Use one team and the default number of threads.
  const int32_t team_num = 1;
  const int32_t thread_limit = 0;
  return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args,
      tgt_offsets, arg_num, team_num, thread_limit, 0);
}

#ifdef __cplusplus
}
#endif
795