//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// RTL for CUDA machine
//
//===----------------------------------------------------------------------===//

#include <cassert>
#include <cstddef>
#include <cuda.h>
#include <list>
#include <string>
#include <vector>

#include "omptargetplugin.h"

#ifndef TARGET_NAME
#define TARGET_NAME CUDA
#endif

#ifdef OMPTARGET_DEBUG
static int DebugLevel = 0;
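// DebugLevel is read from the LIBOMPTARGET_DEBUG environment variable when the
// RTLDeviceInfoTy singleton below is constructed.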

#define GETNAME2(name) #name
#define GETNAME(name) GETNAME2(name)
#define DP(...) \
  do { \
    if (DebugLevel > 0) { \
      DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", __VA_ARGS__); \
    } \
  } while (false)

// Utility for retrieving and printing CUDA error string.
#define CUDA_ERR_STRING(err) \
  do { \
    if (DebugLevel > 0) { \
      const char *errStr; \
      cuGetErrorString(err, &errStr); \
      DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", "CUDA error is: %s\n", \
             errStr); \
    } \
  } while (false)
#else // OMPTARGET_DEBUG
#define DP(...) {}
#define CUDA_ERR_STRING(err) {}
#endif // OMPTARGET_DEBUG

#include "../../common/elf_common.c"

/// Keep entries table per device.
struct FuncOrGblEntryTy {
  __tgt_target_table Table;
  std::vector<__tgt_offload_entry> Entries;
};

enum ExecutionModeType {
  SPMD, // constructors, destructors,
        // combined constructs (`teams distribute parallel for [simd]`)
  GENERIC, // everything else
  NONE
};

/// Use a single entity to encode a kernel and a set of flags.
struct KernelTy {
  CUfunction Func;

  // execution mode of kernel
  // 0 - SPMD mode (without master warp)
  // 1 - Generic mode (with master warp)
  int8_t ExecutionMode;

  KernelTy(CUfunction _Func, int8_t _ExecutionMode)
      : Func(_Func), ExecutionMode(_ExecutionMode) {}
};

/// Device environment data
/// Manually sync with the deviceRTL side for now, move to a dedicated header
/// file later.
struct omptarget_device_environmentTy {
  int32_t debug_level;
};

/// List that contains all the kernels.
/// FIXME: we may need this to be per device and per library.
std::list<KernelTy> KernelsList;

/// Class containing all the device information.
class RTLDeviceInfoTy {
  std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries;

public:
  int NumberOfDevices;
  std::vector<CUmodule> Modules;
  std::vector<CUcontext> Contexts;

  // Device properties
  std::vector<int> ThreadsPerBlock;
  std::vector<int> BlocksPerGrid;
  std::vector<int> WarpSize;

  // OpenMP properties
  std::vector<int> NumTeams;
  std::vector<int> NumThreads;

  // OpenMP Environment properties
  int EnvNumTeams;
  int EnvTeamLimit;

  // OpenMP Requires Flags
  int64_t RequiresFlags;

  //static int EnvNumThreads;
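  // Hard caps applied on top of the device attributes queried in
  // __tgt_rtl_init_device, plus library defaults used when neither a device
  // attribute nor an environment variable provides a value.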
  static const int HardTeamLimit = 1<<16; // 64k
  static const int HardThreadLimit = 1024;
  static const int DefaultNumTeams = 128;
  static const int DefaultNumThreads = 128;

  // Record entry point associated with device
  void addOffloadEntry(int32_t device_id, __tgt_offload_entry entry) {
    assert(device_id < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();

    E.Entries.push_back(entry);
  }

  // Return true if the entry is associated with device
  bool findOffloadEntry(int32_t device_id, void *addr) {
    assert(device_id < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();

    for (auto &it : E.Entries) {
      if (it.addr == addr)
        return true;
    }

    return false;
  }

  // Return the pointer to the target entries table
  __tgt_target_table *getOffloadEntriesTable(int32_t device_id) {
    assert(device_id < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();

    int32_t size = E.Entries.size();

    // Table is empty
    if (!size)
      return 0;

    __tgt_offload_entry *begin = &E.Entries[0];
    __tgt_offload_entry *end = &E.Entries[size - 1];

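    // begin/end point directly into the Entries vector, so all entries for the
    // current image must have been added before this table is requested;
    // growing the vector afterwards would invalidate these pointers.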
    // Update table info according to the entries and return the pointer
    E.Table.EntriesBegin = begin;
    E.Table.EntriesEnd = ++end;

    return &E.Table;
  }

  // Clear entries table for a device
  void clearOffloadEntriesTable(int32_t device_id) {
    assert(device_id < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
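    // Start a fresh entry list for the next image loaded on this device;
    // tables built for previously loaded images stay alive in the per-device
    // list.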
    FuncGblEntries[device_id].emplace_back();
    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
    E.Entries.clear();
    E.Table.EntriesBegin = E.Table.EntriesEnd = 0;
  }

  RTLDeviceInfoTy() {
#ifdef OMPTARGET_DEBUG
    if (char *envStr = getenv("LIBOMPTARGET_DEBUG")) {
      DebugLevel = std::stoi(envStr);
    }
#endif // OMPTARGET_DEBUG

    DP("Start initializing CUDA\n");

    CUresult err = cuInit(0);
    if (err != CUDA_SUCCESS) {
      DP("Error when initializing CUDA\n");
      CUDA_ERR_STRING(err);
      return;
    }

    NumberOfDevices = 0;

    err = cuDeviceGetCount(&NumberOfDevices);
    if (err != CUDA_SUCCESS) {
      DP("Error when getting CUDA device count\n");
      CUDA_ERR_STRING(err);
      return;
    }

    if (NumberOfDevices == 0) {
      DP("There are no devices supporting CUDA.\n");
      return;
    }

    FuncGblEntries.resize(NumberOfDevices);
    Contexts.resize(NumberOfDevices);
    ThreadsPerBlock.resize(NumberOfDevices);
    BlocksPerGrid.resize(NumberOfDevices);
    WarpSize.resize(NumberOfDevices);
    NumTeams.resize(NumberOfDevices);
    NumThreads.resize(NumberOfDevices);

    // Get environment variables regarding teams
    char *envStr = getenv("OMP_TEAM_LIMIT");
    if (envStr) {
      // OMP_TEAM_LIMIT has been set
      EnvTeamLimit = std::stoi(envStr);
      DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
    } else {
      EnvTeamLimit = -1;
    }
    envStr = getenv("OMP_NUM_TEAMS");
    if (envStr) {
      // OMP_NUM_TEAMS has been set
      EnvNumTeams = std::stoi(envStr);
      DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
    } else {
      EnvNumTeams = -1;
    }

    // Default state.
    RequiresFlags = OMP_REQ_UNDEFINED;
  }

  ~RTLDeviceInfoTy() {
    // Close modules
    for (auto &module : Modules)
      if (module) {
        CUresult err = cuModuleUnload(module);
        if (err != CUDA_SUCCESS) {
          DP("Error when unloading CUDA module\n");
          CUDA_ERR_STRING(err);
        }
      }

    // Destroy contexts
    for (auto &ctx : Contexts)
      if (ctx) {
        CUresult err = cuCtxDestroy(ctx);
        if (err != CUDA_SUCCESS) {
          DP("Error when destroying CUDA context\n");
          CUDA_ERR_STRING(err);
        }
      }
  }
};

static RTLDeviceInfoTy DeviceInfo;

#ifdef __cplusplus
extern "C" {
#endif

int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
  return elf_check_machine(image, 190); // EM_CUDA = 190.
}

int32_t __tgt_rtl_number_of_devices() { return DeviceInfo.NumberOfDevices; }

int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
  DP("Init requires flags to %" PRId64 "\n", RequiresFlags);
  DeviceInfo.RequiresFlags = RequiresFlags;
  return RequiresFlags;
}

int32_t __tgt_rtl_init_device(int32_t device_id) {

  CUdevice cuDevice;
  DP("Getting device %d\n", device_id);
  CUresult err = cuDeviceGet(&cuDevice, device_id);
  if (err != CUDA_SUCCESS) {
    DP("Error when getting CUDA device with id = %d\n", device_id);
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  // Create the context and save it to use whenever this device is selected.
  err = cuCtxCreate(&DeviceInfo.Contexts[device_id], CU_CTX_SCHED_BLOCKING_SYNC,
                    cuDevice);
  if (err != CUDA_SUCCESS) {
    DP("Error when creating a CUDA context\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }
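  // cuCtxCreate also makes the new context current on this thread; the other
  // entry points re-activate it explicitly via cuCtxSetCurrent before issuing
  // driver API calls.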

  // Query attributes to determine number of threads/block and blocks/grid.
  int maxGridDimX;
  err = cuDeviceGetAttribute(&maxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
                             cuDevice);
  if (err != CUDA_SUCCESS) {
    DP("Error getting max grid dimension, use default\n");
    DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::DefaultNumTeams;
  } else if (maxGridDimX <= RTLDeviceInfoTy::HardTeamLimit) {
    DeviceInfo.BlocksPerGrid[device_id] = maxGridDimX;
    DP("Using %d CUDA blocks per grid\n", maxGridDimX);
  } else {
    DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::HardTeamLimit;
    DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping "
       "at the hard limit\n",
       maxGridDimX, RTLDeviceInfoTy::HardTeamLimit);
  }

  // We are only exploiting threads along the x axis.
  int maxBlockDimX;
  err = cuDeviceGetAttribute(&maxBlockDimX, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X,
                             cuDevice);
  if (err != CUDA_SUCCESS) {
    DP("Error getting max block dimension, use default\n");
    DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::DefaultNumThreads;
  } else if (maxBlockDimX <= RTLDeviceInfoTy::HardThreadLimit) {
    DeviceInfo.ThreadsPerBlock[device_id] = maxBlockDimX;
    DP("Using %d CUDA threads per block\n", maxBlockDimX);
  } else {
    DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::HardThreadLimit;
    DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
       "capping at the hard limit\n",
       maxBlockDimX, RTLDeviceInfoTy::HardThreadLimit);
  }

  int warpSize;
  err =
      cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, cuDevice);
  if (err != CUDA_SUCCESS) {
    DP("Error getting warp size, assume default\n");
    DeviceInfo.WarpSize[device_id] = 32;
  } else {
    DeviceInfo.WarpSize[device_id] = warpSize;
  }

  // Adjust teams to the env variables
  if (DeviceInfo.EnvTeamLimit > 0 &&
      DeviceInfo.BlocksPerGrid[device_id] > DeviceInfo.EnvTeamLimit) {
    DeviceInfo.BlocksPerGrid[device_id] = DeviceInfo.EnvTeamLimit;
    DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
        DeviceInfo.EnvTeamLimit);
  }

  DP("Max number of CUDA blocks %d, threads %d & warp size %d\n",
     DeviceInfo.BlocksPerGrid[device_id], DeviceInfo.ThreadsPerBlock[device_id],
     DeviceInfo.WarpSize[device_id]);

  // Set default number of teams
  if (DeviceInfo.EnvNumTeams > 0) {
    DeviceInfo.NumTeams[device_id] = DeviceInfo.EnvNumTeams;
    DP("Default number of teams set according to environment %d\n",
        DeviceInfo.EnvNumTeams);
  } else {
    DeviceInfo.NumTeams[device_id] = RTLDeviceInfoTy::DefaultNumTeams;
    DP("Default number of teams set according to library's default %d\n",
        RTLDeviceInfoTy::DefaultNumTeams);
  }
  if (DeviceInfo.NumTeams[device_id] > DeviceInfo.BlocksPerGrid[device_id]) {
    DeviceInfo.NumTeams[device_id] = DeviceInfo.BlocksPerGrid[device_id];
    DP("Default number of teams exceeds device limit, capping at %d\n",
        DeviceInfo.BlocksPerGrid[device_id]);
  }

  // Set default number of threads
  DeviceInfo.NumThreads[device_id] = RTLDeviceInfoTy::DefaultNumThreads;
  DP("Default number of threads set according to library's default %d\n",
        RTLDeviceInfoTy::DefaultNumThreads);
  if (DeviceInfo.NumThreads[device_id] >
      DeviceInfo.ThreadsPerBlock[device_id]) {
    DeviceInfo.NumThreads[device_id] = DeviceInfo.ThreadsPerBlock[device_id];
    DP("Default number of threads exceeds device limit, capping at %d\n",
        DeviceInfo.ThreadsPerBlock[device_id]);
  }

  return OFFLOAD_SUCCESS;
}

__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
    __tgt_device_image *image) {

  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error when setting a CUDA context for device %d\n", device_id);
    CUDA_ERR_STRING(err);
    return NULL;
  }

  // Clear the offload table as we are going to create a new one.
  DeviceInfo.clearOffloadEntriesTable(device_id);

  // Create the module and extract the function pointers.

  CUmodule cumod;
  DP("Load data from image " DPxMOD "\n", DPxPTR(image->ImageStart));
  err = cuModuleLoadDataEx(&cumod, image->ImageStart, 0, NULL, NULL);
  if (err != CUDA_SUCCESS) {
    DP("Error when loading CUDA module\n");
    CUDA_ERR_STRING(err);
    return NULL;
  }

  DP("CUDA module successfully loaded!\n");
  DeviceInfo.Modules.push_back(cumod);

  // Find the symbols in the module by name.
  __tgt_offload_entry *HostBegin = image->EntriesBegin;
  __tgt_offload_entry *HostEnd = image->EntriesEnd;

  for (__tgt_offload_entry *e = HostBegin; e != HostEnd; ++e) {

    if (!e->addr) {
      // We return NULL when something like this happens; the host must always
      // provide an address that uniquely identifies the target region.
      DP("Invalid binary: host entry '<null>' (size = %zd)...\n", e->size);

      return NULL;
    }

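    // Entries with a non-zero size describe global variables; entries with
    // size zero describe target region kernels.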
    if (e->size) {
      __tgt_offload_entry entry = *e;

      CUdeviceptr cuptr;
      size_t cusize;
      err = cuModuleGetGlobal(&cuptr, &cusize, cumod, e->name);

      if (err != CUDA_SUCCESS) {
        DP("Loading global '%s' (Failed)\n", e->name);
        CUDA_ERR_STRING(err);
        return NULL;
      }

      if (cusize != e->size) {
        DP("Loading global '%s' - size mismatch (%zd != %zd)\n", e->name,
            cusize, e->size);
        return NULL;
      }

      DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
          DPxPTR(e - HostBegin), e->name, DPxPTR(cuptr));
      entry.addr = (void *)cuptr;
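      // Report the device address for this global so that libomptarget can
      // associate the host variable with its device counterpart.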

      // Note: In the current implementation declare target variables
      // can either be link or to. This means that once unified
      // memory is activated via the requires directive, the variable
      // can be used directly from the host in both cases.
      // TODO: when variables types other than to or link are added,
      // the below condition should be changed to explicitly
      // check for to and link variables types:
      //  (DeviceInfo.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
      //   (e->flags & OMP_DECLARE_TARGET_LINK ||
      //    e->flags == OMP_DECLARE_TARGET_TO))
      if (DeviceInfo.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
        // If unified memory is present any target link or to variables
        // can access host addresses directly. There is no longer a
        // need for device copies.
        cuMemcpyHtoD(cuptr, e->addr, sizeof(void *));
        DP("Copy linked variable host address (" DPxMOD ") "
           "to device address (" DPxMOD ")\n",
          DPxPTR(*((void**)e->addr)), DPxPTR(cuptr));
      }

      DeviceInfo.addOffloadEntry(device_id, entry);

      continue;
    }

    CUfunction fun;
    err = cuModuleGetFunction(&fun, cumod, e->name);

    if (err != CUDA_SUCCESS) {
      DP("Loading '%s' (Failed)\n", e->name);
      CUDA_ERR_STRING(err);
      return NULL;
    }

    DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
        DPxPTR(e - HostBegin), e->name, DPxPTR(fun));

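    // Each kernel is accompanied by a global named "<kernel name>_exec_mode"
    // that records how the kernel was generated: 0 for SPMD, 1 for GENERIC
    // (see the ExecutionModeType enum above).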
    // default value GENERIC (in case symbol is missing from cubin file)
    int8_t ExecModeVal = ExecutionModeType::GENERIC;
    std::string ExecModeNameStr(e->name);
    ExecModeNameStr += "_exec_mode";
    const char *ExecModeName = ExecModeNameStr.c_str();

    CUdeviceptr ExecModePtr;
    size_t cusize;
    err = cuModuleGetGlobal(&ExecModePtr, &cusize, cumod, ExecModeName);
    if (err == CUDA_SUCCESS) {
      if ((size_t)cusize != sizeof(int8_t)) {
        DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
           ExecModeName, cusize, sizeof(int8_t));
        return NULL;
      }

      err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, cusize);
      if (err != CUDA_SUCCESS) {
        DP("Error when copying data from device to host. Pointers: "
           "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
           DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), cusize);
        CUDA_ERR_STRING(err);
        return NULL;
      }

      if (ExecModeVal < 0 || ExecModeVal > 1) {
        DP("Error wrong exec_mode value specified in cubin file: %d\n",
           ExecModeVal);
        return NULL;
      }
    } else {
      DP("Loading global exec_mode '%s' - symbol missing, using default value "
          "GENERIC (1)\n", ExecModeName);
      CUDA_ERR_STRING(err);
    }

    KernelsList.push_back(KernelTy(fun, ExecModeVal));

    __tgt_offload_entry entry = *e;
    entry.addr = (void *)&KernelsList.back();
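    // libomptarget stores this pointer in its device tables and hands it back
    // as tgt_entry_ptr when the target region is launched.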
    DeviceInfo.addOffloadEntry(device_id, entry);
  }

  // send device environment data to the device
  {
    omptarget_device_environmentTy device_env;

    device_env.debug_level = 0;

#ifdef OMPTARGET_DEBUG
    if (char *envStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) {
      device_env.debug_level = std::stoi(envStr);
    }
#endif

    const char *device_env_Name = "omptarget_device_environment";
    CUdeviceptr device_env_Ptr;
    size_t cusize;

    err = cuModuleGetGlobal(&device_env_Ptr, &cusize, cumod, device_env_Name);

    if (err == CUDA_SUCCESS) {
      if ((size_t)cusize != sizeof(device_env)) {
        DP("Global device_environment '%s' - size mismatch (%zu != %zu)\n",
            device_env_Name, cusize, sizeof(device_env));
        return NULL;
      }

      err = cuMemcpyHtoD(device_env_Ptr, &device_env, cusize);
      if (err != CUDA_SUCCESS) {
        DP("Error when copying data from host to device. Pointers: "
            "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
            DPxPTR(&device_env), DPxPTR(device_env_Ptr), cusize);
        CUDA_ERR_STRING(err);
        return NULL;
      }

      DP("Sending global device environment data %zu bytes\n", (size_t)cusize);
    } else {
      DP("Finding global device environment '%s' - symbol missing.\n",
         device_env_Name);
      DP("Continue, considering this is a device RTL which does not accept "
         "environment setting.\n");
    }
  }

  return DeviceInfo.getOffloadEntriesTable(device_id);
}

void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *hst_ptr) {
  if (size == 0) {
    return NULL;
  }

  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error while trying to set CUDA current context\n");
    CUDA_ERR_STRING(err);
    return NULL;
  }

  CUdeviceptr ptr;
  err = cuMemAlloc(&ptr, size);
  if (err != CUDA_SUCCESS) {
    DP("Error while trying to allocate device memory, CUDA error %d\n", err);
    CUDA_ERR_STRING(err);
    return NULL;
  }

  void *vptr = (void *)ptr;
  return vptr;
}

int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
    int64_t size) {
  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error when setting CUDA context\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  err = cuMemcpyHtoD((CUdeviceptr)tgt_ptr, hst_ptr, size);
  if (err != CUDA_SUCCESS) {
    DP("Error when copying data from host to device. Pointers: host = " DPxMOD
       ", device = " DPxMOD ", size = %" PRId64 "\n", DPxPTR(hst_ptr),
       DPxPTR(tgt_ptr), size);
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }
  return OFFLOAD_SUCCESS;
}

int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
    int64_t size) {
  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error when setting CUDA context\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  err = cuMemcpyDtoH(hst_ptr, (CUdeviceptr)tgt_ptr, size);
  if (err != CUDA_SUCCESS) {
    DP("Error when copying data from device to host. Pointers: host = " DPxMOD
        ", device = " DPxMOD ", size = %" PRId64 "\n", DPxPTR(hst_ptr),
        DPxPTR(tgt_ptr), size);
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }
  return OFFLOAD_SUCCESS;
}

int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error when setting CUDA context\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  err = cuMemFree((CUdeviceptr)tgt_ptr);
  if (err != CUDA_SUCCESS) {
    DP("Error when freeing CUDA memory\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }
  return OFFLOAD_SUCCESS;
}

int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
    void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
    int32_t thread_limit, uint64_t loop_tripcount) {
  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error when setting CUDA context\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  // All args are references.
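  // cuLaunchKernel expects an array of pointers to the argument values, so the
  // adjusted device pointers are kept alive in ptrs[] and their addresses are
  // collected in args[].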
  std::vector<void *> args(arg_num);
  std::vector<void *> ptrs(arg_num);

  for (int32_t i = 0; i < arg_num; ++i) {
    ptrs[i] = (void *)((intptr_t)tgt_args[i] + tgt_offsets[i]);
    args[i] = &ptrs[i];
  }

  KernelTy *KernelInfo = (KernelTy *)tgt_entry_ptr;

  int cudaThreadsPerBlock;

  if (thread_limit > 0) {
    cudaThreadsPerBlock = thread_limit;
    DP("Setting CUDA threads per block to requested %d\n", thread_limit);
    // Add master warp if necessary
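    // (in GENERIC mode one extra warp is reserved for the team's master
    // thread, on top of the worker threads requested via thread_limit)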
    if (KernelInfo->ExecutionMode == GENERIC) {
      cudaThreadsPerBlock += DeviceInfo.WarpSize[device_id];
      DP("Adding master warp: +%d threads\n", DeviceInfo.WarpSize[device_id]);
    }
  } else {
    cudaThreadsPerBlock = DeviceInfo.NumThreads[device_id];
    DP("Setting CUDA threads per block to default %d\n",
        DeviceInfo.NumThreads[device_id]);
  }

  if (cudaThreadsPerBlock > DeviceInfo.ThreadsPerBlock[device_id]) {
    cudaThreadsPerBlock = DeviceInfo.ThreadsPerBlock[device_id];
    DP("Threads per block capped at device limit %d\n",
        DeviceInfo.ThreadsPerBlock[device_id]);
  }

  int kernel_limit;
  err = cuFuncGetAttribute(&kernel_limit,
      CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, KernelInfo->Func);
  if (err == CUDA_SUCCESS) {
    if (kernel_limit < cudaThreadsPerBlock) {
      cudaThreadsPerBlock = kernel_limit;
      DP("Threads per block capped at kernel limit %d\n", kernel_limit);
    }
  }

  int cudaBlocksPerGrid;
  if (team_num <= 0) {
    if (loop_tripcount > 0 && DeviceInfo.EnvNumTeams < 0) {
      if (KernelInfo->ExecutionMode == SPMD) {
        // We have a combined construct, i.e. `target teams distribute parallel
        // for [simd]`. We launch so many teams so that each thread will
        // execute one iteration of the loop.
        // round up to the nearest integer
        cudaBlocksPerGrid = ((loop_tripcount - 1) / cudaThreadsPerBlock) + 1;
      } else {
        // If we reach this point, then we have a non-combined construct, i.e.
        // `teams distribute` with a nested `parallel for` and each team is
        // assigned one iteration of the `distribute` loop. E.g.:
        //
        // #pragma omp target teams distribute
        // for(...loop_tripcount...) {
        //   #pragma omp parallel for
        //   for(...) {}
        // }
        //
        // Threads within a team will execute the iterations of the `parallel`
        // loop.
        cudaBlocksPerGrid = loop_tripcount;
      }
      DP("Using %d teams due to loop trip count %" PRIu64 " and number of "
          "threads per block %d\n", cudaBlocksPerGrid, loop_tripcount,
          cudaThreadsPerBlock);
    } else {
      cudaBlocksPerGrid = DeviceInfo.NumTeams[device_id];
      DP("Using default number of teams %d\n", DeviceInfo.NumTeams[device_id]);
    }
  } else if (team_num > DeviceInfo.BlocksPerGrid[device_id]) {
    cudaBlocksPerGrid = DeviceInfo.BlocksPerGrid[device_id];
    DP("Capping number of teams to team limit %d\n",
        DeviceInfo.BlocksPerGrid[device_id]);
  } else {
    cudaBlocksPerGrid = team_num;
    DP("Using requested number of teams %d\n", team_num);
  }

  // Run on the device.
  DP("Launch kernel with %d blocks and %d threads\n", cudaBlocksPerGrid,
     cudaThreadsPerBlock);

  err = cuLaunchKernel(KernelInfo->Func, cudaBlocksPerGrid, 1, 1,
      cudaThreadsPerBlock, 1, 1, 0 /*bytes of shared memory*/, 0, &args[0], 0);
  if (err != CUDA_SUCCESS) {
    DP("Device kernel launch failed!\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  DP("Launch of entry point at " DPxMOD " successful!\n",
      DPxPTR(tgt_entry_ptr));

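  // The launch is asynchronous; synchronize here so that execution errors are
  // detected and reported from this entry point.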
  CUresult sync_err = cuCtxSynchronize();
  if (sync_err != CUDA_SUCCESS) {
    DP("Kernel execution error at " DPxMOD "!\n", DPxPTR(tgt_entry_ptr));
    CUDA_ERR_STRING(sync_err);
    return OFFLOAD_FAIL;
  } else {
    DP("Kernel execution at " DPxMOD " successful!\n", DPxPTR(tgt_entry_ptr));
  }

  return OFFLOAD_SUCCESS;
}

int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
    void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num) {
  // use one team and the default number of threads.
  const int32_t team_num = 1;
  const int32_t thread_limit = 0;
  return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args,
      tgt_offsets, arg_num, team_num, thread_limit, 0);
}

#ifdef __cplusplus
}
#endif