1 //===------ omptarget.cpp - Target independent OpenMP target RTL -- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // Implementation of the interface to be used by Clang during the codegen of a
10 // target region.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "omptarget.h"
15 #include "device.h"
16 #include "private.h"
17 #include "rtl.h"
18 
19 #include <cassert>
20 #include <cstdint>
21 #include <vector>
22 
23 int AsyncInfoTy::synchronize() {
24   int Result = OFFLOAD_SUCCESS;
25   if (AsyncInfo.Queue) {
26     // If we have a queue we need to synchronize it now.
27     Result = Device.synchronize(*this);
28     assert(AsyncInfo.Queue == nullptr &&
29            "The device plugin should have nulled the queue to indicate there "
30            "are no outstanding actions!");
31   }
32   return Result;
33 }
34 
35 void *&AsyncInfoTy::getVoidPtrLocation() {
36   BufferLocations.push_back(nullptr);
37   return BufferLocations.back();
38 }
39 
40 /* All begin addresses for partially mapped structs must be 8-aligned in order
41  * to ensure proper alignment of members. E.g.
42  *
43  * struct S {
44  *   int a;   // 4-aligned
45  *   int b;   // 4-aligned
46  *   int *p;  // 8-aligned
47  * } s1;
48  * ...
49  * #pragma omp target map(tofrom: s1.b, s1.p[0:N])
50  * {
51  *   s1.b = 5;
52  *   for (int i...) s1.p[i] = ...;
53  * }
54  *
55  * Here we are mapping s1 starting from member b, so BaseAddress=&s1=&s1.a and
56  * BeginAddress=&s1.b. Let's assume that the struct begins at address 0x100,
57  * then &s1.a=0x100, &s1.b=0x104, &s1.p=0x108. Each member obeys the alignment
58  * requirements for its type. Now, when we allocate memory on the device, in
59  * CUDA's case cuMemAlloc() returns an address which is at least 256-aligned.
60  * This means that the chunk of the struct on the device will start at a
61  * 256-aligned address, let's say 0x200. Then the address of b will be 0x200 and
62  * address of p will be a misaligned 0x204 (on the host there was no need to add
63  * padding between b and p, so p comes exactly 4 bytes after b). If the device
64  * kernel tries to access s1.p, a misaligned address error occurs (as reported
65  * by the CUDA plugin). By padding the begin address down to a multiple of 8 and
66  * extending the size of the allocated chuck accordingly, the chuck on the
67  * device will start at 0x200 with the padding (4 bytes), then &s1.b=0x204 and
68  * &s1.p=0x208, as they should be to satisfy the alignment requirements.
69  */
70 static const int64_t Alignment = 8;
71 
72 /// Map global data and execute pending ctors
73 static int InitLibrary(DeviceTy &Device) {
74   /*
75    * Map global data
76    */
77   int32_t device_id = Device.DeviceID;
78   int rc = OFFLOAD_SUCCESS;
79   bool supportsEmptyImages = Device.RTL->supports_empty_images &&
80                              Device.RTL->supports_empty_images() > 0;
81 
82   std::lock_guard<decltype(Device.PendingGlobalsMtx)> LG(
83       Device.PendingGlobalsMtx);
84   {
85     std::lock_guard<decltype(PM->TrlTblMtx)> LG(PM->TrlTblMtx);
86     for (auto *HostEntriesBegin : PM->HostEntriesBeginRegistrationOrder) {
87       TranslationTable *TransTable =
88           &PM->HostEntriesBeginToTransTable[HostEntriesBegin];
89       if (TransTable->HostTable.EntriesBegin ==
90               TransTable->HostTable.EntriesEnd &&
91           !supportsEmptyImages) {
92         // No host entry so no need to proceed
93         continue;
94       }
95 
96       if (TransTable->TargetsTable[device_id] != 0) {
97         // Library entries have already been processed
98         continue;
99       }
100 
101       // 1) get image.
102       assert(TransTable->TargetsImages.size() > (size_t)device_id &&
103              "Not expecting a device ID outside the table's bounds!");
104       __tgt_device_image *img = TransTable->TargetsImages[device_id];
105       if (!img) {
106         REPORT("No image loaded for device id %d.\n", device_id);
107         rc = OFFLOAD_FAIL;
108         break;
109       }
110       // 2) load image into the target table.
111       __tgt_target_table *TargetTable = TransTable->TargetsTable[device_id] =
112           Device.load_binary(img);
113       // Unable to get table for this image: invalidate image and fail.
114       if (!TargetTable) {
115         REPORT("Unable to generate entries table for device id %d.\n",
116                device_id);
117         TransTable->TargetsImages[device_id] = 0;
118         rc = OFFLOAD_FAIL;
119         break;
120       }
121 
122       // Verify whether the two table sizes match.
123       size_t hsize =
124           TransTable->HostTable.EntriesEnd - TransTable->HostTable.EntriesBegin;
125       size_t tsize = TargetTable->EntriesEnd - TargetTable->EntriesBegin;
126 
127       // Invalid image for these host entries!
128       if (hsize != tsize) {
129         REPORT(
130             "Host and Target tables mismatch for device id %d [%zx != %zx].\n",
131             device_id, hsize, tsize);
132         TransTable->TargetsImages[device_id] = 0;
133         TransTable->TargetsTable[device_id] = 0;
134         rc = OFFLOAD_FAIL;
135         break;
136       }
137 
138       DeviceTy::HDTTMapAccessorTy HDTTMap =
139           Device.HostDataToTargetMap.getExclusiveAccessor();
140 
141       __tgt_target_table *HostTable = &TransTable->HostTable;
142       for (__tgt_offload_entry *CurrDeviceEntry = TargetTable->EntriesBegin,
143                                *CurrHostEntry = HostTable->EntriesBegin,
144                                *EntryDeviceEnd = TargetTable->EntriesEnd;
145            CurrDeviceEntry != EntryDeviceEnd;
146            CurrDeviceEntry++, CurrHostEntry++) {
147         if (CurrDeviceEntry->size != 0) {
148           // has data.
149           assert(CurrDeviceEntry->size == CurrHostEntry->size &&
150                  "data size mismatch");
151 
152           // Fortran may use multiple weak declarations for the same symbol,
153           // therefore we must allow for multiple weak symbols to be loaded from
154           // the fat binary. Treat these mappings as any other "regular"
155           // mapping. Add entry to map.
156           if (Device.getTgtPtrBegin(HDTTMap, CurrHostEntry->addr,
157                                     CurrHostEntry->size))
158             continue;
159 
160           DP("Add mapping from host " DPxMOD " to device " DPxMOD
161              " with size %zu"
162              "\n",
163              DPxPTR(CurrHostEntry->addr), DPxPTR(CurrDeviceEntry->addr),
164              CurrDeviceEntry->size);
165           HDTTMap->emplace(new HostDataToTargetTy(
166               (uintptr_t)CurrHostEntry->addr /*HstPtrBase*/,
167               (uintptr_t)CurrHostEntry->addr /*HstPtrBegin*/,
168               (uintptr_t)CurrHostEntry->addr +
169                   CurrHostEntry->size /*HstPtrEnd*/,
170               (uintptr_t)CurrDeviceEntry->addr /*TgtPtrBegin*/,
171               false /*UseHoldRefCount*/, CurrHostEntry->name,
172               true /*IsRefCountINF*/));
173         }
174       }
175     }
176   }
177 
178   if (rc != OFFLOAD_SUCCESS) {
179     return rc;
180   }
181 
182   /*
183    * Run ctors for static objects
184    */
185   if (!Device.PendingCtorsDtors.empty()) {
186     AsyncInfoTy AsyncInfo(Device);
187     // Call all ctors for all libraries registered so far
188     for (auto &lib : Device.PendingCtorsDtors) {
189       if (!lib.second.PendingCtors.empty()) {
190         DP("Has pending ctors... call now\n");
191         for (auto &entry : lib.second.PendingCtors) {
192           void *ctor = entry;
193           int rc =
194               target(nullptr, Device, ctor, 0, nullptr, nullptr, nullptr,
195                      nullptr, nullptr, nullptr, 1, 1, true /*team*/, AsyncInfo);
196           if (rc != OFFLOAD_SUCCESS) {
197             REPORT("Running ctor " DPxMOD " failed.\n", DPxPTR(ctor));
198             return OFFLOAD_FAIL;
199           }
200         }
201         // Clear the list to indicate that this device has been used
202         lib.second.PendingCtors.clear();
203         DP("Done with pending ctors for lib " DPxMOD "\n", DPxPTR(lib.first));
204       }
205     }
206     // All constructors have been issued, wait for them now.
207     if (AsyncInfo.synchronize() != OFFLOAD_SUCCESS)
208       return OFFLOAD_FAIL;
209   }
210   Device.HasPendingGlobals = false;
211 
212   return OFFLOAD_SUCCESS;
213 }
214 
215 void handleTargetOutcome(bool Success, ident_t *Loc) {
216   switch (PM->TargetOffloadPolicy) {
217   case tgt_disabled:
218     if (Success) {
219       FATAL_MESSAGE0(1, "expected no offloading while offloading is disabled");
220     }
221     break;
222   case tgt_default:
223     FATAL_MESSAGE0(1, "default offloading policy must be switched to "
224                       "mandatory or disabled");
225     break;
226   case tgt_mandatory:
227     if (!Success) {
228       if (getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE)
229         for (auto &Device : PM->Devices)
230           dumpTargetPointerMappings(Loc, *Device);
231       else
232         FAILURE_MESSAGE("Consult https://openmp.llvm.org/design/Runtimes.html "
233                         "for debugging options.\n");
234 
235       SourceInfo info(Loc);
236       if (info.isAvailible())
237         fprintf(stderr, "%s:%d:%d: ", info.getFilename(), info.getLine(),
238                 info.getColumn());
239       else
240         FAILURE_MESSAGE("Source location information not present. Compile with "
241                         "-g or -gline-tables-only.\n");
242       FATAL_MESSAGE0(
243           1, "failure of target construct while offloading is mandatory");
244     } else {
245       if (getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE)
246         for (auto &Device : PM->Devices)
247           dumpTargetPointerMappings(Loc, *Device);
248     }
249     break;
250   }
251 }
252 
253 static void handleDefaultTargetOffload() {
254   std::lock_guard<decltype(PM->TargetOffloadMtx)> LG(PM->TargetOffloadMtx);
255   if (PM->TargetOffloadPolicy == tgt_default) {
256     if (omp_get_num_devices() > 0) {
257       DP("Default TARGET OFFLOAD policy is now mandatory "
258          "(devices were found)\n");
259       PM->TargetOffloadPolicy = tgt_mandatory;
260     } else {
261       DP("Default TARGET OFFLOAD policy is now disabled "
262          "(no devices were found)\n");
263       PM->TargetOffloadPolicy = tgt_disabled;
264     }
265   }
266 }
267 
268 static bool isOffloadDisabled() {
269   if (PM->TargetOffloadPolicy == tgt_default)
270     handleDefaultTargetOffload();
271   return PM->TargetOffloadPolicy == tgt_disabled;
272 }
273 
274 // If offload is enabled, ensure that device DeviceID has been initialized,
275 // global ctors have been executed, and global data has been mapped.
276 //
277 // The return bool indicates if the offload is to the host device
278 // There are three possible results:
279 // - Return false if the taregt device is ready for offload
280 // - Return true without reporting a runtime error if offload is
281 //   disabled, perhaps because the initial device was specified.
282 // - Report a runtime error and return true.
283 //
284 // If DeviceID == OFFLOAD_DEVICE_DEFAULT, set DeviceID to the default device.
285 // This step might be skipped if offload is disabled.
286 bool checkDeviceAndCtors(int64_t &DeviceID, ident_t *Loc) {
287   if (isOffloadDisabled()) {
288     DP("Offload is disabled\n");
289     return true;
290   }
291 
292   if (DeviceID == OFFLOAD_DEVICE_DEFAULT) {
293     DeviceID = omp_get_default_device();
294     DP("Use default device id %" PRId64 "\n", DeviceID);
295   }
296 
297   // Proposed behavior for OpenMP 5.2 in OpenMP spec github issue 2669.
298   if (omp_get_num_devices() == 0) {
299     DP("omp_get_num_devices() == 0 but offload is manadatory\n");
300     handleTargetOutcome(false, Loc);
301     return true;
302   }
303 
304   if (DeviceID == omp_get_initial_device()) {
305     DP("Device is host (%" PRId64 "), returning as if offload is disabled\n",
306        DeviceID);
307     return true;
308   }
309 
310   // Is device ready?
311   if (!device_is_ready(DeviceID)) {
312     REPORT("Device %" PRId64 " is not ready.\n", DeviceID);
313     handleTargetOutcome(false, Loc);
314     return true;
315   }
316 
317   // Get device info.
318   DeviceTy &Device = *PM->Devices[DeviceID];
319 
320   // Check whether global data has been mapped for this device
321   bool HasPendingGlobals;
322   {
323     std::lock_guard<decltype(Device.PendingGlobalsMtx)> LG(
324         Device.PendingGlobalsMtx);
325     HasPendingGlobals = Device.HasPendingGlobals;
326   }
327   if (HasPendingGlobals && InitLibrary(Device) != OFFLOAD_SUCCESS) {
328     REPORT("Failed to init globals on device %" PRId64 "\n", DeviceID);
329     handleTargetOutcome(false, Loc);
330     return true;
331   }
332 
333   return false;
334 }
335 
336 static int32_t getParentIndex(int64_t type) {
337   return ((type & OMP_TGT_MAPTYPE_MEMBER_OF) >> 48) - 1;
338 }
339 
340 void *targetAllocExplicit(size_t size, int device_num, int kind,
341                           const char *name) {
342   TIMESCOPE();
343   DP("Call to %s for device %d requesting %zu bytes\n", name, device_num, size);
344 
345   if (size <= 0) {
346     DP("Call to %s with non-positive length\n", name);
347     return NULL;
348   }
349 
350   void *rc = NULL;
351 
352   if (device_num == omp_get_initial_device()) {
353     rc = malloc(size);
354     DP("%s returns host ptr " DPxMOD "\n", name, DPxPTR(rc));
355     return rc;
356   }
357 
358   if (!device_is_ready(device_num)) {
359     DP("%s returns NULL ptr\n", name);
360     return NULL;
361   }
362 
363   DeviceTy &Device = *PM->Devices[device_num];
364   rc = Device.allocData(size, nullptr, kind);
365   DP("%s returns device ptr " DPxMOD "\n", name, DPxPTR(rc));
366   return rc;
367 }
368 
369 /// Call the user-defined mapper function followed by the appropriate
370 // targetData* function (targetData{Begin,End,Update}).
371 int targetDataMapper(ident_t *loc, DeviceTy &Device, void *arg_base, void *arg,
372                      int64_t arg_size, int64_t arg_type,
373                      map_var_info_t arg_names, void *arg_mapper,
374                      AsyncInfoTy &AsyncInfo,
375                      TargetDataFuncPtrTy target_data_function) {
376   TIMESCOPE_WITH_IDENT(loc);
377   DP("Calling the mapper function " DPxMOD "\n", DPxPTR(arg_mapper));
378 
379   // The mapper function fills up Components.
380   MapperComponentsTy MapperComponents;
381   MapperFuncPtrTy MapperFuncPtr = (MapperFuncPtrTy)(arg_mapper);
382   (*MapperFuncPtr)((void *)&MapperComponents, arg_base, arg, arg_size, arg_type,
383                    arg_names);
384 
385   // Construct new arrays for args_base, args, arg_sizes and arg_types
386   // using the information in MapperComponents and call the corresponding
387   // targetData* function using these new arrays.
388   std::vector<void *> MapperArgsBase(MapperComponents.Components.size());
389   std::vector<void *> MapperArgs(MapperComponents.Components.size());
390   std::vector<int64_t> MapperArgSizes(MapperComponents.Components.size());
391   std::vector<int64_t> MapperArgTypes(MapperComponents.Components.size());
392   std::vector<void *> MapperArgNames(MapperComponents.Components.size());
393 
394   for (unsigned I = 0, E = MapperComponents.Components.size(); I < E; ++I) {
395     auto &C = MapperComponents.Components[I];
396     MapperArgsBase[I] = C.Base;
397     MapperArgs[I] = C.Begin;
398     MapperArgSizes[I] = C.Size;
399     MapperArgTypes[I] = C.Type;
400     MapperArgNames[I] = C.Name;
401   }
402 
403   int rc = target_data_function(loc, Device, MapperComponents.Components.size(),
404                                 MapperArgsBase.data(), MapperArgs.data(),
405                                 MapperArgSizes.data(), MapperArgTypes.data(),
406                                 MapperArgNames.data(), /*arg_mappers*/ nullptr,
407                                 AsyncInfo, /*FromMapper=*/true);
408 
409   return rc;
410 }
411 
412 /// Internal function to do the mapping and transfer the data to the device
413 int targetDataBegin(ident_t *loc, DeviceTy &Device, int32_t arg_num,
414                     void **args_base, void **args, int64_t *arg_sizes,
415                     int64_t *arg_types, map_var_info_t *arg_names,
416                     void **arg_mappers, AsyncInfoTy &AsyncInfo,
417                     bool FromMapper) {
418   // process each input.
419   for (int32_t i = 0; i < arg_num; ++i) {
420     // Ignore private variables and arrays - there is no mapping for them.
421     if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) ||
422         (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE))
423       continue;
424 
425     if (arg_mappers && arg_mappers[i]) {
426       // Instead of executing the regular path of targetDataBegin, call the
427       // targetDataMapper variant which will call targetDataBegin again
428       // with new arguments.
429       DP("Calling targetDataMapper for the %dth argument\n", i);
430 
431       map_var_info_t arg_name = (!arg_names) ? nullptr : arg_names[i];
432       int rc = targetDataMapper(loc, Device, args_base[i], args[i],
433                                 arg_sizes[i], arg_types[i], arg_name,
434                                 arg_mappers[i], AsyncInfo, targetDataBegin);
435 
436       if (rc != OFFLOAD_SUCCESS) {
437         REPORT("Call to targetDataBegin via targetDataMapper for custom mapper"
438                " failed.\n");
439         return OFFLOAD_FAIL;
440       }
441 
442       // Skip the rest of this function, continue to the next argument.
443       continue;
444     }
445 
446     void *HstPtrBegin = args[i];
447     void *HstPtrBase = args_base[i];
448     int64_t data_size = arg_sizes[i];
449     map_var_info_t HstPtrName = (!arg_names) ? nullptr : arg_names[i];
450 
451     // Adjust for proper alignment if this is a combined entry (for structs).
452     // Look at the next argument - if that is MEMBER_OF this one, then this one
453     // is a combined entry.
454     int64_t padding = 0;
455     const int next_i = i + 1;
456     if (getParentIndex(arg_types[i]) < 0 && next_i < arg_num &&
457         getParentIndex(arg_types[next_i]) == i) {
458       padding = (int64_t)HstPtrBegin % Alignment;
459       if (padding) {
460         DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD
461            "\n",
462            padding, DPxPTR(HstPtrBegin));
463         HstPtrBegin = (char *)HstPtrBegin - padding;
464         data_size += padding;
465       }
466     }
467 
468     // Address of pointer on the host and device, respectively.
469     void *Pointer_HstPtrBegin, *PointerTgtPtrBegin;
470     TargetPointerResultTy Pointer_TPR;
471     bool IsHostPtr = false;
472     bool IsImplicit = arg_types[i] & OMP_TGT_MAPTYPE_IMPLICIT;
473     // Force the creation of a device side copy of the data when:
474     // a close map modifier was associated with a map that contained a to.
475     bool HasCloseModifier = arg_types[i] & OMP_TGT_MAPTYPE_CLOSE;
476     bool HasPresentModifier = arg_types[i] & OMP_TGT_MAPTYPE_PRESENT;
477     bool HasHoldModifier = arg_types[i] & OMP_TGT_MAPTYPE_OMPX_HOLD;
478     // UpdateRef is based on MEMBER_OF instead of TARGET_PARAM because if we
479     // have reached this point via __tgt_target_data_begin and not __tgt_target
480     // then no argument is marked as TARGET_PARAM ("omp target data map" is not
481     // associated with a target region, so there are no target parameters). This
482     // may be considered a hack, we could revise the scheme in the future.
483     bool UpdateRef =
484         !(arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) && !(FromMapper && i == 0);
485     if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) {
486       DP("Has a pointer entry: \n");
487       // Base is address of pointer.
488       //
489       // Usually, the pointer is already allocated by this time.  For example:
490       //
491       //   #pragma omp target map(s.p[0:N])
492       //
493       // The map entry for s comes first, and the PTR_AND_OBJ entry comes
494       // afterward, so the pointer is already allocated by the time the
495       // PTR_AND_OBJ entry is handled below, and PointerTgtPtrBegin is thus
496       // non-null.  However, "declare target link" can produce a PTR_AND_OBJ
497       // entry for a global that might not already be allocated by the time the
498       // PTR_AND_OBJ entry is handled below, and so the allocation might fail
499       // when HasPresentModifier.
500       Pointer_TPR = Device.getTargetPointer(
501           HstPtrBase, HstPtrBase, sizeof(void *), /*HstPtrName=*/nullptr,
502           /*HasFlagTo=*/false, /*HasFlagAlways=*/false, IsImplicit, UpdateRef,
503           HasCloseModifier, HasPresentModifier, HasHoldModifier, AsyncInfo);
504       PointerTgtPtrBegin = Pointer_TPR.TargetPointer;
505       IsHostPtr = Pointer_TPR.Flags.IsHostPointer;
506       if (!PointerTgtPtrBegin) {
507         REPORT("Call to getTargetPointer returned null pointer (%s).\n",
508                HasPresentModifier ? "'present' map type modifier"
509                                   : "device failure or illegal mapping");
510         return OFFLOAD_FAIL;
511       }
512       DP("There are %zu bytes allocated at target address " DPxMOD " - is%s new"
513          "\n",
514          sizeof(void *), DPxPTR(PointerTgtPtrBegin),
515          (Pointer_TPR.Flags.IsNewEntry ? "" : " not"));
516       Pointer_HstPtrBegin = HstPtrBase;
517       // modify current entry.
518       HstPtrBase = *(void **)HstPtrBase;
519       // No need to update pointee ref count for the first element of the
520       // subelement that comes from mapper.
521       UpdateRef =
522           (!FromMapper || i != 0); // subsequently update ref count of pointee
523     }
524 
525     const bool HasFlagTo = arg_types[i] & OMP_TGT_MAPTYPE_TO;
526     const bool HasFlagAlways = arg_types[i] & OMP_TGT_MAPTYPE_ALWAYS;
527     auto TPR = Device.getTargetPointer(
528         HstPtrBegin, HstPtrBase, data_size, HstPtrName, HasFlagTo,
529         HasFlagAlways, IsImplicit, UpdateRef, HasCloseModifier,
530         HasPresentModifier, HasHoldModifier, AsyncInfo);
531     void *TgtPtrBegin = TPR.TargetPointer;
532     IsHostPtr = TPR.Flags.IsHostPointer;
533     // If data_size==0, then the argument could be a zero-length pointer to
534     // NULL, so getOrAlloc() returning NULL is not an error.
535     if (!TgtPtrBegin && (data_size || HasPresentModifier)) {
536       REPORT("Call to getTargetPointer returned null pointer (%s).\n",
537              HasPresentModifier ? "'present' map type modifier"
538                                 : "device failure or illegal mapping");
539       return OFFLOAD_FAIL;
540     }
541     DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
542        " - is%s new\n",
543        data_size, DPxPTR(TgtPtrBegin), (TPR.Flags.IsNewEntry ? "" : " not"));
544 
545     if (arg_types[i] & OMP_TGT_MAPTYPE_RETURN_PARAM) {
546       uintptr_t Delta = (uintptr_t)HstPtrBegin - (uintptr_t)HstPtrBase;
547       void *TgtPtrBase = (void *)((uintptr_t)TgtPtrBegin - Delta);
548       DP("Returning device pointer " DPxMOD "\n", DPxPTR(TgtPtrBase));
549       args_base[i] = TgtPtrBase;
550     }
551 
552     if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ && !IsHostPtr) {
553       // Check whether we need to update the pointer on the device
554       bool UpdateDevPtr = false;
555 
556       uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
557       void *ExpectedTgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta);
558 
559       Device.ShadowMtx.lock();
560       auto Entry = Device.ShadowPtrMap.find(Pointer_HstPtrBegin);
561       // If this pointer is not in the map we need to insert it. If the map
562       // contains a stale entry, we need to update it (e.g. if the pointee was
563       // deallocated and later on is reallocated at another device address). The
564       // latter scenario is the subject of LIT test env/base_ptr_ref_count.c. An
565       // entry is removed from ShadowPtrMap only when the PTR of a PTR_AND_OBJ
566       // pair is deallocated, not when the OBJ is deallocated. In
567       // env/base_ptr_ref_count.c the PTR is a global "declare target" pointer,
568       // so it stays in the map for the lifetime of the application. When the
569       // OBJ is deallocated and later on allocated again (at a different device
570       // address), ShadowPtrMap still contains an entry for Pointer_HstPtrBegin
571       // which is stale, pointing to the old ExpectedTgtPtrBase of the OBJ.
572       if (Entry == Device.ShadowPtrMap.end() ||
573           Entry->second.TgtPtrVal != ExpectedTgtPtrBase) {
574         // create or update shadow pointers for this entry
575         Device.ShadowPtrMap[Pointer_HstPtrBegin] = {
576             HstPtrBase, PointerTgtPtrBegin, ExpectedTgtPtrBase};
577         Pointer_TPR.Entry->setMayContainAttachedPointers();
578         UpdateDevPtr = true;
579       }
580 
581       if (UpdateDevPtr) {
582         std::lock_guard<decltype(*Pointer_TPR.Entry)> LG(*Pointer_TPR.Entry);
583         Device.ShadowMtx.unlock();
584 
585         DP("Update pointer (" DPxMOD ") -> [" DPxMOD "]\n",
586            DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));
587 
588         void *&TgtPtrBase = AsyncInfo.getVoidPtrLocation();
589         TgtPtrBase = ExpectedTgtPtrBase;
590 
591         int Ret = Device.submitData(PointerTgtPtrBegin, &TgtPtrBase,
592                                     sizeof(void *), AsyncInfo);
593         if (Ret != OFFLOAD_SUCCESS) {
594           REPORT("Copying data to device failed.\n");
595           return OFFLOAD_FAIL;
596         }
597         if (Pointer_TPR.Entry->addEventIfNecessary(Device, AsyncInfo) !=
598             OFFLOAD_SUCCESS)
599           return OFFLOAD_FAIL;
600       } else
601         Device.ShadowMtx.unlock();
602     }
603   }
604 
605   return OFFLOAD_SUCCESS;
606 }
607 
608 namespace {
609 /// This structure contains information to deallocate a target pointer, aka.
610 /// used to fix up the shadow map and potentially delete the entry from the
611 /// mapping table via \p DeviceTy::deallocTgtPtr.
612 struct PostProcessingInfo {
613   /// Host pointer used to look up into the map table
614   void *HstPtrBegin;
615 
616   /// Size of the data
617   int64_t DataSize;
618 
619   /// The mapping type (bitfield).
620   int64_t ArgType;
621 
622   /// The target pointer information.
623   TargetPointerResultTy TPR;
624 
625   /// Are we expecting to delete this entry or not. Even if set, we might not
626   /// delete the entry if another thread reused the entry in the meantime.
627   bool DelEntry;
628 
629   PostProcessingInfo(void *HstPtr, int64_t Size, int64_t ArgType, bool DelEntry,
630                      TargetPointerResultTy TPR)
631       : HstPtrBegin(HstPtr), DataSize(Size), ArgType(ArgType), TPR(TPR),
632         DelEntry(DelEntry) {}
633 };
634 
635 /// Apply \p CB to the shadow map pointer entries in the range \p Begin, to
636 /// \p Begin + \p Size. \p CB is called with a locked shadow pointer map and the
637 /// passed iterator can be updated. If the callback returns OFFLOAD_FAIL the
638 /// rest of the map is not checked anymore.
639 template <typename CBTy>
640 static void applyToShadowMapEntries(DeviceTy &Device, CBTy CB, void *Begin,
641                                     uintptr_t Size,
642                                     const TargetPointerResultTy &TPR) {
643   // If we have an object that is too small to hold a pointer subobject, no need
644   // to do any checking.
645   if (Size < sizeof(void *))
646     return;
647 
648   // If the map entry for the object was never marked as containing attached
649   // pointers, no need to do any checking.
650   if (!TPR.Entry || !TPR.Entry->getMayContainAttachedPointers())
651     return;
652 
653   uintptr_t LB = (uintptr_t)Begin;
654   uintptr_t UB = LB + Size;
655   // Now we are looking into the shadow map so we need to lock it.
656   std::lock_guard<decltype(Device.ShadowMtx)> LG(Device.ShadowMtx);
657   for (ShadowPtrListTy::iterator Itr = Device.ShadowPtrMap.begin();
658        Itr != Device.ShadowPtrMap.end();) {
659     uintptr_t ShadowHstPtrAddr = (uintptr_t)Itr->first;
660 
661     // An STL map is sorted on its keys; use this property
662     // to quickly determine when to break out of the loop.
663     if (ShadowHstPtrAddr < LB) {
664       ++Itr;
665       continue;
666     }
667     if (ShadowHstPtrAddr >= UB)
668       break;
669 
670     if (CB(Itr) == OFFLOAD_FAIL)
671       break;
672   }
673 }
674 
675 } // namespace
676 
677 /// Internal function to undo the mapping and retrieve the data from the device.
678 int targetDataEnd(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
679                   void **ArgBases, void **Args, int64_t *ArgSizes,
680                   int64_t *ArgTypes, map_var_info_t *ArgNames,
681                   void **ArgMappers, AsyncInfoTy &AsyncInfo, bool FromMapper) {
682   int Ret;
683   std::vector<PostProcessingInfo> PostProcessingPtrs;
684   void *FromMapperBase = nullptr;
685   // process each input.
686   for (int32_t I = ArgNum - 1; I >= 0; --I) {
687     // Ignore private variables and arrays - there is no mapping for them.
688     // Also, ignore the use_device_ptr directive, it has no effect here.
689     if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
690         (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
691       continue;
692 
693     if (ArgMappers && ArgMappers[I]) {
694       // Instead of executing the regular path of targetDataEnd, call the
695       // targetDataMapper variant which will call targetDataEnd again
696       // with new arguments.
697       DP("Calling targetDataMapper for the %dth argument\n", I);
698 
699       map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
700       Ret = targetDataMapper(loc, Device, ArgBases[I], Args[I], ArgSizes[I],
701                              ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
702                              targetDataEnd);
703 
704       if (Ret != OFFLOAD_SUCCESS) {
705         REPORT("Call to targetDataEnd via targetDataMapper for custom mapper"
706                " failed.\n");
707         return OFFLOAD_FAIL;
708       }
709 
710       // Skip the rest of this function, continue to the next argument.
711       continue;
712     }
713 
714     void *HstPtrBegin = Args[I];
715     int64_t DataSize = ArgSizes[I];
716     // Adjust for proper alignment if this is a combined entry (for structs).
717     // Look at the next argument - if that is MEMBER_OF this one, then this one
718     // is a combined entry.
719     const int NextI = I + 1;
720     if (getParentIndex(ArgTypes[I]) < 0 && NextI < ArgNum &&
721         getParentIndex(ArgTypes[NextI]) == I) {
722       int64_t Padding = (int64_t)HstPtrBegin % Alignment;
723       if (Padding) {
724         DP("Using a Padding of %" PRId64 " bytes for begin address " DPxMOD
725            "\n",
726            Padding, DPxPTR(HstPtrBegin));
727         HstPtrBegin = (char *)HstPtrBegin - Padding;
728         DataSize += Padding;
729       }
730     }
731 
732     bool IsLast, IsHostPtr;
733     bool IsImplicit = ArgTypes[I] & OMP_TGT_MAPTYPE_IMPLICIT;
734     bool UpdateRef = (!(ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) ||
735                       (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) &&
736                      !(FromMapper && I == 0);
737     bool ForceDelete = ArgTypes[I] & OMP_TGT_MAPTYPE_DELETE;
738     bool HasPresentModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_PRESENT;
739     bool HasHoldModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_OMPX_HOLD;
740 
741     // If PTR_AND_OBJ, HstPtrBegin is address of pointee
742     TargetPointerResultTy TPR = Device.getTgtPtrBegin(
743         HstPtrBegin, DataSize, IsLast, UpdateRef, HasHoldModifier, IsHostPtr,
744         !IsImplicit, ForceDelete);
745     void *TgtPtrBegin = TPR.TargetPointer;
746     if (!TgtPtrBegin && (DataSize || HasPresentModifier)) {
747       DP("Mapping does not exist (%s)\n",
748          (HasPresentModifier ? "'present' map type modifier" : "ignored"));
749       if (HasPresentModifier) {
750         // OpenMP 5.1, sec. 2.21.7.1 "map Clause", p. 350 L10-13:
751         // "If a map clause appears on a target, target data, target enter data
752         // or target exit data construct with a present map-type-modifier then
753         // on entry to the region if the corresponding list item does not appear
754         // in the device data environment then an error occurs and the program
755         // terminates."
756         //
757         // This should be an error upon entering an "omp target exit data".  It
758         // should not be an error upon exiting an "omp target data" or "omp
759         // target".  For "omp target data", Clang thus doesn't include present
760         // modifiers for end calls.  For "omp target", we have not found a valid
761         // OpenMP program for which the error matters: it appears that, if a
762         // program can guarantee that data is present at the beginning of an
763         // "omp target" region so that there's no error there, that data is also
764         // guaranteed to be present at the end.
765         MESSAGE("device mapping required by 'present' map type modifier does "
766                 "not exist for host address " DPxMOD " (%" PRId64 " bytes)",
767                 DPxPTR(HstPtrBegin), DataSize);
768         return OFFLOAD_FAIL;
769       }
770     } else {
771       DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
772          " - is%s last\n",
773          DataSize, DPxPTR(TgtPtrBegin), (IsLast ? "" : " not"));
774     }
775 
776     // OpenMP 5.1, sec. 2.21.7.1 "map Clause", p. 351 L14-16:
777     // "If the map clause appears on a target, target data, or target exit data
778     // construct and a corresponding list item of the original list item is not
779     // present in the device data environment on exit from the region then the
780     // list item is ignored."
781     if (!TgtPtrBegin)
782       continue;
783 
784     bool DelEntry = IsLast;
785 
786     // If the last element from the mapper (for end transfer args comes in
787     // reverse order), do not remove the partial entry, the parent struct still
788     // exists.
789     if ((ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
790         !(ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) {
791       DelEntry = false; // protect parent struct from being deallocated
792     }
793 
794     if ((ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) || DelEntry) {
795       // Move data back to the host
796       if (ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) {
797         bool Always = ArgTypes[I] & OMP_TGT_MAPTYPE_ALWAYS;
798         if ((Always || IsLast) && !IsHostPtr) {
799           DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
800              DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
801 
802           std::lock_guard<decltype(*TPR.Entry)> LG(*TPR.Entry);
803           // Wait for any previous transfer if an event is present.
804           if (void *Event = TPR.Entry->getEvent()) {
805             if (Device.waitEvent(Event, AsyncInfo) != OFFLOAD_SUCCESS) {
806               REPORT("Failed to wait for event " DPxMOD ".\n", DPxPTR(Event));
807               return OFFLOAD_FAIL;
808             }
809           }
810 
811           Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, DataSize,
812                                     AsyncInfo);
813           if (Ret != OFFLOAD_SUCCESS) {
814             REPORT("Copying data from device failed.\n");
815             return OFFLOAD_FAIL;
816           }
817 
818           // As we are expecting to delete the entry the d2h copy might race
819           // with another one that also tries to delete the entry. This happens
820           // as the entry can be reused and the reuse might happen after the
821           // copy-back was issued but before it completed. Since the reuse might
822           // also copy-back a value we would race.
823           if (IsLast) {
824             if (TPR.Entry->addEventIfNecessary(Device, AsyncInfo) !=
825                 OFFLOAD_SUCCESS)
826               return OFFLOAD_FAIL;
827           }
828         }
829       }
830       if (DelEntry && FromMapper && I == 0) {
831         DelEntry = false;
832         FromMapperBase = HstPtrBegin;
833       }
834 
835       // Add pointer to the buffer for post-synchronize processing.
836       PostProcessingPtrs.emplace_back(HstPtrBegin, DataSize, ArgTypes[I],
837                                       DelEntry && !IsHostPtr, TPR);
838     }
839   }
840 
841   // TODO: We should not synchronize here but pass the AsyncInfo object to the
842   //       allocate/deallocate device APIs.
843   //
844   // We need to synchronize before deallocating data.
845   Ret = AsyncInfo.synchronize();
846   if (Ret != OFFLOAD_SUCCESS)
847     return OFFLOAD_FAIL;
848 
849   // Deallocate target pointer
850   for (PostProcessingInfo &Info : PostProcessingPtrs) {
851     // If we marked the entry to be deleted we need to verify no other thread
852     // reused it by now. If deletion is still supposed to happen by this thread
853     // LR will be set and exclusive access to the HDTT map will avoid another
854     // thread reusing the entry now. Note that we do not request (exclusive)
855     // access to the HDTT map if Info.DelEntry is not set.
856     LookupResult LR;
857     DeviceTy::HDTTMapAccessorTy HDTTMap =
858         Device.HostDataToTargetMap.getExclusiveAccessor(!Info.DelEntry);
859 
860     if (Info.DelEntry) {
861       LR = Device.lookupMapping(HDTTMap, Info.HstPtrBegin, Info.DataSize);
862       if (LR.Entry->getTotalRefCount() != 0 ||
863           LR.Entry->getDeleteThreadId() != std::this_thread::get_id()) {
864         // The thread is not in charge of deletion anymore. Give up access to
865         // the HDTT map and unset the deletion flag.
866         HDTTMap.destroy();
867         Info.DelEntry = false;
868       }
869     }
870 
871     // If we copied back to the host a struct/array containing pointers, we
872     // need to restore the original host pointer values from their shadow
873     // copies. If the struct is going to be deallocated, remove any remaining
874     // shadow pointer entries for this struct.
875     auto CB = [&](ShadowPtrListTy::iterator &Itr) {
876       // If we copied the struct to the host, we need to restore the pointer.
877       if (Info.ArgType & OMP_TGT_MAPTYPE_FROM) {
878         void **ShadowHstPtrAddr = (void **)Itr->first;
879         *ShadowHstPtrAddr = Itr->second.HstPtrVal;
880         DP("Restoring original host pointer value " DPxMOD " for host "
881            "pointer " DPxMOD "\n",
882            DPxPTR(Itr->second.HstPtrVal), DPxPTR(ShadowHstPtrAddr));
883       }
884       // If the struct is to be deallocated, remove the shadow entry.
885       if (Info.DelEntry) {
886         DP("Removing shadow pointer " DPxMOD "\n", DPxPTR((void **)Itr->first));
887         Itr = Device.ShadowPtrMap.erase(Itr);
888       } else {
889         ++Itr;
890       }
891       return OFFLOAD_SUCCESS;
892     };
893     applyToShadowMapEntries(Device, CB, Info.HstPtrBegin, Info.DataSize,
894                             Info.TPR);
895 
896     // If we are deleting the entry the DataMapMtx is locked and we own the
897     // entry.
898     if (Info.DelEntry) {
899       if (!FromMapperBase || FromMapperBase != Info.HstPtrBegin)
900         Ret = Device.deallocTgtPtr(HDTTMap, LR, Info.DataSize);
901 
902       if (Ret != OFFLOAD_SUCCESS) {
903         REPORT("Deallocating data from device failed.\n");
904         break;
905       }
906     }
907   }
908 
909   return Ret;
910 }
911 
912 static int targetDataContiguous(ident_t *loc, DeviceTy &Device, void *ArgsBase,
913                                 void *HstPtrBegin, int64_t ArgSize,
914                                 int64_t ArgType, AsyncInfoTy &AsyncInfo) {
915   TIMESCOPE_WITH_IDENT(loc);
916   bool IsLast, IsHostPtr;
917   TargetPointerResultTy TPR = Device.getTgtPtrBegin(
918       HstPtrBegin, ArgSize, IsLast, /*UpdateRefCount=*/false,
919       /*UseHoldRefCount=*/false, IsHostPtr, /*MustContain=*/true);
920   void *TgtPtrBegin = TPR.TargetPointer;
921   if (!TgtPtrBegin) {
922     DP("hst data:" DPxMOD " not found, becomes a noop\n", DPxPTR(HstPtrBegin));
923     if (ArgType & OMP_TGT_MAPTYPE_PRESENT) {
924       MESSAGE("device mapping required by 'present' motion modifier does not "
925               "exist for host address " DPxMOD " (%" PRId64 " bytes)",
926               DPxPTR(HstPtrBegin), ArgSize);
927       return OFFLOAD_FAIL;
928     }
929     return OFFLOAD_SUCCESS;
930   }
931 
932   if (IsHostPtr) {
933     DP("hst data:" DPxMOD " unified and shared, becomes a noop\n",
934        DPxPTR(HstPtrBegin));
935     return OFFLOAD_SUCCESS;
936   }
937 
938   if (ArgType & OMP_TGT_MAPTYPE_FROM) {
939     DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
940        ArgSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
941     int Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, ArgSize, AsyncInfo);
942     if (Ret != OFFLOAD_SUCCESS) {
943       REPORT("Copying data from device failed.\n");
944       return OFFLOAD_FAIL;
945     }
946 
947     auto CB = [&](ShadowPtrListTy::iterator &Itr) {
948       void **ShadowHstPtrAddr = (void **)Itr->first;
949       // Wait for device-to-host memcopies for whole struct to complete,
950       // before restoring the correct host pointer.
951       if (AsyncInfo.synchronize() != OFFLOAD_SUCCESS)
952         return OFFLOAD_FAIL;
953       *ShadowHstPtrAddr = Itr->second.HstPtrVal;
954       DP("Restoring original host pointer value " DPxMOD
955          " for host pointer " DPxMOD "\n",
956          DPxPTR(Itr->second.HstPtrVal), DPxPTR(ShadowHstPtrAddr));
957       ++Itr;
958       return OFFLOAD_SUCCESS;
959     };
960     applyToShadowMapEntries(Device, CB, HstPtrBegin, ArgSize, TPR);
961   }
962 
963   if (ArgType & OMP_TGT_MAPTYPE_TO) {
964     DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
965        ArgSize, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
966     int Ret = Device.submitData(TgtPtrBegin, HstPtrBegin, ArgSize, AsyncInfo);
967     if (Ret != OFFLOAD_SUCCESS) {
968       REPORT("Copying data to device failed.\n");
969       return OFFLOAD_FAIL;
970     }
971 
972     auto CB = [&](ShadowPtrListTy::iterator &Itr) {
973       DP("Restoring original target pointer value " DPxMOD " for target "
974          "pointer " DPxMOD "\n",
975          DPxPTR(Itr->second.TgtPtrVal), DPxPTR(Itr->second.TgtPtrAddr));
976       Ret = Device.submitData(Itr->second.TgtPtrAddr, &Itr->second.TgtPtrVal,
977                               sizeof(void *), AsyncInfo);
978       if (Ret != OFFLOAD_SUCCESS)
979         REPORT("Copying data to device failed.\n");
980       ++Itr;
981       return Ret;
982     };
983     applyToShadowMapEntries(Device, CB, HstPtrBegin, ArgSize, TPR);
984   }
985   return OFFLOAD_SUCCESS;
986 }
987 
988 static int targetDataNonContiguous(ident_t *loc, DeviceTy &Device,
989                                    void *ArgsBase,
990                                    __tgt_target_non_contig *NonContig,
991                                    uint64_t Size, int64_t ArgType,
992                                    int CurrentDim, int DimSize, uint64_t Offset,
993                                    AsyncInfoTy &AsyncInfo) {
994   TIMESCOPE_WITH_IDENT(loc);
995   int Ret = OFFLOAD_SUCCESS;
996   if (CurrentDim < DimSize) {
997     for (unsigned int I = 0; I < NonContig[CurrentDim].Count; ++I) {
998       uint64_t CurOffset =
999           (NonContig[CurrentDim].Offset + I) * NonContig[CurrentDim].Stride;
1000       // we only need to transfer the first element for the last dimension
1001       // since we've already got a contiguous piece.
1002       if (CurrentDim != DimSize - 1 || I == 0) {
1003         Ret = targetDataNonContiguous(loc, Device, ArgsBase, NonContig, Size,
1004                                       ArgType, CurrentDim + 1, DimSize,
1005                                       Offset + CurOffset, AsyncInfo);
1006         // Stop the whole process if any contiguous piece returns anything
1007         // other than OFFLOAD_SUCCESS.
1008         if (Ret != OFFLOAD_SUCCESS)
1009           return Ret;
1010       }
1011     }
1012   } else {
1013     char *Ptr = (char *)ArgsBase + Offset;
1014     DP("Transfer of non-contiguous : host ptr " DPxMOD " offset %" PRIu64
1015        " len %" PRIu64 "\n",
1016        DPxPTR(Ptr), Offset, Size);
1017     Ret = targetDataContiguous(loc, Device, ArgsBase, Ptr, Size, ArgType,
1018                                AsyncInfo);
1019   }
1020   return Ret;
1021 }
1022 
1023 static int getNonContigMergedDimension(__tgt_target_non_contig *NonContig,
1024                                        int32_t DimSize) {
1025   int RemovedDim = 0;
1026   for (int I = DimSize - 1; I > 0; --I) {
1027     if (NonContig[I].Count * NonContig[I].Stride == NonContig[I - 1].Stride)
1028       RemovedDim++;
1029   }
1030   return RemovedDim;
1031 }
1032 
1033 /// Internal function to pass data to/from the target.
1034 int targetDataUpdate(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
1035                      void **ArgsBase, void **Args, int64_t *ArgSizes,
1036                      int64_t *ArgTypes, map_var_info_t *ArgNames,
1037                      void **ArgMappers, AsyncInfoTy &AsyncInfo, bool) {
1038   // process each input.
1039   for (int32_t I = 0; I < ArgNum; ++I) {
1040     if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
1041         (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
1042       continue;
1043 
1044     if (ArgMappers && ArgMappers[I]) {
1045       // Instead of executing the regular path of targetDataUpdate, call the
1046       // targetDataMapper variant which will call targetDataUpdate again
1047       // with new arguments.
1048       DP("Calling targetDataMapper for the %dth argument\n", I);
1049 
1050       map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
1051       int Ret = targetDataMapper(loc, Device, ArgsBase[I], Args[I], ArgSizes[I],
1052                                  ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
1053                                  targetDataUpdate);
1054 
1055       if (Ret != OFFLOAD_SUCCESS) {
1056         REPORT("Call to targetDataUpdate via targetDataMapper for custom mapper"
1057                " failed.\n");
1058         return OFFLOAD_FAIL;
1059       }
1060 
1061       // Skip the rest of this function, continue to the next argument.
1062       continue;
1063     }
1064 
1065     int Ret = OFFLOAD_SUCCESS;
1066 
1067     if (ArgTypes[I] & OMP_TGT_MAPTYPE_NON_CONTIG) {
1068       __tgt_target_non_contig *NonContig = (__tgt_target_non_contig *)Args[I];
1069       int32_t DimSize = ArgSizes[I];
1070       uint64_t Size =
1071           NonContig[DimSize - 1].Count * NonContig[DimSize - 1].Stride;
1072       int32_t MergedDim = getNonContigMergedDimension(NonContig, DimSize);
1073       Ret = targetDataNonContiguous(
1074           loc, Device, ArgsBase[I], NonContig, Size, ArgTypes[I],
1075           /*current_dim=*/0, DimSize - MergedDim, /*offset=*/0, AsyncInfo);
1076     } else {
1077       Ret = targetDataContiguous(loc, Device, ArgsBase[I], Args[I], ArgSizes[I],
1078                                  ArgTypes[I], AsyncInfo);
1079     }
1080     if (Ret == OFFLOAD_FAIL)
1081       return OFFLOAD_FAIL;
1082   }
1083   return OFFLOAD_SUCCESS;
1084 }
1085 
1086 static const unsigned LambdaMapping = OMP_TGT_MAPTYPE_PTR_AND_OBJ |
1087                                       OMP_TGT_MAPTYPE_LITERAL |
1088                                       OMP_TGT_MAPTYPE_IMPLICIT;
1089 static bool isLambdaMapping(int64_t Mapping) {
1090   return (Mapping & LambdaMapping) == LambdaMapping;
1091 }
1092 
1093 namespace {
1094 /// Find the table information in the map or look it up in the translation
1095 /// tables.
1096 TableMap *getTableMap(void *HostPtr) {
1097   std::lock_guard<std::mutex> TblMapLock(PM->TblMapMtx);
1098   HostPtrToTableMapTy::iterator TableMapIt =
1099       PM->HostPtrToTableMap.find(HostPtr);
1100 
1101   if (TableMapIt != PM->HostPtrToTableMap.end())
1102     return &TableMapIt->second;
1103 
1104   // We don't have a map. So search all the registered libraries.
1105   TableMap *TM = nullptr;
1106   std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
1107   for (HostEntriesBeginToTransTableTy::iterator Itr =
1108            PM->HostEntriesBeginToTransTable.begin();
1109        Itr != PM->HostEntriesBeginToTransTable.end(); ++Itr) {
1110     // get the translation table (which contains all the good info).
1111     TranslationTable *TransTable = &Itr->second;
1112     // iterate over all the host table entries to see if we can locate the
1113     // host_ptr.
1114     __tgt_offload_entry *Cur = TransTable->HostTable.EntriesBegin;
1115     for (uint32_t I = 0; Cur < TransTable->HostTable.EntriesEnd; ++Cur, ++I) {
1116       if (Cur->addr != HostPtr)
1117         continue;
1118       // we got a match, now fill the HostPtrToTableMap so that we
1119       // may avoid this search next time.
1120       TM = &(PM->HostPtrToTableMap)[HostPtr];
1121       TM->Table = TransTable;
1122       TM->Index = I;
1123       return TM;
1124     }
1125   }
1126 
1127   return nullptr;
1128 }
1129 
1130 /// Get loop trip count
1131 /// FIXME: This function will not work right if calling
1132 /// __kmpc_push_target_tripcount_mapper in one thread but doing offloading in
1133 /// another thread, which might occur when we call task yield.
1134 uint64_t getLoopTripCount(int64_t DeviceId) {
1135   DeviceTy &Device = *PM->Devices[DeviceId];
1136   uint64_t LoopTripCount = 0;
1137 
1138   {
1139     std::lock_guard<std::mutex> TblMapLock(PM->TblMapMtx);
1140     auto I = Device.LoopTripCnt.find(__kmpc_global_thread_num(NULL));
1141     if (I != Device.LoopTripCnt.end()) {
1142       LoopTripCount = I->second;
1143       Device.LoopTripCnt.erase(I);
1144       DP("loop trip count is %" PRIu64 ".\n", LoopTripCount);
1145     }
1146   }
1147 
1148   return LoopTripCount;
1149 }
1150 
1151 /// A class manages private arguments in a target region.
1152 class PrivateArgumentManagerTy {
1153   /// A data structure for the information of first-private arguments. We can
1154   /// use this information to optimize data transfer by packing all
1155   /// first-private arguments and transfer them all at once.
1156   struct FirstPrivateArgInfoTy {
1157     /// The index of the element in \p TgtArgs corresponding to the argument
1158     const int Index;
1159     /// Host pointer begin
1160     const char *HstPtrBegin;
1161     /// Host pointer end
1162     const char *HstPtrEnd;
1163     /// Aligned size
1164     const int64_t AlignedSize;
1165     /// Host pointer name
1166     const map_var_info_t HstPtrName = nullptr;
1167 
1168     FirstPrivateArgInfoTy(int Index, const void *HstPtr, int64_t Size,
1169                           const map_var_info_t HstPtrName = nullptr)
1170         : Index(Index), HstPtrBegin(reinterpret_cast<const char *>(HstPtr)),
1171           HstPtrEnd(HstPtrBegin + Size), AlignedSize(Size + Size % Alignment),
1172           HstPtrName(HstPtrName) {}
1173   };
1174 
1175   /// A vector of target pointers for all private arguments
1176   std::vector<void *> TgtPtrs;
1177 
1178   /// A vector of information of all first-private arguments to be packed
1179   std::vector<FirstPrivateArgInfoTy> FirstPrivateArgInfo;
1180   /// Host buffer for all arguments to be packed
1181   std::vector<char> FirstPrivateArgBuffer;
1182   /// The total size of all arguments to be packed
1183   int64_t FirstPrivateArgSize = 0;
1184 
1185   /// A reference to the \p DeviceTy object
1186   DeviceTy &Device;
1187   /// A pointer to a \p AsyncInfoTy object
1188   AsyncInfoTy &AsyncInfo;
1189 
1190   // TODO: What would be the best value here? Should we make it configurable?
1191   // If the size is larger than this threshold, we will allocate and transfer it
1192   // immediately instead of packing it.
1193   static constexpr const int64_t FirstPrivateArgSizeThreshold = 1024;
1194 
1195 public:
1196   /// Constructor
1197   PrivateArgumentManagerTy(DeviceTy &Dev, AsyncInfoTy &AsyncInfo)
1198       : Device(Dev), AsyncInfo(AsyncInfo) {}
1199 
1200   /// Add a private argument
1201   int addArg(void *HstPtr, int64_t ArgSize, int64_t ArgOffset,
1202              bool IsFirstPrivate, void *&TgtPtr, int TgtArgsIndex,
1203              const map_var_info_t HstPtrName = nullptr,
1204              const bool AllocImmediately = false) {
1205     // If the argument is not first-private, or its size is greater than a
1206     // predefined threshold, we will allocate memory and issue the transfer
1207     // immediately.
1208     if (ArgSize > FirstPrivateArgSizeThreshold || !IsFirstPrivate ||
1209         AllocImmediately) {
1210       TgtPtr = Device.allocData(ArgSize, HstPtr);
1211       if (!TgtPtr) {
1212         DP("Data allocation for %sprivate array " DPxMOD " failed.\n",
1213            (IsFirstPrivate ? "first-" : ""), DPxPTR(HstPtr));
1214         return OFFLOAD_FAIL;
1215       }
1216 #ifdef OMPTARGET_DEBUG
1217       void *TgtPtrBase = (void *)((intptr_t)TgtPtr + ArgOffset);
1218       DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD
1219          " for %sprivate array " DPxMOD " - pushing target argument " DPxMOD
1220          "\n",
1221          ArgSize, DPxPTR(TgtPtr), (IsFirstPrivate ? "first-" : ""),
1222          DPxPTR(HstPtr), DPxPTR(TgtPtrBase));
1223 #endif
1224       // If first-private, copy data from host
1225       if (IsFirstPrivate) {
1226         DP("Submitting firstprivate data to the device.\n");
1227         int Ret = Device.submitData(TgtPtr, HstPtr, ArgSize, AsyncInfo);
1228         if (Ret != OFFLOAD_SUCCESS) {
1229           DP("Copying data to device failed, failed.\n");
1230           return OFFLOAD_FAIL;
1231         }
1232       }
1233       TgtPtrs.push_back(TgtPtr);
1234     } else {
1235       DP("Firstprivate array " DPxMOD " of size %" PRId64 " will be packed\n",
1236          DPxPTR(HstPtr), ArgSize);
1237       // When reach this point, the argument must meet all following
1238       // requirements:
1239       // 1. Its size does not exceed the threshold (see the comment for
1240       // FirstPrivateArgSizeThreshold);
1241       // 2. It must be first-private (needs to be mapped to target device).
1242       // We will pack all this kind of arguments to transfer them all at once
1243       // to reduce the number of data transfer. We will not take
1244       // non-first-private arguments, aka. private arguments that doesn't need
1245       // to be mapped to target device, into account because data allocation
1246       // can be very efficient with memory manager.
1247 
1248       // Placeholder value
1249       TgtPtr = nullptr;
1250       FirstPrivateArgInfo.emplace_back(TgtArgsIndex, HstPtr, ArgSize,
1251                                        HstPtrName);
1252       FirstPrivateArgSize += FirstPrivateArgInfo.back().AlignedSize;
1253     }
1254 
1255     return OFFLOAD_SUCCESS;
1256   }
1257 
1258   /// Pack first-private arguments, replace place holder pointers in \p TgtArgs,
1259   /// and start the transfer.
1260   int packAndTransfer(std::vector<void *> &TgtArgs) {
1261     if (!FirstPrivateArgInfo.empty()) {
1262       assert(FirstPrivateArgSize != 0 &&
1263              "FirstPrivateArgSize is 0 but FirstPrivateArgInfo is empty");
1264       FirstPrivateArgBuffer.resize(FirstPrivateArgSize, 0);
1265       auto Itr = FirstPrivateArgBuffer.begin();
1266       // Copy all host data to this buffer
1267       for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) {
1268         std::copy(Info.HstPtrBegin, Info.HstPtrEnd, Itr);
1269         Itr = std::next(Itr, Info.AlignedSize);
1270       }
1271       // Allocate target memory
1272       void *TgtPtr =
1273           Device.allocData(FirstPrivateArgSize, FirstPrivateArgBuffer.data());
1274       if (TgtPtr == nullptr) {
1275         DP("Failed to allocate target memory for private arguments.\n");
1276         return OFFLOAD_FAIL;
1277       }
1278       TgtPtrs.push_back(TgtPtr);
1279       DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD "\n",
1280          FirstPrivateArgSize, DPxPTR(TgtPtr));
1281       // Transfer data to target device
1282       int Ret = Device.submitData(TgtPtr, FirstPrivateArgBuffer.data(),
1283                                   FirstPrivateArgSize, AsyncInfo);
1284       if (Ret != OFFLOAD_SUCCESS) {
1285         DP("Failed to submit data of private arguments.\n");
1286         return OFFLOAD_FAIL;
1287       }
1288       // Fill in all placeholder pointers
1289       auto TP = reinterpret_cast<uintptr_t>(TgtPtr);
1290       for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) {
1291         void *&Ptr = TgtArgs[Info.Index];
1292         assert(Ptr == nullptr && "Target pointer is already set by mistaken");
1293         Ptr = reinterpret_cast<void *>(TP);
1294         TP += Info.AlignedSize;
1295         DP("Firstprivate array " DPxMOD " of size %" PRId64 " mapped to " DPxMOD
1296            "\n",
1297            DPxPTR(Info.HstPtrBegin), Info.HstPtrEnd - Info.HstPtrBegin,
1298            DPxPTR(Ptr));
1299       }
1300     }
1301 
1302     return OFFLOAD_SUCCESS;
1303   }
1304 
1305   /// Free all target memory allocated for private arguments
1306   int free() {
1307     for (void *P : TgtPtrs) {
1308       int Ret = Device.deleteData(P);
1309       if (Ret != OFFLOAD_SUCCESS) {
1310         DP("Deallocation of (first-)private arrays failed.\n");
1311         return OFFLOAD_FAIL;
1312       }
1313     }
1314 
1315     TgtPtrs.clear();
1316 
1317     return OFFLOAD_SUCCESS;
1318   }
1319 };
1320 
1321 /// Process data before launching the kernel, including calling targetDataBegin
1322 /// to map and transfer data to target device, transferring (first-)private
1323 /// variables.
1324 static int processDataBefore(ident_t *loc, int64_t DeviceId, void *HostPtr,
1325                              int32_t ArgNum, void **ArgBases, void **Args,
1326                              int64_t *ArgSizes, int64_t *ArgTypes,
1327                              map_var_info_t *ArgNames, void **ArgMappers,
1328                              std::vector<void *> &TgtArgs,
1329                              std::vector<ptrdiff_t> &TgtOffsets,
1330                              PrivateArgumentManagerTy &PrivateArgumentManager,
1331                              AsyncInfoTy &AsyncInfo) {
1332   TIMESCOPE_WITH_NAME_AND_IDENT("mappingBeforeTargetRegion", loc);
1333   DeviceTy &Device = *PM->Devices[DeviceId];
1334   int Ret = targetDataBegin(loc, Device, ArgNum, ArgBases, Args, ArgSizes,
1335                             ArgTypes, ArgNames, ArgMappers, AsyncInfo);
1336   if (Ret != OFFLOAD_SUCCESS) {
1337     REPORT("Call to targetDataBegin failed, abort target.\n");
1338     return OFFLOAD_FAIL;
1339   }
1340 
1341   // List of (first-)private arrays allocated for this target region
1342   std::vector<int> TgtArgsPositions(ArgNum, -1);
1343 
1344   for (int32_t I = 0; I < ArgNum; ++I) {
1345     if (!(ArgTypes[I] & OMP_TGT_MAPTYPE_TARGET_PARAM)) {
1346       // This is not a target parameter, do not push it into TgtArgs.
1347       // Check for lambda mapping.
1348       if (isLambdaMapping(ArgTypes[I])) {
1349         assert((ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
1350                "PTR_AND_OBJ must be also MEMBER_OF.");
1351         unsigned Idx = getParentIndex(ArgTypes[I]);
1352         int TgtIdx = TgtArgsPositions[Idx];
1353         assert(TgtIdx != -1 && "Base address must be translated already.");
1354         // The parent lambda must be processed already and it must be the last
1355         // in TgtArgs and TgtOffsets arrays.
1356         void *HstPtrVal = Args[I];
1357         void *HstPtrBegin = ArgBases[I];
1358         void *HstPtrBase = Args[Idx];
1359         bool IsLast, IsHostPtr; // IsLast is unused.
1360         void *TgtPtrBase =
1361             (void *)((intptr_t)TgtArgs[TgtIdx] + TgtOffsets[TgtIdx]);
1362         DP("Parent lambda base " DPxMOD "\n", DPxPTR(TgtPtrBase));
1363         uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
1364         void *TgtPtrBegin = (void *)((uintptr_t)TgtPtrBase + Delta);
1365         void *&PointerTgtPtrBegin = AsyncInfo.getVoidPtrLocation();
1366         TargetPointerResultTy TPR = Device.getTgtPtrBegin(
1367             HstPtrVal, ArgSizes[I], IsLast, /*UpdateRefCount=*/false,
1368             /*UseHoldRefCount=*/false, IsHostPtr);
1369         PointerTgtPtrBegin = TPR.TargetPointer;
1370         if (!PointerTgtPtrBegin) {
1371           DP("No lambda captured variable mapped (" DPxMOD ") - ignored\n",
1372              DPxPTR(HstPtrVal));
1373           continue;
1374         }
1375         if (IsHostPtr) {
1376           DP("Unified memory is active, no need to map lambda captured"
1377              "variable (" DPxMOD ")\n",
1378              DPxPTR(HstPtrVal));
1379           continue;
1380         }
1381         DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n",
1382            DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));
1383         Ret = Device.submitData(TgtPtrBegin, &PointerTgtPtrBegin,
1384                                 sizeof(void *), AsyncInfo);
1385         if (Ret != OFFLOAD_SUCCESS) {
1386           REPORT("Copying data to device failed.\n");
1387           return OFFLOAD_FAIL;
1388         }
1389       }
1390       continue;
1391     }
1392     void *HstPtrBegin = Args[I];
1393     void *HstPtrBase = ArgBases[I];
1394     void *TgtPtrBegin;
1395     map_var_info_t HstPtrName = (!ArgNames) ? nullptr : ArgNames[I];
1396     ptrdiff_t TgtBaseOffset;
1397     bool IsLast, IsHostPtr; // unused.
1398     TargetPointerResultTy TPR;
1399     if (ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) {
1400       DP("Forwarding first-private value " DPxMOD " to the target construct\n",
1401          DPxPTR(HstPtrBase));
1402       TgtPtrBegin = HstPtrBase;
1403       TgtBaseOffset = 0;
1404     } else if (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE) {
1405       TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
1406       const bool IsFirstPrivate = (ArgTypes[I] & OMP_TGT_MAPTYPE_TO);
1407       // If there is a next argument and it depends on the current one, we need
1408       // to allocate the private memory immediately. If this is not the case,
1409       // then the argument can be marked for optimization and packed with the
1410       // other privates.
1411       const bool AllocImmediately =
1412           (I < ArgNum - 1 && (ArgTypes[I + 1] & OMP_TGT_MAPTYPE_MEMBER_OF));
1413       Ret = PrivateArgumentManager.addArg(
1414           HstPtrBegin, ArgSizes[I], TgtBaseOffset, IsFirstPrivate, TgtPtrBegin,
1415           TgtArgs.size(), HstPtrName, AllocImmediately);
1416       if (Ret != OFFLOAD_SUCCESS) {
1417         REPORT("Failed to process %sprivate argument " DPxMOD "\n",
1418                (IsFirstPrivate ? "first-" : ""), DPxPTR(HstPtrBegin));
1419         return OFFLOAD_FAIL;
1420       }
1421     } else {
1422       if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)
1423         HstPtrBase = *reinterpret_cast<void **>(HstPtrBase);
1424       TPR = Device.getTgtPtrBegin(HstPtrBegin, ArgSizes[I], IsLast,
1425                                   /*UpdateRefCount=*/false,
1426                                   /*UseHoldRefCount=*/false, IsHostPtr);
1427       TgtPtrBegin = TPR.TargetPointer;
1428       TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
1429 #ifdef OMPTARGET_DEBUG
1430       void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset);
1431       DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD "\n",
1432          DPxPTR(TgtPtrBase), DPxPTR(HstPtrBegin));
1433 #endif
1434     }
1435     TgtArgsPositions[I] = TgtArgs.size();
1436     TgtArgs.push_back(TgtPtrBegin);
1437     TgtOffsets.push_back(TgtBaseOffset);
1438   }
1439 
1440   assert(TgtArgs.size() == TgtOffsets.size() &&
1441          "Size mismatch in arguments and offsets");
1442 
1443   // Pack and transfer first-private arguments
1444   Ret = PrivateArgumentManager.packAndTransfer(TgtArgs);
1445   if (Ret != OFFLOAD_SUCCESS) {
1446     DP("Failed to pack and transfer first private arguments\n");
1447     return OFFLOAD_FAIL;
1448   }
1449 
1450   return OFFLOAD_SUCCESS;
1451 }
1452 
1453 /// Process data after launching the kernel, including transferring data back to
1454 /// host if needed and deallocating target memory of (first-)private variables.
1455 static int processDataAfter(ident_t *loc, int64_t DeviceId, void *HostPtr,
1456                             int32_t ArgNum, void **ArgBases, void **Args,
1457                             int64_t *ArgSizes, int64_t *ArgTypes,
1458                             map_var_info_t *ArgNames, void **ArgMappers,
1459                             PrivateArgumentManagerTy &PrivateArgumentManager,
1460                             AsyncInfoTy &AsyncInfo) {
1461   TIMESCOPE_WITH_NAME_AND_IDENT("mappingAfterTargetRegion", loc);
1462   DeviceTy &Device = *PM->Devices[DeviceId];
1463 
1464   // Move data from device.
1465   int Ret = targetDataEnd(loc, Device, ArgNum, ArgBases, Args, ArgSizes,
1466                           ArgTypes, ArgNames, ArgMappers, AsyncInfo);
1467   if (Ret != OFFLOAD_SUCCESS) {
1468     REPORT("Call to targetDataEnd failed, abort target.\n");
1469     return OFFLOAD_FAIL;
1470   }
1471 
1472   // Free target memory for private arguments
1473   Ret = PrivateArgumentManager.free();
1474   if (Ret != OFFLOAD_SUCCESS) {
1475     REPORT("Failed to deallocate target memory for private args\n");
1476     return OFFLOAD_FAIL;
1477   }
1478 
1479   return OFFLOAD_SUCCESS;
1480 }
1481 } // namespace
1482 
1483 /// performs the same actions as data_begin in case arg_num is
1484 /// non-zero and initiates run of the offloaded region on the target platform;
1485 /// if arg_num is non-zero after the region execution is done it also
1486 /// performs the same action as data_update and data_end above. This function
1487 /// returns 0 if it was able to transfer the execution to a target and an
1488 /// integer different from zero otherwise.
1489 int target(ident_t *loc, DeviceTy &Device, void *HostPtr, int32_t ArgNum,
1490            void **ArgBases, void **Args, int64_t *ArgSizes, int64_t *ArgTypes,
1491            map_var_info_t *ArgNames, void **ArgMappers, int32_t TeamNum,
1492            int32_t ThreadLimit, int IsTeamConstruct, AsyncInfoTy &AsyncInfo) {
1493   int32_t DeviceId = Device.DeviceID;
1494 
1495   TableMap *TM = getTableMap(HostPtr);
1496   // No map for this host pointer found!
1497   if (!TM) {
1498     REPORT("Host ptr " DPxMOD " does not have a matching target pointer.\n",
1499            DPxPTR(HostPtr));
1500     return OFFLOAD_FAIL;
1501   }
1502 
1503   // get target table.
1504   __tgt_target_table *TargetTable = nullptr;
1505   {
1506     std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
1507     assert(TM->Table->TargetsTable.size() > (size_t)DeviceId &&
1508            "Not expecting a device ID outside the table's bounds!");
1509     TargetTable = TM->Table->TargetsTable[DeviceId];
1510   }
1511   assert(TargetTable && "Global data has not been mapped\n");
1512 
1513   // We need to keep bases and offsets separate. Sometimes (e.g. in OpenCL) we
1514   // need to manifest base pointers prior to launching a kernel. Even if we have
1515   // mapped an object only partially, e.g. A[N:M], although the kernel is
1516   // expected to access elements starting at address &A[N] and beyond, we still
1517   // need to manifest the base of the array &A[0]. In other cases, e.g. the COI
1518   // API, we need the begin address itself, i.e. &A[N], as the API operates on
1519   // begin addresses, not bases. That's why we pass args and offsets as two
1520   // separate entities so that each plugin can do what it needs. This behavior
1521   // was introdued via https://reviews.llvm.org/D33028 and commit 1546d319244c.
1522   std::vector<void *> TgtArgs;
1523   std::vector<ptrdiff_t> TgtOffsets;
1524 
1525   PrivateArgumentManagerTy PrivateArgumentManager(Device, AsyncInfo);
1526 
1527   int Ret;
1528   if (ArgNum) {
1529     // Process data, such as data mapping, before launching the kernel
1530     Ret = processDataBefore(loc, DeviceId, HostPtr, ArgNum, ArgBases, Args,
1531                             ArgSizes, ArgTypes, ArgNames, ArgMappers, TgtArgs,
1532                             TgtOffsets, PrivateArgumentManager, AsyncInfo);
1533     if (Ret != OFFLOAD_SUCCESS) {
1534       REPORT("Failed to process data before launching the kernel.\n");
1535       return OFFLOAD_FAIL;
1536     }
1537   }
1538 
1539   // Launch device execution.
1540   void *TgtEntryPtr = TargetTable->EntriesBegin[TM->Index].addr;
1541   DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n",
1542      TargetTable->EntriesBegin[TM->Index].name, DPxPTR(TgtEntryPtr), TM->Index);
1543 
1544   {
1545     TIMESCOPE_WITH_NAME_AND_IDENT(
1546         IsTeamConstruct ? "runTargetTeamRegion" : "runTargetRegion", loc);
1547     if (IsTeamConstruct)
1548       Ret = Device.runTeamRegion(TgtEntryPtr, &TgtArgs[0], &TgtOffsets[0],
1549                                  TgtArgs.size(), TeamNum, ThreadLimit,
1550                                  getLoopTripCount(DeviceId), AsyncInfo);
1551     else
1552       Ret = Device.runRegion(TgtEntryPtr, &TgtArgs[0], &TgtOffsets[0],
1553                              TgtArgs.size(), AsyncInfo);
1554   }
1555 
1556   if (Ret != OFFLOAD_SUCCESS) {
1557     REPORT("Executing target region abort target.\n");
1558     return OFFLOAD_FAIL;
1559   }
1560 
1561   if (ArgNum) {
1562     // Transfer data back and deallocate target memory for (first-)private
1563     // variables
1564     Ret = processDataAfter(loc, DeviceId, HostPtr, ArgNum, ArgBases, Args,
1565                            ArgSizes, ArgTypes, ArgNames, ArgMappers,
1566                            PrivateArgumentManager, AsyncInfo);
1567     if (Ret != OFFLOAD_SUCCESS) {
1568       REPORT("Failed to process data after launching the kernel.\n");
1569       return OFFLOAD_FAIL;
1570     }
1571   }
1572 
1573   return OFFLOAD_SUCCESS;
1574 }
1575