1 //===------ omptarget.cpp - Target independent OpenMP target RTL -- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // Implementation of the interface to be used by Clang during the codegen of a
10 // target region.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "omptarget.h"
15 #include "device.h"
16 #include "private.h"
17 #include "rtl.h"
18 
19 #include <cassert>
20 #include <cstdint>
21 #include <vector>
22 
23 int AsyncInfoTy::synchronize() {
24   int Result = OFFLOAD_SUCCESS;
25   if (AsyncInfo.Queue) {
26     // If we have a queue we need to synchronize it now.
27     Result = Device.synchronize(*this);
28     assert(AsyncInfo.Queue == nullptr &&
29            "The device plugin should have nulled the queue to indicate there "
30            "are no outstanding actions!");
31   }
32   return Result;
33 }
34 
35 void *&AsyncInfoTy::getVoidPtrLocation() {
36   BufferLocations.push_back(nullptr);
37   return BufferLocations.back();
38 }
39 
40 /* All begin addresses for partially mapped structs must be 8-aligned in order
41  * to ensure proper alignment of members. E.g.
42  *
43  * struct S {
44  *   int a;   // 4-aligned
45  *   int b;   // 4-aligned
46  *   int *p;  // 8-aligned
47  * } s1;
48  * ...
49  * #pragma omp target map(tofrom: s1.b, s1.p[0:N])
50  * {
51  *   s1.b = 5;
52  *   for (int i...) s1.p[i] = ...;
53  * }
54  *
55  * Here we are mapping s1 starting from member b, so BaseAddress=&s1=&s1.a and
56  * BeginAddress=&s1.b. Let's assume that the struct begins at address 0x100,
57  * then &s1.a=0x100, &s1.b=0x104, &s1.p=0x108. Each member obeys the alignment
58  * requirements for its type. Now, when we allocate memory on the device, in
59  * CUDA's case cuMemAlloc() returns an address which is at least 256-aligned.
60  * This means that the chunk of the struct on the device will start at a
61  * 256-aligned address, let's say 0x200. Then the address of b will be 0x200 and
62  * address of p will be a misaligned 0x204 (on the host there was no need to add
63  * padding between b and p, so p comes exactly 4 bytes after b). If the device
64  * kernel tries to access s1.p, a misaligned address error occurs (as reported
65  * by the CUDA plugin). By padding the begin address down to a multiple of 8 and
66  * extending the size of the allocated chuck accordingly, the chuck on the
67  * device will start at 0x200 with the padding (4 bytes), then &s1.b=0x204 and
68  * &s1.p=0x208, as they should be to satisfy the alignment requirements.
69  */
70 static const int64_t Alignment = 8;
71 
72 /// Map global data and execute pending ctors
73 static int InitLibrary(DeviceTy &Device) {
74   /*
75    * Map global data
76    */
77   int32_t device_id = Device.DeviceID;
78   int rc = OFFLOAD_SUCCESS;
79   bool supportsEmptyImages = Device.RTL->supports_empty_images &&
80                              Device.RTL->supports_empty_images() > 0;
81 
82   std::lock_guard<decltype(Device.PendingGlobalsMtx)> LG(
83       Device.PendingGlobalsMtx);
84   {
85     std::lock_guard<decltype(PM->TrlTblMtx)> LG(PM->TrlTblMtx);
86     for (auto *HostEntriesBegin : PM->HostEntriesBeginRegistrationOrder) {
87       TranslationTable *TransTable =
88           &PM->HostEntriesBeginToTransTable[HostEntriesBegin];
89       if (TransTable->HostTable.EntriesBegin ==
90               TransTable->HostTable.EntriesEnd &&
91           !supportsEmptyImages) {
92         // No host entry so no need to proceed
93         continue;
94       }
95 
96       if (TransTable->TargetsTable[device_id] != 0) {
97         // Library entries have already been processed
98         continue;
99       }
100 
101       // 1) get image.
102       assert(TransTable->TargetsImages.size() > (size_t)device_id &&
103              "Not expecting a device ID outside the table's bounds!");
104       __tgt_device_image *img = TransTable->TargetsImages[device_id];
105       if (!img) {
106         REPORT("No image loaded for device id %d.\n", device_id);
107         rc = OFFLOAD_FAIL;
108         break;
109       }
110       // 2) load image into the target table.
111       __tgt_target_table *TargetTable = TransTable->TargetsTable[device_id] =
112           Device.load_binary(img);
113       // Unable to get table for this image: invalidate image and fail.
114       if (!TargetTable) {
115         REPORT("Unable to generate entries table for device id %d.\n",
116                device_id);
117         TransTable->TargetsImages[device_id] = 0;
118         rc = OFFLOAD_FAIL;
119         break;
120       }
121 
122       // Verify whether the two table sizes match.
123       size_t hsize =
124           TransTable->HostTable.EntriesEnd - TransTable->HostTable.EntriesBegin;
125       size_t tsize = TargetTable->EntriesEnd - TargetTable->EntriesBegin;
126 
127       // Invalid image for these host entries!
128       if (hsize != tsize) {
129         REPORT(
130             "Host and Target tables mismatch for device id %d [%zx != %zx].\n",
131             device_id, hsize, tsize);
132         TransTable->TargetsImages[device_id] = 0;
133         TransTable->TargetsTable[device_id] = 0;
134         rc = OFFLOAD_FAIL;
135         break;
136       }
137 
138       // process global data that needs to be mapped.
139       std::lock_guard<decltype(Device.DataMapMtx)> LG(Device.DataMapMtx);
140 
141       __tgt_target_table *HostTable = &TransTable->HostTable;
142       for (__tgt_offload_entry *CurrDeviceEntry = TargetTable->EntriesBegin,
143                                *CurrHostEntry = HostTable->EntriesBegin,
144                                *EntryDeviceEnd = TargetTable->EntriesEnd;
145            CurrDeviceEntry != EntryDeviceEnd;
146            CurrDeviceEntry++, CurrHostEntry++) {
147         if (CurrDeviceEntry->size != 0) {
148           // has data.
149           assert(CurrDeviceEntry->size == CurrHostEntry->size &&
150                  "data size mismatch");
151 
152           // Fortran may use multiple weak declarations for the same symbol,
153           // therefore we must allow for multiple weak symbols to be loaded from
154           // the fat binary. Treat these mappings as any other "regular"
155           // mapping. Add entry to map.
156           if (Device.getTgtPtrBegin(CurrHostEntry->addr, CurrHostEntry->size))
157             continue;
158           DP("Add mapping from host " DPxMOD " to device " DPxMOD
159              " with size %zu"
160              "\n",
161              DPxPTR(CurrHostEntry->addr), DPxPTR(CurrDeviceEntry->addr),
162              CurrDeviceEntry->size);
163           Device.HostDataToTargetMap.emplace(
164               (uintptr_t)CurrHostEntry->addr /*HstPtrBase*/,
165               (uintptr_t)CurrHostEntry->addr /*HstPtrBegin*/,
166               (uintptr_t)CurrHostEntry->addr +
167                   CurrHostEntry->size /*HstPtrEnd*/,
168               (uintptr_t)CurrDeviceEntry->addr /*TgtPtrBegin*/,
169               false /*UseHoldRefCount*/, nullptr /*Name*/,
170               true /*IsRefCountINF*/);
171         }
172       }
173     }
174   }
175 
176   if (rc != OFFLOAD_SUCCESS) {
177     return rc;
178   }
179 
180   /*
181    * Run ctors for static objects
182    */
183   if (!Device.PendingCtorsDtors.empty()) {
184     AsyncInfoTy AsyncInfo(Device);
185     // Call all ctors for all libraries registered so far
186     for (auto &lib : Device.PendingCtorsDtors) {
187       if (!lib.second.PendingCtors.empty()) {
188         DP("Has pending ctors... call now\n");
189         for (auto &entry : lib.second.PendingCtors) {
190           void *ctor = entry;
191           int rc =
192               target(nullptr, Device, ctor, 0, nullptr, nullptr, nullptr,
193                      nullptr, nullptr, nullptr, 1, 1, true /*team*/, AsyncInfo);
194           if (rc != OFFLOAD_SUCCESS) {
195             REPORT("Running ctor " DPxMOD " failed.\n", DPxPTR(ctor));
196             return OFFLOAD_FAIL;
197           }
198         }
199         // Clear the list to indicate that this device has been used
200         lib.second.PendingCtors.clear();
201         DP("Done with pending ctors for lib " DPxMOD "\n", DPxPTR(lib.first));
202       }
203     }
204     // All constructors have been issued, wait for them now.
205     if (AsyncInfo.synchronize() != OFFLOAD_SUCCESS)
206       return OFFLOAD_FAIL;
207   }
208   Device.HasPendingGlobals = false;
209 
210   return OFFLOAD_SUCCESS;
211 }
212 
213 void handleTargetOutcome(bool Success, ident_t *Loc) {
214   switch (PM->TargetOffloadPolicy) {
215   case tgt_disabled:
216     if (Success) {
217       FATAL_MESSAGE0(1, "expected no offloading while offloading is disabled");
218     }
219     break;
220   case tgt_default:
221     FATAL_MESSAGE0(1, "default offloading policy must be switched to "
222                       "mandatory or disabled");
223     break;
224   case tgt_mandatory:
225     if (!Success) {
226       if (getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE)
227         for (auto &Device : PM->Devices)
228           dumpTargetPointerMappings(Loc, *Device);
229       else
230         FAILURE_MESSAGE("Consult https://openmp.llvm.org/design/Runtimes.html "
231                         "for debugging options.\n");
232 
233       SourceInfo info(Loc);
234       if (info.isAvailible())
235         fprintf(stderr, "%s:%d:%d: ", info.getFilename(), info.getLine(),
236                 info.getColumn());
237       else
238         FAILURE_MESSAGE("Source location information not present. Compile with "
239                         "-g or -gline-tables-only.\n");
240       FATAL_MESSAGE0(
241           1, "failure of target construct while offloading is mandatory");
242     } else {
243       if (getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE)
244         for (auto &Device : PM->Devices)
245           dumpTargetPointerMappings(Loc, *Device);
246     }
247     break;
248   }
249 }
250 
251 static void handleDefaultTargetOffload() {
252   std::lock_guard<decltype(PM->TargetOffloadMtx)> LG(PM->TargetOffloadMtx);
253   if (PM->TargetOffloadPolicy == tgt_default) {
254     if (omp_get_num_devices() > 0) {
255       DP("Default TARGET OFFLOAD policy is now mandatory "
256          "(devices were found)\n");
257       PM->TargetOffloadPolicy = tgt_mandatory;
258     } else {
259       DP("Default TARGET OFFLOAD policy is now disabled "
260          "(no devices were found)\n");
261       PM->TargetOffloadPolicy = tgt_disabled;
262     }
263   }
264 }
265 
266 static bool isOffloadDisabled() {
267   if (PM->TargetOffloadPolicy == tgt_default)
268     handleDefaultTargetOffload();
269   return PM->TargetOffloadPolicy == tgt_disabled;
270 }
271 
272 // If offload is enabled, ensure that device DeviceID has been initialized,
273 // global ctors have been executed, and global data has been mapped.
274 //
275 // The return bool indicates if the offload is to the host device
276 // There are three possible results:
277 // - Return false if the taregt device is ready for offload
278 // - Return true without reporting a runtime error if offload is
279 //   disabled, perhaps because the initial device was specified.
280 // - Report a runtime error and return true.
281 //
282 // If DeviceID == OFFLOAD_DEVICE_DEFAULT, set DeviceID to the default device.
283 // This step might be skipped if offload is disabled.
284 bool checkDeviceAndCtors(int64_t &DeviceID, ident_t *Loc) {
285   if (isOffloadDisabled()) {
286     DP("Offload is disabled\n");
287     return true;
288   }
289 
290   if (DeviceID == OFFLOAD_DEVICE_DEFAULT) {
291     DeviceID = omp_get_default_device();
292     DP("Use default device id %" PRId64 "\n", DeviceID);
293   }
294 
295   // Proposed behavior for OpenMP 5.2 in OpenMP spec github issue 2669.
296   if (omp_get_num_devices() == 0) {
297     DP("omp_get_num_devices() == 0 but offload is manadatory\n");
298     handleTargetOutcome(false, Loc);
299     return true;
300   }
301 
302   if (DeviceID == omp_get_initial_device()) {
303     DP("Device is host (%" PRId64 "), returning as if offload is disabled\n",
304        DeviceID);
305     return true;
306   }
307 
308   // Is device ready?
309   if (!device_is_ready(DeviceID)) {
310     REPORT("Device %" PRId64 " is not ready.\n", DeviceID);
311     handleTargetOutcome(false, Loc);
312     return true;
313   }
314 
315   // Get device info.
316   DeviceTy &Device = *PM->Devices[DeviceID];
317 
318   // Check whether global data has been mapped for this device
319   bool HasPendingGlobals;
320   {
321     std::lock_guard<decltype(Device.PendingGlobalsMtx)> LG(
322         Device.PendingGlobalsMtx);
323     HasPendingGlobals = Device.HasPendingGlobals;
324   }
325   if (HasPendingGlobals && InitLibrary(Device) != OFFLOAD_SUCCESS) {
326     REPORT("Failed to init globals on device %" PRId64 "\n", DeviceID);
327     handleTargetOutcome(false, Loc);
328     return true;
329   }
330 
331   return false;
332 }
333 
334 static int32_t getParentIndex(int64_t type) {
335   return ((type & OMP_TGT_MAPTYPE_MEMBER_OF) >> 48) - 1;
336 }
337 
338 void *targetAllocExplicit(size_t size, int device_num, int kind,
339                           const char *name) {
340   TIMESCOPE();
341   DP("Call to %s for device %d requesting %zu bytes\n", name, device_num, size);
342 
343   if (size <= 0) {
344     DP("Call to %s with non-positive length\n", name);
345     return NULL;
346   }
347 
348   void *rc = NULL;
349 
350   if (device_num == omp_get_initial_device()) {
351     rc = malloc(size);
352     DP("%s returns host ptr " DPxMOD "\n", name, DPxPTR(rc));
353     return rc;
354   }
355 
356   if (!device_is_ready(device_num)) {
357     DP("%s returns NULL ptr\n", name);
358     return NULL;
359   }
360 
361   DeviceTy &Device = *PM->Devices[device_num];
362   rc = Device.allocData(size, nullptr, kind);
363   DP("%s returns device ptr " DPxMOD "\n", name, DPxPTR(rc));
364   return rc;
365 }
366 
367 /// Call the user-defined mapper function followed by the appropriate
368 // targetData* function (targetData{Begin,End,Update}).
369 int targetDataMapper(ident_t *loc, DeviceTy &Device, void *arg_base, void *arg,
370                      int64_t arg_size, int64_t arg_type,
371                      map_var_info_t arg_names, void *arg_mapper,
372                      AsyncInfoTy &AsyncInfo,
373                      TargetDataFuncPtrTy target_data_function) {
374   TIMESCOPE_WITH_IDENT(loc);
375   DP("Calling the mapper function " DPxMOD "\n", DPxPTR(arg_mapper));
376 
377   // The mapper function fills up Components.
378   MapperComponentsTy MapperComponents;
379   MapperFuncPtrTy MapperFuncPtr = (MapperFuncPtrTy)(arg_mapper);
380   (*MapperFuncPtr)((void *)&MapperComponents, arg_base, arg, arg_size, arg_type,
381                    arg_names);
382 
383   // Construct new arrays for args_base, args, arg_sizes and arg_types
384   // using the information in MapperComponents and call the corresponding
385   // targetData* function using these new arrays.
386   std::vector<void *> MapperArgsBase(MapperComponents.Components.size());
387   std::vector<void *> MapperArgs(MapperComponents.Components.size());
388   std::vector<int64_t> MapperArgSizes(MapperComponents.Components.size());
389   std::vector<int64_t> MapperArgTypes(MapperComponents.Components.size());
390   std::vector<void *> MapperArgNames(MapperComponents.Components.size());
391 
392   for (unsigned I = 0, E = MapperComponents.Components.size(); I < E; ++I) {
393     auto &C = MapperComponents.Components[I];
394     MapperArgsBase[I] = C.Base;
395     MapperArgs[I] = C.Begin;
396     MapperArgSizes[I] = C.Size;
397     MapperArgTypes[I] = C.Type;
398     MapperArgNames[I] = C.Name;
399   }
400 
401   int rc = target_data_function(loc, Device, MapperComponents.Components.size(),
402                                 MapperArgsBase.data(), MapperArgs.data(),
403                                 MapperArgSizes.data(), MapperArgTypes.data(),
404                                 MapperArgNames.data(), /*arg_mappers*/ nullptr,
405                                 AsyncInfo, /*FromMapper=*/true);
406 
407   return rc;
408 }
409 
410 /// Internal function to do the mapping and transfer the data to the device
411 int targetDataBegin(ident_t *loc, DeviceTy &Device, int32_t arg_num,
412                     void **args_base, void **args, int64_t *arg_sizes,
413                     int64_t *arg_types, map_var_info_t *arg_names,
414                     void **arg_mappers, AsyncInfoTy &AsyncInfo,
415                     bool FromMapper) {
416   // process each input.
417   for (int32_t i = 0; i < arg_num; ++i) {
418     // Ignore private variables and arrays - there is no mapping for them.
419     if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) ||
420         (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE))
421       continue;
422 
423     if (arg_mappers && arg_mappers[i]) {
424       // Instead of executing the regular path of targetDataBegin, call the
425       // targetDataMapper variant which will call targetDataBegin again
426       // with new arguments.
427       DP("Calling targetDataMapper for the %dth argument\n", i);
428 
429       map_var_info_t arg_name = (!arg_names) ? nullptr : arg_names[i];
430       int rc = targetDataMapper(loc, Device, args_base[i], args[i],
431                                 arg_sizes[i], arg_types[i], arg_name,
432                                 arg_mappers[i], AsyncInfo, targetDataBegin);
433 
434       if (rc != OFFLOAD_SUCCESS) {
435         REPORT("Call to targetDataBegin via targetDataMapper for custom mapper"
436                " failed.\n");
437         return OFFLOAD_FAIL;
438       }
439 
440       // Skip the rest of this function, continue to the next argument.
441       continue;
442     }
443 
444     void *HstPtrBegin = args[i];
445     void *HstPtrBase = args_base[i];
446     int64_t data_size = arg_sizes[i];
447     map_var_info_t HstPtrName = (!arg_names) ? nullptr : arg_names[i];
448 
449     // Adjust for proper alignment if this is a combined entry (for structs).
450     // Look at the next argument - if that is MEMBER_OF this one, then this one
451     // is a combined entry.
452     int64_t padding = 0;
453     const int next_i = i + 1;
454     if (getParentIndex(arg_types[i]) < 0 && next_i < arg_num &&
455         getParentIndex(arg_types[next_i]) == i) {
456       padding = (int64_t)HstPtrBegin % Alignment;
457       if (padding) {
458         DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD
459            "\n",
460            padding, DPxPTR(HstPtrBegin));
461         HstPtrBegin = (char *)HstPtrBegin - padding;
462         data_size += padding;
463       }
464     }
465 
466     // Address of pointer on the host and device, respectively.
467     void *Pointer_HstPtrBegin, *PointerTgtPtrBegin;
468     TargetPointerResultTy Pointer_TPR;
469     bool IsHostPtr = false;
470     bool IsImplicit = arg_types[i] & OMP_TGT_MAPTYPE_IMPLICIT;
471     // Force the creation of a device side copy of the data when:
472     // a close map modifier was associated with a map that contained a to.
473     bool HasCloseModifier = arg_types[i] & OMP_TGT_MAPTYPE_CLOSE;
474     bool HasPresentModifier = arg_types[i] & OMP_TGT_MAPTYPE_PRESENT;
475     bool HasHoldModifier = arg_types[i] & OMP_TGT_MAPTYPE_OMPX_HOLD;
476     // UpdateRef is based on MEMBER_OF instead of TARGET_PARAM because if we
477     // have reached this point via __tgt_target_data_begin and not __tgt_target
478     // then no argument is marked as TARGET_PARAM ("omp target data map" is not
479     // associated with a target region, so there are no target parameters). This
480     // may be considered a hack, we could revise the scheme in the future.
481     bool UpdateRef =
482         !(arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) && !(FromMapper && i == 0);
483     if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) {
484       DP("Has a pointer entry: \n");
485       // Base is address of pointer.
486       //
487       // Usually, the pointer is already allocated by this time.  For example:
488       //
489       //   #pragma omp target map(s.p[0:N])
490       //
491       // The map entry for s comes first, and the PTR_AND_OBJ entry comes
492       // afterward, so the pointer is already allocated by the time the
493       // PTR_AND_OBJ entry is handled below, and PointerTgtPtrBegin is thus
494       // non-null.  However, "declare target link" can produce a PTR_AND_OBJ
495       // entry for a global that might not already be allocated by the time the
496       // PTR_AND_OBJ entry is handled below, and so the allocation might fail
497       // when HasPresentModifier.
498       Pointer_TPR = Device.getTargetPointer(
499           HstPtrBase, HstPtrBase, sizeof(void *), /*HstPtrName=*/nullptr,
500           /*HasFlagTo=*/false, /*HasFlagAlways=*/false, IsImplicit, UpdateRef,
501           HasCloseModifier, HasPresentModifier, HasHoldModifier, AsyncInfo);
502       PointerTgtPtrBegin = Pointer_TPR.TargetPointer;
503       IsHostPtr = Pointer_TPR.Flags.IsHostPointer;
504       if (!PointerTgtPtrBegin) {
505         REPORT("Call to getTargetPointer returned null pointer (%s).\n",
506                HasPresentModifier ? "'present' map type modifier"
507                                   : "device failure or illegal mapping");
508         return OFFLOAD_FAIL;
509       }
510       DP("There are %zu bytes allocated at target address " DPxMOD " - is%s new"
511          "\n",
512          sizeof(void *), DPxPTR(PointerTgtPtrBegin),
513          (Pointer_TPR.Flags.IsNewEntry ? "" : " not"));
514       Pointer_HstPtrBegin = HstPtrBase;
515       // modify current entry.
516       HstPtrBase = *(void **)HstPtrBase;
517       // No need to update pointee ref count for the first element of the
518       // subelement that comes from mapper.
519       UpdateRef =
520           (!FromMapper || i != 0); // subsequently update ref count of pointee
521     }
522 
523     const bool HasFlagTo = arg_types[i] & OMP_TGT_MAPTYPE_TO;
524     const bool HasFlagAlways = arg_types[i] & OMP_TGT_MAPTYPE_ALWAYS;
525     auto TPR = Device.getTargetPointer(
526         HstPtrBegin, HstPtrBase, data_size, HstPtrName, HasFlagTo,
527         HasFlagAlways, IsImplicit, UpdateRef, HasCloseModifier,
528         HasPresentModifier, HasHoldModifier, AsyncInfo);
529     void *TgtPtrBegin = TPR.TargetPointer;
530     IsHostPtr = TPR.Flags.IsHostPointer;
531     // If data_size==0, then the argument could be a zero-length pointer to
532     // NULL, so getOrAlloc() returning NULL is not an error.
533     if (!TgtPtrBegin && (data_size || HasPresentModifier)) {
534       REPORT("Call to getTargetPointer returned null pointer (%s).\n",
535              HasPresentModifier ? "'present' map type modifier"
536                                 : "device failure or illegal mapping");
537       return OFFLOAD_FAIL;
538     }
539     DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
540        " - is%s new\n",
541        data_size, DPxPTR(TgtPtrBegin), (TPR.Flags.IsNewEntry ? "" : " not"));
542 
543     if (arg_types[i] & OMP_TGT_MAPTYPE_RETURN_PARAM) {
544       uintptr_t Delta = (uintptr_t)HstPtrBegin - (uintptr_t)HstPtrBase;
545       void *TgtPtrBase = (void *)((uintptr_t)TgtPtrBegin - Delta);
546       DP("Returning device pointer " DPxMOD "\n", DPxPTR(TgtPtrBase));
547       args_base[i] = TgtPtrBase;
548     }
549 
550     if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ && !IsHostPtr) {
551       // Check whether we need to update the pointer on the device
552       bool UpdateDevPtr = false;
553 
554       uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
555       void *ExpectedTgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta);
556 
557       Device.ShadowMtx.lock();
558       auto Entry = Device.ShadowPtrMap.find(Pointer_HstPtrBegin);
559       // If this pointer is not in the map we need to insert it. If the map
560       // contains a stale entry, we need to update it (e.g. if the pointee was
561       // deallocated and later on is reallocated at another device address). The
562       // latter scenario is the subject of LIT test env/base_ptr_ref_count.c. An
563       // entry is removed from ShadowPtrMap only when the PTR of a PTR_AND_OBJ
564       // pair is deallocated, not when the OBJ is deallocated. In
565       // env/base_ptr_ref_count.c the PTR is a global "declare target" pointer,
566       // so it stays in the map for the lifetime of the application. When the
567       // OBJ is deallocated and later on allocated again (at a different device
568       // address), ShadowPtrMap still contains an entry for Pointer_HstPtrBegin
569       // which is stale, pointing to the old ExpectedTgtPtrBase of the OBJ.
570       if (Entry == Device.ShadowPtrMap.end() ||
571           Entry->second.TgtPtrVal != ExpectedTgtPtrBase) {
572         // create or update shadow pointers for this entry
573         Device.ShadowPtrMap[Pointer_HstPtrBegin] = {
574             HstPtrBase, PointerTgtPtrBegin, ExpectedTgtPtrBase};
575         Pointer_TPR.MapTableEntry->setMayContainAttachedPointers();
576         UpdateDevPtr = true;
577       }
578 
579       if (UpdateDevPtr) {
580         std::lock_guard<decltype(*Pointer_TPR.MapTableEntry)> LG(
581             *Pointer_TPR.MapTableEntry);
582         Device.ShadowMtx.unlock();
583 
584         DP("Update pointer (" DPxMOD ") -> [" DPxMOD "]\n",
585            DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));
586 
587         void *&TgtPtrBase = AsyncInfo.getVoidPtrLocation();
588         TgtPtrBase = ExpectedTgtPtrBase;
589 
590         int Ret = Device.submitData(PointerTgtPtrBegin, &TgtPtrBase,
591                                     sizeof(void *), AsyncInfo);
592         if (Ret != OFFLOAD_SUCCESS) {
593           REPORT("Copying data to device failed.\n");
594           return OFFLOAD_FAIL;
595         }
596         if (Pointer_TPR.MapTableEntry->addEventIfNecessary(Device, AsyncInfo) !=
597             OFFLOAD_SUCCESS)
598           return OFFLOAD_FAIL;
599       } else
600         Device.ShadowMtx.unlock();
601     }
602   }
603 
604   return OFFLOAD_SUCCESS;
605 }
606 
607 namespace {
608 /// This structure contains information to deallocate a target pointer, aka.
609 /// used to call the function \p DeviceTy::deallocTgtPtr.
610 struct DeallocTgtPtrInfo {
611   /// Host pointer used to look up into the map table
612   void *HstPtrBegin;
613   /// Size of the data
614   int64_t DataSize;
615   /// Whether it has \p ompx_hold modifier
616   bool HasHoldModifier;
617 
618   DeallocTgtPtrInfo(void *HstPtr, int64_t Size, bool HasHoldModifier)
619       : HstPtrBegin(HstPtr), DataSize(Size), HasHoldModifier(HasHoldModifier) {}
620 };
621 
622 /// Apply \p CB to the shadow map pointer entries in the range \p Begin, to
623 /// \p Begin + \p Size. \p CB is called with a locked shadow pointer map and the
624 /// passed iterator can be updated. If the callback returns OFFLOAD_FAIL the
625 /// rest of the map is not checked anymore.
626 template <typename CBTy>
627 static void applyToShadowMapEntries(DeviceTy &Device, CBTy CB, void *Begin,
628                                     uintptr_t Size,
629                                     const TargetPointerResultTy &TPR) {
630   // If we have an object that is too small to hold a pointer subobject, no need
631   // to do any checking.
632   if (Size < sizeof(void *))
633     return;
634 
635   // If the map entry for the object was never marked as containing attached
636   // pointers, no need to do any checking.
637   if (TPR.MapTableEntry == HostDataToTargetListTy::iterator{} ||
638       !TPR.MapTableEntry->getMayContainAttachedPointers())
639     return;
640 
641   uintptr_t LB = (uintptr_t)Begin;
642   uintptr_t UB = LB + Size;
643   // Now we are looking into the shadow map so we need to lock it.
644   std::lock_guard<decltype(Device.ShadowMtx)> LG(Device.ShadowMtx);
645   for (ShadowPtrListTy::iterator Itr = Device.ShadowPtrMap.begin();
646        Itr != Device.ShadowPtrMap.end();) {
647     uintptr_t ShadowHstPtrAddr = (uintptr_t)Itr->first;
648 
649     // An STL map is sorted on its keys; use this property
650     // to quickly determine when to break out of the loop.
651     if (ShadowHstPtrAddr < LB) {
652       ++Itr;
653       continue;
654     }
655     if (ShadowHstPtrAddr >= UB)
656       break;
657 
658     if (CB(Itr) == OFFLOAD_FAIL)
659       break;
660   }
661 }
662 
663 } // namespace
664 
665 /// Internal function to undo the mapping and retrieve the data from the device.
666 int targetDataEnd(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
667                   void **ArgBases, void **Args, int64_t *ArgSizes,
668                   int64_t *ArgTypes, map_var_info_t *ArgNames,
669                   void **ArgMappers, AsyncInfoTy &AsyncInfo, bool FromMapper) {
670   int Ret;
671   std::vector<DeallocTgtPtrInfo> DeallocTgtPtrs;
672   void *FromMapperBase = nullptr;
673   // process each input.
674   for (int32_t I = ArgNum - 1; I >= 0; --I) {
675     // Ignore private variables and arrays - there is no mapping for them.
676     // Also, ignore the use_device_ptr directive, it has no effect here.
677     if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
678         (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
679       continue;
680 
681     if (ArgMappers && ArgMappers[I]) {
682       // Instead of executing the regular path of targetDataEnd, call the
683       // targetDataMapper variant which will call targetDataEnd again
684       // with new arguments.
685       DP("Calling targetDataMapper for the %dth argument\n", I);
686 
687       map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
688       Ret = targetDataMapper(loc, Device, ArgBases[I], Args[I], ArgSizes[I],
689                              ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
690                              targetDataEnd);
691 
692       if (Ret != OFFLOAD_SUCCESS) {
693         REPORT("Call to targetDataEnd via targetDataMapper for custom mapper"
694                " failed.\n");
695         return OFFLOAD_FAIL;
696       }
697 
698       // Skip the rest of this function, continue to the next argument.
699       continue;
700     }
701 
702     void *HstPtrBegin = Args[I];
703     int64_t DataSize = ArgSizes[I];
704     // Adjust for proper alignment if this is a combined entry (for structs).
705     // Look at the next argument - if that is MEMBER_OF this one, then this one
706     // is a combined entry.
707     const int NextI = I + 1;
708     if (getParentIndex(ArgTypes[I]) < 0 && NextI < ArgNum &&
709         getParentIndex(ArgTypes[NextI]) == I) {
710       int64_t Padding = (int64_t)HstPtrBegin % Alignment;
711       if (Padding) {
712         DP("Using a Padding of %" PRId64 " bytes for begin address " DPxMOD
713            "\n",
714            Padding, DPxPTR(HstPtrBegin));
715         HstPtrBegin = (char *)HstPtrBegin - Padding;
716         DataSize += Padding;
717       }
718     }
719 
720     bool IsLast, IsHostPtr;
721     bool IsImplicit = ArgTypes[I] & OMP_TGT_MAPTYPE_IMPLICIT;
722     bool UpdateRef = (!(ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) ||
723                       (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) &&
724                      !(FromMapper && I == 0);
725     bool ForceDelete = ArgTypes[I] & OMP_TGT_MAPTYPE_DELETE;
726     bool HasPresentModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_PRESENT;
727     bool HasHoldModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_OMPX_HOLD;
728 
729     // If PTR_AND_OBJ, HstPtrBegin is address of pointee
730     TargetPointerResultTy TPR = Device.getTgtPtrBegin(
731         HstPtrBegin, DataSize, IsLast, UpdateRef, HasHoldModifier, IsHostPtr,
732         !IsImplicit, ForceDelete);
733     void *TgtPtrBegin = TPR.TargetPointer;
734     if (!TgtPtrBegin && (DataSize || HasPresentModifier)) {
735       DP("Mapping does not exist (%s)\n",
736          (HasPresentModifier ? "'present' map type modifier" : "ignored"));
737       if (HasPresentModifier) {
738         // OpenMP 5.1, sec. 2.21.7.1 "map Clause", p. 350 L10-13:
739         // "If a map clause appears on a target, target data, target enter data
740         // or target exit data construct with a present map-type-modifier then
741         // on entry to the region if the corresponding list item does not appear
742         // in the device data environment then an error occurs and the program
743         // terminates."
744         //
745         // This should be an error upon entering an "omp target exit data".  It
746         // should not be an error upon exiting an "omp target data" or "omp
747         // target".  For "omp target data", Clang thus doesn't include present
748         // modifiers for end calls.  For "omp target", we have not found a valid
749         // OpenMP program for which the error matters: it appears that, if a
750         // program can guarantee that data is present at the beginning of an
751         // "omp target" region so that there's no error there, that data is also
752         // guaranteed to be present at the end.
753         MESSAGE("device mapping required by 'present' map type modifier does "
754                 "not exist for host address " DPxMOD " (%" PRId64 " bytes)",
755                 DPxPTR(HstPtrBegin), DataSize);
756         return OFFLOAD_FAIL;
757       }
758     } else {
759       DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
760          " - is%s last\n",
761          DataSize, DPxPTR(TgtPtrBegin), (IsLast ? "" : " not"));
762     }
763 
764     // OpenMP 5.1, sec. 2.21.7.1 "map Clause", p. 351 L14-16:
765     // "If the map clause appears on a target, target data, or target exit data
766     // construct and a corresponding list item of the original list item is not
767     // present in the device data environment on exit from the region then the
768     // list item is ignored."
769     if (!TgtPtrBegin)
770       continue;
771 
772     bool DelEntry = IsLast;
773 
774     // If the last element from the mapper (for end transfer args comes in
775     // reverse order), do not remove the partial entry, the parent struct still
776     // exists.
777     if ((ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
778         !(ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) {
779       DelEntry = false; // protect parent struct from being deallocated
780     }
781 
782     if ((ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) || DelEntry) {
783       // Move data back to the host
784       if (ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) {
785         bool Always = ArgTypes[I] & OMP_TGT_MAPTYPE_ALWAYS;
786         if ((Always || IsLast) && !IsHostPtr) {
787           DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
788              DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
789           Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, DataSize,
790                                     AsyncInfo);
791           if (Ret != OFFLOAD_SUCCESS) {
792             REPORT("Copying data from device failed.\n");
793             return OFFLOAD_FAIL;
794           }
795         }
796       }
797       if (DelEntry && FromMapper && I == 0) {
798         DelEntry = false;
799         FromMapperBase = HstPtrBegin;
800       }
801 
802       // If we copied back to the host a struct/array containing pointers, we
803       // need to restore the original host pointer values from their shadow
804       // copies. If the struct is going to be deallocated, remove any remaining
805       // shadow pointer entries for this struct.
806       auto CB = [&](ShadowPtrListTy::iterator &Itr) {
807         // If we copied the struct to the host, we need to restore the pointer.
808         if (ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) {
809           void **ShadowHstPtrAddr = (void **)Itr->first;
810           // Wait for device-to-host memcopies for whole struct to complete,
811           // before restoring the correct host pointer.
812           if (AsyncInfo.synchronize() != OFFLOAD_SUCCESS)
813             return OFFLOAD_FAIL;
814           *ShadowHstPtrAddr = Itr->second.HstPtrVal;
815           DP("Restoring original host pointer value " DPxMOD " for host "
816              "pointer " DPxMOD "\n",
817              DPxPTR(Itr->second.HstPtrVal), DPxPTR(ShadowHstPtrAddr));
818         }
819         // If the struct is to be deallocated, remove the shadow entry.
820         if (DelEntry) {
821           DP("Removing shadow pointer " DPxMOD "\n",
822              DPxPTR((void **)Itr->first));
823           Itr = Device.ShadowPtrMap.erase(Itr);
824         } else {
825           ++Itr;
826         }
827         return OFFLOAD_SUCCESS;
828       };
829       applyToShadowMapEntries(Device, CB, HstPtrBegin, DataSize, TPR);
830 
831       // Add pointer to the buffer for later deallocation
832       if (DelEntry && !IsHostPtr)
833         DeallocTgtPtrs.emplace_back(HstPtrBegin, DataSize, HasHoldModifier);
834     }
835   }
836 
837   // TODO: We should not synchronize here but pass the AsyncInfo object to the
838   //       allocate/deallocate device APIs.
839   //
840   // We need to synchronize before deallocating data.
841   Ret = AsyncInfo.synchronize();
842   if (Ret != OFFLOAD_SUCCESS)
843     return OFFLOAD_FAIL;
844 
845   // Deallocate target pointer
846   for (DeallocTgtPtrInfo &Info : DeallocTgtPtrs) {
847     if (FromMapperBase && FromMapperBase == Info.HstPtrBegin)
848       continue;
849     Ret = Device.deallocTgtPtr(Info.HstPtrBegin, Info.DataSize,
850                                Info.HasHoldModifier);
851     if (Ret != OFFLOAD_SUCCESS) {
852       REPORT("Deallocating data from device failed.\n");
853       return OFFLOAD_FAIL;
854     }
855   }
856 
857   return OFFLOAD_SUCCESS;
858 }
859 
860 static int targetDataContiguous(ident_t *loc, DeviceTy &Device, void *ArgsBase,
861                                 void *HstPtrBegin, int64_t ArgSize,
862                                 int64_t ArgType, AsyncInfoTy &AsyncInfo) {
863   TIMESCOPE_WITH_IDENT(loc);
864   bool IsLast, IsHostPtr;
865   TargetPointerResultTy TPR = Device.getTgtPtrBegin(
866       HstPtrBegin, ArgSize, IsLast, /*UpdateRefCount=*/false,
867       /*UseHoldRefCount=*/false, IsHostPtr, /*MustContain=*/true);
868   void *TgtPtrBegin = TPR.TargetPointer;
869   if (!TgtPtrBegin) {
870     DP("hst data:" DPxMOD " not found, becomes a noop\n", DPxPTR(HstPtrBegin));
871     if (ArgType & OMP_TGT_MAPTYPE_PRESENT) {
872       MESSAGE("device mapping required by 'present' motion modifier does not "
873               "exist for host address " DPxMOD " (%" PRId64 " bytes)",
874               DPxPTR(HstPtrBegin), ArgSize);
875       return OFFLOAD_FAIL;
876     }
877     return OFFLOAD_SUCCESS;
878   }
879 
880   if (IsHostPtr) {
881     DP("hst data:" DPxMOD " unified and shared, becomes a noop\n",
882        DPxPTR(HstPtrBegin));
883     return OFFLOAD_SUCCESS;
884   }
885 
886   if (ArgType & OMP_TGT_MAPTYPE_FROM) {
887     DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
888        ArgSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
889     int Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, ArgSize, AsyncInfo);
890     if (Ret != OFFLOAD_SUCCESS) {
891       REPORT("Copying data from device failed.\n");
892       return OFFLOAD_FAIL;
893     }
894 
895     auto CB = [&](ShadowPtrListTy::iterator &Itr) {
896       void **ShadowHstPtrAddr = (void **)Itr->first;
897       // Wait for device-to-host memcopies for whole struct to complete,
898       // before restoring the correct host pointer.
899       if (AsyncInfo.synchronize() != OFFLOAD_SUCCESS)
900         return OFFLOAD_FAIL;
901       *ShadowHstPtrAddr = Itr->second.HstPtrVal;
902       DP("Restoring original host pointer value " DPxMOD
903          " for host pointer " DPxMOD "\n",
904          DPxPTR(Itr->second.HstPtrVal), DPxPTR(ShadowHstPtrAddr));
905       ++Itr;
906       return OFFLOAD_SUCCESS;
907     };
908     applyToShadowMapEntries(Device, CB, HstPtrBegin, ArgSize, TPR);
909   }
910 
911   if (ArgType & OMP_TGT_MAPTYPE_TO) {
912     DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
913        ArgSize, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
914     int Ret = Device.submitData(TgtPtrBegin, HstPtrBegin, ArgSize, AsyncInfo);
915     if (Ret != OFFLOAD_SUCCESS) {
916       REPORT("Copying data to device failed.\n");
917       return OFFLOAD_FAIL;
918     }
919 
920     auto CB = [&](ShadowPtrListTy::iterator &Itr) {
921       DP("Restoring original target pointer value " DPxMOD " for target "
922          "pointer " DPxMOD "\n",
923          DPxPTR(Itr->second.TgtPtrVal), DPxPTR(Itr->second.TgtPtrAddr));
924       Ret = Device.submitData(Itr->second.TgtPtrAddr, &Itr->second.TgtPtrVal,
925                               sizeof(void *), AsyncInfo);
926       if (Ret != OFFLOAD_SUCCESS)
927         REPORT("Copying data to device failed.\n");
928       ++Itr;
929       return Ret;
930     };
931     applyToShadowMapEntries(Device, CB, HstPtrBegin, ArgSize, TPR);
932   }
933   return OFFLOAD_SUCCESS;
934 }
935 
936 static int targetDataNonContiguous(ident_t *loc, DeviceTy &Device,
937                                    void *ArgsBase,
938                                    __tgt_target_non_contig *NonContig,
939                                    uint64_t Size, int64_t ArgType,
940                                    int CurrentDim, int DimSize, uint64_t Offset,
941                                    AsyncInfoTy &AsyncInfo) {
942   TIMESCOPE_WITH_IDENT(loc);
943   int Ret = OFFLOAD_SUCCESS;
944   if (CurrentDim < DimSize) {
945     for (unsigned int I = 0; I < NonContig[CurrentDim].Count; ++I) {
946       uint64_t CurOffset =
947           (NonContig[CurrentDim].Offset + I) * NonContig[CurrentDim].Stride;
948       // we only need to transfer the first element for the last dimension
949       // since we've already got a contiguous piece.
950       if (CurrentDim != DimSize - 1 || I == 0) {
951         Ret = targetDataNonContiguous(loc, Device, ArgsBase, NonContig, Size,
952                                       ArgType, CurrentDim + 1, DimSize,
953                                       Offset + CurOffset, AsyncInfo);
954         // Stop the whole process if any contiguous piece returns anything
955         // other than OFFLOAD_SUCCESS.
956         if (Ret != OFFLOAD_SUCCESS)
957           return Ret;
958       }
959     }
960   } else {
961     char *Ptr = (char *)ArgsBase + Offset;
962     DP("Transfer of non-contiguous : host ptr " DPxMOD " offset %" PRIu64
963        " len %" PRIu64 "\n",
964        DPxPTR(Ptr), Offset, Size);
965     Ret = targetDataContiguous(loc, Device, ArgsBase, Ptr, Size, ArgType,
966                                AsyncInfo);
967   }
968   return Ret;
969 }
970 
971 static int getNonContigMergedDimension(__tgt_target_non_contig *NonContig,
972                                        int32_t DimSize) {
973   int RemovedDim = 0;
974   for (int I = DimSize - 1; I > 0; --I) {
975     if (NonContig[I].Count * NonContig[I].Stride == NonContig[I - 1].Stride)
976       RemovedDim++;
977   }
978   return RemovedDim;
979 }
980 
981 /// Internal function to pass data to/from the target.
982 int targetDataUpdate(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
983                      void **ArgsBase, void **Args, int64_t *ArgSizes,
984                      int64_t *ArgTypes, map_var_info_t *ArgNames,
985                      void **ArgMappers, AsyncInfoTy &AsyncInfo, bool) {
986   // process each input.
987   for (int32_t I = 0; I < ArgNum; ++I) {
988     if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
989         (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
990       continue;
991 
992     if (ArgMappers && ArgMappers[I]) {
993       // Instead of executing the regular path of targetDataUpdate, call the
994       // targetDataMapper variant which will call targetDataUpdate again
995       // with new arguments.
996       DP("Calling targetDataMapper for the %dth argument\n", I);
997 
998       map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
999       int Ret = targetDataMapper(loc, Device, ArgsBase[I], Args[I], ArgSizes[I],
1000                                  ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
1001                                  targetDataUpdate);
1002 
1003       if (Ret != OFFLOAD_SUCCESS) {
1004         REPORT("Call to targetDataUpdate via targetDataMapper for custom mapper"
1005                " failed.\n");
1006         return OFFLOAD_FAIL;
1007       }
1008 
1009       // Skip the rest of this function, continue to the next argument.
1010       continue;
1011     }
1012 
1013     int Ret = OFFLOAD_SUCCESS;
1014 
1015     if (ArgTypes[I] & OMP_TGT_MAPTYPE_NON_CONTIG) {
1016       __tgt_target_non_contig *NonContig = (__tgt_target_non_contig *)Args[I];
1017       int32_t DimSize = ArgSizes[I];
1018       uint64_t Size =
1019           NonContig[DimSize - 1].Count * NonContig[DimSize - 1].Stride;
1020       int32_t MergedDim = getNonContigMergedDimension(NonContig, DimSize);
1021       Ret = targetDataNonContiguous(
1022           loc, Device, ArgsBase[I], NonContig, Size, ArgTypes[I],
1023           /*current_dim=*/0, DimSize - MergedDim, /*offset=*/0, AsyncInfo);
1024     } else {
1025       Ret = targetDataContiguous(loc, Device, ArgsBase[I], Args[I], ArgSizes[I],
1026                                  ArgTypes[I], AsyncInfo);
1027     }
1028     if (Ret == OFFLOAD_FAIL)
1029       return OFFLOAD_FAIL;
1030   }
1031   return OFFLOAD_SUCCESS;
1032 }
1033 
1034 static const unsigned LambdaMapping = OMP_TGT_MAPTYPE_PTR_AND_OBJ |
1035                                       OMP_TGT_MAPTYPE_LITERAL |
1036                                       OMP_TGT_MAPTYPE_IMPLICIT;
1037 static bool isLambdaMapping(int64_t Mapping) {
1038   return (Mapping & LambdaMapping) == LambdaMapping;
1039 }
1040 
1041 namespace {
1042 /// Find the table information in the map or look it up in the translation
1043 /// tables.
1044 TableMap *getTableMap(void *HostPtr) {
1045   std::lock_guard<std::mutex> TblMapLock(PM->TblMapMtx);
1046   HostPtrToTableMapTy::iterator TableMapIt =
1047       PM->HostPtrToTableMap.find(HostPtr);
1048 
1049   if (TableMapIt != PM->HostPtrToTableMap.end())
1050     return &TableMapIt->second;
1051 
1052   // We don't have a map. So search all the registered libraries.
1053   TableMap *TM = nullptr;
1054   std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
1055   for (HostEntriesBeginToTransTableTy::iterator Itr =
1056            PM->HostEntriesBeginToTransTable.begin();
1057        Itr != PM->HostEntriesBeginToTransTable.end(); ++Itr) {
1058     // get the translation table (which contains all the good info).
1059     TranslationTable *TransTable = &Itr->second;
1060     // iterate over all the host table entries to see if we can locate the
1061     // host_ptr.
1062     __tgt_offload_entry *Cur = TransTable->HostTable.EntriesBegin;
1063     for (uint32_t I = 0; Cur < TransTable->HostTable.EntriesEnd; ++Cur, ++I) {
1064       if (Cur->addr != HostPtr)
1065         continue;
1066       // we got a match, now fill the HostPtrToTableMap so that we
1067       // may avoid this search next time.
1068       TM = &(PM->HostPtrToTableMap)[HostPtr];
1069       TM->Table = TransTable;
1070       TM->Index = I;
1071       return TM;
1072     }
1073   }
1074 
1075   return nullptr;
1076 }
1077 
1078 /// Get loop trip count
1079 /// FIXME: This function will not work right if calling
1080 /// __kmpc_push_target_tripcount_mapper in one thread but doing offloading in
1081 /// another thread, which might occur when we call task yield.
1082 uint64_t getLoopTripCount(int64_t DeviceId) {
1083   DeviceTy &Device = *PM->Devices[DeviceId];
1084   uint64_t LoopTripCount = 0;
1085 
1086   {
1087     std::lock_guard<std::mutex> TblMapLock(PM->TblMapMtx);
1088     auto I = Device.LoopTripCnt.find(__kmpc_global_thread_num(NULL));
1089     if (I != Device.LoopTripCnt.end()) {
1090       LoopTripCount = I->second;
1091       Device.LoopTripCnt.erase(I);
1092       DP("loop trip count is %" PRIu64 ".\n", LoopTripCount);
1093     }
1094   }
1095 
1096   return LoopTripCount;
1097 }
1098 
1099 /// A class manages private arguments in a target region.
1100 class PrivateArgumentManagerTy {
1101   /// A data structure for the information of first-private arguments. We can
1102   /// use this information to optimize data transfer by packing all
1103   /// first-private arguments and transfer them all at once.
1104   struct FirstPrivateArgInfoTy {
1105     /// The index of the element in \p TgtArgs corresponding to the argument
1106     const int Index;
1107     /// Host pointer begin
1108     const char *HstPtrBegin;
1109     /// Host pointer end
1110     const char *HstPtrEnd;
1111     /// Aligned size
1112     const int64_t AlignedSize;
1113     /// Host pointer name
1114     const map_var_info_t HstPtrName = nullptr;
1115 
1116     FirstPrivateArgInfoTy(int Index, const void *HstPtr, int64_t Size,
1117                           const map_var_info_t HstPtrName = nullptr)
1118         : Index(Index), HstPtrBegin(reinterpret_cast<const char *>(HstPtr)),
1119           HstPtrEnd(HstPtrBegin + Size), AlignedSize(Size + Size % Alignment),
1120           HstPtrName(HstPtrName) {}
1121   };
1122 
1123   /// A vector of target pointers for all private arguments
1124   std::vector<void *> TgtPtrs;
1125 
1126   /// A vector of information of all first-private arguments to be packed
1127   std::vector<FirstPrivateArgInfoTy> FirstPrivateArgInfo;
1128   /// Host buffer for all arguments to be packed
1129   std::vector<char> FirstPrivateArgBuffer;
1130   /// The total size of all arguments to be packed
1131   int64_t FirstPrivateArgSize = 0;
1132 
1133   /// A reference to the \p DeviceTy object
1134   DeviceTy &Device;
1135   /// A pointer to a \p AsyncInfoTy object
1136   AsyncInfoTy &AsyncInfo;
1137 
1138   // TODO: What would be the best value here? Should we make it configurable?
1139   // If the size is larger than this threshold, we will allocate and transfer it
1140   // immediately instead of packing it.
1141   static constexpr const int64_t FirstPrivateArgSizeThreshold = 1024;
1142 
1143 public:
1144   /// Constructor
1145   PrivateArgumentManagerTy(DeviceTy &Dev, AsyncInfoTy &AsyncInfo)
1146       : Device(Dev), AsyncInfo(AsyncInfo) {}
1147 
1148   /// Add a private argument
1149   int addArg(void *HstPtr, int64_t ArgSize, int64_t ArgOffset,
1150              bool IsFirstPrivate, void *&TgtPtr, int TgtArgsIndex,
1151              const map_var_info_t HstPtrName = nullptr,
1152              const bool AllocImmediately = false) {
1153     // If the argument is not first-private, or its size is greater than a
1154     // predefined threshold, we will allocate memory and issue the transfer
1155     // immediately.
1156     if (ArgSize > FirstPrivateArgSizeThreshold || !IsFirstPrivate ||
1157         AllocImmediately) {
1158       TgtPtr = Device.allocData(ArgSize, HstPtr);
1159       if (!TgtPtr) {
1160         DP("Data allocation for %sprivate array " DPxMOD " failed.\n",
1161            (IsFirstPrivate ? "first-" : ""), DPxPTR(HstPtr));
1162         return OFFLOAD_FAIL;
1163       }
1164 #ifdef OMPTARGET_DEBUG
1165       void *TgtPtrBase = (void *)((intptr_t)TgtPtr + ArgOffset);
1166       DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD
1167          " for %sprivate array " DPxMOD " - pushing target argument " DPxMOD
1168          "\n",
1169          ArgSize, DPxPTR(TgtPtr), (IsFirstPrivate ? "first-" : ""),
1170          DPxPTR(HstPtr), DPxPTR(TgtPtrBase));
1171 #endif
1172       // If first-private, copy data from host
1173       if (IsFirstPrivate) {
1174         DP("Submitting firstprivate data to the device.\n");
1175         int Ret = Device.submitData(TgtPtr, HstPtr, ArgSize, AsyncInfo);
1176         if (Ret != OFFLOAD_SUCCESS) {
1177           DP("Copying data to device failed, failed.\n");
1178           return OFFLOAD_FAIL;
1179         }
1180       }
1181       TgtPtrs.push_back(TgtPtr);
1182     } else {
1183       DP("Firstprivate array " DPxMOD " of size %" PRId64 " will be packed\n",
1184          DPxPTR(HstPtr), ArgSize);
1185       // When reach this point, the argument must meet all following
1186       // requirements:
1187       // 1. Its size does not exceed the threshold (see the comment for
1188       // FirstPrivateArgSizeThreshold);
1189       // 2. It must be first-private (needs to be mapped to target device).
1190       // We will pack all this kind of arguments to transfer them all at once
1191       // to reduce the number of data transfer. We will not take
1192       // non-first-private arguments, aka. private arguments that doesn't need
1193       // to be mapped to target device, into account because data allocation
1194       // can be very efficient with memory manager.
1195 
1196       // Placeholder value
1197       TgtPtr = nullptr;
1198       FirstPrivateArgInfo.emplace_back(TgtArgsIndex, HstPtr, ArgSize,
1199                                        HstPtrName);
1200       FirstPrivateArgSize += FirstPrivateArgInfo.back().AlignedSize;
1201     }
1202 
1203     return OFFLOAD_SUCCESS;
1204   }
1205 
1206   /// Pack first-private arguments, replace place holder pointers in \p TgtArgs,
1207   /// and start the transfer.
1208   int packAndTransfer(std::vector<void *> &TgtArgs) {
1209     if (!FirstPrivateArgInfo.empty()) {
1210       assert(FirstPrivateArgSize != 0 &&
1211              "FirstPrivateArgSize is 0 but FirstPrivateArgInfo is empty");
1212       FirstPrivateArgBuffer.resize(FirstPrivateArgSize, 0);
1213       auto Itr = FirstPrivateArgBuffer.begin();
1214       // Copy all host data to this buffer
1215       for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) {
1216         std::copy(Info.HstPtrBegin, Info.HstPtrEnd, Itr);
1217         Itr = std::next(Itr, Info.AlignedSize);
1218       }
1219       // Allocate target memory
1220       void *TgtPtr =
1221           Device.allocData(FirstPrivateArgSize, FirstPrivateArgBuffer.data());
1222       if (TgtPtr == nullptr) {
1223         DP("Failed to allocate target memory for private arguments.\n");
1224         return OFFLOAD_FAIL;
1225       }
1226       TgtPtrs.push_back(TgtPtr);
1227       DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD "\n",
1228          FirstPrivateArgSize, DPxPTR(TgtPtr));
1229       // Transfer data to target device
1230       int Ret = Device.submitData(TgtPtr, FirstPrivateArgBuffer.data(),
1231                                   FirstPrivateArgSize, AsyncInfo);
1232       if (Ret != OFFLOAD_SUCCESS) {
1233         DP("Failed to submit data of private arguments.\n");
1234         return OFFLOAD_FAIL;
1235       }
1236       // Fill in all placeholder pointers
1237       auto TP = reinterpret_cast<uintptr_t>(TgtPtr);
1238       for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) {
1239         void *&Ptr = TgtArgs[Info.Index];
1240         assert(Ptr == nullptr && "Target pointer is already set by mistaken");
1241         Ptr = reinterpret_cast<void *>(TP);
1242         TP += Info.AlignedSize;
1243         DP("Firstprivate array " DPxMOD " of size %" PRId64 " mapped to " DPxMOD
1244            "\n",
1245            DPxPTR(Info.HstPtrBegin), Info.HstPtrEnd - Info.HstPtrBegin,
1246            DPxPTR(Ptr));
1247       }
1248     }
1249 
1250     return OFFLOAD_SUCCESS;
1251   }
1252 
1253   /// Free all target memory allocated for private arguments
1254   int free() {
1255     for (void *P : TgtPtrs) {
1256       int Ret = Device.deleteData(P);
1257       if (Ret != OFFLOAD_SUCCESS) {
1258         DP("Deallocation of (first-)private arrays failed.\n");
1259         return OFFLOAD_FAIL;
1260       }
1261     }
1262 
1263     TgtPtrs.clear();
1264 
1265     return OFFLOAD_SUCCESS;
1266   }
1267 };
1268 
1269 /// Process data before launching the kernel, including calling targetDataBegin
1270 /// to map and transfer data to target device, transferring (first-)private
1271 /// variables.
1272 static int processDataBefore(ident_t *loc, int64_t DeviceId, void *HostPtr,
1273                              int32_t ArgNum, void **ArgBases, void **Args,
1274                              int64_t *ArgSizes, int64_t *ArgTypes,
1275                              map_var_info_t *ArgNames, void **ArgMappers,
1276                              std::vector<void *> &TgtArgs,
1277                              std::vector<ptrdiff_t> &TgtOffsets,
1278                              PrivateArgumentManagerTy &PrivateArgumentManager,
1279                              AsyncInfoTy &AsyncInfo) {
1280   TIMESCOPE_WITH_NAME_AND_IDENT("mappingBeforeTargetRegion", loc);
1281   DeviceTy &Device = *PM->Devices[DeviceId];
1282   int Ret = targetDataBegin(loc, Device, ArgNum, ArgBases, Args, ArgSizes,
1283                             ArgTypes, ArgNames, ArgMappers, AsyncInfo);
1284   if (Ret != OFFLOAD_SUCCESS) {
1285     REPORT("Call to targetDataBegin failed, abort target.\n");
1286     return OFFLOAD_FAIL;
1287   }
1288 
1289   // List of (first-)private arrays allocated for this target region
1290   std::vector<int> TgtArgsPositions(ArgNum, -1);
1291 
1292   for (int32_t I = 0; I < ArgNum; ++I) {
1293     if (!(ArgTypes[I] & OMP_TGT_MAPTYPE_TARGET_PARAM)) {
1294       // This is not a target parameter, do not push it into TgtArgs.
1295       // Check for lambda mapping.
1296       if (isLambdaMapping(ArgTypes[I])) {
1297         assert((ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
1298                "PTR_AND_OBJ must be also MEMBER_OF.");
1299         unsigned Idx = getParentIndex(ArgTypes[I]);
1300         int TgtIdx = TgtArgsPositions[Idx];
1301         assert(TgtIdx != -1 && "Base address must be translated already.");
1302         // The parent lambda must be processed already and it must be the last
1303         // in TgtArgs and TgtOffsets arrays.
1304         void *HstPtrVal = Args[I];
1305         void *HstPtrBegin = ArgBases[I];
1306         void *HstPtrBase = Args[Idx];
1307         bool IsLast, IsHostPtr; // IsLast is unused.
1308         void *TgtPtrBase =
1309             (void *)((intptr_t)TgtArgs[TgtIdx] + TgtOffsets[TgtIdx]);
1310         DP("Parent lambda base " DPxMOD "\n", DPxPTR(TgtPtrBase));
1311         uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
1312         void *TgtPtrBegin = (void *)((uintptr_t)TgtPtrBase + Delta);
1313         void *&PointerTgtPtrBegin = AsyncInfo.getVoidPtrLocation();
1314         TargetPointerResultTy TPR = Device.getTgtPtrBegin(
1315             HstPtrVal, ArgSizes[I], IsLast, /*UpdateRefCount=*/false,
1316             /*UseHoldRefCount=*/false, IsHostPtr);
1317         PointerTgtPtrBegin = TPR.TargetPointer;
1318         if (!PointerTgtPtrBegin) {
1319           DP("No lambda captured variable mapped (" DPxMOD ") - ignored\n",
1320              DPxPTR(HstPtrVal));
1321           continue;
1322         }
1323         if (IsHostPtr) {
1324           DP("Unified memory is active, no need to map lambda captured"
1325              "variable (" DPxMOD ")\n",
1326              DPxPTR(HstPtrVal));
1327           continue;
1328         }
1329         DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n",
1330            DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));
1331         Ret = Device.submitData(TgtPtrBegin, &PointerTgtPtrBegin,
1332                                 sizeof(void *), AsyncInfo);
1333         if (Ret != OFFLOAD_SUCCESS) {
1334           REPORT("Copying data to device failed.\n");
1335           return OFFLOAD_FAIL;
1336         }
1337       }
1338       continue;
1339     }
1340     void *HstPtrBegin = Args[I];
1341     void *HstPtrBase = ArgBases[I];
1342     void *TgtPtrBegin;
1343     map_var_info_t HstPtrName = (!ArgNames) ? nullptr : ArgNames[I];
1344     ptrdiff_t TgtBaseOffset;
1345     bool IsLast, IsHostPtr; // unused.
1346     TargetPointerResultTy TPR;
1347     if (ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) {
1348       DP("Forwarding first-private value " DPxMOD " to the target construct\n",
1349          DPxPTR(HstPtrBase));
1350       TgtPtrBegin = HstPtrBase;
1351       TgtBaseOffset = 0;
1352     } else if (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE) {
1353       TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
1354       const bool IsFirstPrivate = (ArgTypes[I] & OMP_TGT_MAPTYPE_TO);
1355       // If there is a next argument and it depends on the current one, we need
1356       // to allocate the private memory immediately. If this is not the case,
1357       // then the argument can be marked for optimization and packed with the
1358       // other privates.
1359       const bool AllocImmediately =
1360           (I < ArgNum - 1 && (ArgTypes[I + 1] & OMP_TGT_MAPTYPE_MEMBER_OF));
1361       Ret = PrivateArgumentManager.addArg(
1362           HstPtrBegin, ArgSizes[I], TgtBaseOffset, IsFirstPrivate, TgtPtrBegin,
1363           TgtArgs.size(), HstPtrName, AllocImmediately);
1364       if (Ret != OFFLOAD_SUCCESS) {
1365         REPORT("Failed to process %sprivate argument " DPxMOD "\n",
1366                (IsFirstPrivate ? "first-" : ""), DPxPTR(HstPtrBegin));
1367         return OFFLOAD_FAIL;
1368       }
1369     } else {
1370       if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)
1371         HstPtrBase = *reinterpret_cast<void **>(HstPtrBase);
1372       TPR = Device.getTgtPtrBegin(HstPtrBegin, ArgSizes[I], IsLast,
1373                                   /*UpdateRefCount=*/false,
1374                                   /*UseHoldRefCount=*/false, IsHostPtr);
1375       TgtPtrBegin = TPR.TargetPointer;
1376       TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
1377 #ifdef OMPTARGET_DEBUG
1378       void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset);
1379       DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD "\n",
1380          DPxPTR(TgtPtrBase), DPxPTR(HstPtrBegin));
1381 #endif
1382     }
1383     TgtArgsPositions[I] = TgtArgs.size();
1384     TgtArgs.push_back(TgtPtrBegin);
1385     TgtOffsets.push_back(TgtBaseOffset);
1386   }
1387 
1388   assert(TgtArgs.size() == TgtOffsets.size() &&
1389          "Size mismatch in arguments and offsets");
1390 
1391   // Pack and transfer first-private arguments
1392   Ret = PrivateArgumentManager.packAndTransfer(TgtArgs);
1393   if (Ret != OFFLOAD_SUCCESS) {
1394     DP("Failed to pack and transfer first private arguments\n");
1395     return OFFLOAD_FAIL;
1396   }
1397 
1398   return OFFLOAD_SUCCESS;
1399 }
1400 
1401 /// Process data after launching the kernel, including transferring data back to
1402 /// host if needed and deallocating target memory of (first-)private variables.
1403 static int processDataAfter(ident_t *loc, int64_t DeviceId, void *HostPtr,
1404                             int32_t ArgNum, void **ArgBases, void **Args,
1405                             int64_t *ArgSizes, int64_t *ArgTypes,
1406                             map_var_info_t *ArgNames, void **ArgMappers,
1407                             PrivateArgumentManagerTy &PrivateArgumentManager,
1408                             AsyncInfoTy &AsyncInfo) {
1409   TIMESCOPE_WITH_NAME_AND_IDENT("mappingAfterTargetRegion", loc);
1410   DeviceTy &Device = *PM->Devices[DeviceId];
1411 
1412   // Move data from device.
1413   int Ret = targetDataEnd(loc, Device, ArgNum, ArgBases, Args, ArgSizes,
1414                           ArgTypes, ArgNames, ArgMappers, AsyncInfo);
1415   if (Ret != OFFLOAD_SUCCESS) {
1416     REPORT("Call to targetDataEnd failed, abort target.\n");
1417     return OFFLOAD_FAIL;
1418   }
1419 
1420   // Free target memory for private arguments
1421   Ret = PrivateArgumentManager.free();
1422   if (Ret != OFFLOAD_SUCCESS) {
1423     REPORT("Failed to deallocate target memory for private args\n");
1424     return OFFLOAD_FAIL;
1425   }
1426 
1427   return OFFLOAD_SUCCESS;
1428 }
1429 } // namespace
1430 
1431 /// performs the same actions as data_begin in case arg_num is
1432 /// non-zero and initiates run of the offloaded region on the target platform;
1433 /// if arg_num is non-zero after the region execution is done it also
1434 /// performs the same action as data_update and data_end above. This function
1435 /// returns 0 if it was able to transfer the execution to a target and an
1436 /// integer different from zero otherwise.
1437 int target(ident_t *loc, DeviceTy &Device, void *HostPtr, int32_t ArgNum,
1438            void **ArgBases, void **Args, int64_t *ArgSizes, int64_t *ArgTypes,
1439            map_var_info_t *ArgNames, void **ArgMappers, int32_t TeamNum,
1440            int32_t ThreadLimit, int IsTeamConstruct, AsyncInfoTy &AsyncInfo) {
1441   int32_t DeviceId = Device.DeviceID;
1442 
1443   TableMap *TM = getTableMap(HostPtr);
1444   // No map for this host pointer found!
1445   if (!TM) {
1446     REPORT("Host ptr " DPxMOD " does not have a matching target pointer.\n",
1447            DPxPTR(HostPtr));
1448     return OFFLOAD_FAIL;
1449   }
1450 
1451   // get target table.
1452   __tgt_target_table *TargetTable = nullptr;
1453   {
1454     std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
1455     assert(TM->Table->TargetsTable.size() > (size_t)DeviceId &&
1456            "Not expecting a device ID outside the table's bounds!");
1457     TargetTable = TM->Table->TargetsTable[DeviceId];
1458   }
1459   assert(TargetTable && "Global data has not been mapped\n");
1460 
1461   // We need to keep bases and offsets separate. Sometimes (e.g. in OpenCL) we
1462   // need to manifest base pointers prior to launching a kernel. Even if we have
1463   // mapped an object only partially, e.g. A[N:M], although the kernel is
1464   // expected to access elements starting at address &A[N] and beyond, we still
1465   // need to manifest the base of the array &A[0]. In other cases, e.g. the COI
1466   // API, we need the begin address itself, i.e. &A[N], as the API operates on
1467   // begin addresses, not bases. That's why we pass args and offsets as two
1468   // separate entities so that each plugin can do what it needs. This behavior
1469   // was introdued via https://reviews.llvm.org/D33028 and commit 1546d319244c.
1470   std::vector<void *> TgtArgs;
1471   std::vector<ptrdiff_t> TgtOffsets;
1472 
1473   PrivateArgumentManagerTy PrivateArgumentManager(Device, AsyncInfo);
1474 
1475   int Ret;
1476   if (ArgNum) {
1477     // Process data, such as data mapping, before launching the kernel
1478     Ret = processDataBefore(loc, DeviceId, HostPtr, ArgNum, ArgBases, Args,
1479                             ArgSizes, ArgTypes, ArgNames, ArgMappers, TgtArgs,
1480                             TgtOffsets, PrivateArgumentManager, AsyncInfo);
1481     if (Ret != OFFLOAD_SUCCESS) {
1482       REPORT("Failed to process data before launching the kernel.\n");
1483       return OFFLOAD_FAIL;
1484     }
1485   }
1486 
1487   // Launch device execution.
1488   void *TgtEntryPtr = TargetTable->EntriesBegin[TM->Index].addr;
1489   DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n",
1490      TargetTable->EntriesBegin[TM->Index].name, DPxPTR(TgtEntryPtr), TM->Index);
1491 
1492   {
1493     TIMESCOPE_WITH_NAME_AND_IDENT(
1494         IsTeamConstruct ? "runTargetTeamRegion" : "runTargetRegion", loc);
1495     if (IsTeamConstruct)
1496       Ret = Device.runTeamRegion(TgtEntryPtr, &TgtArgs[0], &TgtOffsets[0],
1497                                  TgtArgs.size(), TeamNum, ThreadLimit,
1498                                  getLoopTripCount(DeviceId), AsyncInfo);
1499     else
1500       Ret = Device.runRegion(TgtEntryPtr, &TgtArgs[0], &TgtOffsets[0],
1501                              TgtArgs.size(), AsyncInfo);
1502   }
1503 
1504   if (Ret != OFFLOAD_SUCCESS) {
1505     REPORT("Executing target region abort target.\n");
1506     return OFFLOAD_FAIL;
1507   }
1508 
1509   if (ArgNum) {
1510     // Transfer data back and deallocate target memory for (first-)private
1511     // variables
1512     Ret = processDataAfter(loc, DeviceId, HostPtr, ArgNum, ArgBases, Args,
1513                            ArgSizes, ArgTypes, ArgNames, ArgMappers,
1514                            PrivateArgumentManager, AsyncInfo);
1515     if (Ret != OFFLOAD_SUCCESS) {
1516       REPORT("Failed to process data after launching the kernel.\n");
1517       return OFFLOAD_FAIL;
1518     }
1519   }
1520 
1521   return OFFLOAD_SUCCESS;
1522 }
1523