//===------ omptarget.cpp - Target independent OpenMP target RTL -- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Implementation of the interface to be used by Clang during the codegen of a
// target region.
//
//===----------------------------------------------------------------------===//

#include "omptarget.h"
#include "device.h"
#include "private.h"
#include "rtl.h"

#include <cassert>
#include <vector>

int AsyncInfoTy::synchronize() {
  int Result = OFFLOAD_SUCCESS;
  if (AsyncInfo.Queue) {
    // If we have a queue we need to synchronize it now.
    Result = Device.synchronize(*this);
    assert(AsyncInfo.Queue == nullptr &&
           "The device plugin should have nulled the queue to indicate there "
           "are no outstanding actions!");
  }
  return Result;
}

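/// Returns a reference to a freshly added void * slot that stays valid for
/// the lifetime of this AsyncInfoTy. Note: this assumes BufferLocations
/// provides reference stability across push_back (e.g., a std::deque); with
/// a std::vector the reference returned here could dangle after growth.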
void *&AsyncInfoTy::getVoidPtrLocation() {
  BufferLocations.push_back(nullptr);
  return BufferLocations.back();
}

/* All begin addresses for partially mapped structs must be 8-aligned in order
 * to ensure proper alignment of members. E.g.:
 *
 * struct S {
 *   int a;   // 4-aligned
 *   int b;   // 4-aligned
 *   int *p;  // 8-aligned
 * } s1;
 * ...
 * #pragma omp target map(tofrom: s1.b, s1.p[0:N])
 * {
 *   s1.b = 5;
 *   for (int i...) s1.p[i] = ...;
 * }
 *
 * Here we are mapping s1 starting from member b, so BaseAddress=&s1=&s1.a and
 * BeginAddress=&s1.b. Let's assume that the struct begins at address 0x100,
 * then &s1.a=0x100, &s1.b=0x104, &s1.p=0x108. Each member obeys the alignment
 * requirements for its type. Now, when we allocate memory on the device, in
 * CUDA's case cuMemAlloc() returns an address which is at least 256-aligned.
 * This means that the chunk of the struct on the device will start at a
 * 256-aligned address, let's say 0x200. Then the address of b will be 0x200
 * and the address of p will be a misaligned 0x204 (on the host there was no
 * need to add padding between b and p, so p comes exactly 4 bytes after b).
 * If the device kernel tries to access s1.p, a misaligned address error
 * occurs (as reported by the CUDA plugin). By aligning the begin address down
 * to a multiple of 8 and extending the size of the allocated chunk
 * accordingly, the chunk on the device will start at 0x200 with the padding
 * (4 bytes), then &s1.b=0x204 and &s1.p=0x208, as they should be to satisfy
 * the alignment requirements.
 */
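/* Illustration of the computation performed in targetDataBegin for the case
 * above (a worked example, not executed code): with HstPtrBegin == 0x104,
 *   padding = (int64_t)HstPtrBegin % Alignment; // 0x104 % 8 == 4
 *   HstPtrBegin = (char *)HstPtrBegin - padding; // region now starts at 0x100
 *   data_size += padding;                        // covers the extra 4 bytes
 */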
static const int64_t Alignment = 8;

/// Map global data and execute pending ctors
static int InitLibrary(DeviceTy &Device) {
  /*
   * Map global data
   */
  int32_t device_id = Device.DeviceID;
  int rc = OFFLOAD_SUCCESS;
  bool supportsEmptyImages = Device.RTL->supports_empty_images &&
                             Device.RTL->supports_empty_images() > 0;

  Device.PendingGlobalsMtx.lock();
  PM->TrlTblMtx.lock();
  for (auto *HostEntriesBegin : PM->HostEntriesBeginRegistrationOrder) {
    TranslationTable *TransTable =
        &PM->HostEntriesBeginToTransTable[HostEntriesBegin];
    if (TransTable->HostTable.EntriesBegin ==
            TransTable->HostTable.EntriesEnd &&
        !supportsEmptyImages) {
      // No host entry so no need to proceed
      continue;
    }

    if (TransTable->TargetsTable[device_id] != 0) {
      // Library entries have already been processed
      continue;
    }

    // 1) get image.
    assert(TransTable->TargetsImages.size() > (size_t)device_id &&
           "Not expecting a device ID outside the table's bounds!");
    __tgt_device_image *img = TransTable->TargetsImages[device_id];
    if (!img) {
      REPORT("No image loaded for device id %d.\n", device_id);
      rc = OFFLOAD_FAIL;
      break;
    }
    // 2) load image into the target table.
    __tgt_target_table *TargetTable = TransTable->TargetsTable[device_id] =
        Device.load_binary(img);
    // Unable to get table for this image: invalidate image and fail.
    if (!TargetTable) {
      REPORT("Unable to generate entries table for device id %d.\n", device_id);
      TransTable->TargetsImages[device_id] = 0;
      rc = OFFLOAD_FAIL;
      break;
    }

    // Verify whether the two table sizes match.
    size_t hsize =
        TransTable->HostTable.EntriesEnd - TransTable->HostTable.EntriesBegin;
    size_t tsize = TargetTable->EntriesEnd - TargetTable->EntriesBegin;

    // Invalid image for these host entries!
    if (hsize != tsize) {
      REPORT("Host and Target tables mismatch for device id %d [%zx != %zx].\n",
             device_id, hsize, tsize);
      TransTable->TargetsImages[device_id] = 0;
      TransTable->TargetsTable[device_id] = 0;
      rc = OFFLOAD_FAIL;
      break;
    }

    // process global data that needs to be mapped.
    Device.DataMapMtx.lock();
    __tgt_target_table *HostTable = &TransTable->HostTable;
    for (__tgt_offload_entry *CurrDeviceEntry = TargetTable->EntriesBegin,
                             *CurrHostEntry = HostTable->EntriesBegin,
                             *EntryDeviceEnd = TargetTable->EntriesEnd;
         CurrDeviceEntry != EntryDeviceEnd;
         CurrDeviceEntry++, CurrHostEntry++) {
      if (CurrDeviceEntry->size != 0) {
        // has data.
        assert(CurrDeviceEntry->size == CurrHostEntry->size &&
               "data size mismatch");

        // Fortran may use multiple weak declarations for the same symbol,
        // therefore we must allow for multiple weak symbols to be loaded from
        // the fat binary. Treat these mappings as any other "regular" mapping.
        // Add entry to map.
        if (Device.getTgtPtrBegin(CurrHostEntry->addr, CurrHostEntry->size))
          continue;
        DP("Add mapping from host " DPxMOD " to device " DPxMOD " with size %zu"
           "\n",
           DPxPTR(CurrHostEntry->addr), DPxPTR(CurrDeviceEntry->addr),
           CurrDeviceEntry->size);
        Device.HostDataToTargetMap.emplace(
            (uintptr_t)CurrHostEntry->addr /*HstPtrBase*/,
            (uintptr_t)CurrHostEntry->addr /*HstPtrBegin*/,
            (uintptr_t)CurrHostEntry->addr + CurrHostEntry->size /*HstPtrEnd*/,
            (uintptr_t)CurrDeviceEntry->addr /*TgtPtrBegin*/,
            false /*UseHoldRefCount*/, nullptr /*Name*/,
            true /*IsRefCountINF*/);
      }
    }
    Device.DataMapMtx.unlock();
  }
  PM->TrlTblMtx.unlock();

  if (rc != OFFLOAD_SUCCESS) {
    Device.PendingGlobalsMtx.unlock();
    return rc;
  }

  /*
   * Run ctors for static objects
   */
  if (!Device.PendingCtorsDtors.empty()) {
    AsyncInfoTy AsyncInfo(Device);
    // Call all ctors for all libraries registered so far
    for (auto &lib : Device.PendingCtorsDtors) {
      if (!lib.second.PendingCtors.empty()) {
        DP("Has pending ctors... call now\n");
        for (auto &entry : lib.second.PendingCtors) {
          void *ctor = entry;
          int rc =
              target(nullptr, Device, ctor, 0, nullptr, nullptr, nullptr,
                     nullptr, nullptr, nullptr, 1, 1, true /*team*/, AsyncInfo);
          if (rc != OFFLOAD_SUCCESS) {
            REPORT("Running ctor " DPxMOD " failed.\n", DPxPTR(ctor));
            Device.PendingGlobalsMtx.unlock();
            return OFFLOAD_FAIL;
          }
        }
        // Clear the list to indicate that this device has been used
        lib.second.PendingCtors.clear();
        DP("Done with pending ctors for lib " DPxMOD "\n", DPxPTR(lib.first));
      }
    }
    // All constructors have been issued, wait for them now.
    if (AsyncInfo.synchronize() != OFFLOAD_SUCCESS)
      return OFFLOAD_FAIL;
  }
  Device.HasPendingGlobals = false;
  Device.PendingGlobalsMtx.unlock();

  return OFFLOAD_SUCCESS;
}

void handleTargetOutcome(bool Success, ident_t *Loc) {
  switch (PM->TargetOffloadPolicy) {
  case tgt_disabled:
    if (Success) {
      FATAL_MESSAGE0(1, "expected no offloading while offloading is disabled");
    }
    break;
  case tgt_default:
    FATAL_MESSAGE0(1, "default offloading policy must be switched to "
                      "mandatory or disabled");
    break;
  case tgt_mandatory:
    if (!Success) {
      if (getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE)
        for (auto &Device : PM->Devices)
          dumpTargetPointerMappings(Loc, *Device);
      else
        FAILURE_MESSAGE("Run with LIBOMPTARGET_INFO=%d to dump host-target "
                        "pointer mappings.\n",
                        OMP_INFOTYPE_DUMP_TABLE);

      SourceInfo info(Loc);
      if (info.isAvailible())
        fprintf(stderr, "%s:%d:%d: ", info.getFilename(), info.getLine(),
                info.getColumn());
      else
        FAILURE_MESSAGE("Source location information not present. Compile with "
                        "-g or -gline-tables-only.\n");
      FATAL_MESSAGE0(
          1, "failure of target construct while offloading is mandatory");
    } else {
      if (getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE)
        for (auto &Device : PM->Devices)
          dumpTargetPointerMappings(Loc, *Device);
    }
    break;
  }
}

static void handleDefaultTargetOffload() {
  PM->TargetOffloadMtx.lock();
  if (PM->TargetOffloadPolicy == tgt_default) {
    if (omp_get_num_devices() > 0) {
      DP("Default TARGET OFFLOAD policy is now mandatory "
         "(devices were found)\n");
      PM->TargetOffloadPolicy = tgt_mandatory;
    } else {
      DP("Default TARGET OFFLOAD policy is now disabled "
         "(no devices were found)\n");
      PM->TargetOffloadPolicy = tgt_disabled;
    }
  }
  PM->TargetOffloadMtx.unlock();
}

static bool isOffloadDisabled() {
  if (PM->TargetOffloadPolicy == tgt_default)
    handleDefaultTargetOffload();
  return PM->TargetOffloadPolicy == tgt_disabled;
}

// If offload is enabled, ensure that device DeviceID has been initialized,
// global ctors have been executed, and global data has been mapped.
//
// The returned bool indicates whether the offload should instead be run on
// the host device. There are three possible results:
// - Return false if the target device is ready for offload.
// - Return true without reporting a runtime error if offload is
//   disabled, perhaps because the initial device was specified.
// - Report a runtime error and return true.
//
// If DeviceID == OFFLOAD_DEVICE_DEFAULT, set DeviceID to the default device.
// This step might be skipped if offload is disabled.
bool checkDeviceAndCtors(int64_t &DeviceID, ident_t *Loc) {
  if (isOffloadDisabled()) {
    DP("Offload is disabled\n");
    return true;
  }

  if (DeviceID == OFFLOAD_DEVICE_DEFAULT) {
    DeviceID = omp_get_default_device();
    DP("Use default device id %" PRId64 "\n", DeviceID);
  }

  // Proposed behavior for OpenMP 5.2 in OpenMP spec github issue 2669.
  if (omp_get_num_devices() == 0) {
    DP("omp_get_num_devices() == 0 but offload is mandatory\n");
    handleTargetOutcome(false, Loc);
    return true;
  }

  if (DeviceID == omp_get_initial_device()) {
    DP("Device is host (%" PRId64 "), returning as if offload is disabled\n",
       DeviceID);
    return true;
  }

  // Is device ready?
  if (!device_is_ready(DeviceID)) {
    REPORT("Device %" PRId64 " is not ready.\n", DeviceID);
    handleTargetOutcome(false, Loc);
    return true;
  }

  // Get device info.
  DeviceTy &Device = *PM->Devices[DeviceID];

  // Check whether global data has been mapped for this device
  Device.PendingGlobalsMtx.lock();
  bool hasPendingGlobals = Device.HasPendingGlobals;
  Device.PendingGlobalsMtx.unlock();
  if (hasPendingGlobals && InitLibrary(Device) != OFFLOAD_SUCCESS) {
    REPORT("Failed to init globals on device %" PRId64 "\n", DeviceID);
    handleTargetOutcome(false, Loc);
    return true;
  }

  return false;
}

static int32_t getParentIndex(int64_t type) {
  return ((type & OMP_TGT_MAPTYPE_MEMBER_OF) >> 48) - 1;
}
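// Example: OMP_TGT_MAPTYPE_MEMBER_OF masks the 16-bit field in bits 48-63 of
// the map type. The field is biased by one so that zero means "not a member
// of any struct": a raw field value of 1 denotes parent index 0, and a map
// type with no MEMBER_OF bits set yields -1 (no parent entry).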

void *targetAllocExplicit(size_t size, int device_num, int kind,
                          const char *name) {
  TIMESCOPE();
  DP("Call to %s for device %d requesting %zu bytes\n", name, device_num, size);

  if (size <= 0) {
    DP("Call to %s with non-positive length\n", name);
    return NULL;
  }

  void *rc = NULL;

  if (device_num == omp_get_initial_device()) {
    rc = malloc(size);
    DP("%s returns host ptr " DPxMOD "\n", name, DPxPTR(rc));
    return rc;
  }

  if (!device_is_ready(device_num)) {
    DP("%s returns NULL ptr\n", name);
    return NULL;
  }

  DeviceTy &Device = *PM->Devices[device_num];
  rc = Device.allocData(size, nullptr, kind);
  DP("%s returns device ptr " DPxMOD "\n", name, DPxPTR(rc));
  return rc;
}
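// For reference, the user-facing allocation entry points funnel into this
// helper; e.g., omp_target_alloc(Size, DeviceNum) is expected to end up here
// with kind == TARGET_ALLOC_DEFAULT and name == "omp_target_alloc".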

/// Call the user-defined mapper function followed by the appropriate
/// targetData* function (targetData{Begin,End,Update}).
int targetDataMapper(ident_t *loc, DeviceTy &Device, void *arg_base, void *arg,
                     int64_t arg_size, int64_t arg_type,
                     map_var_info_t arg_names, void *arg_mapper,
                     AsyncInfoTy &AsyncInfo,
                     TargetDataFuncPtrTy target_data_function) {
  TIMESCOPE_WITH_IDENT(loc);
  DP("Calling the mapper function " DPxMOD "\n", DPxPTR(arg_mapper));

  // The mapper function fills up Components.
  MapperComponentsTy MapperComponents;
  MapperFuncPtrTy MapperFuncPtr = (MapperFuncPtrTy)(arg_mapper);
  (*MapperFuncPtr)((void *)&MapperComponents, arg_base, arg, arg_size, arg_type,
                   arg_names);

  // Construct new arrays for args_base, args, arg_sizes and arg_types
  // using the information in MapperComponents and call the corresponding
  // targetData* function using these new arrays.
  std::vector<void *> MapperArgsBase(MapperComponents.Components.size());
  std::vector<void *> MapperArgs(MapperComponents.Components.size());
  std::vector<int64_t> MapperArgSizes(MapperComponents.Components.size());
  std::vector<int64_t> MapperArgTypes(MapperComponents.Components.size());
  std::vector<void *> MapperArgNames(MapperComponents.Components.size());

  for (unsigned I = 0, E = MapperComponents.Components.size(); I < E; ++I) {
    auto &C = MapperComponents.Components[I];
    MapperArgsBase[I] = C.Base;
    MapperArgs[I] = C.Begin;
    MapperArgSizes[I] = C.Size;
    MapperArgTypes[I] = C.Type;
    MapperArgNames[I] = C.Name;
  }

  int rc = target_data_function(loc, Device, MapperComponents.Components.size(),
                                MapperArgsBase.data(), MapperArgs.data(),
                                MapperArgSizes.data(), MapperArgTypes.data(),
                                MapperArgNames.data(), /*arg_mappers*/ nullptr,
                                AsyncInfo, /*FromMapper=*/true);

  return rc;
}

/// Internal function to do the mapping and transfer the data to the device
int targetDataBegin(ident_t *loc, DeviceTy &Device, int32_t arg_num,
                    void **args_base, void **args, int64_t *arg_sizes,
                    int64_t *arg_types, map_var_info_t *arg_names,
                    void **arg_mappers, AsyncInfoTy &AsyncInfo,
                    bool FromMapper) {
  // process each input.
  for (int32_t i = 0; i < arg_num; ++i) {
    // Ignore private variables and arrays - there is no mapping for them.
    if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) ||
        (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE))
      continue;

    if (arg_mappers && arg_mappers[i]) {
      // Instead of executing the regular path of targetDataBegin, call the
      // targetDataMapper variant which will call targetDataBegin again
      // with new arguments.
      DP("Calling targetDataMapper for the %dth argument\n", i);

      map_var_info_t arg_name = (!arg_names) ? nullptr : arg_names[i];
      int rc = targetDataMapper(loc, Device, args_base[i], args[i],
                                arg_sizes[i], arg_types[i], arg_name,
                                arg_mappers[i], AsyncInfo, targetDataBegin);

      if (rc != OFFLOAD_SUCCESS) {
        REPORT("Call to targetDataBegin via targetDataMapper for custom mapper"
               " failed.\n");
        return OFFLOAD_FAIL;
      }

      // Skip the rest of this function, continue to the next argument.
      continue;
    }

    void *HstPtrBegin = args[i];
    void *HstPtrBase = args_base[i];
    int64_t data_size = arg_sizes[i];
    map_var_info_t HstPtrName = (!arg_names) ? nullptr : arg_names[i];

    // Adjust for proper alignment if this is a combined entry (for structs).
    // Look at the next argument - if that is MEMBER_OF this one, then this one
    // is a combined entry.
    int64_t padding = 0;
    const int next_i = i + 1;
    if (getParentIndex(arg_types[i]) < 0 && next_i < arg_num &&
        getParentIndex(arg_types[next_i]) == i) {
      padding = (int64_t)HstPtrBegin % Alignment;
      if (padding) {
        DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD
           "\n",
           padding, DPxPTR(HstPtrBegin));
        HstPtrBegin = (char *)HstPtrBegin - padding;
        data_size += padding;
      }
    }

    // Address of pointer on the host and device, respectively.
    void *Pointer_HstPtrBegin, *PointerTgtPtrBegin;
    TargetPointerResultTy Pointer_TPR;
    bool IsHostPtr = false;
    bool IsImplicit = arg_types[i] & OMP_TGT_MAPTYPE_IMPLICIT;
    // Force the creation of a device side copy of the data when:
    // a close map modifier was associated with a map that contained a to.
    bool HasCloseModifier = arg_types[i] & OMP_TGT_MAPTYPE_CLOSE;
    bool HasPresentModifier = arg_types[i] & OMP_TGT_MAPTYPE_PRESENT;
    bool HasHoldModifier = arg_types[i] & OMP_TGT_MAPTYPE_OMPX_HOLD;
    // UpdateRef is based on MEMBER_OF instead of TARGET_PARAM because if we
    // have reached this point via __tgt_target_data_begin and not __tgt_target
    // then no argument is marked as TARGET_PARAM ("omp target data map" is not
    // associated with a target region, so there are no target parameters). This
    // may be considered a hack, we could revise the scheme in the future.
    bool UpdateRef =
        !(arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) && !(FromMapper && i == 0);
    if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) {
      DP("Has a pointer entry: \n");
      // Base is address of pointer.
      //
      // Usually, the pointer is already allocated by this time.  For example:
      //
      //   #pragma omp target map(s.p[0:N])
      //
      // The map entry for s comes first, and the PTR_AND_OBJ entry comes
      // afterward, so the pointer is already allocated by the time the
      // PTR_AND_OBJ entry is handled below, and PointerTgtPtrBegin is thus
      // non-null.  However, "declare target link" can produce a PTR_AND_OBJ
      // entry for a global that might not already be allocated by the time the
      // PTR_AND_OBJ entry is handled below, and so the allocation might fail
      // when HasPresentModifier.
      Pointer_TPR = Device.getTargetPointer(
          HstPtrBase, HstPtrBase, sizeof(void *), /*HstPtrName=*/nullptr,
          /*HasFlagTo=*/false, /*HasFlagAlways=*/false, IsImplicit, UpdateRef,
          HasCloseModifier, HasPresentModifier, HasHoldModifier, AsyncInfo);
      PointerTgtPtrBegin = Pointer_TPR.TargetPointer;
      IsHostPtr = Pointer_TPR.Flags.IsHostPointer;
      if (!PointerTgtPtrBegin) {
        REPORT("Call to getTargetPointer returned null pointer (%s).\n",
               HasPresentModifier ? "'present' map type modifier"
                                  : "device failure or illegal mapping");
        return OFFLOAD_FAIL;
      }
      DP("There are %zu bytes allocated at target address " DPxMOD " - is%s new"
         "\n",
         sizeof(void *), DPxPTR(PointerTgtPtrBegin),
         (Pointer_TPR.Flags.IsNewEntry ? "" : " not"));
      Pointer_HstPtrBegin = HstPtrBase;
      // modify current entry.
      HstPtrBase = *(void **)HstPtrBase;
      // No need to update pointee ref count for the first element of the
      // subelement that comes from mapper.
      UpdateRef =
          (!FromMapper || i != 0); // subsequently update ref count of pointee
    }

    const bool HasFlagTo = arg_types[i] & OMP_TGT_MAPTYPE_TO;
    const bool HasFlagAlways = arg_types[i] & OMP_TGT_MAPTYPE_ALWAYS;
    auto TPR = Device.getTargetPointer(HstPtrBegin, HstPtrBase, data_size,
                                       HstPtrName, HasFlagTo, HasFlagAlways,
                                       IsImplicit, UpdateRef, HasCloseModifier,
                                       HasPresentModifier, HasHoldModifier,
                                       AsyncInfo);
    void *TgtPtrBegin = TPR.TargetPointer;
    IsHostPtr = TPR.Flags.IsHostPointer;
    // If data_size==0, then the argument could be a zero-length pointer to
    // NULL, so getOrAlloc() returning NULL is not an error.
    if (!TgtPtrBegin && (data_size || HasPresentModifier)) {
      REPORT("Call to getTargetPointer returned null pointer (%s).\n",
             HasPresentModifier ? "'present' map type modifier"
                                : "device failure or illegal mapping");
      return OFFLOAD_FAIL;
    }
    DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
       " - is%s new\n",
       data_size, DPxPTR(TgtPtrBegin), (TPR.Flags.IsNewEntry ? "" : " not"));

    if (arg_types[i] & OMP_TGT_MAPTYPE_RETURN_PARAM) {
      uintptr_t Delta = (uintptr_t)HstPtrBegin - (uintptr_t)HstPtrBase;
      void *TgtPtrBase = (void *)((uintptr_t)TgtPtrBegin - Delta);
      DP("Returning device pointer " DPxMOD "\n", DPxPTR(TgtPtrBase));
      args_base[i] = TgtPtrBase;
    }

    if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ && !IsHostPtr) {
      // Check whether we need to update the pointer on the device
      bool UpdateDevPtr = false;

      uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
      void *ExpectedTgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta);

      Device.ShadowMtx.lock();
      auto Entry = Device.ShadowPtrMap.find(Pointer_HstPtrBegin);
      // If this pointer is not in the map we need to insert it. If the map
      // contains a stale entry, we need to update it (e.g. if the pointee was
      // deallocated and later on is reallocated at another device address). The
      // latter scenario is the subject of LIT test env/base_ptr_ref_count.c. An
      // entry is removed from ShadowPtrMap only when the PTR of a PTR_AND_OBJ
      // pair is deallocated, not when the OBJ is deallocated. In
      // env/base_ptr_ref_count.c the PTR is a global "declare target" pointer,
      // so it stays in the map for the lifetime of the application. When the
      // OBJ is deallocated and later on allocated again (at a different device
      // address), ShadowPtrMap still contains an entry for Pointer_HstPtrBegin
      // which is stale, pointing to the old ExpectedTgtPtrBase of the OBJ.
      if (Entry == Device.ShadowPtrMap.end() ||
          Entry->second.TgtPtrVal != ExpectedTgtPtrBase) {
        // create or update shadow pointers for this entry
        Device.ShadowPtrMap[Pointer_HstPtrBegin] = {
            HstPtrBase, PointerTgtPtrBegin, ExpectedTgtPtrBase};
        UpdateDevPtr = true;
      }

      if (UpdateDevPtr) {
        Pointer_TPR.MapTableEntry->lock();
        Device.ShadowMtx.unlock();

        DP("Update pointer (" DPxMOD ") -> [" DPxMOD "]\n",
           DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));

        void *&TgtPtrBase = AsyncInfo.getVoidPtrLocation();
        TgtPtrBase = ExpectedTgtPtrBase;

        int Ret = Device.submitData(PointerTgtPtrBegin, &TgtPtrBase,
                                    sizeof(void *), AsyncInfo);
        if (Ret != OFFLOAD_SUCCESS) {
          Pointer_TPR.MapTableEntry->unlock();
          REPORT("Copying data to device failed.\n");
          return OFFLOAD_FAIL;
        }
        void *Event = Pointer_TPR.MapTableEntry->getEvent();
        bool NeedNewEvent = Event == nullptr;
        if (NeedNewEvent && Device.createEvent(&Event) != OFFLOAD_SUCCESS) {
          Pointer_TPR.MapTableEntry->unlock();
          REPORT("Failed to create event.\n");
          return OFFLOAD_FAIL;
        }
        // We cannot assume the event is non-null because we do not know
        // whether the target supports events; if it does not, recordEvent
        // should still return success.
        Ret = Device.recordEvent(Event, AsyncInfo);
        if (Ret != OFFLOAD_SUCCESS) {
          Pointer_TPR.MapTableEntry->unlock();
          REPORT("Failed to set dependence on event " DPxMOD "\n",
                 DPxPTR(Event));
          return OFFLOAD_FAIL;
        }
        if (NeedNewEvent)
          Pointer_TPR.MapTableEntry->setEvent(Event);
        Pointer_TPR.MapTableEntry->unlock();
      } else
        Device.ShadowMtx.unlock();
    }
  }

  return OFFLOAD_SUCCESS;
}

namespace {
/// This structure contains the information needed to deallocate a target
/// pointer, i.e., the arguments for a call to \p DeviceTy::deallocTgtPtr.
struct DeallocTgtPtrInfo {
  /// Host pointer used to look up into the map table
  void *HstPtrBegin;
  /// Size of the data
  int64_t DataSize;
  /// Whether it has \p ompx_hold modifier
  bool HasHoldModifier;

  DeallocTgtPtrInfo(void *HstPtr, int64_t Size, bool HasHoldModifier)
      : HstPtrBegin(HstPtr), DataSize(Size), HasHoldModifier(HasHoldModifier) {}
};
} // namespace

/// Internal function to undo the mapping and retrieve the data from the device.
int targetDataEnd(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
                  void **ArgBases, void **Args, int64_t *ArgSizes,
                  int64_t *ArgTypes, map_var_info_t *ArgNames,
                  void **ArgMappers, AsyncInfoTy &AsyncInfo, bool FromMapper) {
  int Ret;
  std::vector<DeallocTgtPtrInfo> DeallocTgtPtrs;
  void *FromMapperBase = nullptr;
  // process each input.
  for (int32_t I = ArgNum - 1; I >= 0; --I) {
    // Ignore private variables and arrays - there is no mapping for them.
    // Also, ignore the use_device_ptr clause; it has no effect here.
    if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
        (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
      continue;

    if (ArgMappers && ArgMappers[I]) {
      // Instead of executing the regular path of targetDataEnd, call the
      // targetDataMapper variant which will call targetDataEnd again
      // with new arguments.
      DP("Calling targetDataMapper for the %dth argument\n", I);

      map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
      Ret = targetDataMapper(loc, Device, ArgBases[I], Args[I], ArgSizes[I],
                             ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
                             targetDataEnd);

      if (Ret != OFFLOAD_SUCCESS) {
        REPORT("Call to targetDataEnd via targetDataMapper for custom mapper"
               " failed.\n");
        return OFFLOAD_FAIL;
      }

      // Skip the rest of this function, continue to the next argument.
      continue;
    }

    void *HstPtrBegin = Args[I];
    int64_t DataSize = ArgSizes[I];
    // Adjust for proper alignment if this is a combined entry (for structs).
    // Look at the next argument - if that is MEMBER_OF this one, then this one
    // is a combined entry.
    const int NextI = I + 1;
    if (getParentIndex(ArgTypes[I]) < 0 && NextI < ArgNum &&
        getParentIndex(ArgTypes[NextI]) == I) {
      int64_t Padding = (int64_t)HstPtrBegin % Alignment;
      if (Padding) {
        DP("Using a Padding of %" PRId64 " bytes for begin address " DPxMOD
           "\n",
           Padding, DPxPTR(HstPtrBegin));
        HstPtrBegin = (char *)HstPtrBegin - Padding;
        DataSize += Padding;
      }
    }

    bool IsLast, IsHostPtr;
    bool IsImplicit = ArgTypes[I] & OMP_TGT_MAPTYPE_IMPLICIT;
    bool UpdateRef = (!(ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) ||
                      (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) &&
                     !(FromMapper && I == 0);
    bool ForceDelete = ArgTypes[I] & OMP_TGT_MAPTYPE_DELETE;
    bool HasPresentModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_PRESENT;
    bool HasHoldModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_OMPX_HOLD;

    // If PTR_AND_OBJ, HstPtrBegin is address of pointee
    void *TgtPtrBegin = Device.getTgtPtrBegin(
        HstPtrBegin, DataSize, IsLast, UpdateRef, HasHoldModifier, IsHostPtr,
        !IsImplicit, ForceDelete);
    if (!TgtPtrBegin && (DataSize || HasPresentModifier)) {
      DP("Mapping does not exist (%s)\n",
         (HasPresentModifier ? "'present' map type modifier" : "ignored"));
      if (HasPresentModifier) {
        // OpenMP 5.1, sec. 2.21.7.1 "map Clause", p. 350 L10-13:
        // "If a map clause appears on a target, target data, target enter data
        // or target exit data construct with a present map-type-modifier then
        // on entry to the region if the corresponding list item does not appear
        // in the device data environment then an error occurs and the program
        // terminates."
        //
        // This should be an error upon entering an "omp target exit data".  It
        // should not be an error upon exiting an "omp target data" or "omp
        // target".  For "omp target data", Clang thus doesn't include present
        // modifiers for end calls.  For "omp target", we have not found a valid
        // OpenMP program for which the error matters: it appears that, if a
        // program can guarantee that data is present at the beginning of an
        // "omp target" region so that there's no error there, that data is also
        // guaranteed to be present at the end.
        MESSAGE("device mapping required by 'present' map type modifier does "
                "not exist for host address " DPxMOD " (%" PRId64 " bytes)",
                DPxPTR(HstPtrBegin), DataSize);
        return OFFLOAD_FAIL;
      }
    } else {
      DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
         " - is%s last\n",
         DataSize, DPxPTR(TgtPtrBegin), (IsLast ? "" : " not"));
    }

    // OpenMP 5.1, sec. 2.21.7.1 "map Clause", p. 351 L14-16:
    // "If the map clause appears on a target, target data, or target exit data
    // construct and a corresponding list item of the original list item is not
    // present in the device data environment on exit from the region then the
    // list item is ignored."
    if (!TgtPtrBegin)
      continue;

    bool DelEntry = IsLast;

    // If this is the last element of a mapper's argument list (arguments come
    // in reverse order for end transfers), do not remove the partial entry;
    // the parent struct still exists.
    if ((ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
        !(ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) {
      DelEntry = false; // protect parent struct from being deallocated
    }

    if ((ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) || DelEntry) {
      // Move data back to the host
      if (ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) {
        bool Always = ArgTypes[I] & OMP_TGT_MAPTYPE_ALWAYS;
        if ((Always || IsLast) && !IsHostPtr) {
          DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
             DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
          Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, DataSize,
                                    AsyncInfo);
          if (Ret != OFFLOAD_SUCCESS) {
            REPORT("Copying data from device failed.\n");
            return OFFLOAD_FAIL;
          }
        }
      }
      if (DelEntry && FromMapper && I == 0) {
        DelEntry = false;
        FromMapperBase = HstPtrBegin;
      }

      // If we copied back to the host a struct/array containing pointers, we
      // need to restore the original host pointer values from their shadow
      // copies. If the struct is going to be deallocated, remove any remaining
      // shadow pointer entries for this struct.
      uintptr_t LB = (uintptr_t)HstPtrBegin;
      uintptr_t UB = (uintptr_t)HstPtrBegin + DataSize;
      Device.ShadowMtx.lock();
      for (ShadowPtrListTy::iterator Itr = Device.ShadowPtrMap.begin();
           Itr != Device.ShadowPtrMap.end();) {
        void **ShadowHstPtrAddr = (void **)Itr->first;

        // An STL map is sorted on its keys; use this property
        // to quickly determine when to break out of the loop.
        if ((uintptr_t)ShadowHstPtrAddr < LB) {
          ++Itr;
          continue;
        }
        if ((uintptr_t)ShadowHstPtrAddr >= UB)
          break;

        // If we copied the struct to the host, we need to restore the pointer.
        if (ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) {
          DP("Restoring original host pointer value " DPxMOD " for host "
             "pointer " DPxMOD "\n",
             DPxPTR(Itr->second.HstPtrVal), DPxPTR(ShadowHstPtrAddr));
          *ShadowHstPtrAddr = Itr->second.HstPtrVal;
        }
        // If the struct is to be deallocated, remove the shadow entry.
        if (DelEntry) {
          DP("Removing shadow pointer " DPxMOD "\n", DPxPTR(ShadowHstPtrAddr));
          Itr = Device.ShadowPtrMap.erase(Itr);
        } else {
          ++Itr;
        }
      }
      Device.ShadowMtx.unlock();

      // Add pointer to the buffer for later deallocation
      if (DelEntry && !IsHostPtr)
        DeallocTgtPtrs.emplace_back(HstPtrBegin, DataSize, HasHoldModifier);
    }
  }

  // TODO: We should not synchronize here but pass the AsyncInfo object to the
  //       allocate/deallocate device APIs.
  //
  // We need to synchronize before deallocating data.
  Ret = AsyncInfo.synchronize();
  if (Ret != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  // Deallocate target pointer
  for (DeallocTgtPtrInfo &Info : DeallocTgtPtrs) {
    if (FromMapperBase && FromMapperBase == Info.HstPtrBegin)
      continue;
    Ret = Device.deallocTgtPtr(Info.HstPtrBegin, Info.DataSize,
                               Info.HasHoldModifier);
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Deallocating data from device failed.\n");
      return OFFLOAD_FAIL;
    }
  }

  return OFFLOAD_SUCCESS;
}

static int targetDataContiguous(ident_t *loc, DeviceTy &Device, void *ArgsBase,
                                void *HstPtrBegin, int64_t ArgSize,
                                int64_t ArgType, AsyncInfoTy &AsyncInfo) {
  TIMESCOPE_WITH_IDENT(loc);
  bool IsLast, IsHostPtr;
  void *TgtPtrBegin = Device.getTgtPtrBegin(
      HstPtrBegin, ArgSize, IsLast, /*UpdateRefCount=*/false,
      /*UseHoldRefCount=*/false, IsHostPtr, /*MustContain=*/true);
  if (!TgtPtrBegin) {
    DP("hst data:" DPxMOD " not found, becomes a noop\n", DPxPTR(HstPtrBegin));
    if (ArgType & OMP_TGT_MAPTYPE_PRESENT) {
      MESSAGE("device mapping required by 'present' motion modifier does not "
              "exist for host address " DPxMOD " (%" PRId64 " bytes)",
              DPxPTR(HstPtrBegin), ArgSize);
      return OFFLOAD_FAIL;
    }
    return OFFLOAD_SUCCESS;
  }

  if (IsHostPtr) {
    DP("hst data:" DPxMOD " unified and shared, becomes a noop\n",
       DPxPTR(HstPtrBegin));
    return OFFLOAD_SUCCESS;
  }

  if (ArgType & OMP_TGT_MAPTYPE_FROM) {
    DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
       ArgSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
    int Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, ArgSize, AsyncInfo);
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Copying data from device failed.\n");
      return OFFLOAD_FAIL;
    }

    uintptr_t LB = (uintptr_t)HstPtrBegin;
    uintptr_t UB = (uintptr_t)HstPtrBegin + ArgSize;
    Device.ShadowMtx.lock();
    for (ShadowPtrListTy::iterator IT = Device.ShadowPtrMap.begin();
         IT != Device.ShadowPtrMap.end(); ++IT) {
      void **ShadowHstPtrAddr = (void **)IT->first;
      if ((uintptr_t)ShadowHstPtrAddr < LB)
        continue;
      if ((uintptr_t)ShadowHstPtrAddr >= UB)
        break;
      DP("Restoring original host pointer value " DPxMOD
         " for host pointer " DPxMOD "\n",
         DPxPTR(IT->second.HstPtrVal), DPxPTR(ShadowHstPtrAddr));
      *ShadowHstPtrAddr = IT->second.HstPtrVal;
    }
    Device.ShadowMtx.unlock();
  }

  if (ArgType & OMP_TGT_MAPTYPE_TO) {
    DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
       ArgSize, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
    int Ret = Device.submitData(TgtPtrBegin, HstPtrBegin, ArgSize, AsyncInfo);
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Copying data to device failed.\n");
      return OFFLOAD_FAIL;
    }

    uintptr_t LB = (uintptr_t)HstPtrBegin;
    uintptr_t UB = (uintptr_t)HstPtrBegin + ArgSize;
    Device.ShadowMtx.lock();
    for (ShadowPtrListTy::iterator IT = Device.ShadowPtrMap.begin();
         IT != Device.ShadowPtrMap.end(); ++IT) {
      void **ShadowHstPtrAddr = (void **)IT->first;
      if ((uintptr_t)ShadowHstPtrAddr < LB)
        continue;
      if ((uintptr_t)ShadowHstPtrAddr >= UB)
        break;
      DP("Restoring original target pointer value " DPxMOD " for target "
         "pointer " DPxMOD "\n",
         DPxPTR(IT->second.TgtPtrVal), DPxPTR(IT->second.TgtPtrAddr));
      Ret = Device.submitData(IT->second.TgtPtrAddr, &IT->second.TgtPtrVal,
                              sizeof(void *), AsyncInfo);
      if (Ret != OFFLOAD_SUCCESS) {
        REPORT("Copying data to device failed.\n");
        Device.ShadowMtx.unlock();
        return OFFLOAD_FAIL;
      }
    }
    Device.ShadowMtx.unlock();
  }
  return OFFLOAD_SUCCESS;
}

static int targetDataNonContiguous(ident_t *loc, DeviceTy &Device,
                                   void *ArgsBase,
                                   __tgt_target_non_contig *NonContig,
                                   uint64_t Size, int64_t ArgType,
                                   int CurrentDim, int DimSize, uint64_t Offset,
                                   AsyncInfoTy &AsyncInfo) {
  TIMESCOPE_WITH_IDENT(loc);
  int Ret = OFFLOAD_SUCCESS;
  if (CurrentDim < DimSize) {
    for (unsigned int I = 0; I < NonContig[CurrentDim].Count; ++I) {
      uint64_t CurOffset =
          (NonContig[CurrentDim].Offset + I) * NonContig[CurrentDim].Stride;
      // we only need to transfer the first element for the last dimension
      // since we've already got a contiguous piece.
      if (CurrentDim != DimSize - 1 || I == 0) {
        Ret = targetDataNonContiguous(loc, Device, ArgsBase, NonContig, Size,
                                      ArgType, CurrentDim + 1, DimSize,
                                      Offset + CurOffset, AsyncInfo);
        // Stop the whole process if any contiguous piece returns anything
        // other than OFFLOAD_SUCCESS.
        if (Ret != OFFLOAD_SUCCESS)
          return Ret;
      }
    }
  } else {
    char *Ptr = (char *)ArgsBase + Offset;
    DP("Transfer of non-contiguous : host ptr " DPxMOD " offset %" PRIu64
       " len %" PRIu64 "\n",
       DPxPTR(Ptr), Offset, Size);
    Ret = targetDataContiguous(loc, Device, ArgsBase, Ptr, Size, ArgType,
                               AsyncInfo);
  }
  return Ret;
}

static int getNonContigMergedDimension(__tgt_target_non_contig *NonContig,
                                       int32_t DimSize) {
  int RemovedDim = 0;
  for (int I = DimSize - 1; I > 0; --I) {
    if (NonContig[I].Count * NonContig[I].Stride == NonContig[I - 1].Stride)
      RemovedDim++;
  }
  return RemovedDim;
}
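// Example (descriptor arithmetic only): an inner dimension with Count == 8
// and Stride == 4 spans 8 * 4 == 32 units; if the enclosing dimension's
// Stride is also 32, consecutive inner rows are adjacent in memory, the
// equality above holds, and the two dimensions collapse into one contiguous
// transfer.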

/// Internal function to pass data to/from the target.
int targetDataUpdate(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
                     void **ArgsBase, void **Args, int64_t *ArgSizes,
                     int64_t *ArgTypes, map_var_info_t *ArgNames,
                     void **ArgMappers, AsyncInfoTy &AsyncInfo, bool) {
  // process each input.
  for (int32_t I = 0; I < ArgNum; ++I) {
    if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
        (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
      continue;

    if (ArgMappers && ArgMappers[I]) {
      // Instead of executing the regular path of targetDataUpdate, call the
      // targetDataMapper variant which will call targetDataUpdate again
      // with new arguments.
      DP("Calling targetDataMapper for the %dth argument\n", I);

      map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
      int Ret = targetDataMapper(loc, Device, ArgsBase[I], Args[I], ArgSizes[I],
                                 ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
                                 targetDataUpdate);

      if (Ret != OFFLOAD_SUCCESS) {
        REPORT("Call to targetDataUpdate via targetDataMapper for custom mapper"
               " failed.\n");
        return OFFLOAD_FAIL;
      }

      // Skip the rest of this function, continue to the next argument.
      continue;
    }

    int Ret = OFFLOAD_SUCCESS;

    if (ArgTypes[I] & OMP_TGT_MAPTYPE_NON_CONTIG) {
      __tgt_target_non_contig *NonContig = (__tgt_target_non_contig *)Args[I];
      int32_t DimSize = ArgSizes[I];
      uint64_t Size =
          NonContig[DimSize - 1].Count * NonContig[DimSize - 1].Stride;
      int32_t MergedDim = getNonContigMergedDimension(NonContig, DimSize);
      Ret = targetDataNonContiguous(
          loc, Device, ArgsBase[I], NonContig, Size, ArgTypes[I],
          /*current_dim=*/0, DimSize - MergedDim, /*offset=*/0, AsyncInfo);
    } else {
      Ret = targetDataContiguous(loc, Device, ArgsBase[I], Args[I], ArgSizes[I],
                                 ArgTypes[I], AsyncInfo);
    }
    if (Ret == OFFLOAD_FAIL)
      return OFFLOAD_FAIL;
  }
  return OFFLOAD_SUCCESS;
}

static const unsigned LambdaMapping = OMP_TGT_MAPTYPE_PTR_AND_OBJ |
                                      OMP_TGT_MAPTYPE_LITERAL |
                                      OMP_TGT_MAPTYPE_IMPLICIT;
static bool isLambdaMapping(int64_t Mapping) {
  return (Mapping & LambdaMapping) == LambdaMapping;
}
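// A mapping is treated as a lambda capture only when all three flags are set
// together; e.g., a plain PTR_AND_OBJ entry without LITERAL and IMPLICIT goes
// through the regular PTR_AND_OBJ handling in targetDataBegin instead.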

namespace {
/// Find the table information in the map or look it up in the translation
/// tables.
TableMap *getTableMap(void *HostPtr) {
  std::lock_guard<std::mutex> TblMapLock(PM->TblMapMtx);
  HostPtrToTableMapTy::iterator TableMapIt =
      PM->HostPtrToTableMap.find(HostPtr);

  if (TableMapIt != PM->HostPtrToTableMap.end())
    return &TableMapIt->second;

  // We don't have a map. So search all the registered libraries.
  TableMap *TM = nullptr;
  std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
  for (HostEntriesBeginToTransTableTy::iterator Itr =
           PM->HostEntriesBeginToTransTable.begin();
       Itr != PM->HostEntriesBeginToTransTable.end(); ++Itr) {
    // get the translation table (which contains all the good info).
    TranslationTable *TransTable = &Itr->second;
    // iterate over all the host table entries to see if we can locate the
    // host_ptr.
    __tgt_offload_entry *Cur = TransTable->HostTable.EntriesBegin;
    for (uint32_t I = 0; Cur < TransTable->HostTable.EntriesEnd; ++Cur, ++I) {
      if (Cur->addr != HostPtr)
        continue;
      // we got a match, now fill the HostPtrToTableMap so that we
      // may avoid this search next time.
      TM = &(PM->HostPtrToTableMap)[HostPtr];
      TM->Table = TransTable;
      TM->Index = I;
      return TM;
    }
  }

  return nullptr;
}

/// Get loop trip count
/// FIXME: This function will not work right if calling
/// __kmpc_push_target_tripcount_mapper in one thread but doing offloading in
/// another thread, which might occur when we call task yield.
uint64_t getLoopTripCount(int64_t DeviceId) {
  DeviceTy &Device = *PM->Devices[DeviceId];
  uint64_t LoopTripCount = 0;

  {
    std::lock_guard<std::mutex> TblMapLock(PM->TblMapMtx);
    auto I = Device.LoopTripCnt.find(__kmpc_global_thread_num(NULL));
    if (I != Device.LoopTripCnt.end()) {
      LoopTripCount = I->second;
      Device.LoopTripCnt.erase(I);
      DP("loop trip count is %" PRIu64 ".\n", LoopTripCount);
    }
  }

  return LoopTripCount;
}

/// A class that manages private arguments in a target region.
class PrivateArgumentManagerTy {
  /// A data structure for the information of first-private arguments. We can
  /// use this information to optimize data transfer by packing all
  /// first-private arguments and transfer them all at once.
  struct FirstPrivateArgInfoTy {
    /// The index of the element in \p TgtArgs corresponding to the argument
    const int Index;
    /// Host pointer begin
    const char *HstPtrBegin;
    /// Host pointer end
    const char *HstPtrEnd;
    /// Aligned size
    const int64_t AlignedSize;
    /// Host pointer name
    const map_var_info_t HstPtrName = nullptr;

    FirstPrivateArgInfoTy(int Index, const void *HstPtr, int64_t Size,
                          const map_var_info_t HstPtrName = nullptr)
        : Index(Index), HstPtrBegin(reinterpret_cast<const char *>(HstPtr)),
          HstPtrEnd(HstPtrBegin + Size), AlignedSize(Size + Size % Alignment),
          HstPtrName(HstPtrName) {}
  };

  /// A vector of target pointers for all private arguments
  std::vector<void *> TgtPtrs;

  /// A vector of information of all first-private arguments to be packed
  std::vector<FirstPrivateArgInfoTy> FirstPrivateArgInfo;
  /// Host buffer for all arguments to be packed
  std::vector<char> FirstPrivateArgBuffer;
  /// The total size of all arguments to be packed
  int64_t FirstPrivateArgSize = 0;

  /// A reference to the \p DeviceTy object
  DeviceTy &Device;
  /// A pointer to a \p AsyncInfoTy object
  AsyncInfoTy &AsyncInfo;

  // TODO: What would be the best value here? Should we make it configurable?
  // If the size is larger than this threshold, we will allocate and transfer it
  // immediately instead of packing it.
  static constexpr const int64_t FirstPrivateArgSizeThreshold = 1024;

public:
  /// Constructor
  PrivateArgumentManagerTy(DeviceTy &Dev, AsyncInfoTy &AsyncInfo)
      : Device(Dev), AsyncInfo(AsyncInfo) {}

  /// Add a private argument
  int addArg(void *HstPtr, int64_t ArgSize, int64_t ArgOffset,
             bool IsFirstPrivate, void *&TgtPtr, int TgtArgsIndex,
             const map_var_info_t HstPtrName = nullptr,
             const bool AllocImmediately = false) {
    // If the argument is not first-private, or its size is greater than a
    // predefined threshold, we will allocate memory and issue the transfer
    // immediately.
    if (ArgSize > FirstPrivateArgSizeThreshold || !IsFirstPrivate ||
        AllocImmediately) {
      TgtPtr = Device.allocData(ArgSize, HstPtr);
      if (!TgtPtr) {
        DP("Data allocation for %sprivate array " DPxMOD " failed.\n",
           (IsFirstPrivate ? "first-" : ""), DPxPTR(HstPtr));
        return OFFLOAD_FAIL;
      }
#ifdef OMPTARGET_DEBUG
      void *TgtPtrBase = (void *)((intptr_t)TgtPtr + ArgOffset);
      DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD
         " for %sprivate array " DPxMOD " - pushing target argument " DPxMOD
         "\n",
         ArgSize, DPxPTR(TgtPtr), (IsFirstPrivate ? "first-" : ""),
         DPxPTR(HstPtr), DPxPTR(TgtPtrBase));
#endif
      // If first-private, copy data from host
      if (IsFirstPrivate) {
        DP("Submitting firstprivate data to the device.\n");
        int Ret = Device.submitData(TgtPtr, HstPtr, ArgSize, AsyncInfo);
        if (Ret != OFFLOAD_SUCCESS) {
          DP("Copying data to device failed.\n");
          return OFFLOAD_FAIL;
        }
      }
      TgtPtrs.push_back(TgtPtr);
    } else {
      DP("Firstprivate array " DPxMOD " of size %" PRId64 " will be packed\n",
         DPxPTR(HstPtr), ArgSize);
      // When we reach this point, the argument must meet both of the following
      // requirements:
      // 1. Its size does not exceed the threshold (see the comment for
      // FirstPrivateArgSizeThreshold);
      // 2. It must be first-private (i.e., it needs to be mapped to the target
      // device).
      // We pack all such arguments to transfer them all at once and thereby
      // reduce the number of data transfers. We do not consider
      // non-first-private arguments, i.e., private arguments that do not need
      // to be mapped to the target device, because data allocation can be very
      // efficient with the memory manager.

      // Placeholder value
      TgtPtr = nullptr;
      FirstPrivateArgInfo.emplace_back(TgtArgsIndex, HstPtr, ArgSize,
                                       HstPtrName);
      FirstPrivateArgSize += FirstPrivateArgInfo.back().AlignedSize;
    }

    return OFFLOAD_SUCCESS;
  }

  /// Pack first-private arguments, replace placeholder pointers in \p TgtArgs,
  /// and start the transfer.
  int packAndTransfer(std::vector<void *> &TgtArgs) {
    if (!FirstPrivateArgInfo.empty()) {
      assert(FirstPrivateArgSize != 0 &&
             "FirstPrivateArgSize is 0 but FirstPrivateArgInfo is empty");
      FirstPrivateArgBuffer.resize(FirstPrivateArgSize, 0);
      auto Itr = FirstPrivateArgBuffer.begin();
      // Copy all host data to this buffer
      for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) {
        std::copy(Info.HstPtrBegin, Info.HstPtrEnd, Itr);
        Itr = std::next(Itr, Info.AlignedSize);
      }
      // Allocate target memory
      void *TgtPtr =
          Device.allocData(FirstPrivateArgSize, FirstPrivateArgBuffer.data());
      if (TgtPtr == nullptr) {
        DP("Failed to allocate target memory for private arguments.\n");
        return OFFLOAD_FAIL;
      }
      TgtPtrs.push_back(TgtPtr);
      DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD "\n",
         FirstPrivateArgSize, DPxPTR(TgtPtr));
      // Transfer data to target device
      int Ret = Device.submitData(TgtPtr, FirstPrivateArgBuffer.data(),
                                  FirstPrivateArgSize, AsyncInfo);
      if (Ret != OFFLOAD_SUCCESS) {
        DP("Failed to submit data of private arguments.\n");
        return OFFLOAD_FAIL;
      }
      // Fill in all placeholder pointers
      auto TP = reinterpret_cast<uintptr_t>(TgtPtr);
      for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) {
        void *&Ptr = TgtArgs[Info.Index];
        assert(Ptr == nullptr && "Target pointer is already set by mistake");
        Ptr = reinterpret_cast<void *>(TP);
        TP += Info.AlignedSize;
        DP("Firstprivate array " DPxMOD " of size %" PRId64 " mapped to " DPxMOD
           "\n",
           DPxPTR(Info.HstPtrBegin), Info.HstPtrEnd - Info.HstPtrBegin,
           DPxPTR(Ptr));
      }
    }

    return OFFLOAD_SUCCESS;
  }

  /// Free all target memory allocated for private arguments
  int free() {
    for (void *P : TgtPtrs) {
      int Ret = Device.deleteData(P);
      if (Ret != OFFLOAD_SUCCESS) {
        DP("Deallocation of (first-)private arrays failed.\n");
        return OFFLOAD_FAIL;
      }
    }

    TgtPtrs.clear();

    return OFFLOAD_SUCCESS;
  }
};
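// Typical usage (a sketch of the flow in this file): processDataBefore below
// constructs a PrivateArgumentManagerTy, calls addArg for each private
// argument while building TgtArgs, then calls packAndTransfer once all target
// arguments are collected; after the kernel completes, the manager's free()
// releases the private allocations.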
1256 
1257 /// Process data before launching the kernel, including calling targetDataBegin
1258 /// to map and transfer data to target device, transferring (first-)private
1259 /// variables.
1260 static int processDataBefore(ident_t *loc, int64_t DeviceId, void *HostPtr,
1261                              int32_t ArgNum, void **ArgBases, void **Args,
1262                              int64_t *ArgSizes, int64_t *ArgTypes,
1263                              map_var_info_t *ArgNames, void **ArgMappers,
1264                              std::vector<void *> &TgtArgs,
1265                              std::vector<ptrdiff_t> &TgtOffsets,
1266                              PrivateArgumentManagerTy &PrivateArgumentManager,
1267                              AsyncInfoTy &AsyncInfo) {
1268   TIMESCOPE_WITH_NAME_AND_IDENT("mappingBeforeTargetRegion", loc);
1269   DeviceTy &Device = *PM->Devices[DeviceId];
1270   int Ret = targetDataBegin(loc, Device, ArgNum, ArgBases, Args, ArgSizes,
1271                             ArgTypes, ArgNames, ArgMappers, AsyncInfo);
1272   if (Ret != OFFLOAD_SUCCESS) {
1273     REPORT("Call to targetDataBegin failed, abort target.\n");
1274     return OFFLOAD_FAIL;
1275   }
1276 
1277   // List of (first-)private arrays allocated for this target region
1278   std::vector<int> TgtArgsPositions(ArgNum, -1);
1279 
1280   for (int32_t I = 0; I < ArgNum; ++I) {
1281     if (!(ArgTypes[I] & OMP_TGT_MAPTYPE_TARGET_PARAM)) {
1282       // This is not a target parameter, do not push it into TgtArgs.
1283       // Check for lambda mapping.
1284       if (isLambdaMapping(ArgTypes[I])) {
        assert((ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
               "PTR_AND_OBJ must also be MEMBER_OF.");
1287         unsigned Idx = getParentIndex(ArgTypes[I]);
1288         int TgtIdx = TgtArgsPositions[Idx];
1289         assert(TgtIdx != -1 && "Base address must be translated already.");
        // The parent lambda must have been processed already; its position in
        // the TgtArgs and TgtOffsets arrays was recorded in TgtArgsPositions.
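        // Illustrative case (not from the code): the parent entry maps the
        // lambda object itself; each PTR_AND_OBJ | MEMBER_OF entry names a
        // pointer captured by the lambda, and below the corresponding field
        // inside the device copy of the lambda is patched with the device
        // address of the pointee.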
1292         void *HstPtrVal = Args[I];
1293         void *HstPtrBegin = ArgBases[I];
1294         void *HstPtrBase = Args[Idx];
1295         bool IsLast, IsHostPtr; // IsLast is unused.
1296         void *TgtPtrBase =
1297             (void *)((intptr_t)TgtArgs[TgtIdx] + TgtOffsets[TgtIdx]);
1298         DP("Parent lambda base " DPxMOD "\n", DPxPTR(TgtPtrBase));
1299         uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
1300         void *TgtPtrBegin = (void *)((uintptr_t)TgtPtrBase + Delta);
1301         void *&PointerTgtPtrBegin = AsyncInfo.getVoidPtrLocation();
1302         PointerTgtPtrBegin = Device.getTgtPtrBegin(
1303             HstPtrVal, ArgSizes[I], IsLast, /*UpdateRefCount=*/false,
1304             /*UseHoldRefCount=*/false, IsHostPtr);
1305         if (!PointerTgtPtrBegin) {
1306           DP("No lambda captured variable mapped (" DPxMOD ") - ignored\n",
1307              DPxPTR(HstPtrVal));
1308           continue;
1309         }
1310         if (IsHostPtr) {
1311           DP("Unified memory is active, no need to map lambda captured"
1312              "variable (" DPxMOD ")\n",
1313              DPxPTR(HstPtrVal));
1314           continue;
1315         }
1316         DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n",
1317            DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));
1318         Ret = Device.submitData(TgtPtrBegin, &PointerTgtPtrBegin,
1319                                 sizeof(void *), AsyncInfo);
1320         if (Ret != OFFLOAD_SUCCESS) {
1321           REPORT("Copying data to device failed.\n");
1322           return OFFLOAD_FAIL;
1323         }
1324       }
1325       continue;
1326     }
1327     void *HstPtrBegin = Args[I];
1328     void *HstPtrBase = ArgBases[I];
1329     void *TgtPtrBegin;
1330     map_var_info_t HstPtrName = (!ArgNames) ? nullptr : ArgNames[I];
1331     ptrdiff_t TgtBaseOffset;
    bool IsLast, IsHostPtr; // Unused; only needed to call getTgtPtrBegin.
1333     if (ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) {
1334       DP("Forwarding first-private value " DPxMOD " to the target construct\n",
1335          DPxPTR(HstPtrBase));
1336       TgtPtrBegin = HstPtrBase;
1337       TgtBaseOffset = 0;
1338     } else if (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE) {
1339       TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
1340       const bool IsFirstPrivate = (ArgTypes[I] & OMP_TGT_MAPTYPE_TO);
1341       // If there is a next argument and it depends on the current one, we need
1342       // to allocate the private memory immediately. If this is not the case,
1343       // then the argument can be marked for optimization and packed with the
1344       // other privates.
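      // Illustrative case (not from the code): if the next entry maps a
      // member of this private struct, that entry carries MEMBER_OF and needs
      // this argument's device address, so the allocation cannot be deferred
      // to the packed transfer.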
1345       const bool AllocImmediately =
1346           (I < ArgNum - 1 && (ArgTypes[I + 1] & OMP_TGT_MAPTYPE_MEMBER_OF));
1347       Ret = PrivateArgumentManager.addArg(
1348           HstPtrBegin, ArgSizes[I], TgtBaseOffset, IsFirstPrivate, TgtPtrBegin,
1349           TgtArgs.size(), HstPtrName, AllocImmediately);
1350       if (Ret != OFFLOAD_SUCCESS) {
1351         REPORT("Failed to process %sprivate argument " DPxMOD "\n",
1352                (IsFirstPrivate ? "first-" : ""), DPxPTR(HstPtrBegin));
1353         return OFFLOAD_FAIL;
1354       }
1355     } else {
1356       if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)
1357         HstPtrBase = *reinterpret_cast<void **>(HstPtrBase);
1358       TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, ArgSizes[I], IsLast,
1359                                           /*UpdateRefCount=*/false,
1360                                           /*UseHoldRefCount=*/false, IsHostPtr);
1361       TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
1362 #ifdef OMPTARGET_DEBUG
1363       void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset);
1364       DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD "\n",
1365          DPxPTR(TgtPtrBase), DPxPTR(HstPtrBegin));
1366 #endif
1367     }
1368     TgtArgsPositions[I] = TgtArgs.size();
1369     TgtArgs.push_back(TgtPtrBegin);
1370     TgtOffsets.push_back(TgtBaseOffset);
1371   }
1372 
1373   assert(TgtArgs.size() == TgtOffsets.size() &&
1374          "Size mismatch in arguments and offsets");
1375 
1376   // Pack and transfer first-private arguments
1377   Ret = PrivateArgumentManager.packAndTransfer(TgtArgs);
1378   if (Ret != OFFLOAD_SUCCESS) {
1379     DP("Failed to pack and transfer first private arguments\n");
1380     return OFFLOAD_FAIL;
1381   }
1382 
1383   return OFFLOAD_SUCCESS;
1384 }
1385 
/// Process data after launching the kernel, including transferring data back
/// to the host if needed and deallocating target memory of (first-)private
/// variables.
1388 static int processDataAfter(ident_t *loc, int64_t DeviceId, void *HostPtr,
1389                             int32_t ArgNum, void **ArgBases, void **Args,
1390                             int64_t *ArgSizes, int64_t *ArgTypes,
1391                             map_var_info_t *ArgNames, void **ArgMappers,
1392                             PrivateArgumentManagerTy &PrivateArgumentManager,
1393                             AsyncInfoTy &AsyncInfo) {
1394   TIMESCOPE_WITH_NAME_AND_IDENT("mappingAfterTargetRegion", loc);
1395   DeviceTy &Device = *PM->Devices[DeviceId];
1396 
1397   // Move data from device.
1398   int Ret = targetDataEnd(loc, Device, ArgNum, ArgBases, Args, ArgSizes,
1399                           ArgTypes, ArgNames, ArgMappers, AsyncInfo);
1400   if (Ret != OFFLOAD_SUCCESS) {
1401     REPORT("Call to targetDataEnd failed, abort target.\n");
1402     return OFFLOAD_FAIL;
1403   }
1404 
1405   // Free target memory for private arguments
1406   Ret = PrivateArgumentManager.free();
1407   if (Ret != OFFLOAD_SUCCESS) {
1408     REPORT("Failed to deallocate target memory for private args\n");
1409     return OFFLOAD_FAIL;
1410   }
1411 
1412   return OFFLOAD_SUCCESS;
1413 }
1414 } // namespace
1415 
/// Performs the same actions as data_begin above in case ArgNum is non-zero
/// and initiates the run of the offloaded region on the target platform; if
/// ArgNum is non-zero, then after the region's execution is done it also
/// performs the same actions as data_update and data_end above. Returns zero
/// if it was able to transfer the execution to the target and a non-zero
/// integer otherwise.
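///
/// For example (illustrative only), a construct such as
///   #pragma omp target teams map(tofrom: A[0:N])
/// reaches this function with IsTeamConstruct != 0 and a TARGET_PARAM entry
/// describing A, and the region is launched via Device.runTeamRegion below.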
1422 int target(ident_t *loc, DeviceTy &Device, void *HostPtr, int32_t ArgNum,
1423            void **ArgBases, void **Args, int64_t *ArgSizes, int64_t *ArgTypes,
1424            map_var_info_t *ArgNames, void **ArgMappers, int32_t TeamNum,
1425            int32_t ThreadLimit, int IsTeamConstruct, AsyncInfoTy &AsyncInfo) {
1426   int32_t DeviceId = Device.DeviceID;
1427 
1428   TableMap *TM = getTableMap(HostPtr);
1429   // No map for this host pointer found!
1430   if (!TM) {
1431     REPORT("Host ptr " DPxMOD " does not have a matching target pointer.\n",
1432            DPxPTR(HostPtr));
1433     return OFFLOAD_FAIL;
1434   }
1435 
  // Get the target table.
1437   __tgt_target_table *TargetTable = nullptr;
1438   {
1439     std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
1440     assert(TM->Table->TargetsTable.size() > (size_t)DeviceId &&
1441            "Not expecting a device ID outside the table's bounds!");
1442     TargetTable = TM->Table->TargetsTable[DeviceId];
1443   }
  assert(TargetTable && "Global data has not been mapped");
1445 
1446   // We need to keep bases and offsets separate. Sometimes (e.g. in OpenCL) we
1447   // need to manifest base pointers prior to launching a kernel. Even if we have
1448   // mapped an object only partially, e.g. A[N:M], although the kernel is
1449   // expected to access elements starting at address &A[N] and beyond, we still
1450   // need to manifest the base of the array &A[0]. In other cases, e.g. the COI
1451   // API, we need the begin address itself, i.e. &A[N], as the API operates on
1452   // begin addresses, not bases. That's why we pass args and offsets as two
  // separate entities so that each plugin can do what it needs. This behavior
  // was introduced via https://reviews.llvm.org/D33028 and commit 1546d319244c.
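  //
  // Illustrative example (not from the code): for map(tofrom: A[2:8]),
  // Args[I] == &A[2] and ArgBases[I] == &A[0], so the recorded offset is
  // (char *)&A[0] - (char *)&A[2], i.e. -2 * sizeof(A[0]); a plugin that
  // needs the base can recompute it as TgtArgs[I] + TgtOffsets[I].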
1455   std::vector<void *> TgtArgs;
1456   std::vector<ptrdiff_t> TgtOffsets;
1457 
1458   PrivateArgumentManagerTy PrivateArgumentManager(Device, AsyncInfo);
1459 
1460   int Ret;
1461   if (ArgNum) {
1462     // Process data, such as data mapping, before launching the kernel
1463     Ret = processDataBefore(loc, DeviceId, HostPtr, ArgNum, ArgBases, Args,
1464                             ArgSizes, ArgTypes, ArgNames, ArgMappers, TgtArgs,
1465                             TgtOffsets, PrivateArgumentManager, AsyncInfo);
1466     if (Ret != OFFLOAD_SUCCESS) {
1467       REPORT("Failed to process data before launching the kernel.\n");
1468       return OFFLOAD_FAIL;
1469     }
1470   }
1471 
1472   // Launch device execution.
1473   void *TgtEntryPtr = TargetTable->EntriesBegin[TM->Index].addr;
1474   DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n",
1475      TargetTable->EntriesBegin[TM->Index].name, DPxPTR(TgtEntryPtr), TM->Index);
1476 
1477   {
1478     TIMESCOPE_WITH_NAME_AND_IDENT(
1479         IsTeamConstruct ? "runTargetTeamRegion" : "runTargetRegion", loc);
    // Use data() rather than &TgtArgs[0]: the vectors are empty when
    // ArgNum == 0, and indexing an empty vector is undefined behavior.
    if (IsTeamConstruct)
      Ret = Device.runTeamRegion(TgtEntryPtr, TgtArgs.data(),
                                 TgtOffsets.data(), TgtArgs.size(), TeamNum,
                                 ThreadLimit, getLoopTripCount(DeviceId),
                                 AsyncInfo);
    else
      Ret = Device.runRegion(TgtEntryPtr, TgtArgs.data(), TgtOffsets.data(),
                             TgtArgs.size(), AsyncInfo);
1487   }
1488 
1489   if (Ret != OFFLOAD_SUCCESS) {
1490     REPORT("Executing target region abort target.\n");
1491     return OFFLOAD_FAIL;
1492   }
1493 
1494   if (ArgNum) {
1495     // Transfer data back and deallocate target memory for (first-)private
1496     // variables
1497     Ret = processDataAfter(loc, DeviceId, HostPtr, ArgNum, ArgBases, Args,
1498                            ArgSizes, ArgTypes, ArgNames, ArgMappers,
1499                            PrivateArgumentManager, AsyncInfo);
1500     if (Ret != OFFLOAD_SUCCESS) {
1501       REPORT("Failed to process data after launching the kernel.\n");
1502       return OFFLOAD_FAIL;
1503     }
1504   }
1505 
1506   return OFFLOAD_SUCCESS;
1507 }
1508