//===------ omptarget.cpp - Target independent OpenMP target RTL -- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Implementation of the interface to be used by Clang during the codegen of a
// target region.
//
//===----------------------------------------------------------------------===//

#include "omptarget.h"
#include "device.h"
#include "private.h"
#include "rtl.h"

#include <cassert>
#include <cstdint>
#include <vector>

int AsyncInfoTy::synchronize() {
  int Result = OFFLOAD_SUCCESS;
  if (AsyncInfo.Queue) {
    // If we have a queue we need to synchronize it now.
    Result = Device.synchronize(*this);
    assert(AsyncInfo.Queue == nullptr &&
           "The device plugin should have nulled the queue to indicate there "
           "are no outstanding actions!");
  }
  return Result;
}

void *&AsyncInfoTy::getVoidPtrLocation() {
  BufferLocations.push_back(nullptr);
  return BufferLocations.back();
}
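
// Usage sketch (illustrative, mirroring the PTR_AND_OBJ update in
// targetDataBegin below): asynchronous submissions need a host address that
// outlives this call, so callers write
//   void *&TgtPtrBase = AsyncInfo.getVoidPtrLocation();
//   TgtPtrBase = ExpectedTgtPtrBase;
//   Device.submitData(PointerTgtPtrBegin, &TgtPtrBase, sizeof(void *),
//                     AsyncInfo);
// instead of passing the address of a local variable that may die before the
// transfer completes.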

/* All begin addresses for partially mapped structs must be 8-aligned in order
 * to ensure proper alignment of members. E.g.
 *
 * struct S {
 *   int a;   // 4-aligned
 *   int b;   // 4-aligned
 *   int *p;  // 8-aligned
 * } s1;
 * ...
 * #pragma omp target map(tofrom: s1.b, s1.p[0:N])
 * {
 *   s1.b = 5;
 *   for (int i...) s1.p[i] = ...;
 * }
 *
 * Here we are mapping s1 starting from member b, so BaseAddress=&s1=&s1.a and
 * BeginAddress=&s1.b. Let's assume that the struct begins at address 0x100,
 * then &s1.a=0x100, &s1.b=0x104, &s1.p=0x108. Each member obeys the alignment
 * requirements for its type. Now, when we allocate memory on the device, in
 * CUDA's case cuMemAlloc() returns an address which is at least 256-aligned.
 * This means that the chunk of the struct on the device will start at a
 * 256-aligned address, let's say 0x200. Then the address of b will be 0x200 and
 * address of p will be a misaligned 0x204 (on the host there was no need to
 * add padding between b and p, so p comes exactly 4 bytes after b). If the
 * device kernel tries to access s1.p, a misaligned address error occurs (as
 * reported by the CUDA plugin). By aligning the begin address down to a
 * multiple of 8 and extending the size of the allocated chunk accordingly, the
 * chunk on the device will start at 0x200 with the padding (4 bytes), then
 * &s1.b=0x204 and &s1.p=0x208, as required to satisfy the alignment
 * requirements.
 */
static const int64_t Alignment = 8;
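
// For example (illustrative numbers): with Alignment == 8 and
// HstPtrBegin == 0x104, targetDataBegin computes padding = 0x104 % 8 == 4,
// maps from 0x100 instead, and grows the mapped size by those 4 bytes;
// targetDataEnd repeats the same computation so both sides agree on the
// mapped region.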

/// Map global data and execute pending ctors
static int InitLibrary(DeviceTy &Device) {
  /*
   * Map global data
   */
  int32_t device_id = Device.DeviceID;
  int rc = OFFLOAD_SUCCESS;
  bool supportsEmptyImages = Device.RTL->supports_empty_images &&
                             Device.RTL->supports_empty_images() > 0;

  Device.PendingGlobalsMtx.lock();
  PM->TrlTblMtx.lock();
  for (auto *HostEntriesBegin : PM->HostEntriesBeginRegistrationOrder) {
    TranslationTable *TransTable =
        &PM->HostEntriesBeginToTransTable[HostEntriesBegin];
    if (TransTable->HostTable.EntriesBegin ==
            TransTable->HostTable.EntriesEnd &&
        !supportsEmptyImages) {
      // No host entry so no need to proceed
      continue;
    }

    if (TransTable->TargetsTable[device_id] != 0) {
      // Library entries have already been processed
      continue;
    }

    // 1) get image.
    assert(TransTable->TargetsImages.size() > (size_t)device_id &&
           "Not expecting a device ID outside the table's bounds!");
    __tgt_device_image *img = TransTable->TargetsImages[device_id];
    if (!img) {
      REPORT("No image loaded for device id %d.\n", device_id);
      rc = OFFLOAD_FAIL;
      break;
    }
    // 2) load image into the target table.
    __tgt_target_table *TargetTable = TransTable->TargetsTable[device_id] =
        Device.load_binary(img);
    // Unable to get table for this image: invalidate image and fail.
    if (!TargetTable) {
      REPORT("Unable to generate entries table for device id %d.\n", device_id);
      TransTable->TargetsImages[device_id] = 0;
      rc = OFFLOAD_FAIL;
      break;
    }

    // Verify whether the two table sizes match.
    size_t hsize =
        TransTable->HostTable.EntriesEnd - TransTable->HostTable.EntriesBegin;
    size_t tsize = TargetTable->EntriesEnd - TargetTable->EntriesBegin;

    // Invalid image for these host entries!
    if (hsize != tsize) {
      REPORT("Host and Target tables mismatch for device id %d [%zx != %zx].\n",
             device_id, hsize, tsize);
      TransTable->TargetsImages[device_id] = 0;
      TransTable->TargetsTable[device_id] = 0;
      rc = OFFLOAD_FAIL;
      break;
    }

    // process global data that needs to be mapped.
    Device.DataMapMtx.lock();
    __tgt_target_table *HostTable = &TransTable->HostTable;
    for (__tgt_offload_entry *CurrDeviceEntry = TargetTable->EntriesBegin,
                             *CurrHostEntry = HostTable->EntriesBegin,
                             *EntryDeviceEnd = TargetTable->EntriesEnd;
         CurrDeviceEntry != EntryDeviceEnd;
         CurrDeviceEntry++, CurrHostEntry++) {
      if (CurrDeviceEntry->size != 0) {
        // has data.
        assert(CurrDeviceEntry->size == CurrHostEntry->size &&
               "data size mismatch");

        // Fortran may use multiple weak declarations for the same symbol,
        // therefore we must allow for multiple weak symbols to be loaded from
        // the fat binary. Treat these mappings as any other "regular" mapping.
        // Add entry to map.
        if (Device.getTgtPtrBegin(CurrHostEntry->addr, CurrHostEntry->size))
          continue;
        DP("Add mapping from host " DPxMOD " to device " DPxMOD " with size %zu"
           "\n",
           DPxPTR(CurrHostEntry->addr), DPxPTR(CurrDeviceEntry->addr),
           CurrDeviceEntry->size);
        Device.HostDataToTargetMap.emplace(
            (uintptr_t)CurrHostEntry->addr /*HstPtrBase*/,
            (uintptr_t)CurrHostEntry->addr /*HstPtrBegin*/,
            (uintptr_t)CurrHostEntry->addr + CurrHostEntry->size /*HstPtrEnd*/,
            (uintptr_t)CurrDeviceEntry->addr /*TgtPtrBegin*/,
            false /*UseHoldRefCount*/, nullptr /*Name*/,
            true /*IsRefCountINF*/);
      }
    }
    Device.DataMapMtx.unlock();
  }
  PM->TrlTblMtx.unlock();

  if (rc != OFFLOAD_SUCCESS) {
    Device.PendingGlobalsMtx.unlock();
    return rc;
  }

  /*
   * Run ctors for static objects
   */
  if (!Device.PendingCtorsDtors.empty()) {
    AsyncInfoTy AsyncInfo(Device);
    // Call all ctors for all libraries registered so far
    for (auto &lib : Device.PendingCtorsDtors) {
      if (!lib.second.PendingCtors.empty()) {
        DP("Has pending ctors... call now\n");
        for (auto &entry : lib.second.PendingCtors) {
          void *ctor = entry;
          int rc =
              target(nullptr, Device, ctor, 0, nullptr, nullptr, nullptr,
                     nullptr, nullptr, nullptr, 1, 1, true /*team*/, AsyncInfo);
          if (rc != OFFLOAD_SUCCESS) {
            REPORT("Running ctor " DPxMOD " failed.\n", DPxPTR(ctor));
            Device.PendingGlobalsMtx.unlock();
            return OFFLOAD_FAIL;
          }
        }
        // Clear the list to indicate that this device has been used
        lib.second.PendingCtors.clear();
        DP("Done with pending ctors for lib " DPxMOD "\n", DPxPTR(lib.first));
      }
    }
    // All constructors have been issued, wait for them now.
    if (AsyncInfo.synchronize() != OFFLOAD_SUCCESS) {
      // Keep lock discipline consistent with the other error paths above:
      // release the pending-globals mutex before bailing out.
      Device.PendingGlobalsMtx.unlock();
      return OFFLOAD_FAIL;
    }
  }
  Device.HasPendingGlobals = false;
  Device.PendingGlobalsMtx.unlock();

  return OFFLOAD_SUCCESS;
}

void handleTargetOutcome(bool Success, ident_t *Loc) {
  switch (PM->TargetOffloadPolicy) {
  case tgt_disabled:
    if (Success) {
      FATAL_MESSAGE0(1, "expected no offloading while offloading is disabled");
    }
    break;
  case tgt_default:
    FATAL_MESSAGE0(1, "default offloading policy must be switched to "
                      "mandatory or disabled");
    break;
  case tgt_mandatory:
    if (!Success) {
      if (getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE)
        for (auto &Device : PM->Devices)
          dumpTargetPointerMappings(Loc, *Device);
      else
        FAILURE_MESSAGE("Run with LIBOMPTARGET_INFO=%d to dump host-target "
                        "pointer mappings.\n",
                        OMP_INFOTYPE_DUMP_TABLE);

      SourceInfo info(Loc);
      if (info.isAvailible())
        fprintf(stderr, "%s:%d:%d: ", info.getFilename(), info.getLine(),
                info.getColumn());
      else
        FAILURE_MESSAGE("Source location information not present. Compile with "
                        "-g or -gline-tables-only.\n");
      FATAL_MESSAGE0(
          1, "failure of target construct while offloading is mandatory");
    } else {
      if (getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE)
        for (auto &Device : PM->Devices)
          dumpTargetPointerMappings(Loc, *Device);
    }
    break;
  }
}

static void handleDefaultTargetOffload() {
  PM->TargetOffloadMtx.lock();
  if (PM->TargetOffloadPolicy == tgt_default) {
    if (omp_get_num_devices() > 0) {
      DP("Default TARGET OFFLOAD policy is now mandatory "
         "(devices were found)\n");
      PM->TargetOffloadPolicy = tgt_mandatory;
    } else {
      DP("Default TARGET OFFLOAD policy is now disabled "
         "(no devices were found)\n");
      PM->TargetOffloadPolicy = tgt_disabled;
    }
  }
  PM->TargetOffloadMtx.unlock();
}

static bool isOffloadDisabled() {
  if (PM->TargetOffloadPolicy == tgt_default)
    handleDefaultTargetOffload();
  return PM->TargetOffloadPolicy == tgt_disabled;
}

// If offload is enabled, ensure that device DeviceID has been initialized,
// global ctors have been executed, and global data has been mapped.
//
// The returned bool indicates if the offload is to the host device.
// There are three possible results:
// - Return false if the target device is ready for offload.
// - Return true without reporting a runtime error if offload is
//   disabled, perhaps because the initial device was specified.
// - Report a runtime error and return true.
//
// If DeviceID == OFFLOAD_DEVICE_DEFAULT, set DeviceID to the default device.
// This step might be skipped if offload is disabled.
bool checkDeviceAndCtors(int64_t &DeviceID, ident_t *Loc) {
  if (isOffloadDisabled()) {
    DP("Offload is disabled\n");
    return true;
  }

  if (DeviceID == OFFLOAD_DEVICE_DEFAULT) {
    DeviceID = omp_get_default_device();
    DP("Use default device id %" PRId64 "\n", DeviceID);
  }

  // Proposed behavior for OpenMP 5.2 in OpenMP spec github issue 2669.
  if (omp_get_num_devices() == 0) {
    DP("omp_get_num_devices() == 0 but offload is mandatory\n");
    handleTargetOutcome(false, Loc);
    return true;
  }

  if (DeviceID == omp_get_initial_device()) {
    DP("Device is host (%" PRId64 "), returning as if offload is disabled\n",
       DeviceID);
    return true;
  }

  // Is device ready?
  if (!device_is_ready(DeviceID)) {
    REPORT("Device %" PRId64 " is not ready.\n", DeviceID);
    handleTargetOutcome(false, Loc);
    return true;
  }

  // Get device info.
  DeviceTy &Device = *PM->Devices[DeviceID];

  // Check whether global data has been mapped for this device
  Device.PendingGlobalsMtx.lock();
  bool hasPendingGlobals = Device.HasPendingGlobals;
  Device.PendingGlobalsMtx.unlock();
  if (hasPendingGlobals && InitLibrary(Device) != OFFLOAD_SUCCESS) {
    REPORT("Failed to init globals on device %" PRId64 "\n", DeviceID);
    handleTargetOutcome(false, Loc);
    return true;
  }

  return false;
}

static int32_t getParentIndex(int64_t type) {
  return ((type & OMP_TGT_MAPTYPE_MEMBER_OF) >> 48) - 1;
}
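
// The 16 most-significant bits of the map type (OMP_TGT_MAPTYPE_MEMBER_OF)
// hold "position of the parent entry + 1". For example (illustrative): a type
// carrying 0x0001000000000000 in those bits yields parent index 0, while a
// type with the field cleared yields -1, meaning "no parent".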

void *targetAllocExplicit(size_t size, int device_num, int kind,
                          const char *name) {
  TIMESCOPE();
  DP("Call to %s for device %d requesting %zu bytes\n", name, device_num, size);

  if (size <= 0) {
    DP("Call to %s with non-positive length\n", name);
    return NULL;
  }

  void *rc = NULL;

  if (device_num == omp_get_initial_device()) {
    rc = malloc(size);
    DP("%s returns host ptr " DPxMOD "\n", name, DPxPTR(rc));
    return rc;
  }

  if (!device_is_ready(device_num)) {
    DP("%s returns NULL ptr\n", name);
    return NULL;
  }

  DeviceTy &Device = *PM->Devices[device_num];
  rc = Device.allocData(size, nullptr, kind);
  DP("%s returns device ptr " DPxMOD "\n", name, DPxPTR(rc));
  return rc;
}

/// Call the user-defined mapper function followed by the appropriate
/// targetData* function (targetData{Begin,End,Update}).
int targetDataMapper(ident_t *loc, DeviceTy &Device, void *arg_base, void *arg,
                     int64_t arg_size, int64_t arg_type,
                     map_var_info_t arg_names, void *arg_mapper,
                     AsyncInfoTy &AsyncInfo,
                     TargetDataFuncPtrTy target_data_function) {
  TIMESCOPE_WITH_IDENT(loc);
  DP("Calling the mapper function " DPxMOD "\n", DPxPTR(arg_mapper));

  // The mapper function fills up Components.
  MapperComponentsTy MapperComponents;
  MapperFuncPtrTy MapperFuncPtr = (MapperFuncPtrTy)(arg_mapper);
  (*MapperFuncPtr)((void *)&MapperComponents, arg_base, arg, arg_size, arg_type,
                   arg_names);

  // Construct new arrays for args_base, args, arg_sizes and arg_types
  // using the information in MapperComponents and call the corresponding
  // targetData* function using these new arrays.
  std::vector<void *> MapperArgsBase(MapperComponents.Components.size());
  std::vector<void *> MapperArgs(MapperComponents.Components.size());
  std::vector<int64_t> MapperArgSizes(MapperComponents.Components.size());
  std::vector<int64_t> MapperArgTypes(MapperComponents.Components.size());
  std::vector<void *> MapperArgNames(MapperComponents.Components.size());

  for (unsigned I = 0, E = MapperComponents.Components.size(); I < E; ++I) {
    auto &C = MapperComponents.Components[I];
    MapperArgsBase[I] = C.Base;
    MapperArgs[I] = C.Begin;
    MapperArgSizes[I] = C.Size;
    MapperArgTypes[I] = C.Type;
    MapperArgNames[I] = C.Name;
  }

  int rc = target_data_function(loc, Device, MapperComponents.Components.size(),
                                MapperArgsBase.data(), MapperArgs.data(),
                                MapperArgSizes.data(), MapperArgTypes.data(),
                                MapperArgNames.data(), /*arg_mappers*/ nullptr,
                                AsyncInfo, /*FromMapper=*/true);

  return rc;
}
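
// For example, targetDataBegin below passes itself as target_data_function,
// so the entries produced by a user-defined mapper are mapped with exactly
// the same logic as directly specified map clauses; targetDataEnd and
// targetDataUpdate do the same for their respective paths.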

/// Internal function to do the mapping and transfer the data to the device
int targetDataBegin(ident_t *loc, DeviceTy &Device, int32_t arg_num,
                    void **args_base, void **args, int64_t *arg_sizes,
                    int64_t *arg_types, map_var_info_t *arg_names,
                    void **arg_mappers, AsyncInfoTy &AsyncInfo,
                    bool FromMapper) {
  // process each input.
  for (int32_t i = 0; i < arg_num; ++i) {
    // Ignore private variables and arrays - there is no mapping for them.
    if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) ||
        (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE))
      continue;

    if (arg_mappers && arg_mappers[i]) {
      // Instead of executing the regular path of targetDataBegin, call the
      // targetDataMapper variant which will call targetDataBegin again
      // with new arguments.
      DP("Calling targetDataMapper for the %dth argument\n", i);

      map_var_info_t arg_name = (!arg_names) ? nullptr : arg_names[i];
      int rc = targetDataMapper(loc, Device, args_base[i], args[i],
                                arg_sizes[i], arg_types[i], arg_name,
                                arg_mappers[i], AsyncInfo, targetDataBegin);

      if (rc != OFFLOAD_SUCCESS) {
        REPORT("Call to targetDataBegin via targetDataMapper for custom mapper"
               " failed.\n");
        return OFFLOAD_FAIL;
      }

      // Skip the rest of this function, continue to the next argument.
      continue;
    }

    void *HstPtrBegin = args[i];
    void *HstPtrBase = args_base[i];
    int64_t data_size = arg_sizes[i];
    map_var_info_t HstPtrName = (!arg_names) ? nullptr : arg_names[i];

    // Adjust for proper alignment if this is a combined entry (for structs).
    // Look at the next argument - if that is MEMBER_OF this one, then this one
    // is a combined entry.
    int64_t padding = 0;
    const int next_i = i + 1;
    if (getParentIndex(arg_types[i]) < 0 && next_i < arg_num &&
        getParentIndex(arg_types[next_i]) == i) {
      padding = (int64_t)HstPtrBegin % Alignment;
      if (padding) {
        DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD
           "\n",
           padding, DPxPTR(HstPtrBegin));
        HstPtrBegin = (char *)HstPtrBegin - padding;
        data_size += padding;
      }
    }

    // Address of pointer on the host and device, respectively.
    void *Pointer_HstPtrBegin, *PointerTgtPtrBegin;
    TargetPointerResultTy Pointer_TPR;
    bool IsHostPtr = false;
    bool IsImplicit = arg_types[i] & OMP_TGT_MAPTYPE_IMPLICIT;
    // Force the creation of a device side copy of the data when:
    // a close map modifier was associated with a map that contained a to.
    bool HasCloseModifier = arg_types[i] & OMP_TGT_MAPTYPE_CLOSE;
    bool HasPresentModifier = arg_types[i] & OMP_TGT_MAPTYPE_PRESENT;
    bool HasHoldModifier = arg_types[i] & OMP_TGT_MAPTYPE_OMPX_HOLD;
    // UpdateRef is based on MEMBER_OF instead of TARGET_PARAM because if we
    // have reached this point via __tgt_target_data_begin and not __tgt_target
    // then no argument is marked as TARGET_PARAM ("omp target data map" is not
    // associated with a target region, so there are no target parameters). This
    // may be considered a hack, we could revise the scheme in the future.
    bool UpdateRef =
        !(arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) && !(FromMapper && i == 0);
    if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) {
      DP("Has a pointer entry: \n");
      // Base is address of pointer.
      //
      // Usually, the pointer is already allocated by this time.  For example:
      //
      //   #pragma omp target map(s.p[0:N])
      //
      // The map entry for s comes first, and the PTR_AND_OBJ entry comes
      // afterward, so the pointer is already allocated by the time the
      // PTR_AND_OBJ entry is handled below, and PointerTgtPtrBegin is thus
      // non-null.  However, "declare target link" can produce a PTR_AND_OBJ
      // entry for a global that might not already be allocated by the time the
      // PTR_AND_OBJ entry is handled below, and so the allocation might fail
      // when HasPresentModifier.
      Pointer_TPR = Device.getTargetPointer(
          HstPtrBase, HstPtrBase, sizeof(void *), /*HstPtrName=*/nullptr,
          /*HasFlagTo=*/false, /*HasFlagAlways=*/false, IsImplicit, UpdateRef,
          HasCloseModifier, HasPresentModifier, HasHoldModifier, AsyncInfo);
      PointerTgtPtrBegin = Pointer_TPR.TargetPointer;
      IsHostPtr = Pointer_TPR.Flags.IsHostPointer;
      if (!PointerTgtPtrBegin) {
        REPORT("Call to getTargetPointer returned null pointer (%s).\n",
               HasPresentModifier ? "'present' map type modifier"
                                  : "device failure or illegal mapping");
        return OFFLOAD_FAIL;
      }
      DP("There are %zu bytes allocated at target address " DPxMOD " - is%s new"
         "\n",
         sizeof(void *), DPxPTR(PointerTgtPtrBegin),
         (Pointer_TPR.Flags.IsNewEntry ? "" : " not"));
      Pointer_HstPtrBegin = HstPtrBase;
      // modify current entry.
      HstPtrBase = *(void **)HstPtrBase;
      // No need to update pointee ref count for the first element of the
      // subelement that comes from mapper.
      UpdateRef =
          (!FromMapper || i != 0); // subsequently update ref count of pointee
    }

    const bool HasFlagTo = arg_types[i] & OMP_TGT_MAPTYPE_TO;
    const bool HasFlagAlways = arg_types[i] & OMP_TGT_MAPTYPE_ALWAYS;
    auto TPR = Device.getTargetPointer(
        HstPtrBegin, HstPtrBase, data_size, HstPtrName, HasFlagTo,
        HasFlagAlways, IsImplicit, UpdateRef, HasCloseModifier,
        HasPresentModifier, HasHoldModifier, AsyncInfo);
    void *TgtPtrBegin = TPR.TargetPointer;
    IsHostPtr = TPR.Flags.IsHostPointer;
    // If data_size==0, then the argument could be a zero-length pointer to
    // NULL, so getOrAlloc() returning NULL is not an error.
    if (!TgtPtrBegin && (data_size || HasPresentModifier)) {
      REPORT("Call to getTargetPointer returned null pointer (%s).\n",
             HasPresentModifier ? "'present' map type modifier"
                                : "device failure or illegal mapping");
      return OFFLOAD_FAIL;
    }
    DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
       " - is%s new\n",
       data_size, DPxPTR(TgtPtrBegin), (TPR.Flags.IsNewEntry ? "" : " not"));

    if (arg_types[i] & OMP_TGT_MAPTYPE_RETURN_PARAM) {
      uintptr_t Delta = (uintptr_t)HstPtrBegin - (uintptr_t)HstPtrBase;
      void *TgtPtrBase = (void *)((uintptr_t)TgtPtrBegin - Delta);
      DP("Returning device pointer " DPxMOD "\n", DPxPTR(TgtPtrBase));
      args_base[i] = TgtPtrBase;
    }

    if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ && !IsHostPtr) {
      // Check whether we need to update the pointer on the device
      bool UpdateDevPtr = false;

      uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
      void *ExpectedTgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta);

      Device.ShadowMtx.lock();
      auto Entry = Device.ShadowPtrMap.find(Pointer_HstPtrBegin);
      // If this pointer is not in the map we need to insert it. If the map
      // contains a stale entry, we need to update it (e.g. if the pointee was
      // deallocated and later on is reallocated at another device address). The
      // latter scenario is the subject of LIT test env/base_ptr_ref_count.c. An
      // entry is removed from ShadowPtrMap only when the PTR of a PTR_AND_OBJ
      // pair is deallocated, not when the OBJ is deallocated. In
      // env/base_ptr_ref_count.c the PTR is a global "declare target" pointer,
      // so it stays in the map for the lifetime of the application. When the
      // OBJ is deallocated and later on allocated again (at a different device
      // address), ShadowPtrMap still contains an entry for Pointer_HstPtrBegin
      // which is stale, pointing to the old ExpectedTgtPtrBase of the OBJ.
      if (Entry == Device.ShadowPtrMap.end() ||
          Entry->second.TgtPtrVal != ExpectedTgtPtrBase) {
        // create or update shadow pointers for this entry
        Device.ShadowPtrMap[Pointer_HstPtrBegin] = {
            HstPtrBase, PointerTgtPtrBegin, ExpectedTgtPtrBase};
        Pointer_TPR.MapTableEntry->setMayContainAttachedPointers();
        UpdateDevPtr = true;
      }

      if (UpdateDevPtr) {
        HostDataToTargetTy::LockGuard LG(*Pointer_TPR.MapTableEntry);
        Device.ShadowMtx.unlock();

        DP("Update pointer (" DPxMOD ") -> [" DPxMOD "]\n",
           DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));

        void *&TgtPtrBase = AsyncInfo.getVoidPtrLocation();
        TgtPtrBase = ExpectedTgtPtrBase;

        int Ret = Device.submitData(PointerTgtPtrBegin, &TgtPtrBase,
                                    sizeof(void *), AsyncInfo);
        if (Ret != OFFLOAD_SUCCESS) {
          REPORT("Copying data to device failed.\n");
          return OFFLOAD_FAIL;
        }
        if (Pointer_TPR.MapTableEntry->addEventIfNecessary(Device, AsyncInfo) !=
            OFFLOAD_SUCCESS)
          return OFFLOAD_FAIL;
      } else
        Device.ShadowMtx.unlock();
    }
  }

  return OFFLOAD_SUCCESS;
}

namespace {
/// This structure contains the information needed to deallocate a target
/// pointer, i.e., to call the function \p DeviceTy::deallocTgtPtr.
struct DeallocTgtPtrInfo {
  /// Host pointer used to look up into the map table
  void *HstPtrBegin;
  /// Size of the data
  int64_t DataSize;
  /// Whether it has \p ompx_hold modifier
  bool HasHoldModifier;

  DeallocTgtPtrInfo(void *HstPtr, int64_t Size, bool HasHoldModifier)
      : HstPtrBegin(HstPtr), DataSize(Size), HasHoldModifier(HasHoldModifier) {}
};

/// Apply \p CB to the shadow map pointer entries in the range [\p Begin,
/// \p Begin + \p Size). \p CB is called with a locked shadow pointer map and
/// the passed iterator can be updated. If the callback returns OFFLOAD_FAIL,
/// the rest of the map is not checked anymore.
template <typename CBTy>
static void applyToShadowMapEntries(DeviceTy &Device, CBTy CB, void *Begin,
                                    uintptr_t Size,
                                    const TargetPointerResultTy &TPR) {
  // If we have an object that is too small to hold a pointer subobject, no need
  // to do any checking.
  if (Size < sizeof(void *))
    return;

  // If the map entry for the object was never marked as containing attached
  // pointers, no need to do any checking.
  if (TPR.MapTableEntry == HostDataToTargetListTy::iterator{} ||
      !TPR.MapTableEntry->getMayContainAttachedPointers())
    return;

  uintptr_t LB = (uintptr_t)Begin;
  uintptr_t UB = LB + Size;
  // Now we are looking into the shadow map so we need to lock it.
  Device.ShadowMtx.lock();
  for (ShadowPtrListTy::iterator Itr = Device.ShadowPtrMap.begin();
       Itr != Device.ShadowPtrMap.end();) {
    uintptr_t ShadowHstPtrAddr = (uintptr_t)Itr->first;

    // An STL map is sorted on its keys; use this property
    // to quickly determine when to break out of the loop.
    if (ShadowHstPtrAddr < LB) {
      ++Itr;
      continue;
    }
    if (ShadowHstPtrAddr >= UB)
      break;

    if (CB(Itr) == OFFLOAD_FAIL)
      break;
  }
  Device.ShadowMtx.unlock();
}

} // namespace

/// Internal function to undo the mapping and retrieve the data from the device.
int targetDataEnd(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
                  void **ArgBases, void **Args, int64_t *ArgSizes,
                  int64_t *ArgTypes, map_var_info_t *ArgNames,
                  void **ArgMappers, AsyncInfoTy &AsyncInfo, bool FromMapper) {
  int Ret;
  std::vector<DeallocTgtPtrInfo> DeallocTgtPtrs;
  void *FromMapperBase = nullptr;
  // process each input.
  for (int32_t I = ArgNum - 1; I >= 0; --I) {
    // Ignore private variables and arrays - there is no mapping for them.
    // Also, ignore the use_device_ptr directive; it has no effect here.
    if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
        (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
      continue;

    if (ArgMappers && ArgMappers[I]) {
      // Instead of executing the regular path of targetDataEnd, call the
      // targetDataMapper variant which will call targetDataEnd again
      // with new arguments.
      DP("Calling targetDataMapper for the %dth argument\n", I);

      map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
      Ret = targetDataMapper(loc, Device, ArgBases[I], Args[I], ArgSizes[I],
                             ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
                             targetDataEnd);

      if (Ret != OFFLOAD_SUCCESS) {
        REPORT("Call to targetDataEnd via targetDataMapper for custom mapper"
               " failed.\n");
        return OFFLOAD_FAIL;
      }

      // Skip the rest of this function, continue to the next argument.
      continue;
    }

    void *HstPtrBegin = Args[I];
    int64_t DataSize = ArgSizes[I];
    // Adjust for proper alignment if this is a combined entry (for structs).
    // Look at the next argument - if that is MEMBER_OF this one, then this one
    // is a combined entry.
    const int NextI = I + 1;
    if (getParentIndex(ArgTypes[I]) < 0 && NextI < ArgNum &&
        getParentIndex(ArgTypes[NextI]) == I) {
      int64_t Padding = (int64_t)HstPtrBegin % Alignment;
      if (Padding) {
        DP("Using a Padding of %" PRId64 " bytes for begin address " DPxMOD
           "\n",
           Padding, DPxPTR(HstPtrBegin));
        HstPtrBegin = (char *)HstPtrBegin - Padding;
        DataSize += Padding;
      }
    }

    bool IsLast, IsHostPtr;
    bool IsImplicit = ArgTypes[I] & OMP_TGT_MAPTYPE_IMPLICIT;
    bool UpdateRef = (!(ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) ||
                      (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) &&
                     !(FromMapper && I == 0);
    bool ForceDelete = ArgTypes[I] & OMP_TGT_MAPTYPE_DELETE;
    bool HasPresentModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_PRESENT;
    bool HasHoldModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_OMPX_HOLD;

    // If PTR_AND_OBJ, HstPtrBegin is address of pointee
    TargetPointerResultTy TPR = Device.getTgtPtrBegin(
        HstPtrBegin, DataSize, IsLast, UpdateRef, HasHoldModifier, IsHostPtr,
        !IsImplicit, ForceDelete);
    void *TgtPtrBegin = TPR.TargetPointer;
    if (!TgtPtrBegin && (DataSize || HasPresentModifier)) {
      DP("Mapping does not exist (%s)\n",
         (HasPresentModifier ? "'present' map type modifier" : "ignored"));
      if (HasPresentModifier) {
        // OpenMP 5.1, sec. 2.21.7.1 "map Clause", p. 350 L10-13:
        // "If a map clause appears on a target, target data, target enter data
        // or target exit data construct with a present map-type-modifier then
        // on entry to the region if the corresponding list item does not appear
        // in the device data environment then an error occurs and the program
        // terminates."
        //
        // This should be an error upon entering an "omp target exit data".  It
        // should not be an error upon exiting an "omp target data" or "omp
        // target".  For "omp target data", Clang thus doesn't include present
        // modifiers for end calls.  For "omp target", we have not found a valid
        // OpenMP program for which the error matters: it appears that, if a
        // program can guarantee that data is present at the beginning of an
        // "omp target" region so that there's no error there, that data is also
        // guaranteed to be present at the end.
        MESSAGE("device mapping required by 'present' map type modifier does "
                "not exist for host address " DPxMOD " (%" PRId64 " bytes)",
                DPxPTR(HstPtrBegin), DataSize);
        return OFFLOAD_FAIL;
      }
    } else {
      DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
         " - is%s last\n",
         DataSize, DPxPTR(TgtPtrBegin), (IsLast ? "" : " not"));
    }

    // OpenMP 5.1, sec. 2.21.7.1 "map Clause", p. 351 L14-16:
    // "If the map clause appears on a target, target data, or target exit data
    // construct and a corresponding list item of the original list item is not
    // present in the device data environment on exit from the region then the
    // list item is ignored."
    if (!TgtPtrBegin)
      continue;

    bool DelEntry = IsLast;

    // If this is the last element from the mapper (for end transfers, the
    // arguments come in reverse order), do not remove the partial entry; the
    // parent struct still exists.
    if ((ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
        !(ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) {
      DelEntry = false; // protect parent struct from being deallocated
    }

    if ((ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) || DelEntry) {
      // Move data back to the host
      if (ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) {
        bool Always = ArgTypes[I] & OMP_TGT_MAPTYPE_ALWAYS;
        if ((Always || IsLast) && !IsHostPtr) {
          DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
             DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
          Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, DataSize,
                                    AsyncInfo);
          if (Ret != OFFLOAD_SUCCESS) {
            REPORT("Copying data from device failed.\n");
            return OFFLOAD_FAIL;
          }
        }
      }
      if (DelEntry && FromMapper && I == 0) {
        DelEntry = false;
        FromMapperBase = HstPtrBegin;
      }

      // If we copied back to the host a struct/array containing pointers, we
      // need to restore the original host pointer values from their shadow
      // copies. If the struct is going to be deallocated, remove any remaining
      // shadow pointer entries for this struct.
      auto CB = [&](ShadowPtrListTy::iterator &Itr) {
        // If we copied the struct to the host, we need to restore the pointer.
        if (ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) {
          void **ShadowHstPtrAddr = (void **)Itr->first;
          *ShadowHstPtrAddr = Itr->second.HstPtrVal;
          DP("Restoring original host pointer value " DPxMOD " for host "
             "pointer " DPxMOD "\n",
             DPxPTR(Itr->second.HstPtrVal), DPxPTR(ShadowHstPtrAddr));
        }
        // If the struct is to be deallocated, remove the shadow entry.
        if (DelEntry) {
          DP("Removing shadow pointer " DPxMOD "\n",
             DPxPTR((void **)Itr->first));
          Itr = Device.ShadowPtrMap.erase(Itr);
        } else {
          ++Itr;
        }
        return OFFLOAD_SUCCESS;
      };
      applyToShadowMapEntries(Device, CB, HstPtrBegin, DataSize, TPR);

      // Add pointer to the buffer for later deallocation
      if (DelEntry && !IsHostPtr)
        DeallocTgtPtrs.emplace_back(HstPtrBegin, DataSize, HasHoldModifier);
    }
  }

  // TODO: We should not synchronize here but pass the AsyncInfo object to the
  //       allocate/deallocate device APIs.
  //
  // We need to synchronize before deallocating data.
  Ret = AsyncInfo.synchronize();
  if (Ret != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  // Deallocate target pointer
  for (DeallocTgtPtrInfo &Info : DeallocTgtPtrs) {
    if (FromMapperBase && FromMapperBase == Info.HstPtrBegin)
      continue;
    Ret = Device.deallocTgtPtr(Info.HstPtrBegin, Info.DataSize,
                               Info.HasHoldModifier);
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Deallocating data from device failed.\n");
      return OFFLOAD_FAIL;
    }
  }

  return OFFLOAD_SUCCESS;
}

static int targetDataContiguous(ident_t *loc, DeviceTy &Device, void *ArgsBase,
                                void *HstPtrBegin, int64_t ArgSize,
                                int64_t ArgType, AsyncInfoTy &AsyncInfo) {
  TIMESCOPE_WITH_IDENT(loc);
  bool IsLast, IsHostPtr;
  TargetPointerResultTy TPR = Device.getTgtPtrBegin(
      HstPtrBegin, ArgSize, IsLast, /*UpdateRefCount=*/false,
      /*UseHoldRefCount=*/false, IsHostPtr, /*MustContain=*/true);
  void *TgtPtrBegin = TPR.TargetPointer;
  if (!TgtPtrBegin) {
    DP("hst data:" DPxMOD " not found, becomes a noop\n", DPxPTR(HstPtrBegin));
    if (ArgType & OMP_TGT_MAPTYPE_PRESENT) {
      MESSAGE("device mapping required by 'present' motion modifier does not "
              "exist for host address " DPxMOD " (%" PRId64 " bytes)",
              DPxPTR(HstPtrBegin), ArgSize);
      return OFFLOAD_FAIL;
    }
    return OFFLOAD_SUCCESS;
  }

  if (IsHostPtr) {
    DP("hst data:" DPxMOD " unified and shared, becomes a noop\n",
       DPxPTR(HstPtrBegin));
    return OFFLOAD_SUCCESS;
  }

  if (ArgType & OMP_TGT_MAPTYPE_FROM) {
    DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
       ArgSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
    int Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, ArgSize, AsyncInfo);
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Copying data from device failed.\n");
      return OFFLOAD_FAIL;
    }

    auto CB = [&](ShadowPtrListTy::iterator &Itr) {
      void **ShadowHstPtrAddr = (void **)Itr->first;
      *ShadowHstPtrAddr = Itr->second.HstPtrVal;
      DP("Restoring original host pointer value " DPxMOD
         " for host pointer " DPxMOD "\n",
         DPxPTR(Itr->second.HstPtrVal), DPxPTR(ShadowHstPtrAddr));
      return OFFLOAD_SUCCESS;
    };
    applyToShadowMapEntries(Device, CB, HstPtrBegin, ArgSize, TPR);
  }

  if (ArgType & OMP_TGT_MAPTYPE_TO) {
    DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
       ArgSize, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
    int Ret = Device.submitData(TgtPtrBegin, HstPtrBegin, ArgSize, AsyncInfo);
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Copying data to device failed.\n");
      return OFFLOAD_FAIL;
    }

    auto CB = [&](ShadowPtrListTy::iterator &Itr) {
      DP("Restoring original target pointer value " DPxMOD " for target "
         "pointer " DPxMOD "\n",
         DPxPTR(Itr->second.TgtPtrVal), DPxPTR(Itr->second.TgtPtrAddr));
      Ret = Device.submitData(Itr->second.TgtPtrAddr, &Itr->second.TgtPtrVal,
                              sizeof(void *), AsyncInfo);
      if (Ret != OFFLOAD_SUCCESS)
        REPORT("Copying data to device failed.\n");
      return Ret;
    };
    applyToShadowMapEntries(Device, CB, HstPtrBegin, ArgSize, TPR);
  }
  return OFFLOAD_SUCCESS;
}

static int targetDataNonContiguous(ident_t *loc, DeviceTy &Device,
                                   void *ArgsBase,
                                   __tgt_target_non_contig *NonContig,
                                   uint64_t Size, int64_t ArgType,
                                   int CurrentDim, int DimSize, uint64_t Offset,
                                   AsyncInfoTy &AsyncInfo) {
  TIMESCOPE_WITH_IDENT(loc);
  int Ret = OFFLOAD_SUCCESS;
  if (CurrentDim < DimSize) {
    for (unsigned int I = 0; I < NonContig[CurrentDim].Count; ++I) {
      uint64_t CurOffset =
          (NonContig[CurrentDim].Offset + I) * NonContig[CurrentDim].Stride;
      // we only need to transfer the first element for the last dimension
      // since we've already got a contiguous piece.
      if (CurrentDim != DimSize - 1 || I == 0) {
        Ret = targetDataNonContiguous(loc, Device, ArgsBase, NonContig, Size,
                                      ArgType, CurrentDim + 1, DimSize,
                                      Offset + CurOffset, AsyncInfo);
        // Stop the whole process if any contiguous piece returns anything
        // other than OFFLOAD_SUCCESS.
        if (Ret != OFFLOAD_SUCCESS)
          return Ret;
      }
    }
  } else {
    char *Ptr = (char *)ArgsBase + Offset;
    DP("Transfer of non-contiguous: host ptr " DPxMOD " offset %" PRIu64
       " len %" PRIu64 "\n",
       DPxPTR(Ptr), Offset, Size);
    Ret = targetDataContiguous(loc, Device, ArgsBase, Ptr, Size, ArgType,
                               AsyncInfo);
  }
  return Ret;
}

static int getNonContigMergedDimension(__tgt_target_non_contig *NonContig,
                                       int32_t DimSize) {
  int RemovedDim = 0;
  for (int I = DimSize - 1; I > 0; --I) {
    if (NonContig[I].Count * NonContig[I].Stride == NonContig[I - 1].Stride)
      RemovedDim++;
  }
  return RemovedDim;
}
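
// For example (illustrative): a fully mapped 10x20 array of 4-byte elements
// is described by {Count=10, Stride=80} and {Count=20, Stride=4}. Since
// 20 * 4 == 80, the inner dimension already covers the outer stride, the two
// dimensions merge, and the data moves as a single contiguous transfer.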

/// Internal function to pass data to/from the target.
int targetDataUpdate(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
                     void **ArgsBase, void **Args, int64_t *ArgSizes,
                     int64_t *ArgTypes, map_var_info_t *ArgNames,
                     void **ArgMappers, AsyncInfoTy &AsyncInfo, bool) {
  // process each input.
  for (int32_t I = 0; I < ArgNum; ++I) {
    if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
        (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
      continue;

    if (ArgMappers && ArgMappers[I]) {
      // Instead of executing the regular path of targetDataUpdate, call the
      // targetDataMapper variant which will call targetDataUpdate again
      // with new arguments.
      DP("Calling targetDataMapper for the %dth argument\n", I);

      map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
      int Ret = targetDataMapper(loc, Device, ArgsBase[I], Args[I], ArgSizes[I],
                                 ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
                                 targetDataUpdate);

      if (Ret != OFFLOAD_SUCCESS) {
        REPORT("Call to targetDataUpdate via targetDataMapper for custom mapper"
               " failed.\n");
        return OFFLOAD_FAIL;
      }

      // Skip the rest of this function, continue to the next argument.
      continue;
    }

    int Ret = OFFLOAD_SUCCESS;

    if (ArgTypes[I] & OMP_TGT_MAPTYPE_NON_CONTIG) {
      __tgt_target_non_contig *NonContig = (__tgt_target_non_contig *)Args[I];
      int32_t DimSize = ArgSizes[I];
      uint64_t Size =
          NonContig[DimSize - 1].Count * NonContig[DimSize - 1].Stride;
      int32_t MergedDim = getNonContigMergedDimension(NonContig, DimSize);
      Ret = targetDataNonContiguous(
          loc, Device, ArgsBase[I], NonContig, Size, ArgTypes[I],
          /*current_dim=*/0, DimSize - MergedDim, /*offset=*/0, AsyncInfo);
    } else {
      Ret = targetDataContiguous(loc, Device, ArgsBase[I], Args[I], ArgSizes[I],
                                 ArgTypes[I], AsyncInfo);
    }
    if (Ret == OFFLOAD_FAIL)
      return OFFLOAD_FAIL;
  }
  return OFFLOAD_SUCCESS;
}

static const unsigned LambdaMapping = OMP_TGT_MAPTYPE_PTR_AND_OBJ |
                                      OMP_TGT_MAPTYPE_LITERAL |
                                      OMP_TGT_MAPTYPE_IMPLICIT;
static bool isLambdaMapping(int64_t Mapping) {
  return (Mapping & LambdaMapping) == LambdaMapping;
}
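
// For example (illustrative): the entry Clang emits for a variable captured
// by a lambda used in a target region carries PTR_AND_OBJ, LITERAL and
// IMPLICIT at once and is treated here as a lambda mapping; an entry with
// only PTR_AND_OBJ set is not.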

namespace {
/// Find the table information in the map or look it up in the translation
/// tables.
TableMap *getTableMap(void *HostPtr) {
  std::lock_guard<std::mutex> TblMapLock(PM->TblMapMtx);
  HostPtrToTableMapTy::iterator TableMapIt =
      PM->HostPtrToTableMap.find(HostPtr);

  if (TableMapIt != PM->HostPtrToTableMap.end())
    return &TableMapIt->second;

  // We don't have a map. So search all the registered libraries.
  TableMap *TM = nullptr;
  std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
  for (HostEntriesBeginToTransTableTy::iterator Itr =
           PM->HostEntriesBeginToTransTable.begin();
       Itr != PM->HostEntriesBeginToTransTable.end(); ++Itr) {
    // get the translation table (which contains all the good info).
    TranslationTable *TransTable = &Itr->second;
    // iterate over all the host table entries to see if we can locate the
    // host_ptr.
    __tgt_offload_entry *Cur = TransTable->HostTable.EntriesBegin;
    for (uint32_t I = 0; Cur < TransTable->HostTable.EntriesEnd; ++Cur, ++I) {
      if (Cur->addr != HostPtr)
        continue;
      // we got a match, now fill the HostPtrToTableMap so that we
      // may avoid this search next time.
      TM = &(PM->HostPtrToTableMap)[HostPtr];
      TM->Table = TransTable;
      TM->Index = I;
      return TM;
    }
  }

  return nullptr;
}

/// Get loop trip count
/// FIXME: This function will not work right if calling
/// __kmpc_push_target_tripcount_mapper in one thread but doing offloading in
/// another thread, which might occur when we call task yield.
uint64_t getLoopTripCount(int64_t DeviceId) {
  DeviceTy &Device = *PM->Devices[DeviceId];
  uint64_t LoopTripCount = 0;

  {
    std::lock_guard<std::mutex> TblMapLock(PM->TblMapMtx);
    auto I = Device.LoopTripCnt.find(__kmpc_global_thread_num(NULL));
    if (I != Device.LoopTripCnt.end()) {
      LoopTripCount = I->second;
      Device.LoopTripCnt.erase(I);
      DP("loop trip count is %" PRIu64 ".\n", LoopTripCount);
    }
  }

  return LoopTripCount;
}

/// A class that manages private arguments in a target region.
class PrivateArgumentManagerTy {
  /// A data structure for the information of first-private arguments. We can
  /// use this information to optimize data transfer by packing all
  /// first-private arguments and transfer them all at once.
  struct FirstPrivateArgInfoTy {
    /// The index of the element in \p TgtArgs corresponding to the argument
    const int Index;
    /// Host pointer begin
    const char *HstPtrBegin;
    /// Host pointer end
    const char *HstPtrEnd;
    /// Aligned size
    const int64_t AlignedSize;
    /// Host pointer name
    const map_var_info_t HstPtrName = nullptr;

    FirstPrivateArgInfoTy(int Index, const void *HstPtr, int64_t Size,
                          const map_var_info_t HstPtrName = nullptr)
        : Index(Index), HstPtrBegin(reinterpret_cast<const char *>(HstPtr)),
          HstPtrEnd(HstPtrBegin + Size), AlignedSize(Size + Size % Alignment),
          HstPtrName(HstPtrName) {}
  };

  /// A vector of target pointers for all private arguments
  std::vector<void *> TgtPtrs;

  /// A vector of information of all first-private arguments to be packed
  std::vector<FirstPrivateArgInfoTy> FirstPrivateArgInfo;
  /// Host buffer for all arguments to be packed
  std::vector<char> FirstPrivateArgBuffer;
  /// The total size of all arguments to be packed
  int64_t FirstPrivateArgSize = 0;

  /// A reference to the \p DeviceTy object
  DeviceTy &Device;
  /// A pointer to a \p AsyncInfoTy object
  AsyncInfoTy &AsyncInfo;

  // TODO: What would be the best value here? Should we make it configurable?
  // If the size is larger than this threshold, we will allocate and transfer it
  // immediately instead of packing it.
  static constexpr const int64_t FirstPrivateArgSizeThreshold = 1024;

public:
  /// Constructor
  PrivateArgumentManagerTy(DeviceTy &Dev, AsyncInfoTy &AsyncInfo)
      : Device(Dev), AsyncInfo(AsyncInfo) {}

  /// Add a private argument
  int addArg(void *HstPtr, int64_t ArgSize, int64_t ArgOffset,
             bool IsFirstPrivate, void *&TgtPtr, int TgtArgsIndex,
             const map_var_info_t HstPtrName = nullptr,
             const bool AllocImmediately = false) {
    // If the argument is not first-private, or its size is greater than a
    // predefined threshold, we will allocate memory and issue the transfer
    // immediately.
    if (ArgSize > FirstPrivateArgSizeThreshold || !IsFirstPrivate ||
        AllocImmediately) {
      TgtPtr = Device.allocData(ArgSize, HstPtr);
      if (!TgtPtr) {
        DP("Data allocation for %sprivate array " DPxMOD " failed.\n",
           (IsFirstPrivate ? "first-" : ""), DPxPTR(HstPtr));
        return OFFLOAD_FAIL;
      }
#ifdef OMPTARGET_DEBUG
      void *TgtPtrBase = (void *)((intptr_t)TgtPtr + ArgOffset);
      DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD
         " for %sprivate array " DPxMOD " - pushing target argument " DPxMOD
         "\n",
         ArgSize, DPxPTR(TgtPtr), (IsFirstPrivate ? "first-" : ""),
         DPxPTR(HstPtr), DPxPTR(TgtPtrBase));
#endif
      // If first-private, copy data from host
      if (IsFirstPrivate) {
        DP("Submitting firstprivate data to the device.\n");
        int Ret = Device.submitData(TgtPtr, HstPtr, ArgSize, AsyncInfo);
        if (Ret != OFFLOAD_SUCCESS) {
          DP("Copying data to device failed.\n");
          return OFFLOAD_FAIL;
        }
      }
      TgtPtrs.push_back(TgtPtr);
    } else {
      DP("Firstprivate array " DPxMOD " of size %" PRId64 " will be packed\n",
         DPxPTR(HstPtr), ArgSize);
      // When we reach this point, the argument must meet both of the following
      // requirements:
      // 1. Its size does not exceed the threshold (see the comment for
      // FirstPrivateArgSizeThreshold);
      // 2. It must be first-private (i.e., it needs to be mapped to the target
      // device).
      // We pack all such arguments and transfer them all at once to reduce the
      // number of data transfers. We do not take non-first-private arguments,
      // i.e., private arguments that do not need to be mapped to the target
      // device, into account because their allocation is already very
      // efficient with the memory manager.

      // Placeholder value
      TgtPtr = nullptr;
      FirstPrivateArgInfo.emplace_back(TgtArgsIndex, HstPtr, ArgSize,
                                       HstPtrName);
      FirstPrivateArgSize += FirstPrivateArgInfo.back().AlignedSize;
    }

    return OFFLOAD_SUCCESS;
  }

  /// Pack first-private arguments, replace placeholder pointers in \p TgtArgs,
  /// and start the transfer.
  int packAndTransfer(std::vector<void *> &TgtArgs) {
    if (!FirstPrivateArgInfo.empty()) {
      assert(FirstPrivateArgSize != 0 &&
             "FirstPrivateArgSize is 0 but FirstPrivateArgInfo is empty");
      FirstPrivateArgBuffer.resize(FirstPrivateArgSize, 0);
      auto Itr = FirstPrivateArgBuffer.begin();
      // Copy all host data to this buffer
      for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) {
        std::copy(Info.HstPtrBegin, Info.HstPtrEnd, Itr);
        Itr = std::next(Itr, Info.AlignedSize);
      }
      // Allocate target memory
      void *TgtPtr =
          Device.allocData(FirstPrivateArgSize, FirstPrivateArgBuffer.data());
      if (TgtPtr == nullptr) {
        DP("Failed to allocate target memory for private arguments.\n");
        return OFFLOAD_FAIL;
      }
      TgtPtrs.push_back(TgtPtr);
      DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD "\n",
         FirstPrivateArgSize, DPxPTR(TgtPtr));
      // Transfer data to target device
      int Ret = Device.submitData(TgtPtr, FirstPrivateArgBuffer.data(),
                                  FirstPrivateArgSize, AsyncInfo);
      if (Ret != OFFLOAD_SUCCESS) {
        DP("Failed to submit data of private arguments.\n");
        return OFFLOAD_FAIL;
      }
      // Fill in all placeholder pointers
      auto TP = reinterpret_cast<uintptr_t>(TgtPtr);
      for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) {
        void *&Ptr = TgtArgs[Info.Index];
        assert(Ptr == nullptr && "Target pointer is already set by mistake");
        Ptr = reinterpret_cast<void *>(TP);
        TP += Info.AlignedSize;
        DP("Firstprivate array " DPxMOD " of size %" PRId64 " mapped to " DPxMOD
           "\n",
           DPxPTR(Info.HstPtrBegin), Info.HstPtrEnd - Info.HstPtrBegin,
           DPxPTR(Ptr));
      }
    }

    return OFFLOAD_SUCCESS;
  }

  /// Free all target memory allocated for private arguments
  int free() {
    for (void *P : TgtPtrs) {
      int Ret = Device.deleteData(P);
      if (Ret != OFFLOAD_SUCCESS) {
        DP("Deallocation of (first-)private arrays failed.\n");
        return OFFLOAD_FAIL;
      }
    }

    TgtPtrs.clear();

    return OFFLOAD_SUCCESS;
  }
};
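
// Typical flow (a sketch based on processDataBefore below): addArg is called
// once per private/firstprivate argument, packAndTransfer then allocates a
// single buffer for the small firstprivate arguments and patches their
// placeholder entries in TgtArgs, and free() releases all target allocations
// once the target region has finished.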
1254 
1255 /// Process data before launching the kernel, including calling targetDataBegin
1256 /// to map and transfer data to target device, transferring (first-)private
1257 /// variables.
static int processDataBefore(ident_t *loc, int64_t DeviceId, void *HostPtr,
                             int32_t ArgNum, void **ArgBases, void **Args,
                             int64_t *ArgSizes, int64_t *ArgTypes,
                             map_var_info_t *ArgNames, void **ArgMappers,
                             std::vector<void *> &TgtArgs,
                             std::vector<ptrdiff_t> &TgtOffsets,
                             PrivateArgumentManagerTy &PrivateArgumentManager,
                             AsyncInfoTy &AsyncInfo) {
  TIMESCOPE_WITH_NAME_AND_IDENT("mappingBeforeTargetRegion", loc);
  DeviceTy &Device = *PM->Devices[DeviceId];
  int Ret = targetDataBegin(loc, Device, ArgNum, ArgBases, Args, ArgSizes,
                            ArgTypes, ArgNames, ArgMappers, AsyncInfo);
  if (Ret != OFFLOAD_SUCCESS) {
    REPORT("Call to targetDataBegin failed, abort target.\n");
    return OFFLOAD_FAIL;
  }

  // Map from an argument's index in the host argument list to its position in
  // TgtArgs (-1 if the argument is not a target parameter).
  std::vector<int> TgtArgsPositions(ArgNum, -1);

  for (int32_t I = 0; I < ArgNum; ++I) {
    if (!(ArgTypes[I] & OMP_TGT_MAPTYPE_TARGET_PARAM)) {
      // This is not a target parameter, do not push it into TgtArgs.
      // Check for lambda mapping.
      if (isLambdaMapping(ArgTypes[I])) {
        assert((ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
               "PTR_AND_OBJ must also be MEMBER_OF.");
        unsigned Idx = getParentIndex(ArgTypes[I]);
        int TgtIdx = TgtArgsPositions[Idx];
        assert(TgtIdx != -1 && "Base address must be translated already.");
        // The parent lambda must have been processed already; its position in
        // the TgtArgs and TgtOffsets arrays is recorded in TgtArgsPositions.
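        // For illustration (hypothetical user code): given
        //   int *p = ...;
        //   auto Fn = [&p]() { p[0] += 1; };
        //   #pragma omp target map(to : p[0 : N])
        //   { Fn(); }
        // the pointer captured inside the mapped lambda object must be
        // patched to the device address of p's mapped storage.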
        void *HstPtrVal = Args[I];
        void *HstPtrBegin = ArgBases[I];
        void *HstPtrBase = Args[Idx];
        bool IsLast, IsHostPtr; // IsLast is unused.
        void *TgtPtrBase =
            (void *)((intptr_t)TgtArgs[TgtIdx] + TgtOffsets[TgtIdx]);
        DP("Parent lambda base " DPxMOD "\n", DPxPTR(TgtPtrBase));
        uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
        void *TgtPtrBegin = (void *)((uintptr_t)TgtPtrBase + Delta);
        void *&PointerTgtPtrBegin = AsyncInfo.getVoidPtrLocation();
        TargetPointerResultTy TPR = Device.getTgtPtrBegin(
            HstPtrVal, ArgSizes[I], IsLast, /*UpdateRefCount=*/false,
            /*UseHoldRefCount=*/false, IsHostPtr);
        PointerTgtPtrBegin = TPR.TargetPointer;
        if (!PointerTgtPtrBegin) {
          DP("No lambda captured variable mapped (" DPxMOD ") - ignored\n",
             DPxPTR(HstPtrVal));
          continue;
        }
        if (IsHostPtr) {
          DP("Unified memory is active, no need to map lambda captured "
             "variable (" DPxMOD ")\n",
             DPxPTR(HstPtrVal));
          continue;
        }
        DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n",
           DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));
        Ret = Device.submitData(TgtPtrBegin, &PointerTgtPtrBegin,
                                sizeof(void *), AsyncInfo);
        if (Ret != OFFLOAD_SUCCESS) {
          REPORT("Copying data to device failed.\n");
          return OFFLOAD_FAIL;
        }
      }
      continue;
    }
    void *HstPtrBegin = Args[I];
    void *HstPtrBase = ArgBases[I];
    void *TgtPtrBegin;
    map_var_info_t HstPtrName = (!ArgNames) ? nullptr : ArgNames[I];
    ptrdiff_t TgtBaseOffset;
    bool IsLast, IsHostPtr; // Both are unused in this path.
    TargetPointerResultTy TPR;
    if (ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) {
      DP("Forwarding first-private value " DPxMOD " to the target construct\n",
         DPxPTR(HstPtrBase));
      TgtPtrBegin = HstPtrBase;
      TgtBaseOffset = 0;
    } else if (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE) {
      TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
      const bool IsFirstPrivate = (ArgTypes[I] & OMP_TGT_MAPTYPE_TO);
      // If there is a next argument and it depends on the current one, we need
      // to allocate the private memory immediately. If this is not the case,
      // then the argument can be marked for optimization and packed with the
      // other privates.
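      // For illustration (hypothetical user code): with
      //   struct S { int *p; } s;
      //   #pragma omp target firstprivate(s) map(to : s.p[0 : N])
      // the map entry for s.p is MEMBER_OF the entry for s, so the private
      // copy of s must exist on the device before the member can be attached.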
      const bool AllocImmediately =
          (I < ArgNum - 1 && (ArgTypes[I + 1] & OMP_TGT_MAPTYPE_MEMBER_OF));
      Ret = PrivateArgumentManager.addArg(
          HstPtrBegin, ArgSizes[I], TgtBaseOffset, IsFirstPrivate, TgtPtrBegin,
          TgtArgs.size(), HstPtrName, AllocImmediately);
      if (Ret != OFFLOAD_SUCCESS) {
        REPORT("Failed to process %sprivate argument " DPxMOD "\n",
               (IsFirstPrivate ? "first-" : ""), DPxPTR(HstPtrBegin));
        return OFFLOAD_FAIL;
      }
    } else {
      if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)
        HstPtrBase = *reinterpret_cast<void **>(HstPtrBase);
      TPR = Device.getTgtPtrBegin(HstPtrBegin, ArgSizes[I], IsLast,
                                  /*UpdateRefCount=*/false,
                                  /*UseHoldRefCount=*/false, IsHostPtr);
      TgtPtrBegin = TPR.TargetPointer;
      TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
#ifdef OMPTARGET_DEBUG
      void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset);
      DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD "\n",
         DPxPTR(TgtPtrBase), DPxPTR(HstPtrBegin));
#endif
    }
    TgtArgsPositions[I] = TgtArgs.size();
    TgtArgs.push_back(TgtPtrBegin);
    TgtOffsets.push_back(TgtBaseOffset);
  }

  assert(TgtArgs.size() == TgtOffsets.size() &&
         "Size mismatch in arguments and offsets");

  // Pack and transfer first-private arguments
  Ret = PrivateArgumentManager.packAndTransfer(TgtArgs);
  if (Ret != OFFLOAD_SUCCESS) {
    DP("Failed to pack and transfer first-private arguments\n");
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

/// Process data after launching the kernel, including transferring data back
/// to the host if needed and deallocating target memory used for
/// (first-)private variables.
static int processDataAfter(ident_t *loc, int64_t DeviceId, void *HostPtr,
                            int32_t ArgNum, void **ArgBases, void **Args,
                            int64_t *ArgSizes, int64_t *ArgTypes,
                            map_var_info_t *ArgNames, void **ArgMappers,
                            PrivateArgumentManagerTy &PrivateArgumentManager,
                            AsyncInfoTy &AsyncInfo) {
  TIMESCOPE_WITH_NAME_AND_IDENT("mappingAfterTargetRegion", loc);
  DeviceTy &Device = *PM->Devices[DeviceId];

  // Move data from device.
  int Ret = targetDataEnd(loc, Device, ArgNum, ArgBases, Args, ArgSizes,
                          ArgTypes, ArgNames, ArgMappers, AsyncInfo);
  if (Ret != OFFLOAD_SUCCESS) {
    REPORT("Call to targetDataEnd failed, abort target.\n");
    return OFFLOAD_FAIL;
  }

  // Free target memory for private arguments
  Ret = PrivateArgumentManager.free();
  if (Ret != OFFLOAD_SUCCESS) {
    REPORT("Failed to deallocate target memory for private args\n");
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}
} // namespace

/// Performs the same actions as data_begin in case ArgNum is non-zero and
/// initiates the run of the offloaded region on the target platform. If
/// ArgNum is non-zero, it also performs the same actions as data_update and
/// data_end above once the region execution is done. This function returns 0
/// if it was able to transfer the execution to a target and a non-zero
/// integer otherwise.
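/// For illustration (hypothetical user code), a construct such as
///   int x = 0;
///   #pragma omp target map(tofrom : x)
///   { x += 1; }
/// reaches this function through an entry point like __tgt_target_mapper with
/// ArgNum == 1 and a single tofrom map entry describing x.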
int target(ident_t *loc, DeviceTy &Device, void *HostPtr, int32_t ArgNum,
           void **ArgBases, void **Args, int64_t *ArgSizes, int64_t *ArgTypes,
           map_var_info_t *ArgNames, void **ArgMappers, int32_t TeamNum,
           int32_t ThreadLimit, int IsTeamConstruct, AsyncInfoTy &AsyncInfo) {
  int32_t DeviceId = Device.DeviceID;

  TableMap *TM = getTableMap(HostPtr);
  // No map for this host pointer found!
  if (!TM) {
    REPORT("Host ptr " DPxMOD " does not have a matching target pointer.\n",
           DPxPTR(HostPtr));
    return OFFLOAD_FAIL;
  }

  // Get the target table.
  __tgt_target_table *TargetTable = nullptr;
  {
    std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
    assert(TM->Table->TargetsTable.size() > (size_t)DeviceId &&
           "Not expecting a device ID outside the table's bounds!");
    TargetTable = TM->Table->TargetsTable[DeviceId];
  }
  assert(TargetTable && "Global data has not been mapped");

  // We need to keep bases and offsets separate. Sometimes (e.g. in OpenCL) we
  // need to manifest base pointers prior to launching a kernel. Even if we have
  // mapped an object only partially, e.g. A[N:M], although the kernel is
  // expected to access elements starting at address &A[N] and beyond, we still
  // need to manifest the base of the array &A[0]. In other cases, e.g. the COI
  // API, we need the begin address itself, i.e. &A[N], as the API operates on
  // begin addresses, not bases. That's why we pass args and offsets as two
  // separate entities so that each plugin can do what it needs. This behavior
  // was introduced via https://reviews.llvm.org/D33028 and commit 1546d319244c.
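  // Illustrative numbers (not from the original comment): for
  //   int A[100];
  //   #pragma omp target map(tofrom : A[10 : 20])
  // HstPtrBegin is &A[10] and HstPtrBase is &A[0], so the recorded offset is
  // (intptr_t)&A[0] - (intptr_t)&A[10] = -40 bytes; a plugin that needs the
  // base re-derives it as TgtArgs[I] + TgtOffsets[I].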
  std::vector<void *> TgtArgs;
  std::vector<ptrdiff_t> TgtOffsets;

  PrivateArgumentManagerTy PrivateArgumentManager(Device, AsyncInfo);

  int Ret;
  if (ArgNum) {
    // Process data, such as data mapping, before launching the kernel
    Ret = processDataBefore(loc, DeviceId, HostPtr, ArgNum, ArgBases, Args,
                            ArgSizes, ArgTypes, ArgNames, ArgMappers, TgtArgs,
                            TgtOffsets, PrivateArgumentManager, AsyncInfo);
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Failed to process data before launching the kernel.\n");
      return OFFLOAD_FAIL;
    }
  }

  // Launch device execution.
  void *TgtEntryPtr = TargetTable->EntriesBegin[TM->Index].addr;
  DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n",
     TargetTable->EntriesBegin[TM->Index].name, DPxPTR(TgtEntryPtr), TM->Index);

  {
    TIMESCOPE_WITH_NAME_AND_IDENT(
        IsTeamConstruct ? "runTargetTeamRegion" : "runTargetRegion", loc);
    // Use data() rather than &TgtArgs[0] so that an empty argument list does
    // not dereference past the end of an empty vector.
    if (IsTeamConstruct)
      Ret = Device.runTeamRegion(TgtEntryPtr, TgtArgs.data(),
                                 TgtOffsets.data(), TgtArgs.size(), TeamNum,
                                 ThreadLimit, getLoopTripCount(DeviceId),
                                 AsyncInfo);
    else
      Ret = Device.runRegion(TgtEntryPtr, TgtArgs.data(), TgtOffsets.data(),
                             TgtArgs.size(), AsyncInfo);
  }

  if (Ret != OFFLOAD_SUCCESS) {
    REPORT("Executing target region failed, abort target.\n");
    return OFFLOAD_FAIL;
  }

  if (ArgNum) {
    // Transfer data back and deallocate target memory for (first-)private
    // variables
    Ret = processDataAfter(loc, DeviceId, HostPtr, ArgNum, ArgBases, Args,
                           ArgSizes, ArgTypes, ArgNames, ArgMappers,
                           PrivateArgumentManager, AsyncInfo);
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Failed to process data after launching the kernel.\n");
      return OFFLOAD_FAIL;
    }
  }

  return OFFLOAD_SUCCESS;
}