1 //===------ omptarget.cpp - Target independent OpenMP target RTL -- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // Implementation of the interface to be used by Clang during the codegen of a
10 // target region.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "omptarget.h"
15 #include "device.h"
16 #include "private.h"
17 #include "rtl.h"
18 
19 #include <cassert>
20 #include <vector>
21 
22 int AsyncInfoTy::synchronize() {
23   int Result = OFFLOAD_SUCCESS;
24   if (AsyncInfo.Queue) {
25     // If we have a queue we need to synchronize it now.
26     Result = Device.synchronize(*this);
27     assert(AsyncInfo.Queue == nullptr &&
28            "The device plugin should have nulled the queue to indicate there "
29            "are no outstanding actions!");
30   }
31   return Result;
32 }
33 
34 void *&AsyncInfoTy::getVoidPtrLocation() {
35   BufferLocations.push_back(nullptr);
36   return BufferLocations.back();
37 }
38 
39 /* All begin addresses for partially mapped structs must be 8-aligned in order
40  * to ensure proper alignment of members. E.g.
41  *
42  * struct S {
43  *   int a;   // 4-aligned
44  *   int b;   // 4-aligned
45  *   int *p;  // 8-aligned
46  * } s1;
47  * ...
48  * #pragma omp target map(tofrom: s1.b, s1.p[0:N])
49  * {
50  *   s1.b = 5;
51  *   for (int i...) s1.p[i] = ...;
52  * }
53  *
54  * Here we are mapping s1 starting from member b, so BaseAddress=&s1=&s1.a and
55  * BeginAddress=&s1.b. Let's assume that the struct begins at address 0x100,
56  * then &s1.a=0x100, &s1.b=0x104, &s1.p=0x108. Each member obeys the alignment
57  * requirements for its type. Now, when we allocate memory on the device, in
58  * CUDA's case cuMemAlloc() returns an address which is at least 256-aligned.
59  * This means that the chunk of the struct on the device will start at a
60  * 256-aligned address, let's say 0x200. Then the address of b will be 0x200 and
61  * address of p will be a misaligned 0x204 (on the host there was no need to add
62  * padding between b and p, so p comes exactly 4 bytes after b). If the device
63  * kernel tries to access s1.p, a misaligned address error occurs (as reported
64  * by the CUDA plugin). By padding the begin address down to a multiple of 8 and
65  * extending the size of the allocated chuck accordingly, the chuck on the
66  * device will start at 0x200 with the padding (4 bytes), then &s1.b=0x204 and
67  * &s1.p=0x208, as they should be to satisfy the alignment requirements.
68  */
69 static const int64_t Alignment = 8;
70 
71 /// Map global data and execute pending ctors
72 static int InitLibrary(DeviceTy &Device) {
73   /*
74    * Map global data
75    */
76   int32_t device_id = Device.DeviceID;
77   int rc = OFFLOAD_SUCCESS;
78   bool supportsEmptyImages = Device.RTL->supports_empty_images &&
79                              Device.RTL->supports_empty_images() > 0;
80 
81   Device.PendingGlobalsMtx.lock();
82   PM->TrlTblMtx.lock();
83   for (auto *HostEntriesBegin : PM->HostEntriesBeginRegistrationOrder) {
84     TranslationTable *TransTable =
85         &PM->HostEntriesBeginToTransTable[HostEntriesBegin];
86     if (TransTable->HostTable.EntriesBegin ==
87             TransTable->HostTable.EntriesEnd &&
88         !supportsEmptyImages) {
89       // No host entry so no need to proceed
90       continue;
91     }
92 
93     if (TransTable->TargetsTable[device_id] != 0) {
94       // Library entries have already been processed
95       continue;
96     }
97 
98     // 1) get image.
99     assert(TransTable->TargetsImages.size() > (size_t)device_id &&
100            "Not expecting a device ID outside the table's bounds!");
101     __tgt_device_image *img = TransTable->TargetsImages[device_id];
102     if (!img) {
103       REPORT("No image loaded for device id %d.\n", device_id);
104       rc = OFFLOAD_FAIL;
105       break;
106     }
107     // 2) load image into the target table.
108     __tgt_target_table *TargetTable = TransTable->TargetsTable[device_id] =
109         Device.load_binary(img);
110     // Unable to get table for this image: invalidate image and fail.
111     if (!TargetTable) {
112       REPORT("Unable to generate entries table for device id %d.\n", device_id);
113       TransTable->TargetsImages[device_id] = 0;
114       rc = OFFLOAD_FAIL;
115       break;
116     }
117 
118     // Verify whether the two table sizes match.
119     size_t hsize =
120         TransTable->HostTable.EntriesEnd - TransTable->HostTable.EntriesBegin;
121     size_t tsize = TargetTable->EntriesEnd - TargetTable->EntriesBegin;
122 
123     // Invalid image for these host entries!
124     if (hsize != tsize) {
125       REPORT("Host and Target tables mismatch for device id %d [%zx != %zx].\n",
126              device_id, hsize, tsize);
127       TransTable->TargetsImages[device_id] = 0;
128       TransTable->TargetsTable[device_id] = 0;
129       rc = OFFLOAD_FAIL;
130       break;
131     }
132 
133     // process global data that needs to be mapped.
134     Device.DataMapMtx.lock();
135     __tgt_target_table *HostTable = &TransTable->HostTable;
136     for (__tgt_offload_entry *CurrDeviceEntry = TargetTable->EntriesBegin,
137                              *CurrHostEntry = HostTable->EntriesBegin,
138                              *EntryDeviceEnd = TargetTable->EntriesEnd;
139          CurrDeviceEntry != EntryDeviceEnd;
140          CurrDeviceEntry++, CurrHostEntry++) {
141       if (CurrDeviceEntry->size != 0) {
142         // has data.
143         assert(CurrDeviceEntry->size == CurrHostEntry->size &&
144                "data size mismatch");
145 
146         // Fortran may use multiple weak declarations for the same symbol,
147         // therefore we must allow for multiple weak symbols to be loaded from
148         // the fat binary. Treat these mappings as any other "regular" mapping.
149         // Add entry to map.
150         if (Device.getTgtPtrBegin(CurrHostEntry->addr, CurrHostEntry->size))
151           continue;
152         DP("Add mapping from host " DPxMOD " to device " DPxMOD " with size %zu"
153            "\n",
154            DPxPTR(CurrHostEntry->addr), DPxPTR(CurrDeviceEntry->addr),
155            CurrDeviceEntry->size);
156         Device.HostDataToTargetMap.emplace(
157             (uintptr_t)CurrHostEntry->addr /*HstPtrBase*/,
158             (uintptr_t)CurrHostEntry->addr /*HstPtrBegin*/,
159             (uintptr_t)CurrHostEntry->addr + CurrHostEntry->size /*HstPtrEnd*/,
160             (uintptr_t)CurrDeviceEntry->addr /*TgtPtrBegin*/, nullptr,
161             true /*IsRefCountINF*/);
162       }
163     }
164     Device.DataMapMtx.unlock();
165   }
166   PM->TrlTblMtx.unlock();
167 
168   if (rc != OFFLOAD_SUCCESS) {
169     Device.PendingGlobalsMtx.unlock();
170     return rc;
171   }
172 
173   /*
174    * Run ctors for static objects
175    */
176   if (!Device.PendingCtorsDtors.empty()) {
177     AsyncInfoTy AsyncInfo(Device);
178     // Call all ctors for all libraries registered so far
179     for (auto &lib : Device.PendingCtorsDtors) {
180       if (!lib.second.PendingCtors.empty()) {
181         DP("Has pending ctors... call now\n");
182         for (auto &entry : lib.second.PendingCtors) {
183           void *ctor = entry;
184           int rc =
185               target(nullptr, Device, ctor, 0, nullptr, nullptr, nullptr,
186                      nullptr, nullptr, nullptr, 1, 1, true /*team*/, AsyncInfo);
187           if (rc != OFFLOAD_SUCCESS) {
188             REPORT("Running ctor " DPxMOD " failed.\n", DPxPTR(ctor));
189             Device.PendingGlobalsMtx.unlock();
190             return OFFLOAD_FAIL;
191           }
192         }
193         // Clear the list to indicate that this device has been used
194         lib.second.PendingCtors.clear();
195         DP("Done with pending ctors for lib " DPxMOD "\n", DPxPTR(lib.first));
196       }
197     }
198     // All constructors have been issued, wait for them now.
199     if (AsyncInfo.synchronize() != OFFLOAD_SUCCESS)
200       return OFFLOAD_FAIL;
201   }
202   Device.HasPendingGlobals = false;
203   Device.PendingGlobalsMtx.unlock();
204 
205   return OFFLOAD_SUCCESS;
206 }
207 
208 // Check whether a device has been initialized, global ctors have been
209 // executed and global data has been mapped; do so if not already done.
210 int CheckDeviceAndCtors(int64_t device_id) {
211   // Is device ready?
212   if (!device_is_ready(device_id)) {
213     REPORT("Device %" PRId64 " is not ready.\n", device_id);
214     return OFFLOAD_FAIL;
215   }
216 
217   // Get device info.
218   DeviceTy &Device = PM->Devices[device_id];
219 
220   // Check whether global data has been mapped for this device
221   Device.PendingGlobalsMtx.lock();
222   bool hasPendingGlobals = Device.HasPendingGlobals;
223   Device.PendingGlobalsMtx.unlock();
224   if (hasPendingGlobals && InitLibrary(Device) != OFFLOAD_SUCCESS) {
225     REPORT("Failed to init globals on device %" PRId64 "\n", device_id);
226     return OFFLOAD_FAIL;
227   }
228 
229   return OFFLOAD_SUCCESS;
230 }
231 
232 static int32_t getParentIndex(int64_t type) {
233   return ((type & OMP_TGT_MAPTYPE_MEMBER_OF) >> 48) - 1;
234 }
235 
236 /// Call the user-defined mapper function followed by the appropriate
237 // targetData* function (targetData{Begin,End,Update}).
238 int targetDataMapper(ident_t *loc, DeviceTy &Device, void *arg_base, void *arg,
239                      int64_t arg_size, int64_t arg_type,
240                      map_var_info_t arg_names, void *arg_mapper,
241                      AsyncInfoTy &AsyncInfo,
242                      TargetDataFuncPtrTy target_data_function) {
243   TIMESCOPE_WITH_IDENT(loc);
244   DP("Calling the mapper function " DPxMOD "\n", DPxPTR(arg_mapper));
245 
246   // The mapper function fills up Components.
247   MapperComponentsTy MapperComponents;
248   MapperFuncPtrTy MapperFuncPtr = (MapperFuncPtrTy)(arg_mapper);
249   (*MapperFuncPtr)((void *)&MapperComponents, arg_base, arg, arg_size, arg_type,
250                    arg_names);
251 
252   // Construct new arrays for args_base, args, arg_sizes and arg_types
253   // using the information in MapperComponents and call the corresponding
254   // targetData* function using these new arrays.
255   std::vector<void *> MapperArgsBase(MapperComponents.Components.size());
256   std::vector<void *> MapperArgs(MapperComponents.Components.size());
257   std::vector<int64_t> MapperArgSizes(MapperComponents.Components.size());
258   std::vector<int64_t> MapperArgTypes(MapperComponents.Components.size());
259   std::vector<void *> MapperArgNames(MapperComponents.Components.size());
260 
261   for (unsigned I = 0, E = MapperComponents.Components.size(); I < E; ++I) {
262     auto &C =
263         MapperComponents
264             .Components[target_data_function == targetDataEnd ? E - I - 1 : I];
265     MapperArgsBase[I] = C.Base;
266     MapperArgs[I] = C.Begin;
267     MapperArgSizes[I] = C.Size;
268     MapperArgTypes[I] = C.Type;
269     MapperArgNames[I] = C.Name;
270   }
271 
272   int rc = target_data_function(
273       loc, Device, MapperComponents.Components.size(), MapperArgsBase.data(),
274       MapperArgs.data(), MapperArgSizes.data(), MapperArgTypes.data(),
275       MapperArgNames.data(), /*arg_mappers*/ nullptr, AsyncInfo);
276 
277   return rc;
278 }
279 
280 /// Internal function to do the mapping and transfer the data to the device
281 int targetDataBegin(ident_t *loc, DeviceTy &Device, int32_t arg_num,
282                     void **args_base, void **args, int64_t *arg_sizes,
283                     int64_t *arg_types, map_var_info_t *arg_names,
284                     void **arg_mappers, AsyncInfoTy &AsyncInfo) {
285   // process each input.
286   for (int32_t i = 0; i < arg_num; ++i) {
287     // Ignore private variables and arrays - there is no mapping for them.
288     if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) ||
289         (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE))
290       continue;
291 
292     if (arg_mappers && arg_mappers[i]) {
293       // Instead of executing the regular path of targetDataBegin, call the
294       // targetDataMapper variant which will call targetDataBegin again
295       // with new arguments.
296       DP("Calling targetDataMapper for the %dth argument\n", i);
297 
298       map_var_info_t arg_name = (!arg_names) ? nullptr : arg_names[i];
299       int rc = targetDataMapper(loc, Device, args_base[i], args[i],
300                                 arg_sizes[i], arg_types[i], arg_name,
301                                 arg_mappers[i], AsyncInfo, targetDataBegin);
302 
303       if (rc != OFFLOAD_SUCCESS) {
304         REPORT("Call to targetDataBegin via targetDataMapper for custom mapper"
305                " failed.\n");
306         return OFFLOAD_FAIL;
307       }
308 
309       // Skip the rest of this function, continue to the next argument.
310       continue;
311     }
312 
313     void *HstPtrBegin = args[i];
314     void *HstPtrBase = args_base[i];
315     int64_t data_size = arg_sizes[i];
316     map_var_info_t HstPtrName = (!arg_names) ? nullptr : arg_names[i];
317 
318     // Adjust for proper alignment if this is a combined entry (for structs).
319     // Look at the next argument - if that is MEMBER_OF this one, then this one
320     // is a combined entry.
321     int64_t padding = 0;
322     const int next_i = i + 1;
323     if (getParentIndex(arg_types[i]) < 0 && next_i < arg_num &&
324         getParentIndex(arg_types[next_i]) == i) {
325       padding = (int64_t)HstPtrBegin % Alignment;
326       if (padding) {
327         DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD
328            "\n",
329            padding, DPxPTR(HstPtrBegin));
330         HstPtrBegin = (char *)HstPtrBegin - padding;
331         data_size += padding;
332       }
333     }
334 
335     // Address of pointer on the host and device, respectively.
336     void *Pointer_HstPtrBegin, *PointerTgtPtrBegin;
337     bool IsNew, Pointer_IsNew;
338     bool IsHostPtr = false;
339     bool IsImplicit = arg_types[i] & OMP_TGT_MAPTYPE_IMPLICIT;
340     // Force the creation of a device side copy of the data when:
341     // a close map modifier was associated with a map that contained a to.
342     bool HasCloseModifier = arg_types[i] & OMP_TGT_MAPTYPE_CLOSE;
343     bool HasPresentModifier = arg_types[i] & OMP_TGT_MAPTYPE_PRESENT;
344     // UpdateRef is based on MEMBER_OF instead of TARGET_PARAM because if we
345     // have reached this point via __tgt_target_data_begin and not __tgt_target
346     // then no argument is marked as TARGET_PARAM ("omp target data map" is not
347     // associated with a target region, so there are no target parameters). This
348     // may be considered a hack, we could revise the scheme in the future.
349     bool UpdateRef = !(arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF);
350     if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) {
351       DP("Has a pointer entry: \n");
352       // Base is address of pointer.
353       //
354       // Usually, the pointer is already allocated by this time.  For example:
355       //
356       //   #pragma omp target map(s.p[0:N])
357       //
358       // The map entry for s comes first, and the PTR_AND_OBJ entry comes
359       // afterward, so the pointer is already allocated by the time the
360       // PTR_AND_OBJ entry is handled below, and PointerTgtPtrBegin is thus
361       // non-null.  However, "declare target link" can produce a PTR_AND_OBJ
362       // entry for a global that might not already be allocated by the time the
363       // PTR_AND_OBJ entry is handled below, and so the allocation might fail
364       // when HasPresentModifier.
365       PointerTgtPtrBegin = Device.getOrAllocTgtPtr(
366           HstPtrBase, HstPtrBase, sizeof(void *), nullptr, Pointer_IsNew,
367           IsHostPtr, IsImplicit, UpdateRef, HasCloseModifier,
368           HasPresentModifier);
369       if (!PointerTgtPtrBegin) {
370         REPORT("Call to getOrAllocTgtPtr returned null pointer (%s).\n",
371                HasPresentModifier ? "'present' map type modifier"
372                                   : "device failure or illegal mapping");
373         return OFFLOAD_FAIL;
374       }
375       DP("There are %zu bytes allocated at target address " DPxMOD " - is%s new"
376          "\n",
377          sizeof(void *), DPxPTR(PointerTgtPtrBegin),
378          (Pointer_IsNew ? "" : " not"));
379       Pointer_HstPtrBegin = HstPtrBase;
380       // modify current entry.
381       HstPtrBase = *(void **)HstPtrBase;
382       UpdateRef = true; // subsequently update ref count of pointee
383     }
384 
385     void *TgtPtrBegin = Device.getOrAllocTgtPtr(
386         HstPtrBegin, HstPtrBase, data_size, HstPtrName, IsNew, IsHostPtr,
387         IsImplicit, UpdateRef, HasCloseModifier, HasPresentModifier);
388     // If data_size==0, then the argument could be a zero-length pointer to
389     // NULL, so getOrAlloc() returning NULL is not an error.
390     if (!TgtPtrBegin && (data_size || HasPresentModifier)) {
391       REPORT("Call to getOrAllocTgtPtr returned null pointer (%s).\n",
392              HasPresentModifier ? "'present' map type modifier"
393                                 : "device failure or illegal mapping");
394       return OFFLOAD_FAIL;
395     }
396     DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
397        " - is%s new\n",
398        data_size, DPxPTR(TgtPtrBegin), (IsNew ? "" : " not"));
399 
400     if (arg_types[i] & OMP_TGT_MAPTYPE_RETURN_PARAM) {
401       uintptr_t Delta = (uintptr_t)HstPtrBegin - (uintptr_t)HstPtrBase;
402       void *TgtPtrBase = (void *)((uintptr_t)TgtPtrBegin - Delta);
403       DP("Returning device pointer " DPxMOD "\n", DPxPTR(TgtPtrBase));
404       args_base[i] = TgtPtrBase;
405     }
406 
407     if (arg_types[i] & OMP_TGT_MAPTYPE_TO) {
408       bool copy = false;
409       if (!(PM->RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) ||
410           HasCloseModifier) {
411         if (IsNew || (arg_types[i] & OMP_TGT_MAPTYPE_ALWAYS)) {
412           copy = true;
413         } else if ((arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
414                    !(arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) {
415           // Copy data only if the "parent" struct has RefCount==1.
416           // If this is a PTR_AND_OBJ entry, the OBJ is not part of the struct,
417           // so exclude it from this check.
418           int32_t parent_idx = getParentIndex(arg_types[i]);
419           uint64_t parent_rc = Device.getMapEntryRefCnt(args[parent_idx]);
420           assert(parent_rc > 0 && "parent struct not found");
421           if (parent_rc == 1) {
422             copy = true;
423           }
424         }
425       }
426 
427       if (copy && !IsHostPtr) {
428         DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
429            data_size, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
430         int rt =
431             Device.submitData(TgtPtrBegin, HstPtrBegin, data_size, AsyncInfo);
432         if (rt != OFFLOAD_SUCCESS) {
433           REPORT("Copying data to device failed.\n");
434           return OFFLOAD_FAIL;
435         }
436       }
437     }
438 
439     if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ && !IsHostPtr) {
440       DP("Update pointer (" DPxMOD ") -> [" DPxMOD "]\n",
441          DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));
442       uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
443       void *&TgtPtrBase = AsyncInfo.getVoidPtrLocation();
444       TgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta);
445       int rt = Device.submitData(PointerTgtPtrBegin, &TgtPtrBase,
446                                  sizeof(void *), AsyncInfo);
447       if (rt != OFFLOAD_SUCCESS) {
448         REPORT("Copying data to device failed.\n");
449         return OFFLOAD_FAIL;
450       }
451       // create shadow pointers for this entry
452       Device.ShadowMtx.lock();
453       Device.ShadowPtrMap[Pointer_HstPtrBegin] = {
454           HstPtrBase, PointerTgtPtrBegin, TgtPtrBase};
455       Device.ShadowMtx.unlock();
456     }
457   }
458 
459   return OFFLOAD_SUCCESS;
460 }
461 
462 namespace {
463 /// This structure contains information to deallocate a target pointer, aka.
464 /// used to call the function \p DeviceTy::deallocTgtPtr.
465 struct DeallocTgtPtrInfo {
466   /// Host pointer used to look up into the map table
467   void *HstPtrBegin;
468   /// Size of the data
469   int64_t DataSize;
470   /// Whether it is forced to be removed from the map table
471   bool ForceDelete;
472   /// Whether it has \p close modifier
473   bool HasCloseModifier;
474 
475   DeallocTgtPtrInfo(void *HstPtr, int64_t Size, bool ForceDelete,
476                     bool HasCloseModifier)
477       : HstPtrBegin(HstPtr), DataSize(Size), ForceDelete(ForceDelete),
478         HasCloseModifier(HasCloseModifier) {}
479 };
480 } // namespace
481 
482 /// Internal function to undo the mapping and retrieve the data from the device.
483 int targetDataEnd(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
484                   void **ArgBases, void **Args, int64_t *ArgSizes,
485                   int64_t *ArgTypes, map_var_info_t *ArgNames,
486                   void **ArgMappers, AsyncInfoTy &AsyncInfo) {
487   int Ret;
488   std::vector<DeallocTgtPtrInfo> DeallocTgtPtrs;
489   // process each input.
490   for (int32_t I = ArgNum - 1; I >= 0; --I) {
491     // Ignore private variables and arrays - there is no mapping for them.
492     // Also, ignore the use_device_ptr directive, it has no effect here.
493     if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
494         (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
495       continue;
496 
497     if (ArgMappers && ArgMappers[I]) {
498       // Instead of executing the regular path of targetDataEnd, call the
499       // targetDataMapper variant which will call targetDataEnd again
500       // with new arguments.
501       DP("Calling targetDataMapper for the %dth argument\n", I);
502 
503       map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
504       Ret = targetDataMapper(loc, Device, ArgBases[I], Args[I], ArgSizes[I],
505                              ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
506                              targetDataEnd);
507 
508       if (Ret != OFFLOAD_SUCCESS) {
509         REPORT("Call to targetDataEnd via targetDataMapper for custom mapper"
510                " failed.\n");
511         return OFFLOAD_FAIL;
512       }
513 
514       // Skip the rest of this function, continue to the next argument.
515       continue;
516     }
517 
518     void *HstPtrBegin = Args[I];
519     int64_t DataSize = ArgSizes[I];
520     // Adjust for proper alignment if this is a combined entry (for structs).
521     // Look at the next argument - if that is MEMBER_OF this one, then this one
522     // is a combined entry.
523     const int NextI = I + 1;
524     if (getParentIndex(ArgTypes[I]) < 0 && NextI < ArgNum &&
525         getParentIndex(ArgTypes[NextI]) == I) {
526       int64_t Padding = (int64_t)HstPtrBegin % Alignment;
527       if (Padding) {
528         DP("Using a Padding of %" PRId64 " bytes for begin address " DPxMOD
529            "\n",
530            Padding, DPxPTR(HstPtrBegin));
531         HstPtrBegin = (char *)HstPtrBegin - Padding;
532         DataSize += Padding;
533       }
534     }
535 
536     bool IsLast, IsHostPtr;
537     bool IsImplicit = ArgTypes[I] & OMP_TGT_MAPTYPE_IMPLICIT;
538     bool UpdateRef = !(ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) ||
539                      (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ);
540     bool ForceDelete = ArgTypes[I] & OMP_TGT_MAPTYPE_DELETE;
541     bool HasCloseModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_CLOSE;
542     bool HasPresentModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_PRESENT;
543 
544     // If PTR_AND_OBJ, HstPtrBegin is address of pointee
545     void *TgtPtrBegin = Device.getTgtPtrBegin(
546         HstPtrBegin, DataSize, IsLast, UpdateRef, IsHostPtr, !IsImplicit);
547     if (!TgtPtrBegin && (DataSize || HasPresentModifier)) {
548       DP("Mapping does not exist (%s)\n",
549          (HasPresentModifier ? "'present' map type modifier" : "ignored"));
550       if (HasPresentModifier) {
551         // OpenMP 5.1, sec. 2.21.7.1 "map Clause", p. 350 L10-13:
552         // "If a map clause appears on a target, target data, target enter data
553         // or target exit data construct with a present map-type-modifier then
554         // on entry to the region if the corresponding list item does not appear
555         // in the device data environment then an error occurs and the program
556         // terminates."
557         //
558         // This should be an error upon entering an "omp target exit data".  It
559         // should not be an error upon exiting an "omp target data" or "omp
560         // target".  For "omp target data", Clang thus doesn't include present
561         // modifiers for end calls.  For "omp target", we have not found a valid
562         // OpenMP program for which the error matters: it appears that, if a
563         // program can guarantee that data is present at the beginning of an
564         // "omp target" region so that there's no error there, that data is also
565         // guaranteed to be present at the end.
566         MESSAGE("device mapping required by 'present' map type modifier does "
567                 "not exist for host address " DPxMOD " (%" PRId64 " bytes)",
568                 DPxPTR(HstPtrBegin), DataSize);
569         return OFFLOAD_FAIL;
570       }
571     } else {
572       DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
573          " - is%s last\n",
574          DataSize, DPxPTR(TgtPtrBegin), (IsLast ? "" : " not"));
575     }
576 
577     // OpenMP 5.1, sec. 2.21.7.1 "map Clause", p. 351 L14-16:
578     // "If the map clause appears on a target, target data, or target exit data
579     // construct and a corresponding list item of the original list item is not
580     // present in the device data environment on exit from the region then the
581     // list item is ignored."
582     if (!TgtPtrBegin)
583       continue;
584 
585     bool DelEntry = IsLast || ForceDelete;
586 
587     if ((ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
588         !(ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) {
589       DelEntry = false; // protect parent struct from being deallocated
590     }
591 
592     if ((ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) || DelEntry) {
593       // Move data back to the host
594       if (ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) {
595         bool Always = ArgTypes[I] & OMP_TGT_MAPTYPE_ALWAYS;
596         bool CopyMember = false;
597         if (!(PM->RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) ||
598             HasCloseModifier) {
599           if ((ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
600               !(ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) {
601             // Copy data only if the "parent" struct has RefCount==1.
602             int32_t ParentIdx = getParentIndex(ArgTypes[I]);
603             uint64_t ParentRC = Device.getMapEntryRefCnt(Args[ParentIdx]);
604             assert(ParentRC > 0 && "parent struct not found");
605             if (ParentRC == 1)
606               CopyMember = true;
607           }
608         }
609 
610         if ((DelEntry || Always || CopyMember) &&
611             !(PM->RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
612               TgtPtrBegin == HstPtrBegin)) {
613           DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
614              DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
615           Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, DataSize,
616                                     AsyncInfo);
617           if (Ret != OFFLOAD_SUCCESS) {
618             REPORT("Copying data from device failed.\n");
619             return OFFLOAD_FAIL;
620           }
621         }
622       }
623 
624       // If we copied back to the host a struct/array containing pointers, we
625       // need to restore the original host pointer values from their shadow
626       // copies. If the struct is going to be deallocated, remove any remaining
627       // shadow pointer entries for this struct.
628       uintptr_t LB = (uintptr_t)HstPtrBegin;
629       uintptr_t UB = (uintptr_t)HstPtrBegin + DataSize;
630       Device.ShadowMtx.lock();
631       for (ShadowPtrListTy::iterator Itr = Device.ShadowPtrMap.begin();
632            Itr != Device.ShadowPtrMap.end();) {
633         void **ShadowHstPtrAddr = (void **)Itr->first;
634 
635         // An STL map is sorted on its keys; use this property
636         // to quickly determine when to break out of the loop.
637         if ((uintptr_t)ShadowHstPtrAddr < LB) {
638           ++Itr;
639           continue;
640         }
641         if ((uintptr_t)ShadowHstPtrAddr >= UB)
642           break;
643 
644         // If we copied the struct to the host, we need to restore the pointer.
645         if (ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) {
646           DP("Restoring original host pointer value " DPxMOD " for host "
647              "pointer " DPxMOD "\n",
648              DPxPTR(Itr->second.HstPtrVal), DPxPTR(ShadowHstPtrAddr));
649           *ShadowHstPtrAddr = Itr->second.HstPtrVal;
650         }
651         // If the struct is to be deallocated, remove the shadow entry.
652         if (DelEntry) {
653           DP("Removing shadow pointer " DPxMOD "\n", DPxPTR(ShadowHstPtrAddr));
654           Itr = Device.ShadowPtrMap.erase(Itr);
655         } else {
656           ++Itr;
657         }
658       }
659       Device.ShadowMtx.unlock();
660 
661       // Add pointer to the buffer for later deallocation
662       if (DelEntry)
663         DeallocTgtPtrs.emplace_back(HstPtrBegin, DataSize, ForceDelete,
664                                     HasCloseModifier);
665     }
666   }
667 
668   // TODO: We should not synchronize here but pass the AsyncInfo object to the
669   //       allocate/deallocate device APIs.
670   //
671   // We need to synchronize before deallocating data.
672   Ret = AsyncInfo.synchronize();
673   if (Ret != OFFLOAD_SUCCESS)
674     return OFFLOAD_FAIL;
675 
676   // Deallocate target pointer
677   for (DeallocTgtPtrInfo &Info : DeallocTgtPtrs) {
678     Ret = Device.deallocTgtPtr(Info.HstPtrBegin, Info.DataSize,
679                                Info.ForceDelete, Info.HasCloseModifier);
680     if (Ret != OFFLOAD_SUCCESS) {
681       REPORT("Deallocating data from device failed.\n");
682       return OFFLOAD_FAIL;
683     }
684   }
685 
686   return OFFLOAD_SUCCESS;
687 }
688 
689 static int targetDataContiguous(ident_t *loc, DeviceTy &Device, void *ArgsBase,
690                                 void *HstPtrBegin, int64_t ArgSize,
691                                 int64_t ArgType, AsyncInfoTy &AsyncInfo) {
692   TIMESCOPE_WITH_IDENT(loc);
693   bool IsLast, IsHostPtr;
694   void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, ArgSize, IsLast, false,
695                                             IsHostPtr, /*MustContain=*/true);
696   if (!TgtPtrBegin) {
697     DP("hst data:" DPxMOD " not found, becomes a noop\n", DPxPTR(HstPtrBegin));
698     if (ArgType & OMP_TGT_MAPTYPE_PRESENT) {
699       MESSAGE("device mapping required by 'present' motion modifier does not "
700               "exist for host address " DPxMOD " (%" PRId64 " bytes)",
701               DPxPTR(HstPtrBegin), ArgSize);
702       return OFFLOAD_FAIL;
703     }
704     return OFFLOAD_SUCCESS;
705   }
706 
707   if (PM->RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
708       TgtPtrBegin == HstPtrBegin) {
709     DP("hst data:" DPxMOD " unified and shared, becomes a noop\n",
710        DPxPTR(HstPtrBegin));
711     return OFFLOAD_SUCCESS;
712   }
713 
714   if (ArgType & OMP_TGT_MAPTYPE_FROM) {
715     DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
716        ArgSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
717     int Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, ArgSize, AsyncInfo);
718     if (Ret != OFFLOAD_SUCCESS) {
719       REPORT("Copying data from device failed.\n");
720       return OFFLOAD_FAIL;
721     }
722 
723     uintptr_t LB = (uintptr_t)HstPtrBegin;
724     uintptr_t UB = (uintptr_t)HstPtrBegin + ArgSize;
725     Device.ShadowMtx.lock();
726     for (ShadowPtrListTy::iterator IT = Device.ShadowPtrMap.begin();
727          IT != Device.ShadowPtrMap.end(); ++IT) {
728       void **ShadowHstPtrAddr = (void **)IT->first;
729       if ((uintptr_t)ShadowHstPtrAddr < LB)
730         continue;
731       if ((uintptr_t)ShadowHstPtrAddr >= UB)
732         break;
733       DP("Restoring original host pointer value " DPxMOD
734          " for host pointer " DPxMOD "\n",
735          DPxPTR(IT->second.HstPtrVal), DPxPTR(ShadowHstPtrAddr));
736       *ShadowHstPtrAddr = IT->second.HstPtrVal;
737     }
738     Device.ShadowMtx.unlock();
739   }
740 
741   if (ArgType & OMP_TGT_MAPTYPE_TO) {
742     DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
743        ArgSize, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
744     int Ret = Device.submitData(TgtPtrBegin, HstPtrBegin, ArgSize, AsyncInfo);
745     if (Ret != OFFLOAD_SUCCESS) {
746       REPORT("Copying data to device failed.\n");
747       return OFFLOAD_FAIL;
748     }
749 
750     uintptr_t LB = (uintptr_t)HstPtrBegin;
751     uintptr_t UB = (uintptr_t)HstPtrBegin + ArgSize;
752     Device.ShadowMtx.lock();
753     for (ShadowPtrListTy::iterator IT = Device.ShadowPtrMap.begin();
754          IT != Device.ShadowPtrMap.end(); ++IT) {
755       void **ShadowHstPtrAddr = (void **)IT->first;
756       if ((uintptr_t)ShadowHstPtrAddr < LB)
757         continue;
758       if ((uintptr_t)ShadowHstPtrAddr >= UB)
759         break;
760       DP("Restoring original target pointer value " DPxMOD " for target "
761          "pointer " DPxMOD "\n",
762          DPxPTR(IT->second.TgtPtrVal), DPxPTR(IT->second.TgtPtrAddr));
763       Ret = Device.submitData(IT->second.TgtPtrAddr, &IT->second.TgtPtrVal,
764                               sizeof(void *), AsyncInfo);
765       if (Ret != OFFLOAD_SUCCESS) {
766         REPORT("Copying data to device failed.\n");
767         Device.ShadowMtx.unlock();
768         return OFFLOAD_FAIL;
769       }
770     }
771     Device.ShadowMtx.unlock();
772   }
773   return OFFLOAD_SUCCESS;
774 }
775 
776 static int targetDataNonContiguous(ident_t *loc, DeviceTy &Device,
777                                    void *ArgsBase,
778                                    __tgt_target_non_contig *NonContig,
779                                    uint64_t Size, int64_t ArgType,
780                                    int CurrentDim, int DimSize, uint64_t Offset,
781                                    AsyncInfoTy &AsyncInfo) {
782   TIMESCOPE_WITH_IDENT(loc);
783   int Ret = OFFLOAD_SUCCESS;
784   if (CurrentDim < DimSize) {
785     for (unsigned int I = 0; I < NonContig[CurrentDim].Count; ++I) {
786       uint64_t CurOffset =
787           (NonContig[CurrentDim].Offset + I) * NonContig[CurrentDim].Stride;
788       // we only need to transfer the first element for the last dimension
789       // since we've already got a contiguous piece.
790       if (CurrentDim != DimSize - 1 || I == 0) {
791         Ret = targetDataNonContiguous(loc, Device, ArgsBase, NonContig, Size,
792                                       ArgType, CurrentDim + 1, DimSize,
793                                       Offset + CurOffset, AsyncInfo);
794         // Stop the whole process if any contiguous piece returns anything
795         // other than OFFLOAD_SUCCESS.
796         if (Ret != OFFLOAD_SUCCESS)
797           return Ret;
798       }
799     }
800   } else {
801     char *Ptr = (char *)ArgsBase + Offset;
802     DP("Transfer of non-contiguous : host ptr " DPxMOD " offset %" PRIu64
803        " len %" PRIu64 "\n",
804        DPxPTR(Ptr), Offset, Size);
805     Ret = targetDataContiguous(loc, Device, ArgsBase, Ptr, Size, ArgType,
806                                AsyncInfo);
807   }
808   return Ret;
809 }
810 
811 static int getNonContigMergedDimension(__tgt_target_non_contig *NonContig,
812                                        int32_t DimSize) {
813   int RemovedDim = 0;
814   for (int I = DimSize - 1; I > 0; --I) {
815     if (NonContig[I].Count * NonContig[I].Stride == NonContig[I - 1].Stride)
816       RemovedDim++;
817   }
818   return RemovedDim;
819 }
820 
821 /// Internal function to pass data to/from the target.
822 int targetDataUpdate(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
823                      void **ArgsBase, void **Args, int64_t *ArgSizes,
824                      int64_t *ArgTypes, map_var_info_t *ArgNames,
825                      void **ArgMappers, AsyncInfoTy &AsyncInfo) {
826   // process each input.
827   for (int32_t I = 0; I < ArgNum; ++I) {
828     if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
829         (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
830       continue;
831 
832     if (ArgMappers && ArgMappers[I]) {
833       // Instead of executing the regular path of targetDataUpdate, call the
834       // targetDataMapper variant which will call targetDataUpdate again
835       // with new arguments.
836       DP("Calling targetDataMapper for the %dth argument\n", I);
837 
838       map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
839       int Ret = targetDataMapper(loc, Device, ArgsBase[I], Args[I], ArgSizes[I],
840                                  ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
841                                  targetDataUpdate);
842 
843       if (Ret != OFFLOAD_SUCCESS) {
844         REPORT("Call to targetDataUpdate via targetDataMapper for custom mapper"
845                " failed.\n");
846         return OFFLOAD_FAIL;
847       }
848 
849       // Skip the rest of this function, continue to the next argument.
850       continue;
851     }
852 
853     int Ret = OFFLOAD_SUCCESS;
854 
855     if (ArgTypes[I] & OMP_TGT_MAPTYPE_NON_CONTIG) {
856       __tgt_target_non_contig *NonContig = (__tgt_target_non_contig *)Args[I];
857       int32_t DimSize = ArgSizes[I];
858       uint64_t Size =
859           NonContig[DimSize - 1].Count * NonContig[DimSize - 1].Stride;
860       int32_t MergedDim = getNonContigMergedDimension(NonContig, DimSize);
861       Ret = targetDataNonContiguous(
862           loc, Device, ArgsBase[I], NonContig, Size, ArgTypes[I],
863           /*current_dim=*/0, DimSize - MergedDim, /*offset=*/0, AsyncInfo);
864     } else {
865       Ret = targetDataContiguous(loc, Device, ArgsBase[I], Args[I], ArgSizes[I],
866                                  ArgTypes[I], AsyncInfo);
867     }
868     if (Ret == OFFLOAD_FAIL)
869       return OFFLOAD_FAIL;
870   }
871   return OFFLOAD_SUCCESS;
872 }
873 
874 static const unsigned LambdaMapping = OMP_TGT_MAPTYPE_PTR_AND_OBJ |
875                                       OMP_TGT_MAPTYPE_LITERAL |
876                                       OMP_TGT_MAPTYPE_IMPLICIT;
877 static bool isLambdaMapping(int64_t Mapping) {
878   return (Mapping & LambdaMapping) == LambdaMapping;
879 }
880 
881 namespace {
882 /// Find the table information in the map or look it up in the translation
883 /// tables.
884 TableMap *getTableMap(void *HostPtr) {
885   std::lock_guard<std::mutex> TblMapLock(PM->TblMapMtx);
886   HostPtrToTableMapTy::iterator TableMapIt =
887       PM->HostPtrToTableMap.find(HostPtr);
888 
889   if (TableMapIt != PM->HostPtrToTableMap.end())
890     return &TableMapIt->second;
891 
892   // We don't have a map. So search all the registered libraries.
893   TableMap *TM = nullptr;
894   std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
895   for (HostEntriesBeginToTransTableTy::iterator Itr =
896            PM->HostEntriesBeginToTransTable.begin();
897        Itr != PM->HostEntriesBeginToTransTable.end(); ++Itr) {
898     // get the translation table (which contains all the good info).
899     TranslationTable *TransTable = &Itr->second;
900     // iterate over all the host table entries to see if we can locate the
901     // host_ptr.
902     __tgt_offload_entry *Cur = TransTable->HostTable.EntriesBegin;
903     for (uint32_t I = 0; Cur < TransTable->HostTable.EntriesEnd; ++Cur, ++I) {
904       if (Cur->addr != HostPtr)
905         continue;
906       // we got a match, now fill the HostPtrToTableMap so that we
907       // may avoid this search next time.
908       TM = &(PM->HostPtrToTableMap)[HostPtr];
909       TM->Table = TransTable;
910       TM->Index = I;
911       return TM;
912     }
913   }
914 
915   return nullptr;
916 }
917 
918 /// Get loop trip count
919 /// FIXME: This function will not work right if calling
920 /// __kmpc_push_target_tripcount in one thread but doing offloading in another
921 /// thread, which might occur when we call task yield.
922 uint64_t getLoopTripCount(int64_t DeviceId) {
923   DeviceTy &Device = PM->Devices[DeviceId];
924   uint64_t LoopTripCount = 0;
925 
926   {
927     std::lock_guard<std::mutex> TblMapLock(PM->TblMapMtx);
928     auto I = Device.LoopTripCnt.find(__kmpc_global_thread_num(NULL));
929     if (I != Device.LoopTripCnt.end()) {
930       LoopTripCount = I->second;
931       Device.LoopTripCnt.erase(I);
932       DP("loop trip count is %" PRIu64 ".\n", LoopTripCount);
933     }
934   }
935 
936   return LoopTripCount;
937 }
938 
939 /// A class manages private arguments in a target region.
940 class PrivateArgumentManagerTy {
941   /// A data structure for the information of first-private arguments. We can
942   /// use this information to optimize data transfer by packing all
943   /// first-private arguments and transfer them all at once.
944   struct FirstPrivateArgInfoTy {
945     /// The index of the element in \p TgtArgs corresponding to the argument
946     const int Index;
947     /// Host pointer begin
948     const char *HstPtrBegin;
949     /// Host pointer end
950     const char *HstPtrEnd;
951     /// Aligned size
952     const int64_t AlignedSize;
953     /// Host pointer name
954     const map_var_info_t HstPtrName = nullptr;
955 
956     FirstPrivateArgInfoTy(int Index, const void *HstPtr, int64_t Size,
957                           const map_var_info_t HstPtrName = nullptr)
958         : Index(Index), HstPtrBegin(reinterpret_cast<const char *>(HstPtr)),
959           HstPtrEnd(HstPtrBegin + Size), AlignedSize(Size + Size % Alignment),
960           HstPtrName(HstPtrName) {}
961   };
962 
963   /// A vector of target pointers for all private arguments
964   std::vector<void *> TgtPtrs;
965 
966   /// A vector of information of all first-private arguments to be packed
967   std::vector<FirstPrivateArgInfoTy> FirstPrivateArgInfo;
968   /// Host buffer for all arguments to be packed
969   std::vector<char> FirstPrivateArgBuffer;
970   /// The total size of all arguments to be packed
971   int64_t FirstPrivateArgSize = 0;
972 
973   /// A reference to the \p DeviceTy object
974   DeviceTy &Device;
975   /// A pointer to a \p AsyncInfoTy object
976   AsyncInfoTy &AsyncInfo;
977 
978   // TODO: What would be the best value here? Should we make it configurable?
979   // If the size is larger than this threshold, we will allocate and transfer it
980   // immediately instead of packing it.
981   static constexpr const int64_t FirstPrivateArgSizeThreshold = 1024;
982 
983 public:
984   /// Constructor
985   PrivateArgumentManagerTy(DeviceTy &Dev, AsyncInfoTy &AsyncInfo)
986       : Device(Dev), AsyncInfo(AsyncInfo) {}
987 
988   /// Add a private argument
989   int addArg(void *HstPtr, int64_t ArgSize, int64_t ArgOffset,
990              bool IsFirstPrivate, void *&TgtPtr, int TgtArgsIndex,
991              const map_var_info_t HstPtrName = nullptr) {
992     // If the argument is not first-private, or its size is greater than a
993     // predefined threshold, we will allocate memory and issue the transfer
994     // immediately.
995     if (ArgSize > FirstPrivateArgSizeThreshold || !IsFirstPrivate) {
996       TgtPtr = Device.allocData(ArgSize, HstPtr);
997       if (!TgtPtr) {
998         DP("Data allocation for %sprivate array " DPxMOD " failed.\n",
999            (IsFirstPrivate ? "first-" : ""), DPxPTR(HstPtr));
1000         return OFFLOAD_FAIL;
1001       }
1002 #ifdef OMPTARGET_DEBUG
1003       void *TgtPtrBase = (void *)((intptr_t)TgtPtr + ArgOffset);
1004       DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD
1005          " for %sprivate array " DPxMOD " - pushing target argument " DPxMOD
1006          "\n",
1007          ArgSize, DPxPTR(TgtPtr), (IsFirstPrivate ? "first-" : ""),
1008          DPxPTR(HstPtr), DPxPTR(TgtPtrBase));
1009 #endif
1010       // If first-private, copy data from host
1011       if (IsFirstPrivate) {
1012         int Ret = Device.submitData(TgtPtr, HstPtr, ArgSize, AsyncInfo);
1013         if (Ret != OFFLOAD_SUCCESS) {
1014           DP("Copying data to device failed, failed.\n");
1015           return OFFLOAD_FAIL;
1016         }
1017       }
1018       TgtPtrs.push_back(TgtPtr);
1019     } else {
1020       DP("Firstprivate array " DPxMOD " of size %" PRId64 " will be packed\n",
1021          DPxPTR(HstPtr), ArgSize);
1022       // When reach this point, the argument must meet all following
1023       // requirements:
1024       // 1. Its size does not exceed the threshold (see the comment for
1025       // FirstPrivateArgSizeThreshold);
1026       // 2. It must be first-private (needs to be mapped to target device).
1027       // We will pack all this kind of arguments to transfer them all at once
1028       // to reduce the number of data transfer. We will not take
1029       // non-first-private arguments, aka. private arguments that doesn't need
1030       // to be mapped to target device, into account because data allocation
1031       // can be very efficient with memory manager.
1032 
1033       // Placeholder value
1034       TgtPtr = nullptr;
1035       FirstPrivateArgInfo.emplace_back(TgtArgsIndex, HstPtr, ArgSize,
1036                                        HstPtrName);
1037       FirstPrivateArgSize += FirstPrivateArgInfo.back().AlignedSize;
1038     }
1039 
1040     return OFFLOAD_SUCCESS;
1041   }
1042 
1043   /// Pack first-private arguments, replace place holder pointers in \p TgtArgs,
1044   /// and start the transfer.
1045   int packAndTransfer(std::vector<void *> &TgtArgs) {
1046     if (!FirstPrivateArgInfo.empty()) {
1047       assert(FirstPrivateArgSize != 0 &&
1048              "FirstPrivateArgSize is 0 but FirstPrivateArgInfo is empty");
1049       FirstPrivateArgBuffer.resize(FirstPrivateArgSize, 0);
1050       auto Itr = FirstPrivateArgBuffer.begin();
1051       // Copy all host data to this buffer
1052       for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) {
1053         std::copy(Info.HstPtrBegin, Info.HstPtrEnd, Itr);
1054         Itr = std::next(Itr, Info.AlignedSize);
1055       }
1056       // Allocate target memory
1057       void *TgtPtr =
1058           Device.allocData(FirstPrivateArgSize, FirstPrivateArgBuffer.data());
1059       if (TgtPtr == nullptr) {
1060         DP("Failed to allocate target memory for private arguments.\n");
1061         return OFFLOAD_FAIL;
1062       }
1063       TgtPtrs.push_back(TgtPtr);
1064       DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD "\n",
1065          FirstPrivateArgSize, DPxPTR(TgtPtr));
1066       // Transfer data to target device
1067       int Ret = Device.submitData(TgtPtr, FirstPrivateArgBuffer.data(),
1068                                   FirstPrivateArgSize, AsyncInfo);
1069       if (Ret != OFFLOAD_SUCCESS) {
1070         DP("Failed to submit data of private arguments.\n");
1071         return OFFLOAD_FAIL;
1072       }
1073       // Fill in all placeholder pointers
1074       auto TP = reinterpret_cast<uintptr_t>(TgtPtr);
1075       for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) {
1076         void *&Ptr = TgtArgs[Info.Index];
1077         assert(Ptr == nullptr && "Target pointer is already set by mistaken");
1078         Ptr = reinterpret_cast<void *>(TP);
1079         TP += Info.AlignedSize;
1080         DP("Firstprivate array " DPxMOD " of size %" PRId64 " mapped to " DPxMOD
1081            "\n",
1082            DPxPTR(Info.HstPtrBegin), Info.HstPtrEnd - Info.HstPtrBegin,
1083            DPxPTR(Ptr));
1084       }
1085     }
1086 
1087     return OFFLOAD_SUCCESS;
1088   }
1089 
1090   /// Free all target memory allocated for private arguments
1091   int free() {
1092     for (void *P : TgtPtrs) {
1093       int Ret = Device.deleteData(P);
1094       if (Ret != OFFLOAD_SUCCESS) {
1095         DP("Deallocation of (first-)private arrays failed.\n");
1096         return OFFLOAD_FAIL;
1097       }
1098     }
1099 
1100     TgtPtrs.clear();
1101 
1102     return OFFLOAD_SUCCESS;
1103   }
1104 };
1105 
1106 /// Process data before launching the kernel, including calling targetDataBegin
1107 /// to map and transfer data to target device, transferring (first-)private
1108 /// variables.
1109 static int processDataBefore(ident_t *loc, int64_t DeviceId, void *HostPtr,
1110                              int32_t ArgNum, void **ArgBases, void **Args,
1111                              int64_t *ArgSizes, int64_t *ArgTypes,
1112                              map_var_info_t *ArgNames, void **ArgMappers,
1113                              std::vector<void *> &TgtArgs,
1114                              std::vector<ptrdiff_t> &TgtOffsets,
1115                              PrivateArgumentManagerTy &PrivateArgumentManager,
1116                              AsyncInfoTy &AsyncInfo) {
1117   TIMESCOPE_WITH_NAME_AND_IDENT("mappingBeforeTargetRegion", loc);
1118   DeviceTy &Device = PM->Devices[DeviceId];
1119   int Ret = targetDataBegin(loc, Device, ArgNum, ArgBases, Args, ArgSizes,
1120                             ArgTypes, ArgNames, ArgMappers, AsyncInfo);
1121   if (Ret != OFFLOAD_SUCCESS) {
1122     REPORT("Call to targetDataBegin failed, abort target.\n");
1123     return OFFLOAD_FAIL;
1124   }
1125 
1126   // List of (first-)private arrays allocated for this target region
1127   std::vector<int> TgtArgsPositions(ArgNum, -1);
1128 
1129   for (int32_t I = 0; I < ArgNum; ++I) {
1130     if (!(ArgTypes[I] & OMP_TGT_MAPTYPE_TARGET_PARAM)) {
1131       // This is not a target parameter, do not push it into TgtArgs.
1132       // Check for lambda mapping.
1133       if (isLambdaMapping(ArgTypes[I])) {
1134         assert((ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
1135                "PTR_AND_OBJ must be also MEMBER_OF.");
1136         unsigned Idx = getParentIndex(ArgTypes[I]);
1137         int TgtIdx = TgtArgsPositions[Idx];
1138         assert(TgtIdx != -1 && "Base address must be translated already.");
1139         // The parent lambda must be processed already and it must be the last
1140         // in TgtArgs and TgtOffsets arrays.
1141         void *HstPtrVal = Args[I];
1142         void *HstPtrBegin = ArgBases[I];
1143         void *HstPtrBase = Args[Idx];
1144         bool IsLast, IsHostPtr; // unused.
1145         void *TgtPtrBase =
1146             (void *)((intptr_t)TgtArgs[TgtIdx] + TgtOffsets[TgtIdx]);
1147         DP("Parent lambda base " DPxMOD "\n", DPxPTR(TgtPtrBase));
1148         uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
1149         void *TgtPtrBegin = (void *)((uintptr_t)TgtPtrBase + Delta);
1150         void *&PointerTgtPtrBegin = AsyncInfo.getVoidPtrLocation();
1151         PointerTgtPtrBegin = Device.getTgtPtrBegin(HstPtrVal, ArgSizes[I],
1152                                                    IsLast, false, IsHostPtr);
1153         if (!PointerTgtPtrBegin) {
1154           DP("No lambda captured variable mapped (" DPxMOD ") - ignored\n",
1155              DPxPTR(HstPtrVal));
1156           continue;
1157         }
1158         if (PM->RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
1159             TgtPtrBegin == HstPtrBegin) {
1160           DP("Unified memory is active, no need to map lambda captured"
1161              "variable (" DPxMOD ")\n",
1162              DPxPTR(HstPtrVal));
1163           continue;
1164         }
1165         DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n",
1166            DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));
1167         Ret = Device.submitData(TgtPtrBegin, &PointerTgtPtrBegin,
1168                                 sizeof(void *), AsyncInfo);
1169         if (Ret != OFFLOAD_SUCCESS) {
1170           REPORT("Copying data to device failed.\n");
1171           return OFFLOAD_FAIL;
1172         }
1173       }
1174       continue;
1175     }
1176     void *HstPtrBegin = Args[I];
1177     void *HstPtrBase = ArgBases[I];
1178     void *TgtPtrBegin;
1179     map_var_info_t HstPtrName = (!ArgNames) ? nullptr : ArgNames[I];
1180     ptrdiff_t TgtBaseOffset;
1181     bool IsLast, IsHostPtr; // unused.
1182     if (ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) {
1183       DP("Forwarding first-private value " DPxMOD " to the target construct\n",
1184          DPxPTR(HstPtrBase));
1185       TgtPtrBegin = HstPtrBase;
1186       TgtBaseOffset = 0;
1187     } else if (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE) {
1188       TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
1189       // Can be marked for optimization if the next argument(s) do(es) not
1190       // depend on this one.
1191       const bool IsFirstPrivate =
1192           (I >= ArgNum - 1 || !(ArgTypes[I + 1] & OMP_TGT_MAPTYPE_MEMBER_OF));
1193       Ret = PrivateArgumentManager.addArg(
1194           HstPtrBegin, ArgSizes[I], TgtBaseOffset, IsFirstPrivate, TgtPtrBegin,
1195           TgtArgs.size(), HstPtrName);
1196       if (Ret != OFFLOAD_SUCCESS) {
1197         REPORT("Failed to process %sprivate argument " DPxMOD "\n",
1198                (IsFirstPrivate ? "first-" : ""), DPxPTR(HstPtrBegin));
1199         return OFFLOAD_FAIL;
1200       }
1201     } else {
1202       if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)
1203         HstPtrBase = *reinterpret_cast<void **>(HstPtrBase);
1204       TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, ArgSizes[I], IsLast,
1205                                           false, IsHostPtr);
1206       TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
1207 #ifdef OMPTARGET_DEBUG
1208       void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset);
1209       DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD "\n",
1210          DPxPTR(TgtPtrBase), DPxPTR(HstPtrBegin));
1211 #endif
1212     }
1213     TgtArgsPositions[I] = TgtArgs.size();
1214     TgtArgs.push_back(TgtPtrBegin);
1215     TgtOffsets.push_back(TgtBaseOffset);
1216   }
1217 
1218   assert(TgtArgs.size() == TgtOffsets.size() &&
1219          "Size mismatch in arguments and offsets");
1220 
1221   // Pack and transfer first-private arguments
1222   Ret = PrivateArgumentManager.packAndTransfer(TgtArgs);
1223   if (Ret != OFFLOAD_SUCCESS) {
1224     DP("Failed to pack and transfer first private arguments\n");
1225     return OFFLOAD_FAIL;
1226   }
1227 
1228   return OFFLOAD_SUCCESS;
1229 }
1230 
1231 /// Process data after launching the kernel, including transferring data back to
1232 /// host if needed and deallocating target memory of (first-)private variables.
1233 static int processDataAfter(ident_t *loc, int64_t DeviceId, void *HostPtr,
1234                             int32_t ArgNum, void **ArgBases, void **Args,
1235                             int64_t *ArgSizes, int64_t *ArgTypes,
1236                             map_var_info_t *ArgNames, void **ArgMappers,
1237                             PrivateArgumentManagerTy &PrivateArgumentManager,
1238                             AsyncInfoTy &AsyncInfo) {
1239   TIMESCOPE_WITH_NAME_AND_IDENT("mappingAfterTargetRegion", loc);
1240   DeviceTy &Device = PM->Devices[DeviceId];
1241 
1242   // Move data from device.
1243   int Ret = targetDataEnd(loc, Device, ArgNum, ArgBases, Args, ArgSizes,
1244                           ArgTypes, ArgNames, ArgMappers, AsyncInfo);
1245   if (Ret != OFFLOAD_SUCCESS) {
1246     REPORT("Call to targetDataEnd failed, abort target.\n");
1247     return OFFLOAD_FAIL;
1248   }
1249 
1250   // Free target memory for private arguments
1251   Ret = PrivateArgumentManager.free();
1252   if (Ret != OFFLOAD_SUCCESS) {
1253     REPORT("Failed to deallocate target memory for private args\n");
1254     return OFFLOAD_FAIL;
1255   }
1256 
1257   return OFFLOAD_SUCCESS;
1258 }
1259 } // namespace
1260 
1261 /// performs the same actions as data_begin in case arg_num is
1262 /// non-zero and initiates run of the offloaded region on the target platform;
1263 /// if arg_num is non-zero after the region execution is done it also
1264 /// performs the same action as data_update and data_end above. This function
1265 /// returns 0 if it was able to transfer the execution to a target and an
1266 /// integer different from zero otherwise.
1267 int target(ident_t *loc, DeviceTy &Device, void *HostPtr, int32_t ArgNum,
1268            void **ArgBases, void **Args, int64_t *ArgSizes, int64_t *ArgTypes,
1269            map_var_info_t *ArgNames, void **ArgMappers, int32_t TeamNum,
1270            int32_t ThreadLimit, int IsTeamConstruct, AsyncInfoTy &AsyncInfo) {
1271   int32_t DeviceId = Device.DeviceID;
1272 
1273   TableMap *TM = getTableMap(HostPtr);
1274   // No map for this host pointer found!
1275   if (!TM) {
1276     REPORT("Host ptr " DPxMOD " does not have a matching target pointer.\n",
1277            DPxPTR(HostPtr));
1278     return OFFLOAD_FAIL;
1279   }
1280 
1281   // get target table.
1282   __tgt_target_table *TargetTable = nullptr;
1283   {
1284     std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
1285     assert(TM->Table->TargetsTable.size() > (size_t)DeviceId &&
1286            "Not expecting a device ID outside the table's bounds!");
1287     TargetTable = TM->Table->TargetsTable[DeviceId];
1288   }
1289   assert(TargetTable && "Global data has not been mapped\n");
1290 
1291   std::vector<void *> TgtArgs;
1292   std::vector<ptrdiff_t> TgtOffsets;
1293 
1294   PrivateArgumentManagerTy PrivateArgumentManager(Device, AsyncInfo);
1295 
1296   int Ret;
1297   if (ArgNum) {
1298     // Process data, such as data mapping, before launching the kernel
1299     Ret = processDataBefore(loc, DeviceId, HostPtr, ArgNum, ArgBases, Args,
1300                             ArgSizes, ArgTypes, ArgNames, ArgMappers, TgtArgs,
1301                             TgtOffsets, PrivateArgumentManager, AsyncInfo);
1302     if (Ret != OFFLOAD_SUCCESS) {
1303       REPORT("Failed to process data before launching the kernel.\n");
1304       return OFFLOAD_FAIL;
1305     }
1306   }
1307 
1308   // Get loop trip count
1309   uint64_t LoopTripCount = getLoopTripCount(DeviceId);
1310 
1311   // Launch device execution.
1312   void *TgtEntryPtr = TargetTable->EntriesBegin[TM->Index].addr;
1313   DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n",
1314      TargetTable->EntriesBegin[TM->Index].name, DPxPTR(TgtEntryPtr), TM->Index);
1315 
1316   {
1317     TIMESCOPE_WITH_NAME_AND_IDENT(
1318         IsTeamConstruct ? "runTargetTeamRegion" : "runTargetRegion", loc);
1319     if (IsTeamConstruct)
1320       Ret = Device.runTeamRegion(TgtEntryPtr, &TgtArgs[0], &TgtOffsets[0],
1321                                  TgtArgs.size(), TeamNum, ThreadLimit,
1322                                  LoopTripCount, AsyncInfo);
1323     else
1324       Ret = Device.runRegion(TgtEntryPtr, &TgtArgs[0], &TgtOffsets[0],
1325                              TgtArgs.size(), AsyncInfo);
1326   }
1327 
1328   if (Ret != OFFLOAD_SUCCESS) {
1329     REPORT("Executing target region abort target.\n");
1330     return OFFLOAD_FAIL;
1331   }
1332 
1333   if (ArgNum) {
1334     // Transfer data back and deallocate target memory for (first-)private
1335     // variables
1336     Ret = processDataAfter(loc, DeviceId, HostPtr, ArgNum, ArgBases, Args,
1337                            ArgSizes, ArgTypes, ArgNames, ArgMappers,
1338                            PrivateArgumentManager, AsyncInfo);
1339     if (Ret != OFFLOAD_SUCCESS) {
1340       REPORT("Failed to process data after launching the kernel.\n");
1341       return OFFLOAD_FAIL;
1342     }
1343   }
1344 
1345   return OFFLOAD_SUCCESS;
1346 }
1347