1 //===------ omptarget.cpp - Target independent OpenMP target RTL -- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // Implementation of the interface to be used by Clang during the codegen of a
10 // target region.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include <omptarget.h>
15 
16 #include "device.h"
17 #include "private.h"
18 #include "rtl.h"
19 
20 #include <cassert>
21 #include <vector>
22 
23 #ifdef OMPTARGET_DEBUG
24 int DebugLevel = 0;
25 #endif // OMPTARGET_DEBUG
26 
27 
28 
29 /* All begin addresses for partially mapped structs must be 8-aligned in order
30  * to ensure proper alignment of members. E.g.
31  *
32  * struct S {
33  *   int a;   // 4-aligned
34  *   int b;   // 4-aligned
35  *   int *p;  // 8-aligned
36  * } s1;
37  * ...
38  * #pragma omp target map(tofrom: s1.b, s1.p[0:N])
39  * {
40  *   s1.b = 5;
41  *   for (int i...) s1.p[i] = ...;
42  * }
43  *
44  * Here we are mapping s1 starting from member b, so BaseAddress=&s1=&s1.a and
45  * BeginAddress=&s1.b. Let's assume that the struct begins at address 0x100,
46  * then &s1.a=0x100, &s1.b=0x104, &s1.p=0x108. Each member obeys the alignment
47  * requirements for its type. Now, when we allocate memory on the device, in
48  * CUDA's case cuMemAlloc() returns an address which is at least 256-aligned.
49  * This means that the chunk of the struct on the device will start at a
50  * 256-aligned address, let's say 0x200. Then the address of b will be 0x200 and
51  * address of p will be a misaligned 0x204 (on the host there was no need to add
52  * padding between b and p, so p comes exactly 4 bytes after b). If the device
53  * kernel tries to access s1.p, a misaligned address error occurs (as reported
54  * by the CUDA plugin). By padding the begin address down to a multiple of 8 and
55  * extending the size of the allocated chuck accordingly, the chuck on the
56  * device will start at 0x200 with the padding (4 bytes), then &s1.b=0x204 and
57  * &s1.p=0x208, as they should be to satisfy the alignment requirements.
58  */
59 static const int64_t alignment = 8;
60 
61 /// Map global data and execute pending ctors
62 static int InitLibrary(DeviceTy& Device) {
63   /*
64    * Map global data
65    */
66   int32_t device_id = Device.DeviceID;
67   int rc = OFFLOAD_SUCCESS;
68 
69   Device.PendingGlobalsMtx.lock();
70   TrlTblMtx.lock();
71   for (HostEntriesBeginToTransTableTy::iterator
72       ii = HostEntriesBeginToTransTable.begin();
73       ii != HostEntriesBeginToTransTable.end(); ++ii) {
74     TranslationTable *TransTable = &ii->second;
75     if (TransTable->TargetsTable[device_id] != 0) {
76       // Library entries have already been processed
77       continue;
78     }
79 
80     // 1) get image.
81     assert(TransTable->TargetsImages.size() > (size_t)device_id &&
82            "Not expecting a device ID outside the table's bounds!");
83     __tgt_device_image *img = TransTable->TargetsImages[device_id];
84     if (!img) {
85       DP("No image loaded for device id %d.\n", device_id);
86       rc = OFFLOAD_FAIL;
87       break;
88     }
89     // 2) load image into the target table.
90     __tgt_target_table *TargetTable =
91         TransTable->TargetsTable[device_id] = Device.load_binary(img);
92     // Unable to get table for this image: invalidate image and fail.
93     if (!TargetTable) {
94       DP("Unable to generate entries table for device id %d.\n", device_id);
95       TransTable->TargetsImages[device_id] = 0;
96       rc = OFFLOAD_FAIL;
97       break;
98     }
99 
100     // Verify whether the two table sizes match.
101     size_t hsize =
102         TransTable->HostTable.EntriesEnd - TransTable->HostTable.EntriesBegin;
103     size_t tsize = TargetTable->EntriesEnd - TargetTable->EntriesBegin;
104 
105     // Invalid image for these host entries!
106     if (hsize != tsize) {
107       DP("Host and Target tables mismatch for device id %d [%zx != %zx].\n",
108          device_id, hsize, tsize);
109       TransTable->TargetsImages[device_id] = 0;
110       TransTable->TargetsTable[device_id] = 0;
111       rc = OFFLOAD_FAIL;
112       break;
113     }
114 
115     // process global data that needs to be mapped.
116     Device.DataMapMtx.lock();
117     __tgt_target_table *HostTable = &TransTable->HostTable;
118     for (__tgt_offload_entry *CurrDeviceEntry = TargetTable->EntriesBegin,
119                              *CurrHostEntry = HostTable->EntriesBegin,
120                              *EntryDeviceEnd = TargetTable->EntriesEnd;
121          CurrDeviceEntry != EntryDeviceEnd;
122          CurrDeviceEntry++, CurrHostEntry++) {
123       if (CurrDeviceEntry->size != 0) {
124         // has data.
125         assert(CurrDeviceEntry->size == CurrHostEntry->size &&
126                "data size mismatch");
127 
128         // Fortran may use multiple weak declarations for the same symbol,
129         // therefore we must allow for multiple weak symbols to be loaded from
130         // the fat binary. Treat these mappings as any other "regular" mapping.
131         // Add entry to map.
132         if (Device.getTgtPtrBegin(CurrHostEntry->addr, CurrHostEntry->size))
133           continue;
134         DP("Add mapping from host " DPxMOD " to device " DPxMOD " with size %zu"
135             "\n", DPxPTR(CurrHostEntry->addr), DPxPTR(CurrDeviceEntry->addr),
136             CurrDeviceEntry->size);
137         Device.HostDataToTargetMap.push_front(HostDataToTargetTy(
138             (uintptr_t)CurrHostEntry->addr /*HstPtrBase*/,
139             (uintptr_t)CurrHostEntry->addr /*HstPtrBegin*/,
140             (uintptr_t)CurrHostEntry->addr + CurrHostEntry->size /*HstPtrEnd*/,
141             (uintptr_t)CurrDeviceEntry->addr /*TgtPtrBegin*/,
142             INF_REF_CNT /*RefCount*/));
143       }
144     }
145     Device.DataMapMtx.unlock();
146   }
147   TrlTblMtx.unlock();
148 
149   if (rc != OFFLOAD_SUCCESS) {
150     Device.PendingGlobalsMtx.unlock();
151     return rc;
152   }
153 
154   /*
155    * Run ctors for static objects
156    */
157   if (!Device.PendingCtorsDtors.empty()) {
158     // Call all ctors for all libraries registered so far
159     for (auto &lib : Device.PendingCtorsDtors) {
160       if (!lib.second.PendingCtors.empty()) {
161         DP("Has pending ctors... call now\n");
162         for (auto &entry : lib.second.PendingCtors) {
163           void *ctor = entry;
164           int rc = target(device_id, ctor, 0, NULL, NULL, NULL,
165                           NULL, 1, 1, true /*team*/);
166           if (rc != OFFLOAD_SUCCESS) {
167             DP("Running ctor " DPxMOD " failed.\n", DPxPTR(ctor));
168             Device.PendingGlobalsMtx.unlock();
169             return OFFLOAD_FAIL;
170           }
171         }
172         // Clear the list to indicate that this device has been used
173         lib.second.PendingCtors.clear();
174         DP("Done with pending ctors for lib " DPxMOD "\n", DPxPTR(lib.first));
175       }
176     }
177   }
178   Device.HasPendingGlobals = false;
179   Device.PendingGlobalsMtx.unlock();
180 
181   return OFFLOAD_SUCCESS;
182 }
183 
184 // Check whether a device has been initialized, global ctors have been
185 // executed and global data has been mapped; do so if not already done.
186 int CheckDeviceAndCtors(int64_t device_id) {
187   // Is device ready?
188   if (!device_is_ready(device_id)) {
189     DP("Device %" PRId64 " is not ready.\n", device_id);
190     return OFFLOAD_FAIL;
191   }
192 
193   // Get device info.
194   DeviceTy &Device = Devices[device_id];
195 
196   // Check whether global data has been mapped for this device
197   Device.PendingGlobalsMtx.lock();
198   bool hasPendingGlobals = Device.HasPendingGlobals;
199   Device.PendingGlobalsMtx.unlock();
200   if (hasPendingGlobals && InitLibrary(Device) != OFFLOAD_SUCCESS) {
201     DP("Failed to init globals on device %" PRId64 "\n", device_id);
202     return OFFLOAD_FAIL;
203   }
204 
205   return OFFLOAD_SUCCESS;
206 }
207 
208 static int32_t member_of(int64_t type) {
209   return ((type & OMP_TGT_MAPTYPE_MEMBER_OF) >> 48) - 1;
210 }
211 
212 /// Internal function to do the mapping and transfer the data to the device
213 int target_data_begin(DeviceTy &Device, int32_t arg_num,
214     void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) {
215   // process each input.
216   for (int32_t i = 0; i < arg_num; ++i) {
217     // Ignore private variables and arrays - there is no mapping for them.
218     if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) ||
219         (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE))
220       continue;
221 
222     void *HstPtrBegin = args[i];
223     void *HstPtrBase = args_base[i];
224     int64_t data_size = arg_sizes[i];
225 
226     // Adjust for proper alignment if this is a combined entry (for structs).
227     // Look at the next argument - if that is MEMBER_OF this one, then this one
228     // is a combined entry.
229     int64_t padding = 0;
230     const int next_i = i+1;
231     if (member_of(arg_types[i]) < 0 && next_i < arg_num &&
232         member_of(arg_types[next_i]) == i) {
233       padding = (int64_t)HstPtrBegin % alignment;
234       if (padding) {
235         DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD
236             "\n", padding, DPxPTR(HstPtrBegin));
237         HstPtrBegin = (char *) HstPtrBegin - padding;
238         data_size += padding;
239       }
240     }
241 
242     // Address of pointer on the host and device, respectively.
243     void *Pointer_HstPtrBegin, *Pointer_TgtPtrBegin;
244     bool IsNew, Pointer_IsNew;
245     bool IsHostPtr = false;
246     bool IsImplicit = arg_types[i] & OMP_TGT_MAPTYPE_IMPLICIT;
247     // Force the creation of a device side copy of the data when:
248     // a close map modifier was associated with a map that contained a to.
249     bool HasCloseModifier = arg_types[i] & OMP_TGT_MAPTYPE_CLOSE;
250     // UpdateRef is based on MEMBER_OF instead of TARGET_PARAM because if we
251     // have reached this point via __tgt_target_data_begin and not __tgt_target
252     // then no argument is marked as TARGET_PARAM ("omp target data map" is not
253     // associated with a target region, so there are no target parameters). This
254     // may be considered a hack, we could revise the scheme in the future.
255     bool UpdateRef = !(arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF);
256     if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) {
257       DP("Has a pointer entry: \n");
258       // base is address of pointer.
259       Pointer_TgtPtrBegin = Device.getOrAllocTgtPtr(HstPtrBase, HstPtrBase,
260           sizeof(void *), Pointer_IsNew, IsHostPtr, IsImplicit, UpdateRef,
261           HasCloseModifier);
262       if (!Pointer_TgtPtrBegin) {
263         DP("Call to getOrAllocTgtPtr returned null pointer (device failure or "
264             "illegal mapping).\n");
265         return OFFLOAD_FAIL;
266       }
267       DP("There are %zu bytes allocated at target address " DPxMOD " - is%s new"
268           "\n", sizeof(void *), DPxPTR(Pointer_TgtPtrBegin),
269           (Pointer_IsNew ? "" : " not"));
270       Pointer_HstPtrBegin = HstPtrBase;
271       // modify current entry.
272       HstPtrBase = *(void **)HstPtrBase;
273       UpdateRef = true; // subsequently update ref count of pointee
274     }
275 
276     void *TgtPtrBegin = Device.getOrAllocTgtPtr(HstPtrBegin, HstPtrBase,
277         data_size, IsNew, IsHostPtr, IsImplicit, UpdateRef, HasCloseModifier);
278     if (!TgtPtrBegin && data_size) {
279       // If data_size==0, then the argument could be a zero-length pointer to
280       // NULL, so getOrAlloc() returning NULL is not an error.
281       DP("Call to getOrAllocTgtPtr returned null pointer (device failure or "
282           "illegal mapping).\n");
283     }
284     DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
285         " - is%s new\n", data_size, DPxPTR(TgtPtrBegin),
286         (IsNew ? "" : " not"));
287 
288     if (arg_types[i] & OMP_TGT_MAPTYPE_RETURN_PARAM) {
289       uintptr_t Delta = (uintptr_t)HstPtrBegin - (uintptr_t)HstPtrBase;
290       void *TgtPtrBase = (void *)((uintptr_t)TgtPtrBegin - Delta);
291       DP("Returning device pointer " DPxMOD "\n", DPxPTR(TgtPtrBase));
292       args_base[i] = TgtPtrBase;
293     }
294 
295     if (arg_types[i] & OMP_TGT_MAPTYPE_TO) {
296       bool copy = false;
297       if (!(RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) ||
298           HasCloseModifier) {
299         if (IsNew || (arg_types[i] & OMP_TGT_MAPTYPE_ALWAYS)) {
300           copy = true;
301         } else if (arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) {
302           // Copy data only if the "parent" struct has RefCount==1.
303           int32_t parent_idx = member_of(arg_types[i]);
304           long parent_rc = Device.getMapEntryRefCnt(args[parent_idx]);
305           assert(parent_rc > 0 && "parent struct not found");
306           if (parent_rc == 1) {
307             copy = true;
308           }
309         }
310       }
311 
312       if (copy && !IsHostPtr) {
313         DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
314             data_size, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
315         int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, data_size);
316         if (rt != OFFLOAD_SUCCESS) {
317           DP("Copying data to device failed.\n");
318           return OFFLOAD_FAIL;
319         }
320       }
321     }
322 
323     if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ && !IsHostPtr) {
324       DP("Update pointer (" DPxMOD ") -> [" DPxMOD "]\n",
325           DPxPTR(Pointer_TgtPtrBegin), DPxPTR(TgtPtrBegin));
326       uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
327       void *TgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta);
328       int rt = Device.data_submit(Pointer_TgtPtrBegin, &TgtPtrBase,
329           sizeof(void *));
330       if (rt != OFFLOAD_SUCCESS) {
331         DP("Copying data to device failed.\n");
332         return OFFLOAD_FAIL;
333       }
334       // create shadow pointers for this entry
335       Device.ShadowMtx.lock();
336       Device.ShadowPtrMap[Pointer_HstPtrBegin] = {HstPtrBase,
337           Pointer_TgtPtrBegin, TgtPtrBase};
338       Device.ShadowMtx.unlock();
339     }
340   }
341 
342   return OFFLOAD_SUCCESS;
343 }
344 
345 /// Internal function to undo the mapping and retrieve the data from the device.
346 int target_data_end(DeviceTy &Device, int32_t arg_num, void **args_base,
347     void **args, int64_t *arg_sizes, int64_t *arg_types) {
348   // process each input.
349   for (int32_t i = arg_num - 1; i >= 0; --i) {
350     // Ignore private variables and arrays - there is no mapping for them.
351     // Also, ignore the use_device_ptr directive, it has no effect here.
352     if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) ||
353         (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE))
354       continue;
355 
356     void *HstPtrBegin = args[i];
357     int64_t data_size = arg_sizes[i];
358     // Adjust for proper alignment if this is a combined entry (for structs).
359     // Look at the next argument - if that is MEMBER_OF this one, then this one
360     // is a combined entry.
361     int64_t padding = 0;
362     const int next_i = i+1;
363     if (member_of(arg_types[i]) < 0 && next_i < arg_num &&
364         member_of(arg_types[next_i]) == i) {
365       padding = (int64_t)HstPtrBegin % alignment;
366       if (padding) {
367         DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD
368             "\n", padding, DPxPTR(HstPtrBegin));
369         HstPtrBegin = (char *) HstPtrBegin - padding;
370         data_size += padding;
371       }
372     }
373 
374     bool IsLast, IsHostPtr;
375     bool UpdateRef = !(arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) ||
376         (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ);
377     bool ForceDelete = arg_types[i] & OMP_TGT_MAPTYPE_DELETE;
378     bool HasCloseModifier = arg_types[i] & OMP_TGT_MAPTYPE_CLOSE;
379 
380     // If PTR_AND_OBJ, HstPtrBegin is address of pointee
381     void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, data_size, IsLast,
382         UpdateRef, IsHostPtr);
383     DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
384         " - is%s last\n", data_size, DPxPTR(TgtPtrBegin),
385         (IsLast ? "" : " not"));
386 
387     bool DelEntry = IsLast || ForceDelete;
388 
389     if ((arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
390         !(arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) {
391       DelEntry = false; // protect parent struct from being deallocated
392     }
393 
394     if ((arg_types[i] & OMP_TGT_MAPTYPE_FROM) || DelEntry) {
395       // Move data back to the host
396       if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) {
397         bool Always = arg_types[i] & OMP_TGT_MAPTYPE_ALWAYS;
398         bool CopyMember = false;
399         if (!(RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) ||
400             HasCloseModifier) {
401           if ((arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
402               !(arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) {
403             // Copy data only if the "parent" struct has RefCount==1.
404             int32_t parent_idx = member_of(arg_types[i]);
405             long parent_rc = Device.getMapEntryRefCnt(args[parent_idx]);
406             assert(parent_rc > 0 && "parent struct not found");
407             if (parent_rc == 1) {
408               CopyMember = true;
409             }
410           }
411         }
412 
413         if ((DelEntry || Always || CopyMember) &&
414             !(RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
415               TgtPtrBegin == HstPtrBegin)) {
416           DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
417               data_size, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
418           int rt = Device.data_retrieve(HstPtrBegin, TgtPtrBegin, data_size);
419           if (rt != OFFLOAD_SUCCESS) {
420             DP("Copying data from device failed.\n");
421             return OFFLOAD_FAIL;
422           }
423         }
424       }
425 
426       // If we copied back to the host a struct/array containing pointers, we
427       // need to restore the original host pointer values from their shadow
428       // copies. If the struct is going to be deallocated, remove any remaining
429       // shadow pointer entries for this struct.
430       uintptr_t lb = (uintptr_t) HstPtrBegin;
431       uintptr_t ub = (uintptr_t) HstPtrBegin + data_size;
432       Device.ShadowMtx.lock();
433       for (ShadowPtrListTy::iterator it = Device.ShadowPtrMap.begin();
434            it != Device.ShadowPtrMap.end();) {
435         void **ShadowHstPtrAddr = (void**) it->first;
436 
437         // An STL map is sorted on its keys; use this property
438         // to quickly determine when to break out of the loop.
439         if ((uintptr_t) ShadowHstPtrAddr < lb) {
440           ++it;
441           continue;
442         }
443         if ((uintptr_t) ShadowHstPtrAddr >= ub)
444           break;
445 
446         // If we copied the struct to the host, we need to restore the pointer.
447         if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) {
448           DP("Restoring original host pointer value " DPxMOD " for host "
449               "pointer " DPxMOD "\n", DPxPTR(it->second.HstPtrVal),
450               DPxPTR(ShadowHstPtrAddr));
451           *ShadowHstPtrAddr = it->second.HstPtrVal;
452         }
453         // If the struct is to be deallocated, remove the shadow entry.
454         if (DelEntry) {
455           DP("Removing shadow pointer " DPxMOD "\n", DPxPTR(ShadowHstPtrAddr));
456           it = Device.ShadowPtrMap.erase(it);
457         } else {
458           ++it;
459         }
460       }
461       Device.ShadowMtx.unlock();
462 
463       // Deallocate map
464       if (DelEntry) {
465         int rt = Device.deallocTgtPtr(HstPtrBegin, data_size, ForceDelete,
466                                       HasCloseModifier);
467         if (rt != OFFLOAD_SUCCESS) {
468           DP("Deallocating data from device failed.\n");
469           return OFFLOAD_FAIL;
470         }
471       }
472     }
473   }
474 
475   return OFFLOAD_SUCCESS;
476 }
477 
478 /// Internal function to pass data to/from the target.
479 int target_data_update(DeviceTy &Device, int32_t arg_num,
480     void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) {
481   // process each input.
482   for (int32_t i = 0; i < arg_num; ++i) {
483     if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) ||
484         (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE))
485       continue;
486 
487     void *HstPtrBegin = args[i];
488     int64_t MapSize = arg_sizes[i];
489     bool IsLast, IsHostPtr;
490     void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, MapSize, IsLast,
491         false, IsHostPtr);
492     if (!TgtPtrBegin) {
493       DP("hst data:" DPxMOD " not found, becomes a noop\n", DPxPTR(HstPtrBegin));
494       continue;
495     }
496 
497     if (RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
498         TgtPtrBegin == HstPtrBegin) {
499       DP("hst data:" DPxMOD " unified and shared, becomes a noop\n",
500          DPxPTR(HstPtrBegin));
501       continue;
502     }
503 
504     if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) {
505       DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
506           arg_sizes[i], DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
507       int rt = Device.data_retrieve(HstPtrBegin, TgtPtrBegin, MapSize);
508       if (rt != OFFLOAD_SUCCESS) {
509         DP("Copying data from device failed.\n");
510         return OFFLOAD_FAIL;
511       }
512 
513       uintptr_t lb = (uintptr_t) HstPtrBegin;
514       uintptr_t ub = (uintptr_t) HstPtrBegin + MapSize;
515       Device.ShadowMtx.lock();
516       for (ShadowPtrListTy::iterator it = Device.ShadowPtrMap.begin();
517           it != Device.ShadowPtrMap.end(); ++it) {
518         void **ShadowHstPtrAddr = (void**) it->first;
519         if ((uintptr_t) ShadowHstPtrAddr < lb)
520           continue;
521         if ((uintptr_t) ShadowHstPtrAddr >= ub)
522           break;
523         DP("Restoring original host pointer value " DPxMOD " for host pointer "
524             DPxMOD "\n", DPxPTR(it->second.HstPtrVal),
525             DPxPTR(ShadowHstPtrAddr));
526         *ShadowHstPtrAddr = it->second.HstPtrVal;
527       }
528       Device.ShadowMtx.unlock();
529     }
530 
531     if (arg_types[i] & OMP_TGT_MAPTYPE_TO) {
532       DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
533           arg_sizes[i], DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
534       int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, MapSize);
535       if (rt != OFFLOAD_SUCCESS) {
536         DP("Copying data to device failed.\n");
537         return OFFLOAD_FAIL;
538       }
539 
540       uintptr_t lb = (uintptr_t) HstPtrBegin;
541       uintptr_t ub = (uintptr_t) HstPtrBegin + MapSize;
542       Device.ShadowMtx.lock();
543       for (ShadowPtrListTy::iterator it = Device.ShadowPtrMap.begin();
544           it != Device.ShadowPtrMap.end(); ++it) {
545         void **ShadowHstPtrAddr = (void**) it->first;
546         if ((uintptr_t) ShadowHstPtrAddr < lb)
547           continue;
548         if ((uintptr_t) ShadowHstPtrAddr >= ub)
549           break;
550         DP("Restoring original target pointer value " DPxMOD " for target "
551             "pointer " DPxMOD "\n", DPxPTR(it->second.TgtPtrVal),
552             DPxPTR(it->second.TgtPtrAddr));
553         rt = Device.data_submit(it->second.TgtPtrAddr,
554             &it->second.TgtPtrVal, sizeof(void *));
555         if (rt != OFFLOAD_SUCCESS) {
556           DP("Copying data to device failed.\n");
557           Device.ShadowMtx.unlock();
558           return OFFLOAD_FAIL;
559         }
560       }
561       Device.ShadowMtx.unlock();
562     }
563   }
564   return OFFLOAD_SUCCESS;
565 }
566 
567 static const unsigned LambdaMapping = OMP_TGT_MAPTYPE_PTR_AND_OBJ |
568                                       OMP_TGT_MAPTYPE_LITERAL |
569                                       OMP_TGT_MAPTYPE_IMPLICIT;
570 static bool isLambdaMapping(int64_t Mapping) {
571   return (Mapping & LambdaMapping) == LambdaMapping;
572 }
573 
574 /// performs the same actions as data_begin in case arg_num is
575 /// non-zero and initiates run of the offloaded region on the target platform;
576 /// if arg_num is non-zero after the region execution is done it also
577 /// performs the same action as data_update and data_end above. This function
578 /// returns 0 if it was able to transfer the execution to a target and an
579 /// integer different from zero otherwise.
580 int target(int64_t device_id, void *host_ptr, int32_t arg_num,
581     void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types,
582     int32_t team_num, int32_t thread_limit, int IsTeamConstruct) {
583   DeviceTy &Device = Devices[device_id];
584 
585   // Find the table information in the map or look it up in the translation
586   // tables.
587   TableMap *TM = 0;
588   TblMapMtx.lock();
589   HostPtrToTableMapTy::iterator TableMapIt = HostPtrToTableMap.find(host_ptr);
590   if (TableMapIt == HostPtrToTableMap.end()) {
591     // We don't have a map. So search all the registered libraries.
592     TrlTblMtx.lock();
593     for (HostEntriesBeginToTransTableTy::iterator
594              ii = HostEntriesBeginToTransTable.begin(),
595              ie = HostEntriesBeginToTransTable.end();
596          !TM && ii != ie; ++ii) {
597       // get the translation table (which contains all the good info).
598       TranslationTable *TransTable = &ii->second;
599       // iterate over all the host table entries to see if we can locate the
600       // host_ptr.
601       __tgt_offload_entry *begin = TransTable->HostTable.EntriesBegin;
602       __tgt_offload_entry *end = TransTable->HostTable.EntriesEnd;
603       __tgt_offload_entry *cur = begin;
604       for (uint32_t i = 0; cur < end; ++cur, ++i) {
605         if (cur->addr != host_ptr)
606           continue;
607         // we got a match, now fill the HostPtrToTableMap so that we
608         // may avoid this search next time.
609         TM = &HostPtrToTableMap[host_ptr];
610         TM->Table = TransTable;
611         TM->Index = i;
612         break;
613       }
614     }
615     TrlTblMtx.unlock();
616   } else {
617     TM = &TableMapIt->second;
618   }
619   TblMapMtx.unlock();
620 
621   // No map for this host pointer found!
622   if (!TM) {
623     DP("Host ptr " DPxMOD " does not have a matching target pointer.\n",
624        DPxPTR(host_ptr));
625     return OFFLOAD_FAIL;
626   }
627 
628   // get target table.
629   TrlTblMtx.lock();
630   assert(TM->Table->TargetsTable.size() > (size_t)device_id &&
631          "Not expecting a device ID outside the table's bounds!");
632   __tgt_target_table *TargetTable = TM->Table->TargetsTable[device_id];
633   TrlTblMtx.unlock();
634   assert(TargetTable && "Global data has not been mapped\n");
635 
636   // Move data to device.
637   int rc = target_data_begin(Device, arg_num, args_base, args, arg_sizes,
638       arg_types);
639   if (rc != OFFLOAD_SUCCESS) {
640     DP("Call to target_data_begin failed, abort target.\n");
641     return OFFLOAD_FAIL;
642   }
643 
644   std::vector<void *> tgt_args;
645   std::vector<ptrdiff_t> tgt_offsets;
646 
647   // List of (first-)private arrays allocated for this target region
648   std::vector<void *> fpArrays;
649   std::vector<int> tgtArgsPositions(arg_num, -1);
650 
651   for (int32_t i = 0; i < arg_num; ++i) {
652     if (!(arg_types[i] & OMP_TGT_MAPTYPE_TARGET_PARAM)) {
653       // This is not a target parameter, do not push it into tgt_args.
654       // Check for lambda mapping.
655       if (isLambdaMapping(arg_types[i])) {
656         assert((arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
657                "PTR_AND_OBJ must be also MEMBER_OF.");
658         unsigned idx = member_of(arg_types[i]);
659         int tgtIdx = tgtArgsPositions[idx];
660         assert(tgtIdx != -1 && "Base address must be translated already.");
661         // The parent lambda must be processed already and it must be the last
662         // in tgt_args and tgt_offsets arrays.
663         void *HstPtrVal = args[i];
664         void *HstPtrBegin = args_base[i];
665         void *HstPtrBase = args[idx];
666         bool IsLast, IsHostPtr; // unused.
667         void *TgtPtrBase =
668             (void *)((intptr_t)tgt_args[tgtIdx] + tgt_offsets[tgtIdx]);
669         DP("Parent lambda base " DPxMOD "\n", DPxPTR(TgtPtrBase));
670         uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
671         void *TgtPtrBegin = (void *)((uintptr_t)TgtPtrBase + Delta);
672         void *Pointer_TgtPtrBegin =
673             Device.getTgtPtrBegin(HstPtrVal, arg_sizes[i], IsLast, false,
674                                   IsHostPtr);
675         if (!Pointer_TgtPtrBegin) {
676           DP("No lambda captured variable mapped (" DPxMOD ") - ignored\n",
677              DPxPTR(HstPtrVal));
678           continue;
679         }
680         if (RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
681             TgtPtrBegin == HstPtrBegin) {
682           DP("Unified memory is active, no need to map lambda captured"
683              "variable (" DPxMOD ")\n", DPxPTR(HstPtrVal));
684           continue;
685         }
686         DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n",
687            DPxPTR(Pointer_TgtPtrBegin), DPxPTR(TgtPtrBegin));
688         int rt = Device.data_submit(TgtPtrBegin, &Pointer_TgtPtrBegin,
689                                     sizeof(void *));
690         if (rt != OFFLOAD_SUCCESS) {
691           DP("Copying data to device failed.\n");
692           return OFFLOAD_FAIL;
693         }
694       }
695       continue;
696     }
697     void *HstPtrBegin = args[i];
698     void *HstPtrBase = args_base[i];
699     void *TgtPtrBegin;
700     ptrdiff_t TgtBaseOffset;
701     bool IsLast, IsHostPtr; // unused.
702     if (arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) {
703       DP("Forwarding first-private value " DPxMOD " to the target construct\n",
704           DPxPTR(HstPtrBase));
705       TgtPtrBegin = HstPtrBase;
706       TgtBaseOffset = 0;
707     } else if (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE) {
708       // Allocate memory for (first-)private array
709       TgtPtrBegin = Device.RTL->data_alloc(Device.RTLDeviceID,
710           arg_sizes[i], HstPtrBegin);
711       if (!TgtPtrBegin) {
712         DP ("Data allocation for %sprivate array " DPxMOD " failed, "
713             "abort target.\n",
714             (arg_types[i] & OMP_TGT_MAPTYPE_TO ? "first-" : ""),
715             DPxPTR(HstPtrBegin));
716         return OFFLOAD_FAIL;
717       }
718       fpArrays.push_back(TgtPtrBegin);
719       TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
720 #ifdef OMPTARGET_DEBUG
721       void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset);
722       DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD " for "
723           "%sprivate array " DPxMOD " - pushing target argument " DPxMOD "\n",
724           arg_sizes[i], DPxPTR(TgtPtrBegin),
725           (arg_types[i] & OMP_TGT_MAPTYPE_TO ? "first-" : ""),
726           DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBase));
727 #endif
728       // If first-private, copy data from host
729       if (arg_types[i] & OMP_TGT_MAPTYPE_TO) {
730         int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, arg_sizes[i]);
731         if (rt != OFFLOAD_SUCCESS) {
732           DP ("Copying data to device failed, failed.\n");
733           return OFFLOAD_FAIL;
734         }
735       }
736     } else if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) {
737       TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBase, sizeof(void *), IsLast,
738           false, IsHostPtr);
739       TgtBaseOffset = 0; // no offset for ptrs.
740       DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD " to "
741          "object " DPxMOD "\n", DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBase),
742          DPxPTR(HstPtrBase));
743     } else {
744       TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, arg_sizes[i], IsLast,
745           false, IsHostPtr);
746       TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
747 #ifdef OMPTARGET_DEBUG
748       void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset);
749       DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD "\n",
750           DPxPTR(TgtPtrBase), DPxPTR(HstPtrBegin));
751 #endif
752     }
753     tgtArgsPositions[i] = tgt_args.size();
754     tgt_args.push_back(TgtPtrBegin);
755     tgt_offsets.push_back(TgtBaseOffset);
756   }
757 
758   assert(tgt_args.size() == tgt_offsets.size() &&
759       "Size mismatch in arguments and offsets");
760 
761   // Pop loop trip count
762   uint64_t ltc = 0;
763   TblMapMtx.lock();
764   auto I = Device.LoopTripCnt.find(__kmpc_global_thread_num(NULL));
765   if (I != Device.LoopTripCnt.end()) {
766     ltc = I->second;
767     Device.LoopTripCnt.erase(I);
768     DP("loop trip count is %lu.\n", ltc);
769   }
770   TblMapMtx.unlock();
771 
772   // Launch device execution.
773   DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n",
774       TargetTable->EntriesBegin[TM->Index].name,
775       DPxPTR(TargetTable->EntriesBegin[TM->Index].addr), TM->Index);
776   if (IsTeamConstruct) {
777     rc = Device.run_team_region(TargetTable->EntriesBegin[TM->Index].addr,
778         &tgt_args[0], &tgt_offsets[0], tgt_args.size(), team_num,
779         thread_limit, ltc);
780   } else {
781     rc = Device.run_region(TargetTable->EntriesBegin[TM->Index].addr,
782         &tgt_args[0], &tgt_offsets[0], tgt_args.size());
783   }
784   if (rc != OFFLOAD_SUCCESS) {
785     DP ("Executing target region abort target.\n");
786     return OFFLOAD_FAIL;
787   }
788 
789   // Deallocate (first-)private arrays
790   for (auto it : fpArrays) {
791     int rt = Device.RTL->data_delete(Device.RTLDeviceID, it);
792     if (rt != OFFLOAD_SUCCESS) {
793       DP("Deallocation of (first-)private arrays failed.\n");
794       return OFFLOAD_FAIL;
795     }
796   }
797 
798   // Move data from device.
799   int rt = target_data_end(Device, arg_num, args_base, args, arg_sizes,
800       arg_types);
801   if (rt != OFFLOAD_SUCCESS) {
802     DP("Call to target_data_end failed, abort targe.\n");
803     return OFFLOAD_FAIL;
804   }
805 
806   return OFFLOAD_SUCCESS;
807 }
808