1 //===------ omptarget.cpp - Target independent OpenMP target RTL -- C++ -*-===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is dual licensed under the MIT and the University of Illinois Open
6 // Source Licenses. See LICENSE.txt for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // Implementation of the interface to be used by Clang during the codegen of a
11 // target region.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include <omptarget.h>
16 
17 #include "device.h"
18 #include "private.h"
19 #include "rtl.h"
20 
21 #include <cassert>
22 #include <vector>
23 
#ifdef OMPTARGET_DEBUG
// Runtime debug verbosity consumed by the DP() macros. Presumably initialized
// from the LIBOMPTARGET_DEBUG environment variable elsewhere in the runtime
// (rtl.cpp) — confirm; only compiled in for debug builds.
int DebugLevel = 0;
#endif // OMPTARGET_DEBUG
27 
28 
29 
30 /* All begin addresses for partially mapped structs must be 8-aligned in order
31  * to ensure proper alignment of members. E.g.
32  *
33  * struct S {
34  *   int a;   // 4-aligned
35  *   int b;   // 4-aligned
36  *   int *p;  // 8-aligned
37  * } s1;
38  * ...
39  * #pragma omp target map(tofrom: s1.b, s1.p[0:N])
40  * {
41  *   s1.b = 5;
42  *   for (int i...) s1.p[i] = ...;
43  * }
44  *
45  * Here we are mapping s1 starting from member b, so BaseAddress=&s1=&s1.a and
46  * BeginAddress=&s1.b. Let's assume that the struct begins at address 0x100,
47  * then &s1.a=0x100, &s1.b=0x104, &s1.p=0x108. Each member obeys the alignment
48  * requirements for its type. Now, when we allocate memory on the device, in
49  * CUDA's case cuMemAlloc() returns an address which is at least 256-aligned.
50  * This means that the chunk of the struct on the device will start at a
51  * 256-aligned address, let's say 0x200. Then the address of b will be 0x200 and
52  * address of p will be a misaligned 0x204 (on the host there was no need to add
53  * padding between b and p, so p comes exactly 4 bytes after b). If the device
54  * kernel tries to access s1.p, a misaligned address error occurs (as reported
55  * by the CUDA plugin). By padding the begin address down to a multiple of 8 and
56  * extending the size of the allocated chuck accordingly, the chuck on the
57  * device will start at 0x200 with the padding (4 bytes), then &s1.b=0x204 and
58  * &s1.p=0x208, as they should be to satisfy the alignment requirements.
59  */
// See rationale above: 8 covers the strictest member alignment expected on
// 64-bit targets (pointers, doubles), so padding begin addresses down to a
// multiple of 8 preserves member alignment on the device.
static const int64_t alignment = 8;
61 
/// Map global data and execute pending ctors
/// One-time per-library initialization for \p Device:
///   1) load each registered device image and build its target entries table,
///   2) map every global host entry (size != 0) into the device's data map
///      with an infinite reference count so it stays resident, and
///   3) run any static constructors still pending for this device.
/// Returns OFFLOAD_SUCCESS or OFFLOAD_FAIL. PendingGlobalsMtx is held for the
/// whole call; TrlTblMtx and DataMapMtx are taken for shorter sections.
static int InitLibrary(DeviceTy& Device) {
  /*
   * Map global data
   */
  int32_t device_id = Device.DeviceID;
  int rc = OFFLOAD_SUCCESS;

  Device.PendingGlobalsMtx.lock();
  TrlTblMtx.lock();
  for (HostEntriesBeginToTransTableTy::iterator
      ii = HostEntriesBeginToTransTable.begin();
      ii != HostEntriesBeginToTransTable.end(); ++ii) {
    TranslationTable *TransTable = &ii->second;
    if (TransTable->TargetsTable[device_id] != 0) {
      // Library entries have already been processed
      continue;
    }

    // 1) get image.
    assert(TransTable->TargetsImages.size() > (size_t)device_id &&
           "Not expecting a device ID outside the table's bounds!");
    __tgt_device_image *img = TransTable->TargetsImages[device_id];
    if (!img) {
      DP("No image loaded for device id %d.\n", device_id);
      rc = OFFLOAD_FAIL;
      break;
    }
    // 2) load image into the target table.
    __tgt_target_table *TargetTable =
        TransTable->TargetsTable[device_id] = Device.load_binary(img);
    // Unable to get table for this image: invalidate image and fail.
    if (!TargetTable) {
      DP("Unable to generate entries table for device id %d.\n", device_id);
      TransTable->TargetsImages[device_id] = 0;
      rc = OFFLOAD_FAIL;
      break;
    }

    // Verify whether the two table sizes match.
    size_t hsize =
        TransTable->HostTable.EntriesEnd - TransTable->HostTable.EntriesBegin;
    size_t tsize = TargetTable->EntriesEnd - TargetTable->EntriesBegin;

    // Invalid image for these host entries!
    if (hsize != tsize) {
      DP("Host and Target tables mismatch for device id %d [%zx != %zx].\n",
         device_id, hsize, tsize);
      TransTable->TargetsImages[device_id] = 0;
      TransTable->TargetsTable[device_id] = 0;
      rc = OFFLOAD_FAIL;
      break;
    }

    // process global data that needs to be mapped.
    Device.DataMapMtx.lock();
    __tgt_target_table *HostTable = &TransTable->HostTable;
    // Host and target tables were verified above to have the same number of
    // entries, so they can be walked in lockstep.
    for (__tgt_offload_entry *CurrDeviceEntry = TargetTable->EntriesBegin,
                             *CurrHostEntry = HostTable->EntriesBegin,
                             *EntryDeviceEnd = TargetTable->EntriesEnd;
         CurrDeviceEntry != EntryDeviceEnd;
         CurrDeviceEntry++, CurrHostEntry++) {
      if (CurrDeviceEntry->size != 0) {
        // has data.
        assert(CurrDeviceEntry->size == CurrHostEntry->size &&
               "data size mismatch");

        // Fortran may use multiple weak declarations for the same symbol,
        // therefore we must allow for multiple weak symbols to be loaded from
        // the fat binary. Treat these mappings as any other "regular" mapping.
        // Add entry to map.
        if (Device.getTgtPtrBegin(CurrHostEntry->addr, CurrHostEntry->size))
          continue;
        DP("Add mapping from host " DPxMOD " to device " DPxMOD " with size %zu"
            "\n", DPxPTR(CurrHostEntry->addr), DPxPTR(CurrDeviceEntry->addr),
            CurrDeviceEntry->size);
        // INF_REF_CNT pins the global so target_data_end never deallocates it.
        Device.HostDataToTargetMap.push_front(HostDataToTargetTy(
            (uintptr_t)CurrHostEntry->addr /*HstPtrBase*/,
            (uintptr_t)CurrHostEntry->addr /*HstPtrBegin*/,
            (uintptr_t)CurrHostEntry->addr + CurrHostEntry->size /*HstPtrEnd*/,
            (uintptr_t)CurrDeviceEntry->addr /*TgtPtrBegin*/,
            INF_REF_CNT /*RefCount*/));
      }
    }
    Device.DataMapMtx.unlock();
  }
  TrlTblMtx.unlock();

  if (rc != OFFLOAD_SUCCESS) {
    Device.PendingGlobalsMtx.unlock();
    return rc;
  }

  /*
   * Run ctors for static objects
   */
  if (!Device.PendingCtorsDtors.empty()) {
    // Call all ctors for all libraries registered so far
    for (auto &lib : Device.PendingCtorsDtors) {
      if (!lib.second.PendingCtors.empty()) {
        DP("Has pending ctors... call now\n");
        for (auto &entry : lib.second.PendingCtors) {
          void *ctor = entry;
          // NOTE: this 'rc' deliberately shadows the outer one; a ctor
          // failure aborts immediately below, before the outer rc is read.
          int rc = target(device_id, ctor, 0, NULL, NULL, NULL,
                          NULL, 1, 1, true /*team*/);
          if (rc != OFFLOAD_SUCCESS) {
            DP("Running ctor " DPxMOD " failed.\n", DPxPTR(ctor));
            Device.PendingGlobalsMtx.unlock();
            return OFFLOAD_FAIL;
          }
        }
        // Clear the list to indicate that this device has been used
        lib.second.PendingCtors.clear();
        DP("Done with pending ctors for lib " DPxMOD "\n", DPxPTR(lib.first));
      }
    }
  }
  Device.HasPendingGlobals = false;
  Device.PendingGlobalsMtx.unlock();

  return OFFLOAD_SUCCESS;
}
184 
185 // Check whether a device has been initialized, global ctors have been
186 // executed and global data has been mapped; do so if not already done.
187 int CheckDeviceAndCtors(int64_t device_id) {
188   // Is device ready?
189   if (!device_is_ready(device_id)) {
190     DP("Device %" PRId64 " is not ready.\n", device_id);
191     return OFFLOAD_FAIL;
192   }
193 
194   // Get device info.
195   DeviceTy &Device = Devices[device_id];
196 
197   // Check whether global data has been mapped for this device
198   Device.PendingGlobalsMtx.lock();
199   bool hasPendingGlobals = Device.HasPendingGlobals;
200   Device.PendingGlobalsMtx.unlock();
201   if (hasPendingGlobals && InitLibrary(Device) != OFFLOAD_SUCCESS) {
202     DP("Failed to init globals on device %" PRId64 "\n", device_id);
203     return OFFLOAD_FAIL;
204   }
205 
206   return OFFLOAD_SUCCESS;
207 }
208 
209 static int32_t member_of(int64_t type) {
210   return ((type & OMP_TGT_MAPTYPE_MEMBER_OF) >> 48) - 1;
211 }
212 
213 /// Internal function to do the mapping and transfer the data to the device
214 int target_data_begin(DeviceTy &Device, int32_t arg_num,
215     void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) {
216   // process each input.
217   for (int32_t i = 0; i < arg_num; ++i) {
218     // Ignore private variables and arrays - there is no mapping for them.
219     if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) ||
220         (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE))
221       continue;
222 
223     void *HstPtrBegin = args[i];
224     void *HstPtrBase = args_base[i];
225     int64_t data_size = arg_sizes[i];
226 
227     // Adjust for proper alignment if this is a combined entry (for structs).
228     // Look at the next argument - if that is MEMBER_OF this one, then this one
229     // is a combined entry.
230     int64_t padding = 0;
231     const int next_i = i+1;
232     if (member_of(arg_types[i]) < 0 && next_i < arg_num &&
233         member_of(arg_types[next_i]) == i) {
234       padding = (int64_t)HstPtrBegin % alignment;
235       if (padding) {
236         DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD
237             "\n", padding, DPxPTR(HstPtrBegin));
238         HstPtrBegin = (char *) HstPtrBegin - padding;
239         data_size += padding;
240       }
241     }
242 
243     // Address of pointer on the host and device, respectively.
244     void *Pointer_HstPtrBegin, *Pointer_TgtPtrBegin;
245     bool IsNew, Pointer_IsNew;
246     bool IsImplicit = arg_types[i] & OMP_TGT_MAPTYPE_IMPLICIT;
247     // UpdateRef is based on MEMBER_OF instead of TARGET_PARAM because if we
248     // have reached this point via __tgt_target_data_begin and not __tgt_target
249     // then no argument is marked as TARGET_PARAM ("omp target data map" is not
250     // associated with a target region, so there are no target parameters). This
251     // may be considered a hack, we could revise the scheme in the future.
252     bool UpdateRef = !(arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF);
253     if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) {
254       DP("Has a pointer entry: \n");
255       // base is address of pointer.
256       Pointer_TgtPtrBegin = Device.getOrAllocTgtPtr(HstPtrBase, HstPtrBase,
257           sizeof(void *), Pointer_IsNew, IsImplicit, UpdateRef);
258       if (!Pointer_TgtPtrBegin) {
259         DP("Call to getOrAllocTgtPtr returned null pointer (device failure or "
260             "illegal mapping).\n");
261         return OFFLOAD_FAIL;
262       }
263       DP("There are %zu bytes allocated at target address " DPxMOD " - is%s new"
264           "\n", sizeof(void *), DPxPTR(Pointer_TgtPtrBegin),
265           (Pointer_IsNew ? "" : " not"));
266       Pointer_HstPtrBegin = HstPtrBase;
267       // modify current entry.
268       HstPtrBase = *(void **)HstPtrBase;
269       UpdateRef = true; // subsequently update ref count of pointee
270     }
271 
272     void *TgtPtrBegin = Device.getOrAllocTgtPtr(HstPtrBegin, HstPtrBase,
273         data_size, IsNew, IsImplicit, UpdateRef);
274     if (!TgtPtrBegin && data_size) {
275       // If data_size==0, then the argument could be a zero-length pointer to
276       // NULL, so getOrAlloc() returning NULL is not an error.
277       DP("Call to getOrAllocTgtPtr returned null pointer (device failure or "
278           "illegal mapping).\n");
279     }
280     DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
281         " - is%s new\n", data_size, DPxPTR(TgtPtrBegin),
282         (IsNew ? "" : " not"));
283 
284     if (arg_types[i] & OMP_TGT_MAPTYPE_RETURN_PARAM) {
285       uintptr_t Delta = (uintptr_t)HstPtrBegin - (uintptr_t)HstPtrBase;
286       void *TgtPtrBase = (void *)((uintptr_t)TgtPtrBegin - Delta);
287       DP("Returning device pointer " DPxMOD "\n", DPxPTR(TgtPtrBase));
288       args_base[i] = TgtPtrBase;
289     }
290 
291     if (arg_types[i] & OMP_TGT_MAPTYPE_TO) {
292       bool copy = false;
293       if (IsNew || (arg_types[i] & OMP_TGT_MAPTYPE_ALWAYS)) {
294         copy = true;
295       } else if (arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) {
296         // Copy data only if the "parent" struct has RefCount==1.
297         int32_t parent_idx = member_of(arg_types[i]);
298         long parent_rc = Device.getMapEntryRefCnt(args[parent_idx]);
299         assert(parent_rc > 0 && "parent struct not found");
300         if (parent_rc == 1) {
301           copy = true;
302         }
303       }
304 
305       if (copy) {
306         DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
307             data_size, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
308         int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, data_size);
309         if (rt != OFFLOAD_SUCCESS) {
310           DP("Copying data to device failed.\n");
311           return OFFLOAD_FAIL;
312         }
313       }
314     }
315 
316     if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) {
317       DP("Update pointer (" DPxMOD ") -> [" DPxMOD "]\n",
318           DPxPTR(Pointer_TgtPtrBegin), DPxPTR(TgtPtrBegin));
319       uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
320       void *TgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta);
321       int rt = Device.data_submit(Pointer_TgtPtrBegin, &TgtPtrBase,
322           sizeof(void *));
323       if (rt != OFFLOAD_SUCCESS) {
324         DP("Copying data to device failed.\n");
325         return OFFLOAD_FAIL;
326       }
327       // create shadow pointers for this entry
328       Device.ShadowMtx.lock();
329       Device.ShadowPtrMap[Pointer_HstPtrBegin] = {HstPtrBase,
330           Pointer_TgtPtrBegin, TgtPtrBase};
331       Device.ShadowMtx.unlock();
332     }
333   }
334 
335   return OFFLOAD_SUCCESS;
336 }
337 
/// Internal function to undo the mapping and retrieve the data from the device.
/// Arguments are walked in reverse order so that struct members (which follow
/// their combined parent entry in the argument list) are processed before the
/// parent entry that owns their storage. Returns OFFLOAD_SUCCESS/OFFLOAD_FAIL.
int target_data_end(DeviceTy &Device, int32_t arg_num, void **args_base,
    void **args, int64_t *arg_sizes, int64_t *arg_types) {
  // process each input.
  for (int32_t i = arg_num - 1; i >= 0; --i) {
    // Ignore private variables and arrays - there is no mapping for them.
    // Also, ignore the use_device_ptr directive, it has no effect here.
    if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) ||
        (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE))
      continue;

    void *HstPtrBegin = args[i];
    int64_t data_size = arg_sizes[i];
    // Adjust for proper alignment if this is a combined entry (for structs).
    // Look at the next argument - if that is MEMBER_OF this one, then this one
    // is a combined entry.
    // This must mirror the identical padding applied in target_data_begin so
    // the same begin address / size are looked up here.
    int64_t padding = 0;
    const int next_i = i+1;
    if (member_of(arg_types[i]) < 0 && next_i < arg_num &&
        member_of(arg_types[next_i]) == i) {
      padding = (int64_t)HstPtrBegin % alignment;
      if (padding) {
        DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD
            "\n", padding, DPxPTR(HstPtrBegin));
        HstPtrBegin = (char *) HstPtrBegin - padding;
        data_size += padding;
      }
    }

    bool IsLast;
    // Only decrement the reference count for non-members, or for PTR_AND_OBJ
    // entries (whose pointee ref count was bumped in target_data_begin).
    bool UpdateRef = !(arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) ||
        (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ);
    bool ForceDelete = arg_types[i] & OMP_TGT_MAPTYPE_DELETE;

    // If PTR_AND_OBJ, HstPtrBegin is address of pointee
    void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, data_size, IsLast,
        UpdateRef);
    DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
        " - is%s last\n", data_size, DPxPTR(TgtPtrBegin),
        (IsLast ? "" : " not"));

    bool DelEntry = IsLast || ForceDelete;

    if ((arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
        !(arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) {
      DelEntry = false; // protect parent struct from being deallocated
    }

    if ((arg_types[i] & OMP_TGT_MAPTYPE_FROM) || DelEntry) {
      // Move data back to the host
      if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) {
        bool Always = arg_types[i] & OMP_TGT_MAPTYPE_ALWAYS;
        bool CopyMember = false;
        if ((arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
            !(arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) {
          // Copy data only if the "parent" struct has RefCount==1.
          int32_t parent_idx = member_of(arg_types[i]);
          long parent_rc = Device.getMapEntryRefCnt(args[parent_idx]);
          assert(parent_rc > 0 && "parent struct not found");
          if (parent_rc == 1) {
            CopyMember = true;
          }
        }

        if (DelEntry || Always || CopyMember) {
          DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
              data_size, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
          int rt = Device.data_retrieve(HstPtrBegin, TgtPtrBegin, data_size);
          if (rt != OFFLOAD_SUCCESS) {
            DP("Copying data from device failed.\n");
            return OFFLOAD_FAIL;
          }
        }
      }

      // If we copied back to the host a struct/array containing pointers, we
      // need to restore the original host pointer values from their shadow
      // copies. If the struct is going to be deallocated, remove any remaining
      // shadow pointer entries for this struct.
      uintptr_t lb = (uintptr_t) HstPtrBegin;
      uintptr_t ub = (uintptr_t) HstPtrBegin + data_size;
      Device.ShadowMtx.lock();
      for (ShadowPtrListTy::iterator it = Device.ShadowPtrMap.begin();
           it != Device.ShadowPtrMap.end();) {
        void **ShadowHstPtrAddr = (void**) it->first;

        // An STL map is sorted on its keys; use this property
        // to quickly determine when to break out of the loop.
        if ((uintptr_t) ShadowHstPtrAddr < lb) {
          ++it;
          continue;
        }
        if ((uintptr_t) ShadowHstPtrAddr >= ub)
          break;

        // If we copied the struct to the host, we need to restore the pointer.
        if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) {
          DP("Restoring original host pointer value " DPxMOD " for host "
              "pointer " DPxMOD "\n", DPxPTR(it->second.HstPtrVal),
              DPxPTR(ShadowHstPtrAddr));
          *ShadowHstPtrAddr = it->second.HstPtrVal;
        }
        // If the struct is to be deallocated, remove the shadow entry.
        if (DelEntry) {
          DP("Removing shadow pointer " DPxMOD "\n", DPxPTR(ShadowHstPtrAddr));
          // erase() returns the next iterator, keeping the traversal valid.
          it = Device.ShadowPtrMap.erase(it);
        } else {
          ++it;
        }
      }
      Device.ShadowMtx.unlock();

      // Deallocate map
      if (DelEntry) {
        int rt = Device.deallocTgtPtr(HstPtrBegin, data_size, ForceDelete);
        if (rt != OFFLOAD_SUCCESS) {
          DP("Deallocating data from device failed.\n");
          return OFFLOAD_FAIL;
        }
      }
    }
  }

  return OFFLOAD_SUCCESS;
}
463 
/// Internal function to pass data to/from the target.
/// Implements "target update": for each already-mapped argument, copies data
/// device->host (FROM) and/or host->device (TO) without changing reference
/// counts, then patches pointer members via the shadow map so each side keeps
/// seeing its own address space. Unmapped arguments are silently skipped.
int target_data_update(DeviceTy &Device, int32_t arg_num,
    void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) {
  // process each input.
  for (int32_t i = 0; i < arg_num; ++i) {
    if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) ||
        (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE))
      continue;

    void *HstPtrBegin = args[i];
    int64_t MapSize = arg_sizes[i];
    bool IsLast;
    // Last argument 'false': do not update the reference count.
    void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, MapSize, IsLast,
        false);
    if (!TgtPtrBegin) {
      DP("hst data:" DPxMOD " not found, becomes a noop\n", DPxPTR(HstPtrBegin));
      continue;
    }

    if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) {
      DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
          arg_sizes[i], DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
      int rt = Device.data_retrieve(HstPtrBegin, TgtPtrBegin, MapSize);
      if (rt != OFFLOAD_SUCCESS) {
        DP("Copying data from device failed.\n");
        return OFFLOAD_FAIL;
      }

      // The copy-back overwrote any pointer members with device addresses;
      // restore the saved host values from the shadow map. The map is sorted
      // by key, so scanning stops once past this argument's range.
      uintptr_t lb = (uintptr_t) HstPtrBegin;
      uintptr_t ub = (uintptr_t) HstPtrBegin + MapSize;
      Device.ShadowMtx.lock();
      for (ShadowPtrListTy::iterator it = Device.ShadowPtrMap.begin();
          it != Device.ShadowPtrMap.end(); ++it) {
        void **ShadowHstPtrAddr = (void**) it->first;
        if ((uintptr_t) ShadowHstPtrAddr < lb)
          continue;
        if ((uintptr_t) ShadowHstPtrAddr >= ub)
          break;
        DP("Restoring original host pointer value " DPxMOD " for host pointer "
            DPxMOD "\n", DPxPTR(it->second.HstPtrVal),
            DPxPTR(ShadowHstPtrAddr));
        *ShadowHstPtrAddr = it->second.HstPtrVal;
      }
      Device.ShadowMtx.unlock();
    }

    if (arg_types[i] & OMP_TGT_MAPTYPE_TO) {
      DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
          arg_sizes[i], DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
      int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, MapSize);
      if (rt != OFFLOAD_SUCCESS) {
        DP("Copying data to device failed.\n");
        return OFFLOAD_FAIL;
      }
      // The submit overwrote device-side pointer members with host addresses;
      // re-submit the saved device values for every shadowed pointer in range.
      uintptr_t lb = (uintptr_t) HstPtrBegin;
      uintptr_t ub = (uintptr_t) HstPtrBegin + MapSize;
      Device.ShadowMtx.lock();
      for (ShadowPtrListTy::iterator it = Device.ShadowPtrMap.begin();
          it != Device.ShadowPtrMap.end(); ++it) {
        void **ShadowHstPtrAddr = (void**) it->first;
        if ((uintptr_t) ShadowHstPtrAddr < lb)
          continue;
        if ((uintptr_t) ShadowHstPtrAddr >= ub)
          break;
        DP("Restoring original target pointer value " DPxMOD " for target "
            "pointer " DPxMOD "\n", DPxPTR(it->second.TgtPtrVal),
            DPxPTR(it->second.TgtPtrAddr));
        rt = Device.data_submit(it->second.TgtPtrAddr,
            &it->second.TgtPtrVal, sizeof(void *));
        if (rt != OFFLOAD_SUCCESS) {
          DP("Copying data to device failed.\n");
          Device.ShadowMtx.unlock();
          return OFFLOAD_FAIL;
        }
      }
      Device.ShadowMtx.unlock();
    }
  }
  return OFFLOAD_SUCCESS;
}
544 
545 static const unsigned LambdaMapping = OMP_TGT_MAPTYPE_PTR_AND_OBJ |
546                                       OMP_TGT_MAPTYPE_LITERAL |
547                                       OMP_TGT_MAPTYPE_IMPLICIT;
548 static bool isLambdaMapping(int64_t Mapping) {
549   return (Mapping & LambdaMapping) == LambdaMapping;
550 }
551 
552 /// performs the same actions as data_begin in case arg_num is
553 /// non-zero and initiates run of the offloaded region on the target platform;
554 /// if arg_num is non-zero after the region execution is done it also
555 /// performs the same action as data_update and data_end above. This function
556 /// returns 0 if it was able to transfer the execution to a target and an
557 /// integer different from zero otherwise.
558 int target(int64_t device_id, void *host_ptr, int32_t arg_num,
559     void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types,
560     int32_t team_num, int32_t thread_limit, int IsTeamConstruct) {
561   DeviceTy &Device = Devices[device_id];
562 
563   // Find the table information in the map or look it up in the translation
564   // tables.
565   TableMap *TM = 0;
566   TblMapMtx.lock();
567   HostPtrToTableMapTy::iterator TableMapIt = HostPtrToTableMap.find(host_ptr);
568   if (TableMapIt == HostPtrToTableMap.end()) {
569     // We don't have a map. So search all the registered libraries.
570     TrlTblMtx.lock();
571     for (HostEntriesBeginToTransTableTy::iterator
572              ii = HostEntriesBeginToTransTable.begin(),
573              ie = HostEntriesBeginToTransTable.end();
574          !TM && ii != ie; ++ii) {
575       // get the translation table (which contains all the good info).
576       TranslationTable *TransTable = &ii->second;
577       // iterate over all the host table entries to see if we can locate the
578       // host_ptr.
579       __tgt_offload_entry *begin = TransTable->HostTable.EntriesBegin;
580       __tgt_offload_entry *end = TransTable->HostTable.EntriesEnd;
581       __tgt_offload_entry *cur = begin;
582       for (uint32_t i = 0; cur < end; ++cur, ++i) {
583         if (cur->addr != host_ptr)
584           continue;
585         // we got a match, now fill the HostPtrToTableMap so that we
586         // may avoid this search next time.
587         TM = &HostPtrToTableMap[host_ptr];
588         TM->Table = TransTable;
589         TM->Index = i;
590         break;
591       }
592     }
593     TrlTblMtx.unlock();
594   } else {
595     TM = &TableMapIt->second;
596   }
597   TblMapMtx.unlock();
598 
599   // No map for this host pointer found!
600   if (!TM) {
601     DP("Host ptr " DPxMOD " does not have a matching target pointer.\n",
602        DPxPTR(host_ptr));
603     return OFFLOAD_FAIL;
604   }
605 
606   // get target table.
607   TrlTblMtx.lock();
608   assert(TM->Table->TargetsTable.size() > (size_t)device_id &&
609          "Not expecting a device ID outside the table's bounds!");
610   __tgt_target_table *TargetTable = TM->Table->TargetsTable[device_id];
611   TrlTblMtx.unlock();
612   assert(TargetTable && "Global data has not been mapped\n");
613 
614   // Move data to device.
615   int rc = target_data_begin(Device, arg_num, args_base, args, arg_sizes,
616       arg_types);
617   if (rc != OFFLOAD_SUCCESS) {
618     DP("Call to target_data_begin failed, abort target.\n");
619     return OFFLOAD_FAIL;
620   }
621 
622   std::vector<void *> tgt_args;
623   std::vector<ptrdiff_t> tgt_offsets;
624 
625   // List of (first-)private arrays allocated for this target region
626   std::vector<void *> fpArrays;
627   std::vector<int> tgtArgsPositions(arg_num, -1);
628 
629   for (int32_t i = 0; i < arg_num; ++i) {
630     if (!(arg_types[i] & OMP_TGT_MAPTYPE_TARGET_PARAM)) {
631       // This is not a target parameter, do not push it into tgt_args.
632       // Check for lambda mapping.
633       if (isLambdaMapping(arg_types[i])) {
634         assert((arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
635                "PTR_AND_OBJ must be also MEMBER_OF.");
636         unsigned idx = member_of(arg_types[i]);
637         int tgtIdx = tgtArgsPositions[idx];
638         assert(tgtIdx != -1 && "Base address must be translated already.");
639         // The parent lambda must be processed already and it must be the last
640         // in tgt_args and tgt_offsets arrays.
641         void *HstPtrVal = args[i];
642         void *HstPtrBegin = args_base[i];
643         void *HstPtrBase = args[idx];
644         bool IsLast; // unused.
645         void *TgtPtrBase =
646             (void *)((intptr_t)tgt_args[tgtIdx] + tgt_offsets[tgtIdx]);
647         DP("Parent lambda base " DPxMOD "\n", DPxPTR(TgtPtrBase));
648         uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
649         void *TgtPtrBegin = (void *)((uintptr_t)TgtPtrBase + Delta);
650         void *Pointer_TgtPtrBegin =
651             Device.getTgtPtrBegin(HstPtrVal, arg_sizes[i], IsLast, false);
652         if (!Pointer_TgtPtrBegin) {
653           DP("No lambda captured variable mapped (" DPxMOD ") - ignored\n",
654              DPxPTR(HstPtrVal));
655           continue;
656         }
657         DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n",
658            DPxPTR(Pointer_TgtPtrBegin), DPxPTR(TgtPtrBegin));
659         int rt = Device.data_submit(TgtPtrBegin, &Pointer_TgtPtrBegin,
660                                     sizeof(void *));
661         if (rt != OFFLOAD_SUCCESS) {
662           DP("Copying data to device failed.\n");
663           return OFFLOAD_FAIL;
664         }
665       }
666       continue;
667     }
668     void *HstPtrBegin = args[i];
669     void *HstPtrBase = args_base[i];
670     void *TgtPtrBegin;
671     ptrdiff_t TgtBaseOffset;
672     bool IsLast; // unused.
673     if (arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) {
674       DP("Forwarding first-private value " DPxMOD " to the target construct\n",
675           DPxPTR(HstPtrBase));
676       TgtPtrBegin = HstPtrBase;
677       TgtBaseOffset = 0;
678     } else if (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE) {
679       // Allocate memory for (first-)private array
680       TgtPtrBegin = Device.RTL->data_alloc(Device.RTLDeviceID,
681           arg_sizes[i], HstPtrBegin);
682       if (!TgtPtrBegin) {
683         DP ("Data allocation for %sprivate array " DPxMOD " failed, "
684             "abort target.\n",
685             (arg_types[i] & OMP_TGT_MAPTYPE_TO ? "first-" : ""),
686             DPxPTR(HstPtrBegin));
687         return OFFLOAD_FAIL;
688       }
689       fpArrays.push_back(TgtPtrBegin);
690       TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
691 #ifdef OMPTARGET_DEBUG
692       void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset);
693       DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD " for "
694           "%sprivate array " DPxMOD " - pushing target argument " DPxMOD "\n",
695           arg_sizes[i], DPxPTR(TgtPtrBegin),
696           (arg_types[i] & OMP_TGT_MAPTYPE_TO ? "first-" : ""),
697           DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBase));
698 #endif
699       // If first-private, copy data from host
700       if (arg_types[i] & OMP_TGT_MAPTYPE_TO) {
701         int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, arg_sizes[i]);
702         if (rt != OFFLOAD_SUCCESS) {
703           DP ("Copying data to device failed, failed.\n");
704           return OFFLOAD_FAIL;
705         }
706       }
707     } else if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) {
708       TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBase, sizeof(void *), IsLast,
709           false);
710       TgtBaseOffset = 0; // no offset for ptrs.
711       DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD " to "
712          "object " DPxMOD "\n", DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBase),
713          DPxPTR(HstPtrBase));
714     } else {
715       TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, arg_sizes[i], IsLast,
716           false);
717       TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
718 #ifdef OMPTARGET_DEBUG
719       void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset);
720       DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD "\n",
721           DPxPTR(TgtPtrBase), DPxPTR(HstPtrBegin));
722 #endif
723     }
724     tgtArgsPositions[i] = tgt_args.size();
725     tgt_args.push_back(TgtPtrBegin);
726     tgt_offsets.push_back(TgtBaseOffset);
727   }
728 
729   assert(tgt_args.size() == tgt_offsets.size() &&
730       "Size mismatch in arguments and offsets");
731 
732   // Pop loop trip count
733   uint64_t ltc = Device.loopTripCnt;
734   Device.loopTripCnt = 0;
735 
736   // Launch device execution.
737   DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n",
738       TargetTable->EntriesBegin[TM->Index].name,
739       DPxPTR(TargetTable->EntriesBegin[TM->Index].addr), TM->Index);
740   if (IsTeamConstruct) {
741     rc = Device.run_team_region(TargetTable->EntriesBegin[TM->Index].addr,
742         &tgt_args[0], &tgt_offsets[0], tgt_args.size(), team_num,
743         thread_limit, ltc);
744   } else {
745     rc = Device.run_region(TargetTable->EntriesBegin[TM->Index].addr,
746         &tgt_args[0], &tgt_offsets[0], tgt_args.size());
747   }
748   if (rc != OFFLOAD_SUCCESS) {
749     DP ("Executing target region abort target.\n");
750     return OFFLOAD_FAIL;
751   }
752 
753   // Deallocate (first-)private arrays
754   for (auto it : fpArrays) {
755     int rt = Device.RTL->data_delete(Device.RTLDeviceID, it);
756     if (rt != OFFLOAD_SUCCESS) {
757       DP("Deallocation of (first-)private arrays failed.\n");
758       return OFFLOAD_FAIL;
759     }
760   }
761 
762   // Move data from device.
763   int rt = target_data_end(Device, arg_num, args_base, args, arg_sizes,
764       arg_types);
765   if (rt != OFFLOAD_SUCCESS) {
766     DP("Call to target_data_end failed, abort targe.\n");
767     return OFFLOAD_FAIL;
768   }
769 
770   return OFFLOAD_SUCCESS;
771 }
772