1 //===------ omptarget.cpp - Target independent OpenMP target RTL -- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // Implementation of the interface to be used by Clang during the codegen of a
10 // target region.
11 //
12 //===----------------------------------------------------------------------===//
13
14 #include "omptarget.h"
15 #include "device.h"
16 #include "private.h"
17 #include "rtl.h"
18
19 #include <cassert>
20 #include <cstdint>
21 #include <vector>
22
synchronize()23 int AsyncInfoTy::synchronize() {
24 int Result = OFFLOAD_SUCCESS;
25 if (AsyncInfo.Queue) {
26 // If we have a queue we need to synchronize it now.
27 Result = Device.synchronize(*this);
28 assert(AsyncInfo.Queue == nullptr &&
29 "The device plugin should have nulled the queue to indicate there "
30 "are no outstanding actions!");
31 }
32 return Result;
33 }
34
getVoidPtrLocation()35 void *&AsyncInfoTy::getVoidPtrLocation() {
36 BufferLocations.push_back(nullptr);
37 return BufferLocations.back();
38 }
39
40 /* All begin addresses for partially mapped structs must be 8-aligned in order
41 * to ensure proper alignment of members. E.g.
42 *
43 * struct S {
44 * int a; // 4-aligned
45 * int b; // 4-aligned
46 * int *p; // 8-aligned
47 * } s1;
48 * ...
49 * #pragma omp target map(tofrom: s1.b, s1.p[0:N])
50 * {
51 * s1.b = 5;
52 * for (int i...) s1.p[i] = ...;
53 * }
54 *
55 * Here we are mapping s1 starting from member b, so BaseAddress=&s1=&s1.a and
56 * BeginAddress=&s1.b. Let's assume that the struct begins at address 0x100,
57 * then &s1.a=0x100, &s1.b=0x104, &s1.p=0x108. Each member obeys the alignment
58 * requirements for its type. Now, when we allocate memory on the device, in
59 * CUDA's case cuMemAlloc() returns an address which is at least 256-aligned.
60 * This means that the chunk of the struct on the device will start at a
61 * 256-aligned address, let's say 0x200. Then the address of b will be 0x200 and
62 * address of p will be a misaligned 0x204 (on the host there was no need to add
63 * padding between b and p, so p comes exactly 4 bytes after b). If the device
64 * kernel tries to access s1.p, a misaligned address error occurs (as reported
65 * by the CUDA plugin). By padding the begin address down to a multiple of 8 and
66 * extending the size of the allocated chuck accordingly, the chuck on the
67 * device will start at 0x200 with the padding (4 bytes), then &s1.b=0x204 and
68 * &s1.p=0x208, as they should be to satisfy the alignment requirements.
69 */
70 static const int64_t Alignment = 8;
71
72 /// Map global data and execute pending ctors
initLibrary(DeviceTy & Device)73 static int initLibrary(DeviceTy &Device) {
74 /*
75 * Map global data
76 */
77 int32_t DeviceId = Device.DeviceID;
78 int Rc = OFFLOAD_SUCCESS;
79 bool SupportsEmptyImages = Device.RTL->supports_empty_images &&
80 Device.RTL->supports_empty_images() > 0;
81
82 std::lock_guard<decltype(Device.PendingGlobalsMtx)> LG(
83 Device.PendingGlobalsMtx);
84 {
85 std::lock_guard<decltype(PM->TrlTblMtx)> LG(PM->TrlTblMtx);
86 for (auto *HostEntriesBegin : PM->HostEntriesBeginRegistrationOrder) {
87 TranslationTable *TransTable =
88 &PM->HostEntriesBeginToTransTable[HostEntriesBegin];
89 if (TransTable->HostTable.EntriesBegin ==
90 TransTable->HostTable.EntriesEnd &&
91 !SupportsEmptyImages) {
92 // No host entry so no need to proceed
93 continue;
94 }
95
96 if (TransTable->TargetsTable[DeviceId] != 0) {
97 // Library entries have already been processed
98 continue;
99 }
100
101 // 1) get image.
102 assert(TransTable->TargetsImages.size() > (size_t)DeviceId &&
103 "Not expecting a device ID outside the table's bounds!");
104 __tgt_device_image *Img = TransTable->TargetsImages[DeviceId];
105 if (!Img) {
106 REPORT("No image loaded for device id %d.\n", DeviceId);
107 Rc = OFFLOAD_FAIL;
108 break;
109 }
110 // 2) load image into the target table.
111 __tgt_target_table *TargetTable = TransTable->TargetsTable[DeviceId] =
112 Device.loadBinary(Img);
113 // Unable to get table for this image: invalidate image and fail.
114 if (!TargetTable) {
115 REPORT("Unable to generate entries table for device id %d.\n",
116 DeviceId);
117 TransTable->TargetsImages[DeviceId] = 0;
118 Rc = OFFLOAD_FAIL;
119 break;
120 }
121
122 // Verify whether the two table sizes match.
123 size_t Hsize =
124 TransTable->HostTable.EntriesEnd - TransTable->HostTable.EntriesBegin;
125 size_t Tsize = TargetTable->EntriesEnd - TargetTable->EntriesBegin;
126
127 // Invalid image for these host entries!
128 if (Hsize != Tsize) {
129 REPORT(
130 "Host and Target tables mismatch for device id %d [%zx != %zx].\n",
131 DeviceId, Hsize, Tsize);
132 TransTable->TargetsImages[DeviceId] = 0;
133 TransTable->TargetsTable[DeviceId] = 0;
134 Rc = OFFLOAD_FAIL;
135 break;
136 }
137
138 DeviceTy::HDTTMapAccessorTy HDTTMap =
139 Device.HostDataToTargetMap.getExclusiveAccessor();
140
141 __tgt_target_table *HostTable = &TransTable->HostTable;
142 for (__tgt_offload_entry *CurrDeviceEntry = TargetTable->EntriesBegin,
143 *CurrHostEntry = HostTable->EntriesBegin,
144 *EntryDeviceEnd = TargetTable->EntriesEnd;
145 CurrDeviceEntry != EntryDeviceEnd;
146 CurrDeviceEntry++, CurrHostEntry++) {
147 if (CurrDeviceEntry->size != 0) {
148 // has data.
149 assert(CurrDeviceEntry->size == CurrHostEntry->size &&
150 "data size mismatch");
151
152 // Fortran may use multiple weak declarations for the same symbol,
153 // therefore we must allow for multiple weak symbols to be loaded from
154 // the fat binary. Treat these mappings as any other "regular"
155 // mapping. Add entry to map.
156 if (Device.getTgtPtrBegin(HDTTMap, CurrHostEntry->addr,
157 CurrHostEntry->size))
158 continue;
159
160 DP("Add mapping from host " DPxMOD " to device " DPxMOD
161 " with size %zu"
162 "\n",
163 DPxPTR(CurrHostEntry->addr), DPxPTR(CurrDeviceEntry->addr),
164 CurrDeviceEntry->size);
165 HDTTMap->emplace(new HostDataToTargetTy(
166 (uintptr_t)CurrHostEntry->addr /*HstPtrBase*/,
167 (uintptr_t)CurrHostEntry->addr /*HstPtrBegin*/,
168 (uintptr_t)CurrHostEntry->addr +
169 CurrHostEntry->size /*HstPtrEnd*/,
170 (uintptr_t)CurrDeviceEntry->addr /*TgtPtrBegin*/,
171 false /*UseHoldRefCount*/, CurrHostEntry->name,
172 true /*IsRefCountINF*/));
173 }
174 }
175 }
176 }
177
178 if (Rc != OFFLOAD_SUCCESS) {
179 return Rc;
180 }
181
182 /*
183 * Run ctors for static objects
184 */
185 if (!Device.PendingCtorsDtors.empty()) {
186 AsyncInfoTy AsyncInfo(Device);
187 // Call all ctors for all libraries registered so far
188 for (auto &Lib : Device.PendingCtorsDtors) {
189 if (!Lib.second.PendingCtors.empty()) {
190 DP("Has pending ctors... call now\n");
191 for (auto &Entry : Lib.second.PendingCtors) {
192 void *Ctor = Entry;
193 int Rc = target(nullptr, Device, Ctor, 0, nullptr, nullptr, nullptr,
194 nullptr, nullptr, nullptr, 1, 1, 0, true /*team*/,
195 AsyncInfo);
196 if (Rc != OFFLOAD_SUCCESS) {
197 REPORT("Running ctor " DPxMOD " failed.\n", DPxPTR(Ctor));
198 return OFFLOAD_FAIL;
199 }
200 }
201 // Clear the list to indicate that this device has been used
202 Lib.second.PendingCtors.clear();
203 DP("Done with pending ctors for lib " DPxMOD "\n", DPxPTR(Lib.first));
204 }
205 }
206 // All constructors have been issued, wait for them now.
207 if (AsyncInfo.synchronize() != OFFLOAD_SUCCESS)
208 return OFFLOAD_FAIL;
209 }
210 Device.HasPendingGlobals = false;
211
212 return OFFLOAD_SUCCESS;
213 }
214
handleTargetOutcome(bool Success,ident_t * Loc)215 void handleTargetOutcome(bool Success, ident_t *Loc) {
216 switch (PM->TargetOffloadPolicy) {
217 case tgt_disabled:
218 if (Success) {
219 FATAL_MESSAGE0(1, "expected no offloading while offloading is disabled");
220 }
221 break;
222 case tgt_default:
223 FATAL_MESSAGE0(1, "default offloading policy must be switched to "
224 "mandatory or disabled");
225 break;
226 case tgt_mandatory:
227 if (!Success) {
228 if (getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE)
229 for (auto &Device : PM->Devices)
230 dumpTargetPointerMappings(Loc, *Device);
231 else
232 FAILURE_MESSAGE("Consult https://openmp.llvm.org/design/Runtimes.html "
233 "for debugging options.\n");
234
235 SourceInfo Info(Loc);
236 if (Info.isAvailible())
237 fprintf(stderr, "%s:%d:%d: ", Info.getFilename(), Info.getLine(),
238 Info.getColumn());
239 else
240 FAILURE_MESSAGE("Source location information not present. Compile with "
241 "-g or -gline-tables-only.\n");
242 FATAL_MESSAGE0(
243 1, "failure of target construct while offloading is mandatory");
244 } else {
245 if (getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE)
246 for (auto &Device : PM->Devices)
247 dumpTargetPointerMappings(Loc, *Device);
248 }
249 break;
250 }
251 }
252
handleDefaultTargetOffload()253 static void handleDefaultTargetOffload() {
254 std::lock_guard<decltype(PM->TargetOffloadMtx)> LG(PM->TargetOffloadMtx);
255 if (PM->TargetOffloadPolicy == tgt_default) {
256 if (omp_get_num_devices() > 0) {
257 DP("Default TARGET OFFLOAD policy is now mandatory "
258 "(devices were found)\n");
259 PM->TargetOffloadPolicy = tgt_mandatory;
260 } else {
261 DP("Default TARGET OFFLOAD policy is now disabled "
262 "(no devices were found)\n");
263 PM->TargetOffloadPolicy = tgt_disabled;
264 }
265 }
266 }
267
isOffloadDisabled()268 static bool isOffloadDisabled() {
269 if (PM->TargetOffloadPolicy == tgt_default)
270 handleDefaultTargetOffload();
271 return PM->TargetOffloadPolicy == tgt_disabled;
272 }
273
274 // If offload is enabled, ensure that device DeviceID has been initialized,
275 // global ctors have been executed, and global data has been mapped.
276 //
277 // The return bool indicates if the offload is to the host device
278 // There are three possible results:
279 // - Return false if the taregt device is ready for offload
280 // - Return true without reporting a runtime error if offload is
281 // disabled, perhaps because the initial device was specified.
282 // - Report a runtime error and return true.
283 //
284 // If DeviceID == OFFLOAD_DEVICE_DEFAULT, set DeviceID to the default device.
285 // This step might be skipped if offload is disabled.
checkDeviceAndCtors(int64_t & DeviceID,ident_t * Loc)286 bool checkDeviceAndCtors(int64_t &DeviceID, ident_t *Loc) {
287 if (isOffloadDisabled()) {
288 DP("Offload is disabled\n");
289 return true;
290 }
291
292 if (DeviceID == OFFLOAD_DEVICE_DEFAULT) {
293 DeviceID = omp_get_default_device();
294 DP("Use default device id %" PRId64 "\n", DeviceID);
295 }
296
297 // Proposed behavior for OpenMP 5.2 in OpenMP spec github issue 2669.
298 if (omp_get_num_devices() == 0) {
299 DP("omp_get_num_devices() == 0 but offload is manadatory\n");
300 handleTargetOutcome(false, Loc);
301 return true;
302 }
303
304 if (DeviceID == omp_get_initial_device()) {
305 DP("Device is host (%" PRId64 "), returning as if offload is disabled\n",
306 DeviceID);
307 return true;
308 }
309
310 // Is device ready?
311 if (!deviceIsReady(DeviceID)) {
312 REPORT("Device %" PRId64 " is not ready.\n", DeviceID);
313 handleTargetOutcome(false, Loc);
314 return true;
315 }
316
317 // Get device info.
318 DeviceTy &Device = *PM->Devices[DeviceID];
319
320 // Check whether global data has been mapped for this device
321 bool HasPendingGlobals;
322 {
323 std::lock_guard<decltype(Device.PendingGlobalsMtx)> LG(
324 Device.PendingGlobalsMtx);
325 HasPendingGlobals = Device.HasPendingGlobals;
326 }
327 if (HasPendingGlobals && initLibrary(Device) != OFFLOAD_SUCCESS) {
328 REPORT("Failed to init globals on device %" PRId64 "\n", DeviceID);
329 handleTargetOutcome(false, Loc);
330 return true;
331 }
332
333 return false;
334 }
335
getParentIndex(int64_t Type)336 static int32_t getParentIndex(int64_t Type) {
337 return ((Type & OMP_TGT_MAPTYPE_MEMBER_OF) >> 48) - 1;
338 }
339
targetAllocExplicit(size_t Size,int DeviceNum,int Kind,const char * Name)340 void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind,
341 const char *Name) {
342 TIMESCOPE();
343 DP("Call to %s for device %d requesting %zu bytes\n", Name, DeviceNum, Size);
344
345 if (Size <= 0) {
346 DP("Call to %s with non-positive length\n", Name);
347 return NULL;
348 }
349
350 void *Rc = NULL;
351
352 if (DeviceNum == omp_get_initial_device()) {
353 Rc = malloc(Size);
354 DP("%s returns host ptr " DPxMOD "\n", Name, DPxPTR(Rc));
355 return Rc;
356 }
357
358 if (!deviceIsReady(DeviceNum)) {
359 DP("%s returns NULL ptr\n", Name);
360 return NULL;
361 }
362
363 DeviceTy &Device = *PM->Devices[DeviceNum];
364 Rc = Device.allocData(Size, nullptr, Kind);
365 DP("%s returns device ptr " DPxMOD "\n", Name, DPxPTR(Rc));
366 return Rc;
367 }
368
369 /// Call the user-defined mapper function followed by the appropriate
370 // targetData* function (targetData{Begin,End,Update}).
targetDataMapper(ident_t * Loc,DeviceTy & Device,void * ArgBase,void * Arg,int64_t ArgSize,int64_t ArgType,map_var_info_t ArgNames,void * ArgMapper,AsyncInfoTy & AsyncInfo,TargetDataFuncPtrTy TargetDataFunction)371 int targetDataMapper(ident_t *Loc, DeviceTy &Device, void *ArgBase, void *Arg,
372 int64_t ArgSize, int64_t ArgType, map_var_info_t ArgNames,
373 void *ArgMapper, AsyncInfoTy &AsyncInfo,
374 TargetDataFuncPtrTy TargetDataFunction) {
375 TIMESCOPE_WITH_IDENT(Loc);
376 DP("Calling the mapper function " DPxMOD "\n", DPxPTR(ArgMapper));
377
378 // The mapper function fills up Components.
379 MapperComponentsTy MapperComponents;
380 MapperFuncPtrTy MapperFuncPtr = (MapperFuncPtrTy)(ArgMapper);
381 (*MapperFuncPtr)((void *)&MapperComponents, ArgBase, Arg, ArgSize, ArgType,
382 ArgNames);
383
384 // Construct new arrays for args_base, args, arg_sizes and arg_types
385 // using the information in MapperComponents and call the corresponding
386 // targetData* function using these new arrays.
387 std::vector<void *> MapperArgsBase(MapperComponents.Components.size());
388 std::vector<void *> MapperArgs(MapperComponents.Components.size());
389 std::vector<int64_t> MapperArgSizes(MapperComponents.Components.size());
390 std::vector<int64_t> MapperArgTypes(MapperComponents.Components.size());
391 std::vector<void *> MapperArgNames(MapperComponents.Components.size());
392
393 for (unsigned I = 0, E = MapperComponents.Components.size(); I < E; ++I) {
394 auto &C = MapperComponents.Components[I];
395 MapperArgsBase[I] = C.Base;
396 MapperArgs[I] = C.Begin;
397 MapperArgSizes[I] = C.Size;
398 MapperArgTypes[I] = C.Type;
399 MapperArgNames[I] = C.Name;
400 }
401
402 int Rc = TargetDataFunction(Loc, Device, MapperComponents.Components.size(),
403 MapperArgsBase.data(), MapperArgs.data(),
404 MapperArgSizes.data(), MapperArgTypes.data(),
405 MapperArgNames.data(), /*arg_mappers*/ nullptr,
406 AsyncInfo, /*FromMapper=*/true);
407
408 return Rc;
409 }
410
411 /// Internal function to do the mapping and transfer the data to the device
targetDataBegin(ident_t * Loc,DeviceTy & Device,int32_t ArgNum,void ** ArgsBase,void ** Args,int64_t * ArgSizes,int64_t * ArgTypes,map_var_info_t * ArgNames,void ** ArgMappers,AsyncInfoTy & AsyncInfo,bool FromMapper)412 int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
413 void **ArgsBase, void **Args, int64_t *ArgSizes,
414 int64_t *ArgTypes, map_var_info_t *ArgNames,
415 void **ArgMappers, AsyncInfoTy &AsyncInfo,
416 bool FromMapper) {
417 // process each input.
418 for (int32_t I = 0; I < ArgNum; ++I) {
419 // Ignore private variables and arrays - there is no mapping for them.
420 if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
421 (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
422 continue;
423
424 if (ArgMappers && ArgMappers[I]) {
425 // Instead of executing the regular path of targetDataBegin, call the
426 // targetDataMapper variant which will call targetDataBegin again
427 // with new arguments.
428 DP("Calling targetDataMapper for the %dth argument\n", I);
429
430 map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
431 int Rc = targetDataMapper(Loc, Device, ArgsBase[I], Args[I], ArgSizes[I],
432 ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
433 targetDataBegin);
434
435 if (Rc != OFFLOAD_SUCCESS) {
436 REPORT("Call to targetDataBegin via targetDataMapper for custom mapper"
437 " failed.\n");
438 return OFFLOAD_FAIL;
439 }
440
441 // Skip the rest of this function, continue to the next argument.
442 continue;
443 }
444
445 void *HstPtrBegin = Args[I];
446 void *HstPtrBase = ArgsBase[I];
447 int64_t DataSize = ArgSizes[I];
448 map_var_info_t HstPtrName = (!ArgNames) ? nullptr : ArgNames[I];
449
450 // Adjust for proper alignment if this is a combined entry (for structs).
451 // Look at the next argument - if that is MEMBER_OF this one, then this one
452 // is a combined entry.
453 int64_t Padding = 0;
454 const int NextI = I + 1;
455 if (getParentIndex(ArgTypes[I]) < 0 && NextI < ArgNum &&
456 getParentIndex(ArgTypes[NextI]) == I) {
457 Padding = (int64_t)HstPtrBegin % Alignment;
458 if (Padding) {
459 DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD
460 "\n",
461 Padding, DPxPTR(HstPtrBegin));
462 HstPtrBegin = (char *)HstPtrBegin - Padding;
463 DataSize += Padding;
464 }
465 }
466
467 // Address of pointer on the host and device, respectively.
468 void *PointerHstPtrBegin, *PointerTgtPtrBegin;
469 TargetPointerResultTy PointerTpr;
470 bool IsHostPtr = false;
471 bool IsImplicit = ArgTypes[I] & OMP_TGT_MAPTYPE_IMPLICIT;
472 // Force the creation of a device side copy of the data when:
473 // a close map modifier was associated with a map that contained a to.
474 bool HasCloseModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_CLOSE;
475 bool HasPresentModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_PRESENT;
476 bool HasHoldModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_OMPX_HOLD;
477 // UpdateRef is based on MEMBER_OF instead of TARGET_PARAM because if we
478 // have reached this point via __tgt_target_data_begin and not __tgt_target
479 // then no argument is marked as TARGET_PARAM ("omp target data map" is not
480 // associated with a target region, so there are no target parameters). This
481 // may be considered a hack, we could revise the scheme in the future.
482 bool UpdateRef =
483 !(ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) && !(FromMapper && I == 0);
484 if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) {
485 DP("Has a pointer entry: \n");
486 // Base is address of pointer.
487 //
488 // Usually, the pointer is already allocated by this time. For example:
489 //
490 // #pragma omp target map(s.p[0:N])
491 //
492 // The map entry for s comes first, and the PTR_AND_OBJ entry comes
493 // afterward, so the pointer is already allocated by the time the
494 // PTR_AND_OBJ entry is handled below, and PointerTgtPtrBegin is thus
495 // non-null. However, "declare target link" can produce a PTR_AND_OBJ
496 // entry for a global that might not already be allocated by the time the
497 // PTR_AND_OBJ entry is handled below, and so the allocation might fail
498 // when HasPresentModifier.
499 PointerTpr = Device.getTargetPointer(
500 HstPtrBase, HstPtrBase, sizeof(void *), /*HstPtrName=*/nullptr,
501 /*HasFlagTo=*/false, /*HasFlagAlways=*/false, IsImplicit, UpdateRef,
502 HasCloseModifier, HasPresentModifier, HasHoldModifier, AsyncInfo);
503 PointerTgtPtrBegin = PointerTpr.TargetPointer;
504 IsHostPtr = PointerTpr.Flags.IsHostPointer;
505 if (!PointerTgtPtrBegin) {
506 REPORT("Call to getTargetPointer returned null pointer (%s).\n",
507 HasPresentModifier ? "'present' map type modifier"
508 : "device failure or illegal mapping");
509 return OFFLOAD_FAIL;
510 }
511 DP("There are %zu bytes allocated at target address " DPxMOD " - is%s new"
512 "\n",
513 sizeof(void *), DPxPTR(PointerTgtPtrBegin),
514 (PointerTpr.Flags.IsNewEntry ? "" : " not"));
515 PointerHstPtrBegin = HstPtrBase;
516 // modify current entry.
517 HstPtrBase = *(void **)HstPtrBase;
518 // No need to update pointee ref count for the first element of the
519 // subelement that comes from mapper.
520 UpdateRef =
521 (!FromMapper || I != 0); // subsequently update ref count of pointee
522 }
523
524 const bool HasFlagTo = ArgTypes[I] & OMP_TGT_MAPTYPE_TO;
525 const bool HasFlagAlways = ArgTypes[I] & OMP_TGT_MAPTYPE_ALWAYS;
526 auto TPR = Device.getTargetPointer(
527 HstPtrBegin, HstPtrBase, DataSize, HstPtrName, HasFlagTo, HasFlagAlways,
528 IsImplicit, UpdateRef, HasCloseModifier, HasPresentModifier,
529 HasHoldModifier, AsyncInfo);
530 void *TgtPtrBegin = TPR.TargetPointer;
531 IsHostPtr = TPR.Flags.IsHostPointer;
532 // If data_size==0, then the argument could be a zero-length pointer to
533 // NULL, so getOrAlloc() returning NULL is not an error.
534 if (!TgtPtrBegin && (DataSize || HasPresentModifier)) {
535 REPORT("Call to getTargetPointer returned null pointer (%s).\n",
536 HasPresentModifier ? "'present' map type modifier"
537 : "device failure or illegal mapping");
538 return OFFLOAD_FAIL;
539 }
540 DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
541 " - is%s new\n",
542 DataSize, DPxPTR(TgtPtrBegin), (TPR.Flags.IsNewEntry ? "" : " not"));
543
544 if (ArgTypes[I] & OMP_TGT_MAPTYPE_RETURN_PARAM) {
545 uintptr_t Delta = (uintptr_t)HstPtrBegin - (uintptr_t)HstPtrBase;
546 void *TgtPtrBase = (void *)((uintptr_t)TgtPtrBegin - Delta);
547 DP("Returning device pointer " DPxMOD "\n", DPxPTR(TgtPtrBase));
548 ArgsBase[I] = TgtPtrBase;
549 }
550
551 if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ && !IsHostPtr) {
552 // Check whether we need to update the pointer on the device
553 bool UpdateDevPtr = false;
554
555 uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
556 void *ExpectedTgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta);
557
558 Device.ShadowMtx.lock();
559 auto Entry = Device.ShadowPtrMap.find(PointerHstPtrBegin);
560 // If this pointer is not in the map we need to insert it. If the map
561 // contains a stale entry, we need to update it (e.g. if the pointee was
562 // deallocated and later on is reallocated at another device address). The
563 // latter scenario is the subject of LIT test env/base_ptr_ref_count.c. An
564 // entry is removed from ShadowPtrMap only when the PTR of a PTR_AND_OBJ
565 // pair is deallocated, not when the OBJ is deallocated. In
566 // env/base_ptr_ref_count.c the PTR is a global "declare target" pointer,
567 // so it stays in the map for the lifetime of the application. When the
568 // OBJ is deallocated and later on allocated again (at a different device
569 // address), ShadowPtrMap still contains an entry for Pointer_HstPtrBegin
570 // which is stale, pointing to the old ExpectedTgtPtrBase of the OBJ.
571 if (Entry == Device.ShadowPtrMap.end() ||
572 Entry->second.TgtPtrVal != ExpectedTgtPtrBase) {
573 // create or update shadow pointers for this entry
574 Device.ShadowPtrMap[PointerHstPtrBegin] = {
575 HstPtrBase, PointerTgtPtrBegin, ExpectedTgtPtrBase};
576 PointerTpr.Entry->setMayContainAttachedPointers();
577 UpdateDevPtr = true;
578 }
579
580 if (UpdateDevPtr) {
581 std::lock_guard<decltype(*PointerTpr.Entry)> LG(*PointerTpr.Entry);
582 Device.ShadowMtx.unlock();
583
584 DP("Update pointer (" DPxMOD ") -> [" DPxMOD "]\n",
585 DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));
586
587 void *&TgtPtrBase = AsyncInfo.getVoidPtrLocation();
588 TgtPtrBase = ExpectedTgtPtrBase;
589
590 int Ret = Device.submitData(PointerTgtPtrBegin, &TgtPtrBase,
591 sizeof(void *), AsyncInfo);
592 if (Ret != OFFLOAD_SUCCESS) {
593 REPORT("Copying data to device failed.\n");
594 return OFFLOAD_FAIL;
595 }
596 if (PointerTpr.Entry->addEventIfNecessary(Device, AsyncInfo) !=
597 OFFLOAD_SUCCESS)
598 return OFFLOAD_FAIL;
599 } else
600 Device.ShadowMtx.unlock();
601 }
602 }
603
604 return OFFLOAD_SUCCESS;
605 }
606
607 namespace {
608 /// This structure contains information to deallocate a target pointer, aka.
609 /// used to fix up the shadow map and potentially delete the entry from the
610 /// mapping table via \p DeviceTy::deallocTgtPtr.
611 struct PostProcessingInfo {
612 /// Host pointer used to look up into the map table
613 void *HstPtrBegin;
614
615 /// Size of the data
616 int64_t DataSize;
617
618 /// The mapping type (bitfield).
619 int64_t ArgType;
620
621 /// The target pointer information.
622 TargetPointerResultTy TPR;
623
624 /// Are we expecting to delete this entry or not. Even if set, we might not
625 /// delete the entry if another thread reused the entry in the meantime.
626 bool DelEntry;
627
PostProcessingInfo__anon039aff710111::PostProcessingInfo628 PostProcessingInfo(void *HstPtr, int64_t Size, int64_t ArgType, bool DelEntry,
629 TargetPointerResultTy TPR)
630 : HstPtrBegin(HstPtr), DataSize(Size), ArgType(ArgType), TPR(TPR),
631 DelEntry(DelEntry) {}
632 };
633
634 /// Apply \p CB to the shadow map pointer entries in the range \p Begin, to
635 /// \p Begin + \p Size. \p CB is called with a locked shadow pointer map and the
636 /// passed iterator can be updated. If the callback returns OFFLOAD_FAIL the
637 /// rest of the map is not checked anymore.
638 template <typename CBTy>
applyToShadowMapEntries(DeviceTy & Device,CBTy CB,void * Begin,uintptr_t Size,const TargetPointerResultTy & TPR)639 static void applyToShadowMapEntries(DeviceTy &Device, CBTy CB, void *Begin,
640 uintptr_t Size,
641 const TargetPointerResultTy &TPR) {
642 // If we have an object that is too small to hold a pointer subobject, no need
643 // to do any checking.
644 if (Size < sizeof(void *))
645 return;
646
647 // If the map entry for the object was never marked as containing attached
648 // pointers, no need to do any checking.
649 if (!TPR.Entry || !TPR.Entry->getMayContainAttachedPointers())
650 return;
651
652 uintptr_t LB = (uintptr_t)Begin;
653 uintptr_t UB = LB + Size;
654 // Now we are looking into the shadow map so we need to lock it.
655 std::lock_guard<decltype(Device.ShadowMtx)> LG(Device.ShadowMtx);
656 for (ShadowPtrListTy::iterator Itr = Device.ShadowPtrMap.begin();
657 Itr != Device.ShadowPtrMap.end();) {
658 uintptr_t ShadowHstPtrAddr = (uintptr_t)Itr->first;
659
660 // An STL map is sorted on its keys; use this property
661 // to quickly determine when to break out of the loop.
662 if (ShadowHstPtrAddr < LB) {
663 ++Itr;
664 continue;
665 }
666 if (ShadowHstPtrAddr >= UB)
667 break;
668
669 if (CB(Itr) == OFFLOAD_FAIL)
670 break;
671 }
672 }
673
674 } // namespace
675
676 /// Internal function to undo the mapping and retrieve the data from the device.
targetDataEnd(ident_t * Loc,DeviceTy & Device,int32_t ArgNum,void ** ArgBases,void ** Args,int64_t * ArgSizes,int64_t * ArgTypes,map_var_info_t * ArgNames,void ** ArgMappers,AsyncInfoTy & AsyncInfo,bool FromMapper)677 int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
678 void **ArgBases, void **Args, int64_t *ArgSizes,
679 int64_t *ArgTypes, map_var_info_t *ArgNames,
680 void **ArgMappers, AsyncInfoTy &AsyncInfo, bool FromMapper) {
681 int Ret;
682 std::vector<PostProcessingInfo> PostProcessingPtrs;
683 void *FromMapperBase = nullptr;
684 // process each input.
685 for (int32_t I = ArgNum - 1; I >= 0; --I) {
686 // Ignore private variables and arrays - there is no mapping for them.
687 // Also, ignore the use_device_ptr directive, it has no effect here.
688 if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
689 (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
690 continue;
691
692 if (ArgMappers && ArgMappers[I]) {
693 // Instead of executing the regular path of targetDataEnd, call the
694 // targetDataMapper variant which will call targetDataEnd again
695 // with new arguments.
696 DP("Calling targetDataMapper for the %dth argument\n", I);
697
698 map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
699 Ret = targetDataMapper(Loc, Device, ArgBases[I], Args[I], ArgSizes[I],
700 ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
701 targetDataEnd);
702
703 if (Ret != OFFLOAD_SUCCESS) {
704 REPORT("Call to targetDataEnd via targetDataMapper for custom mapper"
705 " failed.\n");
706 return OFFLOAD_FAIL;
707 }
708
709 // Skip the rest of this function, continue to the next argument.
710 continue;
711 }
712
713 void *HstPtrBegin = Args[I];
714 int64_t DataSize = ArgSizes[I];
715 // Adjust for proper alignment if this is a combined entry (for structs).
716 // Look at the next argument - if that is MEMBER_OF this one, then this one
717 // is a combined entry.
718 const int NextI = I + 1;
719 if (getParentIndex(ArgTypes[I]) < 0 && NextI < ArgNum &&
720 getParentIndex(ArgTypes[NextI]) == I) {
721 int64_t Padding = (int64_t)HstPtrBegin % Alignment;
722 if (Padding) {
723 DP("Using a Padding of %" PRId64 " bytes for begin address " DPxMOD
724 "\n",
725 Padding, DPxPTR(HstPtrBegin));
726 HstPtrBegin = (char *)HstPtrBegin - Padding;
727 DataSize += Padding;
728 }
729 }
730
731 bool IsLast, IsHostPtr;
732 bool IsImplicit = ArgTypes[I] & OMP_TGT_MAPTYPE_IMPLICIT;
733 bool UpdateRef = (!(ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) ||
734 (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) &&
735 !(FromMapper && I == 0);
736 bool ForceDelete = ArgTypes[I] & OMP_TGT_MAPTYPE_DELETE;
737 bool HasPresentModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_PRESENT;
738 bool HasHoldModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_OMPX_HOLD;
739
740 // If PTR_AND_OBJ, HstPtrBegin is address of pointee
741 TargetPointerResultTy TPR = Device.getTgtPtrBegin(
742 HstPtrBegin, DataSize, IsLast, UpdateRef, HasHoldModifier, IsHostPtr,
743 !IsImplicit, ForceDelete);
744 void *TgtPtrBegin = TPR.TargetPointer;
745 if (!TgtPtrBegin && (DataSize || HasPresentModifier)) {
746 DP("Mapping does not exist (%s)\n",
747 (HasPresentModifier ? "'present' map type modifier" : "ignored"));
748 if (HasPresentModifier) {
749 // OpenMP 5.1, sec. 2.21.7.1 "map Clause", p. 350 L10-13:
750 // "If a map clause appears on a target, target data, target enter data
751 // or target exit data construct with a present map-type-modifier then
752 // on entry to the region if the corresponding list item does not appear
753 // in the device data environment then an error occurs and the program
754 // terminates."
755 //
756 // This should be an error upon entering an "omp target exit data". It
757 // should not be an error upon exiting an "omp target data" or "omp
758 // target". For "omp target data", Clang thus doesn't include present
759 // modifiers for end calls. For "omp target", we have not found a valid
760 // OpenMP program for which the error matters: it appears that, if a
761 // program can guarantee that data is present at the beginning of an
762 // "omp target" region so that there's no error there, that data is also
763 // guaranteed to be present at the end.
764 MESSAGE("device mapping required by 'present' map type modifier does "
765 "not exist for host address " DPxMOD " (%" PRId64 " bytes)",
766 DPxPTR(HstPtrBegin), DataSize);
767 return OFFLOAD_FAIL;
768 }
769 } else {
770 DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
771 " - is%s last\n",
772 DataSize, DPxPTR(TgtPtrBegin), (IsLast ? "" : " not"));
773 }
774
775 // OpenMP 5.1, sec. 2.21.7.1 "map Clause", p. 351 L14-16:
776 // "If the map clause appears on a target, target data, or target exit data
777 // construct and a corresponding list item of the original list item is not
778 // present in the device data environment on exit from the region then the
779 // list item is ignored."
780 if (!TgtPtrBegin)
781 continue;
782
783 bool DelEntry = IsLast;
784
785 // If the last element from the mapper (for end transfer args comes in
786 // reverse order), do not remove the partial entry, the parent struct still
787 // exists.
788 if ((ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
789 !(ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) {
790 DelEntry = false; // protect parent struct from being deallocated
791 }
792
793 if ((ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) || DelEntry) {
794 // Move data back to the host
795 if (ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) {
796 bool Always = ArgTypes[I] & OMP_TGT_MAPTYPE_ALWAYS;
797 if ((Always || IsLast) && !IsHostPtr) {
798 DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
799 DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
800
801 std::lock_guard<decltype(*TPR.Entry)> LG(*TPR.Entry);
802 // Wait for any previous transfer if an event is present.
803 if (void *Event = TPR.Entry->getEvent()) {
804 if (Device.waitEvent(Event, AsyncInfo) != OFFLOAD_SUCCESS) {
805 REPORT("Failed to wait for event " DPxMOD ".\n", DPxPTR(Event));
806 return OFFLOAD_FAIL;
807 }
808 }
809
810 Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, DataSize,
811 AsyncInfo);
812 if (Ret != OFFLOAD_SUCCESS) {
813 REPORT("Copying data from device failed.\n");
814 return OFFLOAD_FAIL;
815 }
816
817 // As we are expecting to delete the entry the d2h copy might race
818 // with another one that also tries to delete the entry. This happens
819 // as the entry can be reused and the reuse might happen after the
820 // copy-back was issued but before it completed. Since the reuse might
821 // also copy-back a value we would race.
822 if (IsLast) {
823 if (TPR.Entry->addEventIfNecessary(Device, AsyncInfo) !=
824 OFFLOAD_SUCCESS)
825 return OFFLOAD_FAIL;
826 }
827 }
828 }
829 if (DelEntry && FromMapper && I == 0) {
830 DelEntry = false;
831 FromMapperBase = HstPtrBegin;
832 }
833
834 // Add pointer to the buffer for post-synchronize processing.
835 PostProcessingPtrs.emplace_back(HstPtrBegin, DataSize, ArgTypes[I],
836 DelEntry && !IsHostPtr, TPR);
837 }
838 }
839
840 // TODO: We should not synchronize here but pass the AsyncInfo object to the
841 // allocate/deallocate device APIs.
842 //
843 // We need to synchronize before deallocating data.
844 Ret = AsyncInfo.synchronize();
845 if (Ret != OFFLOAD_SUCCESS)
846 return OFFLOAD_FAIL;
847
848 // Deallocate target pointer
849 for (PostProcessingInfo &Info : PostProcessingPtrs) {
850 // If we marked the entry to be deleted we need to verify no other thread
851 // reused it by now. If deletion is still supposed to happen by this thread
852 // LR will be set and exclusive access to the HDTT map will avoid another
853 // thread reusing the entry now. Note that we do not request (exclusive)
854 // access to the HDTT map if Info.DelEntry is not set.
855 LookupResult LR;
856 DeviceTy::HDTTMapAccessorTy HDTTMap =
857 Device.HostDataToTargetMap.getExclusiveAccessor(!Info.DelEntry);
858
859 if (Info.DelEntry) {
860 LR = Device.lookupMapping(HDTTMap, Info.HstPtrBegin, Info.DataSize);
861 if (LR.Entry->getTotalRefCount() != 0 ||
862 LR.Entry->getDeleteThreadId() != std::this_thread::get_id()) {
863 // The thread is not in charge of deletion anymore. Give up access to
864 // the HDTT map and unset the deletion flag.
865 HDTTMap.destroy();
866 Info.DelEntry = false;
867 }
868 }
869
870 // If we copied back to the host a struct/array containing pointers, we
871 // need to restore the original host pointer values from their shadow
872 // copies. If the struct is going to be deallocated, remove any remaining
873 // shadow pointer entries for this struct.
874 auto CB = [&](ShadowPtrListTy::iterator &Itr) {
875 // If we copied the struct to the host, we need to restore the pointer.
876 if (Info.ArgType & OMP_TGT_MAPTYPE_FROM) {
877 void **ShadowHstPtrAddr = (void **)Itr->first;
878 *ShadowHstPtrAddr = Itr->second.HstPtrVal;
879 DP("Restoring original host pointer value " DPxMOD " for host "
880 "pointer " DPxMOD "\n",
881 DPxPTR(Itr->second.HstPtrVal), DPxPTR(ShadowHstPtrAddr));
882 }
883 // If the struct is to be deallocated, remove the shadow entry.
884 if (Info.DelEntry) {
885 DP("Removing shadow pointer " DPxMOD "\n", DPxPTR((void **)Itr->first));
886 Itr = Device.ShadowPtrMap.erase(Itr);
887 } else {
888 ++Itr;
889 }
890 return OFFLOAD_SUCCESS;
891 };
892 applyToShadowMapEntries(Device, CB, Info.HstPtrBegin, Info.DataSize,
893 Info.TPR);
894
895 // If we are deleting the entry the DataMapMtx is locked and we own the
896 // entry.
897 if (Info.DelEntry) {
898 if (!FromMapperBase || FromMapperBase != Info.HstPtrBegin)
899 Ret = Device.deallocTgtPtr(HDTTMap, LR, Info.DataSize);
900
901 if (Ret != OFFLOAD_SUCCESS) {
902 REPORT("Deallocating data from device failed.\n");
903 break;
904 }
905 }
906 }
907
908 return Ret;
909 }
910
targetDataContiguous(ident_t * Loc,DeviceTy & Device,void * ArgsBase,void * HstPtrBegin,int64_t ArgSize,int64_t ArgType,AsyncInfoTy & AsyncInfo)911 static int targetDataContiguous(ident_t *Loc, DeviceTy &Device, void *ArgsBase,
912 void *HstPtrBegin, int64_t ArgSize,
913 int64_t ArgType, AsyncInfoTy &AsyncInfo) {
914 TIMESCOPE_WITH_IDENT(Loc);
915 bool IsLast, IsHostPtr;
916 TargetPointerResultTy TPR = Device.getTgtPtrBegin(
917 HstPtrBegin, ArgSize, IsLast, /*UpdateRefCount=*/false,
918 /*UseHoldRefCount=*/false, IsHostPtr, /*MustContain=*/true);
919 void *TgtPtrBegin = TPR.TargetPointer;
920 if (!TgtPtrBegin) {
921 DP("hst data:" DPxMOD " not found, becomes a noop\n", DPxPTR(HstPtrBegin));
922 if (ArgType & OMP_TGT_MAPTYPE_PRESENT) {
923 MESSAGE("device mapping required by 'present' motion modifier does not "
924 "exist for host address " DPxMOD " (%" PRId64 " bytes)",
925 DPxPTR(HstPtrBegin), ArgSize);
926 return OFFLOAD_FAIL;
927 }
928 return OFFLOAD_SUCCESS;
929 }
930
931 if (IsHostPtr) {
932 DP("hst data:" DPxMOD " unified and shared, becomes a noop\n",
933 DPxPTR(HstPtrBegin));
934 return OFFLOAD_SUCCESS;
935 }
936
937 if (ArgType & OMP_TGT_MAPTYPE_FROM) {
938 DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
939 ArgSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
940 int Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, ArgSize, AsyncInfo);
941 if (Ret != OFFLOAD_SUCCESS) {
942 REPORT("Copying data from device failed.\n");
943 return OFFLOAD_FAIL;
944 }
945
946 auto CB = [&](ShadowPtrListTy::iterator &Itr) {
947 void **ShadowHstPtrAddr = (void **)Itr->first;
948 // Wait for device-to-host memcopies for whole struct to complete,
949 // before restoring the correct host pointer.
950 if (AsyncInfo.synchronize() != OFFLOAD_SUCCESS)
951 return OFFLOAD_FAIL;
952 *ShadowHstPtrAddr = Itr->second.HstPtrVal;
953 DP("Restoring original host pointer value " DPxMOD
954 " for host pointer " DPxMOD "\n",
955 DPxPTR(Itr->second.HstPtrVal), DPxPTR(ShadowHstPtrAddr));
956 ++Itr;
957 return OFFLOAD_SUCCESS;
958 };
959 applyToShadowMapEntries(Device, CB, HstPtrBegin, ArgSize, TPR);
960 }
961
962 if (ArgType & OMP_TGT_MAPTYPE_TO) {
963 DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
964 ArgSize, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
965 int Ret = Device.submitData(TgtPtrBegin, HstPtrBegin, ArgSize, AsyncInfo);
966 if (Ret != OFFLOAD_SUCCESS) {
967 REPORT("Copying data to device failed.\n");
968 return OFFLOAD_FAIL;
969 }
970
971 auto CB = [&](ShadowPtrListTy::iterator &Itr) {
972 DP("Restoring original target pointer value " DPxMOD " for target "
973 "pointer " DPxMOD "\n",
974 DPxPTR(Itr->second.TgtPtrVal), DPxPTR(Itr->second.TgtPtrAddr));
975 Ret = Device.submitData(Itr->second.TgtPtrAddr, &Itr->second.TgtPtrVal,
976 sizeof(void *), AsyncInfo);
977 if (Ret != OFFLOAD_SUCCESS)
978 REPORT("Copying data to device failed.\n");
979 ++Itr;
980 return Ret;
981 };
982 applyToShadowMapEntries(Device, CB, HstPtrBegin, ArgSize, TPR);
983 }
984 return OFFLOAD_SUCCESS;
985 }
986
targetDataNonContiguous(ident_t * Loc,DeviceTy & Device,void * ArgsBase,__tgt_target_non_contig * NonContig,uint64_t Size,int64_t ArgType,int CurrentDim,int DimSize,uint64_t Offset,AsyncInfoTy & AsyncInfo)987 static int targetDataNonContiguous(ident_t *Loc, DeviceTy &Device,
988 void *ArgsBase,
989 __tgt_target_non_contig *NonContig,
990 uint64_t Size, int64_t ArgType,
991 int CurrentDim, int DimSize, uint64_t Offset,
992 AsyncInfoTy &AsyncInfo) {
993 TIMESCOPE_WITH_IDENT(Loc);
994 int Ret = OFFLOAD_SUCCESS;
995 if (CurrentDim < DimSize) {
996 for (unsigned int I = 0; I < NonContig[CurrentDim].Count; ++I) {
997 uint64_t CurOffset =
998 (NonContig[CurrentDim].Offset + I) * NonContig[CurrentDim].Stride;
999 // we only need to transfer the first element for the last dimension
1000 // since we've already got a contiguous piece.
1001 if (CurrentDim != DimSize - 1 || I == 0) {
1002 Ret = targetDataNonContiguous(Loc, Device, ArgsBase, NonContig, Size,
1003 ArgType, CurrentDim + 1, DimSize,
1004 Offset + CurOffset, AsyncInfo);
1005 // Stop the whole process if any contiguous piece returns anything
1006 // other than OFFLOAD_SUCCESS.
1007 if (Ret != OFFLOAD_SUCCESS)
1008 return Ret;
1009 }
1010 }
1011 } else {
1012 char *Ptr = (char *)ArgsBase + Offset;
1013 DP("Transfer of non-contiguous : host ptr " DPxMOD " offset %" PRIu64
1014 " len %" PRIu64 "\n",
1015 DPxPTR(Ptr), Offset, Size);
1016 Ret = targetDataContiguous(Loc, Device, ArgsBase, Ptr, Size, ArgType,
1017 AsyncInfo);
1018 }
1019 return Ret;
1020 }
1021
getNonContigMergedDimension(__tgt_target_non_contig * NonContig,int32_t DimSize)1022 static int getNonContigMergedDimension(__tgt_target_non_contig *NonContig,
1023 int32_t DimSize) {
1024 int RemovedDim = 0;
1025 for (int I = DimSize - 1; I > 0; --I) {
1026 if (NonContig[I].Count * NonContig[I].Stride == NonContig[I - 1].Stride)
1027 RemovedDim++;
1028 }
1029 return RemovedDim;
1030 }
1031
1032 /// Internal function to pass data to/from the target.
targetDataUpdate(ident_t * Loc,DeviceTy & Device,int32_t ArgNum,void ** ArgsBase,void ** Args,int64_t * ArgSizes,int64_t * ArgTypes,map_var_info_t * ArgNames,void ** ArgMappers,AsyncInfoTy & AsyncInfo,bool)1033 int targetDataUpdate(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
1034 void **ArgsBase, void **Args, int64_t *ArgSizes,
1035 int64_t *ArgTypes, map_var_info_t *ArgNames,
1036 void **ArgMappers, AsyncInfoTy &AsyncInfo, bool) {
1037 // process each input.
1038 for (int32_t I = 0; I < ArgNum; ++I) {
1039 if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
1040 (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
1041 continue;
1042
1043 if (ArgMappers && ArgMappers[I]) {
1044 // Instead of executing the regular path of targetDataUpdate, call the
1045 // targetDataMapper variant which will call targetDataUpdate again
1046 // with new arguments.
1047 DP("Calling targetDataMapper for the %dth argument\n", I);
1048
1049 map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
1050 int Ret = targetDataMapper(Loc, Device, ArgsBase[I], Args[I], ArgSizes[I],
1051 ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
1052 targetDataUpdate);
1053
1054 if (Ret != OFFLOAD_SUCCESS) {
1055 REPORT("Call to targetDataUpdate via targetDataMapper for custom mapper"
1056 " failed.\n");
1057 return OFFLOAD_FAIL;
1058 }
1059
1060 // Skip the rest of this function, continue to the next argument.
1061 continue;
1062 }
1063
1064 int Ret = OFFLOAD_SUCCESS;
1065
1066 if (ArgTypes[I] & OMP_TGT_MAPTYPE_NON_CONTIG) {
1067 __tgt_target_non_contig *NonContig = (__tgt_target_non_contig *)Args[I];
1068 int32_t DimSize = ArgSizes[I];
1069 uint64_t Size =
1070 NonContig[DimSize - 1].Count * NonContig[DimSize - 1].Stride;
1071 int32_t MergedDim = getNonContigMergedDimension(NonContig, DimSize);
1072 Ret = targetDataNonContiguous(
1073 Loc, Device, ArgsBase[I], NonContig, Size, ArgTypes[I],
1074 /*current_dim=*/0, DimSize - MergedDim, /*offset=*/0, AsyncInfo);
1075 } else {
1076 Ret = targetDataContiguous(Loc, Device, ArgsBase[I], Args[I], ArgSizes[I],
1077 ArgTypes[I], AsyncInfo);
1078 }
1079 if (Ret == OFFLOAD_FAIL)
1080 return OFFLOAD_FAIL;
1081 }
1082 return OFFLOAD_SUCCESS;
1083 }
1084
1085 static const unsigned LambdaMapping = OMP_TGT_MAPTYPE_PTR_AND_OBJ |
1086 OMP_TGT_MAPTYPE_LITERAL |
1087 OMP_TGT_MAPTYPE_IMPLICIT;
isLambdaMapping(int64_t Mapping)1088 static bool isLambdaMapping(int64_t Mapping) {
1089 return (Mapping & LambdaMapping) == LambdaMapping;
1090 }
1091
1092 namespace {
1093 /// Find the table information in the map or look it up in the translation
1094 /// tables.
getTableMap(void * HostPtr)1095 TableMap *getTableMap(void *HostPtr) {
1096 std::lock_guard<std::mutex> TblMapLock(PM->TblMapMtx);
1097 HostPtrToTableMapTy::iterator TableMapIt =
1098 PM->HostPtrToTableMap.find(HostPtr);
1099
1100 if (TableMapIt != PM->HostPtrToTableMap.end())
1101 return &TableMapIt->second;
1102
1103 // We don't have a map. So search all the registered libraries.
1104 TableMap *TM = nullptr;
1105 std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
1106 for (HostEntriesBeginToTransTableTy::iterator Itr =
1107 PM->HostEntriesBeginToTransTable.begin();
1108 Itr != PM->HostEntriesBeginToTransTable.end(); ++Itr) {
1109 // get the translation table (which contains all the good info).
1110 TranslationTable *TransTable = &Itr->second;
1111 // iterate over all the host table entries to see if we can locate the
1112 // host_ptr.
1113 __tgt_offload_entry *Cur = TransTable->HostTable.EntriesBegin;
1114 for (uint32_t I = 0; Cur < TransTable->HostTable.EntriesEnd; ++Cur, ++I) {
1115 if (Cur->addr != HostPtr)
1116 continue;
1117 // we got a match, now fill the HostPtrToTableMap so that we
1118 // may avoid this search next time.
1119 TM = &(PM->HostPtrToTableMap)[HostPtr];
1120 TM->Table = TransTable;
1121 TM->Index = I;
1122 return TM;
1123 }
1124 }
1125
1126 return nullptr;
1127 }
1128
1129 /// Get loop trip count
1130 /// FIXME: This function will not work right if calling
1131 /// __kmpc_push_target_tripcount_mapper in one thread but doing offloading in
1132 /// another thread, which might occur when we call task yield.
getLoopTripCount(int64_t DeviceId)1133 uint64_t getLoopTripCount(int64_t DeviceId) {
1134 DeviceTy &Device = *PM->Devices[DeviceId];
1135 uint64_t LoopTripCount = 0;
1136
1137 {
1138 std::lock_guard<std::mutex> TblMapLock(PM->TblMapMtx);
1139 auto I = Device.LoopTripCnt.find(__kmpc_global_thread_num(NULL));
1140 if (I != Device.LoopTripCnt.end()) {
1141 LoopTripCount = I->second;
1142 Device.LoopTripCnt.erase(I);
1143 }
1144 }
1145
1146 return LoopTripCount;
1147 }
1148
1149 /// A class manages private arguments in a target region.
1150 class PrivateArgumentManagerTy {
1151 /// A data structure for the information of first-private arguments. We can
1152 /// use this information to optimize data transfer by packing all
1153 /// first-private arguments and transfer them all at once.
1154 struct FirstPrivateArgInfoTy {
1155 /// The index of the element in \p TgtArgs corresponding to the argument
1156 const int Index;
1157 /// Host pointer begin
1158 const char *HstPtrBegin;
1159 /// Host pointer end
1160 const char *HstPtrEnd;
1161 /// Aligned size
1162 const int64_t AlignedSize;
1163 /// Host pointer name
1164 const map_var_info_t HstPtrName = nullptr;
1165
FirstPrivateArgInfoTy__anon039aff710511::PrivateArgumentManagerTy::FirstPrivateArgInfoTy1166 FirstPrivateArgInfoTy(int Index, const void *HstPtr, int64_t Size,
1167 const map_var_info_t HstPtrName = nullptr)
1168 : Index(Index), HstPtrBegin(reinterpret_cast<const char *>(HstPtr)),
1169 HstPtrEnd(HstPtrBegin + Size), AlignedSize(Size + Size % Alignment),
1170 HstPtrName(HstPtrName) {}
1171 };
1172
1173 /// A vector of target pointers for all private arguments
1174 std::vector<void *> TgtPtrs;
1175
1176 /// A vector of information of all first-private arguments to be packed
1177 std::vector<FirstPrivateArgInfoTy> FirstPrivateArgInfo;
1178 /// Host buffer for all arguments to be packed
1179 std::vector<char> FirstPrivateArgBuffer;
1180 /// The total size of all arguments to be packed
1181 int64_t FirstPrivateArgSize = 0;
1182
1183 /// A reference to the \p DeviceTy object
1184 DeviceTy &Device;
1185 /// A pointer to a \p AsyncInfoTy object
1186 AsyncInfoTy &AsyncInfo;
1187
1188 // TODO: What would be the best value here? Should we make it configurable?
1189 // If the size is larger than this threshold, we will allocate and transfer it
1190 // immediately instead of packing it.
1191 static constexpr const int64_t FirstPrivateArgSizeThreshold = 1024;
1192
1193 public:
1194 /// Constructor
PrivateArgumentManagerTy(DeviceTy & Dev,AsyncInfoTy & AsyncInfo)1195 PrivateArgumentManagerTy(DeviceTy &Dev, AsyncInfoTy &AsyncInfo)
1196 : Device(Dev), AsyncInfo(AsyncInfo) {}
1197
1198 /// Add a private argument
addArg(void * HstPtr,int64_t ArgSize,int64_t ArgOffset,bool IsFirstPrivate,void * & TgtPtr,int TgtArgsIndex,const map_var_info_t HstPtrName=nullptr,const bool AllocImmediately=false)1199 int addArg(void *HstPtr, int64_t ArgSize, int64_t ArgOffset,
1200 bool IsFirstPrivate, void *&TgtPtr, int TgtArgsIndex,
1201 const map_var_info_t HstPtrName = nullptr,
1202 const bool AllocImmediately = false) {
1203 // If the argument is not first-private, or its size is greater than a
1204 // predefined threshold, we will allocate memory and issue the transfer
1205 // immediately.
1206 if (ArgSize > FirstPrivateArgSizeThreshold || !IsFirstPrivate ||
1207 AllocImmediately) {
1208 TgtPtr = Device.allocData(ArgSize, HstPtr);
1209 if (!TgtPtr) {
1210 DP("Data allocation for %sprivate array " DPxMOD " failed.\n",
1211 (IsFirstPrivate ? "first-" : ""), DPxPTR(HstPtr));
1212 return OFFLOAD_FAIL;
1213 }
1214 #ifdef OMPTARGET_DEBUG
1215 void *TgtPtrBase = (void *)((intptr_t)TgtPtr + ArgOffset);
1216 DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD
1217 " for %sprivate array " DPxMOD " - pushing target argument " DPxMOD
1218 "\n",
1219 ArgSize, DPxPTR(TgtPtr), (IsFirstPrivate ? "first-" : ""),
1220 DPxPTR(HstPtr), DPxPTR(TgtPtrBase));
1221 #endif
1222 // If first-private, copy data from host
1223 if (IsFirstPrivate) {
1224 DP("Submitting firstprivate data to the device.\n");
1225 int Ret = Device.submitData(TgtPtr, HstPtr, ArgSize, AsyncInfo);
1226 if (Ret != OFFLOAD_SUCCESS) {
1227 DP("Copying data to device failed, failed.\n");
1228 return OFFLOAD_FAIL;
1229 }
1230 }
1231 TgtPtrs.push_back(TgtPtr);
1232 } else {
1233 DP("Firstprivate array " DPxMOD " of size %" PRId64 " will be packed\n",
1234 DPxPTR(HstPtr), ArgSize);
1235 // When reach this point, the argument must meet all following
1236 // requirements:
1237 // 1. Its size does not exceed the threshold (see the comment for
1238 // FirstPrivateArgSizeThreshold);
1239 // 2. It must be first-private (needs to be mapped to target device).
1240 // We will pack all this kind of arguments to transfer them all at once
1241 // to reduce the number of data transfer. We will not take
1242 // non-first-private arguments, aka. private arguments that doesn't need
1243 // to be mapped to target device, into account because data allocation
1244 // can be very efficient with memory manager.
1245
1246 // Placeholder value
1247 TgtPtr = nullptr;
1248 FirstPrivateArgInfo.emplace_back(TgtArgsIndex, HstPtr, ArgSize,
1249 HstPtrName);
1250 FirstPrivateArgSize += FirstPrivateArgInfo.back().AlignedSize;
1251 }
1252
1253 return OFFLOAD_SUCCESS;
1254 }
1255
1256 /// Pack first-private arguments, replace place holder pointers in \p TgtArgs,
1257 /// and start the transfer.
packAndTransfer(std::vector<void * > & TgtArgs)1258 int packAndTransfer(std::vector<void *> &TgtArgs) {
1259 if (!FirstPrivateArgInfo.empty()) {
1260 assert(FirstPrivateArgSize != 0 &&
1261 "FirstPrivateArgSize is 0 but FirstPrivateArgInfo is empty");
1262 FirstPrivateArgBuffer.resize(FirstPrivateArgSize, 0);
1263 auto Itr = FirstPrivateArgBuffer.begin();
1264 // Copy all host data to this buffer
1265 for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) {
1266 std::copy(Info.HstPtrBegin, Info.HstPtrEnd, Itr);
1267 Itr = std::next(Itr, Info.AlignedSize);
1268 }
1269 // Allocate target memory
1270 void *TgtPtr =
1271 Device.allocData(FirstPrivateArgSize, FirstPrivateArgBuffer.data());
1272 if (TgtPtr == nullptr) {
1273 DP("Failed to allocate target memory for private arguments.\n");
1274 return OFFLOAD_FAIL;
1275 }
1276 TgtPtrs.push_back(TgtPtr);
1277 DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD "\n",
1278 FirstPrivateArgSize, DPxPTR(TgtPtr));
1279 // Transfer data to target device
1280 int Ret = Device.submitData(TgtPtr, FirstPrivateArgBuffer.data(),
1281 FirstPrivateArgSize, AsyncInfo);
1282 if (Ret != OFFLOAD_SUCCESS) {
1283 DP("Failed to submit data of private arguments.\n");
1284 return OFFLOAD_FAIL;
1285 }
1286 // Fill in all placeholder pointers
1287 auto TP = reinterpret_cast<uintptr_t>(TgtPtr);
1288 for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) {
1289 void *&Ptr = TgtArgs[Info.Index];
1290 assert(Ptr == nullptr && "Target pointer is already set by mistaken");
1291 Ptr = reinterpret_cast<void *>(TP);
1292 TP += Info.AlignedSize;
1293 DP("Firstprivate array " DPxMOD " of size %" PRId64 " mapped to " DPxMOD
1294 "\n",
1295 DPxPTR(Info.HstPtrBegin), Info.HstPtrEnd - Info.HstPtrBegin,
1296 DPxPTR(Ptr));
1297 }
1298 }
1299
1300 return OFFLOAD_SUCCESS;
1301 }
1302
1303 /// Free all target memory allocated for private arguments
free()1304 int free() {
1305 for (void *P : TgtPtrs) {
1306 int Ret = Device.deleteData(P);
1307 if (Ret != OFFLOAD_SUCCESS) {
1308 DP("Deallocation of (first-)private arrays failed.\n");
1309 return OFFLOAD_FAIL;
1310 }
1311 }
1312
1313 TgtPtrs.clear();
1314
1315 return OFFLOAD_SUCCESS;
1316 }
1317 };
1318
1319 /// Process data before launching the kernel, including calling targetDataBegin
1320 /// to map and transfer data to target device, transferring (first-)private
1321 /// variables.
processDataBefore(ident_t * Loc,int64_t DeviceId,void * HostPtr,int32_t ArgNum,void ** ArgBases,void ** Args,int64_t * ArgSizes,int64_t * ArgTypes,map_var_info_t * ArgNames,void ** ArgMappers,std::vector<void * > & TgtArgs,std::vector<ptrdiff_t> & TgtOffsets,PrivateArgumentManagerTy & PrivateArgumentManager,AsyncInfoTy & AsyncInfo)1322 static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr,
1323 int32_t ArgNum, void **ArgBases, void **Args,
1324 int64_t *ArgSizes, int64_t *ArgTypes,
1325 map_var_info_t *ArgNames, void **ArgMappers,
1326 std::vector<void *> &TgtArgs,
1327 std::vector<ptrdiff_t> &TgtOffsets,
1328 PrivateArgumentManagerTy &PrivateArgumentManager,
1329 AsyncInfoTy &AsyncInfo) {
1330 TIMESCOPE_WITH_NAME_AND_IDENT("mappingBeforeTargetRegion", Loc);
1331 DeviceTy &Device = *PM->Devices[DeviceId];
1332 int Ret = targetDataBegin(Loc, Device, ArgNum, ArgBases, Args, ArgSizes,
1333 ArgTypes, ArgNames, ArgMappers, AsyncInfo);
1334 if (Ret != OFFLOAD_SUCCESS) {
1335 REPORT("Call to targetDataBegin failed, abort target.\n");
1336 return OFFLOAD_FAIL;
1337 }
1338
1339 // List of (first-)private arrays allocated for this target region
1340 std::vector<int> TgtArgsPositions(ArgNum, -1);
1341
1342 for (int32_t I = 0; I < ArgNum; ++I) {
1343 if (!(ArgTypes[I] & OMP_TGT_MAPTYPE_TARGET_PARAM)) {
1344 // This is not a target parameter, do not push it into TgtArgs.
1345 // Check for lambda mapping.
1346 if (isLambdaMapping(ArgTypes[I])) {
1347 assert((ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
1348 "PTR_AND_OBJ must be also MEMBER_OF.");
1349 unsigned Idx = getParentIndex(ArgTypes[I]);
1350 int TgtIdx = TgtArgsPositions[Idx];
1351 assert(TgtIdx != -1 && "Base address must be translated already.");
1352 // The parent lambda must be processed already and it must be the last
1353 // in TgtArgs and TgtOffsets arrays.
1354 void *HstPtrVal = Args[I];
1355 void *HstPtrBegin = ArgBases[I];
1356 void *HstPtrBase = Args[Idx];
1357 bool IsLast, IsHostPtr; // IsLast is unused.
1358 void *TgtPtrBase =
1359 (void *)((intptr_t)TgtArgs[TgtIdx] + TgtOffsets[TgtIdx]);
1360 DP("Parent lambda base " DPxMOD "\n", DPxPTR(TgtPtrBase));
1361 uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
1362 void *TgtPtrBegin = (void *)((uintptr_t)TgtPtrBase + Delta);
1363 void *&PointerTgtPtrBegin = AsyncInfo.getVoidPtrLocation();
1364 TargetPointerResultTy TPR = Device.getTgtPtrBegin(
1365 HstPtrVal, ArgSizes[I], IsLast, /*UpdateRefCount=*/false,
1366 /*UseHoldRefCount=*/false, IsHostPtr);
1367 PointerTgtPtrBegin = TPR.TargetPointer;
1368 if (!PointerTgtPtrBegin) {
1369 DP("No lambda captured variable mapped (" DPxMOD ") - ignored\n",
1370 DPxPTR(HstPtrVal));
1371 continue;
1372 }
1373 if (IsHostPtr) {
1374 DP("Unified memory is active, no need to map lambda captured"
1375 "variable (" DPxMOD ")\n",
1376 DPxPTR(HstPtrVal));
1377 continue;
1378 }
1379 DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n",
1380 DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));
1381 Ret = Device.submitData(TgtPtrBegin, &PointerTgtPtrBegin,
1382 sizeof(void *), AsyncInfo);
1383 if (Ret != OFFLOAD_SUCCESS) {
1384 REPORT("Copying data to device failed.\n");
1385 return OFFLOAD_FAIL;
1386 }
1387 }
1388 continue;
1389 }
1390 void *HstPtrBegin = Args[I];
1391 void *HstPtrBase = ArgBases[I];
1392 void *TgtPtrBegin;
1393 map_var_info_t HstPtrName = (!ArgNames) ? nullptr : ArgNames[I];
1394 ptrdiff_t TgtBaseOffset;
1395 bool IsLast, IsHostPtr; // unused.
1396 TargetPointerResultTy TPR;
1397 if (ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) {
1398 DP("Forwarding first-private value " DPxMOD " to the target construct\n",
1399 DPxPTR(HstPtrBase));
1400 TgtPtrBegin = HstPtrBase;
1401 TgtBaseOffset = 0;
1402 } else if (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE) {
1403 TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
1404 const bool IsFirstPrivate = (ArgTypes[I] & OMP_TGT_MAPTYPE_TO);
1405 // If there is a next argument and it depends on the current one, we need
1406 // to allocate the private memory immediately. If this is not the case,
1407 // then the argument can be marked for optimization and packed with the
1408 // other privates.
1409 const bool AllocImmediately =
1410 (I < ArgNum - 1 && (ArgTypes[I + 1] & OMP_TGT_MAPTYPE_MEMBER_OF));
1411 Ret = PrivateArgumentManager.addArg(
1412 HstPtrBegin, ArgSizes[I], TgtBaseOffset, IsFirstPrivate, TgtPtrBegin,
1413 TgtArgs.size(), HstPtrName, AllocImmediately);
1414 if (Ret != OFFLOAD_SUCCESS) {
1415 REPORT("Failed to process %sprivate argument " DPxMOD "\n",
1416 (IsFirstPrivate ? "first-" : ""), DPxPTR(HstPtrBegin));
1417 return OFFLOAD_FAIL;
1418 }
1419 } else {
1420 if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)
1421 HstPtrBase = *reinterpret_cast<void **>(HstPtrBase);
1422 TPR = Device.getTgtPtrBegin(HstPtrBegin, ArgSizes[I], IsLast,
1423 /*UpdateRefCount=*/false,
1424 /*UseHoldRefCount=*/false, IsHostPtr);
1425 TgtPtrBegin = TPR.TargetPointer;
1426 TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
1427 #ifdef OMPTARGET_DEBUG
1428 void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset);
1429 DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD "\n",
1430 DPxPTR(TgtPtrBase), DPxPTR(HstPtrBegin));
1431 #endif
1432 }
1433 TgtArgsPositions[I] = TgtArgs.size();
1434 TgtArgs.push_back(TgtPtrBegin);
1435 TgtOffsets.push_back(TgtBaseOffset);
1436 }
1437
1438 assert(TgtArgs.size() == TgtOffsets.size() &&
1439 "Size mismatch in arguments and offsets");
1440
1441 // Pack and transfer first-private arguments
1442 Ret = PrivateArgumentManager.packAndTransfer(TgtArgs);
1443 if (Ret != OFFLOAD_SUCCESS) {
1444 DP("Failed to pack and transfer first private arguments\n");
1445 return OFFLOAD_FAIL;
1446 }
1447
1448 return OFFLOAD_SUCCESS;
1449 }
1450
1451 /// Process data after launching the kernel, including transferring data back to
1452 /// host if needed and deallocating target memory of (first-)private variables.
processDataAfter(ident_t * Loc,int64_t DeviceId,void * HostPtr,int32_t ArgNum,void ** ArgBases,void ** Args,int64_t * ArgSizes,int64_t * ArgTypes,map_var_info_t * ArgNames,void ** ArgMappers,PrivateArgumentManagerTy & PrivateArgumentManager,AsyncInfoTy & AsyncInfo)1453 static int processDataAfter(ident_t *Loc, int64_t DeviceId, void *HostPtr,
1454 int32_t ArgNum, void **ArgBases, void **Args,
1455 int64_t *ArgSizes, int64_t *ArgTypes,
1456 map_var_info_t *ArgNames, void **ArgMappers,
1457 PrivateArgumentManagerTy &PrivateArgumentManager,
1458 AsyncInfoTy &AsyncInfo) {
1459 TIMESCOPE_WITH_NAME_AND_IDENT("mappingAfterTargetRegion", Loc);
1460 DeviceTy &Device = *PM->Devices[DeviceId];
1461
1462 // Move data from device.
1463 int Ret = targetDataEnd(Loc, Device, ArgNum, ArgBases, Args, ArgSizes,
1464 ArgTypes, ArgNames, ArgMappers, AsyncInfo);
1465 if (Ret != OFFLOAD_SUCCESS) {
1466 REPORT("Call to targetDataEnd failed, abort target.\n");
1467 return OFFLOAD_FAIL;
1468 }
1469
1470 // Free target memory for private arguments
1471 Ret = PrivateArgumentManager.free();
1472 if (Ret != OFFLOAD_SUCCESS) {
1473 REPORT("Failed to deallocate target memory for private args\n");
1474 return OFFLOAD_FAIL;
1475 }
1476
1477 return OFFLOAD_SUCCESS;
1478 }
1479 } // namespace
1480
1481 /// performs the same actions as data_begin in case arg_num is
1482 /// non-zero and initiates run of the offloaded region on the target platform;
1483 /// if arg_num is non-zero after the region execution is done it also
1484 /// performs the same action as data_update and data_end above. This function
1485 /// returns 0 if it was able to transfer the execution to a target and an
1486 /// integer different from zero otherwise.
target(ident_t * Loc,DeviceTy & Device,void * HostPtr,int32_t ArgNum,void ** ArgBases,void ** Args,int64_t * ArgSizes,int64_t * ArgTypes,map_var_info_t * ArgNames,void ** ArgMappers,int32_t TeamNum,int32_t ThreadLimit,uint64_t Tripcount,int IsTeamConstruct,AsyncInfoTy & AsyncInfo)1487 int target(ident_t *Loc, DeviceTy &Device, void *HostPtr, int32_t ArgNum,
1488 void **ArgBases, void **Args, int64_t *ArgSizes, int64_t *ArgTypes,
1489 map_var_info_t *ArgNames, void **ArgMappers, int32_t TeamNum,
1490 int32_t ThreadLimit, uint64_t Tripcount, int IsTeamConstruct,
1491 AsyncInfoTy &AsyncInfo) {
1492 int32_t DeviceId = Device.DeviceID;
1493 TableMap *TM = getTableMap(HostPtr);
1494 // No map for this host pointer found!
1495 if (!TM) {
1496 REPORT("Host ptr " DPxMOD " does not have a matching target pointer.\n",
1497 DPxPTR(HostPtr));
1498 return OFFLOAD_FAIL;
1499 }
1500
1501 // get target table.
1502 __tgt_target_table *TargetTable = nullptr;
1503 {
1504 std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
1505 assert(TM->Table->TargetsTable.size() > (size_t)DeviceId &&
1506 "Not expecting a device ID outside the table's bounds!");
1507 TargetTable = TM->Table->TargetsTable[DeviceId];
1508 }
1509 assert(TargetTable && "Global data has not been mapped\n");
1510
1511 // FIXME: Use legacy tripcount method if it is '-1'.
1512 Tripcount = Tripcount == -1UL ? getLoopTripCount(DeviceId) : Tripcount;
1513 DP("loop trip count is %" PRIu64 ".\n", Tripcount);
1514
1515 // We need to keep bases and offsets separate. Sometimes (e.g. in OpenCL) we
1516 // need to manifest base pointers prior to launching a kernel. Even if we have
1517 // mapped an object only partially, e.g. A[N:M], although the kernel is
1518 // expected to access elements starting at address &A[N] and beyond, we still
1519 // need to manifest the base of the array &A[0]. In other cases, e.g. the COI
1520 // API, we need the begin address itself, i.e. &A[N], as the API operates on
1521 // begin addresses, not bases. That's why we pass args and offsets as two
1522 // separate entities so that each plugin can do what it needs. This behavior
1523 // was introdued via https://reviews.llvm.org/D33028 and commit 1546d319244c.
1524 std::vector<void *> TgtArgs;
1525 std::vector<ptrdiff_t> TgtOffsets;
1526
1527 PrivateArgumentManagerTy PrivateArgumentManager(Device, AsyncInfo);
1528
1529 int Ret;
1530 if (ArgNum) {
1531 // Process data, such as data mapping, before launching the kernel
1532 Ret = processDataBefore(Loc, DeviceId, HostPtr, ArgNum, ArgBases, Args,
1533 ArgSizes, ArgTypes, ArgNames, ArgMappers, TgtArgs,
1534 TgtOffsets, PrivateArgumentManager, AsyncInfo);
1535 if (Ret != OFFLOAD_SUCCESS) {
1536 REPORT("Failed to process data before launching the kernel.\n");
1537 return OFFLOAD_FAIL;
1538 }
1539 }
1540
1541 // Launch device execution.
1542 void *TgtEntryPtr = TargetTable->EntriesBegin[TM->Index].addr;
1543 DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n",
1544 TargetTable->EntriesBegin[TM->Index].name, DPxPTR(TgtEntryPtr), TM->Index);
1545
1546 {
1547 TIMESCOPE_WITH_NAME_AND_IDENT(
1548 IsTeamConstruct ? "runTargetTeamRegion" : "runTargetRegion", Loc);
1549 if (IsTeamConstruct)
1550 Ret = Device.runTeamRegion(TgtEntryPtr, &TgtArgs[0], &TgtOffsets[0],
1551 TgtArgs.size(), TeamNum, ThreadLimit,
1552 Tripcount, AsyncInfo);
1553 else
1554 Ret = Device.runRegion(TgtEntryPtr, &TgtArgs[0], &TgtOffsets[0],
1555 TgtArgs.size(), AsyncInfo);
1556 }
1557
1558 if (Ret != OFFLOAD_SUCCESS) {
1559 REPORT("Executing target region abort target.\n");
1560 return OFFLOAD_FAIL;
1561 }
1562
1563 if (ArgNum) {
1564 // Transfer data back and deallocate target memory for (first-)private
1565 // variables
1566 Ret = processDataAfter(Loc, DeviceId, HostPtr, ArgNum, ArgBases, Args,
1567 ArgSizes, ArgTypes, ArgNames, ArgMappers,
1568 PrivateArgumentManager, AsyncInfo);
1569 if (Ret != OFFLOAD_SUCCESS) {
1570 REPORT("Failed to process data after launching the kernel.\n");
1571 return OFFLOAD_FAIL;
1572 }
1573 }
1574
1575 return OFFLOAD_SUCCESS;
1576 }
1577