1 //===------ omptarget.cpp - Target independent OpenMP target RTL -- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // Implementation of the interface to be used by Clang during the codegen of a 10 // target region. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "omptarget.h" 15 #include "device.h" 16 #include "private.h" 17 #include "rtl.h" 18 19 #include <cassert> 20 #include <cstdint> 21 #include <vector> 22 23 int AsyncInfoTy::synchronize() { 24 int Result = OFFLOAD_SUCCESS; 25 if (AsyncInfo.Queue) { 26 // If we have a queue we need to synchronize it now. 27 Result = Device.synchronize(*this); 28 assert(AsyncInfo.Queue == nullptr && 29 "The device plugin should have nulled the queue to indicate there " 30 "are no outstanding actions!"); 31 } 32 return Result; 33 } 34 35 void *&AsyncInfoTy::getVoidPtrLocation() { 36 BufferLocations.push_back(nullptr); 37 return BufferLocations.back(); 38 } 39 40 /* All begin addresses for partially mapped structs must be 8-aligned in order 41 * to ensure proper alignment of members. E.g. 42 * 43 * struct S { 44 * int a; // 4-aligned 45 * int b; // 4-aligned 46 * int *p; // 8-aligned 47 * } s1; 48 * ... 49 * #pragma omp target map(tofrom: s1.b, s1.p[0:N]) 50 * { 51 * s1.b = 5; 52 * for (int i...) s1.p[i] = ...; 53 * } 54 * 55 * Here we are mapping s1 starting from member b, so BaseAddress=&s1=&s1.a and 56 * BeginAddress=&s1.b. Let's assume that the struct begins at address 0x100, 57 * then &s1.a=0x100, &s1.b=0x104, &s1.p=0x108. Each member obeys the alignment 58 * requirements for its type. 
 * Now, when we allocate memory on the device, in CUDA's case cuMemAlloc()
 * returns an address which is at least 256-aligned. This means that the chunk
 * of the struct on the device will start at a 256-aligned address, let's say
 * 0x200. Then the address of b will be 0x200 and address of p will be a
 * misaligned 0x204 (on the host there was no need to add padding between b
 * and p, so p comes exactly 4 bytes after b). If the device kernel tries to
 * access s1.p, a misaligned address error occurs (as reported by the CUDA
 * plugin). By padding the begin address down to a multiple of 8 and extending
 * the size of the allocated chunk accordingly, the chunk on the device will
 * start at 0x200 with the padding (4 bytes), then &s1.b=0x204 and
 * &s1.p=0x208, as they should be to satisfy the alignment requirements.
 */
static const int64_t Alignment = 8;

/// Map global data and execute pending ctors.
///
/// For every registered host entry table that has not yet been processed for
/// \p Device: load the corresponding device image, verify the host and target
/// entry tables match in size, and register an infinite-ref-count mapping for
/// every data entry. Afterwards, run all pending constructors queued by
/// registered libraries and wait for them to complete.
/// Returns OFFLOAD_SUCCESS, or OFFLOAD_FAIL on the first error.
static int initLibrary(DeviceTy &Device) {
  /*
   * Map global data
   */
  int32_t DeviceId = Device.DeviceID;
  int Rc = OFFLOAD_SUCCESS;
  bool SupportsEmptyImages = Device.RTL->supports_empty_images &&
                             Device.RTL->supports_empty_images() > 0;

  // Hold PendingGlobalsMtx for the whole function so HasPendingGlobals is
  // only cleared once the globals and ctors below have actually run.
  std::lock_guard<decltype(Device.PendingGlobalsMtx)> LG(
      Device.PendingGlobalsMtx);
  {
    std::lock_guard<decltype(PM->TrlTblMtx)> LG(PM->TrlTblMtx);
    for (auto *HostEntriesBegin : PM->HostEntriesBeginRegistrationOrder) {
      TranslationTable *TransTable =
          &PM->HostEntriesBeginToTransTable[HostEntriesBegin];
      if (TransTable->HostTable.EntriesBegin ==
              TransTable->HostTable.EntriesEnd &&
          !SupportsEmptyImages) {
        // No host entry so no need to proceed
        continue;
      }

      if (TransTable->TargetsTable[DeviceId] != 0) {
        // Library entries have already been processed
        continue;
      }

      // 1) get image.
      assert(TransTable->TargetsImages.size() > (size_t)DeviceId &&
             "Not expecting a device ID outside the table's bounds!");
      __tgt_device_image *Img = TransTable->TargetsImages[DeviceId];
      if (!Img) {
        REPORT("No image loaded for device id %d.\n", DeviceId);
        Rc = OFFLOAD_FAIL;
        break;
      }
      // 2) load image into the target table.
      __tgt_target_table *TargetTable = TransTable->TargetsTable[DeviceId] =
          Device.loadBinary(Img);
      // Unable to get table for this image: invalidate image and fail.
      if (!TargetTable) {
        REPORT("Unable to generate entries table for device id %d.\n",
               DeviceId);
        TransTable->TargetsImages[DeviceId] = 0;
        Rc = OFFLOAD_FAIL;
        break;
      }

      // Verify whether the two table sizes match.
      size_t Hsize =
          TransTable->HostTable.EntriesEnd - TransTable->HostTable.EntriesBegin;
      size_t Tsize = TargetTable->EntriesEnd - TargetTable->EntriesBegin;

      // Invalid image for these host entries!
      if (Hsize != Tsize) {
        REPORT(
            "Host and Target tables mismatch for device id %d [%zx != %zx].\n",
            DeviceId, Hsize, Tsize);
        TransTable->TargetsImages[DeviceId] = 0;
        TransTable->TargetsTable[DeviceId] = 0;
        Rc = OFFLOAD_FAIL;
        break;
      }

      DeviceTy::HDTTMapAccessorTy HDTTMap =
          Device.HostDataToTargetMap.getExclusiveAccessor();

      // Walk host and device entry tables in lock-step (they were verified
      // above to have the same number of entries).
      __tgt_target_table *HostTable = &TransTable->HostTable;
      for (__tgt_offload_entry *CurrDeviceEntry = TargetTable->EntriesBegin,
                               *CurrHostEntry = HostTable->EntriesBegin,
                               *EntryDeviceEnd = TargetTable->EntriesEnd;
           CurrDeviceEntry != EntryDeviceEnd;
           CurrDeviceEntry++, CurrHostEntry++) {
        if (CurrDeviceEntry->size != 0) {
          // has data.
          assert(CurrDeviceEntry->size == CurrHostEntry->size &&
                 "data size mismatch");

          // Fortran may use multiple weak declarations for the same symbol,
          // therefore we must allow for multiple weak symbols to be loaded
          // from the fat binary. Treat these mappings as any other "regular"
          // mapping. Add entry to map.
          if (Device.getTgtPtrBegin(HDTTMap, CurrHostEntry->addr,
                                    CurrHostEntry->size))
            continue;

          DP("Add mapping from host " DPxMOD " to device " DPxMOD
             " with size %zu"
             "\n",
             DPxPTR(CurrHostEntry->addr), DPxPTR(CurrDeviceEntry->addr),
             CurrDeviceEntry->size);
          // Globals live for the whole program: register with an infinite
          // reference count so they are never unmapped.
          HDTTMap->emplace(new HostDataToTargetTy(
              (uintptr_t)CurrHostEntry->addr /*HstPtrBase*/,
              (uintptr_t)CurrHostEntry->addr /*HstPtrBegin*/,
              (uintptr_t)CurrHostEntry->addr +
                  CurrHostEntry->size /*HstPtrEnd*/,
              (uintptr_t)CurrDeviceEntry->addr /*TgtPtrBegin*/,
              false /*UseHoldRefCount*/, CurrHostEntry->name,
              true /*IsRefCountINF*/));
        }
      }
    }
  }

  if (Rc != OFFLOAD_SUCCESS) {
    return Rc;
  }

  /*
   * Run ctors for static objects
   */
  if (!Device.PendingCtorsDtors.empty()) {
    AsyncInfoTy AsyncInfo(Device);
    // Call all ctors for all libraries registered so far
    for (auto &Lib : Device.PendingCtorsDtors) {
      if (!Lib.second.PendingCtors.empty()) {
        DP("Has pending ctors... call now\n");
        for (auto &Entry : Lib.second.PendingCtors) {
          void *Ctor = Entry;
          // Each ctor is launched as a single-team, single-thread kernel.
          int Rc = target(nullptr, Device, Ctor, 0, nullptr, nullptr, nullptr,
                          nullptr, nullptr, nullptr, 1, 1, 0, true /*team*/,
                          AsyncInfo);
          if (Rc != OFFLOAD_SUCCESS) {
            REPORT("Running ctor " DPxMOD " failed.\n", DPxPTR(Ctor));
            return OFFLOAD_FAIL;
          }
        }
        // Clear the list to indicate that this device has been used
        Lib.second.PendingCtors.clear();
        DP("Done with pending ctors for lib " DPxMOD "\n", DPxPTR(Lib.first));
      }
    }
    // All constructors have been issued, wait for them now.
    if (AsyncInfo.synchronize() != OFFLOAD_SUCCESS)
      return OFFLOAD_FAIL;
  }
  Device.HasPendingGlobals = false;

  return OFFLOAD_SUCCESS;
}

/// Report the outcome of an offload attempt according to the current offload
/// policy. Aborts the program (FATAL_MESSAGE0) when offloading happened while
/// disabled, when the policy is still tgt_default, or when a mandatory
/// offload failed; in the mandatory-failure case it first prints either the
/// pointer-mapping tables (at OMP_INFOTYPE_DUMP_TABLE info level) or a hint,
/// plus the source location when available.
void handleTargetOutcome(bool Success, ident_t *Loc) {
  switch (PM->TargetOffloadPolicy) {
  case tgt_disabled:
    if (Success) {
      FATAL_MESSAGE0(1, "expected no offloading while offloading is disabled");
    }
    break;
  case tgt_default:
    // The policy must have been resolved to mandatory/disabled before any
    // outcome is reported (see handleDefaultTargetOffload below).
    FATAL_MESSAGE0(1, "default offloading policy must be switched to "
                      "mandatory or disabled");
    break;
  case tgt_mandatory:
    if (!Success) {
      if (getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE)
        for (auto &Device : PM->Devices)
          dumpTargetPointerMappings(Loc, *Device);
      else
        FAILURE_MESSAGE("Consult https://openmp.llvm.org/design/Runtimes.html "
                        "for debugging options.\n");

      SourceInfo Info(Loc);
      if (Info.isAvailible())
        fprintf(stderr, "%s:%d:%d: ", Info.getFilename(), Info.getLine(),
                Info.getColumn());
      else
        FAILURE_MESSAGE("Source location information not present. Compile with "
                        "-g or -gline-tables-only.\n");
      FATAL_MESSAGE0(
          1, "failure of target construct while offloading is mandatory");
    } else {
      if (getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE)
        for (auto &Device : PM->Devices)
          dumpTargetPointerMappings(Loc, *Device);
    }
    break;
  }
}

/// Resolve the tgt_default offload policy to tgt_mandatory or tgt_disabled
/// depending on whether any devices are available. Takes TargetOffloadMtx and
/// re-checks the policy under the lock, so concurrent callers resolve it only
/// once.
static void handleDefaultTargetOffload() {
  std::lock_guard<decltype(PM->TargetOffloadMtx)> LG(PM->TargetOffloadMtx);
  if (PM->TargetOffloadPolicy == tgt_default) {
    if (omp_get_num_devices() > 0) {
      DP("Default TARGET OFFLOAD policy is now mandatory "
         "(devices were found)\n");
      PM->TargetOffloadPolicy = tgt_mandatory;
    } else {
      DP("Default TARGET OFFLOAD policy is now disabled "
         "(no devices were found)\n");
      PM->TargetOffloadPolicy = tgt_disabled;
    }
  }
}

/// Return true when offloading is disabled, first resolving the default
/// policy if it has not been decided yet.
static bool isOffloadDisabled() {
  if (PM->TargetOffloadPolicy == tgt_default)
    handleDefaultTargetOffload();
  return PM->TargetOffloadPolicy == tgt_disabled;
}

// If offload is enabled, ensure that device DeviceID has been initialized,
// global ctors have been executed, and global data has been mapped.
//
// The return bool indicates if the offload is to the host device
// There are three possible results:
// - Return false if the target device is ready for offload.
// - Return true without reporting a runtime error if offload is
//   disabled, perhaps because the initial device was specified.
// - Report a runtime error and return true.
//
// If DeviceID == OFFLOAD_DEVICE_DEFAULT, set DeviceID to the default device.
// This step might be skipped if offload is disabled.
bool checkDeviceAndCtors(int64_t &DeviceID, ident_t *Loc) {
  if (isOffloadDisabled()) {
    DP("Offload is disabled\n");
    return true;
  }

  // Resolve the "default device" sentinel to a concrete device id.
  if (DeviceID == OFFLOAD_DEVICE_DEFAULT) {
    DeviceID = omp_get_default_device();
    DP("Use default device id %" PRId64 "\n", DeviceID);
  }

  // Proposed behavior for OpenMP 5.2 in OpenMP spec github issue 2669.
  if (omp_get_num_devices() == 0) {
    DP("omp_get_num_devices() == 0 but offload is manadatory\n");
    handleTargetOutcome(false, Loc);
    return true;
  }

  if (DeviceID == omp_get_initial_device()) {
    DP("Device is host (%" PRId64 "), returning as if offload is disabled\n",
       DeviceID);
    return true;
  }

  // Is device ready?
  if (!deviceIsReady(DeviceID)) {
    REPORT("Device %" PRId64 " is not ready.\n", DeviceID);
    handleTargetOutcome(false, Loc);
    return true;
  }

  // Get device info.
  DeviceTy &Device = *PM->Devices[DeviceID];

  // Check whether global data has been mapped for this device.
  // Read the flag under the lock but run initLibrary outside it; initLibrary
  // re-takes PendingGlobalsMtx itself.
  bool HasPendingGlobals;
  {
    std::lock_guard<decltype(Device.PendingGlobalsMtx)> LG(
        Device.PendingGlobalsMtx);
    HasPendingGlobals = Device.HasPendingGlobals;
  }
  if (HasPendingGlobals && initLibrary(Device) != OFFLOAD_SUCCESS) {
    REPORT("Failed to init globals on device %" PRId64 "\n", DeviceID);
    handleTargetOutcome(false, Loc);
    return true;
  }

  return false;
}

/// Decode the MEMBER_OF field of a map type (stored in the high 16 bits):
/// returns the index of the parent map entry, or -1 when the entry has no
/// parent.
static int32_t getParentIndex(int64_t Type) {
  return ((Type & OMP_TGT_MAPTYPE_MEMBER_OF) >> 48) - 1;
}

/// Allocate \p Size bytes for device \p DeviceNum (plain malloc when the
/// initial/host device is requested). \p Kind is forwarded to the plugin's
/// allocator; \p Name is only used for logging. Returns NULL on failure or
/// for a zero-size request.
void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind,
                          const char *Name) {
  TIMESCOPE();
  DP("Call to %s for device %d requesting %zu bytes\n", Name, DeviceNum, Size);

  // NOTE(review): Size is unsigned, so this only rejects Size == 0.
  if (Size <= 0) {
    DP("Call to %s with non-positive length\n", Name);
    return NULL;
  }

  void *Rc = NULL;

  if (DeviceNum == omp_get_initial_device()) {
    Rc = malloc(Size);
    DP("%s returns host ptr " DPxMOD "\n", Name, DPxPTR(Rc));
    return Rc;
  }

  if (!deviceIsReady(DeviceNum)) {
    DP("%s returns NULL ptr\n", Name);
    return NULL;
  }

  DeviceTy &Device = *PM->Devices[DeviceNum];
  Rc = Device.allocData(Size, nullptr, Kind);
  DP("%s returns device ptr " DPxMOD "\n", Name, DPxPTR(Rc));
  return Rc;
}

/// Call the user-defined mapper function followed by the appropriate
/// targetData* function (targetData{Begin,End,Update}).
///
/// The mapper callback expands the single (base, begin, size, type, name)
/// tuple into a list of components; these are flattened into parallel arrays
/// and forwarded to \p TargetDataFunction with FromMapper=true.
int targetDataMapper(ident_t *Loc, DeviceTy &Device, void *ArgBase, void *Arg,
                     int64_t ArgSize, int64_t ArgType, map_var_info_t ArgNames,
                     void *ArgMapper, AsyncInfoTy &AsyncInfo,
                     TargetDataFuncPtrTy TargetDataFunction) {
  TIMESCOPE_WITH_IDENT(Loc);
  DP("Calling the mapper function " DPxMOD "\n", DPxPTR(ArgMapper));

  // The mapper function fills up Components.
  MapperComponentsTy MapperComponents;
  MapperFuncPtrTy MapperFuncPtr = (MapperFuncPtrTy)(ArgMapper);
  (*MapperFuncPtr)((void *)&MapperComponents, ArgBase, Arg, ArgSize, ArgType,
                   ArgNames);

  // Construct new arrays for args_base, args, arg_sizes and arg_types
  // using the information in MapperComponents and call the corresponding
  // targetData* function using these new arrays.
  std::vector<void *> MapperArgsBase(MapperComponents.Components.size());
  std::vector<void *> MapperArgs(MapperComponents.Components.size());
  std::vector<int64_t> MapperArgSizes(MapperComponents.Components.size());
  std::vector<int64_t> MapperArgTypes(MapperComponents.Components.size());
  std::vector<void *> MapperArgNames(MapperComponents.Components.size());

  for (unsigned I = 0, E = MapperComponents.Components.size(); I < E; ++I) {
    auto &C = MapperComponents.Components[I];
    MapperArgsBase[I] = C.Base;
    MapperArgs[I] = C.Begin;
    MapperArgSizes[I] = C.Size;
    MapperArgTypes[I] = C.Type;
    MapperArgNames[I] = C.Name;
  }

  // arg_mappers is nullptr here: components produced by a mapper cannot
  // themselves carry another mapper.
  int Rc = TargetDataFunction(Loc, Device, MapperComponents.Components.size(),
                              MapperArgsBase.data(), MapperArgs.data(),
                              MapperArgSizes.data(), MapperArgTypes.data(),
                              MapperArgNames.data(), /*arg_mappers*/ nullptr,
                              AsyncInfo, /*FromMapper=*/true);

  return Rc;
}

/// Internal function to do the mapping and transfer the data to the device
int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
                    void **ArgsBase, void **Args, int64_t *ArgSizes,
                    int64_t *ArgTypes, map_var_info_t *ArgNames,
                    void **ArgMappers, AsyncInfoTy &AsyncInfo,
                    bool FromMapper) {
  // process each input.
  for (int32_t I = 0; I < ArgNum; ++I) {
    // Ignore private variables and arrays - there is no mapping for them.
    if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
        (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
      continue;

    if (ArgMappers && ArgMappers[I]) {
      // Instead of executing the regular path of targetDataBegin, call the
      // targetDataMapper variant which will call targetDataBegin again
      // with new arguments.
      DP("Calling targetDataMapper for the %dth argument\n", I);

      map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
      int Rc = targetDataMapper(Loc, Device, ArgsBase[I], Args[I], ArgSizes[I],
                                ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
                                targetDataBegin);

      if (Rc != OFFLOAD_SUCCESS) {
        REPORT("Call to targetDataBegin via targetDataMapper for custom mapper"
               " failed.\n");
        return OFFLOAD_FAIL;
      }

      // Skip the rest of this function, continue to the next argument.
      continue;
    }

    void *HstPtrBegin = Args[I];
    void *HstPtrBase = ArgsBase[I];
    int64_t DataSize = ArgSizes[I];
    map_var_info_t HstPtrName = (!ArgNames) ? nullptr : ArgNames[I];

    // Adjust for proper alignment if this is a combined entry (for structs).
    // Look at the next argument - if that is MEMBER_OF this one, then this one
    // is a combined entry. See the Alignment comment at the top of this file.
    int64_t Padding = 0;
    const int NextI = I + 1;
    if (getParentIndex(ArgTypes[I]) < 0 && NextI < ArgNum &&
        getParentIndex(ArgTypes[NextI]) == I) {
      Padding = (int64_t)HstPtrBegin % Alignment;
      if (Padding) {
        DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD
           "\n",
           Padding, DPxPTR(HstPtrBegin));
        HstPtrBegin = (char *)HstPtrBegin - Padding;
        DataSize += Padding;
      }
    }

    // Address of pointer on the host and device, respectively.
    void *PointerHstPtrBegin, *PointerTgtPtrBegin;
    TargetPointerResultTy PointerTpr;
    bool IsHostPtr = false;
    bool IsImplicit = ArgTypes[I] & OMP_TGT_MAPTYPE_IMPLICIT;
    // Force the creation of a device side copy of the data when:
    // a close map modifier was associated with a map that contained a to.
    bool HasCloseModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_CLOSE;
    bool HasPresentModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_PRESENT;
    bool HasHoldModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_OMPX_HOLD;
    // UpdateRef is based on MEMBER_OF instead of TARGET_PARAM because if we
    // have reached this point via __tgt_target_data_begin and not __tgt_target
    // then no argument is marked as TARGET_PARAM ("omp target data map" is not
    // associated with a target region, so there are no target parameters). This
    // may be considered a hack, we could revise the scheme in the future.
    bool UpdateRef =
        !(ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) && !(FromMapper && I == 0);
    if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) {
      DP("Has a pointer entry: \n");
      // Base is address of pointer.
      //
      // Usually, the pointer is already allocated by this time. For example:
      //
      //   #pragma omp target map(s.p[0:N])
      //
      // The map entry for s comes first, and the PTR_AND_OBJ entry comes
      // afterward, so the pointer is already allocated by the time the
      // PTR_AND_OBJ entry is handled below, and PointerTgtPtrBegin is thus
      // non-null. However, "declare target link" can produce a PTR_AND_OBJ
      // entry for a global that might not already be allocated by the time the
      // PTR_AND_OBJ entry is handled below, and so the allocation might fail
      // when HasPresentModifier.
      PointerTpr = Device.getTargetPointer(
          HstPtrBase, HstPtrBase, sizeof(void *), /*HstPtrName=*/nullptr,
          /*HasFlagTo=*/false, /*HasFlagAlways=*/false, IsImplicit, UpdateRef,
          HasCloseModifier, HasPresentModifier, HasHoldModifier, AsyncInfo);
      PointerTgtPtrBegin = PointerTpr.TargetPointer;
      IsHostPtr = PointerTpr.Flags.IsHostPointer;
      if (!PointerTgtPtrBegin) {
        REPORT("Call to getTargetPointer returned null pointer (%s).\n",
               HasPresentModifier ? "'present' map type modifier"
                                  : "device failure or illegal mapping");
        return OFFLOAD_FAIL;
      }
      DP("There are %zu bytes allocated at target address " DPxMOD " - is%s new"
         "\n",
         sizeof(void *), DPxPTR(PointerTgtPtrBegin),
         (PointerTpr.Flags.IsNewEntry ? "" : " not"));
      PointerHstPtrBegin = HstPtrBase;
      // modify current entry: from here on HstPtrBase is the pointee's base.
      HstPtrBase = *(void **)HstPtrBase;
      // No need to update pointee ref count for the first element of the
      // subelement that comes from mapper.
      UpdateRef =
          (!FromMapper || I != 0); // subsequently update ref count of pointee
    }

    const bool HasFlagTo = ArgTypes[I] & OMP_TGT_MAPTYPE_TO;
    const bool HasFlagAlways = ArgTypes[I] & OMP_TGT_MAPTYPE_ALWAYS;
    auto TPR = Device.getTargetPointer(
        HstPtrBegin, HstPtrBase, DataSize, HstPtrName, HasFlagTo, HasFlagAlways,
        IsImplicit, UpdateRef, HasCloseModifier, HasPresentModifier,
        HasHoldModifier, AsyncInfo);
    void *TgtPtrBegin = TPR.TargetPointer;
    IsHostPtr = TPR.Flags.IsHostPointer;
    // If data_size==0, then the argument could be a zero-length pointer to
    // NULL, so getOrAlloc() returning NULL is not an error.
    if (!TgtPtrBegin && (DataSize || HasPresentModifier)) {
      REPORT("Call to getTargetPointer returned null pointer (%s).\n",
             HasPresentModifier ? "'present' map type modifier"
                                : "device failure or illegal mapping");
      return OFFLOAD_FAIL;
    }
    DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
       " - is%s new\n",
       DataSize, DPxPTR(TgtPtrBegin), (TPR.Flags.IsNewEntry ? "" : " not"));

    if (ArgTypes[I] & OMP_TGT_MAPTYPE_RETURN_PARAM) {
      // Return the device base (begin minus the host base/begin offset) to
      // the caller through ArgsBase.
      uintptr_t Delta = (uintptr_t)HstPtrBegin - (uintptr_t)HstPtrBase;
      void *TgtPtrBase = (void *)((uintptr_t)TgtPtrBegin - Delta);
      DP("Returning device pointer " DPxMOD "\n", DPxPTR(TgtPtrBase));
      ArgsBase[I] = TgtPtrBase;
    }

    if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ && !IsHostPtr) {
      // Check whether we need to update the pointer on the device
      bool UpdateDevPtr = false;

      uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
      void *ExpectedTgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta);

      Device.ShadowMtx.lock();
      auto Entry = Device.ShadowPtrMap.find(PointerHstPtrBegin);
      // If this pointer is not in the map we need to insert it. If the map
      // contains a stale entry, we need to update it (e.g. if the pointee was
      // deallocated and later on is reallocated at another device address). The
      // latter scenario is the subject of LIT test env/base_ptr_ref_count.c. An
      // entry is removed from ShadowPtrMap only when the PTR of a PTR_AND_OBJ
      // pair is deallocated, not when the OBJ is deallocated. In
      // env/base_ptr_ref_count.c the PTR is a global "declare target" pointer,
      // so it stays in the map for the lifetime of the application. When the
      // OBJ is deallocated and later on allocated again (at a different device
      // address), ShadowPtrMap still contains an entry for Pointer_HstPtrBegin
      // which is stale, pointing to the old ExpectedTgtPtrBase of the OBJ.
      if (Entry == Device.ShadowPtrMap.end() ||
          Entry->second.TgtPtrVal != ExpectedTgtPtrBase) {
        // create or update shadow pointers for this entry
        Device.ShadowPtrMap[PointerHstPtrBegin] = {
            HstPtrBase, PointerTgtPtrBegin, ExpectedTgtPtrBase};
        PointerTpr.Entry->setMayContainAttachedPointers();
        UpdateDevPtr = true;
      }

      if (UpdateDevPtr) {
        // Lock the entry before releasing ShadowMtx so the device-side
        // pointer update cannot race with other users of this entry.
        std::lock_guard<decltype(*PointerTpr.Entry)> LG(*PointerTpr.Entry);
        Device.ShadowMtx.unlock();

        DP("Update pointer (" DPxMOD ") -> [" DPxMOD "]\n",
           DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));

        // Stage the value in storage that outlives the async submit.
        void *&TgtPtrBase = AsyncInfo.getVoidPtrLocation();
        TgtPtrBase = ExpectedTgtPtrBase;

        int Ret = Device.submitData(PointerTgtPtrBegin, &TgtPtrBase,
                                    sizeof(void *), AsyncInfo);
        if (Ret != OFFLOAD_SUCCESS) {
          REPORT("Copying data to device failed.\n");
          return OFFLOAD_FAIL;
        }
        if (PointerTpr.Entry->addEventIfNecessary(Device, AsyncInfo) !=
            OFFLOAD_SUCCESS)
          return OFFLOAD_FAIL;
      } else
        Device.ShadowMtx.unlock();
    }
  }

  return OFFLOAD_SUCCESS;
}

namespace {
/// This structure contains information to deallocate a target pointer, aka.
/// used to fix up the shadow map and potentially delete the entry from the
/// mapping table via \p DeviceTy::deallocTgtPtr.
struct PostProcessingInfo {
  /// Host pointer used to look up into the map table
  void *HstPtrBegin;

  /// Size of the data
  int64_t DataSize;

  /// The mapping type (bitfield).
  int64_t ArgType;

  /// The target pointer information.
  TargetPointerResultTy TPR;

  /// Are we expecting to delete this entry or not. Even if set, we might not
  /// delete the entry if another thread reused the entry in the meantime.
  bool DelEntry;

  PostProcessingInfo(void *HstPtr, int64_t Size, int64_t ArgType, bool DelEntry,
                     TargetPointerResultTy TPR)
      : HstPtrBegin(HstPtr), DataSize(Size), ArgType(ArgType), TPR(TPR),
        DelEntry(DelEntry) {}
};

/// Apply \p CB to the shadow map pointer entries in the range \p Begin, to
/// \p Begin + \p Size. \p CB is called with a locked shadow pointer map and the
/// passed iterator can be updated. If the callback returns OFFLOAD_FAIL the
/// rest of the map is not checked anymore.
template <typename CBTy>
static void applyToShadowMapEntries(DeviceTy &Device, CBTy CB, void *Begin,
                                    uintptr_t Size,
                                    const TargetPointerResultTy &TPR) {
  // If we have an object that is too small to hold a pointer subobject, no need
  // to do any checking.
  if (Size < sizeof(void *))
    return;

  // If the map entry for the object was never marked as containing attached
  // pointers, no need to do any checking.
  if (!TPR.Entry || !TPR.Entry->getMayContainAttachedPointers())
    return;

  uintptr_t LB = (uintptr_t)Begin;
  uintptr_t UB = LB + Size;
  // Now we are looking into the shadow map so we need to lock it.
  std::lock_guard<decltype(Device.ShadowMtx)> LG(Device.ShadowMtx);
  for (ShadowPtrListTy::iterator Itr = Device.ShadowPtrMap.begin();
       Itr != Device.ShadowPtrMap.end();) {
    uintptr_t ShadowHstPtrAddr = (uintptr_t)Itr->first;

    // An STL map is sorted on its keys; use this property
    // to quickly determine when to break out of the loop.
    if (ShadowHstPtrAddr < LB) {
      ++Itr;
      continue;
    }
    if (ShadowHstPtrAddr >= UB)
      break;

    // The callback advances (or erases through) the iterator itself.
    if (CB(Itr) == OFFLOAD_FAIL)
      break;
  }
}

} // namespace

/// Internal function to undo the mapping and retrieve the data from the device.
677 int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, 678 void **ArgBases, void **Args, int64_t *ArgSizes, 679 int64_t *ArgTypes, map_var_info_t *ArgNames, 680 void **ArgMappers, AsyncInfoTy &AsyncInfo, bool FromMapper) { 681 int Ret; 682 std::vector<PostProcessingInfo> PostProcessingPtrs; 683 void *FromMapperBase = nullptr; 684 // process each input. 685 for (int32_t I = ArgNum - 1; I >= 0; --I) { 686 // Ignore private variables and arrays - there is no mapping for them. 687 // Also, ignore the use_device_ptr directive, it has no effect here. 688 if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) || 689 (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE)) 690 continue; 691 692 if (ArgMappers && ArgMappers[I]) { 693 // Instead of executing the regular path of targetDataEnd, call the 694 // targetDataMapper variant which will call targetDataEnd again 695 // with new arguments. 696 DP("Calling targetDataMapper for the %dth argument\n", I); 697 698 map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I]; 699 Ret = targetDataMapper(Loc, Device, ArgBases[I], Args[I], ArgSizes[I], 700 ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo, 701 targetDataEnd); 702 703 if (Ret != OFFLOAD_SUCCESS) { 704 REPORT("Call to targetDataEnd via targetDataMapper for custom mapper" 705 " failed.\n"); 706 return OFFLOAD_FAIL; 707 } 708 709 // Skip the rest of this function, continue to the next argument. 710 continue; 711 } 712 713 void *HstPtrBegin = Args[I]; 714 int64_t DataSize = ArgSizes[I]; 715 // Adjust for proper alignment if this is a combined entry (for structs). 716 // Look at the next argument - if that is MEMBER_OF this one, then this one 717 // is a combined entry. 
718 const int NextI = I + 1; 719 if (getParentIndex(ArgTypes[I]) < 0 && NextI < ArgNum && 720 getParentIndex(ArgTypes[NextI]) == I) { 721 int64_t Padding = (int64_t)HstPtrBegin % Alignment; 722 if (Padding) { 723 DP("Using a Padding of %" PRId64 " bytes for begin address " DPxMOD 724 "\n", 725 Padding, DPxPTR(HstPtrBegin)); 726 HstPtrBegin = (char *)HstPtrBegin - Padding; 727 DataSize += Padding; 728 } 729 } 730 731 bool IsLast, IsHostPtr; 732 bool IsImplicit = ArgTypes[I] & OMP_TGT_MAPTYPE_IMPLICIT; 733 bool UpdateRef = (!(ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) || 734 (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) && 735 !(FromMapper && I == 0); 736 bool ForceDelete = ArgTypes[I] & OMP_TGT_MAPTYPE_DELETE; 737 bool HasPresentModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_PRESENT; 738 bool HasHoldModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_OMPX_HOLD; 739 740 // If PTR_AND_OBJ, HstPtrBegin is address of pointee 741 TargetPointerResultTy TPR = Device.getTgtPtrBegin( 742 HstPtrBegin, DataSize, IsLast, UpdateRef, HasHoldModifier, IsHostPtr, 743 !IsImplicit, ForceDelete); 744 void *TgtPtrBegin = TPR.TargetPointer; 745 if (!TgtPtrBegin && (DataSize || HasPresentModifier)) { 746 DP("Mapping does not exist (%s)\n", 747 (HasPresentModifier ? "'present' map type modifier" : "ignored")); 748 if (HasPresentModifier) { 749 // OpenMP 5.1, sec. 2.21.7.1 "map Clause", p. 350 L10-13: 750 // "If a map clause appears on a target, target data, target enter data 751 // or target exit data construct with a present map-type-modifier then 752 // on entry to the region if the corresponding list item does not appear 753 // in the device data environment then an error occurs and the program 754 // terminates." 755 // 756 // This should be an error upon entering an "omp target exit data". It 757 // should not be an error upon exiting an "omp target data" or "omp 758 // target". For "omp target data", Clang thus doesn't include present 759 // modifiers for end calls. 
For "omp target", we have not found a valid 760 // OpenMP program for which the error matters: it appears that, if a 761 // program can guarantee that data is present at the beginning of an 762 // "omp target" region so that there's no error there, that data is also 763 // guaranteed to be present at the end. 764 MESSAGE("device mapping required by 'present' map type modifier does " 765 "not exist for host address " DPxMOD " (%" PRId64 " bytes)", 766 DPxPTR(HstPtrBegin), DataSize); 767 return OFFLOAD_FAIL; 768 } 769 } else { 770 DP("There are %" PRId64 " bytes allocated at target address " DPxMOD 771 " - is%s last\n", 772 DataSize, DPxPTR(TgtPtrBegin), (IsLast ? "" : " not")); 773 } 774 775 // OpenMP 5.1, sec. 2.21.7.1 "map Clause", p. 351 L14-16: 776 // "If the map clause appears on a target, target data, or target exit data 777 // construct and a corresponding list item of the original list item is not 778 // present in the device data environment on exit from the region then the 779 // list item is ignored." 780 if (!TgtPtrBegin) 781 continue; 782 783 bool DelEntry = IsLast; 784 785 // If the last element from the mapper (for end transfer args comes in 786 // reverse order), do not remove the partial entry, the parent struct still 787 // exists. 788 if ((ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) && 789 !(ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) { 790 DelEntry = false; // protect parent struct from being deallocated 791 } 792 793 if ((ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) || DelEntry) { 794 // Move data back to the host 795 if (ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) { 796 bool Always = ArgTypes[I] & OMP_TGT_MAPTYPE_ALWAYS; 797 if ((Always || IsLast) && !IsHostPtr) { 798 DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n", 799 DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin)); 800 801 std::lock_guard<decltype(*TPR.Entry)> LG(*TPR.Entry); 802 // Wait for any previous transfer if an event is present. 
803 if (void *Event = TPR.Entry->getEvent()) { 804 if (Device.waitEvent(Event, AsyncInfo) != OFFLOAD_SUCCESS) { 805 REPORT("Failed to wait for event " DPxMOD ".\n", DPxPTR(Event)); 806 return OFFLOAD_FAIL; 807 } 808 } 809 810 Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, DataSize, 811 AsyncInfo); 812 if (Ret != OFFLOAD_SUCCESS) { 813 REPORT("Copying data from device failed.\n"); 814 return OFFLOAD_FAIL; 815 } 816 817 // As we are expecting to delete the entry the d2h copy might race 818 // with another one that also tries to delete the entry. This happens 819 // as the entry can be reused and the reuse might happen after the 820 // copy-back was issued but before it completed. Since the reuse might 821 // also copy-back a value we would race. 822 if (IsLast) { 823 if (TPR.Entry->addEventIfNecessary(Device, AsyncInfo) != 824 OFFLOAD_SUCCESS) 825 return OFFLOAD_FAIL; 826 } 827 } 828 } 829 if (DelEntry && FromMapper && I == 0) { 830 DelEntry = false; 831 FromMapperBase = HstPtrBegin; 832 } 833 834 // Add pointer to the buffer for post-synchronize processing. 835 PostProcessingPtrs.emplace_back(HstPtrBegin, DataSize, ArgTypes[I], 836 DelEntry && !IsHostPtr, TPR); 837 } 838 } 839 840 // TODO: We should not synchronize here but pass the AsyncInfo object to the 841 // allocate/deallocate device APIs. 842 // 843 // We need to synchronize before deallocating data. 844 Ret = AsyncInfo.synchronize(); 845 if (Ret != OFFLOAD_SUCCESS) 846 return OFFLOAD_FAIL; 847 848 // Deallocate target pointer 849 for (PostProcessingInfo &Info : PostProcessingPtrs) { 850 // If we marked the entry to be deleted we need to verify no other thread 851 // reused it by now. If deletion is still supposed to happen by this thread 852 // LR will be set and exclusive access to the HDTT map will avoid another 853 // thread reusing the entry now. Note that we do not request (exclusive) 854 // access to the HDTT map if Info.DelEntry is not set. 
855 LookupResult LR; 856 DeviceTy::HDTTMapAccessorTy HDTTMap = 857 Device.HostDataToTargetMap.getExclusiveAccessor(!Info.DelEntry); 858 859 if (Info.DelEntry) { 860 LR = Device.lookupMapping(HDTTMap, Info.HstPtrBegin, Info.DataSize); 861 if (LR.Entry->getTotalRefCount() != 0 || 862 LR.Entry->getDeleteThreadId() != std::this_thread::get_id()) { 863 // The thread is not in charge of deletion anymore. Give up access to 864 // the HDTT map and unset the deletion flag. 865 HDTTMap.destroy(); 866 Info.DelEntry = false; 867 } 868 } 869 870 // If we copied back to the host a struct/array containing pointers, we 871 // need to restore the original host pointer values from their shadow 872 // copies. If the struct is going to be deallocated, remove any remaining 873 // shadow pointer entries for this struct. 874 auto CB = [&](ShadowPtrListTy::iterator &Itr) { 875 // If we copied the struct to the host, we need to restore the pointer. 876 if (Info.ArgType & OMP_TGT_MAPTYPE_FROM) { 877 void **ShadowHstPtrAddr = (void **)Itr->first; 878 *ShadowHstPtrAddr = Itr->second.HstPtrVal; 879 DP("Restoring original host pointer value " DPxMOD " for host " 880 "pointer " DPxMOD "\n", 881 DPxPTR(Itr->second.HstPtrVal), DPxPTR(ShadowHstPtrAddr)); 882 } 883 // If the struct is to be deallocated, remove the shadow entry. 884 if (Info.DelEntry) { 885 DP("Removing shadow pointer " DPxMOD "\n", DPxPTR((void **)Itr->first)); 886 Itr = Device.ShadowPtrMap.erase(Itr); 887 } else { 888 ++Itr; 889 } 890 return OFFLOAD_SUCCESS; 891 }; 892 applyToShadowMapEntries(Device, CB, Info.HstPtrBegin, Info.DataSize, 893 Info.TPR); 894 895 // If we are deleting the entry the DataMapMtx is locked and we own the 896 // entry. 
    if (Info.DelEntry) {
      if (!FromMapperBase || FromMapperBase != Info.HstPtrBegin)
        Ret = Device.deallocTgtPtr(HDTTMap, LR, Info.DataSize);

      if (Ret != OFFLOAD_SUCCESS) {
        REPORT("Deallocating data from device failed.\n");
        break;
      }
    }
  }

  return Ret;
}

/// Transfer one contiguous chunk between the host and \p Device for a "target
/// update" construct, honoring the TO/FROM/PRESENT bits in \p ArgType, and
/// fix up any shadow-pointer entries that fall inside the transferred range.
static int targetDataContiguous(ident_t *Loc, DeviceTy &Device, void *ArgsBase,
                                void *HstPtrBegin, int64_t ArgSize,
                                int64_t ArgType, AsyncInfoTy &AsyncInfo) {
  TIMESCOPE_WITH_IDENT(Loc);
  bool IsLast, IsHostPtr;
  // Look up the existing mapping only; a motion clause never creates one
  // (UpdateRefCount=false, MustContain=true).
  TargetPointerResultTy TPR = Device.getTgtPtrBegin(
      HstPtrBegin, ArgSize, IsLast, /*UpdateRefCount=*/false,
      /*UseHoldRefCount=*/false, IsHostPtr, /*MustContain=*/true);
  void *TgtPtrBegin = TPR.TargetPointer;
  if (!TgtPtrBegin) {
    // Not mapped: a plain motion clause is a no-op, but the 'present'
    // modifier turns this into a hard error.
    DP("hst data:" DPxMOD " not found, becomes a noop\n", DPxPTR(HstPtrBegin));
    if (ArgType & OMP_TGT_MAPTYPE_PRESENT) {
      MESSAGE("device mapping required by 'present' motion modifier does not "
              "exist for host address " DPxMOD " (%" PRId64 " bytes)",
              DPxPTR(HstPtrBegin), ArgSize);
      return OFFLOAD_FAIL;
    }
    return OFFLOAD_SUCCESS;
  }

  if (IsHostPtr) {
    // Unified shared memory: host and device already see the same storage.
    DP("hst data:" DPxMOD " unified and shared, becomes a noop\n",
       DPxPTR(HstPtrBegin));
    return OFFLOAD_SUCCESS;
  }

  if (ArgType & OMP_TGT_MAPTYPE_FROM) {
    DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
       ArgSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
    int Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, ArgSize, AsyncInfo);
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Copying data from device failed.\n");
      return OFFLOAD_FAIL;
    }

    // The copy-back clobbered the host copies of embedded pointers with
    // device addresses; restore each from its shadow entry.
    auto CB = [&](ShadowPtrListTy::iterator &Itr) {
      void **ShadowHstPtrAddr = (void **)Itr->first;
      // Wait for device-to-host memcopies for whole struct to complete,
      // before restoring the correct host pointer.
      if (AsyncInfo.synchronize() != OFFLOAD_SUCCESS)
        return OFFLOAD_FAIL;
      *ShadowHstPtrAddr = Itr->second.HstPtrVal;
      DP("Restoring original host pointer value " DPxMOD
         " for host pointer " DPxMOD "\n",
         DPxPTR(Itr->second.HstPtrVal), DPxPTR(ShadowHstPtrAddr));
      ++Itr;
      return OFFLOAD_SUCCESS;
    };
    applyToShadowMapEntries(Device, CB, HstPtrBegin, ArgSize, TPR);
  }

  if (ArgType & OMP_TGT_MAPTYPE_TO) {
    DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
       ArgSize, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
    int Ret = Device.submitData(TgtPtrBegin, HstPtrBegin, ArgSize, AsyncInfo);
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Copying data to device failed.\n");
      return OFFLOAD_FAIL;
    }

    // The host-to-device copy overwrote device-side embedded pointers with
    // host addresses; re-submit the correct target pointer values.
    auto CB = [&](ShadowPtrListTy::iterator &Itr) {
      DP("Restoring original target pointer value " DPxMOD " for target "
         "pointer " DPxMOD "\n",
         DPxPTR(Itr->second.TgtPtrVal), DPxPTR(Itr->second.TgtPtrAddr));
      Ret = Device.submitData(Itr->second.TgtPtrAddr, &Itr->second.TgtPtrVal,
                              sizeof(void *), AsyncInfo);
      if (Ret != OFFLOAD_SUCCESS)
        REPORT("Copying data to device failed.\n");
      ++Itr;
      return Ret;
    };
    applyToShadowMapEntries(Device, CB, HstPtrBegin, ArgSize, TPR);
  }
  return OFFLOAD_SUCCESS;
}

/// Recursively walk the dimension descriptors of a non-contiguous "target
/// update" and issue one targetDataContiguous call per innermost contiguous
/// piece, accumulating the byte \p Offset along the way.
static int targetDataNonContiguous(ident_t *Loc, DeviceTy &Device,
                                   void *ArgsBase,
                                   __tgt_target_non_contig *NonContig,
                                   uint64_t Size, int64_t ArgType,
                                   int CurrentDim, int DimSize, uint64_t Offset,
                                   AsyncInfoTy &AsyncInfo) {
  TIMESCOPE_WITH_IDENT(Loc);
  int Ret = OFFLOAD_SUCCESS;
  if (CurrentDim < DimSize) {
    for (unsigned int I = 0; I < NonContig[CurrentDim].Count; ++I) {
      uint64_t CurOffset =
          (NonContig[CurrentDim].Offset + I) * NonContig[CurrentDim].Stride;
      // we only need to transfer the first element for the last dimension
      // since we've already got a contiguous piece.
      if (CurrentDim != DimSize - 1 || I == 0) {
        Ret = targetDataNonContiguous(Loc, Device, ArgsBase, NonContig, Size,
                                      ArgType, CurrentDim + 1, DimSize,
                                      Offset + CurOffset, AsyncInfo);
        // Stop the whole process if any contiguous piece returns anything
        // other than OFFLOAD_SUCCESS.
        if (Ret != OFFLOAD_SUCCESS)
          return Ret;
      }
    }
  } else {
    // Reached the innermost level: transfer one contiguous piece.
    char *Ptr = (char *)ArgsBase + Offset;
    DP("Transfer of non-contiguous : host ptr " DPxMOD " offset %" PRIu64
       " len %" PRIu64 "\n",
       DPxPTR(Ptr), Offset, Size);
    Ret = targetDataContiguous(Loc, Device, ArgsBase, Ptr, Size, ArgType,
                               AsyncInfo);
  }
  return Ret;
}

/// Count trailing dimensions that can be merged with their parent: a
/// dimension is mergeable when Count * Stride equals the parent's stride,
/// i.e. its elements are contiguous within the parent. The recursion above
/// is then shortened by this many levels.
static int getNonContigMergedDimension(__tgt_target_non_contig *NonContig,
                                       int32_t DimSize) {
  int RemovedDim = 0;
  for (int I = DimSize - 1; I > 0; --I) {
    if (NonContig[I].Count * NonContig[I].Stride == NonContig[I - 1].Stride)
      RemovedDim++;
  }
  return RemovedDim;
}

/// Internal function to pass data to/from the target.
int targetDataUpdate(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
                     void **ArgsBase, void **Args, int64_t *ArgSizes,
                     int64_t *ArgTypes, map_var_info_t *ArgNames,
                     void **ArgMappers, AsyncInfoTy &AsyncInfo, bool) {
  // process each input.
  for (int32_t I = 0; I < ArgNum; ++I) {
    // Literal and private arguments have no device copy to update.
    if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
        (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
      continue;

    if (ArgMappers && ArgMappers[I]) {
      // Instead of executing the regular path of targetDataUpdate, call the
      // targetDataMapper variant which will call targetDataUpdate again
      // with new arguments.
      DP("Calling targetDataMapper for the %dth argument\n", I);

      map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
      int Ret = targetDataMapper(Loc, Device, ArgsBase[I], Args[I], ArgSizes[I],
                                 ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
                                 targetDataUpdate);

      if (Ret != OFFLOAD_SUCCESS) {
        REPORT("Call to targetDataUpdate via targetDataMapper for custom mapper"
               " failed.\n");
        return OFFLOAD_FAIL;
      }

      // Skip the rest of this function, continue to the next argument.
      continue;
    }

    int Ret = OFFLOAD_SUCCESS;

    if (ArgTypes[I] & OMP_TGT_MAPTYPE_NON_CONTIG) {
      // For non-contiguous updates, Args[I] carries an array of dimension
      // descriptors and ArgSizes[I] the number of dimensions.
      __tgt_target_non_contig *NonContig = (__tgt_target_non_contig *)Args[I];
      int32_t DimSize = ArgSizes[I];
      uint64_t Size =
          NonContig[DimSize - 1].Count * NonContig[DimSize - 1].Stride;
      int32_t MergedDim = getNonContigMergedDimension(NonContig, DimSize);
      Ret = targetDataNonContiguous(
          Loc, Device, ArgsBase[I], NonContig, Size, ArgTypes[I],
          /*current_dim=*/0, DimSize - MergedDim, /*offset=*/0, AsyncInfo);
    } else {
      Ret = targetDataContiguous(Loc, Device, ArgsBase[I], Args[I], ArgSizes[I],
                                 ArgTypes[I], AsyncInfo);
    }
    if (Ret == OFFLOAD_FAIL)
      return OFFLOAD_FAIL;
  }
  return OFFLOAD_SUCCESS;
}

/// Combination of map-type bits that identifies a mapped lambda capture.
static const unsigned LambdaMapping = OMP_TGT_MAPTYPE_PTR_AND_OBJ |
                                      OMP_TGT_MAPTYPE_LITERAL |
                                      OMP_TGT_MAPTYPE_IMPLICIT;
static bool isLambdaMapping(int64_t Mapping) {
  return (Mapping & LambdaMapping) == LambdaMapping;
}

namespace {
/// Find the table information in the map or look it up in the translation
/// tables.
TableMap *getTableMap(void *HostPtr) {
  std::lock_guard<std::mutex> TblMapLock(PM->TblMapMtx);
  HostPtrToTableMapTy::iterator TableMapIt =
      PM->HostPtrToTableMap.find(HostPtr);

  if (TableMapIt != PM->HostPtrToTableMap.end())
    return &TableMapIt->second;

  // We don't have a map. So search all the registered libraries.
  TableMap *TM = nullptr;
  std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
  for (HostEntriesBeginToTransTableTy::iterator Itr =
           PM->HostEntriesBeginToTransTable.begin();
       Itr != PM->HostEntriesBeginToTransTable.end(); ++Itr) {
    // get the translation table (which contains all the good info).
    TranslationTable *TransTable = &Itr->second;
    // iterate over all the host table entries to see if we can locate the
    // host_ptr.
    __tgt_offload_entry *Cur = TransTable->HostTable.EntriesBegin;
    for (uint32_t I = 0; Cur < TransTable->HostTable.EntriesEnd; ++Cur, ++I) {
      if (Cur->addr != HostPtr)
        continue;
      // we got a match, now fill the HostPtrToTableMap so that we
      // may avoid this search next time.
      TM = &(PM->HostPtrToTableMap)[HostPtr];
      TM->Table = TransTable;
      TM->Index = I;
      return TM;
    }
  }

  return nullptr;
}

/// Get loop trip count
/// FIXME: This function will not work right if calling
/// __kmpc_push_target_tripcount_mapper in one thread but doing offloading in
/// another thread, which might occur when we call task yield.
uint64_t getLoopTripCount(int64_t DeviceId) {
  DeviceTy &Device = *PM->Devices[DeviceId];
  uint64_t LoopTripCount = 0;

  {
    std::lock_guard<std::mutex> TblMapLock(PM->TblMapMtx);
    // Trip counts are recorded per thread; consume (erase) the entry so it
    // only applies to the next target region launched by this thread.
    auto I = Device.LoopTripCnt.find(__kmpc_global_thread_num(NULL));
    if (I != Device.LoopTripCnt.end()) {
      LoopTripCount = I->second;
      Device.LoopTripCnt.erase(I);
    }
  }

  return LoopTripCount;
}

/// A class manages private arguments in a target region.
class PrivateArgumentManagerTy {
  /// A data structure for the information of first-private arguments. We can
  /// use this information to optimize data transfer by packing all
  /// first-private arguments and transfer them all at once.
1154 struct FirstPrivateArgInfoTy { 1155 /// The index of the element in \p TgtArgs corresponding to the argument 1156 const int Index; 1157 /// Host pointer begin 1158 const char *HstPtrBegin; 1159 /// Host pointer end 1160 const char *HstPtrEnd; 1161 /// Aligned size 1162 const int64_t AlignedSize; 1163 /// Host pointer name 1164 const map_var_info_t HstPtrName = nullptr; 1165 1166 FirstPrivateArgInfoTy(int Index, const void *HstPtr, int64_t Size, 1167 const map_var_info_t HstPtrName = nullptr) 1168 : Index(Index), HstPtrBegin(reinterpret_cast<const char *>(HstPtr)), 1169 HstPtrEnd(HstPtrBegin + Size), AlignedSize(Size + Size % Alignment), 1170 HstPtrName(HstPtrName) {} 1171 }; 1172 1173 /// A vector of target pointers for all private arguments 1174 std::vector<void *> TgtPtrs; 1175 1176 /// A vector of information of all first-private arguments to be packed 1177 std::vector<FirstPrivateArgInfoTy> FirstPrivateArgInfo; 1178 /// Host buffer for all arguments to be packed 1179 std::vector<char> FirstPrivateArgBuffer; 1180 /// The total size of all arguments to be packed 1181 int64_t FirstPrivateArgSize = 0; 1182 1183 /// A reference to the \p DeviceTy object 1184 DeviceTy &Device; 1185 /// A pointer to a \p AsyncInfoTy object 1186 AsyncInfoTy &AsyncInfo; 1187 1188 // TODO: What would be the best value here? Should we make it configurable? 1189 // If the size is larger than this threshold, we will allocate and transfer it 1190 // immediately instead of packing it. 
1191 static constexpr const int64_t FirstPrivateArgSizeThreshold = 1024; 1192 1193 public: 1194 /// Constructor 1195 PrivateArgumentManagerTy(DeviceTy &Dev, AsyncInfoTy &AsyncInfo) 1196 : Device(Dev), AsyncInfo(AsyncInfo) {} 1197 1198 /// Add a private argument 1199 int addArg(void *HstPtr, int64_t ArgSize, int64_t ArgOffset, 1200 bool IsFirstPrivate, void *&TgtPtr, int TgtArgsIndex, 1201 const map_var_info_t HstPtrName = nullptr, 1202 const bool AllocImmediately = false) { 1203 // If the argument is not first-private, or its size is greater than a 1204 // predefined threshold, we will allocate memory and issue the transfer 1205 // immediately. 1206 if (ArgSize > FirstPrivateArgSizeThreshold || !IsFirstPrivate || 1207 AllocImmediately) { 1208 TgtPtr = Device.allocData(ArgSize, HstPtr); 1209 if (!TgtPtr) { 1210 DP("Data allocation for %sprivate array " DPxMOD " failed.\n", 1211 (IsFirstPrivate ? "first-" : ""), DPxPTR(HstPtr)); 1212 return OFFLOAD_FAIL; 1213 } 1214 #ifdef OMPTARGET_DEBUG 1215 void *TgtPtrBase = (void *)((intptr_t)TgtPtr + ArgOffset); 1216 DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD 1217 " for %sprivate array " DPxMOD " - pushing target argument " DPxMOD 1218 "\n", 1219 ArgSize, DPxPTR(TgtPtr), (IsFirstPrivate ? "first-" : ""), 1220 DPxPTR(HstPtr), DPxPTR(TgtPtrBase)); 1221 #endif 1222 // If first-private, copy data from host 1223 if (IsFirstPrivate) { 1224 DP("Submitting firstprivate data to the device.\n"); 1225 int Ret = Device.submitData(TgtPtr, HstPtr, ArgSize, AsyncInfo); 1226 if (Ret != OFFLOAD_SUCCESS) { 1227 DP("Copying data to device failed, failed.\n"); 1228 return OFFLOAD_FAIL; 1229 } 1230 } 1231 TgtPtrs.push_back(TgtPtr); 1232 } else { 1233 DP("Firstprivate array " DPxMOD " of size %" PRId64 " will be packed\n", 1234 DPxPTR(HstPtr), ArgSize); 1235 // When reach this point, the argument must meet all following 1236 // requirements: 1237 // 1. 
Its size does not exceed the threshold (see the comment for 1238 // FirstPrivateArgSizeThreshold); 1239 // 2. It must be first-private (needs to be mapped to target device). 1240 // We will pack all this kind of arguments to transfer them all at once 1241 // to reduce the number of data transfer. We will not take 1242 // non-first-private arguments, aka. private arguments that doesn't need 1243 // to be mapped to target device, into account because data allocation 1244 // can be very efficient with memory manager. 1245 1246 // Placeholder value 1247 TgtPtr = nullptr; 1248 FirstPrivateArgInfo.emplace_back(TgtArgsIndex, HstPtr, ArgSize, 1249 HstPtrName); 1250 FirstPrivateArgSize += FirstPrivateArgInfo.back().AlignedSize; 1251 } 1252 1253 return OFFLOAD_SUCCESS; 1254 } 1255 1256 /// Pack first-private arguments, replace place holder pointers in \p TgtArgs, 1257 /// and start the transfer. 1258 int packAndTransfer(std::vector<void *> &TgtArgs) { 1259 if (!FirstPrivateArgInfo.empty()) { 1260 assert(FirstPrivateArgSize != 0 && 1261 "FirstPrivateArgSize is 0 but FirstPrivateArgInfo is empty"); 1262 FirstPrivateArgBuffer.resize(FirstPrivateArgSize, 0); 1263 auto Itr = FirstPrivateArgBuffer.begin(); 1264 // Copy all host data to this buffer 1265 for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) { 1266 std::copy(Info.HstPtrBegin, Info.HstPtrEnd, Itr); 1267 Itr = std::next(Itr, Info.AlignedSize); 1268 } 1269 // Allocate target memory 1270 void *TgtPtr = 1271 Device.allocData(FirstPrivateArgSize, FirstPrivateArgBuffer.data()); 1272 if (TgtPtr == nullptr) { 1273 DP("Failed to allocate target memory for private arguments.\n"); 1274 return OFFLOAD_FAIL; 1275 } 1276 TgtPtrs.push_back(TgtPtr); 1277 DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD "\n", 1278 FirstPrivateArgSize, DPxPTR(TgtPtr)); 1279 // Transfer data to target device 1280 int Ret = Device.submitData(TgtPtr, FirstPrivateArgBuffer.data(), 1281 FirstPrivateArgSize, AsyncInfo); 1282 if (Ret != 
OFFLOAD_SUCCESS) { 1283 DP("Failed to submit data of private arguments.\n"); 1284 return OFFLOAD_FAIL; 1285 } 1286 // Fill in all placeholder pointers 1287 auto TP = reinterpret_cast<uintptr_t>(TgtPtr); 1288 for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) { 1289 void *&Ptr = TgtArgs[Info.Index]; 1290 assert(Ptr == nullptr && "Target pointer is already set by mistaken"); 1291 Ptr = reinterpret_cast<void *>(TP); 1292 TP += Info.AlignedSize; 1293 DP("Firstprivate array " DPxMOD " of size %" PRId64 " mapped to " DPxMOD 1294 "\n", 1295 DPxPTR(Info.HstPtrBegin), Info.HstPtrEnd - Info.HstPtrBegin, 1296 DPxPTR(Ptr)); 1297 } 1298 } 1299 1300 return OFFLOAD_SUCCESS; 1301 } 1302 1303 /// Free all target memory allocated for private arguments 1304 int free() { 1305 for (void *P : TgtPtrs) { 1306 int Ret = Device.deleteData(P); 1307 if (Ret != OFFLOAD_SUCCESS) { 1308 DP("Deallocation of (first-)private arrays failed.\n"); 1309 return OFFLOAD_FAIL; 1310 } 1311 } 1312 1313 TgtPtrs.clear(); 1314 1315 return OFFLOAD_SUCCESS; 1316 } 1317 }; 1318 1319 /// Process data before launching the kernel, including calling targetDataBegin 1320 /// to map and transfer data to target device, transferring (first-)private 1321 /// variables. 
static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr,
                             int32_t ArgNum, void **ArgBases, void **Args,
                             int64_t *ArgSizes, int64_t *ArgTypes,
                             map_var_info_t *ArgNames, void **ArgMappers,
                             std::vector<void *> &TgtArgs,
                             std::vector<ptrdiff_t> &TgtOffsets,
                             PrivateArgumentManagerTy &PrivateArgumentManager,
                             AsyncInfoTy &AsyncInfo) {
  TIMESCOPE_WITH_NAME_AND_IDENT("mappingBeforeTargetRegion", Loc);
  DeviceTy &Device = *PM->Devices[DeviceId];
  int Ret = targetDataBegin(Loc, Device, ArgNum, ArgBases, Args, ArgSizes,
                            ArgTypes, ArgNames, ArgMappers, AsyncInfo);
  if (Ret != OFFLOAD_SUCCESS) {
    REPORT("Call to targetDataBegin failed, abort target.\n");
    return OFFLOAD_FAIL;
  }

  // Maps argument index I to its position in TgtArgs (-1 if not pushed);
  // needed to resolve MEMBER_OF parent references below.
  std::vector<int> TgtArgsPositions(ArgNum, -1);

  for (int32_t I = 0; I < ArgNum; ++I) {
    if (!(ArgTypes[I] & OMP_TGT_MAPTYPE_TARGET_PARAM)) {
      // This is not a target parameter, do not push it into TgtArgs.
      // Check for lambda mapping.
      if (isLambdaMapping(ArgTypes[I])) {
        assert((ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
               "PTR_AND_OBJ must be also MEMBER_OF.");
        unsigned Idx = getParentIndex(ArgTypes[I]);
        int TgtIdx = TgtArgsPositions[Idx];
        assert(TgtIdx != -1 && "Base address must be translated already.");
        // The parent lambda must be processed already and it must be the last
        // in TgtArgs and TgtOffsets arrays.
        void *HstPtrVal = Args[I];
        void *HstPtrBegin = ArgBases[I];
        void *HstPtrBase = Args[Idx];
        bool IsLast, IsHostPtr; // IsLast is unused.
        void *TgtPtrBase =
            (void *)((intptr_t)TgtArgs[TgtIdx] + TgtOffsets[TgtIdx]);
        DP("Parent lambda base " DPxMOD "\n", DPxPTR(TgtPtrBase));
        // The captured field lives at the same offset inside the device copy
        // of the lambda object as it does inside the host copy.
        uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
        void *TgtPtrBegin = (void *)((uintptr_t)TgtPtrBase + Delta);
        // Storage must outlive a potentially asynchronous submitData below,
        // hence the AsyncInfo-owned location rather than a stack local.
        void *&PointerTgtPtrBegin = AsyncInfo.getVoidPtrLocation();
        TargetPointerResultTy TPR = Device.getTgtPtrBegin(
            HstPtrVal, ArgSizes[I], IsLast, /*UpdateRefCount=*/false,
            /*UseHoldRefCount=*/false, IsHostPtr);
        PointerTgtPtrBegin = TPR.TargetPointer;
        if (!PointerTgtPtrBegin) {
          DP("No lambda captured variable mapped (" DPxMOD ") - ignored\n",
             DPxPTR(HstPtrVal));
          continue;
        }
        if (IsHostPtr) {
          DP("Unified memory is active, no need to map lambda captured"
             "variable (" DPxMOD ")\n",
             DPxPTR(HstPtrVal));
          continue;
        }
        DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n",
           DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));
        Ret = Device.submitData(TgtPtrBegin, &PointerTgtPtrBegin,
                                sizeof(void *), AsyncInfo);
        if (Ret != OFFLOAD_SUCCESS) {
          REPORT("Copying data to device failed.\n");
          return OFFLOAD_FAIL;
        }
      }
      continue;
    }
    void *HstPtrBegin = Args[I];
    void *HstPtrBase = ArgBases[I];
    void *TgtPtrBegin;
    map_var_info_t HstPtrName = (!ArgNames) ? nullptr : ArgNames[I];
    ptrdiff_t TgtBaseOffset;
    bool IsLast, IsHostPtr; // unused.
    TargetPointerResultTy TPR;
    if (ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) {
      // Literal values are passed by value in the argument slot itself.
      DP("Forwarding first-private value " DPxMOD " to the target construct\n",
         DPxPTR(HstPtrBase));
      TgtPtrBegin = HstPtrBase;
      TgtBaseOffset = 0;
    } else if (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE) {
      TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
      const bool IsFirstPrivate = (ArgTypes[I] & OMP_TGT_MAPTYPE_TO);
      // If there is a next argument and it depends on the current one, we need
      // to allocate the private memory immediately. If this is not the case,
      // then the argument can be marked for optimization and packed with the
      // other privates.
      const bool AllocImmediately =
          (I < ArgNum - 1 && (ArgTypes[I + 1] & OMP_TGT_MAPTYPE_MEMBER_OF));
      Ret = PrivateArgumentManager.addArg(
          HstPtrBegin, ArgSizes[I], TgtBaseOffset, IsFirstPrivate, TgtPtrBegin,
          TgtArgs.size(), HstPtrName, AllocImmediately);
      if (Ret != OFFLOAD_SUCCESS) {
        REPORT("Failed to process %sprivate argument " DPxMOD "\n",
               (IsFirstPrivate ? "first-" : ""), DPxPTR(HstPtrBegin));
        return OFFLOAD_FAIL;
      }
    } else {
      if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)
        HstPtrBase = *reinterpret_cast<void **>(HstPtrBase);
      TPR = Device.getTgtPtrBegin(HstPtrBegin, ArgSizes[I], IsLast,
                                  /*UpdateRefCount=*/false,
                                  /*UseHoldRefCount=*/false, IsHostPtr);
      TgtPtrBegin = TPR.TargetPointer;
      TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
#ifdef OMPTARGET_DEBUG
      void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset);
      DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD "\n",
         DPxPTR(TgtPtrBase), DPxPTR(HstPtrBegin));
#endif
    }
    TgtArgsPositions[I] = TgtArgs.size();
    TgtArgs.push_back(TgtPtrBegin);
    TgtOffsets.push_back(TgtBaseOffset);
  }

  assert(TgtArgs.size() == TgtOffsets.size() &&
         "Size mismatch in arguments and offsets");

  // Pack and transfer first-private arguments
  Ret = PrivateArgumentManager.packAndTransfer(TgtArgs);
  if (Ret != OFFLOAD_SUCCESS) {
    DP("Failed to pack and transfer first private arguments\n");
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

/// Process data after launching the kernel, including transferring data back to
/// host if needed and deallocating target memory of (first-)private variables.
static int processDataAfter(ident_t *Loc, int64_t DeviceId, void *HostPtr,
                            int32_t ArgNum, void **ArgBases, void **Args,
                            int64_t *ArgSizes, int64_t *ArgTypes,
                            map_var_info_t *ArgNames, void **ArgMappers,
                            PrivateArgumentManagerTy &PrivateArgumentManager,
                            AsyncInfoTy &AsyncInfo) {
  TIMESCOPE_WITH_NAME_AND_IDENT("mappingAfterTargetRegion", Loc);
  DeviceTy &Device = *PM->Devices[DeviceId];

  // Move data from device.
  int Ret = targetDataEnd(Loc, Device, ArgNum, ArgBases, Args, ArgSizes,
                          ArgTypes, ArgNames, ArgMappers, AsyncInfo);
  if (Ret != OFFLOAD_SUCCESS) {
    REPORT("Call to targetDataEnd failed, abort target.\n");
    return OFFLOAD_FAIL;
  }

  // Free target memory for private arguments
  Ret = PrivateArgumentManager.free();
  if (Ret != OFFLOAD_SUCCESS) {
    REPORT("Failed to deallocate target memory for private args\n");
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}
} // namespace

/// performs the same actions as data_begin in case arg_num is
/// non-zero and initiates run of the offloaded region on the target platform;
/// if arg_num is non-zero after the region execution is done it also
/// performs the same action as data_update and data_end above. This function
/// returns 0 if it was able to transfer the execution to a target and an
/// integer different from zero otherwise.
int target(ident_t *Loc, DeviceTy &Device, void *HostPtr, int32_t ArgNum,
           void **ArgBases, void **Args, int64_t *ArgSizes, int64_t *ArgTypes,
           map_var_info_t *ArgNames, void **ArgMappers, int32_t TeamNum,
           int32_t ThreadLimit, uint64_t Tripcount, int IsTeamConstruct,
           AsyncInfoTy &AsyncInfo) {
  int32_t DeviceId = Device.DeviceID;
  TableMap *TM = getTableMap(HostPtr);
  // No map for this host pointer found!
  if (!TM) {
    REPORT("Host ptr " DPxMOD " does not have a matching target pointer.\n",
           DPxPTR(HostPtr));
    return OFFLOAD_FAIL;
  }

  // get target table.
  __tgt_target_table *TargetTable = nullptr;
  {
    std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
    assert(TM->Table->TargetsTable.size() > (size_t)DeviceId &&
           "Not expecting a device ID outside the table's bounds!");
    TargetTable = TM->Table->TargetsTable[DeviceId];
  }
  assert(TargetTable && "Global data has not been mapped\n");

  // FIXME: Use legacy tripcount method if it is '-1'.
  Tripcount = Tripcount == -1 ? getLoopTripCount(DeviceId) : Tripcount;
  DP("loop trip count is %" PRIu64 ".\n", Tripcount);

  // We need to keep bases and offsets separate. Sometimes (e.g. in OpenCL) we
  // need to manifest base pointers prior to launching a kernel. Even if we have
  // mapped an object only partially, e.g. A[N:M], although the kernel is
  // expected to access elements starting at address &A[N] and beyond, we still
  // need to manifest the base of the array &A[0]. In other cases, e.g. the COI
  // API, we need the begin address itself, i.e. &A[N], as the API operates on
  // begin addresses, not bases. That's why we pass args and offsets as two
  // separate entities so that each plugin can do what it needs. This behavior
  // was introduced via https://reviews.llvm.org/D33028 and commit 1546d319244c.
  std::vector<void *> TgtArgs;
  std::vector<ptrdiff_t> TgtOffsets;

  PrivateArgumentManagerTy PrivateArgumentManager(Device, AsyncInfo);

  int Ret;
  if (ArgNum) {
    // Process data, such as data mapping, before launching the kernel
    Ret = processDataBefore(Loc, DeviceId, HostPtr, ArgNum, ArgBases, Args,
                            ArgSizes, ArgTypes, ArgNames, ArgMappers, TgtArgs,
                            TgtOffsets, PrivateArgumentManager, AsyncInfo);
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Failed to process data before launching the kernel.\n");
      return OFFLOAD_FAIL;
    }
  }

  // Launch device execution.
  void *TgtEntryPtr = TargetTable->EntriesBegin[TM->Index].addr;
  DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n",
     TargetTable->EntriesBegin[TM->Index].name, DPxPTR(TgtEntryPtr), TM->Index);

  {
    TIMESCOPE_WITH_NAME_AND_IDENT(
        IsTeamConstruct ? "runTargetTeamRegion" : "runTargetRegion", Loc);
    if (IsTeamConstruct)
      Ret = Device.runTeamRegion(TgtEntryPtr, &TgtArgs[0], &TgtOffsets[0],
                                 TgtArgs.size(), TeamNum, ThreadLimit,
                                 Tripcount, AsyncInfo);
    else
      Ret = Device.runRegion(TgtEntryPtr, &TgtArgs[0], &TgtOffsets[0],
                             TgtArgs.size(), AsyncInfo);
  }

  if (Ret != OFFLOAD_SUCCESS) {
    REPORT("Executing target region abort target.\n");
    return OFFLOAD_FAIL;
  }

  if (ArgNum) {
    // Transfer data back and deallocate target memory for (first-)private
    // variables
    Ret = processDataAfter(Loc, DeviceId, HostPtr, ArgNum, ArgBases, Args,
                           ArgSizes, ArgTypes, ArgNames, ArgMappers,
                           PrivateArgumentManager, AsyncInfo);
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Failed to process data after launching the kernel.\n");
      return OFFLOAD_FAIL;
    }
  }

  return OFFLOAD_SUCCESS;
}