//===------ omptarget.cpp - Target independent OpenMP target RTL -- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Implementation of the interface to be used by Clang during the codegen of a
// target region.
//
//===----------------------------------------------------------------------===//

#include "omptarget.h"
#include "device.h"
#include "private.h"
#include "rtl.h"

#include <cassert>
#include <cstdint>
#include <vector>

int AsyncInfoTy::synchronize() {
  int Result = OFFLOAD_SUCCESS;
  if (AsyncInfo.Queue) {
    // If we have a queue we need to synchronize it now.
    Result = Device.synchronize(*this);
    assert(AsyncInfo.Queue == nullptr &&
           "The device plugin should have nulled the queue to indicate there "
           "are no outstanding actions!");
  }
  return Result;
}

void *&AsyncInfoTy::getVoidPtrLocation() {
  BufferLocations.push_back(nullptr);
  return BufferLocations.back();
}

/* All begin addresses for partially mapped structs must be 8-aligned in order
 * to ensure proper alignment of members. E.g.
 *
 * struct S {
 *   int a;   // 4-aligned
 *   int b;   // 4-aligned
 *   int *p;  // 8-aligned
 * } s1;
 * ...
 * #pragma omp target map(tofrom: s1.b, s1.p[0:N])
 * {
 *   s1.b = 5;
 *   for (int i...) s1.p[i] = ...;
 * }
 *
 * Here we are mapping s1 starting from member b, so BaseAddress=&s1=&s1.a and
 * BeginAddress=&s1.b. Let's assume that the struct begins at address 0x100,
 * then &s1.a=0x100, &s1.b=0x104, &s1.p=0x108. Each member obeys the alignment
 * requirements for its type. Now, when we allocate memory on the device, in
 * CUDA's case cuMemAlloc() returns an address which is at least 256-aligned.
 * This means that the chunk of the struct on the device will start at a
 * 256-aligned address, let's say 0x200. Then the address of b will be 0x200 and
 * address of p will be a misaligned 0x204 (on the host there was no need to add
 * padding between b and p, so p comes exactly 4 bytes after b). If the device
 * kernel tries to access s1.p, a misaligned address error occurs (as reported
 * by the CUDA plugin). By padding the begin address down to a multiple of 8 and
 * extending the size of the allocated chunk accordingly, the chunk on the
 * device will start at 0x200 with the padding (4 bytes), then &s1.b=0x204 and
 * &s1.p=0x208, as they should be to satisfy the alignment requirements.
 */
static const int64_t Alignment = 8;
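
// Note: the padding applied in targetDataBegin() and targetDataEnd() below is
// computed as (int64_t)HstPtrBegin % Alignment, i.e. the begin address of a
// partially mapped struct is moved down to the previous multiple of 8 and the
// mapped size is extended by the same amount, exactly as described above.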

/// Map global data and execute pending ctors
static int InitLibrary(DeviceTy &Device) {
  /*
   * Map global data
   */
  int32_t device_id = Device.DeviceID;
  int rc = OFFLOAD_SUCCESS;
  bool supportsEmptyImages = Device.RTL->supports_empty_images &&
                             Device.RTL->supports_empty_images() > 0;

  std::lock_guard<decltype(Device.PendingGlobalsMtx)> LG(
      Device.PendingGlobalsMtx);
  {
    std::lock_guard<decltype(PM->TrlTblMtx)> LG(PM->TrlTblMtx);
    for (auto *HostEntriesBegin : PM->HostEntriesBeginRegistrationOrder) {
      TranslationTable *TransTable =
          &PM->HostEntriesBeginToTransTable[HostEntriesBegin];
      if (TransTable->HostTable.EntriesBegin ==
              TransTable->HostTable.EntriesEnd &&
          !supportsEmptyImages) {
        // No host entry so no need to proceed
        continue;
      }

      if (TransTable->TargetsTable[device_id] != 0) {
        // Library entries have already been processed
        continue;
      }

      // 1) get image.
      assert(TransTable->TargetsImages.size() > (size_t)device_id &&
             "Not expecting a device ID outside the table's bounds!");
      __tgt_device_image *img = TransTable->TargetsImages[device_id];
      if (!img) {
        REPORT("No image loaded for device id %d.\n", device_id);
        rc = OFFLOAD_FAIL;
        break;
      }
      // 2) load image into the target table.
      __tgt_target_table *TargetTable = TransTable->TargetsTable[device_id] =
          Device.load_binary(img);
      // Unable to get table for this image: invalidate image and fail.
      if (!TargetTable) {
        REPORT("Unable to generate entries table for device id %d.\n",
               device_id);
        TransTable->TargetsImages[device_id] = 0;
        rc = OFFLOAD_FAIL;
        break;
      }

      // Verify whether the two table sizes match.
      size_t hsize =
          TransTable->HostTable.EntriesEnd - TransTable->HostTable.EntriesBegin;
      size_t tsize = TargetTable->EntriesEnd - TargetTable->EntriesBegin;

      // Invalid image for these host entries!
      if (hsize != tsize) {
        REPORT(
            "Host and Target tables mismatch for device id %d [%zx != %zx].\n",
            device_id, hsize, tsize);
        TransTable->TargetsImages[device_id] = 0;
        TransTable->TargetsTable[device_id] = 0;
        rc = OFFLOAD_FAIL;
        break;
      }

      DeviceTy::HDTTMapAccessorTy HDTTMap =
          Device.HostDataToTargetMap.getExclusiveAccessor();

      __tgt_target_table *HostTable = &TransTable->HostTable;
      for (__tgt_offload_entry *CurrDeviceEntry = TargetTable->EntriesBegin,
                               *CurrHostEntry = HostTable->EntriesBegin,
                               *EntryDeviceEnd = TargetTable->EntriesEnd;
           CurrDeviceEntry != EntryDeviceEnd;
           CurrDeviceEntry++, CurrHostEntry++) {
        if (CurrDeviceEntry->size != 0) {
          // has data.
          assert(CurrDeviceEntry->size == CurrHostEntry->size &&
                 "data size mismatch");

          // Fortran may use multiple weak declarations for the same symbol,
          // therefore we must allow for multiple weak symbols to be loaded
          // from the fat binary. Treat these mappings as any other "regular"
          // mapping. Add entry to map.
          if (Device.getTgtPtrBegin(HDTTMap, CurrHostEntry->addr,
                                    CurrHostEntry->size))
            continue;

          DP("Add mapping from host " DPxMOD " to device " DPxMOD
             " with size %zu"
             "\n",
             DPxPTR(CurrHostEntry->addr), DPxPTR(CurrDeviceEntry->addr),
             CurrDeviceEntry->size);
          HDTTMap->emplace(new HostDataToTargetTy(
              (uintptr_t)CurrHostEntry->addr /*HstPtrBase*/,
              (uintptr_t)CurrHostEntry->addr /*HstPtrBegin*/,
              (uintptr_t)CurrHostEntry->addr +
                  CurrHostEntry->size /*HstPtrEnd*/,
              (uintptr_t)CurrDeviceEntry->addr /*TgtPtrBegin*/,
              false /*UseHoldRefCount*/, CurrHostEntry->name,
              true /*IsRefCountINF*/));
        }
      }
    }
  }

  if (rc != OFFLOAD_SUCCESS) {
    return rc;
  }

  /*
   * Run ctors for static objects
   */
  if (!Device.PendingCtorsDtors.empty()) {
    AsyncInfoTy AsyncInfo(Device);
    // Call all ctors for all libraries registered so far
    for (auto &lib : Device.PendingCtorsDtors) {
      if (!lib.second.PendingCtors.empty()) {
        DP("Has pending ctors... call now\n");
        for (auto &entry : lib.second.PendingCtors) {
          void *ctor = entry;
          int rc =
              target(nullptr, Device, ctor, 0, nullptr, nullptr, nullptr,
                     nullptr, nullptr, nullptr, 1, 1, true /*team*/, AsyncInfo);
          if (rc != OFFLOAD_SUCCESS) {
            REPORT("Running ctor " DPxMOD " failed.\n", DPxPTR(ctor));
            return OFFLOAD_FAIL;
          }
        }
        // Clear the list to indicate that this device has been used
        lib.second.PendingCtors.clear();
        DP("Done with pending ctors for lib " DPxMOD "\n", DPxPTR(lib.first));
      }
    }
    // All constructors have been issued, wait for them now.
    if (AsyncInfo.synchronize() != OFFLOAD_SUCCESS)
      return OFFLOAD_FAIL;
  }
  Device.HasPendingGlobals = false;

  return OFFLOAD_SUCCESS;
}

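/// Report the outcome of an offload attempt and enforce the current offload
/// policy: aborts when offloading was attempted while disabled, or when it
/// failed while mandatory; dumps the device pointer mapping tables when
/// OMP_INFOTYPE_DUMP_TABLE is requested.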
void handleTargetOutcome(bool Success, ident_t *Loc) {
  switch (PM->TargetOffloadPolicy) {
  case tgt_disabled:
    if (Success) {
      FATAL_MESSAGE0(1, "expected no offloading while offloading is disabled");
    }
    break;
  case tgt_default:
    FATAL_MESSAGE0(1, "default offloading policy must be switched to "
                      "mandatory or disabled");
    break;
  case tgt_mandatory:
    if (!Success) {
      if (getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE)
        for (auto &Device : PM->Devices)
          dumpTargetPointerMappings(Loc, *Device);
      else
        FAILURE_MESSAGE("Consult https://openmp.llvm.org/design/Runtimes.html "
                        "for debugging options.\n");

      SourceInfo info(Loc);
      if (info.isAvailible())
        fprintf(stderr, "%s:%d:%d: ", info.getFilename(), info.getLine(),
                info.getColumn());
      else
        FAILURE_MESSAGE("Source location information not present. Compile "
                        "with -g or -gline-tables-only.\n");
      FATAL_MESSAGE0(
          1, "failure of target construct while offloading is mandatory");
    } else {
      if (getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE)
        for (auto &Device : PM->Devices)
          dumpTargetPointerMappings(Loc, *Device);
    }
    break;
  }
}

static void handleDefaultTargetOffload() {
  std::lock_guard<decltype(PM->TargetOffloadMtx)> LG(PM->TargetOffloadMtx);
  if (PM->TargetOffloadPolicy == tgt_default) {
    if (omp_get_num_devices() > 0) {
      DP("Default TARGET OFFLOAD policy is now mandatory "
         "(devices were found)\n");
      PM->TargetOffloadPolicy = tgt_mandatory;
    } else {
      DP("Default TARGET OFFLOAD policy is now disabled "
         "(no devices were found)\n");
      PM->TargetOffloadPolicy = tgt_disabled;
    }
  }
}

static bool isOffloadDisabled() {
  if (PM->TargetOffloadPolicy == tgt_default)
    handleDefaultTargetOffload();
  return PM->TargetOffloadPolicy == tgt_disabled;
}

// If offload is enabled, ensure that device DeviceID has been initialized,
// global ctors have been executed, and global data has been mapped.
//
// The return bool indicates if the offload is to the host device.
// There are three possible results:
// - Return false if the target device is ready for offload.
// - Return true without reporting a runtime error if offload is
//   disabled, perhaps because the initial device was specified.
// - Report a runtime error and return true.
//
// If DeviceID == OFFLOAD_DEVICE_DEFAULT, set DeviceID to the default device.
// This step might be skipped if offload is disabled.
bool checkDeviceAndCtors(int64_t &DeviceID, ident_t *Loc) {
  if (isOffloadDisabled()) {
    DP("Offload is disabled\n");
    return true;
  }

  if (DeviceID == OFFLOAD_DEVICE_DEFAULT) {
    DeviceID = omp_get_default_device();
    DP("Use default device id %" PRId64 "\n", DeviceID);
  }

  // Proposed behavior for OpenMP 5.2 in OpenMP spec github issue 2669.
  if (omp_get_num_devices() == 0) {
    DP("omp_get_num_devices() == 0 but offload is mandatory\n");
    handleTargetOutcome(false, Loc);
    return true;
  }

  if (DeviceID == omp_get_initial_device()) {
    DP("Device is host (%" PRId64 "), returning as if offload is disabled\n",
       DeviceID);
    return true;
  }

  // Is device ready?
  if (!device_is_ready(DeviceID)) {
    REPORT("Device %" PRId64 " is not ready.\n", DeviceID);
    handleTargetOutcome(false, Loc);
    return true;
  }

  // Get device info.
  DeviceTy &Device = *PM->Devices[DeviceID];

  // Check whether global data has been mapped for this device
  bool HasPendingGlobals;
  {
    std::lock_guard<decltype(Device.PendingGlobalsMtx)> LG(
        Device.PendingGlobalsMtx);
    HasPendingGlobals = Device.HasPendingGlobals;
  }
  if (HasPendingGlobals && InitLibrary(Device) != OFFLOAD_SUCCESS) {
    REPORT("Failed to init globals on device %" PRId64 "\n", DeviceID);
    handleTargetOutcome(false, Loc);
    return true;
  }

  return false;
}

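/// Extract the position of the parent entry encoded in the
/// OMP_TGT_MAPTYPE_MEMBER_OF bits (the upper 16 bits of the map type) and
/// convert it from the 1-based form emitted by the compiler to a 0-based
/// index; a result of -1 means the entry has no parent.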
static int32_t getParentIndex(int64_t type) {
  return ((type & OMP_TGT_MAPTYPE_MEMBER_OF) >> 48) - 1;
}

void *targetAllocExplicit(size_t size, int device_num, int kind,
                          const char *name) {
  TIMESCOPE();
  DP("Call to %s for device %d requesting %zu bytes\n", name, device_num, size);

  if (size <= 0) {
    DP("Call to %s with non-positive length\n", name);
    return NULL;
  }

  void *rc = NULL;

  if (device_num == omp_get_initial_device()) {
    rc = malloc(size);
    DP("%s returns host ptr " DPxMOD "\n", name, DPxPTR(rc));
    return rc;
  }

  if (!device_is_ready(device_num)) {
    DP("%s returns NULL ptr\n", name);
    return NULL;
  }

  DeviceTy &Device = *PM->Devices[device_num];
  rc = Device.allocData(size, nullptr, kind);
  DP("%s returns device ptr " DPxMOD "\n", name, DPxPTR(rc));
  return rc;
}

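// targetAllocExplicit() above backs omp_target_alloc and the
// llvm_omp_target_alloc_* entry points (see api.cpp); `kind` is the
// TARGET_ALLOC_* memory kind forwarded to the plugin through
// DeviceTy::allocData, and `name` is only used for the debug messages.
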
/// Call the user-defined mapper function followed by the appropriate
/// targetData* function (targetData{Begin,End,Update}).
int targetDataMapper(ident_t *loc, DeviceTy &Device, void *arg_base, void *arg,
                     int64_t arg_size, int64_t arg_type,
                     map_var_info_t arg_names, void *arg_mapper,
                     AsyncInfoTy &AsyncInfo,
                     TargetDataFuncPtrTy target_data_function) {
  TIMESCOPE_WITH_IDENT(loc);
  DP("Calling the mapper function " DPxMOD "\n", DPxPTR(arg_mapper));

  // The mapper function fills up Components.
  MapperComponentsTy MapperComponents;
  MapperFuncPtrTy MapperFuncPtr = (MapperFuncPtrTy)(arg_mapper);
  (*MapperFuncPtr)((void *)&MapperComponents, arg_base, arg, arg_size, arg_type,
                   arg_names);

  // Construct new arrays for args_base, args, arg_sizes and arg_types
  // using the information in MapperComponents and call the corresponding
  // targetData* function using these new arrays.
  std::vector<void *> MapperArgsBase(MapperComponents.Components.size());
  std::vector<void *> MapperArgs(MapperComponents.Components.size());
  std::vector<int64_t> MapperArgSizes(MapperComponents.Components.size());
  std::vector<int64_t> MapperArgTypes(MapperComponents.Components.size());
  std::vector<void *> MapperArgNames(MapperComponents.Components.size());

  for (unsigned I = 0, E = MapperComponents.Components.size(); I < E; ++I) {
    auto &C = MapperComponents.Components[I];
    MapperArgsBase[I] = C.Base;
    MapperArgs[I] = C.Begin;
    MapperArgSizes[I] = C.Size;
    MapperArgTypes[I] = C.Type;
    MapperArgNames[I] = C.Name;
  }

  int rc = target_data_function(loc, Device, MapperComponents.Components.size(),
                                MapperArgsBase.data(), MapperArgs.data(),
                                MapperArgSizes.data(), MapperArgTypes.data(),
                                MapperArgNames.data(), /*arg_mappers*/ nullptr,
                                AsyncInfo, /*FromMapper=*/true);

  return rc;
}

/// Internal function to do the mapping and transfer the data to the device
int targetDataBegin(ident_t *loc, DeviceTy &Device, int32_t arg_num,
                    void **args_base, void **args, int64_t *arg_sizes,
                    int64_t *arg_types, map_var_info_t *arg_names,
                    void **arg_mappers, AsyncInfoTy &AsyncInfo,
                    bool FromMapper) {
  // process each input.
  for (int32_t i = 0; i < arg_num; ++i) {
    // Ignore private variables and arrays - there is no mapping for them.
    if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) ||
        (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE))
      continue;

    if (arg_mappers && arg_mappers[i]) {
      // Instead of executing the regular path of targetDataBegin, call the
      // targetDataMapper variant which will call targetDataBegin again
      // with new arguments.
      DP("Calling targetDataMapper for the %dth argument\n", i);

      map_var_info_t arg_name = (!arg_names) ? nullptr : arg_names[i];
      int rc = targetDataMapper(loc, Device, args_base[i], args[i],
                                arg_sizes[i], arg_types[i], arg_name,
                                arg_mappers[i], AsyncInfo, targetDataBegin);

      if (rc != OFFLOAD_SUCCESS) {
        REPORT("Call to targetDataBegin via targetDataMapper for custom mapper"
               " failed.\n");
        return OFFLOAD_FAIL;
      }

      // Skip the rest of this function, continue to the next argument.
      continue;
    }

    void *HstPtrBegin = args[i];
    void *HstPtrBase = args_base[i];
    int64_t data_size = arg_sizes[i];
    map_var_info_t HstPtrName = (!arg_names) ? nullptr : arg_names[i];

    // Adjust for proper alignment if this is a combined entry (for structs).
    // Look at the next argument - if that is MEMBER_OF this one, then this one
    // is a combined entry.
    int64_t padding = 0;
    const int next_i = i + 1;
    if (getParentIndex(arg_types[i]) < 0 && next_i < arg_num &&
        getParentIndex(arg_types[next_i]) == i) {
      padding = (int64_t)HstPtrBegin % Alignment;
      if (padding) {
        DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD
           "\n",
           padding, DPxPTR(HstPtrBegin));
        HstPtrBegin = (char *)HstPtrBegin - padding;
        data_size += padding;
      }
    }

    // Address of pointer on the host and device, respectively.
    void *Pointer_HstPtrBegin, *PointerTgtPtrBegin;
    TargetPointerResultTy Pointer_TPR;
    bool IsHostPtr = false;
    bool IsImplicit = arg_types[i] & OMP_TGT_MAPTYPE_IMPLICIT;
    // Force the creation of a device side copy of the data when:
    // a close map modifier was associated with a map that contained a to.
    bool HasCloseModifier = arg_types[i] & OMP_TGT_MAPTYPE_CLOSE;
    bool HasPresentModifier = arg_types[i] & OMP_TGT_MAPTYPE_PRESENT;
    bool HasHoldModifier = arg_types[i] & OMP_TGT_MAPTYPE_OMPX_HOLD;
    // UpdateRef is based on MEMBER_OF instead of TARGET_PARAM because if we
    // have reached this point via __tgt_target_data_begin and not __tgt_target
    // then no argument is marked as TARGET_PARAM ("omp target data map" is not
    // associated with a target region, so there are no target parameters);
    // this may be considered a hack and we could revise the scheme in the
    // future.
    bool UpdateRef =
        !(arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) && !(FromMapper && i == 0);
    if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) {
      DP("Has a pointer entry: \n");
      // Base is address of pointer.
      //
      // Usually, the pointer is already allocated by this time. For example:
      //
      //   #pragma omp target map(s.p[0:N])
      //
      // The map entry for s comes first, and the PTR_AND_OBJ entry comes
      // afterward, so the pointer is already allocated by the time the
      // PTR_AND_OBJ entry is handled below, and PointerTgtPtrBegin is thus
      // non-null. However, "declare target link" can produce a PTR_AND_OBJ
      // entry for a global that might not already be allocated by the time the
      // PTR_AND_OBJ entry is handled below, and so the allocation might fail
      // when HasPresentModifier.
      Pointer_TPR = Device.getTargetPointer(
          HstPtrBase, HstPtrBase, sizeof(void *), /*HstPtrName=*/nullptr,
          /*HasFlagTo=*/false, /*HasFlagAlways=*/false, IsImplicit, UpdateRef,
          HasCloseModifier, HasPresentModifier, HasHoldModifier, AsyncInfo);
      PointerTgtPtrBegin = Pointer_TPR.TargetPointer;
      IsHostPtr = Pointer_TPR.Flags.IsHostPointer;
      if (!PointerTgtPtrBegin) {
        REPORT("Call to getTargetPointer returned null pointer (%s).\n",
               HasPresentModifier ? "'present' map type modifier"
                                  : "device failure or illegal mapping");
        return OFFLOAD_FAIL;
      }
      DP("There are %zu bytes allocated at target address " DPxMOD
         " - is%s new"
         "\n",
         sizeof(void *), DPxPTR(PointerTgtPtrBegin),
         (Pointer_TPR.Flags.IsNewEntry ? "" : " not"));
      Pointer_HstPtrBegin = HstPtrBase;
      // modify current entry.
      HstPtrBase = *(void **)HstPtrBase;
      // No need to update pointee ref count for the first element of the
      // subelement that comes from mapper.
      UpdateRef =
          (!FromMapper || i != 0); // subsequently update ref count of pointee
    }

    const bool HasFlagTo = arg_types[i] & OMP_TGT_MAPTYPE_TO;
    const bool HasFlagAlways = arg_types[i] & OMP_TGT_MAPTYPE_ALWAYS;
    auto TPR = Device.getTargetPointer(
        HstPtrBegin, HstPtrBase, data_size, HstPtrName, HasFlagTo,
        HasFlagAlways, IsImplicit, UpdateRef, HasCloseModifier,
        HasPresentModifier, HasHoldModifier, AsyncInfo);
    void *TgtPtrBegin = TPR.TargetPointer;
    IsHostPtr = TPR.Flags.IsHostPointer;
    // If data_size==0, then the argument could be a zero-length pointer to
    // NULL, so getOrAlloc() returning NULL is not an error.
    if (!TgtPtrBegin && (data_size || HasPresentModifier)) {
      REPORT("Call to getTargetPointer returned null pointer (%s).\n",
             HasPresentModifier ? "'present' map type modifier"
                                : "device failure or illegal mapping");
      return OFFLOAD_FAIL;
    }
    DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
       " - is%s new\n",
       data_size, DPxPTR(TgtPtrBegin), (TPR.Flags.IsNewEntry ? "" : " not"));

    if (arg_types[i] & OMP_TGT_MAPTYPE_RETURN_PARAM) {
      uintptr_t Delta = (uintptr_t)HstPtrBegin - (uintptr_t)HstPtrBase;
      void *TgtPtrBase = (void *)((uintptr_t)TgtPtrBegin - Delta);
      DP("Returning device pointer " DPxMOD "\n", DPxPTR(TgtPtrBase));
      args_base[i] = TgtPtrBase;
    }

    if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ && !IsHostPtr) {
      // Check whether we need to update the pointer on the device
      bool UpdateDevPtr = false;

      uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
      void *ExpectedTgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta);

      Device.ShadowMtx.lock();
      auto Entry = Device.ShadowPtrMap.find(Pointer_HstPtrBegin);
      // If this pointer is not in the map we need to insert it. If the map
      // contains a stale entry, we need to update it (e.g. if the pointee was
      // deallocated and later on is reallocated at another device address). The
      // latter scenario is the subject of LIT test env/base_ptr_ref_count.c. An
      // entry is removed from ShadowPtrMap only when the PTR of a PTR_AND_OBJ
      // pair is deallocated, not when the OBJ is deallocated. In
      // env/base_ptr_ref_count.c the PTR is a global "declare target" pointer,
      // so it stays in the map for the lifetime of the application. When the
      // OBJ is deallocated and later on allocated again (at a different device
      // address), ShadowPtrMap still contains an entry for Pointer_HstPtrBegin
      // which is stale, pointing to the old ExpectedTgtPtrBase of the OBJ.
      if (Entry == Device.ShadowPtrMap.end() ||
          Entry->second.TgtPtrVal != ExpectedTgtPtrBase) {
        // create or update shadow pointers for this entry
        Device.ShadowPtrMap[Pointer_HstPtrBegin] = {
            HstPtrBase, PointerTgtPtrBegin, ExpectedTgtPtrBase};
        Pointer_TPR.Entry->setMayContainAttachedPointers();
        UpdateDevPtr = true;
      }

      if (UpdateDevPtr) {
        std::lock_guard<decltype(*Pointer_TPR.Entry)> LG(*Pointer_TPR.Entry);
        Device.ShadowMtx.unlock();

        DP("Update pointer (" DPxMOD ") -> [" DPxMOD "]\n",
           DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));

        void *&TgtPtrBase = AsyncInfo.getVoidPtrLocation();
        TgtPtrBase = ExpectedTgtPtrBase;

        int Ret = Device.submitData(PointerTgtPtrBegin, &TgtPtrBase,
                                    sizeof(void *), AsyncInfo);
        if (Ret != OFFLOAD_SUCCESS) {
          REPORT("Copying data to device failed.\n");
          return OFFLOAD_FAIL;
        }
        if (Pointer_TPR.Entry->addEventIfNecessary(Device, AsyncInfo) !=
            OFFLOAD_SUCCESS)
          return OFFLOAD_FAIL;
      } else
        Device.ShadowMtx.unlock();
    }
  }

  return OFFLOAD_SUCCESS;
}

namespace {
/// This structure contains information to deallocate a target pointer, i.e. it
/// is used to fix up the shadow map and potentially delete the entry from the
/// mapping table via \p DeviceTy::deallocTgtPtr.
struct PostProcessingInfo {
  /// Host pointer used to look up into the map table
  void *HstPtrBegin;

  /// Size of the data
  int64_t DataSize;

  /// The mapping type (bitfield).
  int64_t ArgType;

  /// The target pointer information.
  TargetPointerResultTy TPR;

  /// Are we expecting to delete this entry or not. Even if set, we might not
  /// delete the entry if another thread reused the entry in the meantime.
  bool DelEntry;

  PostProcessingInfo(void *HstPtr, int64_t Size, int64_t ArgType, bool DelEntry,
                     TargetPointerResultTy TPR)
      : HstPtrBegin(HstPtr), DataSize(Size), ArgType(ArgType), TPR(TPR),
        DelEntry(DelEntry) {}
};

/// Apply \p CB to the shadow map pointer entries in the range \p Begin to
/// \p Begin + \p Size. \p CB is called with a locked shadow pointer map and the
/// passed iterator can be updated. If the callback returns OFFLOAD_FAIL the
/// rest of the map is not checked anymore.
template <typename CBTy>
static void applyToShadowMapEntries(DeviceTy &Device, CBTy CB, void *Begin,
                                    uintptr_t Size,
                                    const TargetPointerResultTy &TPR) {
  // If we have an object that is too small to hold a pointer subobject, no need
  // to do any checking.
  if (Size < sizeof(void *))
    return;

  // If the map entry for the object was never marked as containing attached
  // pointers, no need to do any checking.
  if (!TPR.Entry || !TPR.Entry->getMayContainAttachedPointers())
    return;

  uintptr_t LB = (uintptr_t)Begin;
  uintptr_t UB = LB + Size;
  // Now we are looking into the shadow map so we need to lock it.
  std::lock_guard<decltype(Device.ShadowMtx)> LG(Device.ShadowMtx);
  for (ShadowPtrListTy::iterator Itr = Device.ShadowPtrMap.begin();
       Itr != Device.ShadowPtrMap.end();) {
    uintptr_t ShadowHstPtrAddr = (uintptr_t)Itr->first;

    // An STL map is sorted on its keys; use this property
    // to quickly determine when to break out of the loop.
    if (ShadowHstPtrAddr < LB) {
      ++Itr;
      continue;
    }
    if (ShadowHstPtrAddr >= UB)
      break;

    if (CB(Itr) == OFFLOAD_FAIL)
      break;
  }
}

} // namespace

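// Illustration (not part of the runtime itself): for a construct such as
//
//   #pragma omp target enter data map(to: A[0:N])
//
// the compiler emits a call to __tgt_target_data_begin_mapper, which reaches
// targetDataBegin() above with one argument whose map type has
// OMP_TGT_MAPTYPE_TO set; a matching "omp target exit data map(from: A[0:N])"
// reaches targetDataEnd() below, which copies the data back and drops the
// reference added on entry.
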
/// Internal function to undo the mapping and retrieve the data from the device.
int targetDataEnd(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
                  void **ArgBases, void **Args, int64_t *ArgSizes,
                  int64_t *ArgTypes, map_var_info_t *ArgNames,
                  void **ArgMappers, AsyncInfoTy &AsyncInfo, bool FromMapper) {
  int Ret;
  std::vector<PostProcessingInfo> PostProcessingPtrs;
  void *FromMapperBase = nullptr;
  // process each input.
  for (int32_t I = ArgNum - 1; I >= 0; --I) {
    // Ignore private variables and arrays - there is no mapping for them.
    // Also, ignore the use_device_ptr directive, it has no effect here.
    if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
        (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
      continue;

    if (ArgMappers && ArgMappers[I]) {
      // Instead of executing the regular path of targetDataEnd, call the
      // targetDataMapper variant which will call targetDataEnd again
      // with new arguments.
      DP("Calling targetDataMapper for the %dth argument\n", I);

      map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
      Ret = targetDataMapper(loc, Device, ArgBases[I], Args[I], ArgSizes[I],
                             ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
                             targetDataEnd);

      if (Ret != OFFLOAD_SUCCESS) {
        REPORT("Call to targetDataEnd via targetDataMapper for custom mapper"
               " failed.\n");
        return OFFLOAD_FAIL;
      }

      // Skip the rest of this function, continue to the next argument.
      continue;
    }

    void *HstPtrBegin = Args[I];
    int64_t DataSize = ArgSizes[I];
    // Adjust for proper alignment if this is a combined entry (for structs).
    // Look at the next argument - if that is MEMBER_OF this one, then this one
    // is a combined entry.
    const int NextI = I + 1;
    if (getParentIndex(ArgTypes[I]) < 0 && NextI < ArgNum &&
        getParentIndex(ArgTypes[NextI]) == I) {
      int64_t Padding = (int64_t)HstPtrBegin % Alignment;
      if (Padding) {
        DP("Using a Padding of %" PRId64 " bytes for begin address " DPxMOD
           "\n",
           Padding, DPxPTR(HstPtrBegin));
        HstPtrBegin = (char *)HstPtrBegin - Padding;
        DataSize += Padding;
      }
    }

    bool IsLast, IsHostPtr;
    bool IsImplicit = ArgTypes[I] & OMP_TGT_MAPTYPE_IMPLICIT;
    bool UpdateRef = (!(ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) ||
                      (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) &&
                     !(FromMapper && I == 0);
    bool ForceDelete = ArgTypes[I] & OMP_TGT_MAPTYPE_DELETE;
    bool HasPresentModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_PRESENT;
    bool HasHoldModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_OMPX_HOLD;

    // If PTR_AND_OBJ, HstPtrBegin is address of pointee
    TargetPointerResultTy TPR = Device.getTgtPtrBegin(
        HstPtrBegin, DataSize, IsLast, UpdateRef, HasHoldModifier, IsHostPtr,
        !IsImplicit, ForceDelete);
    void *TgtPtrBegin = TPR.TargetPointer;
    if (!TgtPtrBegin && (DataSize || HasPresentModifier)) {
      DP("Mapping does not exist (%s)\n",
         (HasPresentModifier ? "'present' map type modifier" : "ignored"));
      if (HasPresentModifier) {
        // OpenMP 5.1, sec. 2.21.7.1 "map Clause", p. 350 L10-13:
        // "If a map clause appears on a target, target data, target enter data
        // or target exit data construct with a present map-type-modifier then
        // on entry to the region if the corresponding list item does not appear
        // in the device data environment then an error occurs and the program
        // terminates."
        //
        // This should be an error upon entering an "omp target exit data". It
        // should not be an error upon exiting an "omp target data" or "omp
        // target". For "omp target data", Clang thus doesn't include present
        // modifiers for end calls. For "omp target", we have not found a valid
        // OpenMP program for which the error matters: it appears that, if a
        // program can guarantee that data is present at the beginning of an
        // "omp target" region so that there's no error there, that data is also
        // guaranteed to be present at the end.
        MESSAGE("device mapping required by 'present' map type modifier does "
                "not exist for host address " DPxMOD " (%" PRId64 " bytes)",
                DPxPTR(HstPtrBegin), DataSize);
        return OFFLOAD_FAIL;
      }
    } else {
      DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
         " - is%s last\n",
         DataSize, DPxPTR(TgtPtrBegin), (IsLast ? "" : " not"));
    }

    // OpenMP 5.1, sec. 2.21.7.1 "map Clause", p. 351 L14-16:
    // "If the map clause appears on a target, target data, or target exit data
    // construct and a corresponding list item of the original list item is not
    // present in the device data environment on exit from the region then the
    // list item is ignored."
    if (!TgtPtrBegin)
      continue;

    bool DelEntry = IsLast;

    // If this is the last element from the mapper (for end transfers the
    // arguments come in reverse order), do not remove the partial entry; the
    // parent struct still exists.
    if ((ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
        !(ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) {
      DelEntry = false; // protect parent struct from being deallocated
    }

    if ((ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) || DelEntry) {
      // Move data back to the host
      if (ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) {
        bool Always = ArgTypes[I] & OMP_TGT_MAPTYPE_ALWAYS;
        if ((Always || IsLast) && !IsHostPtr) {
          DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
             DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));

          std::lock_guard<decltype(*TPR.Entry)> LG(*TPR.Entry);
          // Wait for any previous transfer if an event is present.
          if (void *Event = TPR.Entry->getEvent()) {
            if (Device.waitEvent(Event, AsyncInfo) != OFFLOAD_SUCCESS) {
              REPORT("Failed to wait for event " DPxMOD ".\n", DPxPTR(Event));
              return OFFLOAD_FAIL;
            }
          }

          Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, DataSize,
                                    AsyncInfo);
          if (Ret != OFFLOAD_SUCCESS) {
            REPORT("Copying data from device failed.\n");
            return OFFLOAD_FAIL;
          }

          // As we are expecting to delete the entry the d2h copy might race
          // with another one that also tries to delete the entry. This happens
          // as the entry can be reused and the reuse might happen after the
          // copy-back was issued but before it completed. Since the reuse might
          // also copy-back a value we would race.
          if (IsLast) {
            if (TPR.Entry->addEventIfNecessary(Device, AsyncInfo) !=
                OFFLOAD_SUCCESS)
              return OFFLOAD_FAIL;
          }
        }
      }
      if (DelEntry && FromMapper && I == 0) {
        DelEntry = false;
        FromMapperBase = HstPtrBegin;
      }

      // Add pointer to the buffer for post-synchronize processing.
      PostProcessingPtrs.emplace_back(HstPtrBegin, DataSize, ArgTypes[I],
                                      DelEntry && !IsHostPtr, TPR);
    }
  }

  // TODO: We should not synchronize here but pass the AsyncInfo object to the
  //       allocate/deallocate device APIs.
  //
  // We need to synchronize before deallocating data.
  Ret = AsyncInfo.synchronize();
  if (Ret != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  // Deallocate target pointer
  for (PostProcessingInfo &Info : PostProcessingPtrs) {
    // If we marked the entry to be deleted we need to verify no other thread
    // reused it by now. If deletion is still supposed to happen by this thread
    // LR will be set and exclusive access to the HDTT map will avoid another
    // thread reusing the entry now. Note that we do not request (exclusive)
    // access to the HDTT map if Info.DelEntry is not set.
    LookupResult LR;
    DeviceTy::HDTTMapAccessorTy HDTTMap =
        Device.HostDataToTargetMap.getExclusiveAccessor(!Info.DelEntry);

    if (Info.DelEntry) {
      LR = Device.lookupMapping(HDTTMap, Info.HstPtrBegin, Info.DataSize);
      if (LR.Entry->getTotalRefCount() != 0 ||
          LR.Entry->getDeleteThreadId() != std::this_thread::get_id()) {
        // The thread is not in charge of deletion anymore. Give up access to
        // the HDTT map and unset the deletion flag.
        HDTTMap.destroy();
        Info.DelEntry = false;
      }
    }

    // If we copied back to the host a struct/array containing pointers, we
    // need to restore the original host pointer values from their shadow
    // copies. If the struct is going to be deallocated, remove any remaining
    // shadow pointer entries for this struct.
    auto CB = [&](ShadowPtrListTy::iterator &Itr) {
      // If we copied the struct to the host, we need to restore the pointer.
      if (Info.ArgType & OMP_TGT_MAPTYPE_FROM) {
        void **ShadowHstPtrAddr = (void **)Itr->first;
        *ShadowHstPtrAddr = Itr->second.HstPtrVal;
        DP("Restoring original host pointer value " DPxMOD " for host "
           "pointer " DPxMOD "\n",
           DPxPTR(Itr->second.HstPtrVal), DPxPTR(ShadowHstPtrAddr));
      }
      // If the struct is to be deallocated, remove the shadow entry.
      if (Info.DelEntry) {
        DP("Removing shadow pointer " DPxMOD "\n", DPxPTR((void **)Itr->first));
        Itr = Device.ShadowPtrMap.erase(Itr);
      } else {
        ++Itr;
      }
      return OFFLOAD_SUCCESS;
    };
    applyToShadowMapEntries(Device, CB, Info.HstPtrBegin, Info.DataSize,
                            Info.TPR);

    // If we are deleting the entry the DataMapMtx is locked and we own the
    // entry.
    if (Info.DelEntry) {
      if (!FromMapperBase || FromMapperBase != Info.HstPtrBegin)
        Ret = Device.deallocTgtPtr(HDTTMap, LR, Info.DataSize);

      if (Ret != OFFLOAD_SUCCESS) {
        REPORT("Deallocating data from device failed.\n");
        break;
      }
    }
  }

  return Ret;
}

static int targetDataContiguous(ident_t *loc, DeviceTy &Device, void *ArgsBase,
                                void *HstPtrBegin, int64_t ArgSize,
                                int64_t ArgType, AsyncInfoTy &AsyncInfo) {
  TIMESCOPE_WITH_IDENT(loc);
  bool IsLast, IsHostPtr;
  TargetPointerResultTy TPR = Device.getTgtPtrBegin(
      HstPtrBegin, ArgSize, IsLast, /*UpdateRefCount=*/false,
      /*UseHoldRefCount=*/false, IsHostPtr, /*MustContain=*/true);
  void *TgtPtrBegin = TPR.TargetPointer;
  if (!TgtPtrBegin) {
    DP("hst data:" DPxMOD " not found, becomes a noop\n", DPxPTR(HstPtrBegin));
    if (ArgType & OMP_TGT_MAPTYPE_PRESENT) {
      MESSAGE("device mapping required by 'present' motion modifier does not "
              "exist for host address " DPxMOD " (%" PRId64 " bytes)",
              DPxPTR(HstPtrBegin), ArgSize);
      return OFFLOAD_FAIL;
    }
    return OFFLOAD_SUCCESS;
  }

  if (IsHostPtr) {
    DP("hst data:" DPxMOD " unified and shared, becomes a noop\n",
       DPxPTR(HstPtrBegin));
    return OFFLOAD_SUCCESS;
  }

  if (ArgType & OMP_TGT_MAPTYPE_FROM) {
    DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
       ArgSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
    int Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, ArgSize, AsyncInfo);
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Copying data from device failed.\n");
      return OFFLOAD_FAIL;
    }

    auto CB = [&](ShadowPtrListTy::iterator &Itr) {
      void **ShadowHstPtrAddr = (void **)Itr->first;
      // Wait for device-to-host memcopies for whole struct to complete,
      // before restoring the correct host pointer.
      if (AsyncInfo.synchronize() != OFFLOAD_SUCCESS)
        return OFFLOAD_FAIL;
      *ShadowHstPtrAddr = Itr->second.HstPtrVal;
      DP("Restoring original host pointer value " DPxMOD
         " for host pointer " DPxMOD "\n",
         DPxPTR(Itr->second.HstPtrVal), DPxPTR(ShadowHstPtrAddr));
      ++Itr;
      return OFFLOAD_SUCCESS;
    };
    applyToShadowMapEntries(Device, CB, HstPtrBegin, ArgSize, TPR);
  }

  if (ArgType & OMP_TGT_MAPTYPE_TO) {
    DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
       ArgSize, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
    int Ret = Device.submitData(TgtPtrBegin, HstPtrBegin, ArgSize, AsyncInfo);
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Copying data to device failed.\n");
      return OFFLOAD_FAIL;
    }

    auto CB = [&](ShadowPtrListTy::iterator &Itr) {
      DP("Restoring original target pointer value " DPxMOD " for target "
         "pointer " DPxMOD "\n",
         DPxPTR(Itr->second.TgtPtrVal), DPxPTR(Itr->second.TgtPtrAddr));
      Ret = Device.submitData(Itr->second.TgtPtrAddr, &Itr->second.TgtPtrVal,
                              sizeof(void *), AsyncInfo);
      if (Ret != OFFLOAD_SUCCESS)
        REPORT("Copying data to device failed.\n");
      ++Itr;
      return Ret;
    };
    applyToShadowMapEntries(Device, CB, HstPtrBegin, ArgSize, TPR);
  }
  return OFFLOAD_SUCCESS;
}

static int targetDataNonContiguous(ident_t *loc, DeviceTy &Device,
                                   void *ArgsBase,
                                   __tgt_target_non_contig *NonContig,
                                   uint64_t Size, int64_t ArgType,
                                   int CurrentDim, int DimSize, uint64_t Offset,
                                   AsyncInfoTy &AsyncInfo) {
  TIMESCOPE_WITH_IDENT(loc);
  int Ret = OFFLOAD_SUCCESS;
  if (CurrentDim < DimSize) {
    for (unsigned int I = 0; I < NonContig[CurrentDim].Count; ++I) {
      uint64_t CurOffset =
          (NonContig[CurrentDim].Offset + I) * NonContig[CurrentDim].Stride;
      // we only need to transfer the first element for the last dimension
      // since we've already got a contiguous piece.
      if (CurrentDim != DimSize - 1 || I == 0) {
        Ret = targetDataNonContiguous(loc, Device, ArgsBase, NonContig, Size,
                                      ArgType, CurrentDim + 1, DimSize,
                                      Offset + CurOffset, AsyncInfo);
        // Stop the whole process if any contiguous piece returns anything
        // other than OFFLOAD_SUCCESS.
        if (Ret != OFFLOAD_SUCCESS)
          return Ret;
      }
    }
  } else {
    char *Ptr = (char *)ArgsBase + Offset;
    DP("Transfer of non-contiguous : host ptr " DPxMOD " offset %" PRIu64
       " len %" PRIu64 "\n",
       DPxPTR(Ptr), Offset, Size);
    Ret = targetDataContiguous(loc, Device, ArgsBase, Ptr, Size, ArgType,
                               AsyncInfo);
  }
  return Ret;
}

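/// Count how many dimensions can be folded into their enclosing dimension
/// because together they already form one contiguous region, i.e. dimensions
/// where Count * Stride equals the Stride of the previous dimension; the
/// caller then recurses over correspondingly fewer dimensions.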
static int getNonContigMergedDimension(__tgt_target_non_contig *NonContig,
                                       int32_t DimSize) {
  int RemovedDim = 0;
  for (int I = DimSize - 1; I > 0; --I) {
    if (NonContig[I].Count * NonContig[I].Stride == NonContig[I - 1].Stride)
      RemovedDim++;
  }
  return RemovedDim;
}

/// Internal function to pass data to/from the target.
int targetDataUpdate(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
                     void **ArgsBase, void **Args, int64_t *ArgSizes,
                     int64_t *ArgTypes, map_var_info_t *ArgNames,
                     void **ArgMappers, AsyncInfoTy &AsyncInfo, bool) {
  // process each input.
  for (int32_t I = 0; I < ArgNum; ++I) {
    if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
        (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
      continue;

    if (ArgMappers && ArgMappers[I]) {
      // Instead of executing the regular path of targetDataUpdate, call the
      // targetDataMapper variant which will call targetDataUpdate again
      // with new arguments.
      DP("Calling targetDataMapper for the %dth argument\n", I);

      map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
      int Ret = targetDataMapper(loc, Device, ArgsBase[I], Args[I], ArgSizes[I],
                                 ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
                                 targetDataUpdate);

      if (Ret != OFFLOAD_SUCCESS) {
        REPORT("Call to targetDataUpdate via targetDataMapper for custom mapper"
               " failed.\n");
        return OFFLOAD_FAIL;
      }

      // Skip the rest of this function, continue to the next argument.
      continue;
    }

    int Ret = OFFLOAD_SUCCESS;

    if (ArgTypes[I] & OMP_TGT_MAPTYPE_NON_CONTIG) {
      __tgt_target_non_contig *NonContig = (__tgt_target_non_contig *)Args[I];
      int32_t DimSize = ArgSizes[I];
      uint64_t Size =
          NonContig[DimSize - 1].Count * NonContig[DimSize - 1].Stride;
      int32_t MergedDim = getNonContigMergedDimension(NonContig, DimSize);
      Ret = targetDataNonContiguous(
          loc, Device, ArgsBase[I], NonContig, Size, ArgTypes[I],
          /*current_dim=*/0, DimSize - MergedDim, /*offset=*/0, AsyncInfo);
    } else {
      Ret = targetDataContiguous(loc, Device, ArgsBase[I], Args[I], ArgSizes[I],
                                 ArgTypes[I], AsyncInfo);
    }
    if (Ret == OFFLOAD_FAIL)
      return OFFLOAD_FAIL;
  }
  return OFFLOAD_SUCCESS;
}

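// A mapping that carries PTR_AND_OBJ, LITERAL and IMPLICIT all at once is
// treated by the runtime as the mapping of a pointer captured by a lambda; see
// the lambda handling in processDataBefore() below.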
static const unsigned LambdaMapping = OMP_TGT_MAPTYPE_PTR_AND_OBJ |
                                      OMP_TGT_MAPTYPE_LITERAL |
                                      OMP_TGT_MAPTYPE_IMPLICIT;
static bool isLambdaMapping(int64_t Mapping) {
  return (Mapping & LambdaMapping) == LambdaMapping;
}

namespace {
/// Find the table information in the map or look it up in the translation
/// tables.
TableMap *getTableMap(void *HostPtr) {
  std::lock_guard<std::mutex> TblMapLock(PM->TblMapMtx);
  HostPtrToTableMapTy::iterator TableMapIt =
      PM->HostPtrToTableMap.find(HostPtr);

  if (TableMapIt != PM->HostPtrToTableMap.end())
    return &TableMapIt->second;

  // We don't have a map. So search all the registered libraries.
  TableMap *TM = nullptr;
  std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
  for (HostEntriesBeginToTransTableTy::iterator Itr =
           PM->HostEntriesBeginToTransTable.begin();
       Itr != PM->HostEntriesBeginToTransTable.end(); ++Itr) {
    // get the translation table (which contains all the good info).
    TranslationTable *TransTable = &Itr->second;
    // iterate over all the host table entries to see if we can locate the
    // host_ptr.
    __tgt_offload_entry *Cur = TransTable->HostTable.EntriesBegin;
    for (uint32_t I = 0; Cur < TransTable->HostTable.EntriesEnd; ++Cur, ++I) {
      if (Cur->addr != HostPtr)
        continue;
      // we got a match, now fill the HostPtrToTableMap so that we
      // may avoid this search next time.
      TM = &(PM->HostPtrToTableMap)[HostPtr];
      TM->Table = TransTable;
      TM->Index = I;
      return TM;
    }
  }

  return nullptr;
}

/// Get loop trip count
/// FIXME: This function will not work right if calling
/// __kmpc_push_target_tripcount_mapper in one thread but doing offloading in
/// another thread, which might occur when we call task yield.
uint64_t getLoopTripCount(int64_t DeviceId) {
  DeviceTy &Device = *PM->Devices[DeviceId];
  uint64_t LoopTripCount = 0;

  {
    std::lock_guard<std::mutex> TblMapLock(PM->TblMapMtx);
    auto I = Device.LoopTripCnt.find(__kmpc_global_thread_num(NULL));
    if (I != Device.LoopTripCnt.end()) {
      LoopTripCount = I->second;
      Device.LoopTripCnt.erase(I);
      DP("loop trip count is %" PRIu64 ".\n", LoopTripCount);
    }
  }

  return LoopTripCount;
}

/// A class that manages private arguments in a target region.
class PrivateArgumentManagerTy {
  /// A data structure for the information of first-private arguments. We can
  /// use this information to optimize data transfer by packing all
  /// first-private arguments and transfer them all at once.
  struct FirstPrivateArgInfoTy {
    /// The index of the element in \p TgtArgs corresponding to the argument
    const int Index;
    /// Host pointer begin
    const char *HstPtrBegin;
    /// Host pointer end
    const char *HstPtrEnd;
    /// Aligned size
    const int64_t AlignedSize;
    /// Host pointer name
    const map_var_info_t HstPtrName = nullptr;

    FirstPrivateArgInfoTy(int Index, const void *HstPtr, int64_t Size,
                          const map_var_info_t HstPtrName = nullptr)
        : Index(Index), HstPtrBegin(reinterpret_cast<const char *>(HstPtr)),
          HstPtrEnd(HstPtrBegin + Size), AlignedSize(Size + Size % Alignment),
          HstPtrName(HstPtrName) {}
  };

  /// A vector of target pointers for all private arguments
  std::vector<void *> TgtPtrs;

  /// A vector of information of all first-private arguments to be packed
  std::vector<FirstPrivateArgInfoTy> FirstPrivateArgInfo;
  /// Host buffer for all arguments to be packed
  std::vector<char> FirstPrivateArgBuffer;
  /// The total size of all arguments to be packed
  int64_t FirstPrivateArgSize = 0;

  /// A reference to the \p DeviceTy object
  DeviceTy &Device;
  /// A pointer to a \p AsyncInfoTy object
  AsyncInfoTy &AsyncInfo;

  // TODO: What would be the best value here? Should we make it configurable?
  // If the size is larger than this threshold, we will allocate and transfer it
  // immediately instead of packing it.
  static constexpr const int64_t FirstPrivateArgSizeThreshold = 1024;

public:
  /// Constructor
  PrivateArgumentManagerTy(DeviceTy &Dev, AsyncInfoTy &AsyncInfo)
      : Device(Dev), AsyncInfo(AsyncInfo) {}

  /// Add a private argument
  int addArg(void *HstPtr, int64_t ArgSize, int64_t ArgOffset,
             bool IsFirstPrivate, void *&TgtPtr, int TgtArgsIndex,
             const map_var_info_t HstPtrName = nullptr,
             const bool AllocImmediately = false) {
    // If the argument is not first-private, or its size is greater than a
    // predefined threshold, we will allocate memory and issue the transfer
    // immediately.
    if (ArgSize > FirstPrivateArgSizeThreshold || !IsFirstPrivate ||
        AllocImmediately) {
      TgtPtr = Device.allocData(ArgSize, HstPtr);
      if (!TgtPtr) {
        DP("Data allocation for %sprivate array " DPxMOD " failed.\n",
           (IsFirstPrivate ? "first-" : ""), DPxPTR(HstPtr));
        return OFFLOAD_FAIL;
      }
#ifdef OMPTARGET_DEBUG
      void *TgtPtrBase = (void *)((intptr_t)TgtPtr + ArgOffset);
      DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD
         " for %sprivate array " DPxMOD " - pushing target argument " DPxMOD
         "\n",
         ArgSize, DPxPTR(TgtPtr), (IsFirstPrivate ? "first-" : ""),
         DPxPTR(HstPtr), DPxPTR(TgtPtrBase));
#endif
      // If first-private, copy data from host
      if (IsFirstPrivate) {
        DP("Submitting firstprivate data to the device.\n");
        int Ret = Device.submitData(TgtPtr, HstPtr, ArgSize, AsyncInfo);
        if (Ret != OFFLOAD_SUCCESS) {
          DP("Copying data to device failed.\n");
          return OFFLOAD_FAIL;
        }
      }
      TgtPtrs.push_back(TgtPtr);
    } else {
      DP("Firstprivate array " DPxMOD " of size %" PRId64 " will be packed\n",
         DPxPTR(HstPtr), ArgSize);
      // When we reach this point, the argument must meet all the following
      // requirements:
      // 1. Its size does not exceed the threshold (see the comment for
      //    FirstPrivateArgSizeThreshold);
      // 2. It must be first-private (needs to be mapped to the target device).
      // We pack all arguments of this kind and transfer them all at once to
      // reduce the number of data transfers. We do not take non-first-private
      // arguments, i.e. private arguments that don't need to be mapped to the
      // target device, into account because data allocation for them can be
      // very efficient with the memory manager.

      // Placeholder value
      TgtPtr = nullptr;
      FirstPrivateArgInfo.emplace_back(TgtArgsIndex, HstPtr, ArgSize,
                                       HstPtrName);
      FirstPrivateArgSize += FirstPrivateArgInfo.back().AlignedSize;
    }

    return OFFLOAD_SUCCESS;
  }

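  // Illustration (not part of the runtime itself): with the default 1024-byte
  // threshold, three small firstprivate arguments of 8, 24 and 16 bytes are
  // only recorded in FirstPrivateArgInfo by addArg(); packAndTransfer() below
  // then copies them into one 48-byte host buffer, performs a single
  // allocData()/submitData() for that buffer, and patches the corresponding
  // TgtArgs slots with offsets into the one device allocation.
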
  /// Pack first-private arguments, replace placeholder pointers in \p TgtArgs,
  /// and start the transfer.
  int packAndTransfer(std::vector<void *> &TgtArgs) {
    if (!FirstPrivateArgInfo.empty()) {
      assert(FirstPrivateArgSize != 0 &&
             "FirstPrivateArgSize is 0 but FirstPrivateArgInfo is empty");
      FirstPrivateArgBuffer.resize(FirstPrivateArgSize, 0);
      auto Itr = FirstPrivateArgBuffer.begin();
      // Copy all host data to this buffer
      for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) {
        std::copy(Info.HstPtrBegin, Info.HstPtrEnd, Itr);
        Itr = std::next(Itr, Info.AlignedSize);
      }
      // Allocate target memory
      void *TgtPtr =
          Device.allocData(FirstPrivateArgSize, FirstPrivateArgBuffer.data());
      if (TgtPtr == nullptr) {
        DP("Failed to allocate target memory for private arguments.\n");
        return OFFLOAD_FAIL;
      }
      TgtPtrs.push_back(TgtPtr);
      DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD "\n",
         FirstPrivateArgSize, DPxPTR(TgtPtr));
      // Transfer data to target device
      int Ret = Device.submitData(TgtPtr, FirstPrivateArgBuffer.data(),
                                  FirstPrivateArgSize, AsyncInfo);
      if (Ret != OFFLOAD_SUCCESS) {
        DP("Failed to submit data of private arguments.\n");
        return OFFLOAD_FAIL;
      }
      // Fill in all placeholder pointers
      auto TP = reinterpret_cast<uintptr_t>(TgtPtr);
      for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) {
        void *&Ptr = TgtArgs[Info.Index];
        assert(Ptr == nullptr && "Target pointer is already set by mistake");
        Ptr = reinterpret_cast<void *>(TP);
        TP += Info.AlignedSize;
        DP("Firstprivate array " DPxMOD " of size %" PRId64 " mapped to " DPxMOD
           "\n",
           DPxPTR(Info.HstPtrBegin), Info.HstPtrEnd - Info.HstPtrBegin,
           DPxPTR(Ptr));
      }
    }

    return OFFLOAD_SUCCESS;
  }

  /// Free all target memory allocated for private arguments
  int free() {
    for (void *P : TgtPtrs) {
      int Ret = Device.deleteData(P);
      if (Ret != OFFLOAD_SUCCESS) {
        DP("Deallocation of (first-)private arrays failed.\n");
        return OFFLOAD_FAIL;
      }
    }

    TgtPtrs.clear();

    return OFFLOAD_SUCCESS;
  }
};

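// processDataBefore() and processDataAfter() below are the two halves of the
// argument handling performed by target() (the common implementation behind
// the __tgt_target*_mapper entry points): the former maps the arguments and
// materializes TgtArgs/TgtOffsets before the kernel launch, the latter copies
// results back and releases the (first-)private allocations afterwards.
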
/// Process data before launching the kernel, including calling targetDataBegin
/// to map and transfer data to the target device, and transferring
/// (first-)private variables.
static int processDataBefore(ident_t *loc, int64_t DeviceId, void *HostPtr,
                             int32_t ArgNum, void **ArgBases, void **Args,
                             int64_t *ArgSizes, int64_t *ArgTypes,
                             map_var_info_t *ArgNames, void **ArgMappers,
                             std::vector<void *> &TgtArgs,
                             std::vector<ptrdiff_t> &TgtOffsets,
                             PrivateArgumentManagerTy &PrivateArgumentManager,
                             AsyncInfoTy &AsyncInfo) {
  TIMESCOPE_WITH_NAME_AND_IDENT("mappingBeforeTargetRegion", loc);
  DeviceTy &Device = *PM->Devices[DeviceId];
  int Ret = targetDataBegin(loc, Device, ArgNum, ArgBases, Args, ArgSizes,
                            ArgTypes, ArgNames, ArgMappers, AsyncInfo);
  if (Ret != OFFLOAD_SUCCESS) {
    REPORT("Call to targetDataBegin failed, abort target.\n");
    return OFFLOAD_FAIL;
  }

  // List of (first-)private arrays allocated for this target region
  std::vector<int> TgtArgsPositions(ArgNum, -1);

  for (int32_t I = 0; I < ArgNum; ++I) {
    if (!(ArgTypes[I] & OMP_TGT_MAPTYPE_TARGET_PARAM)) {
      // This is not a target parameter, do not push it into TgtArgs.
      // Check for lambda mapping.
      if (isLambdaMapping(ArgTypes[I])) {
        assert((ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
               "PTR_AND_OBJ must be also MEMBER_OF.");
        unsigned Idx = getParentIndex(ArgTypes[I]);
        int TgtIdx = TgtArgsPositions[Idx];
        assert(TgtIdx != -1 && "Base address must be translated already.");
        // The parent lambda must be processed already and it must be the last
        // in TgtArgs and TgtOffsets arrays.
        void *HstPtrVal = Args[I];
        void *HstPtrBegin = ArgBases[I];
        void *HstPtrBase = Args[Idx];
        bool IsLast, IsHostPtr; // IsLast is unused.
        void *TgtPtrBase =
            (void *)((intptr_t)TgtArgs[TgtIdx] + TgtOffsets[TgtIdx]);
        DP("Parent lambda base " DPxMOD "\n", DPxPTR(TgtPtrBase));
        uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
        void *TgtPtrBegin = (void *)((uintptr_t)TgtPtrBase + Delta);
        void *&PointerTgtPtrBegin = AsyncInfo.getVoidPtrLocation();
        TargetPointerResultTy TPR = Device.getTgtPtrBegin(
            HstPtrVal, ArgSizes[I], IsLast, /*UpdateRefCount=*/false,
            /*UseHoldRefCount=*/false, IsHostPtr);
        PointerTgtPtrBegin = TPR.TargetPointer;
        if (!PointerTgtPtrBegin) {
          DP("No lambda captured variable mapped (" DPxMOD ") - ignored\n",
             DPxPTR(HstPtrVal));
          continue;
        }
        if (IsHostPtr) {
          DP("Unified memory is active, no need to map lambda captured "
             "variable (" DPxMOD ")\n",
             DPxPTR(HstPtrVal));
          continue;
        }
        DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n",
           DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));
        Ret = Device.submitData(TgtPtrBegin, &PointerTgtPtrBegin,
                                sizeof(void *), AsyncInfo);
        if (Ret != OFFLOAD_SUCCESS) {
          REPORT("Copying data to device failed.\n");
          return OFFLOAD_FAIL;
        }
      }
      continue;
    }
    void *HstPtrBegin = Args[I];
    void *HstPtrBase = ArgBases[I];
    void *TgtPtrBegin;
    map_var_info_t HstPtrName = (!ArgNames) ? nullptr : ArgNames[I];
    ptrdiff_t TgtBaseOffset;
    bool IsLast, IsHostPtr; // unused.
    TargetPointerResultTy TPR;
    if (ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) {
      DP("Forwarding first-private value " DPxMOD " to the target construct\n",
         DPxPTR(HstPtrBase));
      TgtPtrBegin = HstPtrBase;
      TgtBaseOffset = 0;
    } else if (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE) {
      TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
      const bool IsFirstPrivate = (ArgTypes[I] & OMP_TGT_MAPTYPE_TO);
      // If there is a next argument and it depends on the current one, we need
      // to allocate the private memory immediately. If this is not the case,
      // then the argument can be marked for optimization and packed with the
      // other privates.
      const bool AllocImmediately =
          (I < ArgNum - 1 && (ArgTypes[I + 1] & OMP_TGT_MAPTYPE_MEMBER_OF));
      Ret = PrivateArgumentManager.addArg(
          HstPtrBegin, ArgSizes[I], TgtBaseOffset, IsFirstPrivate, TgtPtrBegin,
          TgtArgs.size(), HstPtrName, AllocImmediately);
      if (Ret != OFFLOAD_SUCCESS) {
        REPORT("Failed to process %sprivate argument " DPxMOD "\n",
               (IsFirstPrivate ? "first-" : ""), DPxPTR(HstPtrBegin));
        return OFFLOAD_FAIL;
      }
    } else {
      if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)
        HstPtrBase = *reinterpret_cast<void **>(HstPtrBase);
      TPR = Device.getTgtPtrBegin(HstPtrBegin, ArgSizes[I], IsLast,
                                  /*UpdateRefCount=*/false,
                                  /*UseHoldRefCount=*/false, IsHostPtr);
      TgtPtrBegin = TPR.TargetPointer;
      TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
#ifdef OMPTARGET_DEBUG
      void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset);
      DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD "\n",
         DPxPTR(TgtPtrBase), DPxPTR(HstPtrBegin));
#endif
    }
    TgtArgsPositions[I] = TgtArgs.size();
    TgtArgs.push_back(TgtPtrBegin);
    TgtOffsets.push_back(TgtBaseOffset);
  }

  assert(TgtArgs.size() == TgtOffsets.size() &&
         "Size mismatch in arguments and offsets");

  // Pack and transfer first-private arguments
  Ret = PrivateArgumentManager.packAndTransfer(TgtArgs);
  if (Ret != OFFLOAD_SUCCESS) {
    DP("Failed to pack and transfer first private arguments\n");
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

/// Process data after launching the kernel, including transferring data back
/// to host if needed and deallocating target memory of (first-)private
/// variables.
static int processDataAfter(ident_t *loc, int64_t DeviceId, void *HostPtr,
                            int32_t ArgNum, void **ArgBases, void **Args,
                            int64_t *ArgSizes, int64_t *ArgTypes,
                            map_var_info_t *ArgNames, void **ArgMappers,
                            PrivateArgumentManagerTy &PrivateArgumentManager,
                            AsyncInfoTy &AsyncInfo) {
  TIMESCOPE_WITH_NAME_AND_IDENT("mappingAfterTargetRegion", loc);
  DeviceTy &Device = *PM->Devices[DeviceId];

  // Move data from device.
  int Ret = targetDataEnd(loc, Device, ArgNum, ArgBases, Args, ArgSizes,
                          ArgTypes, ArgNames, ArgMappers, AsyncInfo);
  if (Ret != OFFLOAD_SUCCESS) {
    REPORT("Call to targetDataEnd failed, abort target.\n");
    return OFFLOAD_FAIL;
  }

  // Free target memory for private arguments
  Ret = PrivateArgumentManager.free();
  if (Ret != OFFLOAD_SUCCESS) {
    REPORT("Failed to deallocate target memory for private args\n");
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}
} // namespace

/// Performs the same actions as data_begin in case arg_num is
/// non-zero and initiates run of the offloaded region on the target platform;
/// if arg_num is non-zero after the region execution is done it also
/// performs the same action as data_update and data_end above. This function
/// returns 0 if it was able to transfer the execution to a target and an
/// integer different from zero otherwise.
int target(ident_t *loc, DeviceTy &Device, void *HostPtr, int32_t ArgNum,
           void **ArgBases, void **Args, int64_t *ArgSizes, int64_t *ArgTypes,
           map_var_info_t *ArgNames, void **ArgMappers, int32_t TeamNum,
           int32_t ThreadLimit, int IsTeamConstruct, AsyncInfoTy &AsyncInfo) {
  int32_t DeviceId = Device.DeviceID;

  TableMap *TM = getTableMap(HostPtr);
  // No map for this host pointer found!
  if (!TM) {
    REPORT("Host ptr " DPxMOD " does not have a matching target pointer.\n",
           DPxPTR(HostPtr));
    return OFFLOAD_FAIL;
  }

  // get target table.
  __tgt_target_table *TargetTable = nullptr;
  {
    std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
    assert(TM->Table->TargetsTable.size() > (size_t)DeviceId &&
           "Not expecting a device ID outside the table's bounds!");
    TargetTable = TM->Table->TargetsTable[DeviceId];
  }
  assert(TargetTable && "Global data has not been mapped\n");

  // We need to keep bases and offsets separate. Sometimes (e.g. in OpenCL) we
  // need to manifest base pointers prior to launching a kernel. Even if we have
  // mapped an object only partially, e.g. A[N:M], although the kernel is
  // expected to access elements starting at address &A[N] and beyond, we still
  // need to manifest the base of the array &A[0]. In other cases, e.g. the COI
  // API, we need the begin address itself, i.e. &A[N], as the API operates on
  // begin addresses, not bases. That's why we pass args and offsets as two
  // separate entities so that each plugin can do what it needs. This behavior
  // was introduced via https://reviews.llvm.org/D33028 and commit 1546d319244c.
  std::vector<void *> TgtArgs;
  std::vector<ptrdiff_t> TgtOffsets;

  PrivateArgumentManagerTy PrivateArgumentManager(Device, AsyncInfo);

  int Ret;
  if (ArgNum) {
    // Process data, such as data mapping, before launching the kernel
    Ret = processDataBefore(loc, DeviceId, HostPtr, ArgNum, ArgBases, Args,
                            ArgSizes, ArgTypes, ArgNames, ArgMappers, TgtArgs,
                            TgtOffsets, PrivateArgumentManager, AsyncInfo);
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Failed to process data before launching the kernel.\n");
      return OFFLOAD_FAIL;
    }
  }

  // Launch device execution.
  void *TgtEntryPtr = TargetTable->EntriesBegin[TM->Index].addr;
  DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n",
     TargetTable->EntriesBegin[TM->Index].name, DPxPTR(TgtEntryPtr), TM->Index);

  {
    TIMESCOPE_WITH_NAME_AND_IDENT(
        IsTeamConstruct ? "runTargetTeamRegion" : "runTargetRegion", loc);
    if (IsTeamConstruct)
      Ret = Device.runTeamRegion(TgtEntryPtr, &TgtArgs[0], &TgtOffsets[0],
                                 TgtArgs.size(), TeamNum, ThreadLimit,
                                 getLoopTripCount(DeviceId), AsyncInfo);
    else
      Ret = Device.runRegion(TgtEntryPtr, &TgtArgs[0], &TgtOffsets[0],
                             TgtArgs.size(), AsyncInfo);
  }

  if (Ret != OFFLOAD_SUCCESS) {
    REPORT("Executing target region abort target.\n");
    return OFFLOAD_FAIL;
  }

  if (ArgNum) {
    // Transfer data back and deallocate target memory for (first-)private
    // variables
    Ret = processDataAfter(loc, DeviceId, HostPtr, ArgNum, ArgBases, Args,
                           ArgSizes, ArgTypes, ArgNames, ArgMappers,
                           PrivateArgumentManager, AsyncInfo);
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Failed to process data after launching the kernel.\n");
      return OFFLOAD_FAIL;
    }
  }

  return OFFLOAD_SUCCESS;
}