//===------ omptarget.cpp - Target independent OpenMP target RTL -- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Implementation of the interface to be used by Clang during the codegen of a
// target region.
//
//===----------------------------------------------------------------------===//

#include "omptarget.h"
#include "device.h"
#include "private.h"
#include "rtl.h"

#include <cassert>
#include <vector>

int AsyncInfoTy::synchronize() {
  int Result = OFFLOAD_SUCCESS;
  if (AsyncInfo.Queue) {
    // If we have a queue we need to synchronize it now.
    Result = Device.synchronize(*this);
    assert(AsyncInfo.Queue == nullptr &&
           "The device plugin should have nulled the queue to indicate there "
           "are no outstanding actions!");
  }
  return Result;
}

void *&AsyncInfoTy::getVoidPtrLocation() {
  BufferLocations.push_back(nullptr);
  return BufferLocations.back();
}

/* All begin addresses for partially mapped structs must be 8-aligned in order
 * to ensure proper alignment of members. E.g.
 *
 * struct S {
 *   int a;   // 4-aligned
 *   int b;   // 4-aligned
 *   int *p;  // 8-aligned
 * } s1;
 * ...
 * #pragma omp target map(tofrom: s1.b, s1.p[0:N])
 * {
 *   s1.b = 5;
 *   for (int i...) s1.p[i] = ...;
 * }
 *
 * Here we are mapping s1 starting from member b, so BaseAddress=&s1=&s1.a and
 * BeginAddress=&s1.b. Let's assume that the struct begins at address 0x100,
 * then &s1.a=0x100, &s1.b=0x104, &s1.p=0x108. Each member obeys the alignment
 * requirements for its type. Now, when we allocate memory on the device, in
 * CUDA's case cuMemAlloc() returns an address which is at least 256-aligned.
 * This means that the chunk of the struct on the device will start at a
 * 256-aligned address, let's say 0x200. Then the address of b will be 0x200 and
 * address of p will be a misaligned 0x204 (on the host there was no need to add
 * padding between b and p, so p comes exactly 4 bytes after b). If the device
 * kernel tries to access s1.p, a misaligned address error occurs (as reported
 * by the CUDA plugin). By padding the begin address down to a multiple of 8 and
 * extending the size of the allocated chunk accordingly, the chunk on the
 * device will start at 0x200 with the padding (4 bytes), then &s1.b=0x204 and
 * &s1.p=0x208, as they should be to satisfy the alignment requirements.
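 *
 * In code terms (see the combined-entry handling in targetDataBegin below),
 * the adjustment is simply:
 *   padding = (int64_t)HstPtrBegin % Alignment;
 *   HstPtrBegin = (char *)HstPtrBegin - padding;
 *   data_size += padding;
 * For &s1.b=0x104 this yields padding=4, so the mapped region starts at 0x100
 * and the device copy keeps the host layout.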
 */
static const int64_t Alignment = 8;

/// Map global data and execute pending ctors
static int InitLibrary(DeviceTy &Device) {
  /*
   * Map global data
   */
  int32_t device_id = Device.DeviceID;
  int rc = OFFLOAD_SUCCESS;
  bool supportsEmptyImages = Device.RTL->supports_empty_images &&
                             Device.RTL->supports_empty_images() > 0;

  Device.PendingGlobalsMtx.lock();
  PM->TrlTblMtx.lock();
  for (auto *HostEntriesBegin : PM->HostEntriesBeginRegistrationOrder) {
    TranslationTable *TransTable =
        &PM->HostEntriesBeginToTransTable[HostEntriesBegin];
    if (TransTable->HostTable.EntriesBegin ==
            TransTable->HostTable.EntriesEnd &&
        !supportsEmptyImages) {
      // No host entry so no need to proceed
      continue;
    }

    if (TransTable->TargetsTable[device_id] != 0) {
      // Library entries have already been processed
      continue;
    }

    // 1) get image.
    assert(TransTable->TargetsImages.size() > (size_t)device_id &&
           "Not expecting a device ID outside the table's bounds!");
    __tgt_device_image *img = TransTable->TargetsImages[device_id];
    if (!img) {
      REPORT("No image loaded for device id %d.\n", device_id);
      rc = OFFLOAD_FAIL;
      break;
    }
    // 2) load image into the target table.
    __tgt_target_table *TargetTable = TransTable->TargetsTable[device_id] =
        Device.load_binary(img);
    // Unable to get table for this image: invalidate image and fail.
    if (!TargetTable) {
      REPORT("Unable to generate entries table for device id %d.\n", device_id);
      TransTable->TargetsImages[device_id] = 0;
      rc = OFFLOAD_FAIL;
      break;
    }

    // Verify whether the two table sizes match.
    size_t hsize =
        TransTable->HostTable.EntriesEnd - TransTable->HostTable.EntriesBegin;
    size_t tsize = TargetTable->EntriesEnd - TargetTable->EntriesBegin;

    // Invalid image for these host entries!
    if (hsize != tsize) {
      REPORT("Host and Target tables mismatch for device id %d [%zx != %zx].\n",
             device_id, hsize, tsize);
      TransTable->TargetsImages[device_id] = 0;
      TransTable->TargetsTable[device_id] = 0;
      rc = OFFLOAD_FAIL;
      break;
    }

    // process global data that needs to be mapped.
    Device.DataMapMtx.lock();
    __tgt_target_table *HostTable = &TransTable->HostTable;
    for (__tgt_offload_entry *CurrDeviceEntry = TargetTable->EntriesBegin,
                             *CurrHostEntry = HostTable->EntriesBegin,
                             *EntryDeviceEnd = TargetTable->EntriesEnd;
         CurrDeviceEntry != EntryDeviceEnd;
         CurrDeviceEntry++, CurrHostEntry++) {
      if (CurrDeviceEntry->size != 0) {
        // has data.
        assert(CurrDeviceEntry->size == CurrHostEntry->size &&
               "data size mismatch");

        // Fortran may use multiple weak declarations for the same symbol,
        // therefore we must allow for multiple weak symbols to be loaded from
        // the fat binary. Treat these mappings as any other "regular" mapping.
        // Add entry to map.
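        // If the host symbol is already mapped (e.g. a duplicate weak
        // definition coming from another image), keep the existing entry and
        // skip it here.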
        if (Device.getTgtPtrBegin(CurrHostEntry->addr, CurrHostEntry->size))
          continue;
        DP("Add mapping from host " DPxMOD " to device " DPxMOD " with size %zu"
           "\n",
           DPxPTR(CurrHostEntry->addr), DPxPTR(CurrDeviceEntry->addr),
           CurrDeviceEntry->size);
        Device.HostDataToTargetMap.emplace(
            (uintptr_t)CurrHostEntry->addr /*HstPtrBase*/,
            (uintptr_t)CurrHostEntry->addr /*HstPtrBegin*/,
            (uintptr_t)CurrHostEntry->addr + CurrHostEntry->size /*HstPtrEnd*/,
            (uintptr_t)CurrDeviceEntry->addr /*TgtPtrBegin*/,
            false /*UseHoldRefCount*/, nullptr /*Name*/,
            true /*IsRefCountINF*/);
      }
    }
    Device.DataMapMtx.unlock();
  }
  PM->TrlTblMtx.unlock();

  if (rc != OFFLOAD_SUCCESS) {
    Device.PendingGlobalsMtx.unlock();
    return rc;
  }

  /*
   * Run ctors for static objects
   */
  if (!Device.PendingCtorsDtors.empty()) {
    AsyncInfoTy AsyncInfo(Device);
    // Call all ctors for all libraries registered so far
    for (auto &lib : Device.PendingCtorsDtors) {
      if (!lib.second.PendingCtors.empty()) {
        DP("Has pending ctors... call now\n");
        for (auto &entry : lib.second.PendingCtors) {
          void *ctor = entry;
          int rc =
              target(nullptr, Device, ctor, 0, nullptr, nullptr, nullptr,
                     nullptr, nullptr, nullptr, 1, 1, true /*team*/, AsyncInfo);
          if (rc != OFFLOAD_SUCCESS) {
            REPORT("Running ctor " DPxMOD " failed.\n", DPxPTR(ctor));
            Device.PendingGlobalsMtx.unlock();
            return OFFLOAD_FAIL;
          }
        }
        // Clear the list to indicate that this device has been used
        lib.second.PendingCtors.clear();
        DP("Done with pending ctors for lib " DPxMOD "\n", DPxPTR(lib.first));
      }
    }
    // All constructors have been issued, wait for them now.
    if (AsyncInfo.synchronize() != OFFLOAD_SUCCESS)
      return OFFLOAD_FAIL;
  }
  Device.HasPendingGlobals = false;
  Device.PendingGlobalsMtx.unlock();

  return OFFLOAD_SUCCESS;
}

void handleTargetOutcome(bool Success, ident_t *Loc) {
  switch (PM->TargetOffloadPolicy) {
  case tgt_disabled:
    if (Success) {
      FATAL_MESSAGE0(1, "expected no offloading while offloading is disabled");
    }
    break;
  case tgt_default:
    FATAL_MESSAGE0(1, "default offloading policy must be switched to "
                      "mandatory or disabled");
    break;
  case tgt_mandatory:
    if (!Success) {
      if (getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE)
        for (auto &Device : PM->Devices)
          dumpTargetPointerMappings(Loc, *Device);
      else
        FAILURE_MESSAGE("Run with LIBOMPTARGET_INFO=%d to dump host-target "
                        "pointer mappings.\n",
                        OMP_INFOTYPE_DUMP_TABLE);

      SourceInfo info(Loc);
      if (info.isAvailible())
        fprintf(stderr, "%s:%d:%d: ", info.getFilename(), info.getLine(),
                info.getColumn());
      else
        FAILURE_MESSAGE("Source location information not present. Compile with "
                        "-g or -gline-tables-only.\n");
      FATAL_MESSAGE0(
          1, "failure of target construct while offloading is mandatory");
    } else {
      if (getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE)
        for (auto &Device : PM->Devices)
          dumpTargetPointerMappings(Loc, *Device);
    }
    break;
  }
}

static void handleDefaultTargetOffload() {
  PM->TargetOffloadMtx.lock();
  if (PM->TargetOffloadPolicy == tgt_default) {
    if (omp_get_num_devices() > 0) {
      DP("Default TARGET OFFLOAD policy is now mandatory "
         "(devices were found)\n");
      PM->TargetOffloadPolicy = tgt_mandatory;
    } else {
      DP("Default TARGET OFFLOAD policy is now disabled "
         "(no devices were found)\n");
      PM->TargetOffloadPolicy = tgt_disabled;
    }
  }
  PM->TargetOffloadMtx.unlock();
}

static bool isOffloadDisabled() {
  if (PM->TargetOffloadPolicy == tgt_default)
    handleDefaultTargetOffload();
  return PM->TargetOffloadPolicy == tgt_disabled;
}

// If offload is enabled, ensure that device DeviceID has been initialized,
// global ctors have been executed, and global data has been mapped.
//
// The return bool indicates if the offload is to the host device.
// There are three possible results:
// - Return false if the target device is ready for offload.
// - Return true without reporting a runtime error if offload is
//   disabled, perhaps because the initial device was specified.
// - Report a runtime error and return true.
//
// If DeviceID == OFFLOAD_DEVICE_DEFAULT, set DeviceID to the default device.
// This step might be skipped if offload is disabled.
bool checkDeviceAndCtors(int64_t &DeviceID, ident_t *Loc) {
  if (isOffloadDisabled()) {
    DP("Offload is disabled\n");
    return true;
  }

  if (DeviceID == OFFLOAD_DEVICE_DEFAULT) {
    DeviceID = omp_get_default_device();
    DP("Use default device id %" PRId64 "\n", DeviceID);
  }

  // Proposed behavior for OpenMP 5.2 in OpenMP spec github issue 2669.
  if (omp_get_num_devices() == 0) {
    DP("omp_get_num_devices() == 0 but offload is mandatory\n");
    handleTargetOutcome(false, Loc);
    return true;
  }

  if (DeviceID == omp_get_initial_device()) {
    DP("Device is host (%" PRId64 "), returning as if offload is disabled\n",
       DeviceID);
    return true;
  }

  // Is device ready?
  if (!device_is_ready(DeviceID)) {
    REPORT("Device %" PRId64 " is not ready.\n", DeviceID);
    handleTargetOutcome(false, Loc);
    return true;
  }

  // Get device info.
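  // If the device still has pending globals, InitLibrary below maps them and
  // runs any outstanding ctors before the device is reported as ready.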
  DeviceTy &Device = *PM->Devices[DeviceID];

  // Check whether global data has been mapped for this device
  Device.PendingGlobalsMtx.lock();
  bool hasPendingGlobals = Device.HasPendingGlobals;
  Device.PendingGlobalsMtx.unlock();
  if (hasPendingGlobals && InitLibrary(Device) != OFFLOAD_SUCCESS) {
    REPORT("Failed to init globals on device %" PRId64 "\n", DeviceID);
    handleTargetOutcome(false, Loc);
    return true;
  }

  return false;
}

static int32_t getParentIndex(int64_t type) {
  return ((type & OMP_TGT_MAPTYPE_MEMBER_OF) >> 48) - 1;
}

void *targetAllocExplicit(size_t size, int device_num, int kind,
                          const char *name) {
  TIMESCOPE();
  DP("Call to %s for device %d requesting %zu bytes\n", name, device_num, size);

  if (size <= 0) {
    DP("Call to %s with non-positive length\n", name);
    return NULL;
  }

  void *rc = NULL;

  if (device_num == omp_get_initial_device()) {
    rc = malloc(size);
    DP("%s returns host ptr " DPxMOD "\n", name, DPxPTR(rc));
    return rc;
  }

  if (!device_is_ready(device_num)) {
    DP("%s returns NULL ptr\n", name);
    return NULL;
  }

  DeviceTy &Device = *PM->Devices[device_num];
  rc = Device.allocData(size, nullptr, kind);
  DP("%s returns device ptr " DPxMOD "\n", name, DPxPTR(rc));
  return rc;
}

/// Call the user-defined mapper function followed by the appropriate
/// targetData* function (targetData{Begin,End,Update}).
int targetDataMapper(ident_t *loc, DeviceTy &Device, void *arg_base, void *arg,
                     int64_t arg_size, int64_t arg_type,
                     map_var_info_t arg_names, void *arg_mapper,
                     AsyncInfoTy &AsyncInfo,
                     TargetDataFuncPtrTy target_data_function) {
  TIMESCOPE_WITH_IDENT(loc);
  DP("Calling the mapper function " DPxMOD "\n", DPxPTR(arg_mapper));

  // The mapper function fills up Components.
  MapperComponentsTy MapperComponents;
  MapperFuncPtrTy MapperFuncPtr = (MapperFuncPtrTy)(arg_mapper);
  (*MapperFuncPtr)((void *)&MapperComponents, arg_base, arg, arg_size, arg_type,
                   arg_names);

  // Construct new arrays for args_base, args, arg_sizes and arg_types
  // using the information in MapperComponents and call the corresponding
  // targetData* function using these new arrays.
  std::vector<void *> MapperArgsBase(MapperComponents.Components.size());
  std::vector<void *> MapperArgs(MapperComponents.Components.size());
  std::vector<int64_t> MapperArgSizes(MapperComponents.Components.size());
  std::vector<int64_t> MapperArgTypes(MapperComponents.Components.size());
  std::vector<void *> MapperArgNames(MapperComponents.Components.size());

  for (unsigned I = 0, E = MapperComponents.Components.size(); I < E; ++I) {
    auto &C = MapperComponents.Components[I];
    MapperArgsBase[I] = C.Base;
    MapperArgs[I] = C.Begin;
    MapperArgSizes[I] = C.Size;
    MapperArgTypes[I] = C.Type;
    MapperArgNames[I] = C.Name;
  }

  int rc = target_data_function(loc, Device, MapperComponents.Components.size(),
                                MapperArgsBase.data(), MapperArgs.data(),
                                MapperArgSizes.data(), MapperArgTypes.data(),
                                MapperArgNames.data(), /*arg_mappers*/ nullptr,
                                AsyncInfo, /*FromMapper=*/true);

  return rc;
}

/// Internal function to do the mapping and transfer the data to the device
int targetDataBegin(ident_t *loc, DeviceTy &Device, int32_t arg_num,
                    void **args_base, void **args, int64_t *arg_sizes,
                    int64_t *arg_types, map_var_info_t *arg_names,
                    void **arg_mappers, AsyncInfoTy &AsyncInfo,
                    bool FromMapper) {
  // process each input.
  for (int32_t i = 0; i < arg_num; ++i) {
    // Ignore private variables and arrays - there is no mapping for them.
    if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) ||
        (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE))
      continue;

    if (arg_mappers && arg_mappers[i]) {
      // Instead of executing the regular path of targetDataBegin, call the
      // targetDataMapper variant which will call targetDataBegin again
      // with new arguments.
      DP("Calling targetDataMapper for the %dth argument\n", i);

      map_var_info_t arg_name = (!arg_names) ? nullptr : arg_names[i];
      int rc = targetDataMapper(loc, Device, args_base[i], args[i],
                                arg_sizes[i], arg_types[i], arg_name,
                                arg_mappers[i], AsyncInfo, targetDataBegin);

      if (rc != OFFLOAD_SUCCESS) {
        REPORT("Call to targetDataBegin via targetDataMapper for custom mapper"
               " failed.\n");
        return OFFLOAD_FAIL;
      }

      // Skip the rest of this function, continue to the next argument.
      continue;
    }

    void *HstPtrBegin = args[i];
    void *HstPtrBase = args_base[i];
    int64_t data_size = arg_sizes[i];
    map_var_info_t HstPtrName = (!arg_names) ? nullptr : arg_names[i];

    // Adjust for proper alignment if this is a combined entry (for structs).
    // Look at the next argument - if that is MEMBER_OF this one, then this one
    // is a combined entry.
    int64_t padding = 0;
    const int next_i = i + 1;
    if (getParentIndex(arg_types[i]) < 0 && next_i < arg_num &&
        getParentIndex(arg_types[next_i]) == i) {
      padding = (int64_t)HstPtrBegin % Alignment;
      if (padding) {
        DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD
           "\n",
           padding, DPxPTR(HstPtrBegin));
        HstPtrBegin = (char *)HstPtrBegin - padding;
        data_size += padding;
      }
    }

    // Address of pointer on the host and device, respectively.
    void *Pointer_HstPtrBegin, *PointerTgtPtrBegin;
    TargetPointerResultTy Pointer_TPR;
    bool IsHostPtr = false;
    bool IsImplicit = arg_types[i] & OMP_TGT_MAPTYPE_IMPLICIT;
    // Force the creation of a device side copy of the data when:
    // a close map modifier was associated with a map that contained a to.
    bool HasCloseModifier = arg_types[i] & OMP_TGT_MAPTYPE_CLOSE;
    bool HasPresentModifier = arg_types[i] & OMP_TGT_MAPTYPE_PRESENT;
    bool HasHoldModifier = arg_types[i] & OMP_TGT_MAPTYPE_OMPX_HOLD;
    // UpdateRef is based on MEMBER_OF instead of TARGET_PARAM because if we
    // have reached this point via __tgt_target_data_begin and not __tgt_target
    // then no argument is marked as TARGET_PARAM ("omp target data map" is not
    // associated with a target region, so there are no target parameters).
    // This may be considered a hack, we could revise the scheme in the future.
    bool UpdateRef =
        !(arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) && !(FromMapper && i == 0);
    if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) {
      DP("Has a pointer entry: \n");
      // Base is address of pointer.
      //
      // Usually, the pointer is already allocated by this time. For example:
      //
      //   #pragma omp target map(s.p[0:N])
      //
      // The map entry for s comes first, and the PTR_AND_OBJ entry comes
      // afterward, so the pointer is already allocated by the time the
      // PTR_AND_OBJ entry is handled below, and PointerTgtPtrBegin is thus
      // non-null. However, "declare target link" can produce a PTR_AND_OBJ
      // entry for a global that might not already be allocated by the time the
      // PTR_AND_OBJ entry is handled below, and so the allocation might fail
      // when HasPresentModifier.
      Pointer_TPR = Device.getTargetPointer(
          HstPtrBase, HstPtrBase, sizeof(void *), /*HstPtrName=*/nullptr,
          /*HasFlagTo=*/false, /*HasFlagAlways=*/false, IsImplicit, UpdateRef,
          HasCloseModifier, HasPresentModifier, HasHoldModifier, AsyncInfo);
      PointerTgtPtrBegin = Pointer_TPR.TargetPointer;
      IsHostPtr = Pointer_TPR.Flags.IsHostPointer;
      if (!PointerTgtPtrBegin) {
        REPORT("Call to getTargetPointer returned null pointer (%s).\n",
               HasPresentModifier ? "'present' map type modifier"
                                  : "device failure or illegal mapping");
        return OFFLOAD_FAIL;
      }
      DP("There are %zu bytes allocated at target address " DPxMOD " - is%s new"
         "\n",
         sizeof(void *), DPxPTR(PointerTgtPtrBegin),
         (Pointer_TPR.Flags.IsNewEntry ? "" : " not"));
      Pointer_HstPtrBegin = HstPtrBase;
      // modify current entry.
      HstPtrBase = *(void **)HstPtrBase;
      // No need to update pointee ref count for the first element of the
      // subelement that comes from mapper.
      UpdateRef =
          (!FromMapper || i != 0); // subsequently update ref count of pointee
    }

    const bool HasFlagTo = arg_types[i] & OMP_TGT_MAPTYPE_TO;
    const bool HasFlagAlways = arg_types[i] & OMP_TGT_MAPTYPE_ALWAYS;
    auto TPR = Device.getTargetPointer(HstPtrBegin, HstPtrBase, data_size,
                                       HstPtrName, HasFlagTo, HasFlagAlways,
                                       IsImplicit, UpdateRef, HasCloseModifier,
                                       HasPresentModifier, HasHoldModifier,
                                       AsyncInfo);
    void *TgtPtrBegin = TPR.TargetPointer;
    IsHostPtr = TPR.Flags.IsHostPointer;
    // If data_size==0, then the argument could be a zero-length pointer to
    // NULL, so getOrAlloc() returning NULL is not an error.
    if (!TgtPtrBegin && (data_size || HasPresentModifier)) {
      REPORT("Call to getTargetPointer returned null pointer (%s).\n",
             HasPresentModifier ? "'present' map type modifier"
                                : "device failure or illegal mapping");
      return OFFLOAD_FAIL;
    }
    DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
       " - is%s new\n",
       data_size, DPxPTR(TgtPtrBegin), (TPR.Flags.IsNewEntry ? "" : " not"));
"" : " not")); 538 539 if (arg_types[i] & OMP_TGT_MAPTYPE_RETURN_PARAM) { 540 uintptr_t Delta = (uintptr_t)HstPtrBegin - (uintptr_t)HstPtrBase; 541 void *TgtPtrBase = (void *)((uintptr_t)TgtPtrBegin - Delta); 542 DP("Returning device pointer " DPxMOD "\n", DPxPTR(TgtPtrBase)); 543 args_base[i] = TgtPtrBase; 544 } 545 546 if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ && !IsHostPtr) { 547 // Check whether we need to update the pointer on the device 548 bool UpdateDevPtr = false; 549 550 uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase; 551 void *ExpectedTgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta); 552 553 Device.ShadowMtx.lock(); 554 auto Entry = Device.ShadowPtrMap.find(Pointer_HstPtrBegin); 555 // If this pointer is not in the map we need to insert it. If the map 556 // contains a stale entry, we need to update it (e.g. if the pointee was 557 // deallocated and later on is reallocated at another device address). The 558 // latter scenario is the subject of LIT test env/base_ptr_ref_count.c. An 559 // entry is removed from ShadowPtrMap only when the PTR of a PTR_AND_OBJ 560 // pair is deallocated, not when the OBJ is deallocated. In 561 // env/base_ptr_ref_count.c the PTR is a global "declare target" pointer, 562 // so it stays in the map for the lifetime of the application. When the 563 // OBJ is deallocated and later on allocated again (at a different device 564 // address), ShadowPtrMap still contains an entry for Pointer_HstPtrBegin 565 // which is stale, pointing to the old ExpectedTgtPtrBase of the OBJ. 566 if (Entry == Device.ShadowPtrMap.end() || 567 Entry->second.TgtPtrVal != ExpectedTgtPtrBase) { 568 // create or update shadow pointers for this entry 569 Device.ShadowPtrMap[Pointer_HstPtrBegin] = { 570 HstPtrBase, PointerTgtPtrBegin, ExpectedTgtPtrBase}; 571 UpdateDevPtr = true; 572 } 573 574 if (UpdateDevPtr) { 575 Pointer_TPR.MapTableEntry->lock(); 576 Device.ShadowMtx.unlock(); 577 578 DP("Update pointer (" DPxMOD ") -> [" DPxMOD "]\n", 579 DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin)); 580 581 void *&TgtPtrBase = AsyncInfo.getVoidPtrLocation(); 582 TgtPtrBase = ExpectedTgtPtrBase; 583 584 int Ret = Device.submitData(PointerTgtPtrBegin, &TgtPtrBase, 585 sizeof(void *), AsyncInfo); 586 if (Ret != OFFLOAD_SUCCESS) { 587 Pointer_TPR.MapTableEntry->unlock(); 588 REPORT("Copying data to device failed.\n"); 589 return OFFLOAD_FAIL; 590 } 591 void *Event = Pointer_TPR.MapTableEntry->getEvent(); 592 bool NeedNewEvent = Event == nullptr; 593 if (NeedNewEvent && Device.createEvent(&Event) != OFFLOAD_SUCCESS) { 594 Pointer_TPR.MapTableEntry->unlock(); 595 REPORT("Failed to create event.\n"); 596 return OFFLOAD_FAIL; 597 } 598 // We cannot assume the event should not be nullptr because we don't 599 // know if the target support event. But if a target doesn't, 600 // recordEvent should always return success. 601 Ret = Device.recordEvent(Event, AsyncInfo); 602 if (Ret != OFFLOAD_SUCCESS) { 603 Pointer_TPR.MapTableEntry->unlock(); 604 REPORT("Failed to set dependence on event " DPxMOD "\n", 605 DPxPTR(Event)); 606 return OFFLOAD_FAIL; 607 } 608 if (NeedNewEvent) 609 Pointer_TPR.MapTableEntry->setEvent(Event); 610 Pointer_TPR.MapTableEntry->unlock(); 611 } else 612 Device.ShadowMtx.unlock(); 613 } 614 } 615 616 return OFFLOAD_SUCCESS; 617 } 618 619 namespace { 620 /// This structure contains information to deallocate a target pointer, aka. 621 /// used to call the function \p DeviceTy::deallocTgtPtr. 
struct DeallocTgtPtrInfo {
  /// Host pointer used to look up into the map table
  void *HstPtrBegin;
  /// Size of the data
  int64_t DataSize;
  /// Whether it has \p ompx_hold modifier
  bool HasHoldModifier;

  DeallocTgtPtrInfo(void *HstPtr, int64_t Size, bool HasHoldModifier)
      : HstPtrBegin(HstPtr), DataSize(Size), HasHoldModifier(HasHoldModifier) {}
};
} // namespace

/// Internal function to undo the mapping and retrieve the data from the device.
int targetDataEnd(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
                  void **ArgBases, void **Args, int64_t *ArgSizes,
                  int64_t *ArgTypes, map_var_info_t *ArgNames,
                  void **ArgMappers, AsyncInfoTy &AsyncInfo, bool FromMapper) {
  int Ret;
  std::vector<DeallocTgtPtrInfo> DeallocTgtPtrs;
  void *FromMapperBase = nullptr;
  // process each input.
  for (int32_t I = ArgNum - 1; I >= 0; --I) {
    // Ignore private variables and arrays - there is no mapping for them.
    // Also, ignore the use_device_ptr directive; it has no effect here.
    if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
        (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
      continue;

    if (ArgMappers && ArgMappers[I]) {
      // Instead of executing the regular path of targetDataEnd, call the
      // targetDataMapper variant which will call targetDataEnd again
      // with new arguments.
      DP("Calling targetDataMapper for the %dth argument\n", I);

      map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
      Ret = targetDataMapper(loc, Device, ArgBases[I], Args[I], ArgSizes[I],
                             ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
                             targetDataEnd);

      if (Ret != OFFLOAD_SUCCESS) {
        REPORT("Call to targetDataEnd via targetDataMapper for custom mapper"
               " failed.\n");
        return OFFLOAD_FAIL;
      }

      // Skip the rest of this function, continue to the next argument.
      continue;
    }

    void *HstPtrBegin = Args[I];
    int64_t DataSize = ArgSizes[I];
    // Adjust for proper alignment if this is a combined entry (for structs).
    // Look at the next argument - if that is MEMBER_OF this one, then this one
    // is a combined entry.
    const int NextI = I + 1;
    if (getParentIndex(ArgTypes[I]) < 0 && NextI < ArgNum &&
        getParentIndex(ArgTypes[NextI]) == I) {
      int64_t Padding = (int64_t)HstPtrBegin % Alignment;
      if (Padding) {
        DP("Using a Padding of %" PRId64 " bytes for begin address " DPxMOD
           "\n",
           Padding, DPxPTR(HstPtrBegin));
        HstPtrBegin = (char *)HstPtrBegin - Padding;
        DataSize += Padding;
      }
    }

    bool IsLast, IsHostPtr;
    bool IsImplicit = ArgTypes[I] & OMP_TGT_MAPTYPE_IMPLICIT;
    bool UpdateRef = (!(ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) ||
                      (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) &&
                     !(FromMapper && I == 0);
    bool ForceDelete = ArgTypes[I] & OMP_TGT_MAPTYPE_DELETE;
    bool HasPresentModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_PRESENT;
    bool HasHoldModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_OMPX_HOLD;

    // If PTR_AND_OBJ, HstPtrBegin is address of pointee
    void *TgtPtrBegin = Device.getTgtPtrBegin(
        HstPtrBegin, DataSize, IsLast, UpdateRef, HasHoldModifier, IsHostPtr,
        !IsImplicit, ForceDelete);
    if (!TgtPtrBegin && (DataSize || HasPresentModifier)) {
      DP("Mapping does not exist (%s)\n",
         (HasPresentModifier ? "'present' map type modifier" : "ignored"));
      if (HasPresentModifier) {
        // OpenMP 5.1, sec. 2.21.7.1 "map Clause", p. 350 L10-13:
        // "If a map clause appears on a target, target data, target enter data
        // or target exit data construct with a present map-type-modifier then
        // on entry to the region if the corresponding list item does not appear
        // in the device data environment then an error occurs and the program
        // terminates."
        //
        // This should be an error upon entering an "omp target exit data". It
        // should not be an error upon exiting an "omp target data" or "omp
        // target". For "omp target data", Clang thus doesn't include present
        // modifiers for end calls. For "omp target", we have not found a valid
        // OpenMP program for which the error matters: it appears that, if a
        // program can guarantee that data is present at the beginning of an
        // "omp target" region so that there's no error there, that data is also
        // guaranteed to be present at the end.
        MESSAGE("device mapping required by 'present' map type modifier does "
                "not exist for host address " DPxMOD " (%" PRId64 " bytes)",
                DPxPTR(HstPtrBegin), DataSize);
        return OFFLOAD_FAIL;
      }
    } else {
      DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
         " - is%s last\n",
         DataSize, DPxPTR(TgtPtrBegin), (IsLast ? "" : " not"));
    }

    // OpenMP 5.1, sec. 2.21.7.1 "map Clause", p. 351 L14-16:
    // "If the map clause appears on a target, target data, or target exit data
    // construct and a corresponding list item of the original list item is not
    // present in the device data environment on exit from the region then the
    // list item is ignored."
    if (!TgtPtrBegin)
      continue;

    bool DelEntry = IsLast;

    // If this is the last element from the mapper (for end transfers, args
    // come in reverse order), do not remove the partial entry; the parent
    // struct still exists.
    if ((ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
        !(ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) {
      DelEntry = false; // protect parent struct from being deallocated
    }

    if ((ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) || DelEntry) {
      // Move data back to the host
      if (ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) {
        bool Always = ArgTypes[I] & OMP_TGT_MAPTYPE_ALWAYS;
        if ((Always || IsLast) && !IsHostPtr) {
          DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
             DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
          Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, DataSize,
                                    AsyncInfo);
          if (Ret != OFFLOAD_SUCCESS) {
            REPORT("Copying data from device failed.\n");
            return OFFLOAD_FAIL;
          }
        }
      }
      if (DelEntry && FromMapper && I == 0) {
        DelEntry = false;
        FromMapperBase = HstPtrBegin;
      }

      // If we copied back to the host a struct/array containing pointers, we
      // need to restore the original host pointer values from their shadow
      // copies. If the struct is going to be deallocated, remove any remaining
      // shadow pointer entries for this struct.
      uintptr_t LB = (uintptr_t)HstPtrBegin;
      uintptr_t UB = (uintptr_t)HstPtrBegin + DataSize;
      Device.ShadowMtx.lock();
      for (ShadowPtrListTy::iterator Itr = Device.ShadowPtrMap.begin();
           Itr != Device.ShadowPtrMap.end();) {
        void **ShadowHstPtrAddr = (void **)Itr->first;

        // An STL map is sorted on its keys; use this property
        // to quickly determine when to break out of the loop.
        if ((uintptr_t)ShadowHstPtrAddr < LB) {
          ++Itr;
          continue;
        }
        if ((uintptr_t)ShadowHstPtrAddr >= UB)
          break;

        // If we copied the struct to the host, we need to restore the pointer.
        if (ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) {
          DP("Restoring original host pointer value " DPxMOD " for host "
             "pointer " DPxMOD "\n",
             DPxPTR(Itr->second.HstPtrVal), DPxPTR(ShadowHstPtrAddr));
          *ShadowHstPtrAddr = Itr->second.HstPtrVal;
        }
        // If the struct is to be deallocated, remove the shadow entry.
        if (DelEntry) {
          DP("Removing shadow pointer " DPxMOD "\n", DPxPTR(ShadowHstPtrAddr));
          Itr = Device.ShadowPtrMap.erase(Itr);
        } else {
          ++Itr;
        }
      }
      Device.ShadowMtx.unlock();

      // Add pointer to the buffer for later deallocation
      if (DelEntry && !IsHostPtr)
        DeallocTgtPtrs.emplace_back(HstPtrBegin, DataSize, HasHoldModifier);
    }
  }

  // TODO: We should not synchronize here but pass the AsyncInfo object to the
  //       allocate/deallocate device APIs.
  //
  // We need to synchronize before deallocating data.
  Ret = AsyncInfo.synchronize();
  if (Ret != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  // Deallocate target pointer
  for (DeallocTgtPtrInfo &Info : DeallocTgtPtrs) {
    if (FromMapperBase && FromMapperBase == Info.HstPtrBegin)
      continue;
    Ret = Device.deallocTgtPtr(Info.HstPtrBegin, Info.DataSize,
                               Info.HasHoldModifier);
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Deallocating data from device failed.\n");
      return OFFLOAD_FAIL;
    }
  }

  return OFFLOAD_SUCCESS;
}

static int targetDataContiguous(ident_t *loc, DeviceTy &Device, void *ArgsBase,
                                void *HstPtrBegin, int64_t ArgSize,
                                int64_t ArgType, AsyncInfoTy &AsyncInfo) {
  TIMESCOPE_WITH_IDENT(loc);
  bool IsLast, IsHostPtr;
  void *TgtPtrBegin = Device.getTgtPtrBegin(
      HstPtrBegin, ArgSize, IsLast, /*UpdateRefCount=*/false,
      /*UseHoldRefCount=*/false, IsHostPtr, /*MustContain=*/true);
  if (!TgtPtrBegin) {
    DP("hst data:" DPxMOD " not found, becomes a noop\n", DPxPTR(HstPtrBegin));
    if (ArgType & OMP_TGT_MAPTYPE_PRESENT) {
      MESSAGE("device mapping required by 'present' motion modifier does not "
              "exist for host address " DPxMOD " (%" PRId64 " bytes)",
              DPxPTR(HstPtrBegin), ArgSize);
      return OFFLOAD_FAIL;
    }
    return OFFLOAD_SUCCESS;
  }

  if (IsHostPtr) {
    DP("hst data:" DPxMOD " unified and shared, becomes a noop\n",
       DPxPTR(HstPtrBegin));
    return OFFLOAD_SUCCESS;
  }

  if (ArgType & OMP_TGT_MAPTYPE_FROM) {
    DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
       ArgSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
    int Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, ArgSize, AsyncInfo);
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Copying data from device failed.\n");
      return OFFLOAD_FAIL;
    }

    uintptr_t LB = (uintptr_t)HstPtrBegin;
    uintptr_t UB = (uintptr_t)HstPtrBegin + ArgSize;
    Device.ShadowMtx.lock();
    for (ShadowPtrListTy::iterator IT = Device.ShadowPtrMap.begin();
         IT != Device.ShadowPtrMap.end(); ++IT) {
      void **ShadowHstPtrAddr = (void **)IT->first;
      if ((uintptr_t)ShadowHstPtrAddr < LB)
        continue;
      if ((uintptr_t)ShadowHstPtrAddr >= UB)
        break;
      DP("Restoring original host pointer value " DPxMOD
         " for host pointer " DPxMOD "\n",
         DPxPTR(IT->second.HstPtrVal), DPxPTR(ShadowHstPtrAddr));
      *ShadowHstPtrAddr = IT->second.HstPtrVal;
    }
    Device.ShadowMtx.unlock();
  }

  if (ArgType & OMP_TGT_MAPTYPE_TO) {
    DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
       ArgSize, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
    int Ret = Device.submitData(TgtPtrBegin, HstPtrBegin, ArgSize, AsyncInfo);
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Copying data to device failed.\n");
      return OFFLOAD_FAIL;
    }

    uintptr_t LB = (uintptr_t)HstPtrBegin;
    uintptr_t UB = (uintptr_t)HstPtrBegin + ArgSize;
    Device.ShadowMtx.lock();
    for (ShadowPtrListTy::iterator IT = Device.ShadowPtrMap.begin();
         IT != Device.ShadowPtrMap.end(); ++IT) {
      void **ShadowHstPtrAddr = (void **)IT->first;
      if ((uintptr_t)ShadowHstPtrAddr < LB)
        continue;
      if ((uintptr_t)ShadowHstPtrAddr >= UB)
        break;
      DP("Restoring original target pointer value " DPxMOD " for target "
         "pointer " DPxMOD "\n",
         DPxPTR(IT->second.TgtPtrVal), DPxPTR(IT->second.TgtPtrAddr));
      Ret = Device.submitData(IT->second.TgtPtrAddr, &IT->second.TgtPtrVal,
                              sizeof(void *), AsyncInfo);
      if (Ret != OFFLOAD_SUCCESS) {
        REPORT("Copying data to device failed.\n");
        Device.ShadowMtx.unlock();
        return OFFLOAD_FAIL;
      }
    }
    Device.ShadowMtx.unlock();
  }
  return OFFLOAD_SUCCESS;
}

static int targetDataNonContiguous(ident_t *loc, DeviceTy &Device,
                                   void *ArgsBase,
                                   __tgt_target_non_contig *NonContig,
                                   uint64_t Size, int64_t ArgType,
                                   int CurrentDim, int DimSize, uint64_t Offset,
                                   AsyncInfoTy &AsyncInfo) {
  TIMESCOPE_WITH_IDENT(loc);
  int Ret = OFFLOAD_SUCCESS;
  if (CurrentDim < DimSize) {
    for (unsigned int I = 0; I < NonContig[CurrentDim].Count; ++I) {
      uint64_t CurOffset =
          (NonContig[CurrentDim].Offset + I) * NonContig[CurrentDim].Stride;
      // we only need to transfer the first element for the last dimension
      // since we've already got a contiguous piece.
      if (CurrentDim != DimSize - 1 || I == 0) {
        Ret = targetDataNonContiguous(loc, Device, ArgsBase, NonContig, Size,
                                      ArgType, CurrentDim + 1, DimSize,
                                      Offset + CurOffset, AsyncInfo);
        // Stop the whole process if any contiguous piece returns anything
        // other than OFFLOAD_SUCCESS.
        if (Ret != OFFLOAD_SUCCESS)
          return Ret;
      }
    }
  } else {
    char *Ptr = (char *)ArgsBase + Offset;
    DP("Transfer of non-contiguous: host ptr " DPxMOD " offset %" PRIu64
       " len %" PRIu64 "\n",
       DPxPTR(Ptr), Offset, Size);
    Ret = targetDataContiguous(loc, Device, ArgsBase, Ptr, Size, ArgType,
                               AsyncInfo);
  }
  return Ret;
}

static int getNonContigMergedDimension(__tgt_target_non_contig *NonContig,
                                       int32_t DimSize) {
  int RemovedDim = 0;
  for (int I = DimSize - 1; I > 0; --I) {
    if (NonContig[I].Count * NonContig[I].Stride == NonContig[I - 1].Stride)
      RemovedDim++;
  }
  return RemovedDim;
}

/// Internal function to pass data to/from the target.
int targetDataUpdate(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
                     void **ArgsBase, void **Args, int64_t *ArgSizes,
                     int64_t *ArgTypes, map_var_info_t *ArgNames,
                     void **ArgMappers, AsyncInfoTy &AsyncInfo, bool) {
  // process each input.
  for (int32_t I = 0; I < ArgNum; ++I) {
    if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
        (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
      continue;

    if (ArgMappers && ArgMappers[I]) {
      // Instead of executing the regular path of targetDataUpdate, call the
      // targetDataMapper variant which will call targetDataUpdate again
      // with new arguments.
      DP("Calling targetDataMapper for the %dth argument\n", I);

      map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
      int Ret = targetDataMapper(loc, Device, ArgsBase[I], Args[I], ArgSizes[I],
                                 ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
                                 targetDataUpdate);

      if (Ret != OFFLOAD_SUCCESS) {
        REPORT("Call to targetDataUpdate via targetDataMapper for custom mapper"
               " failed.\n");
        return OFFLOAD_FAIL;
      }

      // Skip the rest of this function, continue to the next argument.
      continue;
    }

    int Ret = OFFLOAD_SUCCESS;

    if (ArgTypes[I] & OMP_TGT_MAPTYPE_NON_CONTIG) {
      __tgt_target_non_contig *NonContig = (__tgt_target_non_contig *)Args[I];
      int32_t DimSize = ArgSizes[I];
      uint64_t Size =
          NonContig[DimSize - 1].Count * NonContig[DimSize - 1].Stride;
      int32_t MergedDim = getNonContigMergedDimension(NonContig, DimSize);
      Ret = targetDataNonContiguous(
          loc, Device, ArgsBase[I], NonContig, Size, ArgTypes[I],
          /*current_dim=*/0, DimSize - MergedDim, /*offset=*/0, AsyncInfo);
    } else {
      Ret = targetDataContiguous(loc, Device, ArgsBase[I], Args[I], ArgSizes[I],
                                 ArgTypes[I], AsyncInfo);
    }
    if (Ret == OFFLOAD_FAIL)
      return OFFLOAD_FAIL;
  }
  return OFFLOAD_SUCCESS;
}

static const unsigned LambdaMapping = OMP_TGT_MAPTYPE_PTR_AND_OBJ |
                                      OMP_TGT_MAPTYPE_LITERAL |
                                      OMP_TGT_MAPTYPE_IMPLICIT;
static bool isLambdaMapping(int64_t Mapping) {
  return (Mapping & LambdaMapping) == LambdaMapping;
}

namespace {
/// Find the table information in the map or look it up in the translation
/// tables.
TableMap *getTableMap(void *HostPtr) {
  std::lock_guard<std::mutex> TblMapLock(PM->TblMapMtx);
  HostPtrToTableMapTy::iterator TableMapIt =
      PM->HostPtrToTableMap.find(HostPtr);

  if (TableMapIt != PM->HostPtrToTableMap.end())
    return &TableMapIt->second;

  // We don't have a map. So search all the registered libraries.
  TableMap *TM = nullptr;
  std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
  for (HostEntriesBeginToTransTableTy::iterator Itr =
           PM->HostEntriesBeginToTransTable.begin();
       Itr != PM->HostEntriesBeginToTransTable.end(); ++Itr) {
    // get the translation table (which contains all the good info).
    TranslationTable *TransTable = &Itr->second;
    // iterate over all the host table entries to see if we can locate the
    // host_ptr.
    __tgt_offload_entry *Cur = TransTable->HostTable.EntriesBegin;
    for (uint32_t I = 0; Cur < TransTable->HostTable.EntriesEnd; ++Cur, ++I) {
      if (Cur->addr != HostPtr)
        continue;
      // we got a match, now fill the HostPtrToTableMap so that we
      // may avoid this search next time.
      TM = &(PM->HostPtrToTableMap)[HostPtr];
      TM->Table = TransTable;
      TM->Index = I;
      return TM;
    }
  }

  return nullptr;
}

/// Get loop trip count
/// FIXME: This function will not work right if calling
/// __kmpc_push_target_tripcount_mapper in one thread but doing offloading in
/// another thread, which might occur when we call task yield.
uint64_t getLoopTripCount(int64_t DeviceId) {
  DeviceTy &Device = *PM->Devices[DeviceId];
  uint64_t LoopTripCount = 0;

  {
    std::lock_guard<std::mutex> TblMapLock(PM->TblMapMtx);
    auto I = Device.LoopTripCnt.find(__kmpc_global_thread_num(NULL));
    if (I != Device.LoopTripCnt.end()) {
      LoopTripCount = I->second;
      Device.LoopTripCnt.erase(I);
      DP("loop trip count is %" PRIu64 ".\n", LoopTripCount);
    }
  }

  return LoopTripCount;
}

/// A class that manages private arguments in a target region.
class PrivateArgumentManagerTy {
  /// A data structure for the information of first-private arguments. We can
  /// use this information to optimize data transfer by packing all
  /// first-private arguments and transferring them all at once.
  struct FirstPrivateArgInfoTy {
    /// The index of the element in \p TgtArgs corresponding to the argument
    const int Index;
    /// Host pointer begin
    const char *HstPtrBegin;
    /// Host pointer end
    const char *HstPtrEnd;
    /// Aligned size
    const int64_t AlignedSize;
    /// Host pointer name
    const map_var_info_t HstPtrName = nullptr;

    FirstPrivateArgInfoTy(int Index, const void *HstPtr, int64_t Size,
                          const map_var_info_t HstPtrName = nullptr)
        : Index(Index), HstPtrBegin(reinterpret_cast<const char *>(HstPtr)),
          HstPtrEnd(HstPtrBegin + Size), AlignedSize(Size + Size % Alignment),
          HstPtrName(HstPtrName) {}
  };

  /// A vector of target pointers for all private arguments
  std::vector<void *> TgtPtrs;

  /// A vector of information of all first-private arguments to be packed
  std::vector<FirstPrivateArgInfoTy> FirstPrivateArgInfo;
  /// Host buffer for all arguments to be packed
  std::vector<char> FirstPrivateArgBuffer;
  /// The total size of all arguments to be packed
  int64_t FirstPrivateArgSize = 0;

  /// A reference to the \p DeviceTy object
  DeviceTy &Device;
  /// A reference to the \p AsyncInfoTy object
  AsyncInfoTy &AsyncInfo;

  // TODO: What would be the best value here? Should we make it configurable?
  // If the size is larger than this threshold, we will allocate and transfer it
  // immediately instead of packing it.
  static constexpr const int64_t FirstPrivateArgSizeThreshold = 1024;

public:
  /// Constructor
  PrivateArgumentManagerTy(DeviceTy &Dev, AsyncInfoTy &AsyncInfo)
      : Device(Dev), AsyncInfo(AsyncInfo) {}

  /// Add a private argument
  int addArg(void *HstPtr, int64_t ArgSize, int64_t ArgOffset,
             bool IsFirstPrivate, void *&TgtPtr, int TgtArgsIndex,
             const map_var_info_t HstPtrName = nullptr,
             const bool AllocImmediately = false) {
    // If the argument is not first-private, or its size is greater than a
    // predefined threshold, we will allocate memory and issue the transfer
    // immediately.
    if (ArgSize > FirstPrivateArgSizeThreshold || !IsFirstPrivate ||
        AllocImmediately) {
      TgtPtr = Device.allocData(ArgSize, HstPtr);
      if (!TgtPtr) {
        DP("Data allocation for %sprivate array " DPxMOD " failed.\n",
           (IsFirstPrivate ? "first-" : ""), DPxPTR(HstPtr));
        return OFFLOAD_FAIL;
      }
#ifdef OMPTARGET_DEBUG
      void *TgtPtrBase = (void *)((intptr_t)TgtPtr + ArgOffset);
      DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD
         " for %sprivate array " DPxMOD " - pushing target argument " DPxMOD
         "\n",
         ArgSize, DPxPTR(TgtPtr), (IsFirstPrivate ? "first-" : ""),
         DPxPTR(HstPtr), DPxPTR(TgtPtrBase));
#endif
      // If first-private, copy data from host
      if (IsFirstPrivate) {
        DP("Submitting firstprivate data to the device.\n");
        int Ret = Device.submitData(TgtPtr, HstPtr, ArgSize, AsyncInfo);
        if (Ret != OFFLOAD_SUCCESS) {
          DP("Copying data to device failed.\n");
          return OFFLOAD_FAIL;
        }
      }
      TgtPtrs.push_back(TgtPtr);
    } else {
      DP("Firstprivate array " DPxMOD " of size %" PRId64 " will be packed\n",
         DPxPTR(HstPtr), ArgSize);
      // When we reach this point, the argument must meet both of the following
      // requirements:
      // 1. Its size does not exceed the threshold (see the comment for
      //    FirstPrivateArgSizeThreshold);
      // 2. It is first-private (i.e. it needs to be mapped to the target
      //    device).
      // We pack all such arguments so that they can be transferred in a single
      // operation, reducing the number of data transfers. Non-first-private
      // arguments, i.e. private arguments that do not need to be mapped to the
      // target device, are not taken into account because their allocation can
      // already be very efficient with the memory manager.

      // Placeholder value
      TgtPtr = nullptr;
      FirstPrivateArgInfo.emplace_back(TgtArgsIndex, HstPtr, ArgSize,
                                       HstPtrName);
      FirstPrivateArgSize += FirstPrivateArgInfo.back().AlignedSize;
    }

    return OFFLOAD_SUCCESS;
  }

  /// Pack first-private arguments, replace placeholder pointers in \p TgtArgs,
  /// and start the transfer.
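  /// Packed arguments are laid out back to back in FirstPrivateArgBuffer (each
  /// padded to its AlignedSize); every placeholder in \p TgtArgs is then
  /// patched with the base target pointer plus that argument's offset.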
  int packAndTransfer(std::vector<void *> &TgtArgs) {
    if (!FirstPrivateArgInfo.empty()) {
      assert(FirstPrivateArgSize != 0 &&
             "FirstPrivateArgSize is 0 but FirstPrivateArgInfo is empty");
      FirstPrivateArgBuffer.resize(FirstPrivateArgSize, 0);
      auto Itr = FirstPrivateArgBuffer.begin();
      // Copy all host data to this buffer
      for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) {
        std::copy(Info.HstPtrBegin, Info.HstPtrEnd, Itr);
        Itr = std::next(Itr, Info.AlignedSize);
      }
      // Allocate target memory
      void *TgtPtr =
          Device.allocData(FirstPrivateArgSize, FirstPrivateArgBuffer.data());
      if (TgtPtr == nullptr) {
        DP("Failed to allocate target memory for private arguments.\n");
        return OFFLOAD_FAIL;
      }
      TgtPtrs.push_back(TgtPtr);
      DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD "\n",
         FirstPrivateArgSize, DPxPTR(TgtPtr));
      // Transfer data to target device
      int Ret = Device.submitData(TgtPtr, FirstPrivateArgBuffer.data(),
                                  FirstPrivateArgSize, AsyncInfo);
      if (Ret != OFFLOAD_SUCCESS) {
        DP("Failed to submit data of private arguments.\n");
        return OFFLOAD_FAIL;
      }
      // Fill in all placeholder pointers
      auto TP = reinterpret_cast<uintptr_t>(TgtPtr);
      for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) {
        void *&Ptr = TgtArgs[Info.Index];
        assert(Ptr == nullptr && "Target pointer is already set by mistake");
        Ptr = reinterpret_cast<void *>(TP);
        TP += Info.AlignedSize;
        DP("Firstprivate array " DPxMOD " of size %" PRId64 " mapped to " DPxMOD
           "\n",
           DPxPTR(Info.HstPtrBegin), Info.HstPtrEnd - Info.HstPtrBegin,
           DPxPTR(Ptr));
      }
    }

    return OFFLOAD_SUCCESS;
  }

  /// Free all target memory allocated for private arguments
  int free() {
    for (void *P : TgtPtrs) {
      int Ret = Device.deleteData(P);
      if (Ret != OFFLOAD_SUCCESS) {
        DP("Deallocation of (first-)private arrays failed.\n");
        return OFFLOAD_FAIL;
      }
    }

    TgtPtrs.clear();

    return OFFLOAD_SUCCESS;
  }
};

/// Process data before launching the kernel, including calling targetDataBegin
/// to map and transfer data to the target device and transferring
/// (first-)private variables.
static int processDataBefore(ident_t *loc, int64_t DeviceId, void *HostPtr,
                             int32_t ArgNum, void **ArgBases, void **Args,
                             int64_t *ArgSizes, int64_t *ArgTypes,
                             map_var_info_t *ArgNames, void **ArgMappers,
                             std::vector<void *> &TgtArgs,
                             std::vector<ptrdiff_t> &TgtOffsets,
                             PrivateArgumentManagerTy &PrivateArgumentManager,
                             AsyncInfoTy &AsyncInfo) {
  TIMESCOPE_WITH_NAME_AND_IDENT("mappingBeforeTargetRegion", loc);
  DeviceTy &Device = *PM->Devices[DeviceId];
  int Ret = targetDataBegin(loc, Device, ArgNum, ArgBases, Args, ArgSizes,
                            ArgTypes, ArgNames, ArgMappers, AsyncInfo);
  if (Ret != OFFLOAD_SUCCESS) {
    REPORT("Call to targetDataBegin failed, abort target.\n");
    return OFFLOAD_FAIL;
  }

  // List of (first-)private arrays allocated for this target region
  std::vector<int> TgtArgsPositions(ArgNum, -1);

  for (int32_t I = 0; I < ArgNum; ++I) {
    if (!(ArgTypes[I] & OMP_TGT_MAPTYPE_TARGET_PARAM)) {
      // This is not a target parameter, do not push it into TgtArgs.
      // Check for lambda mapping.
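      // A lambda capture arrives as PTR_AND_OBJ | LITERAL | IMPLICIT (see
      // isLambdaMapping above); the device-side copy of the captured pointer
      // is patched below with the translated device address.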
      if (isLambdaMapping(ArgTypes[I])) {
        assert((ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
               "PTR_AND_OBJ must be also MEMBER_OF.");
        unsigned Idx = getParentIndex(ArgTypes[I]);
        int TgtIdx = TgtArgsPositions[Idx];
        assert(TgtIdx != -1 && "Base address must be translated already.");
        // The parent lambda must be processed already and it must be the last
        // in TgtArgs and TgtOffsets arrays.
        void *HstPtrVal = Args[I];
        void *HstPtrBegin = ArgBases[I];
        void *HstPtrBase = Args[Idx];
        bool IsLast, IsHostPtr; // IsLast is unused.
        void *TgtPtrBase =
            (void *)((intptr_t)TgtArgs[TgtIdx] + TgtOffsets[TgtIdx]);
        DP("Parent lambda base " DPxMOD "\n", DPxPTR(TgtPtrBase));
        uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
        void *TgtPtrBegin = (void *)((uintptr_t)TgtPtrBase + Delta);
        void *&PointerTgtPtrBegin = AsyncInfo.getVoidPtrLocation();
        PointerTgtPtrBegin = Device.getTgtPtrBegin(
            HstPtrVal, ArgSizes[I], IsLast, /*UpdateRefCount=*/false,
            /*UseHoldRefCount=*/false, IsHostPtr);
        if (!PointerTgtPtrBegin) {
          DP("No lambda captured variable mapped (" DPxMOD ") - ignored\n",
             DPxPTR(HstPtrVal));
          continue;
        }
        if (IsHostPtr) {
          DP("Unified memory is active, no need to map lambda captured "
             "variable (" DPxMOD ")\n",
             DPxPTR(HstPtrVal));
          continue;
        }
        DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n",
           DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));
        Ret = Device.submitData(TgtPtrBegin, &PointerTgtPtrBegin,
                                sizeof(void *), AsyncInfo);
        if (Ret != OFFLOAD_SUCCESS) {
          REPORT("Copying data to device failed.\n");
          return OFFLOAD_FAIL;
        }
      }
      continue;
    }
    void *HstPtrBegin = Args[I];
    void *HstPtrBase = ArgBases[I];
    void *TgtPtrBegin;
    map_var_info_t HstPtrName = (!ArgNames) ? nullptr : ArgNames[I];
    ptrdiff_t TgtBaseOffset;
    bool IsLast, IsHostPtr; // unused.
    if (ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) {
      DP("Forwarding first-private value " DPxMOD " to the target construct\n",
         DPxPTR(HstPtrBase));
      TgtPtrBegin = HstPtrBase;
      TgtBaseOffset = 0;
    } else if (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE) {
      TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
      const bool IsFirstPrivate = (ArgTypes[I] & OMP_TGT_MAPTYPE_TO);
      // If there is a next argument and it depends on the current one, we need
      // to allocate the private memory immediately. If this is not the case,
      // then the argument can be marked for optimization and packed with the
      // other privates.
      const bool AllocImmediately =
          (I < ArgNum - 1 && (ArgTypes[I + 1] & OMP_TGT_MAPTYPE_MEMBER_OF));
      Ret = PrivateArgumentManager.addArg(
          HstPtrBegin, ArgSizes[I], TgtBaseOffset, IsFirstPrivate, TgtPtrBegin,
          TgtArgs.size(), HstPtrName, AllocImmediately);
      if (Ret != OFFLOAD_SUCCESS) {
        REPORT("Failed to process %sprivate argument " DPxMOD "\n",
               (IsFirstPrivate ? "first-" : ""), DPxPTR(HstPtrBegin));
"first-" : ""), DPxPTR(HstPtrBegin)); 1353 return OFFLOAD_FAIL; 1354 } 1355 } else { 1356 if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) 1357 HstPtrBase = *reinterpret_cast<void **>(HstPtrBase); 1358 TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, ArgSizes[I], IsLast, 1359 /*UpdateRefCount=*/false, 1360 /*UseHoldRefCount=*/false, IsHostPtr); 1361 TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin; 1362 #ifdef OMPTARGET_DEBUG 1363 void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset); 1364 DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD "\n", 1365 DPxPTR(TgtPtrBase), DPxPTR(HstPtrBegin)); 1366 #endif 1367 } 1368 TgtArgsPositions[I] = TgtArgs.size(); 1369 TgtArgs.push_back(TgtPtrBegin); 1370 TgtOffsets.push_back(TgtBaseOffset); 1371 } 1372 1373 assert(TgtArgs.size() == TgtOffsets.size() && 1374 "Size mismatch in arguments and offsets"); 1375 1376 // Pack and transfer first-private arguments 1377 Ret = PrivateArgumentManager.packAndTransfer(TgtArgs); 1378 if (Ret != OFFLOAD_SUCCESS) { 1379 DP("Failed to pack and transfer first private arguments\n"); 1380 return OFFLOAD_FAIL; 1381 } 1382 1383 return OFFLOAD_SUCCESS; 1384 } 1385 1386 /// Process data after launching the kernel, including transferring data back to 1387 /// host if needed and deallocating target memory of (first-)private variables. 1388 static int processDataAfter(ident_t *loc, int64_t DeviceId, void *HostPtr, 1389 int32_t ArgNum, void **ArgBases, void **Args, 1390 int64_t *ArgSizes, int64_t *ArgTypes, 1391 map_var_info_t *ArgNames, void **ArgMappers, 1392 PrivateArgumentManagerTy &PrivateArgumentManager, 1393 AsyncInfoTy &AsyncInfo) { 1394 TIMESCOPE_WITH_NAME_AND_IDENT("mappingAfterTargetRegion", loc); 1395 DeviceTy &Device = *PM->Devices[DeviceId]; 1396 1397 // Move data from device. 1398 int Ret = targetDataEnd(loc, Device, ArgNum, ArgBases, Args, ArgSizes, 1399 ArgTypes, ArgNames, ArgMappers, AsyncInfo); 1400 if (Ret != OFFLOAD_SUCCESS) { 1401 REPORT("Call to targetDataEnd failed, abort target.\n"); 1402 return OFFLOAD_FAIL; 1403 } 1404 1405 // Free target memory for private arguments 1406 Ret = PrivateArgumentManager.free(); 1407 if (Ret != OFFLOAD_SUCCESS) { 1408 REPORT("Failed to deallocate target memory for private args\n"); 1409 return OFFLOAD_FAIL; 1410 } 1411 1412 return OFFLOAD_SUCCESS; 1413 } 1414 } // namespace 1415 1416 /// performs the same actions as data_begin in case arg_num is 1417 /// non-zero and initiates run of the offloaded region on the target platform; 1418 /// if arg_num is non-zero after the region execution is done it also 1419 /// performs the same action as data_update and data_end above. This function 1420 /// returns 0 if it was able to transfer the execution to a target and an 1421 /// integer different from zero otherwise. 1422 int target(ident_t *loc, DeviceTy &Device, void *HostPtr, int32_t ArgNum, 1423 void **ArgBases, void **Args, int64_t *ArgSizes, int64_t *ArgTypes, 1424 map_var_info_t *ArgNames, void **ArgMappers, int32_t TeamNum, 1425 int32_t ThreadLimit, int IsTeamConstruct, AsyncInfoTy &AsyncInfo) { 1426 int32_t DeviceId = Device.DeviceID; 1427 1428 TableMap *TM = getTableMap(HostPtr); 1429 // No map for this host pointer found! 1430 if (!TM) { 1431 REPORT("Host ptr " DPxMOD " does not have a matching target pointer.\n", 1432 DPxPTR(HostPtr)); 1433 return OFFLOAD_FAIL; 1434 } 1435 1436 // get target table. 
  __tgt_target_table *TargetTable = nullptr;
  {
    std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
    assert(TM->Table->TargetsTable.size() > (size_t)DeviceId &&
           "Not expecting a device ID outside the table's bounds!");
    TargetTable = TM->Table->TargetsTable[DeviceId];
  }
  assert(TargetTable && "Global data has not been mapped\n");

  // We need to keep bases and offsets separate. Sometimes (e.g. in OpenCL) we
  // need to manifest base pointers prior to launching a kernel. Even if we have
  // mapped an object only partially, e.g. A[N:M], although the kernel is
  // expected to access elements starting at address &A[N] and beyond, we still
  // need to manifest the base of the array &A[0]. In other cases, e.g. the COI
  // API, we need the begin address itself, i.e. &A[N], as the API operates on
  // begin addresses, not bases. That's why we pass args and offsets as two
  // separate entities so that each plugin can do what it needs. This behavior
  // was introduced via https://reviews.llvm.org/D33028 and commit 1546d319244c.
  std::vector<void *> TgtArgs;
  std::vector<ptrdiff_t> TgtOffsets;

  PrivateArgumentManagerTy PrivateArgumentManager(Device, AsyncInfo);

  int Ret;
  if (ArgNum) {
    // Process data, such as data mapping, before launching the kernel
    Ret = processDataBefore(loc, DeviceId, HostPtr, ArgNum, ArgBases, Args,
                            ArgSizes, ArgTypes, ArgNames, ArgMappers, TgtArgs,
                            TgtOffsets, PrivateArgumentManager, AsyncInfo);
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Failed to process data before launching the kernel.\n");
      return OFFLOAD_FAIL;
    }
  }

  // Launch device execution.
  void *TgtEntryPtr = TargetTable->EntriesBegin[TM->Index].addr;
  DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n",
     TargetTable->EntriesBegin[TM->Index].name, DPxPTR(TgtEntryPtr), TM->Index);

  {
    TIMESCOPE_WITH_NAME_AND_IDENT(
        IsTeamConstruct ? "runTargetTeamRegion" : "runTargetRegion", loc);
    if (IsTeamConstruct)
      Ret = Device.runTeamRegion(TgtEntryPtr, &TgtArgs[0], &TgtOffsets[0],
                                 TgtArgs.size(), TeamNum, ThreadLimit,
                                 getLoopTripCount(DeviceId), AsyncInfo);
    else
      Ret = Device.runRegion(TgtEntryPtr, &TgtArgs[0], &TgtOffsets[0],
                             TgtArgs.size(), AsyncInfo);
  }

  if (Ret != OFFLOAD_SUCCESS) {
    REPORT("Executing target region failed, abort target.\n");
    return OFFLOAD_FAIL;
  }

  if (ArgNum) {
    // Transfer data back and deallocate target memory for (first-)private
    // variables
    Ret = processDataAfter(loc, DeviceId, HostPtr, ArgNum, ArgBases, Args,
                           ArgSizes, ArgTypes, ArgNames, ArgMappers,
                           PrivateArgumentManager, AsyncInfo);
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Failed to process data after launching the kernel.\n");
      return OFFLOAD_FAIL;
    }
  }

  return OFFLOAD_SUCCESS;
}