//===------ omptarget.cpp - Target independent OpenMP target RTL -- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Implementation of the interface to be used by Clang during the codegen of a
// target region.
//
//===----------------------------------------------------------------------===//

#include "omptarget.h"
#include "device.h"
#include "private.h"
#include "rtl.h"

#include <cassert>
#include <cstdint>
#include <vector>

int AsyncInfoTy::synchronize() {
  int Result = OFFLOAD_SUCCESS;
  if (AsyncInfo.Queue) {
    // If we have a queue we need to synchronize it now.
    Result = Device.synchronize(*this);
    assert(AsyncInfo.Queue == nullptr &&
           "The device plugin should have nulled the queue to indicate there "
           "are no outstanding actions!");
  }
  return Result;
}

void *&AsyncInfoTy::getVoidPtrLocation() {
  BufferLocations.push_back(nullptr);
  return BufferLocations.back();
}

/* All begin addresses for partially mapped structs must be 8-aligned in order
 * to ensure proper alignment of members. E.g.
 *
 *   struct S {
 *     int a;   // 4-aligned
 *     int b;   // 4-aligned
 *     int *p;  // 8-aligned
 *   } s1;
 *   ...
 *   #pragma omp target map(tofrom: s1.b, s1.p[0:N])
 *   {
 *     s1.b = 5;
 *     for (int i...) s1.p[i] = ...;
 *   }
 *
 * Here we are mapping s1 starting from member b, so BaseAddress=&s1=&s1.a and
 * BeginAddress=&s1.b. Let's assume that the struct begins at address 0x100,
 * then &s1.a=0x100, &s1.b=0x104, &s1.p=0x108. Each member obeys the alignment
 * requirements for its type. Now, when we allocate memory on the device, in
 * CUDA's case cuMemAlloc() returns an address which is at least 256-aligned.
 * This means that the chunk of the struct on the device will start at a
 * 256-aligned address, let's say 0x200. Then the address of b will be 0x200
 * and the address of p will be a misaligned 0x204 (on the host there was no
 * need to add padding between b and p, so p comes exactly 4 bytes after b).
 * If the device kernel tries to access s1.p, a misaligned address error
 * occurs (as reported by the CUDA plugin). By padding the begin address down
 * to a multiple of 8 and extending the size of the allocated chunk
 * accordingly, the chunk on the device will start at 0x200 with the padding
 * (4 bytes), then &s1.b=0x204 and &s1.p=0x208, as they should be to satisfy
 * the alignment requirements.
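 *
 * In code terms, this corresponds to the padding computation performed in
 * targetDataBegin below (illustrative arithmetic only, reusing the addresses
 * from the example above):
 *
 *   int64_t padding = (int64_t)&s1.b % Alignment; // 0x104 % 8 == 4
 *   // so we map [&s1.b - 4, &s1.b + size) instead of [&s1.b, &s1.b + size)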
 */
static const int64_t Alignment = 8;

/// Map global data and execute pending ctors
static int InitLibrary(DeviceTy &Device) {
  /*
   * Map global data
   */
  int32_t device_id = Device.DeviceID;
  int rc = OFFLOAD_SUCCESS;
  bool supportsEmptyImages = Device.RTL->supports_empty_images &&
                             Device.RTL->supports_empty_images() > 0;

  Device.PendingGlobalsMtx.lock();
  PM->TrlTblMtx.lock();
  for (auto *HostEntriesBegin : PM->HostEntriesBeginRegistrationOrder) {
    TranslationTable *TransTable =
        &PM->HostEntriesBeginToTransTable[HostEntriesBegin];
    if (TransTable->HostTable.EntriesBegin ==
            TransTable->HostTable.EntriesEnd &&
        !supportsEmptyImages) {
      // No host entry so no need to proceed
      continue;
    }

    if (TransTable->TargetsTable[device_id] != 0) {
      // Library entries have already been processed
      continue;
    }

    // 1) get image.
    assert(TransTable->TargetsImages.size() > (size_t)device_id &&
           "Not expecting a device ID outside the table's bounds!");
    __tgt_device_image *img = TransTable->TargetsImages[device_id];
    if (!img) {
      REPORT("No image loaded for device id %d.\n", device_id);
      rc = OFFLOAD_FAIL;
      break;
    }
    // 2) load image into the target table.
    __tgt_target_table *TargetTable = TransTable->TargetsTable[device_id] =
        Device.load_binary(img);
    // Unable to get table for this image: invalidate image and fail.
    if (!TargetTable) {
      REPORT("Unable to generate entries table for device id %d.\n",
             device_id);
      TransTable->TargetsImages[device_id] = 0;
      rc = OFFLOAD_FAIL;
      break;
    }

    // Verify whether the two table sizes match.
    size_t hsize =
        TransTable->HostTable.EntriesEnd - TransTable->HostTable.EntriesBegin;
    size_t tsize = TargetTable->EntriesEnd - TargetTable->EntriesBegin;

    // Invalid image for these host entries!
    if (hsize != tsize) {
      REPORT("Host and Target tables mismatch for device id %d [%zx != %zx].\n",
             device_id, hsize, tsize);
      TransTable->TargetsImages[device_id] = 0;
      TransTable->TargetsTable[device_id] = 0;
      rc = OFFLOAD_FAIL;
      break;
    }

    // process global data that needs to be mapped.
    Device.DataMapMtx.lock();
    __tgt_target_table *HostTable = &TransTable->HostTable;
    for (__tgt_offload_entry *CurrDeviceEntry = TargetTable->EntriesBegin,
                             *CurrHostEntry = HostTable->EntriesBegin,
                             *EntryDeviceEnd = TargetTable->EntriesEnd;
         CurrDeviceEntry != EntryDeviceEnd;
         CurrDeviceEntry++, CurrHostEntry++) {
      if (CurrDeviceEntry->size != 0) {
        // has data.
        assert(CurrDeviceEntry->size == CurrHostEntry->size &&
               "data size mismatch");

        // Fortran may use multiple weak declarations for the same symbol,
        // therefore we must allow for multiple weak symbols to be loaded from
        // the fat binary. Treat these mappings as any other "regular" mapping.
        // Add entry to map.
        if (Device.getTgtPtrBegin(CurrHostEntry->addr, CurrHostEntry->size))
          continue;
        DP("Add mapping from host " DPxMOD " to device " DPxMOD " with size "
           "%zu\n",
           DPxPTR(CurrHostEntry->addr), DPxPTR(CurrDeviceEntry->addr),
           CurrDeviceEntry->size);
        Device.HostDataToTargetMap.emplace(
            (uintptr_t)CurrHostEntry->addr /*HstPtrBase*/,
            (uintptr_t)CurrHostEntry->addr /*HstPtrBegin*/,
            (uintptr_t)CurrHostEntry->addr + CurrHostEntry->size /*HstPtrEnd*/,
            (uintptr_t)CurrDeviceEntry->addr /*TgtPtrBegin*/,
            false /*UseHoldRefCount*/, nullptr /*Name*/,
            true /*IsRefCountINF*/);
      }
    }
    Device.DataMapMtx.unlock();
  }
  PM->TrlTblMtx.unlock();

  if (rc != OFFLOAD_SUCCESS) {
    Device.PendingGlobalsMtx.unlock();
    return rc;
  }

  /*
   * Run ctors for static objects
   */
  if (!Device.PendingCtorsDtors.empty()) {
    AsyncInfoTy AsyncInfo(Device);
    // Call all ctors for all libraries registered so far
    for (auto &lib : Device.PendingCtorsDtors) {
      if (!lib.second.PendingCtors.empty()) {
        DP("Has pending ctors... call now\n");
        for (auto &entry : lib.second.PendingCtors) {
          void *ctor = entry;
          int rc =
              target(nullptr, Device, ctor, 0, nullptr, nullptr, nullptr,
                     nullptr, nullptr, nullptr, 1, 1, true /*team*/, AsyncInfo);
          if (rc != OFFLOAD_SUCCESS) {
            REPORT("Running ctor " DPxMOD " failed.\n", DPxPTR(ctor));
            Device.PendingGlobalsMtx.unlock();
            return OFFLOAD_FAIL;
          }
        }
        // Clear the list to indicate that this device has been used
        lib.second.PendingCtors.clear();
        DP("Done with pending ctors for lib " DPxMOD "\n", DPxPTR(lib.first));
      }
    }
    // All constructors have been issued, wait for them now.
    if (AsyncInfo.synchronize() != OFFLOAD_SUCCESS)
      return OFFLOAD_FAIL;
  }
  Device.HasPendingGlobals = false;
  Device.PendingGlobalsMtx.unlock();

  return OFFLOAD_SUCCESS;
}

void handleTargetOutcome(bool Success, ident_t *Loc) {
  switch (PM->TargetOffloadPolicy) {
  case tgt_disabled:
    if (Success) {
      FATAL_MESSAGE0(1, "expected no offloading while offloading is disabled");
    }
    break;
  case tgt_default:
    FATAL_MESSAGE0(1, "default offloading policy must be switched to "
                      "mandatory or disabled");
    break;
  case tgt_mandatory:
    if (!Success) {
      if (getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE)
        for (auto &Device : PM->Devices)
          dumpTargetPointerMappings(Loc, *Device);
      else
        FAILURE_MESSAGE("Run with LIBOMPTARGET_INFO=%d to dump host-target "
                        "pointer mappings.\n",
                        OMP_INFOTYPE_DUMP_TABLE);

      SourceInfo info(Loc);
      if (info.isAvailible())
        fprintf(stderr, "%s:%d:%d: ", info.getFilename(), info.getLine(),
                info.getColumn());
      else
        FAILURE_MESSAGE("Source location information not present. Compile "
                        "with -g or -gline-tables-only.\n");
      FATAL_MESSAGE0(
          1, "failure of target construct while offloading is mandatory");
    } else {
      if (getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE)
        for (auto &Device : PM->Devices)
          dumpTargetPointerMappings(Loc, *Device);
    }
    break;
  }
}

static void handleDefaultTargetOffload() {
  PM->TargetOffloadMtx.lock();
  if (PM->TargetOffloadPolicy == tgt_default) {
    if (omp_get_num_devices() > 0) {
      DP("Default TARGET OFFLOAD policy is now mandatory "
         "(devices were found)\n");
      PM->TargetOffloadPolicy = tgt_mandatory;
    } else {
      DP("Default TARGET OFFLOAD policy is now disabled "
         "(no devices were found)\n");
      PM->TargetOffloadPolicy = tgt_disabled;
    }
  }
  PM->TargetOffloadMtx.unlock();
}

static bool isOffloadDisabled() {
  if (PM->TargetOffloadPolicy == tgt_default)
    handleDefaultTargetOffload();
  return PM->TargetOffloadPolicy == tgt_disabled;
}

// If offload is enabled, ensure that device DeviceID has been initialized,
// global ctors have been executed, and global data has been mapped.
//
// The return bool indicates if the offload is to the host device.
// There are three possible results:
// - Return false if the target device is ready for offload.
// - Return true without reporting a runtime error if offload is
//   disabled, perhaps because the initial device was specified.
// - Report a runtime error and return true.
//
// If DeviceID == OFFLOAD_DEVICE_DEFAULT, set DeviceID to the default device.
// This step might be skipped if offload is disabled.
bool checkDeviceAndCtors(int64_t &DeviceID, ident_t *Loc) {
  if (isOffloadDisabled()) {
    DP("Offload is disabled\n");
    return true;
  }

  if (DeviceID == OFFLOAD_DEVICE_DEFAULT) {
    DeviceID = omp_get_default_device();
    DP("Use default device id %" PRId64 "\n", DeviceID);
  }

  // Proposed behavior for OpenMP 5.2 in OpenMP spec github issue 2669.
  if (omp_get_num_devices() == 0) {
    DP("omp_get_num_devices() == 0 but offload is mandatory\n");
    handleTargetOutcome(false, Loc);
    return true;
  }

  if (DeviceID == omp_get_initial_device()) {
    DP("Device is host (%" PRId64 "), returning as if offload is disabled\n",
       DeviceID);
    return true;
  }

  // Is device ready?
  if (!device_is_ready(DeviceID)) {
    REPORT("Device %" PRId64 " is not ready.\n", DeviceID);
    handleTargetOutcome(false, Loc);
    return true;
  }

  // Get device info.
  DeviceTy &Device = *PM->Devices[DeviceID];

  // Check whether global data has been mapped for this device
  Device.PendingGlobalsMtx.lock();
  bool hasPendingGlobals = Device.HasPendingGlobals;
  Device.PendingGlobalsMtx.unlock();
  if (hasPendingGlobals && InitLibrary(Device) != OFFLOAD_SUCCESS) {
    REPORT("Failed to init globals on device %" PRId64 "\n", DeviceID);
    handleTargetOutcome(false, Loc);
    return true;
  }

  return false;
}

// Returns the index of the parent entry encoded in a MEMBER_OF map type (the
// high 16 bits hold the 1-based member-of position), or -1 if the entry has
// no parent.
static int32_t getParentIndex(int64_t type) {
  return ((type & OMP_TGT_MAPTYPE_MEMBER_OF) >> 48) - 1;
}

void *targetAllocExplicit(size_t size, int device_num, int kind,
                          const char *name) {
  TIMESCOPE();
  DP("Call to %s for device %d requesting %zu bytes\n", name, device_num, size);

  if (size <= 0) {
    DP("Call to %s with non-positive length\n", name);
    return NULL;
  }

  void *rc = NULL;

  if (device_num == omp_get_initial_device()) {
    rc = malloc(size);
    DP("%s returns host ptr " DPxMOD "\n", name, DPxPTR(rc));
    return rc;
  }

  if (!device_is_ready(device_num)) {
    DP("%s returns NULL ptr\n", name);
    return NULL;
  }

  DeviceTy &Device = *PM->Devices[device_num];
  rc = Device.allocData(size, nullptr, kind);
  DP("%s returns device ptr " DPxMOD "\n", name, DPxPTR(rc));
  return rc;
}

/// Call the user-defined mapper function followed by the appropriate
/// targetData* function (targetData{Begin,End,Update}).
int targetDataMapper(ident_t *loc, DeviceTy &Device, void *arg_base, void *arg,
                     int64_t arg_size, int64_t arg_type,
                     map_var_info_t arg_names, void *arg_mapper,
                     AsyncInfoTy &AsyncInfo,
                     TargetDataFuncPtrTy target_data_function) {
  TIMESCOPE_WITH_IDENT(loc);
  DP("Calling the mapper function " DPxMOD "\n", DPxPTR(arg_mapper));

  // The mapper function fills up Components.
  MapperComponentsTy MapperComponents;
  MapperFuncPtrTy MapperFuncPtr = (MapperFuncPtrTy)(arg_mapper);
  (*MapperFuncPtr)((void *)&MapperComponents, arg_base, arg, arg_size, arg_type,
                   arg_names);

  // Construct new arrays for args_base, args, arg_sizes and arg_types
  // using the information in MapperComponents and call the corresponding
  // targetData* function using these new arrays.
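  // Illustrative sketch (hypothetical component values): a declare-mapper for
  //   struct S { int len; double *data; };
  // might fill Components with two entries such as
  //   {Base=&s,      Begin=&s,     Size=sizeof(S),   Type=TO|FROM}
  //   {Base=&s.data, Begin=s.data, Size=s.len * 8,   Type=PTR_AND_OBJ|TO|FROM}
  // which are then flattened into the parallel arrays below.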
  std::vector<void *> MapperArgsBase(MapperComponents.Components.size());
  std::vector<void *> MapperArgs(MapperComponents.Components.size());
  std::vector<int64_t> MapperArgSizes(MapperComponents.Components.size());
  std::vector<int64_t> MapperArgTypes(MapperComponents.Components.size());
  std::vector<void *> MapperArgNames(MapperComponents.Components.size());

  for (unsigned I = 0, E = MapperComponents.Components.size(); I < E; ++I) {
    auto &C = MapperComponents.Components[I];
    MapperArgsBase[I] = C.Base;
    MapperArgs[I] = C.Begin;
    MapperArgSizes[I] = C.Size;
    MapperArgTypes[I] = C.Type;
    MapperArgNames[I] = C.Name;
  }

  int rc = target_data_function(loc, Device, MapperComponents.Components.size(),
                                MapperArgsBase.data(), MapperArgs.data(),
                                MapperArgSizes.data(), MapperArgTypes.data(),
                                MapperArgNames.data(), /*arg_mappers*/ nullptr,
                                AsyncInfo, /*FromMapper=*/true);

  return rc;
}

/// Internal function to do the mapping and transfer the data to the device
int targetDataBegin(ident_t *loc, DeviceTy &Device, int32_t arg_num,
                    void **args_base, void **args, int64_t *arg_sizes,
                    int64_t *arg_types, map_var_info_t *arg_names,
                    void **arg_mappers, AsyncInfoTy &AsyncInfo,
                    bool FromMapper) {
  // process each input.
  for (int32_t i = 0; i < arg_num; ++i) {
    // Ignore private variables and arrays - there is no mapping for them.
    if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) ||
        (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE))
      continue;

    if (arg_mappers && arg_mappers[i]) {
      // Instead of executing the regular path of targetDataBegin, call the
      // targetDataMapper variant which will call targetDataBegin again
      // with new arguments.
      DP("Calling targetDataMapper for the %dth argument\n", i);

      map_var_info_t arg_name = (!arg_names) ? nullptr : arg_names[i];
      int rc = targetDataMapper(loc, Device, args_base[i], args[i],
                                arg_sizes[i], arg_types[i], arg_name,
                                arg_mappers[i], AsyncInfo, targetDataBegin);

      if (rc != OFFLOAD_SUCCESS) {
        REPORT("Call to targetDataBegin via targetDataMapper for custom mapper"
               " failed.\n");
        return OFFLOAD_FAIL;
      }

      // Skip the rest of this function, continue to the next argument.
      continue;
    }

    void *HstPtrBegin = args[i];
    void *HstPtrBase = args_base[i];
    int64_t data_size = arg_sizes[i];
    map_var_info_t HstPtrName = (!arg_names) ? nullptr : arg_names[i];

    // Adjust for proper alignment if this is a combined entry (for structs).
    // Look at the next argument - if that is MEMBER_OF this one, then this one
    // is a combined entry.
    int64_t padding = 0;
    const int next_i = i + 1;
    if (getParentIndex(arg_types[i]) < 0 && next_i < arg_num &&
        getParentIndex(arg_types[next_i]) == i) {
      padding = (int64_t)HstPtrBegin % Alignment;
      if (padding) {
        DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD
           "\n",
           padding, DPxPTR(HstPtrBegin));
        HstPtrBegin = (char *)HstPtrBegin - padding;
        data_size += padding;
      }
    }

    // Address of pointer on the host and device, respectively.
    void *Pointer_HstPtrBegin, *PointerTgtPtrBegin;
    TargetPointerResultTy Pointer_TPR;
    bool IsHostPtr = false;
    bool IsImplicit = arg_types[i] & OMP_TGT_MAPTYPE_IMPLICIT;
    // Force the creation of a device side copy of the data when:
    // a close map modifier was associated with a map that contained a to.
    bool HasCloseModifier = arg_types[i] & OMP_TGT_MAPTYPE_CLOSE;
    bool HasPresentModifier = arg_types[i] & OMP_TGT_MAPTYPE_PRESENT;
    bool HasHoldModifier = arg_types[i] & OMP_TGT_MAPTYPE_OMPX_HOLD;
    // UpdateRef is based on MEMBER_OF instead of TARGET_PARAM because if we
    // have reached this point via __tgt_target_data_begin and not __tgt_target
    // then no argument is marked as TARGET_PARAM ("omp target data map" is not
    // associated with a target region, so there are no target parameters).
    // This may be considered a hack; we could revise the scheme in the future.
    bool UpdateRef =
        !(arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) && !(FromMapper && i == 0);
    if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) {
      DP("Has a pointer entry:\n");
      // Base is address of pointer.
      //
      // Usually, the pointer is already allocated by this time. For example:
      //
      //   #pragma omp target map(s.p[0:N])
      //
      // The map entry for s comes first, and the PTR_AND_OBJ entry comes
      // afterward, so the pointer is already allocated by the time the
      // PTR_AND_OBJ entry is handled below, and PointerTgtPtrBegin is thus
      // non-null. However, "declare target link" can produce a PTR_AND_OBJ
      // entry for a global that might not already be allocated by the time the
      // PTR_AND_OBJ entry is handled below, and so the allocation might fail
      // when HasPresentModifier.
      Pointer_TPR = Device.getTargetPointer(
          HstPtrBase, HstPtrBase, sizeof(void *), /*HstPtrName=*/nullptr,
          /*HasFlagTo=*/false, /*HasFlagAlways=*/false, IsImplicit, UpdateRef,
          HasCloseModifier, HasPresentModifier, HasHoldModifier, AsyncInfo);
      PointerTgtPtrBegin = Pointer_TPR.TargetPointer;
      IsHostPtr = Pointer_TPR.Flags.IsHostPointer;
      if (!PointerTgtPtrBegin) {
        REPORT("Call to getTargetPointer returned null pointer (%s).\n",
               HasPresentModifier ? "'present' map type modifier"
                                  : "device failure or illegal mapping");
        return OFFLOAD_FAIL;
      }
      DP("There are %zu bytes allocated at target address " DPxMOD
         " - is%s new\n",
         sizeof(void *), DPxPTR(PointerTgtPtrBegin),
         (Pointer_TPR.Flags.IsNewEntry ? "" : " not"));
      Pointer_HstPtrBegin = HstPtrBase;
      // modify current entry.
      HstPtrBase = *(void **)HstPtrBase;
      // No need to update pointee ref count for the first element of the
      // subelement that comes from mapper.
      UpdateRef =
          (!FromMapper || i != 0); // subsequently update ref count of pointee
    }

    const bool HasFlagTo = arg_types[i] & OMP_TGT_MAPTYPE_TO;
    const bool HasFlagAlways = arg_types[i] & OMP_TGT_MAPTYPE_ALWAYS;
    auto TPR = Device.getTargetPointer(
        HstPtrBegin, HstPtrBase, data_size, HstPtrName, HasFlagTo,
        HasFlagAlways, IsImplicit, UpdateRef, HasCloseModifier,
        HasPresentModifier, HasHoldModifier, AsyncInfo);
    void *TgtPtrBegin = TPR.TargetPointer;
    IsHostPtr = TPR.Flags.IsHostPointer;
    // If data_size==0, then the argument could be a zero-length pointer to
    // NULL, so getOrAlloc() returning NULL is not an error.
    if (!TgtPtrBegin && (data_size || HasPresentModifier)) {
      REPORT("Call to getTargetPointer returned null pointer (%s).\n",
             HasPresentModifier ? "'present' map type modifier"
                                : "device failure or illegal mapping");
      return OFFLOAD_FAIL;
    }
    DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
       " - is%s new\n",
       data_size, DPxPTR(TgtPtrBegin), (TPR.Flags.IsNewEntry ? "" : " not"));

    if (arg_types[i] & OMP_TGT_MAPTYPE_RETURN_PARAM) {
      uintptr_t Delta = (uintptr_t)HstPtrBegin - (uintptr_t)HstPtrBase;
      void *TgtPtrBase = (void *)((uintptr_t)TgtPtrBegin - Delta);
      DP("Returning device pointer " DPxMOD "\n", DPxPTR(TgtPtrBase));
      args_base[i] = TgtPtrBase;
    }

    if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ && !IsHostPtr) {
      // Check whether we need to update the pointer on the device
      bool UpdateDevPtr = false;

      uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
      void *ExpectedTgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta);

      Device.ShadowMtx.lock();
      auto Entry = Device.ShadowPtrMap.find(Pointer_HstPtrBegin);
      // If this pointer is not in the map we need to insert it. If the map
      // contains a stale entry, we need to update it (e.g. if the pointee was
      // deallocated and later on is reallocated at another device address).
      // The latter scenario is the subject of LIT test
      // env/base_ptr_ref_count.c. An entry is removed from ShadowPtrMap only
      // when the PTR of a PTR_AND_OBJ pair is deallocated, not when the OBJ is
      // deallocated. In env/base_ptr_ref_count.c the PTR is a global "declare
      // target" pointer, so it stays in the map for the lifetime of the
      // application. When the OBJ is deallocated and later on allocated again
      // (at a different device address), ShadowPtrMap still contains an entry
      // for Pointer_HstPtrBegin which is stale, pointing to the old
      // ExpectedTgtPtrBase of the OBJ.
      if (Entry == Device.ShadowPtrMap.end() ||
          Entry->second.TgtPtrVal != ExpectedTgtPtrBase) {
        // create or update shadow pointers for this entry
        Device.ShadowPtrMap[Pointer_HstPtrBegin] = {
            HstPtrBase, PointerTgtPtrBegin, ExpectedTgtPtrBase};
        Pointer_TPR.MapTableEntry->setMayContainAttachedPointers();
        UpdateDevPtr = true;
      }

      if (UpdateDevPtr) {
        HostDataToTargetTy::LockGuard LG(*Pointer_TPR.MapTableEntry);
        Device.ShadowMtx.unlock();

        DP("Update pointer (" DPxMOD ") -> [" DPxMOD "]\n",
           DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));

        void *&TgtPtrBase = AsyncInfo.getVoidPtrLocation();
        TgtPtrBase = ExpectedTgtPtrBase;

        int Ret = Device.submitData(PointerTgtPtrBegin, &TgtPtrBase,
                                    sizeof(void *), AsyncInfo);
        if (Ret != OFFLOAD_SUCCESS) {
          REPORT("Copying data to device failed.\n");
          return OFFLOAD_FAIL;
        }
        if (Pointer_TPR.MapTableEntry->addEventIfNecessary(Device, AsyncInfo) !=
            OFFLOAD_SUCCESS)
          return OFFLOAD_FAIL;
      } else
        Device.ShadowMtx.unlock();
    }
  }

  return OFFLOAD_SUCCESS;
}

namespace {
/// This structure contains information to deallocate a target pointer, i.e.,
/// used to call the function \p DeviceTy::deallocTgtPtr.
struct DeallocTgtPtrInfo {
  /// Host pointer used to look up into the map table
  void *HstPtrBegin;
  /// Size of the data
  int64_t DataSize;
  /// Whether it has \p ompx_hold modifier
  bool HasHoldModifier;

  DeallocTgtPtrInfo(void *HstPtr, int64_t Size, bool HasHoldModifier)
      : HstPtrBegin(HstPtr), DataSize(Size), HasHoldModifier(HasHoldModifier) {}
};

/// Apply \p CB to the shadow map pointer entries in the range \p Begin to
/// \p Begin + \p Size. \p CB is called with a locked shadow pointer map and the
/// passed iterator can be updated. If the callback returns OFFLOAD_FAIL the
/// rest of the map is not checked anymore.
template <typename CBTy>
static void applyToShadowMapEntries(DeviceTy &Device, CBTy CB, void *Begin,
                                    uintptr_t Size,
                                    const TargetPointerResultTy &TPR) {
  // If we have an object that is too small to hold a pointer subobject, no
  // need to do any checking.
  if (Size < sizeof(void *))
    return;

  // If the map entry for the object was never marked as containing attached
  // pointers, no need to do any checking.
  if (TPR.MapTableEntry == HostDataToTargetListTy::iterator{} ||
      !TPR.MapTableEntry->getMayContainAttachedPointers())
    return;

  uintptr_t LB = (uintptr_t)Begin;
  uintptr_t UB = LB + Size;
  // Now we are looking into the shadow map so we need to lock it.
  Device.ShadowMtx.lock();
  for (ShadowPtrListTy::iterator Itr = Device.ShadowPtrMap.begin();
       Itr != Device.ShadowPtrMap.end();) {
    uintptr_t ShadowHstPtrAddr = (uintptr_t)Itr->first;

    // An STL map is sorted on its keys; use this property
    // to quickly determine when to break out of the loop.
    if (ShadowHstPtrAddr < LB) {
      ++Itr;
      continue;
    }
    if (ShadowHstPtrAddr >= UB)
      break;

    if (CB(Itr) == OFFLOAD_FAIL)
      break;
  }
  Device.ShadowMtx.unlock();
}

} // namespace

/// Internal function to undo the mapping and retrieve the data from the
/// device.
int targetDataEnd(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
                  void **ArgBases, void **Args, int64_t *ArgSizes,
                  int64_t *ArgTypes, map_var_info_t *ArgNames,
                  void **ArgMappers, AsyncInfoTy &AsyncInfo, bool FromMapper) {
  int Ret;
  std::vector<DeallocTgtPtrInfo> DeallocTgtPtrs;
  void *FromMapperBase = nullptr;
  // process each input.
  for (int32_t I = ArgNum - 1; I >= 0; --I) {
    // Ignore private variables and arrays - there is no mapping for them.
    // Also, ignore the use_device_ptr directive, it has no effect here.
    if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
        (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
      continue;

    if (ArgMappers && ArgMappers[I]) {
      // Instead of executing the regular path of targetDataEnd, call the
      // targetDataMapper variant which will call targetDataEnd again
      // with new arguments.
      DP("Calling targetDataMapper for the %dth argument\n", I);

      map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
      Ret = targetDataMapper(loc, Device, ArgBases[I], Args[I], ArgSizes[I],
                             ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
                             targetDataEnd);

      if (Ret != OFFLOAD_SUCCESS) {
        REPORT("Call to targetDataEnd via targetDataMapper for custom mapper"
               " failed.\n");
        return OFFLOAD_FAIL;
      }

      // Skip the rest of this function, continue to the next argument.
      continue;
    }

    void *HstPtrBegin = Args[I];
    int64_t DataSize = ArgSizes[I];
    // Adjust for proper alignment if this is a combined entry (for structs).
    // Look at the next argument - if that is MEMBER_OF this one, then this one
    // is a combined entry.
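    // (This mirrors the padding applied in targetDataBegin: the begin address
    // was padded down to an 8-byte boundary when the struct was mapped, so the
    // same adjustment is needed here for the lookup to find the entry.)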
    const int NextI = I + 1;
    if (getParentIndex(ArgTypes[I]) < 0 && NextI < ArgNum &&
        getParentIndex(ArgTypes[NextI]) == I) {
      int64_t Padding = (int64_t)HstPtrBegin % Alignment;
      if (Padding) {
        DP("Using a Padding of %" PRId64 " bytes for begin address " DPxMOD
           "\n",
           Padding, DPxPTR(HstPtrBegin));
        HstPtrBegin = (char *)HstPtrBegin - Padding;
        DataSize += Padding;
      }
    }

    bool IsLast, IsHostPtr;
    bool IsImplicit = ArgTypes[I] & OMP_TGT_MAPTYPE_IMPLICIT;
    bool UpdateRef = (!(ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) ||
                      (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) &&
                     !(FromMapper && I == 0);
    bool ForceDelete = ArgTypes[I] & OMP_TGT_MAPTYPE_DELETE;
    bool HasPresentModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_PRESENT;
    bool HasHoldModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_OMPX_HOLD;

    // If PTR_AND_OBJ, HstPtrBegin is address of pointee
    TargetPointerResultTy TPR = Device.getTgtPtrBegin(
        HstPtrBegin, DataSize, IsLast, UpdateRef, HasHoldModifier, IsHostPtr,
        !IsImplicit, ForceDelete);
    void *TgtPtrBegin = TPR.TargetPointer;
    if (!TgtPtrBegin && (DataSize || HasPresentModifier)) {
      DP("Mapping does not exist (%s)\n",
         (HasPresentModifier ? "'present' map type modifier" : "ignored"));
      if (HasPresentModifier) {
        // OpenMP 5.1, sec. 2.21.7.1 "map Clause", p. 350 L10-13:
        // "If a map clause appears on a target, target data, target enter data
        // or target exit data construct with a present map-type-modifier then
        // on entry to the region if the corresponding list item does not
        // appear in the device data environment then an error occurs and the
        // program terminates."
        //
        // This should be an error upon entering an "omp target exit data". It
        // should not be an error upon exiting an "omp target data" or "omp
        // target". For "omp target data", Clang thus doesn't include present
        // modifiers for end calls. For "omp target", we have not found a valid
        // OpenMP program for which the error matters: it appears that, if a
        // program can guarantee that data is present at the beginning of an
        // "omp target" region so that there's no error there, that data is
        // also guaranteed to be present at the end.
        MESSAGE("device mapping required by 'present' map type modifier does "
                "not exist for host address " DPxMOD " (%" PRId64 " bytes)",
                DPxPTR(HstPtrBegin), DataSize);
        return OFFLOAD_FAIL;
      }
    } else {
      DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
         " - is%s last\n",
         DataSize, DPxPTR(TgtPtrBegin), (IsLast ? "" : " not"));
    }

    // OpenMP 5.1, sec. 2.21.7.1 "map Clause", p. 351 L14-16:
    // "If the map clause appears on a target, target data, or target exit data
    // construct and a corresponding list item of the original list item is not
    // present in the device data environment on exit from the region then the
    // list item is ignored."
    if (!TgtPtrBegin)
      continue;

    bool DelEntry = IsLast;

    // If this is the last element from the mapper (for end transfers the args
    // come in reverse order), do not remove the partial entry; the parent
    // struct still exists.
    if ((ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
        !(ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) {
      DelEntry = false; // protect parent struct from being deallocated
    }

    if ((ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) || DelEntry) {
      // Move data back to the host
      if (ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) {
        bool Always = ArgTypes[I] & OMP_TGT_MAPTYPE_ALWAYS;
        if ((Always || IsLast) && !IsHostPtr) {
          DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
             DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
          Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, DataSize,
                                    AsyncInfo);
          if (Ret != OFFLOAD_SUCCESS) {
            REPORT("Copying data from device failed.\n");
            return OFFLOAD_FAIL;
          }
        }
      }
      if (DelEntry && FromMapper && I == 0) {
        DelEntry = false;
        FromMapperBase = HstPtrBegin;
      }

      // If we copied back to the host a struct/array containing pointers, we
      // need to restore the original host pointer values from their shadow
      // copies. If the struct is going to be deallocated, remove any remaining
      // shadow pointer entries for this struct.
      auto CB = [&](ShadowPtrListTy::iterator &Itr) {
        // If we copied the struct to the host, we need to restore the pointer.
        if (ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) {
          void **ShadowHstPtrAddr = (void **)Itr->first;
          *ShadowHstPtrAddr = Itr->second.HstPtrVal;
          DP("Restoring original host pointer value " DPxMOD " for host "
             "pointer " DPxMOD "\n",
             DPxPTR(Itr->second.HstPtrVal), DPxPTR(ShadowHstPtrAddr));
        }
        // If the struct is to be deallocated, remove the shadow entry.
        if (DelEntry) {
          DP("Removing shadow pointer " DPxMOD "\n",
             DPxPTR((void **)Itr->first));
          Itr = Device.ShadowPtrMap.erase(Itr);
        } else {
          ++Itr;
        }
        return OFFLOAD_SUCCESS;
      };
      applyToShadowMapEntries(Device, CB, HstPtrBegin, DataSize, TPR);

      // Add pointer to the buffer for later deallocation
      if (DelEntry && !IsHostPtr)
        DeallocTgtPtrs.emplace_back(HstPtrBegin, DataSize, HasHoldModifier);
    }
  }

  // TODO: We should not synchronize here but pass the AsyncInfo object to the
  // allocate/deallocate device APIs.
  //
  // We need to synchronize before deallocating data.
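  // Without the synchronization, a deallocation below could race with an
  // asynchronous device-to-host copy that is still queued on the stream.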
  Ret = AsyncInfo.synchronize();
  if (Ret != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  // Deallocate target pointer
  for (DeallocTgtPtrInfo &Info : DeallocTgtPtrs) {
    if (FromMapperBase && FromMapperBase == Info.HstPtrBegin)
      continue;
    Ret = Device.deallocTgtPtr(Info.HstPtrBegin, Info.DataSize,
                               Info.HasHoldModifier);
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Deallocating data from device failed.\n");
      return OFFLOAD_FAIL;
    }
  }

  return OFFLOAD_SUCCESS;
}

static int targetDataContiguous(ident_t *loc, DeviceTy &Device, void *ArgsBase,
                                void *HstPtrBegin, int64_t ArgSize,
                                int64_t ArgType, AsyncInfoTy &AsyncInfo) {
  TIMESCOPE_WITH_IDENT(loc);
  bool IsLast, IsHostPtr;
  TargetPointerResultTy TPR = Device.getTgtPtrBegin(
      HstPtrBegin, ArgSize, IsLast, /*UpdateRefCount=*/false,
      /*UseHoldRefCount=*/false, IsHostPtr, /*MustContain=*/true);
  void *TgtPtrBegin = TPR.TargetPointer;
  if (!TgtPtrBegin) {
    DP("hst data:" DPxMOD " not found, becomes a noop\n", DPxPTR(HstPtrBegin));
    if (ArgType & OMP_TGT_MAPTYPE_PRESENT) {
      MESSAGE("device mapping required by 'present' motion modifier does not "
              "exist for host address " DPxMOD " (%" PRId64 " bytes)",
              DPxPTR(HstPtrBegin), ArgSize);
      return OFFLOAD_FAIL;
    }
    return OFFLOAD_SUCCESS;
  }

  if (IsHostPtr) {
    DP("hst data:" DPxMOD " unified and shared, becomes a noop\n",
       DPxPTR(HstPtrBegin));
    return OFFLOAD_SUCCESS;
  }

  if (ArgType & OMP_TGT_MAPTYPE_FROM) {
    DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
       ArgSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
    int Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, ArgSize, AsyncInfo);
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Copying data from device failed.\n");
      return OFFLOAD_FAIL;
    }

    auto CB = [&](ShadowPtrListTy::iterator &Itr) {
      void **ShadowHstPtrAddr = (void **)Itr->first;
      *ShadowHstPtrAddr = Itr->second.HstPtrVal;
      DP("Restoring original host pointer value " DPxMOD
         " for host pointer " DPxMOD "\n",
         DPxPTR(Itr->second.HstPtrVal), DPxPTR(ShadowHstPtrAddr));
      return OFFLOAD_SUCCESS;
    };
    applyToShadowMapEntries(Device, CB, HstPtrBegin, ArgSize, TPR);
  }

  if (ArgType & OMP_TGT_MAPTYPE_TO) {
    DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
       ArgSize, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
    int Ret = Device.submitData(TgtPtrBegin, HstPtrBegin, ArgSize, AsyncInfo);
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Copying data to device failed.\n");
      return OFFLOAD_FAIL;
    }

    auto CB = [&](ShadowPtrListTy::iterator &Itr) {
      DP("Restoring original target pointer value " DPxMOD " for target "
         "pointer " DPxMOD "\n",
         DPxPTR(Itr->second.TgtPtrVal), DPxPTR(Itr->second.TgtPtrAddr));
      Ret = Device.submitData(Itr->second.TgtPtrAddr, &Itr->second.TgtPtrVal,
                              sizeof(void *), AsyncInfo);
      if (Ret != OFFLOAD_SUCCESS)
        REPORT("Copying data to device failed.\n");
      return Ret;
    };
    applyToShadowMapEntries(Device, CB, HstPtrBegin, ArgSize, TPR);
  }
  return OFFLOAD_SUCCESS;
}

static int targetDataNonContiguous(ident_t *loc, DeviceTy &Device,
                                   void *ArgsBase,
                                   __tgt_target_non_contig *NonContig,
                                   uint64_t Size, int64_t ArgType,
                                   int CurrentDim, int DimSize, uint64_t Offset,
                                   AsyncInfoTy &AsyncInfo) {
  TIMESCOPE_WITH_IDENT(loc);
  int Ret = OFFLOAD_SUCCESS;
  if (CurrentDim < DimSize) {
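    // Recurse one dimension at a time. For an (illustrative) 2-D descriptor
    //   NonContig[0] = {Offset=1, Count=2,  Stride=40}  // rows
    //   NonContig[1] = {Offset=0, Count=10, Stride=4}   // elements in a row
    // the outer loop visits byte offsets (1+0)*40 = 40 and (1+1)*40 = 80, and
    // each innermost dimension is transferred as one contiguous piece.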
    for (unsigned int I = 0; I < NonContig[CurrentDim].Count; ++I) {
      uint64_t CurOffset =
          (NonContig[CurrentDim].Offset + I) * NonContig[CurrentDim].Stride;
      // we only need to transfer the first element for the last dimension
      // since we've already got a contiguous piece.
      if (CurrentDim != DimSize - 1 || I == 0) {
        Ret = targetDataNonContiguous(loc, Device, ArgsBase, NonContig, Size,
                                      ArgType, CurrentDim + 1, DimSize,
                                      Offset + CurOffset, AsyncInfo);
        // Stop the whole process if any contiguous piece returns anything
        // other than OFFLOAD_SUCCESS.
        if (Ret != OFFLOAD_SUCCESS)
          return Ret;
      }
    }
  } else {
    char *Ptr = (char *)ArgsBase + Offset;
    DP("Transfer of non-contiguous : host ptr " DPxMOD " offset %" PRIu64
       " len %" PRIu64 "\n",
       DPxPTR(Ptr), Offset, Size);
    Ret = targetDataContiguous(loc, Device, ArgsBase, Ptr, Size, ArgType,
                               AsyncInfo);
  }
  return Ret;
}

/// Count how many inner dimensions can be merged with their outer dimension
/// because the inner dimension fully covers the outer stride
/// (Count * Stride == outer Stride); merged dimensions can be transferred as
/// one larger contiguous piece.
static int getNonContigMergedDimension(__tgt_target_non_contig *NonContig,
                                       int32_t DimSize) {
  int RemovedDim = 0;
  for (int I = DimSize - 1; I > 0; --I) {
    if (NonContig[I].Count * NonContig[I].Stride == NonContig[I - 1].Stride)
      RemovedDim++;
  }
  return RemovedDim;
}

/// Internal function to pass data to/from the target.
int targetDataUpdate(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
                     void **ArgsBase, void **Args, int64_t *ArgSizes,
                     int64_t *ArgTypes, map_var_info_t *ArgNames,
                     void **ArgMappers, AsyncInfoTy &AsyncInfo, bool) {
  // process each input.
  for (int32_t I = 0; I < ArgNum; ++I) {
    if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
        (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
      continue;

    if (ArgMappers && ArgMappers[I]) {
      // Instead of executing the regular path of targetDataUpdate, call the
      // targetDataMapper variant which will call targetDataUpdate again
      // with new arguments.
      DP("Calling targetDataMapper for the %dth argument\n", I);

      map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
      int Ret = targetDataMapper(loc, Device, ArgsBase[I], Args[I],
                                 ArgSizes[I], ArgTypes[I], ArgName,
                                 ArgMappers[I], AsyncInfo, targetDataUpdate);

      if (Ret != OFFLOAD_SUCCESS) {
        REPORT("Call to targetDataUpdate via targetDataMapper for custom "
               "mapper failed.\n");
        return OFFLOAD_FAIL;
      }

      // Skip the rest of this function, continue to the next argument.
      continue;
    }

    int Ret = OFFLOAD_SUCCESS;

    if (ArgTypes[I] & OMP_TGT_MAPTYPE_NON_CONTIG) {
      __tgt_target_non_contig *NonContig = (__tgt_target_non_contig *)Args[I];
      int32_t DimSize = ArgSizes[I];
      uint64_t Size =
          NonContig[DimSize - 1].Count * NonContig[DimSize - 1].Stride;
      int32_t MergedDim = getNonContigMergedDimension(NonContig, DimSize);
      Ret = targetDataNonContiguous(
          loc, Device, ArgsBase[I], NonContig, Size, ArgTypes[I],
          /*current_dim=*/0, DimSize - MergedDim, /*offset=*/0, AsyncInfo);
    } else {
      Ret = targetDataContiguous(loc, Device, ArgsBase[I], Args[I],
                                 ArgSizes[I], ArgTypes[I], AsyncInfo);
    }
    if (Ret == OFFLOAD_FAIL)
      return OFFLOAD_FAIL;
  }
  return OFFLOAD_SUCCESS;
}

static const unsigned LambdaMapping = OMP_TGT_MAPTYPE_PTR_AND_OBJ |
                                      OMP_TGT_MAPTYPE_LITERAL |
                                      OMP_TGT_MAPTYPE_IMPLICIT;
static bool isLambdaMapping(int64_t Mapping) {
  return (Mapping & LambdaMapping) == LambdaMapping;
}

namespace {
/// Find the table information in the map or look it up in the translation
/// tables.
TableMap *getTableMap(void *HostPtr) {
  std::lock_guard<std::mutex> TblMapLock(PM->TblMapMtx);
  HostPtrToTableMapTy::iterator TableMapIt =
      PM->HostPtrToTableMap.find(HostPtr);

  if (TableMapIt != PM->HostPtrToTableMap.end())
    return &TableMapIt->second;

  // We don't have a map. So search all the registered libraries.
  TableMap *TM = nullptr;
  std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
  for (HostEntriesBeginToTransTableTy::iterator Itr =
           PM->HostEntriesBeginToTransTable.begin();
       Itr != PM->HostEntriesBeginToTransTable.end(); ++Itr) {
    // get the translation table (which contains all the good info).
    TranslationTable *TransTable = &Itr->second;
    // iterate over all the host table entries to see if we can locate the
    // host_ptr.
    __tgt_offload_entry *Cur = TransTable->HostTable.EntriesBegin;
    for (uint32_t I = 0; Cur < TransTable->HostTable.EntriesEnd; ++Cur, ++I) {
      if (Cur->addr != HostPtr)
        continue;
      // we got a match, now fill the HostPtrToTableMap so that we
      // may avoid this search next time.
      TM = &(PM->HostPtrToTableMap)[HostPtr];
      TM->Table = TransTable;
      TM->Index = I;
      return TM;
    }
  }

  return nullptr;
}

/// Get loop trip count
/// FIXME: This function will not work right if calling
/// __kmpc_push_target_tripcount_mapper in one thread but doing offloading in
/// another thread, which might occur when we call task yield.
uint64_t getLoopTripCount(int64_t DeviceId) {
  DeviceTy &Device = *PM->Devices[DeviceId];
  uint64_t LoopTripCount = 0;

  {
    std::lock_guard<std::mutex> TblMapLock(PM->TblMapMtx);
    auto I = Device.LoopTripCnt.find(__kmpc_global_thread_num(NULL));
    if (I != Device.LoopTripCnt.end()) {
      LoopTripCount = I->second;
      Device.LoopTripCnt.erase(I);
      DP("loop trip count is %" PRIu64 ".\n", LoopTripCount);
    }
  }

  return LoopTripCount;
}

/// A class that manages private arguments in a target region.
class PrivateArgumentManagerTy {
  /// A data structure for the information of first-private arguments. We can
  /// use this information to optimize data transfer by packing all
  /// first-private arguments and transferring them all at once.
  struct FirstPrivateArgInfoTy {
    /// The index of the element in \p TgtArgs corresponding to the argument
    const int Index;
    /// Host pointer begin
    const char *HstPtrBegin;
    /// Host pointer end
    const char *HstPtrEnd;
    /// Aligned size
    const int64_t AlignedSize;
    /// Host pointer name
    const map_var_info_t HstPtrName = nullptr;

    FirstPrivateArgInfoTy(int Index, const void *HstPtr, int64_t Size,
                          const map_var_info_t HstPtrName = nullptr)
        : Index(Index), HstPtrBegin(reinterpret_cast<const char *>(HstPtr)),
          HstPtrEnd(HstPtrBegin + Size), AlignedSize(Size + Size % Alignment),
          HstPtrName(HstPtrName) {}
  };

  /// A vector of target pointers for all private arguments
  std::vector<void *> TgtPtrs;

  /// A vector of information of all first-private arguments to be packed
  std::vector<FirstPrivateArgInfoTy> FirstPrivateArgInfo;
  /// Host buffer for all arguments to be packed
  std::vector<char> FirstPrivateArgBuffer;
  /// The total size of all arguments to be packed
  int64_t FirstPrivateArgSize = 0;

  /// A reference to the \p DeviceTy object
  DeviceTy &Device;
  /// A pointer to a \p AsyncInfoTy object
  AsyncInfoTy &AsyncInfo;

  // TODO: What would be the best value here? Should we make it configurable?
  // If the size is larger than this threshold, we will allocate and transfer
  // it immediately instead of packing it.
  static constexpr const int64_t FirstPrivateArgSizeThreshold = 1024;

public:
  /// Constructor
  PrivateArgumentManagerTy(DeviceTy &Dev, AsyncInfoTy &AsyncInfo)
      : Device(Dev), AsyncInfo(AsyncInfo) {}

  /// Add a private argument
  int addArg(void *HstPtr, int64_t ArgSize, int64_t ArgOffset,
             bool IsFirstPrivate, void *&TgtPtr, int TgtArgsIndex,
             const map_var_info_t HstPtrName = nullptr,
             const bool AllocImmediately = false) {
    // If the argument is not first-private, or its size is greater than a
    // predefined threshold, we will allocate memory and issue the transfer
    // immediately.
    if (ArgSize > FirstPrivateArgSizeThreshold || !IsFirstPrivate ||
        AllocImmediately) {
      TgtPtr = Device.allocData(ArgSize, HstPtr);
      if (!TgtPtr) {
        DP("Data allocation for %sprivate array " DPxMOD " failed.\n",
           (IsFirstPrivate ? "first-" : ""), DPxPTR(HstPtr));
        return OFFLOAD_FAIL;
      }
#ifdef OMPTARGET_DEBUG
      void *TgtPtrBase = (void *)((intptr_t)TgtPtr + ArgOffset);
      DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD
         " for %sprivate array " DPxMOD " - pushing target argument " DPxMOD
         "\n",
         ArgSize, DPxPTR(TgtPtr), (IsFirstPrivate ? "first-" : ""),
         DPxPTR(HstPtr), DPxPTR(TgtPtrBase));
#endif
      // If first-private, copy data from host
      if (IsFirstPrivate) {
        DP("Submitting firstprivate data to the device.\n");
        int Ret = Device.submitData(TgtPtr, HstPtr, ArgSize, AsyncInfo);
        if (Ret != OFFLOAD_SUCCESS) {
          DP("Copying data to device failed.\n");
          return OFFLOAD_FAIL;
        }
      }
      TgtPtrs.push_back(TgtPtr);
    } else {
      DP("Firstprivate array " DPxMOD " of size %" PRId64 " will be packed\n",
         DPxPTR(HstPtr), ArgSize);
      // When we reach this point, the argument must meet all the following
      // requirements:
      // 1. Its size does not exceed the threshold (see the comment for
      //    FirstPrivateArgSizeThreshold);
      // 2. It must be first-private (i.e., it needs to be mapped to the
      //    target device).
      // We pack all such arguments and transfer them all at once to reduce
      // the number of data transfers. We do not take non-first-private
      // arguments (i.e., private arguments that do not need to be mapped to
      // the target device) into account because data allocation can be very
      // efficient with the memory manager.

      // Placeholder value
      TgtPtr = nullptr;
      FirstPrivateArgInfo.emplace_back(TgtArgsIndex, HstPtr, ArgSize,
                                       HstPtrName);
      FirstPrivateArgSize += FirstPrivateArgInfo.back().AlignedSize;
    }

    return OFFLOAD_SUCCESS;
  }

  /// Pack first-private arguments, replace placeholder pointers in \p TgtArgs,
  /// and start the transfer.
  int packAndTransfer(std::vector<void *> &TgtArgs) {
    if (!FirstPrivateArgInfo.empty()) {
      assert(FirstPrivateArgSize != 0 &&
             "FirstPrivateArgSize is 0 but FirstPrivateArgInfo is empty");
      FirstPrivateArgBuffer.resize(FirstPrivateArgSize, 0);
      auto Itr = FirstPrivateArgBuffer.begin();
      // Copy all host data to this buffer
      for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) {
        std::copy(Info.HstPtrBegin, Info.HstPtrEnd, Itr);
        Itr = std::next(Itr, Info.AlignedSize);
      }
      // Allocate target memory
      void *TgtPtr =
          Device.allocData(FirstPrivateArgSize, FirstPrivateArgBuffer.data());
      if (TgtPtr == nullptr) {
        DP("Failed to allocate target memory for private arguments.\n");
        return OFFLOAD_FAIL;
      }
      TgtPtrs.push_back(TgtPtr);
      DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD "\n",
         FirstPrivateArgSize, DPxPTR(TgtPtr));
      // Transfer data to target device
      int Ret = Device.submitData(TgtPtr, FirstPrivateArgBuffer.data(),
                                  FirstPrivateArgSize, AsyncInfo);
      if (Ret != OFFLOAD_SUCCESS) {
        DP("Failed to submit data of private arguments.\n");
        return OFFLOAD_FAIL;
      }
      // Fill in all placeholder pointers
      auto TP = reinterpret_cast<uintptr_t>(TgtPtr);
      for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) {
        void *&Ptr = TgtArgs[Info.Index];
        assert(Ptr == nullptr && "Target pointer is already set by mistake");
        Ptr = reinterpret_cast<void *>(TP);
        TP += Info.AlignedSize;
        DP("Firstprivate array " DPxMOD " of size %" PRId64 " mapped to " DPxMOD
           "\n",
           DPxPTR(Info.HstPtrBegin), Info.HstPtrEnd - Info.HstPtrBegin,
           DPxPTR(Ptr));
      }
    }

    return OFFLOAD_SUCCESS;
  }

  /// Free all target memory allocated for private arguments
  int free() {
    for (void *P : TgtPtrs) {
      int Ret = Device.deleteData(P);
      if (Ret != OFFLOAD_SUCCESS) {
        DP("Deallocation of (first-)private arrays failed.\n");
        return OFFLOAD_FAIL;
      }
    }

    TgtPtrs.clear();

    return OFFLOAD_SUCCESS;
  }
};

/// Process data before launching the kernel, including calling targetDataBegin
/// to map and transfer data to the target device, and transferring
/// (first-)private variables.
static int processDataBefore(ident_t *loc, int64_t DeviceId, void *HostPtr,
                             int32_t ArgNum, void **ArgBases, void **Args,
                             int64_t *ArgSizes, int64_t *ArgTypes,
                             map_var_info_t *ArgNames, void **ArgMappers,
                             std::vector<void *> &TgtArgs,
                             std::vector<ptrdiff_t> &TgtOffsets,
                             PrivateArgumentManagerTy &PrivateArgumentManager,
                             AsyncInfoTy &AsyncInfo) {
  TIMESCOPE_WITH_NAME_AND_IDENT("mappingBeforeTargetRegion", loc);
  DeviceTy &Device = *PM->Devices[DeviceId];
  int Ret = targetDataBegin(loc, Device, ArgNum, ArgBases, Args, ArgSizes,
                            ArgTypes, ArgNames, ArgMappers, AsyncInfo);
  if (Ret != OFFLOAD_SUCCESS) {
    REPORT("Call to targetDataBegin failed, abort target.\n");
    return OFFLOAD_FAIL;
  }

  // List of (first-)private arrays allocated for this target region
  std::vector<int> TgtArgsPositions(ArgNum, -1);

  for (int32_t I = 0; I < ArgNum; ++I) {
    if (!(ArgTypes[I] & OMP_TGT_MAPTYPE_TARGET_PARAM)) {
      // This is not a target parameter, do not push it into TgtArgs.
      // Check for lambda mapping.
      if (isLambdaMapping(ArgTypes[I])) {
        assert((ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
               "PTR_AND_OBJ must be also MEMBER_OF.");
        unsigned Idx = getParentIndex(ArgTypes[I]);
        int TgtIdx = TgtArgsPositions[Idx];
        assert(TgtIdx != -1 && "Base address must be translated already.");
        // The parent lambda must be processed already and it must be the last
        // in TgtArgs and TgtOffsets arrays.
        void *HstPtrVal = Args[I];
        void *HstPtrBegin = ArgBases[I];
        void *HstPtrBase = Args[Idx];
        bool IsLast, IsHostPtr; // IsLast is unused.
        void *TgtPtrBase =
            (void *)((intptr_t)TgtArgs[TgtIdx] + TgtOffsets[TgtIdx]);
        DP("Parent lambda base " DPxMOD "\n", DPxPTR(TgtPtrBase));
        uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
        void *TgtPtrBegin = (void *)((uintptr_t)TgtPtrBase + Delta);
        void *&PointerTgtPtrBegin = AsyncInfo.getVoidPtrLocation();
        TargetPointerResultTy TPR = Device.getTgtPtrBegin(
            HstPtrVal, ArgSizes[I], IsLast, /*UpdateRefCount=*/false,
            /*UseHoldRefCount=*/false, IsHostPtr);
        PointerTgtPtrBegin = TPR.TargetPointer;
        if (!PointerTgtPtrBegin) {
          DP("No lambda captured variable mapped (" DPxMOD ") - ignored\n",
             DPxPTR(HstPtrVal));
          continue;
        }
        if (IsHostPtr) {
          DP("Unified memory is active, no need to map lambda captured "
             "variable (" DPxMOD ")\n",
             DPxPTR(HstPtrVal));
          continue;
        }
        DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n",
           DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));
        Ret = Device.submitData(TgtPtrBegin, &PointerTgtPtrBegin,
                                sizeof(void *), AsyncInfo);
        if (Ret != OFFLOAD_SUCCESS) {
          REPORT("Copying data to device failed.\n");
          return OFFLOAD_FAIL;
        }
      }
      continue;
    }
    void *HstPtrBegin = Args[I];
    void *HstPtrBase = ArgBases[I];
    void *TgtPtrBegin;
    map_var_info_t HstPtrName = (!ArgNames) ? nullptr : ArgNames[I];
    ptrdiff_t TgtBaseOffset;
    bool IsLast, IsHostPtr; // unused.
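    // The dispatch below distinguishes three argument kinds: literal
    // firstprivate scalars are passed to the kernel by value, private arrays
    // go through the PrivateArgumentManager, and everything else is looked up
    // in the mapping table populated by targetDataBegin above.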
    TargetPointerResultTy TPR;
    if (ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) {
      DP("Forwarding first-private value " DPxMOD " to the target construct\n",
         DPxPTR(HstPtrBase));
      TgtPtrBegin = HstPtrBase;
      TgtBaseOffset = 0;
    } else if (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE) {
      TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
      const bool IsFirstPrivate = (ArgTypes[I] & OMP_TGT_MAPTYPE_TO);
      // If there is a next argument and it depends on the current one, we need
      // to allocate the private memory immediately. If this is not the case,
      // then the argument can be marked for optimization and packed with the
      // other privates.
      const bool AllocImmediately =
          (I < ArgNum - 1 && (ArgTypes[I + 1] & OMP_TGT_MAPTYPE_MEMBER_OF));
      Ret = PrivateArgumentManager.addArg(
          HstPtrBegin, ArgSizes[I], TgtBaseOffset, IsFirstPrivate, TgtPtrBegin,
          TgtArgs.size(), HstPtrName, AllocImmediately);
      if (Ret != OFFLOAD_SUCCESS) {
        REPORT("Failed to process %sprivate argument " DPxMOD "\n",
               (IsFirstPrivate ? "first-" : ""), DPxPTR(HstPtrBegin));
        return OFFLOAD_FAIL;
      }
    } else {
      if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)
        HstPtrBase = *reinterpret_cast<void **>(HstPtrBase);
      TPR = Device.getTgtPtrBegin(HstPtrBegin, ArgSizes[I], IsLast,
                                  /*UpdateRefCount=*/false,
                                  /*UseHoldRefCount=*/false, IsHostPtr);
      TgtPtrBegin = TPR.TargetPointer;
      TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
#ifdef OMPTARGET_DEBUG
      void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset);
      DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD "\n",
         DPxPTR(TgtPtrBase), DPxPTR(HstPtrBegin));
#endif
    }
    TgtArgsPositions[I] = TgtArgs.size();
    TgtArgs.push_back(TgtPtrBegin);
    TgtOffsets.push_back(TgtBaseOffset);
  }

  assert(TgtArgs.size() == TgtOffsets.size() &&
         "Size mismatch in arguments and offsets");

  // Pack and transfer first-private arguments
  Ret = PrivateArgumentManager.packAndTransfer(TgtArgs);
  if (Ret != OFFLOAD_SUCCESS) {
    DP("Failed to pack and transfer first private arguments\n");
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

/// Process data after launching the kernel, including transferring data back
/// to the host if needed and deallocating target memory of (first-)private
/// variables.
static int processDataAfter(ident_t *loc, int64_t DeviceId, void *HostPtr,
                            int32_t ArgNum, void **ArgBases, void **Args,
                            int64_t *ArgSizes, int64_t *ArgTypes,
                            map_var_info_t *ArgNames, void **ArgMappers,
                            PrivateArgumentManagerTy &PrivateArgumentManager,
                            AsyncInfoTy &AsyncInfo) {
  TIMESCOPE_WITH_NAME_AND_IDENT("mappingAfterTargetRegion", loc);
  DeviceTy &Device = *PM->Devices[DeviceId];

  // Move data from device.
  int Ret = targetDataEnd(loc, Device, ArgNum, ArgBases, Args, ArgSizes,
                          ArgTypes, ArgNames, ArgMappers, AsyncInfo);
  if (Ret != OFFLOAD_SUCCESS) {
    REPORT("Call to targetDataEnd failed, abort target.\n");
    return OFFLOAD_FAIL;
  }

  // Free target memory for private arguments
  Ret = PrivateArgumentManager.free();
  if (Ret != OFFLOAD_SUCCESS) {
    REPORT("Failed to deallocate target memory for private args\n");
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}
} // namespace

/// Performs the same actions as data_begin if arg_num is non-zero
/// and initiates the run of the offloaded region on the target platform; if
/// arg_num is non-zero, after the region execution is done it also performs
/// the same actions as data_update and data_end above. This function returns
/// 0 if it was able to transfer the execution to a target and an integer
/// different from zero otherwise.
int target(ident_t *loc, DeviceTy &Device, void *HostPtr, int32_t ArgNum,
           void **ArgBases, void **Args, int64_t *ArgSizes, int64_t *ArgTypes,
           map_var_info_t *ArgNames, void **ArgMappers, int32_t TeamNum,
           int32_t ThreadLimit, int IsTeamConstruct, AsyncInfoTy &AsyncInfo) {
  int32_t DeviceId = Device.DeviceID;

  TableMap *TM = getTableMap(HostPtr);
  // No map for this host pointer found!
  if (!TM) {
    REPORT("Host ptr " DPxMOD " does not have a matching target pointer.\n",
           DPxPTR(HostPtr));
    return OFFLOAD_FAIL;
  }

  // get target table.
  __tgt_target_table *TargetTable = nullptr;
  {
    std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
    assert(TM->Table->TargetsTable.size() > (size_t)DeviceId &&
           "Not expecting a device ID outside the table's bounds!");
    TargetTable = TM->Table->TargetsTable[DeviceId];
  }
  assert(TargetTable && "Global data has not been mapped\n");

  // We need to keep bases and offsets separate. Sometimes (e.g. in OpenCL) we
  // need to manifest base pointers prior to launching a kernel. Even if we
  // have mapped an object only partially, e.g. A[N:M], although the kernel is
  // expected to access elements starting at address &A[N] and beyond, we still
  // need to manifest the base of the array &A[0]. In other cases, e.g. the COI
  // API, we need the begin address itself, i.e. &A[N], as the API operates on
  // begin addresses, not bases. That's why we pass args and offsets as two
  // separate entities so that each plugin can do what it needs. This behavior
  // was introduced via https://reviews.llvm.org/D33028 and commit 1546d319244c.
  std::vector<void *> TgtArgs;
  std::vector<ptrdiff_t> TgtOffsets;

  PrivateArgumentManagerTy PrivateArgumentManager(Device, AsyncInfo);

  int Ret;
  if (ArgNum) {
    // Process data, such as data mapping, before launching the kernel
    Ret = processDataBefore(loc, DeviceId, HostPtr, ArgNum, ArgBases, Args,
                            ArgSizes, ArgTypes, ArgNames, ArgMappers, TgtArgs,
                            TgtOffsets, PrivateArgumentManager, AsyncInfo);
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Failed to process data before launching the kernel.\n");
      return OFFLOAD_FAIL;
    }
  }

  // Launch device execution.
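  // The entry pointer below is the device-side address of the kernel,
  // resolved from the offload entry table that load_binary populated for this
  // image; it is handed to the plugin together with the kernel arguments
  // gathered above.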
  void *TgtEntryPtr = TargetTable->EntriesBegin[TM->Index].addr;
  DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n",
     TargetTable->EntriesBegin[TM->Index].name, DPxPTR(TgtEntryPtr), TM->Index);

  {
    TIMESCOPE_WITH_NAME_AND_IDENT(
        IsTeamConstruct ? "runTargetTeamRegion" : "runTargetRegion", loc);
    if (IsTeamConstruct)
      Ret = Device.runTeamRegion(TgtEntryPtr, &TgtArgs[0], &TgtOffsets[0],
                                 TgtArgs.size(), TeamNum, ThreadLimit,
                                 getLoopTripCount(DeviceId), AsyncInfo);
    else
      Ret = Device.runRegion(TgtEntryPtr, &TgtArgs[0], &TgtOffsets[0],
                             TgtArgs.size(), AsyncInfo);
  }

  if (Ret != OFFLOAD_SUCCESS) {
    REPORT("Executing target region failed, abort target.\n");
    return OFFLOAD_FAIL;
  }

  if (ArgNum) {
    // Transfer data back and deallocate target memory for (first-)private
    // variables
    Ret = processDataAfter(loc, DeviceId, HostPtr, ArgNum, ArgBases, Args,
                           ArgSizes, ArgTypes, ArgNames, ArgMappers,
                           PrivateArgumentManager, AsyncInfo);
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Failed to process data after launching the kernel.\n");
      return OFFLOAD_FAIL;
    }
  }

  return OFFLOAD_SUCCESS;
}