1 //===------ omptarget.cpp - Target independent OpenMP target RTL -- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // Implementation of the interface to be used by Clang during the codegen of a 10 // target region. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "omptarget.h" 15 #include "device.h" 16 #include "private.h" 17 #include "rtl.h" 18 19 #include <cassert> 20 #include <vector> 21 22 int AsyncInfoTy::synchronize() { 23 int Result = OFFLOAD_SUCCESS; 24 if (AsyncInfo.Queue) { 25 // If we have a queue we need to synchronize it now. 26 Result = Device.synchronize(*this); 27 assert(AsyncInfo.Queue == nullptr && 28 "The device plugin should have nulled the queue to indicate there " 29 "are no outstanding actions!"); 30 } 31 return Result; 32 } 33 34 void *&AsyncInfoTy::getVoidPtrLocation() { 35 BufferLocations.push_back(nullptr); 36 return BufferLocations.back(); 37 } 38 39 /* All begin addresses for partially mapped structs must be 8-aligned in order 40 * to ensure proper alignment of members. E.g. 41 * 42 * struct S { 43 * int a; // 4-aligned 44 * int b; // 4-aligned 45 * int *p; // 8-aligned 46 * } s1; 47 * ... 48 * #pragma omp target map(tofrom: s1.b, s1.p[0:N]) 49 * { 50 * s1.b = 5; 51 * for (int i...) s1.p[i] = ...; 52 * } 53 * 54 * Here we are mapping s1 starting from member b, so BaseAddress=&s1=&s1.a and 55 * BeginAddress=&s1.b. Let's assume that the struct begins at address 0x100, 56 * then &s1.a=0x100, &s1.b=0x104, &s1.p=0x108. Each member obeys the alignment 57 * requirements for its type. Now, when we allocate memory on the device, in 58 * CUDA's case cuMemAlloc() returns an address which is at least 256-aligned. 59 * This means that the chunk of the struct on the device will start at a 60 * 256-aligned address, let's say 0x200. Then the address of b will be 0x200 and 61 * address of p will be a misaligned 0x204 (on the host there was no need to add 62 * padding between b and p, so p comes exactly 4 bytes after b). If the device 63 * kernel tries to access s1.p, a misaligned address error occurs (as reported 64 * by the CUDA plugin). By padding the begin address down to a multiple of 8 and 65 * extending the size of the allocated chuck accordingly, the chuck on the 66 * device will start at 0x200 with the padding (4 bytes), then &s1.b=0x204 and 67 * &s1.p=0x208, as they should be to satisfy the alignment requirements. 68 */ 69 static const int64_t Alignment = 8; 70 71 /// Map global data and execute pending ctors 72 static int InitLibrary(DeviceTy &Device) { 73 /* 74 * Map global data 75 */ 76 int32_t device_id = Device.DeviceID; 77 int rc = OFFLOAD_SUCCESS; 78 bool supportsEmptyImages = Device.RTL->supports_empty_images && 79 Device.RTL->supports_empty_images() > 0; 80 81 Device.PendingGlobalsMtx.lock(); 82 PM->TrlTblMtx.lock(); 83 for (auto *HostEntriesBegin : PM->HostEntriesBeginRegistrationOrder) { 84 TranslationTable *TransTable = 85 &PM->HostEntriesBeginToTransTable[HostEntriesBegin]; 86 if (TransTable->HostTable.EntriesBegin == 87 TransTable->HostTable.EntriesEnd && 88 !supportsEmptyImages) { 89 // No host entry so no need to proceed 90 continue; 91 } 92 93 if (TransTable->TargetsTable[device_id] != 0) { 94 // Library entries have already been processed 95 continue; 96 } 97 98 // 1) get image. 99 assert(TransTable->TargetsImages.size() > (size_t)device_id && 100 "Not expecting a device ID outside the table's bounds!"); 101 __tgt_device_image *img = TransTable->TargetsImages[device_id]; 102 if (!img) { 103 REPORT("No image loaded for device id %d.\n", device_id); 104 rc = OFFLOAD_FAIL; 105 break; 106 } 107 // 2) load image into the target table. 108 __tgt_target_table *TargetTable = TransTable->TargetsTable[device_id] = 109 Device.load_binary(img); 110 // Unable to get table for this image: invalidate image and fail. 111 if (!TargetTable) { 112 REPORT("Unable to generate entries table for device id %d.\n", device_id); 113 TransTable->TargetsImages[device_id] = 0; 114 rc = OFFLOAD_FAIL; 115 break; 116 } 117 118 // Verify whether the two table sizes match. 119 size_t hsize = 120 TransTable->HostTable.EntriesEnd - TransTable->HostTable.EntriesBegin; 121 size_t tsize = TargetTable->EntriesEnd - TargetTable->EntriesBegin; 122 123 // Invalid image for these host entries! 124 if (hsize != tsize) { 125 REPORT("Host and Target tables mismatch for device id %d [%zx != %zx].\n", 126 device_id, hsize, tsize); 127 TransTable->TargetsImages[device_id] = 0; 128 TransTable->TargetsTable[device_id] = 0; 129 rc = OFFLOAD_FAIL; 130 break; 131 } 132 133 // process global data that needs to be mapped. 134 Device.DataMapMtx.lock(); 135 __tgt_target_table *HostTable = &TransTable->HostTable; 136 for (__tgt_offload_entry *CurrDeviceEntry = TargetTable->EntriesBegin, 137 *CurrHostEntry = HostTable->EntriesBegin, 138 *EntryDeviceEnd = TargetTable->EntriesEnd; 139 CurrDeviceEntry != EntryDeviceEnd; 140 CurrDeviceEntry++, CurrHostEntry++) { 141 if (CurrDeviceEntry->size != 0) { 142 // has data. 143 assert(CurrDeviceEntry->size == CurrHostEntry->size && 144 "data size mismatch"); 145 146 // Fortran may use multiple weak declarations for the same symbol, 147 // therefore we must allow for multiple weak symbols to be loaded from 148 // the fat binary. Treat these mappings as any other "regular" mapping. 149 // Add entry to map. 150 if (Device.getTgtPtrBegin(CurrHostEntry->addr, CurrHostEntry->size)) 151 continue; 152 DP("Add mapping from host " DPxMOD " to device " DPxMOD " with size %zu" 153 "\n", 154 DPxPTR(CurrHostEntry->addr), DPxPTR(CurrDeviceEntry->addr), 155 CurrDeviceEntry->size); 156 Device.HostDataToTargetMap.emplace( 157 (uintptr_t)CurrHostEntry->addr /*HstPtrBase*/, 158 (uintptr_t)CurrHostEntry->addr /*HstPtrBegin*/, 159 (uintptr_t)CurrHostEntry->addr + CurrHostEntry->size /*HstPtrEnd*/, 160 (uintptr_t)CurrDeviceEntry->addr /*TgtPtrBegin*/, nullptr, 161 true /*IsRefCountINF*/); 162 } 163 } 164 Device.DataMapMtx.unlock(); 165 } 166 PM->TrlTblMtx.unlock(); 167 168 if (rc != OFFLOAD_SUCCESS) { 169 Device.PendingGlobalsMtx.unlock(); 170 return rc; 171 } 172 173 /* 174 * Run ctors for static objects 175 */ 176 if (!Device.PendingCtorsDtors.empty()) { 177 AsyncInfoTy AsyncInfo(Device); 178 // Call all ctors for all libraries registered so far 179 for (auto &lib : Device.PendingCtorsDtors) { 180 if (!lib.second.PendingCtors.empty()) { 181 DP("Has pending ctors... call now\n"); 182 for (auto &entry : lib.second.PendingCtors) { 183 void *ctor = entry; 184 int rc = 185 target(nullptr, Device, ctor, 0, nullptr, nullptr, nullptr, 186 nullptr, nullptr, nullptr, 1, 1, true /*team*/, AsyncInfo); 187 if (rc != OFFLOAD_SUCCESS) { 188 REPORT("Running ctor " DPxMOD " failed.\n", DPxPTR(ctor)); 189 Device.PendingGlobalsMtx.unlock(); 190 return OFFLOAD_FAIL; 191 } 192 } 193 // Clear the list to indicate that this device has been used 194 lib.second.PendingCtors.clear(); 195 DP("Done with pending ctors for lib " DPxMOD "\n", DPxPTR(lib.first)); 196 } 197 } 198 // All constructors have been issued, wait for them now. 199 if (AsyncInfo.synchronize() != OFFLOAD_SUCCESS) 200 return OFFLOAD_FAIL; 201 } 202 Device.HasPendingGlobals = false; 203 Device.PendingGlobalsMtx.unlock(); 204 205 return OFFLOAD_SUCCESS; 206 } 207 208 void handleTargetOutcome(bool Success, ident_t *Loc) { 209 switch (PM->TargetOffloadPolicy) { 210 case tgt_disabled: 211 if (Success) { 212 FATAL_MESSAGE0(1, "expected no offloading while offloading is disabled"); 213 } 214 break; 215 case tgt_default: 216 FATAL_MESSAGE0(1, "default offloading policy must be switched to " 217 "mandatory or disabled"); 218 break; 219 case tgt_mandatory: 220 if (!Success) { 221 if (getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE) 222 for (auto &Device : PM->Devices) 223 dumpTargetPointerMappings(Loc, Device); 224 else 225 FAILURE_MESSAGE("Run with LIBOMPTARGET_INFO=%d to dump host-target " 226 "pointer mappings.\n", 227 OMP_INFOTYPE_DUMP_TABLE); 228 229 SourceInfo info(Loc); 230 if (info.isAvailible()) 231 fprintf(stderr, "%s:%d:%d: ", info.getFilename(), info.getLine(), 232 info.getColumn()); 233 else 234 FAILURE_MESSAGE("Source location information not present. Compile with " 235 "-g or -gline-tables-only.\n"); 236 FATAL_MESSAGE0( 237 1, "failure of target construct while offloading is mandatory"); 238 } else { 239 if (getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE) 240 for (auto &Device : PM->Devices) 241 dumpTargetPointerMappings(Loc, Device); 242 } 243 break; 244 } 245 } 246 247 static void handleDefaultTargetOffload() { 248 PM->TargetOffloadMtx.lock(); 249 if (PM->TargetOffloadPolicy == tgt_default) { 250 if (omp_get_num_devices() > 0) { 251 DP("Default TARGET OFFLOAD policy is now mandatory " 252 "(devices were found)\n"); 253 PM->TargetOffloadPolicy = tgt_mandatory; 254 } else { 255 DP("Default TARGET OFFLOAD policy is now disabled " 256 "(no devices were found)\n"); 257 PM->TargetOffloadPolicy = tgt_disabled; 258 } 259 } 260 PM->TargetOffloadMtx.unlock(); 261 } 262 263 static bool isOffloadDisabled() { 264 if (PM->TargetOffloadPolicy == tgt_default) 265 handleDefaultTargetOffload(); 266 return PM->TargetOffloadPolicy == tgt_disabled; 267 } 268 269 // If offload is enabled, ensure that device DeviceID has been initialized, 270 // global ctors have been executed, and global data has been mapped. 271 // 272 // There are three possible results: 273 // - Return OFFLOAD_SUCCESS if the device is ready for offload. 274 // - Return OFFLOAD_FAIL without reporting a runtime error if offload is 275 // disabled, perhaps because the initial device was specified. 276 // - Report a runtime error and return OFFLOAD_FAIL. 277 // 278 // If DeviceID == OFFLOAD_DEVICE_DEFAULT, set DeviceID to the default device. 279 // This step might be skipped if offload is disabled. 280 int checkDeviceAndCtors(int64_t &DeviceID, ident_t *Loc) { 281 if (isOffloadDisabled()) { 282 DP("Offload is disabled\n"); 283 return OFFLOAD_FAIL; 284 } 285 286 if (DeviceID == OFFLOAD_DEVICE_DEFAULT) { 287 DeviceID = omp_get_default_device(); 288 DP("Use default device id %" PRId64 "\n", DeviceID); 289 } 290 291 // Proposed behavior for OpenMP 5.2 in OpenMP spec github issue 2669. 292 if (omp_get_num_devices() == 0) { 293 DP("omp_get_num_devices() == 0 but offload is manadatory\n"); 294 handleTargetOutcome(false, Loc); 295 return OFFLOAD_FAIL; 296 } 297 298 if (DeviceID == omp_get_initial_device()) { 299 DP("Device is host (%" PRId64 "), returning as if offload is disabled\n", 300 DeviceID); 301 return OFFLOAD_FAIL; 302 } 303 304 // Is device ready? 305 if (!device_is_ready(DeviceID)) { 306 REPORT("Device %" PRId64 " is not ready.\n", DeviceID); 307 handleTargetOutcome(false, Loc); 308 return OFFLOAD_FAIL; 309 } 310 311 // Get device info. 312 DeviceTy &Device = PM->Devices[DeviceID]; 313 314 // Check whether global data has been mapped for this device 315 Device.PendingGlobalsMtx.lock(); 316 bool hasPendingGlobals = Device.HasPendingGlobals; 317 Device.PendingGlobalsMtx.unlock(); 318 if (hasPendingGlobals && InitLibrary(Device) != OFFLOAD_SUCCESS) { 319 REPORT("Failed to init globals on device %" PRId64 "\n", DeviceID); 320 handleTargetOutcome(false, Loc); 321 return OFFLOAD_FAIL; 322 } 323 324 return OFFLOAD_SUCCESS; 325 } 326 327 static int32_t getParentIndex(int64_t type) { 328 return ((type & OMP_TGT_MAPTYPE_MEMBER_OF) >> 48) - 1; 329 } 330 331 void *targetAllocExplicit(size_t size, int device_num, int kind, 332 const char *name) { 333 TIMESCOPE(); 334 DP("Call to %s for device %d requesting %zu bytes\n", name, device_num, size); 335 336 if (size <= 0) { 337 DP("Call to %s with non-positive length\n", name); 338 return NULL; 339 } 340 341 void *rc = NULL; 342 343 if (device_num == omp_get_initial_device()) { 344 rc = malloc(size); 345 DP("%s returns host ptr " DPxMOD "\n", name, DPxPTR(rc)); 346 return rc; 347 } 348 349 if (!device_is_ready(device_num)) { 350 DP("%s returns NULL ptr\n", name); 351 return NULL; 352 } 353 354 DeviceTy &Device = PM->Devices[device_num]; 355 rc = Device.allocData(size, nullptr, kind); 356 DP("%s returns device ptr " DPxMOD "\n", name, DPxPTR(rc)); 357 return rc; 358 } 359 360 /// Call the user-defined mapper function followed by the appropriate 361 // targetData* function (targetData{Begin,End,Update}). 362 int targetDataMapper(ident_t *loc, DeviceTy &Device, void *arg_base, void *arg, 363 int64_t arg_size, int64_t arg_type, 364 map_var_info_t arg_names, void *arg_mapper, 365 AsyncInfoTy &AsyncInfo, 366 TargetDataFuncPtrTy target_data_function) { 367 TIMESCOPE_WITH_IDENT(loc); 368 DP("Calling the mapper function " DPxMOD "\n", DPxPTR(arg_mapper)); 369 370 // The mapper function fills up Components. 371 MapperComponentsTy MapperComponents; 372 MapperFuncPtrTy MapperFuncPtr = (MapperFuncPtrTy)(arg_mapper); 373 (*MapperFuncPtr)((void *)&MapperComponents, arg_base, arg, arg_size, arg_type, 374 arg_names); 375 376 // Construct new arrays for args_base, args, arg_sizes and arg_types 377 // using the information in MapperComponents and call the corresponding 378 // targetData* function using these new arrays. 379 std::vector<void *> MapperArgsBase(MapperComponents.Components.size()); 380 std::vector<void *> MapperArgs(MapperComponents.Components.size()); 381 std::vector<int64_t> MapperArgSizes(MapperComponents.Components.size()); 382 std::vector<int64_t> MapperArgTypes(MapperComponents.Components.size()); 383 std::vector<void *> MapperArgNames(MapperComponents.Components.size()); 384 385 for (unsigned I = 0, E = MapperComponents.Components.size(); I < E; ++I) { 386 auto &C = MapperComponents.Components[I]; 387 MapperArgsBase[I] = C.Base; 388 MapperArgs[I] = C.Begin; 389 MapperArgSizes[I] = C.Size; 390 MapperArgTypes[I] = C.Type; 391 MapperArgNames[I] = C.Name; 392 } 393 394 int rc = target_data_function(loc, Device, MapperComponents.Components.size(), 395 MapperArgsBase.data(), MapperArgs.data(), 396 MapperArgSizes.data(), MapperArgTypes.data(), 397 MapperArgNames.data(), /*arg_mappers*/ nullptr, 398 AsyncInfo, /*FromMapper=*/true); 399 400 return rc; 401 } 402 403 /// Internal function to do the mapping and transfer the data to the device 404 int targetDataBegin(ident_t *loc, DeviceTy &Device, int32_t arg_num, 405 void **args_base, void **args, int64_t *arg_sizes, 406 int64_t *arg_types, map_var_info_t *arg_names, 407 void **arg_mappers, AsyncInfoTy &AsyncInfo, 408 bool FromMapper) { 409 // process each input. 410 for (int32_t i = 0; i < arg_num; ++i) { 411 // Ignore private variables and arrays - there is no mapping for them. 412 if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) || 413 (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE)) 414 continue; 415 416 if (arg_mappers && arg_mappers[i]) { 417 // Instead of executing the regular path of targetDataBegin, call the 418 // targetDataMapper variant which will call targetDataBegin again 419 // with new arguments. 420 DP("Calling targetDataMapper for the %dth argument\n", i); 421 422 map_var_info_t arg_name = (!arg_names) ? nullptr : arg_names[i]; 423 int rc = targetDataMapper(loc, Device, args_base[i], args[i], 424 arg_sizes[i], arg_types[i], arg_name, 425 arg_mappers[i], AsyncInfo, targetDataBegin); 426 427 if (rc != OFFLOAD_SUCCESS) { 428 REPORT("Call to targetDataBegin via targetDataMapper for custom mapper" 429 " failed.\n"); 430 return OFFLOAD_FAIL; 431 } 432 433 // Skip the rest of this function, continue to the next argument. 434 continue; 435 } 436 437 void *HstPtrBegin = args[i]; 438 void *HstPtrBase = args_base[i]; 439 int64_t data_size = arg_sizes[i]; 440 map_var_info_t HstPtrName = (!arg_names) ? nullptr : arg_names[i]; 441 442 // Adjust for proper alignment if this is a combined entry (for structs). 443 // Look at the next argument - if that is MEMBER_OF this one, then this one 444 // is a combined entry. 445 int64_t padding = 0; 446 const int next_i = i + 1; 447 if (getParentIndex(arg_types[i]) < 0 && next_i < arg_num && 448 getParentIndex(arg_types[next_i]) == i) { 449 padding = (int64_t)HstPtrBegin % Alignment; 450 if (padding) { 451 DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD 452 "\n", 453 padding, DPxPTR(HstPtrBegin)); 454 HstPtrBegin = (char *)HstPtrBegin - padding; 455 data_size += padding; 456 } 457 } 458 459 // Address of pointer on the host and device, respectively. 460 void *Pointer_HstPtrBegin, *PointerTgtPtrBegin; 461 TargetPointerResultTy Pointer_TPR; 462 bool IsHostPtr = false; 463 bool IsImplicit = arg_types[i] & OMP_TGT_MAPTYPE_IMPLICIT; 464 // Force the creation of a device side copy of the data when: 465 // a close map modifier was associated with a map that contained a to. 466 bool HasCloseModifier = arg_types[i] & OMP_TGT_MAPTYPE_CLOSE; 467 bool HasPresentModifier = arg_types[i] & OMP_TGT_MAPTYPE_PRESENT; 468 // UpdateRef is based on MEMBER_OF instead of TARGET_PARAM because if we 469 // have reached this point via __tgt_target_data_begin and not __tgt_target 470 // then no argument is marked as TARGET_PARAM ("omp target data map" is not 471 // associated with a target region, so there are no target parameters). This 472 // may be considered a hack, we could revise the scheme in the future. 473 bool UpdateRef = 474 !(arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) && !(FromMapper && i == 0); 475 if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) { 476 DP("Has a pointer entry: \n"); 477 // Base is address of pointer. 478 // 479 // Usually, the pointer is already allocated by this time. For example: 480 // 481 // #pragma omp target map(s.p[0:N]) 482 // 483 // The map entry for s comes first, and the PTR_AND_OBJ entry comes 484 // afterward, so the pointer is already allocated by the time the 485 // PTR_AND_OBJ entry is handled below, and PointerTgtPtrBegin is thus 486 // non-null. However, "declare target link" can produce a PTR_AND_OBJ 487 // entry for a global that might not already be allocated by the time the 488 // PTR_AND_OBJ entry is handled below, and so the allocation might fail 489 // when HasPresentModifier. 490 Pointer_TPR = Device.getTargetPointer( 491 HstPtrBase, HstPtrBase, sizeof(void *), nullptr, 492 MoveDataStateTy::NONE, IsImplicit, UpdateRef, HasCloseModifier, 493 HasPresentModifier, AsyncInfo); 494 PointerTgtPtrBegin = Pointer_TPR.TargetPointer; 495 IsHostPtr = Pointer_TPR.Flags.IsHostPointer; 496 if (!PointerTgtPtrBegin) { 497 REPORT("Call to getOrAllocTgtPtr returned null pointer (%s).\n", 498 HasPresentModifier ? "'present' map type modifier" 499 : "device failure or illegal mapping"); 500 return OFFLOAD_FAIL; 501 } 502 DP("There are %zu bytes allocated at target address " DPxMOD " - is%s new" 503 "\n", 504 sizeof(void *), DPxPTR(PointerTgtPtrBegin), 505 (Pointer_TPR.Flags.IsNewEntry ? "" : " not")); 506 Pointer_HstPtrBegin = HstPtrBase; 507 // modify current entry. 508 HstPtrBase = *(void **)HstPtrBase; 509 // No need to update pointee ref count for the first element of the 510 // subelement that comes from mapper. 511 UpdateRef = 512 (!FromMapper || i != 0); // subsequently update ref count of pointee 513 } 514 515 MoveDataStateTy MoveData = MoveDataStateTy::NONE; 516 const bool UseUSM = PM->RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY; 517 const bool HasFlagTo = arg_types[i] & OMP_TGT_MAPTYPE_TO; 518 const bool HasFlagAlways = arg_types[i] & OMP_TGT_MAPTYPE_ALWAYS; 519 if (HasFlagTo && (!UseUSM || HasCloseModifier)) 520 MoveData = HasFlagAlways ? MoveDataStateTy::REQUIRED 521 : MoveDataStateTy::UNKNOWN; 522 523 auto TPR = Device.getTargetPointer( 524 HstPtrBegin, HstPtrBase, data_size, HstPtrName, MoveData, IsImplicit, 525 UpdateRef, HasCloseModifier, HasPresentModifier, AsyncInfo); 526 void *TgtPtrBegin = TPR.TargetPointer; 527 IsHostPtr = TPR.Flags.IsHostPointer; 528 // If data_size==0, then the argument could be a zero-length pointer to 529 // NULL, so getOrAlloc() returning NULL is not an error. 530 if (!TgtPtrBegin && (data_size || HasPresentModifier)) { 531 REPORT("Call to getOrAllocTgtPtr returned null pointer (%s).\n", 532 HasPresentModifier ? "'present' map type modifier" 533 : "device failure or illegal mapping"); 534 return OFFLOAD_FAIL; 535 } 536 DP("There are %" PRId64 " bytes allocated at target address " DPxMOD 537 " - is%s new\n", 538 data_size, DPxPTR(TgtPtrBegin), (TPR.Flags.IsNewEntry ? "" : " not")); 539 540 if (arg_types[i] & OMP_TGT_MAPTYPE_RETURN_PARAM) { 541 uintptr_t Delta = (uintptr_t)HstPtrBegin - (uintptr_t)HstPtrBase; 542 void *TgtPtrBase = (void *)((uintptr_t)TgtPtrBegin - Delta); 543 DP("Returning device pointer " DPxMOD "\n", DPxPTR(TgtPtrBase)); 544 args_base[i] = TgtPtrBase; 545 } 546 547 if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ && !IsHostPtr) { 548 // Check whether we need to update the pointer on the device 549 bool UpdateDevPtr = false; 550 551 uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase; 552 void *ExpectedTgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta); 553 554 Device.ShadowMtx.lock(); 555 auto Entry = Device.ShadowPtrMap.find(Pointer_HstPtrBegin); 556 // If this pointer is not in the map we need to insert it. If the map 557 // contains a stale entry, we need to update it (e.g. if the pointee was 558 // deallocated and later on is reallocated at another device address). The 559 // latter scenario is the subject of LIT test env/base_ptr_ref_count.c. An 560 // entry is removed from ShadowPtrMap only when the PTR of a PTR_AND_OBJ 561 // pair is deallocated, not when the OBJ is deallocated. In 562 // env/base_ptr_ref_count.c the PTR is a global "declare target" pointer, 563 // so it stays in the map for the lifetime of the application. When the 564 // OBJ is deallocated and later on allocated again (at a different device 565 // address), ShadowPtrMap still contains an entry for Pointer_HstPtrBegin 566 // which is stale, pointing to the old ExpectedTgtPtrBase of the OBJ. 567 if (Entry == Device.ShadowPtrMap.end() || 568 Entry->second.TgtPtrVal != ExpectedTgtPtrBase) { 569 // create or update shadow pointers for this entry 570 Device.ShadowPtrMap[Pointer_HstPtrBegin] = { 571 HstPtrBase, PointerTgtPtrBegin, ExpectedTgtPtrBase}; 572 UpdateDevPtr = true; 573 } 574 575 if (UpdateDevPtr) { 576 Pointer_TPR.MapTableEntry->lock(); 577 Device.ShadowMtx.unlock(); 578 579 DP("Update pointer (" DPxMOD ") -> [" DPxMOD "]\n", 580 DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin)); 581 582 void *&TgtPtrBase = AsyncInfo.getVoidPtrLocation(); 583 TgtPtrBase = ExpectedTgtPtrBase; 584 585 int rt = Device.submitData(PointerTgtPtrBegin, &TgtPtrBase, 586 sizeof(void *), AsyncInfo); 587 Pointer_TPR.MapTableEntry->unlock(); 588 589 if (rt != OFFLOAD_SUCCESS) { 590 REPORT("Copying data to device failed.\n"); 591 return OFFLOAD_FAIL; 592 } 593 } else 594 Device.ShadowMtx.unlock(); 595 } 596 } 597 598 return OFFLOAD_SUCCESS; 599 } 600 601 namespace { 602 /// This structure contains information to deallocate a target pointer, aka. 603 /// used to call the function \p DeviceTy::deallocTgtPtr. 604 struct DeallocTgtPtrInfo { 605 /// Host pointer used to look up into the map table 606 void *HstPtrBegin; 607 /// Size of the data 608 int64_t DataSize; 609 /// Whether it has \p close modifier 610 bool HasCloseModifier; 611 612 DeallocTgtPtrInfo(void *HstPtr, int64_t Size, bool HasCloseModifier) 613 : HstPtrBegin(HstPtr), DataSize(Size), 614 HasCloseModifier(HasCloseModifier) {} 615 }; 616 } // namespace 617 618 /// Internal function to undo the mapping and retrieve the data from the device. 619 int targetDataEnd(ident_t *loc, DeviceTy &Device, int32_t ArgNum, 620 void **ArgBases, void **Args, int64_t *ArgSizes, 621 int64_t *ArgTypes, map_var_info_t *ArgNames, 622 void **ArgMappers, AsyncInfoTy &AsyncInfo, bool FromMapper) { 623 int Ret; 624 std::vector<DeallocTgtPtrInfo> DeallocTgtPtrs; 625 void *FromMapperBase = nullptr; 626 // process each input. 627 for (int32_t I = ArgNum - 1; I >= 0; --I) { 628 // Ignore private variables and arrays - there is no mapping for them. 629 // Also, ignore the use_device_ptr directive, it has no effect here. 630 if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) || 631 (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE)) 632 continue; 633 634 if (ArgMappers && ArgMappers[I]) { 635 // Instead of executing the regular path of targetDataEnd, call the 636 // targetDataMapper variant which will call targetDataEnd again 637 // with new arguments. 638 DP("Calling targetDataMapper for the %dth argument\n", I); 639 640 map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I]; 641 Ret = targetDataMapper(loc, Device, ArgBases[I], Args[I], ArgSizes[I], 642 ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo, 643 targetDataEnd); 644 645 if (Ret != OFFLOAD_SUCCESS) { 646 REPORT("Call to targetDataEnd via targetDataMapper for custom mapper" 647 " failed.\n"); 648 return OFFLOAD_FAIL; 649 } 650 651 // Skip the rest of this function, continue to the next argument. 652 continue; 653 } 654 655 void *HstPtrBegin = Args[I]; 656 int64_t DataSize = ArgSizes[I]; 657 // Adjust for proper alignment if this is a combined entry (for structs). 658 // Look at the next argument - if that is MEMBER_OF this one, then this one 659 // is a combined entry. 660 const int NextI = I + 1; 661 if (getParentIndex(ArgTypes[I]) < 0 && NextI < ArgNum && 662 getParentIndex(ArgTypes[NextI]) == I) { 663 int64_t Padding = (int64_t)HstPtrBegin % Alignment; 664 if (Padding) { 665 DP("Using a Padding of %" PRId64 " bytes for begin address " DPxMOD 666 "\n", 667 Padding, DPxPTR(HstPtrBegin)); 668 HstPtrBegin = (char *)HstPtrBegin - Padding; 669 DataSize += Padding; 670 } 671 } 672 673 bool IsLast, IsHostPtr; 674 bool IsImplicit = ArgTypes[I] & OMP_TGT_MAPTYPE_IMPLICIT; 675 bool UpdateRef = (!(ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) || 676 (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) && 677 !(FromMapper && I == 0); 678 bool ForceDelete = ArgTypes[I] & OMP_TGT_MAPTYPE_DELETE; 679 bool HasCloseModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_CLOSE; 680 bool HasPresentModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_PRESENT; 681 682 // If PTR_AND_OBJ, HstPtrBegin is address of pointee 683 void *TgtPtrBegin = 684 Device.getTgtPtrBegin(HstPtrBegin, DataSize, IsLast, UpdateRef, 685 IsHostPtr, !IsImplicit, ForceDelete); 686 if (!TgtPtrBegin && (DataSize || HasPresentModifier)) { 687 DP("Mapping does not exist (%s)\n", 688 (HasPresentModifier ? "'present' map type modifier" : "ignored")); 689 if (HasPresentModifier) { 690 // OpenMP 5.1, sec. 2.21.7.1 "map Clause", p. 350 L10-13: 691 // "If a map clause appears on a target, target data, target enter data 692 // or target exit data construct with a present map-type-modifier then 693 // on entry to the region if the corresponding list item does not appear 694 // in the device data environment then an error occurs and the program 695 // terminates." 696 // 697 // This should be an error upon entering an "omp target exit data". It 698 // should not be an error upon exiting an "omp target data" or "omp 699 // target". For "omp target data", Clang thus doesn't include present 700 // modifiers for end calls. For "omp target", we have not found a valid 701 // OpenMP program for which the error matters: it appears that, if a 702 // program can guarantee that data is present at the beginning of an 703 // "omp target" region so that there's no error there, that data is also 704 // guaranteed to be present at the end. 705 MESSAGE("device mapping required by 'present' map type modifier does " 706 "not exist for host address " DPxMOD " (%" PRId64 " bytes)", 707 DPxPTR(HstPtrBegin), DataSize); 708 return OFFLOAD_FAIL; 709 } 710 } else { 711 DP("There are %" PRId64 " bytes allocated at target address " DPxMOD 712 " - is%s last\n", 713 DataSize, DPxPTR(TgtPtrBegin), (IsLast ? "" : " not")); 714 } 715 716 // OpenMP 5.1, sec. 2.21.7.1 "map Clause", p. 351 L14-16: 717 // "If the map clause appears on a target, target data, or target exit data 718 // construct and a corresponding list item of the original list item is not 719 // present in the device data environment on exit from the region then the 720 // list item is ignored." 721 if (!TgtPtrBegin) 722 continue; 723 724 bool DelEntry = IsLast; 725 726 // If the last element from the mapper (for end transfer args comes in 727 // reverse order), do not remove the partial entry, the parent struct still 728 // exists. 729 if ((ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) && 730 !(ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) { 731 DelEntry = false; // protect parent struct from being deallocated 732 } 733 734 if ((ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) || DelEntry) { 735 // Move data back to the host 736 if (ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) { 737 bool Always = ArgTypes[I] & OMP_TGT_MAPTYPE_ALWAYS; 738 bool CopyMember = false; 739 if (!(PM->RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) || 740 HasCloseModifier) { 741 if (IsLast) 742 CopyMember = true; 743 } 744 745 if ((DelEntry || Always || CopyMember) && 746 !(PM->RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && 747 TgtPtrBegin == HstPtrBegin)) { 748 DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n", 749 DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin)); 750 Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, DataSize, 751 AsyncInfo); 752 if (Ret != OFFLOAD_SUCCESS) { 753 REPORT("Copying data from device failed.\n"); 754 return OFFLOAD_FAIL; 755 } 756 } 757 } 758 if (DelEntry && FromMapper && I == 0) { 759 DelEntry = false; 760 FromMapperBase = HstPtrBegin; 761 } 762 763 // If we copied back to the host a struct/array containing pointers, we 764 // need to restore the original host pointer values from their shadow 765 // copies. If the struct is going to be deallocated, remove any remaining 766 // shadow pointer entries for this struct. 767 uintptr_t LB = (uintptr_t)HstPtrBegin; 768 uintptr_t UB = (uintptr_t)HstPtrBegin + DataSize; 769 Device.ShadowMtx.lock(); 770 for (ShadowPtrListTy::iterator Itr = Device.ShadowPtrMap.begin(); 771 Itr != Device.ShadowPtrMap.end();) { 772 void **ShadowHstPtrAddr = (void **)Itr->first; 773 774 // An STL map is sorted on its keys; use this property 775 // to quickly determine when to break out of the loop. 776 if ((uintptr_t)ShadowHstPtrAddr < LB) { 777 ++Itr; 778 continue; 779 } 780 if ((uintptr_t)ShadowHstPtrAddr >= UB) 781 break; 782 783 // If we copied the struct to the host, we need to restore the pointer. 784 if (ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) { 785 DP("Restoring original host pointer value " DPxMOD " for host " 786 "pointer " DPxMOD "\n", 787 DPxPTR(Itr->second.HstPtrVal), DPxPTR(ShadowHstPtrAddr)); 788 *ShadowHstPtrAddr = Itr->second.HstPtrVal; 789 } 790 // If the struct is to be deallocated, remove the shadow entry. 791 if (DelEntry) { 792 DP("Removing shadow pointer " DPxMOD "\n", DPxPTR(ShadowHstPtrAddr)); 793 Itr = Device.ShadowPtrMap.erase(Itr); 794 } else { 795 ++Itr; 796 } 797 } 798 Device.ShadowMtx.unlock(); 799 800 // Add pointer to the buffer for later deallocation 801 if (DelEntry) 802 DeallocTgtPtrs.emplace_back(HstPtrBegin, DataSize, HasCloseModifier); 803 } 804 } 805 806 // TODO: We should not synchronize here but pass the AsyncInfo object to the 807 // allocate/deallocate device APIs. 808 // 809 // We need to synchronize before deallocating data. 810 Ret = AsyncInfo.synchronize(); 811 if (Ret != OFFLOAD_SUCCESS) 812 return OFFLOAD_FAIL; 813 814 // Deallocate target pointer 815 for (DeallocTgtPtrInfo &Info : DeallocTgtPtrs) { 816 if (FromMapperBase && FromMapperBase == Info.HstPtrBegin) 817 continue; 818 Ret = Device.deallocTgtPtr(Info.HstPtrBegin, Info.DataSize, 819 Info.HasCloseModifier); 820 if (Ret != OFFLOAD_SUCCESS) { 821 REPORT("Deallocating data from device failed.\n"); 822 return OFFLOAD_FAIL; 823 } 824 } 825 826 return OFFLOAD_SUCCESS; 827 } 828 829 static int targetDataContiguous(ident_t *loc, DeviceTy &Device, void *ArgsBase, 830 void *HstPtrBegin, int64_t ArgSize, 831 int64_t ArgType, AsyncInfoTy &AsyncInfo) { 832 TIMESCOPE_WITH_IDENT(loc); 833 bool IsLast, IsHostPtr; 834 void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, ArgSize, IsLast, false, 835 IsHostPtr, /*MustContain=*/true); 836 if (!TgtPtrBegin) { 837 DP("hst data:" DPxMOD " not found, becomes a noop\n", DPxPTR(HstPtrBegin)); 838 if (ArgType & OMP_TGT_MAPTYPE_PRESENT) { 839 MESSAGE("device mapping required by 'present' motion modifier does not " 840 "exist for host address " DPxMOD " (%" PRId64 " bytes)", 841 DPxPTR(HstPtrBegin), ArgSize); 842 return OFFLOAD_FAIL; 843 } 844 return OFFLOAD_SUCCESS; 845 } 846 847 if (PM->RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && 848 TgtPtrBegin == HstPtrBegin) { 849 DP("hst data:" DPxMOD " unified and shared, becomes a noop\n", 850 DPxPTR(HstPtrBegin)); 851 return OFFLOAD_SUCCESS; 852 } 853 854 if (ArgType & OMP_TGT_MAPTYPE_FROM) { 855 DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n", 856 ArgSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin)); 857 int Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, ArgSize, AsyncInfo); 858 if (Ret != OFFLOAD_SUCCESS) { 859 REPORT("Copying data from device failed.\n"); 860 return OFFLOAD_FAIL; 861 } 862 863 uintptr_t LB = (uintptr_t)HstPtrBegin; 864 uintptr_t UB = (uintptr_t)HstPtrBegin + ArgSize; 865 Device.ShadowMtx.lock(); 866 for (ShadowPtrListTy::iterator IT = Device.ShadowPtrMap.begin(); 867 IT != Device.ShadowPtrMap.end(); ++IT) { 868 void **ShadowHstPtrAddr = (void **)IT->first; 869 if ((uintptr_t)ShadowHstPtrAddr < LB) 870 continue; 871 if ((uintptr_t)ShadowHstPtrAddr >= UB) 872 break; 873 DP("Restoring original host pointer value " DPxMOD 874 " for host pointer " DPxMOD "\n", 875 DPxPTR(IT->second.HstPtrVal), DPxPTR(ShadowHstPtrAddr)); 876 *ShadowHstPtrAddr = IT->second.HstPtrVal; 877 } 878 Device.ShadowMtx.unlock(); 879 } 880 881 if (ArgType & OMP_TGT_MAPTYPE_TO) { 882 DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n", 883 ArgSize, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin)); 884 int Ret = Device.submitData(TgtPtrBegin, HstPtrBegin, ArgSize, AsyncInfo); 885 if (Ret != OFFLOAD_SUCCESS) { 886 REPORT("Copying data to device failed.\n"); 887 return OFFLOAD_FAIL; 888 } 889 890 uintptr_t LB = (uintptr_t)HstPtrBegin; 891 uintptr_t UB = (uintptr_t)HstPtrBegin + ArgSize; 892 Device.ShadowMtx.lock(); 893 for (ShadowPtrListTy::iterator IT = Device.ShadowPtrMap.begin(); 894 IT != Device.ShadowPtrMap.end(); ++IT) { 895 void **ShadowHstPtrAddr = (void **)IT->first; 896 if ((uintptr_t)ShadowHstPtrAddr < LB) 897 continue; 898 if ((uintptr_t)ShadowHstPtrAddr >= UB) 899 break; 900 DP("Restoring original target pointer value " DPxMOD " for target " 901 "pointer " DPxMOD "\n", 902 DPxPTR(IT->second.TgtPtrVal), DPxPTR(IT->second.TgtPtrAddr)); 903 Ret = Device.submitData(IT->second.TgtPtrAddr, &IT->second.TgtPtrVal, 904 sizeof(void *), AsyncInfo); 905 if (Ret != OFFLOAD_SUCCESS) { 906 REPORT("Copying data to device failed.\n"); 907 Device.ShadowMtx.unlock(); 908 return OFFLOAD_FAIL; 909 } 910 } 911 Device.ShadowMtx.unlock(); 912 } 913 return OFFLOAD_SUCCESS; 914 } 915 916 static int targetDataNonContiguous(ident_t *loc, DeviceTy &Device, 917 void *ArgsBase, 918 __tgt_target_non_contig *NonContig, 919 uint64_t Size, int64_t ArgType, 920 int CurrentDim, int DimSize, uint64_t Offset, 921 AsyncInfoTy &AsyncInfo) { 922 TIMESCOPE_WITH_IDENT(loc); 923 int Ret = OFFLOAD_SUCCESS; 924 if (CurrentDim < DimSize) { 925 for (unsigned int I = 0; I < NonContig[CurrentDim].Count; ++I) { 926 uint64_t CurOffset = 927 (NonContig[CurrentDim].Offset + I) * NonContig[CurrentDim].Stride; 928 // we only need to transfer the first element for the last dimension 929 // since we've already got a contiguous piece. 930 if (CurrentDim != DimSize - 1 || I == 0) { 931 Ret = targetDataNonContiguous(loc, Device, ArgsBase, NonContig, Size, 932 ArgType, CurrentDim + 1, DimSize, 933 Offset + CurOffset, AsyncInfo); 934 // Stop the whole process if any contiguous piece returns anything 935 // other than OFFLOAD_SUCCESS. 936 if (Ret != OFFLOAD_SUCCESS) 937 return Ret; 938 } 939 } 940 } else { 941 char *Ptr = (char *)ArgsBase + Offset; 942 DP("Transfer of non-contiguous : host ptr " DPxMOD " offset %" PRIu64 943 " len %" PRIu64 "\n", 944 DPxPTR(Ptr), Offset, Size); 945 Ret = targetDataContiguous(loc, Device, ArgsBase, Ptr, Size, ArgType, 946 AsyncInfo); 947 } 948 return Ret; 949 } 950 951 static int getNonContigMergedDimension(__tgt_target_non_contig *NonContig, 952 int32_t DimSize) { 953 int RemovedDim = 0; 954 for (int I = DimSize - 1; I > 0; --I) { 955 if (NonContig[I].Count * NonContig[I].Stride == NonContig[I - 1].Stride) 956 RemovedDim++; 957 } 958 return RemovedDim; 959 } 960 961 /// Internal function to pass data to/from the target. 962 int targetDataUpdate(ident_t *loc, DeviceTy &Device, int32_t ArgNum, 963 void **ArgsBase, void **Args, int64_t *ArgSizes, 964 int64_t *ArgTypes, map_var_info_t *ArgNames, 965 void **ArgMappers, AsyncInfoTy &AsyncInfo, bool) { 966 // process each input. 967 for (int32_t I = 0; I < ArgNum; ++I) { 968 if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) || 969 (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE)) 970 continue; 971 972 if (ArgMappers && ArgMappers[I]) { 973 // Instead of executing the regular path of targetDataUpdate, call the 974 // targetDataMapper variant which will call targetDataUpdate again 975 // with new arguments. 976 DP("Calling targetDataMapper for the %dth argument\n", I); 977 978 map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I]; 979 int Ret = targetDataMapper(loc, Device, ArgsBase[I], Args[I], ArgSizes[I], 980 ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo, 981 targetDataUpdate); 982 983 if (Ret != OFFLOAD_SUCCESS) { 984 REPORT("Call to targetDataUpdate via targetDataMapper for custom mapper" 985 " failed.\n"); 986 return OFFLOAD_FAIL; 987 } 988 989 // Skip the rest of this function, continue to the next argument. 990 continue; 991 } 992 993 int Ret = OFFLOAD_SUCCESS; 994 995 if (ArgTypes[I] & OMP_TGT_MAPTYPE_NON_CONTIG) { 996 __tgt_target_non_contig *NonContig = (__tgt_target_non_contig *)Args[I]; 997 int32_t DimSize = ArgSizes[I]; 998 uint64_t Size = 999 NonContig[DimSize - 1].Count * NonContig[DimSize - 1].Stride; 1000 int32_t MergedDim = getNonContigMergedDimension(NonContig, DimSize); 1001 Ret = targetDataNonContiguous( 1002 loc, Device, ArgsBase[I], NonContig, Size, ArgTypes[I], 1003 /*current_dim=*/0, DimSize - MergedDim, /*offset=*/0, AsyncInfo); 1004 } else { 1005 Ret = targetDataContiguous(loc, Device, ArgsBase[I], Args[I], ArgSizes[I], 1006 ArgTypes[I], AsyncInfo); 1007 } 1008 if (Ret == OFFLOAD_FAIL) 1009 return OFFLOAD_FAIL; 1010 } 1011 return OFFLOAD_SUCCESS; 1012 } 1013 1014 static const unsigned LambdaMapping = OMP_TGT_MAPTYPE_PTR_AND_OBJ | 1015 OMP_TGT_MAPTYPE_LITERAL | 1016 OMP_TGT_MAPTYPE_IMPLICIT; 1017 static bool isLambdaMapping(int64_t Mapping) { 1018 return (Mapping & LambdaMapping) == LambdaMapping; 1019 } 1020 1021 namespace { 1022 /// Find the table information in the map or look it up in the translation 1023 /// tables. 1024 TableMap *getTableMap(void *HostPtr) { 1025 std::lock_guard<std::mutex> TblMapLock(PM->TblMapMtx); 1026 HostPtrToTableMapTy::iterator TableMapIt = 1027 PM->HostPtrToTableMap.find(HostPtr); 1028 1029 if (TableMapIt != PM->HostPtrToTableMap.end()) 1030 return &TableMapIt->second; 1031 1032 // We don't have a map. So search all the registered libraries. 1033 TableMap *TM = nullptr; 1034 std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx); 1035 for (HostEntriesBeginToTransTableTy::iterator Itr = 1036 PM->HostEntriesBeginToTransTable.begin(); 1037 Itr != PM->HostEntriesBeginToTransTable.end(); ++Itr) { 1038 // get the translation table (which contains all the good info). 1039 TranslationTable *TransTable = &Itr->second; 1040 // iterate over all the host table entries to see if we can locate the 1041 // host_ptr. 1042 __tgt_offload_entry *Cur = TransTable->HostTable.EntriesBegin; 1043 for (uint32_t I = 0; Cur < TransTable->HostTable.EntriesEnd; ++Cur, ++I) { 1044 if (Cur->addr != HostPtr) 1045 continue; 1046 // we got a match, now fill the HostPtrToTableMap so that we 1047 // may avoid this search next time. 1048 TM = &(PM->HostPtrToTableMap)[HostPtr]; 1049 TM->Table = TransTable; 1050 TM->Index = I; 1051 return TM; 1052 } 1053 } 1054 1055 return nullptr; 1056 } 1057 1058 /// Get loop trip count 1059 /// FIXME: This function will not work right if calling 1060 /// __kmpc_push_target_tripcount_mapper in one thread but doing offloading in 1061 /// another thread, which might occur when we call task yield. 1062 uint64_t getLoopTripCount(int64_t DeviceId) { 1063 DeviceTy &Device = PM->Devices[DeviceId]; 1064 uint64_t LoopTripCount = 0; 1065 1066 { 1067 std::lock_guard<std::mutex> TblMapLock(PM->TblMapMtx); 1068 auto I = Device.LoopTripCnt.find(__kmpc_global_thread_num(NULL)); 1069 if (I != Device.LoopTripCnt.end()) { 1070 LoopTripCount = I->second; 1071 Device.LoopTripCnt.erase(I); 1072 DP("loop trip count is %" PRIu64 ".\n", LoopTripCount); 1073 } 1074 } 1075 1076 return LoopTripCount; 1077 } 1078 1079 /// A class manages private arguments in a target region. 1080 class PrivateArgumentManagerTy { 1081 /// A data structure for the information of first-private arguments. We can 1082 /// use this information to optimize data transfer by packing all 1083 /// first-private arguments and transfer them all at once. 1084 struct FirstPrivateArgInfoTy { 1085 /// The index of the element in \p TgtArgs corresponding to the argument 1086 const int Index; 1087 /// Host pointer begin 1088 const char *HstPtrBegin; 1089 /// Host pointer end 1090 const char *HstPtrEnd; 1091 /// Aligned size 1092 const int64_t AlignedSize; 1093 /// Host pointer name 1094 const map_var_info_t HstPtrName = nullptr; 1095 1096 FirstPrivateArgInfoTy(int Index, const void *HstPtr, int64_t Size, 1097 const map_var_info_t HstPtrName = nullptr) 1098 : Index(Index), HstPtrBegin(reinterpret_cast<const char *>(HstPtr)), 1099 HstPtrEnd(HstPtrBegin + Size), AlignedSize(Size + Size % Alignment), 1100 HstPtrName(HstPtrName) {} 1101 }; 1102 1103 /// A vector of target pointers for all private arguments 1104 std::vector<void *> TgtPtrs; 1105 1106 /// A vector of information of all first-private arguments to be packed 1107 std::vector<FirstPrivateArgInfoTy> FirstPrivateArgInfo; 1108 /// Host buffer for all arguments to be packed 1109 std::vector<char> FirstPrivateArgBuffer; 1110 /// The total size of all arguments to be packed 1111 int64_t FirstPrivateArgSize = 0; 1112 1113 /// A reference to the \p DeviceTy object 1114 DeviceTy &Device; 1115 /// A pointer to a \p AsyncInfoTy object 1116 AsyncInfoTy &AsyncInfo; 1117 1118 // TODO: What would be the best value here? Should we make it configurable? 1119 // If the size is larger than this threshold, we will allocate and transfer it 1120 // immediately instead of packing it. 1121 static constexpr const int64_t FirstPrivateArgSizeThreshold = 1024; 1122 1123 public: 1124 /// Constructor 1125 PrivateArgumentManagerTy(DeviceTy &Dev, AsyncInfoTy &AsyncInfo) 1126 : Device(Dev), AsyncInfo(AsyncInfo) {} 1127 1128 /// Add a private argument 1129 int addArg(void *HstPtr, int64_t ArgSize, int64_t ArgOffset, 1130 bool IsFirstPrivate, void *&TgtPtr, int TgtArgsIndex, 1131 const map_var_info_t HstPtrName = nullptr, 1132 const bool AllocImmediately = false) { 1133 // If the argument is not first-private, or its size is greater than a 1134 // predefined threshold, we will allocate memory and issue the transfer 1135 // immediately. 1136 if (ArgSize > FirstPrivateArgSizeThreshold || !IsFirstPrivate || 1137 AllocImmediately) { 1138 TgtPtr = Device.allocData(ArgSize, HstPtr); 1139 if (!TgtPtr) { 1140 DP("Data allocation for %sprivate array " DPxMOD " failed.\n", 1141 (IsFirstPrivate ? "first-" : ""), DPxPTR(HstPtr)); 1142 return OFFLOAD_FAIL; 1143 } 1144 #ifdef OMPTARGET_DEBUG 1145 void *TgtPtrBase = (void *)((intptr_t)TgtPtr + ArgOffset); 1146 DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD 1147 " for %sprivate array " DPxMOD " - pushing target argument " DPxMOD 1148 "\n", 1149 ArgSize, DPxPTR(TgtPtr), (IsFirstPrivate ? "first-" : ""), 1150 DPxPTR(HstPtr), DPxPTR(TgtPtrBase)); 1151 #endif 1152 // If first-private, copy data from host 1153 if (IsFirstPrivate) { 1154 DP("Submitting firstprivate data to the device.\n"); 1155 int Ret = Device.submitData(TgtPtr, HstPtr, ArgSize, AsyncInfo); 1156 if (Ret != OFFLOAD_SUCCESS) { 1157 DP("Copying data to device failed, failed.\n"); 1158 return OFFLOAD_FAIL; 1159 } 1160 } 1161 TgtPtrs.push_back(TgtPtr); 1162 } else { 1163 DP("Firstprivate array " DPxMOD " of size %" PRId64 " will be packed\n", 1164 DPxPTR(HstPtr), ArgSize); 1165 // When reach this point, the argument must meet all following 1166 // requirements: 1167 // 1. Its size does not exceed the threshold (see the comment for 1168 // FirstPrivateArgSizeThreshold); 1169 // 2. It must be first-private (needs to be mapped to target device). 1170 // We will pack all this kind of arguments to transfer them all at once 1171 // to reduce the number of data transfer. We will not take 1172 // non-first-private arguments, aka. private arguments that doesn't need 1173 // to be mapped to target device, into account because data allocation 1174 // can be very efficient with memory manager. 1175 1176 // Placeholder value 1177 TgtPtr = nullptr; 1178 FirstPrivateArgInfo.emplace_back(TgtArgsIndex, HstPtr, ArgSize, 1179 HstPtrName); 1180 FirstPrivateArgSize += FirstPrivateArgInfo.back().AlignedSize; 1181 } 1182 1183 return OFFLOAD_SUCCESS; 1184 } 1185 1186 /// Pack first-private arguments, replace place holder pointers in \p TgtArgs, 1187 /// and start the transfer. 1188 int packAndTransfer(std::vector<void *> &TgtArgs) { 1189 if (!FirstPrivateArgInfo.empty()) { 1190 assert(FirstPrivateArgSize != 0 && 1191 "FirstPrivateArgSize is 0 but FirstPrivateArgInfo is empty"); 1192 FirstPrivateArgBuffer.resize(FirstPrivateArgSize, 0); 1193 auto Itr = FirstPrivateArgBuffer.begin(); 1194 // Copy all host data to this buffer 1195 for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) { 1196 std::copy(Info.HstPtrBegin, Info.HstPtrEnd, Itr); 1197 Itr = std::next(Itr, Info.AlignedSize); 1198 } 1199 // Allocate target memory 1200 void *TgtPtr = 1201 Device.allocData(FirstPrivateArgSize, FirstPrivateArgBuffer.data()); 1202 if (TgtPtr == nullptr) { 1203 DP("Failed to allocate target memory for private arguments.\n"); 1204 return OFFLOAD_FAIL; 1205 } 1206 TgtPtrs.push_back(TgtPtr); 1207 DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD "\n", 1208 FirstPrivateArgSize, DPxPTR(TgtPtr)); 1209 // Transfer data to target device 1210 int Ret = Device.submitData(TgtPtr, FirstPrivateArgBuffer.data(), 1211 FirstPrivateArgSize, AsyncInfo); 1212 if (Ret != OFFLOAD_SUCCESS) { 1213 DP("Failed to submit data of private arguments.\n"); 1214 return OFFLOAD_FAIL; 1215 } 1216 // Fill in all placeholder pointers 1217 auto TP = reinterpret_cast<uintptr_t>(TgtPtr); 1218 for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) { 1219 void *&Ptr = TgtArgs[Info.Index]; 1220 assert(Ptr == nullptr && "Target pointer is already set by mistaken"); 1221 Ptr = reinterpret_cast<void *>(TP); 1222 TP += Info.AlignedSize; 1223 DP("Firstprivate array " DPxMOD " of size %" PRId64 " mapped to " DPxMOD 1224 "\n", 1225 DPxPTR(Info.HstPtrBegin), Info.HstPtrEnd - Info.HstPtrBegin, 1226 DPxPTR(Ptr)); 1227 } 1228 } 1229 1230 return OFFLOAD_SUCCESS; 1231 } 1232 1233 /// Free all target memory allocated for private arguments 1234 int free() { 1235 for (void *P : TgtPtrs) { 1236 int Ret = Device.deleteData(P); 1237 if (Ret != OFFLOAD_SUCCESS) { 1238 DP("Deallocation of (first-)private arrays failed.\n"); 1239 return OFFLOAD_FAIL; 1240 } 1241 } 1242 1243 TgtPtrs.clear(); 1244 1245 return OFFLOAD_SUCCESS; 1246 } 1247 }; 1248 1249 /// Process data before launching the kernel, including calling targetDataBegin 1250 /// to map and transfer data to target device, transferring (first-)private 1251 /// variables. 1252 static int processDataBefore(ident_t *loc, int64_t DeviceId, void *HostPtr, 1253 int32_t ArgNum, void **ArgBases, void **Args, 1254 int64_t *ArgSizes, int64_t *ArgTypes, 1255 map_var_info_t *ArgNames, void **ArgMappers, 1256 std::vector<void *> &TgtArgs, 1257 std::vector<ptrdiff_t> &TgtOffsets, 1258 PrivateArgumentManagerTy &PrivateArgumentManager, 1259 AsyncInfoTy &AsyncInfo) { 1260 TIMESCOPE_WITH_NAME_AND_IDENT("mappingBeforeTargetRegion", loc); 1261 DeviceTy &Device = PM->Devices[DeviceId]; 1262 int Ret = targetDataBegin(loc, Device, ArgNum, ArgBases, Args, ArgSizes, 1263 ArgTypes, ArgNames, ArgMappers, AsyncInfo); 1264 if (Ret != OFFLOAD_SUCCESS) { 1265 REPORT("Call to targetDataBegin failed, abort target.\n"); 1266 return OFFLOAD_FAIL; 1267 } 1268 1269 // List of (first-)private arrays allocated for this target region 1270 std::vector<int> TgtArgsPositions(ArgNum, -1); 1271 1272 for (int32_t I = 0; I < ArgNum; ++I) { 1273 if (!(ArgTypes[I] & OMP_TGT_MAPTYPE_TARGET_PARAM)) { 1274 // This is not a target parameter, do not push it into TgtArgs. 1275 // Check for lambda mapping. 1276 if (isLambdaMapping(ArgTypes[I])) { 1277 assert((ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) && 1278 "PTR_AND_OBJ must be also MEMBER_OF."); 1279 unsigned Idx = getParentIndex(ArgTypes[I]); 1280 int TgtIdx = TgtArgsPositions[Idx]; 1281 assert(TgtIdx != -1 && "Base address must be translated already."); 1282 // The parent lambda must be processed already and it must be the last 1283 // in TgtArgs and TgtOffsets arrays. 1284 void *HstPtrVal = Args[I]; 1285 void *HstPtrBegin = ArgBases[I]; 1286 void *HstPtrBase = Args[Idx]; 1287 bool IsLast, IsHostPtr; // unused. 1288 void *TgtPtrBase = 1289 (void *)((intptr_t)TgtArgs[TgtIdx] + TgtOffsets[TgtIdx]); 1290 DP("Parent lambda base " DPxMOD "\n", DPxPTR(TgtPtrBase)); 1291 uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase; 1292 void *TgtPtrBegin = (void *)((uintptr_t)TgtPtrBase + Delta); 1293 void *&PointerTgtPtrBegin = AsyncInfo.getVoidPtrLocation(); 1294 PointerTgtPtrBegin = Device.getTgtPtrBegin(HstPtrVal, ArgSizes[I], 1295 IsLast, false, IsHostPtr); 1296 if (!PointerTgtPtrBegin) { 1297 DP("No lambda captured variable mapped (" DPxMOD ") - ignored\n", 1298 DPxPTR(HstPtrVal)); 1299 continue; 1300 } 1301 if (PM->RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && 1302 TgtPtrBegin == HstPtrBegin) { 1303 DP("Unified memory is active, no need to map lambda captured" 1304 "variable (" DPxMOD ")\n", 1305 DPxPTR(HstPtrVal)); 1306 continue; 1307 } 1308 DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n", 1309 DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin)); 1310 Ret = Device.submitData(TgtPtrBegin, &PointerTgtPtrBegin, 1311 sizeof(void *), AsyncInfo); 1312 if (Ret != OFFLOAD_SUCCESS) { 1313 REPORT("Copying data to device failed.\n"); 1314 return OFFLOAD_FAIL; 1315 } 1316 } 1317 continue; 1318 } 1319 void *HstPtrBegin = Args[I]; 1320 void *HstPtrBase = ArgBases[I]; 1321 void *TgtPtrBegin; 1322 map_var_info_t HstPtrName = (!ArgNames) ? nullptr : ArgNames[I]; 1323 ptrdiff_t TgtBaseOffset; 1324 bool IsLast, IsHostPtr; // unused. 1325 if (ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) { 1326 DP("Forwarding first-private value " DPxMOD " to the target construct\n", 1327 DPxPTR(HstPtrBase)); 1328 TgtPtrBegin = HstPtrBase; 1329 TgtBaseOffset = 0; 1330 } else if (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE) { 1331 TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin; 1332 const bool IsFirstPrivate = (ArgTypes[I] & OMP_TGT_MAPTYPE_TO); 1333 // If there is a next argument and it depends on the current one, we need 1334 // to allocate the private memory immediately. If this is not the case, 1335 // then the argument can be marked for optimization and packed with the 1336 // other privates. 1337 const bool AllocImmediately = 1338 (I < ArgNum - 1 && (ArgTypes[I + 1] & OMP_TGT_MAPTYPE_MEMBER_OF)); 1339 Ret = PrivateArgumentManager.addArg( 1340 HstPtrBegin, ArgSizes[I], TgtBaseOffset, IsFirstPrivate, TgtPtrBegin, 1341 TgtArgs.size(), HstPtrName, AllocImmediately); 1342 if (Ret != OFFLOAD_SUCCESS) { 1343 REPORT("Failed to process %sprivate argument " DPxMOD "\n", 1344 (IsFirstPrivate ? "first-" : ""), DPxPTR(HstPtrBegin)); 1345 return OFFLOAD_FAIL; 1346 } 1347 } else { 1348 if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) 1349 HstPtrBase = *reinterpret_cast<void **>(HstPtrBase); 1350 TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, ArgSizes[I], IsLast, 1351 false, IsHostPtr); 1352 TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin; 1353 #ifdef OMPTARGET_DEBUG 1354 void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset); 1355 DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD "\n", 1356 DPxPTR(TgtPtrBase), DPxPTR(HstPtrBegin)); 1357 #endif 1358 } 1359 TgtArgsPositions[I] = TgtArgs.size(); 1360 TgtArgs.push_back(TgtPtrBegin); 1361 TgtOffsets.push_back(TgtBaseOffset); 1362 } 1363 1364 assert(TgtArgs.size() == TgtOffsets.size() && 1365 "Size mismatch in arguments and offsets"); 1366 1367 // Pack and transfer first-private arguments 1368 Ret = PrivateArgumentManager.packAndTransfer(TgtArgs); 1369 if (Ret != OFFLOAD_SUCCESS) { 1370 DP("Failed to pack and transfer first private arguments\n"); 1371 return OFFLOAD_FAIL; 1372 } 1373 1374 return OFFLOAD_SUCCESS; 1375 } 1376 1377 /// Process data after launching the kernel, including transferring data back to 1378 /// host if needed and deallocating target memory of (first-)private variables. 1379 static int processDataAfter(ident_t *loc, int64_t DeviceId, void *HostPtr, 1380 int32_t ArgNum, void **ArgBases, void **Args, 1381 int64_t *ArgSizes, int64_t *ArgTypes, 1382 map_var_info_t *ArgNames, void **ArgMappers, 1383 PrivateArgumentManagerTy &PrivateArgumentManager, 1384 AsyncInfoTy &AsyncInfo) { 1385 TIMESCOPE_WITH_NAME_AND_IDENT("mappingAfterTargetRegion", loc); 1386 DeviceTy &Device = PM->Devices[DeviceId]; 1387 1388 // Move data from device. 1389 int Ret = targetDataEnd(loc, Device, ArgNum, ArgBases, Args, ArgSizes, 1390 ArgTypes, ArgNames, ArgMappers, AsyncInfo); 1391 if (Ret != OFFLOAD_SUCCESS) { 1392 REPORT("Call to targetDataEnd failed, abort target.\n"); 1393 return OFFLOAD_FAIL; 1394 } 1395 1396 // Free target memory for private arguments 1397 Ret = PrivateArgumentManager.free(); 1398 if (Ret != OFFLOAD_SUCCESS) { 1399 REPORT("Failed to deallocate target memory for private args\n"); 1400 return OFFLOAD_FAIL; 1401 } 1402 1403 return OFFLOAD_SUCCESS; 1404 } 1405 } // namespace 1406 1407 /// performs the same actions as data_begin in case arg_num is 1408 /// non-zero and initiates run of the offloaded region on the target platform; 1409 /// if arg_num is non-zero after the region execution is done it also 1410 /// performs the same action as data_update and data_end above. This function 1411 /// returns 0 if it was able to transfer the execution to a target and an 1412 /// integer different from zero otherwise. 1413 int target(ident_t *loc, DeviceTy &Device, void *HostPtr, int32_t ArgNum, 1414 void **ArgBases, void **Args, int64_t *ArgSizes, int64_t *ArgTypes, 1415 map_var_info_t *ArgNames, void **ArgMappers, int32_t TeamNum, 1416 int32_t ThreadLimit, int IsTeamConstruct, AsyncInfoTy &AsyncInfo) { 1417 int32_t DeviceId = Device.DeviceID; 1418 1419 TableMap *TM = getTableMap(HostPtr); 1420 // No map for this host pointer found! 1421 if (!TM) { 1422 REPORT("Host ptr " DPxMOD " does not have a matching target pointer.\n", 1423 DPxPTR(HostPtr)); 1424 return OFFLOAD_FAIL; 1425 } 1426 1427 // get target table. 1428 __tgt_target_table *TargetTable = nullptr; 1429 { 1430 std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx); 1431 assert(TM->Table->TargetsTable.size() > (size_t)DeviceId && 1432 "Not expecting a device ID outside the table's bounds!"); 1433 TargetTable = TM->Table->TargetsTable[DeviceId]; 1434 } 1435 assert(TargetTable && "Global data has not been mapped\n"); 1436 1437 std::vector<void *> TgtArgs; 1438 std::vector<ptrdiff_t> TgtOffsets; 1439 1440 PrivateArgumentManagerTy PrivateArgumentManager(Device, AsyncInfo); 1441 1442 int Ret; 1443 if (ArgNum) { 1444 // Process data, such as data mapping, before launching the kernel 1445 Ret = processDataBefore(loc, DeviceId, HostPtr, ArgNum, ArgBases, Args, 1446 ArgSizes, ArgTypes, ArgNames, ArgMappers, TgtArgs, 1447 TgtOffsets, PrivateArgumentManager, AsyncInfo); 1448 if (Ret != OFFLOAD_SUCCESS) { 1449 REPORT("Failed to process data before launching the kernel.\n"); 1450 return OFFLOAD_FAIL; 1451 } 1452 } 1453 1454 // Get loop trip count 1455 uint64_t LoopTripCount = getLoopTripCount(DeviceId); 1456 1457 // Launch device execution. 1458 void *TgtEntryPtr = TargetTable->EntriesBegin[TM->Index].addr; 1459 DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n", 1460 TargetTable->EntriesBegin[TM->Index].name, DPxPTR(TgtEntryPtr), TM->Index); 1461 1462 { 1463 TIMESCOPE_WITH_NAME_AND_IDENT( 1464 IsTeamConstruct ? "runTargetTeamRegion" : "runTargetRegion", loc); 1465 if (IsTeamConstruct) 1466 Ret = Device.runTeamRegion(TgtEntryPtr, &TgtArgs[0], &TgtOffsets[0], 1467 TgtArgs.size(), TeamNum, ThreadLimit, 1468 LoopTripCount, AsyncInfo); 1469 else 1470 Ret = Device.runRegion(TgtEntryPtr, &TgtArgs[0], &TgtOffsets[0], 1471 TgtArgs.size(), AsyncInfo); 1472 } 1473 1474 if (Ret != OFFLOAD_SUCCESS) { 1475 REPORT("Executing target region abort target.\n"); 1476 return OFFLOAD_FAIL; 1477 } 1478 1479 if (ArgNum) { 1480 // Transfer data back and deallocate target memory for (first-)private 1481 // variables 1482 Ret = processDataAfter(loc, DeviceId, HostPtr, ArgNum, ArgBases, Args, 1483 ArgSizes, ArgTypes, ArgNames, ArgMappers, 1484 PrivateArgumentManager, AsyncInfo); 1485 if (Ret != OFFLOAD_SUCCESS) { 1486 REPORT("Failed to process data after launching the kernel.\n"); 1487 return OFFLOAD_FAIL; 1488 } 1489 } 1490 1491 return OFFLOAD_SUCCESS; 1492 } 1493