1 //===------ omptarget.cpp - Target independent OpenMP target RTL -- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // Implementation of the interface to be used by Clang during the codegen of a 10 // target region. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "omptarget.h" 15 #include "device.h" 16 #include "private.h" 17 #include "rtl.h" 18 19 #include <cassert> 20 #include <cstdint> 21 #include <vector> 22 23 int AsyncInfoTy::synchronize() { 24 int Result = OFFLOAD_SUCCESS; 25 if (AsyncInfo.Queue) { 26 // If we have a queue we need to synchronize it now. 27 Result = Device.synchronize(*this); 28 assert(AsyncInfo.Queue == nullptr && 29 "The device plugin should have nulled the queue to indicate there " 30 "are no outstanding actions!"); 31 } 32 return Result; 33 } 34 35 void *&AsyncInfoTy::getVoidPtrLocation() { 36 BufferLocations.push_back(nullptr); 37 return BufferLocations.back(); 38 } 39 40 /* All begin addresses for partially mapped structs must be 8-aligned in order 41 * to ensure proper alignment of members. E.g. 42 * 43 * struct S { 44 * int a; // 4-aligned 45 * int b; // 4-aligned 46 * int *p; // 8-aligned 47 * } s1; 48 * ... 49 * #pragma omp target map(tofrom: s1.b, s1.p[0:N]) 50 * { 51 * s1.b = 5; 52 * for (int i...) s1.p[i] = ...; 53 * } 54 * 55 * Here we are mapping s1 starting from member b, so BaseAddress=&s1=&s1.a and 56 * BeginAddress=&s1.b. Let's assume that the struct begins at address 0x100, 57 * then &s1.a=0x100, &s1.b=0x104, &s1.p=0x108. Each member obeys the alignment 58 * requirements for its type. Now, when we allocate memory on the device, in 59 * CUDA's case cuMemAlloc() returns an address which is at least 256-aligned. 60 * This means that the chunk of the struct on the device will start at a 61 * 256-aligned address, let's say 0x200. Then the address of b will be 0x200 and 62 * address of p will be a misaligned 0x204 (on the host there was no need to add 63 * padding between b and p, so p comes exactly 4 bytes after b). If the device 64 * kernel tries to access s1.p, a misaligned address error occurs (as reported 65 * by the CUDA plugin). By padding the begin address down to a multiple of 8 and 66 * extending the size of the allocated chuck accordingly, the chuck on the 67 * device will start at 0x200 with the padding (4 bytes), then &s1.b=0x204 and 68 * &s1.p=0x208, as they should be to satisfy the alignment requirements. 69 */ 70 static const int64_t Alignment = 8; 71 72 /// Map global data and execute pending ctors 73 static int InitLibrary(DeviceTy &Device) { 74 /* 75 * Map global data 76 */ 77 int32_t device_id = Device.DeviceID; 78 int rc = OFFLOAD_SUCCESS; 79 bool supportsEmptyImages = Device.RTL->supports_empty_images && 80 Device.RTL->supports_empty_images() > 0; 81 82 std::lock_guard<decltype(Device.PendingGlobalsMtx)> LG( 83 Device.PendingGlobalsMtx); 84 { 85 std::lock_guard<decltype(PM->TrlTblMtx)> LG(PM->TrlTblMtx); 86 for (auto *HostEntriesBegin : PM->HostEntriesBeginRegistrationOrder) { 87 TranslationTable *TransTable = 88 &PM->HostEntriesBeginToTransTable[HostEntriesBegin]; 89 if (TransTable->HostTable.EntriesBegin == 90 TransTable->HostTable.EntriesEnd && 91 !supportsEmptyImages) { 92 // No host entry so no need to proceed 93 continue; 94 } 95 96 if (TransTable->TargetsTable[device_id] != 0) { 97 // Library entries have already been processed 98 continue; 99 } 100 101 // 1) get image. 102 assert(TransTable->TargetsImages.size() > (size_t)device_id && 103 "Not expecting a device ID outside the table's bounds!"); 104 __tgt_device_image *img = TransTable->TargetsImages[device_id]; 105 if (!img) { 106 REPORT("No image loaded for device id %d.\n", device_id); 107 rc = OFFLOAD_FAIL; 108 break; 109 } 110 // 2) load image into the target table. 111 __tgt_target_table *TargetTable = TransTable->TargetsTable[device_id] = 112 Device.load_binary(img); 113 // Unable to get table for this image: invalidate image and fail. 114 if (!TargetTable) { 115 REPORT("Unable to generate entries table for device id %d.\n", 116 device_id); 117 TransTable->TargetsImages[device_id] = 0; 118 rc = OFFLOAD_FAIL; 119 break; 120 } 121 122 // Verify whether the two table sizes match. 123 size_t hsize = 124 TransTable->HostTable.EntriesEnd - TransTable->HostTable.EntriesBegin; 125 size_t tsize = TargetTable->EntriesEnd - TargetTable->EntriesBegin; 126 127 // Invalid image for these host entries! 128 if (hsize != tsize) { 129 REPORT( 130 "Host and Target tables mismatch for device id %d [%zx != %zx].\n", 131 device_id, hsize, tsize); 132 TransTable->TargetsImages[device_id] = 0; 133 TransTable->TargetsTable[device_id] = 0; 134 rc = OFFLOAD_FAIL; 135 break; 136 } 137 138 // process global data that needs to be mapped. 139 std::lock_guard<decltype(Device.DataMapMtx)> LG(Device.DataMapMtx); 140 141 __tgt_target_table *HostTable = &TransTable->HostTable; 142 for (__tgt_offload_entry *CurrDeviceEntry = TargetTable->EntriesBegin, 143 *CurrHostEntry = HostTable->EntriesBegin, 144 *EntryDeviceEnd = TargetTable->EntriesEnd; 145 CurrDeviceEntry != EntryDeviceEnd; 146 CurrDeviceEntry++, CurrHostEntry++) { 147 if (CurrDeviceEntry->size != 0) { 148 // has data. 149 assert(CurrDeviceEntry->size == CurrHostEntry->size && 150 "data size mismatch"); 151 152 // Fortran may use multiple weak declarations for the same symbol, 153 // therefore we must allow for multiple weak symbols to be loaded from 154 // the fat binary. Treat these mappings as any other "regular" 155 // mapping. Add entry to map. 156 if (Device.getTgtPtrBegin(CurrHostEntry->addr, CurrHostEntry->size)) 157 continue; 158 DP("Add mapping from host " DPxMOD " to device " DPxMOD 159 " with size %zu" 160 "\n", 161 DPxPTR(CurrHostEntry->addr), DPxPTR(CurrDeviceEntry->addr), 162 CurrDeviceEntry->size); 163 Device.HostDataToTargetMap.emplace( 164 (uintptr_t)CurrHostEntry->addr /*HstPtrBase*/, 165 (uintptr_t)CurrHostEntry->addr /*HstPtrBegin*/, 166 (uintptr_t)CurrHostEntry->addr + 167 CurrHostEntry->size /*HstPtrEnd*/, 168 (uintptr_t)CurrDeviceEntry->addr /*TgtPtrBegin*/, 169 false /*UseHoldRefCount*/, nullptr /*Name*/, 170 true /*IsRefCountINF*/); 171 } 172 } 173 } 174 } 175 176 if (rc != OFFLOAD_SUCCESS) { 177 return rc; 178 } 179 180 /* 181 * Run ctors for static objects 182 */ 183 if (!Device.PendingCtorsDtors.empty()) { 184 AsyncInfoTy AsyncInfo(Device); 185 // Call all ctors for all libraries registered so far 186 for (auto &lib : Device.PendingCtorsDtors) { 187 if (!lib.second.PendingCtors.empty()) { 188 DP("Has pending ctors... call now\n"); 189 for (auto &entry : lib.second.PendingCtors) { 190 void *ctor = entry; 191 int rc = 192 target(nullptr, Device, ctor, 0, nullptr, nullptr, nullptr, 193 nullptr, nullptr, nullptr, 1, 1, true /*team*/, AsyncInfo); 194 if (rc != OFFLOAD_SUCCESS) { 195 REPORT("Running ctor " DPxMOD " failed.\n", DPxPTR(ctor)); 196 return OFFLOAD_FAIL; 197 } 198 } 199 // Clear the list to indicate that this device has been used 200 lib.second.PendingCtors.clear(); 201 DP("Done with pending ctors for lib " DPxMOD "\n", DPxPTR(lib.first)); 202 } 203 } 204 // All constructors have been issued, wait for them now. 205 if (AsyncInfo.synchronize() != OFFLOAD_SUCCESS) 206 return OFFLOAD_FAIL; 207 } 208 Device.HasPendingGlobals = false; 209 210 return OFFLOAD_SUCCESS; 211 } 212 213 void handleTargetOutcome(bool Success, ident_t *Loc) { 214 switch (PM->TargetOffloadPolicy) { 215 case tgt_disabled: 216 if (Success) { 217 FATAL_MESSAGE0(1, "expected no offloading while offloading is disabled"); 218 } 219 break; 220 case tgt_default: 221 FATAL_MESSAGE0(1, "default offloading policy must be switched to " 222 "mandatory or disabled"); 223 break; 224 case tgt_mandatory: 225 if (!Success) { 226 if (getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE) 227 for (auto &Device : PM->Devices) 228 dumpTargetPointerMappings(Loc, *Device); 229 else 230 FAILURE_MESSAGE("Consult https://openmp.llvm.org/design/Runtimes.html " 231 "for debugging options.\n"); 232 233 SourceInfo info(Loc); 234 if (info.isAvailible()) 235 fprintf(stderr, "%s:%d:%d: ", info.getFilename(), info.getLine(), 236 info.getColumn()); 237 else 238 FAILURE_MESSAGE("Source location information not present. Compile with " 239 "-g or -gline-tables-only.\n"); 240 FATAL_MESSAGE0( 241 1, "failure of target construct while offloading is mandatory"); 242 } else { 243 if (getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE) 244 for (auto &Device : PM->Devices) 245 dumpTargetPointerMappings(Loc, *Device); 246 } 247 break; 248 } 249 } 250 251 static void handleDefaultTargetOffload() { 252 std::lock_guard<decltype(PM->TargetOffloadMtx)> LG(PM->TargetOffloadMtx); 253 if (PM->TargetOffloadPolicy == tgt_default) { 254 if (omp_get_num_devices() > 0) { 255 DP("Default TARGET OFFLOAD policy is now mandatory " 256 "(devices were found)\n"); 257 PM->TargetOffloadPolicy = tgt_mandatory; 258 } else { 259 DP("Default TARGET OFFLOAD policy is now disabled " 260 "(no devices were found)\n"); 261 PM->TargetOffloadPolicy = tgt_disabled; 262 } 263 } 264 } 265 266 static bool isOffloadDisabled() { 267 if (PM->TargetOffloadPolicy == tgt_default) 268 handleDefaultTargetOffload(); 269 return PM->TargetOffloadPolicy == tgt_disabled; 270 } 271 272 // If offload is enabled, ensure that device DeviceID has been initialized, 273 // global ctors have been executed, and global data has been mapped. 274 // 275 // The return bool indicates if the offload is to the host device 276 // There are three possible results: 277 // - Return false if the taregt device is ready for offload 278 // - Return true without reporting a runtime error if offload is 279 // disabled, perhaps because the initial device was specified. 280 // - Report a runtime error and return true. 281 // 282 // If DeviceID == OFFLOAD_DEVICE_DEFAULT, set DeviceID to the default device. 283 // This step might be skipped if offload is disabled. 284 bool checkDeviceAndCtors(int64_t &DeviceID, ident_t *Loc) { 285 if (isOffloadDisabled()) { 286 DP("Offload is disabled\n"); 287 return true; 288 } 289 290 if (DeviceID == OFFLOAD_DEVICE_DEFAULT) { 291 DeviceID = omp_get_default_device(); 292 DP("Use default device id %" PRId64 "\n", DeviceID); 293 } 294 295 // Proposed behavior for OpenMP 5.2 in OpenMP spec github issue 2669. 296 if (omp_get_num_devices() == 0) { 297 DP("omp_get_num_devices() == 0 but offload is manadatory\n"); 298 handleTargetOutcome(false, Loc); 299 return true; 300 } 301 302 if (DeviceID == omp_get_initial_device()) { 303 DP("Device is host (%" PRId64 "), returning as if offload is disabled\n", 304 DeviceID); 305 return true; 306 } 307 308 // Is device ready? 309 if (!device_is_ready(DeviceID)) { 310 REPORT("Device %" PRId64 " is not ready.\n", DeviceID); 311 handleTargetOutcome(false, Loc); 312 return true; 313 } 314 315 // Get device info. 316 DeviceTy &Device = *PM->Devices[DeviceID]; 317 318 // Check whether global data has been mapped for this device 319 bool HasPendingGlobals; 320 { 321 std::lock_guard<decltype(Device.PendingGlobalsMtx)> LG( 322 Device.PendingGlobalsMtx); 323 HasPendingGlobals = Device.HasPendingGlobals; 324 } 325 if (HasPendingGlobals && InitLibrary(Device) != OFFLOAD_SUCCESS) { 326 REPORT("Failed to init globals on device %" PRId64 "\n", DeviceID); 327 handleTargetOutcome(false, Loc); 328 return true; 329 } 330 331 return false; 332 } 333 334 static int32_t getParentIndex(int64_t type) { 335 return ((type & OMP_TGT_MAPTYPE_MEMBER_OF) >> 48) - 1; 336 } 337 338 void *targetAllocExplicit(size_t size, int device_num, int kind, 339 const char *name) { 340 TIMESCOPE(); 341 DP("Call to %s for device %d requesting %zu bytes\n", name, device_num, size); 342 343 if (size <= 0) { 344 DP("Call to %s with non-positive length\n", name); 345 return NULL; 346 } 347 348 void *rc = NULL; 349 350 if (device_num == omp_get_initial_device()) { 351 rc = malloc(size); 352 DP("%s returns host ptr " DPxMOD "\n", name, DPxPTR(rc)); 353 return rc; 354 } 355 356 if (!device_is_ready(device_num)) { 357 DP("%s returns NULL ptr\n", name); 358 return NULL; 359 } 360 361 DeviceTy &Device = *PM->Devices[device_num]; 362 rc = Device.allocData(size, nullptr, kind); 363 DP("%s returns device ptr " DPxMOD "\n", name, DPxPTR(rc)); 364 return rc; 365 } 366 367 /// Call the user-defined mapper function followed by the appropriate 368 // targetData* function (targetData{Begin,End,Update}). 369 int targetDataMapper(ident_t *loc, DeviceTy &Device, void *arg_base, void *arg, 370 int64_t arg_size, int64_t arg_type, 371 map_var_info_t arg_names, void *arg_mapper, 372 AsyncInfoTy &AsyncInfo, 373 TargetDataFuncPtrTy target_data_function) { 374 TIMESCOPE_WITH_IDENT(loc); 375 DP("Calling the mapper function " DPxMOD "\n", DPxPTR(arg_mapper)); 376 377 // The mapper function fills up Components. 378 MapperComponentsTy MapperComponents; 379 MapperFuncPtrTy MapperFuncPtr = (MapperFuncPtrTy)(arg_mapper); 380 (*MapperFuncPtr)((void *)&MapperComponents, arg_base, arg, arg_size, arg_type, 381 arg_names); 382 383 // Construct new arrays for args_base, args, arg_sizes and arg_types 384 // using the information in MapperComponents and call the corresponding 385 // targetData* function using these new arrays. 386 std::vector<void *> MapperArgsBase(MapperComponents.Components.size()); 387 std::vector<void *> MapperArgs(MapperComponents.Components.size()); 388 std::vector<int64_t> MapperArgSizes(MapperComponents.Components.size()); 389 std::vector<int64_t> MapperArgTypes(MapperComponents.Components.size()); 390 std::vector<void *> MapperArgNames(MapperComponents.Components.size()); 391 392 for (unsigned I = 0, E = MapperComponents.Components.size(); I < E; ++I) { 393 auto &C = MapperComponents.Components[I]; 394 MapperArgsBase[I] = C.Base; 395 MapperArgs[I] = C.Begin; 396 MapperArgSizes[I] = C.Size; 397 MapperArgTypes[I] = C.Type; 398 MapperArgNames[I] = C.Name; 399 } 400 401 int rc = target_data_function(loc, Device, MapperComponents.Components.size(), 402 MapperArgsBase.data(), MapperArgs.data(), 403 MapperArgSizes.data(), MapperArgTypes.data(), 404 MapperArgNames.data(), /*arg_mappers*/ nullptr, 405 AsyncInfo, /*FromMapper=*/true); 406 407 return rc; 408 } 409 410 /// Internal function to do the mapping and transfer the data to the device 411 int targetDataBegin(ident_t *loc, DeviceTy &Device, int32_t arg_num, 412 void **args_base, void **args, int64_t *arg_sizes, 413 int64_t *arg_types, map_var_info_t *arg_names, 414 void **arg_mappers, AsyncInfoTy &AsyncInfo, 415 bool FromMapper) { 416 // process each input. 417 for (int32_t i = 0; i < arg_num; ++i) { 418 // Ignore private variables and arrays - there is no mapping for them. 419 if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) || 420 (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE)) 421 continue; 422 423 if (arg_mappers && arg_mappers[i]) { 424 // Instead of executing the regular path of targetDataBegin, call the 425 // targetDataMapper variant which will call targetDataBegin again 426 // with new arguments. 427 DP("Calling targetDataMapper for the %dth argument\n", i); 428 429 map_var_info_t arg_name = (!arg_names) ? nullptr : arg_names[i]; 430 int rc = targetDataMapper(loc, Device, args_base[i], args[i], 431 arg_sizes[i], arg_types[i], arg_name, 432 arg_mappers[i], AsyncInfo, targetDataBegin); 433 434 if (rc != OFFLOAD_SUCCESS) { 435 REPORT("Call to targetDataBegin via targetDataMapper for custom mapper" 436 " failed.\n"); 437 return OFFLOAD_FAIL; 438 } 439 440 // Skip the rest of this function, continue to the next argument. 441 continue; 442 } 443 444 void *HstPtrBegin = args[i]; 445 void *HstPtrBase = args_base[i]; 446 int64_t data_size = arg_sizes[i]; 447 map_var_info_t HstPtrName = (!arg_names) ? nullptr : arg_names[i]; 448 449 // Adjust for proper alignment if this is a combined entry (for structs). 450 // Look at the next argument - if that is MEMBER_OF this one, then this one 451 // is a combined entry. 452 int64_t padding = 0; 453 const int next_i = i + 1; 454 if (getParentIndex(arg_types[i]) < 0 && next_i < arg_num && 455 getParentIndex(arg_types[next_i]) == i) { 456 padding = (int64_t)HstPtrBegin % Alignment; 457 if (padding) { 458 DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD 459 "\n", 460 padding, DPxPTR(HstPtrBegin)); 461 HstPtrBegin = (char *)HstPtrBegin - padding; 462 data_size += padding; 463 } 464 } 465 466 // Address of pointer on the host and device, respectively. 467 void *Pointer_HstPtrBegin, *PointerTgtPtrBegin; 468 TargetPointerResultTy Pointer_TPR; 469 bool IsHostPtr = false; 470 bool IsImplicit = arg_types[i] & OMP_TGT_MAPTYPE_IMPLICIT; 471 // Force the creation of a device side copy of the data when: 472 // a close map modifier was associated with a map that contained a to. 473 bool HasCloseModifier = arg_types[i] & OMP_TGT_MAPTYPE_CLOSE; 474 bool HasPresentModifier = arg_types[i] & OMP_TGT_MAPTYPE_PRESENT; 475 bool HasHoldModifier = arg_types[i] & OMP_TGT_MAPTYPE_OMPX_HOLD; 476 // UpdateRef is based on MEMBER_OF instead of TARGET_PARAM because if we 477 // have reached this point via __tgt_target_data_begin and not __tgt_target 478 // then no argument is marked as TARGET_PARAM ("omp target data map" is not 479 // associated with a target region, so there are no target parameters). This 480 // may be considered a hack, we could revise the scheme in the future. 481 bool UpdateRef = 482 !(arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) && !(FromMapper && i == 0); 483 if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) { 484 DP("Has a pointer entry: \n"); 485 // Base is address of pointer. 486 // 487 // Usually, the pointer is already allocated by this time. For example: 488 // 489 // #pragma omp target map(s.p[0:N]) 490 // 491 // The map entry for s comes first, and the PTR_AND_OBJ entry comes 492 // afterward, so the pointer is already allocated by the time the 493 // PTR_AND_OBJ entry is handled below, and PointerTgtPtrBegin is thus 494 // non-null. However, "declare target link" can produce a PTR_AND_OBJ 495 // entry for a global that might not already be allocated by the time the 496 // PTR_AND_OBJ entry is handled below, and so the allocation might fail 497 // when HasPresentModifier. 498 Pointer_TPR = Device.getTargetPointer( 499 HstPtrBase, HstPtrBase, sizeof(void *), /*HstPtrName=*/nullptr, 500 /*HasFlagTo=*/false, /*HasFlagAlways=*/false, IsImplicit, UpdateRef, 501 HasCloseModifier, HasPresentModifier, HasHoldModifier, AsyncInfo); 502 PointerTgtPtrBegin = Pointer_TPR.TargetPointer; 503 IsHostPtr = Pointer_TPR.Flags.IsHostPointer; 504 if (!PointerTgtPtrBegin) { 505 REPORT("Call to getTargetPointer returned null pointer (%s).\n", 506 HasPresentModifier ? "'present' map type modifier" 507 : "device failure or illegal mapping"); 508 return OFFLOAD_FAIL; 509 } 510 DP("There are %zu bytes allocated at target address " DPxMOD " - is%s new" 511 "\n", 512 sizeof(void *), DPxPTR(PointerTgtPtrBegin), 513 (Pointer_TPR.Flags.IsNewEntry ? "" : " not")); 514 Pointer_HstPtrBegin = HstPtrBase; 515 // modify current entry. 516 HstPtrBase = *(void **)HstPtrBase; 517 // No need to update pointee ref count for the first element of the 518 // subelement that comes from mapper. 519 UpdateRef = 520 (!FromMapper || i != 0); // subsequently update ref count of pointee 521 } 522 523 const bool HasFlagTo = arg_types[i] & OMP_TGT_MAPTYPE_TO; 524 const bool HasFlagAlways = arg_types[i] & OMP_TGT_MAPTYPE_ALWAYS; 525 auto TPR = Device.getTargetPointer( 526 HstPtrBegin, HstPtrBase, data_size, HstPtrName, HasFlagTo, 527 HasFlagAlways, IsImplicit, UpdateRef, HasCloseModifier, 528 HasPresentModifier, HasHoldModifier, AsyncInfo); 529 void *TgtPtrBegin = TPR.TargetPointer; 530 IsHostPtr = TPR.Flags.IsHostPointer; 531 // If data_size==0, then the argument could be a zero-length pointer to 532 // NULL, so getOrAlloc() returning NULL is not an error. 533 if (!TgtPtrBegin && (data_size || HasPresentModifier)) { 534 REPORT("Call to getTargetPointer returned null pointer (%s).\n", 535 HasPresentModifier ? "'present' map type modifier" 536 : "device failure or illegal mapping"); 537 return OFFLOAD_FAIL; 538 } 539 DP("There are %" PRId64 " bytes allocated at target address " DPxMOD 540 " - is%s new\n", 541 data_size, DPxPTR(TgtPtrBegin), (TPR.Flags.IsNewEntry ? "" : " not")); 542 543 if (arg_types[i] & OMP_TGT_MAPTYPE_RETURN_PARAM) { 544 uintptr_t Delta = (uintptr_t)HstPtrBegin - (uintptr_t)HstPtrBase; 545 void *TgtPtrBase = (void *)((uintptr_t)TgtPtrBegin - Delta); 546 DP("Returning device pointer " DPxMOD "\n", DPxPTR(TgtPtrBase)); 547 args_base[i] = TgtPtrBase; 548 } 549 550 if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ && !IsHostPtr) { 551 // Check whether we need to update the pointer on the device 552 bool UpdateDevPtr = false; 553 554 uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase; 555 void *ExpectedTgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta); 556 557 Device.ShadowMtx.lock(); 558 auto Entry = Device.ShadowPtrMap.find(Pointer_HstPtrBegin); 559 // If this pointer is not in the map we need to insert it. If the map 560 // contains a stale entry, we need to update it (e.g. if the pointee was 561 // deallocated and later on is reallocated at another device address). The 562 // latter scenario is the subject of LIT test env/base_ptr_ref_count.c. An 563 // entry is removed from ShadowPtrMap only when the PTR of a PTR_AND_OBJ 564 // pair is deallocated, not when the OBJ is deallocated. In 565 // env/base_ptr_ref_count.c the PTR is a global "declare target" pointer, 566 // so it stays in the map for the lifetime of the application. When the 567 // OBJ is deallocated and later on allocated again (at a different device 568 // address), ShadowPtrMap still contains an entry for Pointer_HstPtrBegin 569 // which is stale, pointing to the old ExpectedTgtPtrBase of the OBJ. 570 if (Entry == Device.ShadowPtrMap.end() || 571 Entry->second.TgtPtrVal != ExpectedTgtPtrBase) { 572 // create or update shadow pointers for this entry 573 Device.ShadowPtrMap[Pointer_HstPtrBegin] = { 574 HstPtrBase, PointerTgtPtrBegin, ExpectedTgtPtrBase}; 575 Pointer_TPR.MapTableEntry->setMayContainAttachedPointers(); 576 UpdateDevPtr = true; 577 } 578 579 if (UpdateDevPtr) { 580 std::lock_guard<decltype(*Pointer_TPR.MapTableEntry)> LG( 581 *Pointer_TPR.MapTableEntry); 582 Device.ShadowMtx.unlock(); 583 584 DP("Update pointer (" DPxMOD ") -> [" DPxMOD "]\n", 585 DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin)); 586 587 void *&TgtPtrBase = AsyncInfo.getVoidPtrLocation(); 588 TgtPtrBase = ExpectedTgtPtrBase; 589 590 int Ret = Device.submitData(PointerTgtPtrBegin, &TgtPtrBase, 591 sizeof(void *), AsyncInfo); 592 if (Ret != OFFLOAD_SUCCESS) { 593 REPORT("Copying data to device failed.\n"); 594 return OFFLOAD_FAIL; 595 } 596 if (Pointer_TPR.MapTableEntry->addEventIfNecessary(Device, AsyncInfo) != 597 OFFLOAD_SUCCESS) 598 return OFFLOAD_FAIL; 599 } else 600 Device.ShadowMtx.unlock(); 601 } 602 } 603 604 return OFFLOAD_SUCCESS; 605 } 606 607 namespace { 608 /// This structure contains information to deallocate a target pointer, aka. 609 /// used to call the function \p DeviceTy::deallocTgtPtr. 610 struct DeallocTgtPtrInfo { 611 /// Host pointer used to look up into the map table 612 void *HstPtrBegin; 613 /// Size of the data 614 int64_t DataSize; 615 /// Whether it has \p ompx_hold modifier 616 bool HasHoldModifier; 617 618 DeallocTgtPtrInfo(void *HstPtr, int64_t Size, bool HasHoldModifier) 619 : HstPtrBegin(HstPtr), DataSize(Size), HasHoldModifier(HasHoldModifier) {} 620 }; 621 622 /// Apply \p CB to the shadow map pointer entries in the range \p Begin, to 623 /// \p Begin + \p Size. \p CB is called with a locked shadow pointer map and the 624 /// passed iterator can be updated. If the callback returns OFFLOAD_FAIL the 625 /// rest of the map is not checked anymore. 626 template <typename CBTy> 627 static void applyToShadowMapEntries(DeviceTy &Device, CBTy CB, void *Begin, 628 uintptr_t Size, 629 const TargetPointerResultTy &TPR) { 630 // If we have an object that is too small to hold a pointer subobject, no need 631 // to do any checking. 632 if (Size < sizeof(void *)) 633 return; 634 635 // If the map entry for the object was never marked as containing attached 636 // pointers, no need to do any checking. 637 if (TPR.MapTableEntry == HostDataToTargetListTy::iterator{} || 638 !TPR.MapTableEntry->getMayContainAttachedPointers()) 639 return; 640 641 uintptr_t LB = (uintptr_t)Begin; 642 uintptr_t UB = LB + Size; 643 // Now we are looking into the shadow map so we need to lock it. 644 std::lock_guard<decltype(Device.ShadowMtx)> LG(Device.ShadowMtx); 645 for (ShadowPtrListTy::iterator Itr = Device.ShadowPtrMap.begin(); 646 Itr != Device.ShadowPtrMap.end();) { 647 uintptr_t ShadowHstPtrAddr = (uintptr_t)Itr->first; 648 649 // An STL map is sorted on its keys; use this property 650 // to quickly determine when to break out of the loop. 651 if (ShadowHstPtrAddr < LB) { 652 ++Itr; 653 continue; 654 } 655 if (ShadowHstPtrAddr >= UB) 656 break; 657 658 if (CB(Itr) == OFFLOAD_FAIL) 659 break; 660 } 661 } 662 663 } // namespace 664 665 /// Internal function to undo the mapping and retrieve the data from the device. 666 int targetDataEnd(ident_t *loc, DeviceTy &Device, int32_t ArgNum, 667 void **ArgBases, void **Args, int64_t *ArgSizes, 668 int64_t *ArgTypes, map_var_info_t *ArgNames, 669 void **ArgMappers, AsyncInfoTy &AsyncInfo, bool FromMapper) { 670 int Ret; 671 std::vector<DeallocTgtPtrInfo> DeallocTgtPtrs; 672 void *FromMapperBase = nullptr; 673 // process each input. 674 for (int32_t I = ArgNum - 1; I >= 0; --I) { 675 // Ignore private variables and arrays - there is no mapping for them. 676 // Also, ignore the use_device_ptr directive, it has no effect here. 677 if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) || 678 (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE)) 679 continue; 680 681 if (ArgMappers && ArgMappers[I]) { 682 // Instead of executing the regular path of targetDataEnd, call the 683 // targetDataMapper variant which will call targetDataEnd again 684 // with new arguments. 685 DP("Calling targetDataMapper for the %dth argument\n", I); 686 687 map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I]; 688 Ret = targetDataMapper(loc, Device, ArgBases[I], Args[I], ArgSizes[I], 689 ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo, 690 targetDataEnd); 691 692 if (Ret != OFFLOAD_SUCCESS) { 693 REPORT("Call to targetDataEnd via targetDataMapper for custom mapper" 694 " failed.\n"); 695 return OFFLOAD_FAIL; 696 } 697 698 // Skip the rest of this function, continue to the next argument. 699 continue; 700 } 701 702 void *HstPtrBegin = Args[I]; 703 int64_t DataSize = ArgSizes[I]; 704 // Adjust for proper alignment if this is a combined entry (for structs). 705 // Look at the next argument - if that is MEMBER_OF this one, then this one 706 // is a combined entry. 707 const int NextI = I + 1; 708 if (getParentIndex(ArgTypes[I]) < 0 && NextI < ArgNum && 709 getParentIndex(ArgTypes[NextI]) == I) { 710 int64_t Padding = (int64_t)HstPtrBegin % Alignment; 711 if (Padding) { 712 DP("Using a Padding of %" PRId64 " bytes for begin address " DPxMOD 713 "\n", 714 Padding, DPxPTR(HstPtrBegin)); 715 HstPtrBegin = (char *)HstPtrBegin - Padding; 716 DataSize += Padding; 717 } 718 } 719 720 bool IsLast, IsHostPtr; 721 bool IsImplicit = ArgTypes[I] & OMP_TGT_MAPTYPE_IMPLICIT; 722 bool UpdateRef = (!(ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) || 723 (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) && 724 !(FromMapper && I == 0); 725 bool ForceDelete = ArgTypes[I] & OMP_TGT_MAPTYPE_DELETE; 726 bool HasPresentModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_PRESENT; 727 bool HasHoldModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_OMPX_HOLD; 728 729 // If PTR_AND_OBJ, HstPtrBegin is address of pointee 730 TargetPointerResultTy TPR = Device.getTgtPtrBegin( 731 HstPtrBegin, DataSize, IsLast, UpdateRef, HasHoldModifier, IsHostPtr, 732 !IsImplicit, ForceDelete); 733 void *TgtPtrBegin = TPR.TargetPointer; 734 if (!TgtPtrBegin && (DataSize || HasPresentModifier)) { 735 DP("Mapping does not exist (%s)\n", 736 (HasPresentModifier ? "'present' map type modifier" : "ignored")); 737 if (HasPresentModifier) { 738 // OpenMP 5.1, sec. 2.21.7.1 "map Clause", p. 350 L10-13: 739 // "If a map clause appears on a target, target data, target enter data 740 // or target exit data construct with a present map-type-modifier then 741 // on entry to the region if the corresponding list item does not appear 742 // in the device data environment then an error occurs and the program 743 // terminates." 744 // 745 // This should be an error upon entering an "omp target exit data". It 746 // should not be an error upon exiting an "omp target data" or "omp 747 // target". For "omp target data", Clang thus doesn't include present 748 // modifiers for end calls. For "omp target", we have not found a valid 749 // OpenMP program for which the error matters: it appears that, if a 750 // program can guarantee that data is present at the beginning of an 751 // "omp target" region so that there's no error there, that data is also 752 // guaranteed to be present at the end. 753 MESSAGE("device mapping required by 'present' map type modifier does " 754 "not exist for host address " DPxMOD " (%" PRId64 " bytes)", 755 DPxPTR(HstPtrBegin), DataSize); 756 return OFFLOAD_FAIL; 757 } 758 } else { 759 DP("There are %" PRId64 " bytes allocated at target address " DPxMOD 760 " - is%s last\n", 761 DataSize, DPxPTR(TgtPtrBegin), (IsLast ? "" : " not")); 762 } 763 764 // OpenMP 5.1, sec. 2.21.7.1 "map Clause", p. 351 L14-16: 765 // "If the map clause appears on a target, target data, or target exit data 766 // construct and a corresponding list item of the original list item is not 767 // present in the device data environment on exit from the region then the 768 // list item is ignored." 769 if (!TgtPtrBegin) 770 continue; 771 772 bool DelEntry = IsLast; 773 774 // If the last element from the mapper (for end transfer args comes in 775 // reverse order), do not remove the partial entry, the parent struct still 776 // exists. 777 if ((ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) && 778 !(ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) { 779 DelEntry = false; // protect parent struct from being deallocated 780 } 781 782 if ((ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) || DelEntry) { 783 // Move data back to the host 784 if (ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) { 785 bool Always = ArgTypes[I] & OMP_TGT_MAPTYPE_ALWAYS; 786 if ((Always || IsLast) && !IsHostPtr) { 787 DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n", 788 DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin)); 789 Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, DataSize, 790 AsyncInfo); 791 if (Ret != OFFLOAD_SUCCESS) { 792 REPORT("Copying data from device failed.\n"); 793 return OFFLOAD_FAIL; 794 } 795 } 796 } 797 if (DelEntry && FromMapper && I == 0) { 798 DelEntry = false; 799 FromMapperBase = HstPtrBegin; 800 } 801 802 // If we copied back to the host a struct/array containing pointers, we 803 // need to restore the original host pointer values from their shadow 804 // copies. If the struct is going to be deallocated, remove any remaining 805 // shadow pointer entries for this struct. 806 auto CB = [&](ShadowPtrListTy::iterator &Itr) { 807 // If we copied the struct to the host, we need to restore the pointer. 808 if (ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) { 809 void **ShadowHstPtrAddr = (void **)Itr->first; 810 // Wait for device-to-host memcopies for whole struct to complete, 811 // before restoring the correct host pointer. 812 if (AsyncInfo.synchronize() != OFFLOAD_SUCCESS) 813 return OFFLOAD_FAIL; 814 *ShadowHstPtrAddr = Itr->second.HstPtrVal; 815 DP("Restoring original host pointer value " DPxMOD " for host " 816 "pointer " DPxMOD "\n", 817 DPxPTR(Itr->second.HstPtrVal), DPxPTR(ShadowHstPtrAddr)); 818 } 819 // If the struct is to be deallocated, remove the shadow entry. 820 if (DelEntry) { 821 DP("Removing shadow pointer " DPxMOD "\n", 822 DPxPTR((void **)Itr->first)); 823 Itr = Device.ShadowPtrMap.erase(Itr); 824 } else { 825 ++Itr; 826 } 827 return OFFLOAD_SUCCESS; 828 }; 829 applyToShadowMapEntries(Device, CB, HstPtrBegin, DataSize, TPR); 830 831 // Add pointer to the buffer for later deallocation 832 if (DelEntry && !IsHostPtr) 833 DeallocTgtPtrs.emplace_back(HstPtrBegin, DataSize, HasHoldModifier); 834 } 835 } 836 837 // TODO: We should not synchronize here but pass the AsyncInfo object to the 838 // allocate/deallocate device APIs. 839 // 840 // We need to synchronize before deallocating data. 841 Ret = AsyncInfo.synchronize(); 842 if (Ret != OFFLOAD_SUCCESS) 843 return OFFLOAD_FAIL; 844 845 // Deallocate target pointer 846 for (DeallocTgtPtrInfo &Info : DeallocTgtPtrs) { 847 if (FromMapperBase && FromMapperBase == Info.HstPtrBegin) 848 continue; 849 Ret = Device.deallocTgtPtr(Info.HstPtrBegin, Info.DataSize, 850 Info.HasHoldModifier); 851 if (Ret != OFFLOAD_SUCCESS) { 852 REPORT("Deallocating data from device failed.\n"); 853 return OFFLOAD_FAIL; 854 } 855 } 856 857 return OFFLOAD_SUCCESS; 858 } 859 860 static int targetDataContiguous(ident_t *loc, DeviceTy &Device, void *ArgsBase, 861 void *HstPtrBegin, int64_t ArgSize, 862 int64_t ArgType, AsyncInfoTy &AsyncInfo) { 863 TIMESCOPE_WITH_IDENT(loc); 864 bool IsLast, IsHostPtr; 865 TargetPointerResultTy TPR = Device.getTgtPtrBegin( 866 HstPtrBegin, ArgSize, IsLast, /*UpdateRefCount=*/false, 867 /*UseHoldRefCount=*/false, IsHostPtr, /*MustContain=*/true); 868 void *TgtPtrBegin = TPR.TargetPointer; 869 if (!TgtPtrBegin) { 870 DP("hst data:" DPxMOD " not found, becomes a noop\n", DPxPTR(HstPtrBegin)); 871 if (ArgType & OMP_TGT_MAPTYPE_PRESENT) { 872 MESSAGE("device mapping required by 'present' motion modifier does not " 873 "exist for host address " DPxMOD " (%" PRId64 " bytes)", 874 DPxPTR(HstPtrBegin), ArgSize); 875 return OFFLOAD_FAIL; 876 } 877 return OFFLOAD_SUCCESS; 878 } 879 880 if (IsHostPtr) { 881 DP("hst data:" DPxMOD " unified and shared, becomes a noop\n", 882 DPxPTR(HstPtrBegin)); 883 return OFFLOAD_SUCCESS; 884 } 885 886 if (ArgType & OMP_TGT_MAPTYPE_FROM) { 887 DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n", 888 ArgSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin)); 889 int Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, ArgSize, AsyncInfo); 890 if (Ret != OFFLOAD_SUCCESS) { 891 REPORT("Copying data from device failed.\n"); 892 return OFFLOAD_FAIL; 893 } 894 895 auto CB = [&](ShadowPtrListTy::iterator &Itr) { 896 void **ShadowHstPtrAddr = (void **)Itr->first; 897 // Wait for device-to-host memcopies for whole struct to complete, 898 // before restoring the correct host pointer. 899 if (AsyncInfo.synchronize() != OFFLOAD_SUCCESS) 900 return OFFLOAD_FAIL; 901 *ShadowHstPtrAddr = Itr->second.HstPtrVal; 902 DP("Restoring original host pointer value " DPxMOD 903 " for host pointer " DPxMOD "\n", 904 DPxPTR(Itr->second.HstPtrVal), DPxPTR(ShadowHstPtrAddr)); 905 ++Itr; 906 return OFFLOAD_SUCCESS; 907 }; 908 applyToShadowMapEntries(Device, CB, HstPtrBegin, ArgSize, TPR); 909 } 910 911 if (ArgType & OMP_TGT_MAPTYPE_TO) { 912 DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n", 913 ArgSize, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin)); 914 int Ret = Device.submitData(TgtPtrBegin, HstPtrBegin, ArgSize, AsyncInfo); 915 if (Ret != OFFLOAD_SUCCESS) { 916 REPORT("Copying data to device failed.\n"); 917 return OFFLOAD_FAIL; 918 } 919 920 auto CB = [&](ShadowPtrListTy::iterator &Itr) { 921 DP("Restoring original target pointer value " DPxMOD " for target " 922 "pointer " DPxMOD "\n", 923 DPxPTR(Itr->second.TgtPtrVal), DPxPTR(Itr->second.TgtPtrAddr)); 924 Ret = Device.submitData(Itr->second.TgtPtrAddr, &Itr->second.TgtPtrVal, 925 sizeof(void *), AsyncInfo); 926 if (Ret != OFFLOAD_SUCCESS) 927 REPORT("Copying data to device failed.\n"); 928 ++Itr; 929 return Ret; 930 }; 931 applyToShadowMapEntries(Device, CB, HstPtrBegin, ArgSize, TPR); 932 } 933 return OFFLOAD_SUCCESS; 934 } 935 936 static int targetDataNonContiguous(ident_t *loc, DeviceTy &Device, 937 void *ArgsBase, 938 __tgt_target_non_contig *NonContig, 939 uint64_t Size, int64_t ArgType, 940 int CurrentDim, int DimSize, uint64_t Offset, 941 AsyncInfoTy &AsyncInfo) { 942 TIMESCOPE_WITH_IDENT(loc); 943 int Ret = OFFLOAD_SUCCESS; 944 if (CurrentDim < DimSize) { 945 for (unsigned int I = 0; I < NonContig[CurrentDim].Count; ++I) { 946 uint64_t CurOffset = 947 (NonContig[CurrentDim].Offset + I) * NonContig[CurrentDim].Stride; 948 // we only need to transfer the first element for the last dimension 949 // since we've already got a contiguous piece. 950 if (CurrentDim != DimSize - 1 || I == 0) { 951 Ret = targetDataNonContiguous(loc, Device, ArgsBase, NonContig, Size, 952 ArgType, CurrentDim + 1, DimSize, 953 Offset + CurOffset, AsyncInfo); 954 // Stop the whole process if any contiguous piece returns anything 955 // other than OFFLOAD_SUCCESS. 956 if (Ret != OFFLOAD_SUCCESS) 957 return Ret; 958 } 959 } 960 } else { 961 char *Ptr = (char *)ArgsBase + Offset; 962 DP("Transfer of non-contiguous : host ptr " DPxMOD " offset %" PRIu64 963 " len %" PRIu64 "\n", 964 DPxPTR(Ptr), Offset, Size); 965 Ret = targetDataContiguous(loc, Device, ArgsBase, Ptr, Size, ArgType, 966 AsyncInfo); 967 } 968 return Ret; 969 } 970 971 static int getNonContigMergedDimension(__tgt_target_non_contig *NonContig, 972 int32_t DimSize) { 973 int RemovedDim = 0; 974 for (int I = DimSize - 1; I > 0; --I) { 975 if (NonContig[I].Count * NonContig[I].Stride == NonContig[I - 1].Stride) 976 RemovedDim++; 977 } 978 return RemovedDim; 979 } 980 981 /// Internal function to pass data to/from the target. 982 int targetDataUpdate(ident_t *loc, DeviceTy &Device, int32_t ArgNum, 983 void **ArgsBase, void **Args, int64_t *ArgSizes, 984 int64_t *ArgTypes, map_var_info_t *ArgNames, 985 void **ArgMappers, AsyncInfoTy &AsyncInfo, bool) { 986 // process each input. 987 for (int32_t I = 0; I < ArgNum; ++I) { 988 if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) || 989 (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE)) 990 continue; 991 992 if (ArgMappers && ArgMappers[I]) { 993 // Instead of executing the regular path of targetDataUpdate, call the 994 // targetDataMapper variant which will call targetDataUpdate again 995 // with new arguments. 996 DP("Calling targetDataMapper for the %dth argument\n", I); 997 998 map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I]; 999 int Ret = targetDataMapper(loc, Device, ArgsBase[I], Args[I], ArgSizes[I], 1000 ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo, 1001 targetDataUpdate); 1002 1003 if (Ret != OFFLOAD_SUCCESS) { 1004 REPORT("Call to targetDataUpdate via targetDataMapper for custom mapper" 1005 " failed.\n"); 1006 return OFFLOAD_FAIL; 1007 } 1008 1009 // Skip the rest of this function, continue to the next argument. 1010 continue; 1011 } 1012 1013 int Ret = OFFLOAD_SUCCESS; 1014 1015 if (ArgTypes[I] & OMP_TGT_MAPTYPE_NON_CONTIG) { 1016 __tgt_target_non_contig *NonContig = (__tgt_target_non_contig *)Args[I]; 1017 int32_t DimSize = ArgSizes[I]; 1018 uint64_t Size = 1019 NonContig[DimSize - 1].Count * NonContig[DimSize - 1].Stride; 1020 int32_t MergedDim = getNonContigMergedDimension(NonContig, DimSize); 1021 Ret = targetDataNonContiguous( 1022 loc, Device, ArgsBase[I], NonContig, Size, ArgTypes[I], 1023 /*current_dim=*/0, DimSize - MergedDim, /*offset=*/0, AsyncInfo); 1024 } else { 1025 Ret = targetDataContiguous(loc, Device, ArgsBase[I], Args[I], ArgSizes[I], 1026 ArgTypes[I], AsyncInfo); 1027 } 1028 if (Ret == OFFLOAD_FAIL) 1029 return OFFLOAD_FAIL; 1030 } 1031 return OFFLOAD_SUCCESS; 1032 } 1033 1034 static const unsigned LambdaMapping = OMP_TGT_MAPTYPE_PTR_AND_OBJ | 1035 OMP_TGT_MAPTYPE_LITERAL | 1036 OMP_TGT_MAPTYPE_IMPLICIT; 1037 static bool isLambdaMapping(int64_t Mapping) { 1038 return (Mapping & LambdaMapping) == LambdaMapping; 1039 } 1040 1041 namespace { 1042 /// Find the table information in the map or look it up in the translation 1043 /// tables. 1044 TableMap *getTableMap(void *HostPtr) { 1045 std::lock_guard<std::mutex> TblMapLock(PM->TblMapMtx); 1046 HostPtrToTableMapTy::iterator TableMapIt = 1047 PM->HostPtrToTableMap.find(HostPtr); 1048 1049 if (TableMapIt != PM->HostPtrToTableMap.end()) 1050 return &TableMapIt->second; 1051 1052 // We don't have a map. So search all the registered libraries. 1053 TableMap *TM = nullptr; 1054 std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx); 1055 for (HostEntriesBeginToTransTableTy::iterator Itr = 1056 PM->HostEntriesBeginToTransTable.begin(); 1057 Itr != PM->HostEntriesBeginToTransTable.end(); ++Itr) { 1058 // get the translation table (which contains all the good info). 1059 TranslationTable *TransTable = &Itr->second; 1060 // iterate over all the host table entries to see if we can locate the 1061 // host_ptr. 1062 __tgt_offload_entry *Cur = TransTable->HostTable.EntriesBegin; 1063 for (uint32_t I = 0; Cur < TransTable->HostTable.EntriesEnd; ++Cur, ++I) { 1064 if (Cur->addr != HostPtr) 1065 continue; 1066 // we got a match, now fill the HostPtrToTableMap so that we 1067 // may avoid this search next time. 1068 TM = &(PM->HostPtrToTableMap)[HostPtr]; 1069 TM->Table = TransTable; 1070 TM->Index = I; 1071 return TM; 1072 } 1073 } 1074 1075 return nullptr; 1076 } 1077 1078 /// Get loop trip count 1079 /// FIXME: This function will not work right if calling 1080 /// __kmpc_push_target_tripcount_mapper in one thread but doing offloading in 1081 /// another thread, which might occur when we call task yield. 1082 uint64_t getLoopTripCount(int64_t DeviceId) { 1083 DeviceTy &Device = *PM->Devices[DeviceId]; 1084 uint64_t LoopTripCount = 0; 1085 1086 { 1087 std::lock_guard<std::mutex> TblMapLock(PM->TblMapMtx); 1088 auto I = Device.LoopTripCnt.find(__kmpc_global_thread_num(NULL)); 1089 if (I != Device.LoopTripCnt.end()) { 1090 LoopTripCount = I->second; 1091 Device.LoopTripCnt.erase(I); 1092 DP("loop trip count is %" PRIu64 ".\n", LoopTripCount); 1093 } 1094 } 1095 1096 return LoopTripCount; 1097 } 1098 1099 /// A class manages private arguments in a target region. 1100 class PrivateArgumentManagerTy { 1101 /// A data structure for the information of first-private arguments. We can 1102 /// use this information to optimize data transfer by packing all 1103 /// first-private arguments and transfer them all at once. 1104 struct FirstPrivateArgInfoTy { 1105 /// The index of the element in \p TgtArgs corresponding to the argument 1106 const int Index; 1107 /// Host pointer begin 1108 const char *HstPtrBegin; 1109 /// Host pointer end 1110 const char *HstPtrEnd; 1111 /// Aligned size 1112 const int64_t AlignedSize; 1113 /// Host pointer name 1114 const map_var_info_t HstPtrName = nullptr; 1115 1116 FirstPrivateArgInfoTy(int Index, const void *HstPtr, int64_t Size, 1117 const map_var_info_t HstPtrName = nullptr) 1118 : Index(Index), HstPtrBegin(reinterpret_cast<const char *>(HstPtr)), 1119 HstPtrEnd(HstPtrBegin + Size), AlignedSize(Size + Size % Alignment), 1120 HstPtrName(HstPtrName) {} 1121 }; 1122 1123 /// A vector of target pointers for all private arguments 1124 std::vector<void *> TgtPtrs; 1125 1126 /// A vector of information of all first-private arguments to be packed 1127 std::vector<FirstPrivateArgInfoTy> FirstPrivateArgInfo; 1128 /// Host buffer for all arguments to be packed 1129 std::vector<char> FirstPrivateArgBuffer; 1130 /// The total size of all arguments to be packed 1131 int64_t FirstPrivateArgSize = 0; 1132 1133 /// A reference to the \p DeviceTy object 1134 DeviceTy &Device; 1135 /// A pointer to a \p AsyncInfoTy object 1136 AsyncInfoTy &AsyncInfo; 1137 1138 // TODO: What would be the best value here? Should we make it configurable? 1139 // If the size is larger than this threshold, we will allocate and transfer it 1140 // immediately instead of packing it. 1141 static constexpr const int64_t FirstPrivateArgSizeThreshold = 1024; 1142 1143 public: 1144 /// Constructor 1145 PrivateArgumentManagerTy(DeviceTy &Dev, AsyncInfoTy &AsyncInfo) 1146 : Device(Dev), AsyncInfo(AsyncInfo) {} 1147 1148 /// Add a private argument 1149 int addArg(void *HstPtr, int64_t ArgSize, int64_t ArgOffset, 1150 bool IsFirstPrivate, void *&TgtPtr, int TgtArgsIndex, 1151 const map_var_info_t HstPtrName = nullptr, 1152 const bool AllocImmediately = false) { 1153 // If the argument is not first-private, or its size is greater than a 1154 // predefined threshold, we will allocate memory and issue the transfer 1155 // immediately. 1156 if (ArgSize > FirstPrivateArgSizeThreshold || !IsFirstPrivate || 1157 AllocImmediately) { 1158 TgtPtr = Device.allocData(ArgSize, HstPtr); 1159 if (!TgtPtr) { 1160 DP("Data allocation for %sprivate array " DPxMOD " failed.\n", 1161 (IsFirstPrivate ? "first-" : ""), DPxPTR(HstPtr)); 1162 return OFFLOAD_FAIL; 1163 } 1164 #ifdef OMPTARGET_DEBUG 1165 void *TgtPtrBase = (void *)((intptr_t)TgtPtr + ArgOffset); 1166 DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD 1167 " for %sprivate array " DPxMOD " - pushing target argument " DPxMOD 1168 "\n", 1169 ArgSize, DPxPTR(TgtPtr), (IsFirstPrivate ? "first-" : ""), 1170 DPxPTR(HstPtr), DPxPTR(TgtPtrBase)); 1171 #endif 1172 // If first-private, copy data from host 1173 if (IsFirstPrivate) { 1174 DP("Submitting firstprivate data to the device.\n"); 1175 int Ret = Device.submitData(TgtPtr, HstPtr, ArgSize, AsyncInfo); 1176 if (Ret != OFFLOAD_SUCCESS) { 1177 DP("Copying data to device failed, failed.\n"); 1178 return OFFLOAD_FAIL; 1179 } 1180 } 1181 TgtPtrs.push_back(TgtPtr); 1182 } else { 1183 DP("Firstprivate array " DPxMOD " of size %" PRId64 " will be packed\n", 1184 DPxPTR(HstPtr), ArgSize); 1185 // When reach this point, the argument must meet all following 1186 // requirements: 1187 // 1. Its size does not exceed the threshold (see the comment for 1188 // FirstPrivateArgSizeThreshold); 1189 // 2. It must be first-private (needs to be mapped to target device). 1190 // We will pack all this kind of arguments to transfer them all at once 1191 // to reduce the number of data transfer. We will not take 1192 // non-first-private arguments, aka. private arguments that doesn't need 1193 // to be mapped to target device, into account because data allocation 1194 // can be very efficient with memory manager. 1195 1196 // Placeholder value 1197 TgtPtr = nullptr; 1198 FirstPrivateArgInfo.emplace_back(TgtArgsIndex, HstPtr, ArgSize, 1199 HstPtrName); 1200 FirstPrivateArgSize += FirstPrivateArgInfo.back().AlignedSize; 1201 } 1202 1203 return OFFLOAD_SUCCESS; 1204 } 1205 1206 /// Pack first-private arguments, replace place holder pointers in \p TgtArgs, 1207 /// and start the transfer. 1208 int packAndTransfer(std::vector<void *> &TgtArgs) { 1209 if (!FirstPrivateArgInfo.empty()) { 1210 assert(FirstPrivateArgSize != 0 && 1211 "FirstPrivateArgSize is 0 but FirstPrivateArgInfo is empty"); 1212 FirstPrivateArgBuffer.resize(FirstPrivateArgSize, 0); 1213 auto Itr = FirstPrivateArgBuffer.begin(); 1214 // Copy all host data to this buffer 1215 for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) { 1216 std::copy(Info.HstPtrBegin, Info.HstPtrEnd, Itr); 1217 Itr = std::next(Itr, Info.AlignedSize); 1218 } 1219 // Allocate target memory 1220 void *TgtPtr = 1221 Device.allocData(FirstPrivateArgSize, FirstPrivateArgBuffer.data()); 1222 if (TgtPtr == nullptr) { 1223 DP("Failed to allocate target memory for private arguments.\n"); 1224 return OFFLOAD_FAIL; 1225 } 1226 TgtPtrs.push_back(TgtPtr); 1227 DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD "\n", 1228 FirstPrivateArgSize, DPxPTR(TgtPtr)); 1229 // Transfer data to target device 1230 int Ret = Device.submitData(TgtPtr, FirstPrivateArgBuffer.data(), 1231 FirstPrivateArgSize, AsyncInfo); 1232 if (Ret != OFFLOAD_SUCCESS) { 1233 DP("Failed to submit data of private arguments.\n"); 1234 return OFFLOAD_FAIL; 1235 } 1236 // Fill in all placeholder pointers 1237 auto TP = reinterpret_cast<uintptr_t>(TgtPtr); 1238 for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) { 1239 void *&Ptr = TgtArgs[Info.Index]; 1240 assert(Ptr == nullptr && "Target pointer is already set by mistaken"); 1241 Ptr = reinterpret_cast<void *>(TP); 1242 TP += Info.AlignedSize; 1243 DP("Firstprivate array " DPxMOD " of size %" PRId64 " mapped to " DPxMOD 1244 "\n", 1245 DPxPTR(Info.HstPtrBegin), Info.HstPtrEnd - Info.HstPtrBegin, 1246 DPxPTR(Ptr)); 1247 } 1248 } 1249 1250 return OFFLOAD_SUCCESS; 1251 } 1252 1253 /// Free all target memory allocated for private arguments 1254 int free() { 1255 for (void *P : TgtPtrs) { 1256 int Ret = Device.deleteData(P); 1257 if (Ret != OFFLOAD_SUCCESS) { 1258 DP("Deallocation of (first-)private arrays failed.\n"); 1259 return OFFLOAD_FAIL; 1260 } 1261 } 1262 1263 TgtPtrs.clear(); 1264 1265 return OFFLOAD_SUCCESS; 1266 } 1267 }; 1268 1269 /// Process data before launching the kernel, including calling targetDataBegin 1270 /// to map and transfer data to target device, transferring (first-)private 1271 /// variables. 1272 static int processDataBefore(ident_t *loc, int64_t DeviceId, void *HostPtr, 1273 int32_t ArgNum, void **ArgBases, void **Args, 1274 int64_t *ArgSizes, int64_t *ArgTypes, 1275 map_var_info_t *ArgNames, void **ArgMappers, 1276 std::vector<void *> &TgtArgs, 1277 std::vector<ptrdiff_t> &TgtOffsets, 1278 PrivateArgumentManagerTy &PrivateArgumentManager, 1279 AsyncInfoTy &AsyncInfo) { 1280 TIMESCOPE_WITH_NAME_AND_IDENT("mappingBeforeTargetRegion", loc); 1281 DeviceTy &Device = *PM->Devices[DeviceId]; 1282 int Ret = targetDataBegin(loc, Device, ArgNum, ArgBases, Args, ArgSizes, 1283 ArgTypes, ArgNames, ArgMappers, AsyncInfo); 1284 if (Ret != OFFLOAD_SUCCESS) { 1285 REPORT("Call to targetDataBegin failed, abort target.\n"); 1286 return OFFLOAD_FAIL; 1287 } 1288 1289 // List of (first-)private arrays allocated for this target region 1290 std::vector<int> TgtArgsPositions(ArgNum, -1); 1291 1292 for (int32_t I = 0; I < ArgNum; ++I) { 1293 if (!(ArgTypes[I] & OMP_TGT_MAPTYPE_TARGET_PARAM)) { 1294 // This is not a target parameter, do not push it into TgtArgs. 1295 // Check for lambda mapping. 1296 if (isLambdaMapping(ArgTypes[I])) { 1297 assert((ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) && 1298 "PTR_AND_OBJ must be also MEMBER_OF."); 1299 unsigned Idx = getParentIndex(ArgTypes[I]); 1300 int TgtIdx = TgtArgsPositions[Idx]; 1301 assert(TgtIdx != -1 && "Base address must be translated already."); 1302 // The parent lambda must be processed already and it must be the last 1303 // in TgtArgs and TgtOffsets arrays. 1304 void *HstPtrVal = Args[I]; 1305 void *HstPtrBegin = ArgBases[I]; 1306 void *HstPtrBase = Args[Idx]; 1307 bool IsLast, IsHostPtr; // IsLast is unused. 1308 void *TgtPtrBase = 1309 (void *)((intptr_t)TgtArgs[TgtIdx] + TgtOffsets[TgtIdx]); 1310 DP("Parent lambda base " DPxMOD "\n", DPxPTR(TgtPtrBase)); 1311 uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase; 1312 void *TgtPtrBegin = (void *)((uintptr_t)TgtPtrBase + Delta); 1313 void *&PointerTgtPtrBegin = AsyncInfo.getVoidPtrLocation(); 1314 TargetPointerResultTy TPR = Device.getTgtPtrBegin( 1315 HstPtrVal, ArgSizes[I], IsLast, /*UpdateRefCount=*/false, 1316 /*UseHoldRefCount=*/false, IsHostPtr); 1317 PointerTgtPtrBegin = TPR.TargetPointer; 1318 if (!PointerTgtPtrBegin) { 1319 DP("No lambda captured variable mapped (" DPxMOD ") - ignored\n", 1320 DPxPTR(HstPtrVal)); 1321 continue; 1322 } 1323 if (IsHostPtr) { 1324 DP("Unified memory is active, no need to map lambda captured" 1325 "variable (" DPxMOD ")\n", 1326 DPxPTR(HstPtrVal)); 1327 continue; 1328 } 1329 DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n", 1330 DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin)); 1331 Ret = Device.submitData(TgtPtrBegin, &PointerTgtPtrBegin, 1332 sizeof(void *), AsyncInfo); 1333 if (Ret != OFFLOAD_SUCCESS) { 1334 REPORT("Copying data to device failed.\n"); 1335 return OFFLOAD_FAIL; 1336 } 1337 } 1338 continue; 1339 } 1340 void *HstPtrBegin = Args[I]; 1341 void *HstPtrBase = ArgBases[I]; 1342 void *TgtPtrBegin; 1343 map_var_info_t HstPtrName = (!ArgNames) ? nullptr : ArgNames[I]; 1344 ptrdiff_t TgtBaseOffset; 1345 bool IsLast, IsHostPtr; // unused. 1346 TargetPointerResultTy TPR; 1347 if (ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) { 1348 DP("Forwarding first-private value " DPxMOD " to the target construct\n", 1349 DPxPTR(HstPtrBase)); 1350 TgtPtrBegin = HstPtrBase; 1351 TgtBaseOffset = 0; 1352 } else if (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE) { 1353 TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin; 1354 const bool IsFirstPrivate = (ArgTypes[I] & OMP_TGT_MAPTYPE_TO); 1355 // If there is a next argument and it depends on the current one, we need 1356 // to allocate the private memory immediately. If this is not the case, 1357 // then the argument can be marked for optimization and packed with the 1358 // other privates. 1359 const bool AllocImmediately = 1360 (I < ArgNum - 1 && (ArgTypes[I + 1] & OMP_TGT_MAPTYPE_MEMBER_OF)); 1361 Ret = PrivateArgumentManager.addArg( 1362 HstPtrBegin, ArgSizes[I], TgtBaseOffset, IsFirstPrivate, TgtPtrBegin, 1363 TgtArgs.size(), HstPtrName, AllocImmediately); 1364 if (Ret != OFFLOAD_SUCCESS) { 1365 REPORT("Failed to process %sprivate argument " DPxMOD "\n", 1366 (IsFirstPrivate ? "first-" : ""), DPxPTR(HstPtrBegin)); 1367 return OFFLOAD_FAIL; 1368 } 1369 } else { 1370 if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) 1371 HstPtrBase = *reinterpret_cast<void **>(HstPtrBase); 1372 TPR = Device.getTgtPtrBegin(HstPtrBegin, ArgSizes[I], IsLast, 1373 /*UpdateRefCount=*/false, 1374 /*UseHoldRefCount=*/false, IsHostPtr); 1375 TgtPtrBegin = TPR.TargetPointer; 1376 TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin; 1377 #ifdef OMPTARGET_DEBUG 1378 void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset); 1379 DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD "\n", 1380 DPxPTR(TgtPtrBase), DPxPTR(HstPtrBegin)); 1381 #endif 1382 } 1383 TgtArgsPositions[I] = TgtArgs.size(); 1384 TgtArgs.push_back(TgtPtrBegin); 1385 TgtOffsets.push_back(TgtBaseOffset); 1386 } 1387 1388 assert(TgtArgs.size() == TgtOffsets.size() && 1389 "Size mismatch in arguments and offsets"); 1390 1391 // Pack and transfer first-private arguments 1392 Ret = PrivateArgumentManager.packAndTransfer(TgtArgs); 1393 if (Ret != OFFLOAD_SUCCESS) { 1394 DP("Failed to pack and transfer first private arguments\n"); 1395 return OFFLOAD_FAIL; 1396 } 1397 1398 return OFFLOAD_SUCCESS; 1399 } 1400 1401 /// Process data after launching the kernel, including transferring data back to 1402 /// host if needed and deallocating target memory of (first-)private variables. 1403 static int processDataAfter(ident_t *loc, int64_t DeviceId, void *HostPtr, 1404 int32_t ArgNum, void **ArgBases, void **Args, 1405 int64_t *ArgSizes, int64_t *ArgTypes, 1406 map_var_info_t *ArgNames, void **ArgMappers, 1407 PrivateArgumentManagerTy &PrivateArgumentManager, 1408 AsyncInfoTy &AsyncInfo) { 1409 TIMESCOPE_WITH_NAME_AND_IDENT("mappingAfterTargetRegion", loc); 1410 DeviceTy &Device = *PM->Devices[DeviceId]; 1411 1412 // Move data from device. 1413 int Ret = targetDataEnd(loc, Device, ArgNum, ArgBases, Args, ArgSizes, 1414 ArgTypes, ArgNames, ArgMappers, AsyncInfo); 1415 if (Ret != OFFLOAD_SUCCESS) { 1416 REPORT("Call to targetDataEnd failed, abort target.\n"); 1417 return OFFLOAD_FAIL; 1418 } 1419 1420 // Free target memory for private arguments 1421 Ret = PrivateArgumentManager.free(); 1422 if (Ret != OFFLOAD_SUCCESS) { 1423 REPORT("Failed to deallocate target memory for private args\n"); 1424 return OFFLOAD_FAIL; 1425 } 1426 1427 return OFFLOAD_SUCCESS; 1428 } 1429 } // namespace 1430 1431 /// performs the same actions as data_begin in case arg_num is 1432 /// non-zero and initiates run of the offloaded region on the target platform; 1433 /// if arg_num is non-zero after the region execution is done it also 1434 /// performs the same action as data_update and data_end above. This function 1435 /// returns 0 if it was able to transfer the execution to a target and an 1436 /// integer different from zero otherwise. 1437 int target(ident_t *loc, DeviceTy &Device, void *HostPtr, int32_t ArgNum, 1438 void **ArgBases, void **Args, int64_t *ArgSizes, int64_t *ArgTypes, 1439 map_var_info_t *ArgNames, void **ArgMappers, int32_t TeamNum, 1440 int32_t ThreadLimit, int IsTeamConstruct, AsyncInfoTy &AsyncInfo) { 1441 int32_t DeviceId = Device.DeviceID; 1442 1443 TableMap *TM = getTableMap(HostPtr); 1444 // No map for this host pointer found! 1445 if (!TM) { 1446 REPORT("Host ptr " DPxMOD " does not have a matching target pointer.\n", 1447 DPxPTR(HostPtr)); 1448 return OFFLOAD_FAIL; 1449 } 1450 1451 // get target table. 1452 __tgt_target_table *TargetTable = nullptr; 1453 { 1454 std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx); 1455 assert(TM->Table->TargetsTable.size() > (size_t)DeviceId && 1456 "Not expecting a device ID outside the table's bounds!"); 1457 TargetTable = TM->Table->TargetsTable[DeviceId]; 1458 } 1459 assert(TargetTable && "Global data has not been mapped\n"); 1460 1461 // We need to keep bases and offsets separate. Sometimes (e.g. in OpenCL) we 1462 // need to manifest base pointers prior to launching a kernel. Even if we have 1463 // mapped an object only partially, e.g. A[N:M], although the kernel is 1464 // expected to access elements starting at address &A[N] and beyond, we still 1465 // need to manifest the base of the array &A[0]. In other cases, e.g. the COI 1466 // API, we need the begin address itself, i.e. &A[N], as the API operates on 1467 // begin addresses, not bases. That's why we pass args and offsets as two 1468 // separate entities so that each plugin can do what it needs. This behavior 1469 // was introdued via https://reviews.llvm.org/D33028 and commit 1546d319244c. 1470 std::vector<void *> TgtArgs; 1471 std::vector<ptrdiff_t> TgtOffsets; 1472 1473 PrivateArgumentManagerTy PrivateArgumentManager(Device, AsyncInfo); 1474 1475 int Ret; 1476 if (ArgNum) { 1477 // Process data, such as data mapping, before launching the kernel 1478 Ret = processDataBefore(loc, DeviceId, HostPtr, ArgNum, ArgBases, Args, 1479 ArgSizes, ArgTypes, ArgNames, ArgMappers, TgtArgs, 1480 TgtOffsets, PrivateArgumentManager, AsyncInfo); 1481 if (Ret != OFFLOAD_SUCCESS) { 1482 REPORT("Failed to process data before launching the kernel.\n"); 1483 return OFFLOAD_FAIL; 1484 } 1485 } 1486 1487 // Launch device execution. 1488 void *TgtEntryPtr = TargetTable->EntriesBegin[TM->Index].addr; 1489 DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n", 1490 TargetTable->EntriesBegin[TM->Index].name, DPxPTR(TgtEntryPtr), TM->Index); 1491 1492 { 1493 TIMESCOPE_WITH_NAME_AND_IDENT( 1494 IsTeamConstruct ? "runTargetTeamRegion" : "runTargetRegion", loc); 1495 if (IsTeamConstruct) 1496 Ret = Device.runTeamRegion(TgtEntryPtr, &TgtArgs[0], &TgtOffsets[0], 1497 TgtArgs.size(), TeamNum, ThreadLimit, 1498 getLoopTripCount(DeviceId), AsyncInfo); 1499 else 1500 Ret = Device.runRegion(TgtEntryPtr, &TgtArgs[0], &TgtOffsets[0], 1501 TgtArgs.size(), AsyncInfo); 1502 } 1503 1504 if (Ret != OFFLOAD_SUCCESS) { 1505 REPORT("Executing target region abort target.\n"); 1506 return OFFLOAD_FAIL; 1507 } 1508 1509 if (ArgNum) { 1510 // Transfer data back and deallocate target memory for (first-)private 1511 // variables 1512 Ret = processDataAfter(loc, DeviceId, HostPtr, ArgNum, ArgBases, Args, 1513 ArgSizes, ArgTypes, ArgNames, ArgMappers, 1514 PrivateArgumentManager, AsyncInfo); 1515 if (Ret != OFFLOAD_SUCCESS) { 1516 REPORT("Failed to process data after launching the kernel.\n"); 1517 return OFFLOAD_FAIL; 1518 } 1519 } 1520 1521 return OFFLOAD_SUCCESS; 1522 } 1523