1 //===------ omptarget.cpp - Target independent OpenMP target RTL -- C++ -*-===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is dual licensed under the MIT and the University of Illinois Open 6 // Source Licenses. See LICENSE.txt for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // Implementation of the interface to be used by Clang during the codegen of a 11 // target region. 12 // 13 //===----------------------------------------------------------------------===// 14 15 #include <omptarget.h> 16 17 #include "device.h" 18 #include "private.h" 19 #include "rtl.h" 20 21 #include <cassert> 22 #include <vector> 23 24 #ifdef OMPTARGET_DEBUG 25 int DebugLevel = 0; 26 #endif // OMPTARGET_DEBUG 27 28 29 30 /* All begin addresses for partially mapped structs must be 8-aligned in order 31 * to ensure proper alignment of members. E.g. 32 * 33 * struct S { 34 * int a; // 4-aligned 35 * int b; // 4-aligned 36 * int *p; // 8-aligned 37 * } s1; 38 * ... 39 * #pragma omp target map(tofrom: s1.b, s1.p[0:N]) 40 * { 41 * s1.b = 5; 42 * for (int i...) s1.p[i] = ...; 43 * } 44 * 45 * Here we are mapping s1 starting from member b, so BaseAddress=&s1=&s1.a and 46 * BeginAddress=&s1.b. Let's assume that the struct begins at address 0x100, 47 * then &s1.a=0x100, &s1.b=0x104, &s1.p=0x108. Each member obeys the alignment 48 * requirements for its type. Now, when we allocate memory on the device, in 49 * CUDA's case cuMemAlloc() returns an address which is at least 256-aligned. 50 * This means that the chunk of the struct on the device will start at a 51 * 256-aligned address, let's say 0x200. Then the address of b will be 0x200 and 52 * address of p will be a misaligned 0x204 (on the host there was no need to add 53 * padding between b and p, so p comes exactly 4 bytes after b). If the device 54 * kernel tries to access s1.p, a misaligned address error occurs (as reported 55 * by the CUDA plugin). By padding the begin address down to a multiple of 8 and 56 * extending the size of the allocated chuck accordingly, the chuck on the 57 * device will start at 0x200 with the padding (4 bytes), then &s1.b=0x204 and 58 * &s1.p=0x208, as they should be to satisfy the alignment requirements. 59 */ 60 static const int64_t alignment = 8; 61 62 /// Map global data and execute pending ctors 63 static int InitLibrary(DeviceTy& Device) { 64 /* 65 * Map global data 66 */ 67 int32_t device_id = Device.DeviceID; 68 int rc = OFFLOAD_SUCCESS; 69 70 Device.PendingGlobalsMtx.lock(); 71 TrlTblMtx.lock(); 72 for (HostEntriesBeginToTransTableTy::iterator 73 ii = HostEntriesBeginToTransTable.begin(); 74 ii != HostEntriesBeginToTransTable.end(); ++ii) { 75 TranslationTable *TransTable = &ii->second; 76 if (TransTable->TargetsTable[device_id] != 0) { 77 // Library entries have already been processed 78 continue; 79 } 80 81 // 1) get image. 82 assert(TransTable->TargetsImages.size() > (size_t)device_id && 83 "Not expecting a device ID outside the table's bounds!"); 84 __tgt_device_image *img = TransTable->TargetsImages[device_id]; 85 if (!img) { 86 DP("No image loaded for device id %d.\n", device_id); 87 rc = OFFLOAD_FAIL; 88 break; 89 } 90 // 2) load image into the target table. 91 __tgt_target_table *TargetTable = 92 TransTable->TargetsTable[device_id] = Device.load_binary(img); 93 // Unable to get table for this image: invalidate image and fail. 94 if (!TargetTable) { 95 DP("Unable to generate entries table for device id %d.\n", device_id); 96 TransTable->TargetsImages[device_id] = 0; 97 rc = OFFLOAD_FAIL; 98 break; 99 } 100 101 // Verify whether the two table sizes match. 102 size_t hsize = 103 TransTable->HostTable.EntriesEnd - TransTable->HostTable.EntriesBegin; 104 size_t tsize = TargetTable->EntriesEnd - TargetTable->EntriesBegin; 105 106 // Invalid image for these host entries! 107 if (hsize != tsize) { 108 DP("Host and Target tables mismatch for device id %d [%zx != %zx].\n", 109 device_id, hsize, tsize); 110 TransTable->TargetsImages[device_id] = 0; 111 TransTable->TargetsTable[device_id] = 0; 112 rc = OFFLOAD_FAIL; 113 break; 114 } 115 116 // process global data that needs to be mapped. 117 Device.DataMapMtx.lock(); 118 __tgt_target_table *HostTable = &TransTable->HostTable; 119 for (__tgt_offload_entry *CurrDeviceEntry = TargetTable->EntriesBegin, 120 *CurrHostEntry = HostTable->EntriesBegin, 121 *EntryDeviceEnd = TargetTable->EntriesEnd; 122 CurrDeviceEntry != EntryDeviceEnd; 123 CurrDeviceEntry++, CurrHostEntry++) { 124 if (CurrDeviceEntry->size != 0) { 125 // has data. 126 assert(CurrDeviceEntry->size == CurrHostEntry->size && 127 "data size mismatch"); 128 129 // Fortran may use multiple weak declarations for the same symbol, 130 // therefore we must allow for multiple weak symbols to be loaded from 131 // the fat binary. Treat these mappings as any other "regular" mapping. 132 // Add entry to map. 133 if (Device.getTgtPtrBegin(CurrHostEntry->addr, CurrHostEntry->size)) 134 continue; 135 DP("Add mapping from host " DPxMOD " to device " DPxMOD " with size %zu" 136 "\n", DPxPTR(CurrHostEntry->addr), DPxPTR(CurrDeviceEntry->addr), 137 CurrDeviceEntry->size); 138 Device.HostDataToTargetMap.push_front(HostDataToTargetTy( 139 (uintptr_t)CurrHostEntry->addr /*HstPtrBase*/, 140 (uintptr_t)CurrHostEntry->addr /*HstPtrBegin*/, 141 (uintptr_t)CurrHostEntry->addr + CurrHostEntry->size /*HstPtrEnd*/, 142 (uintptr_t)CurrDeviceEntry->addr /*TgtPtrBegin*/, 143 INF_REF_CNT /*RefCount*/)); 144 } 145 } 146 Device.DataMapMtx.unlock(); 147 } 148 TrlTblMtx.unlock(); 149 150 if (rc != OFFLOAD_SUCCESS) { 151 Device.PendingGlobalsMtx.unlock(); 152 return rc; 153 } 154 155 /* 156 * Run ctors for static objects 157 */ 158 if (!Device.PendingCtorsDtors.empty()) { 159 // Call all ctors for all libraries registered so far 160 for (auto &lib : Device.PendingCtorsDtors) { 161 if (!lib.second.PendingCtors.empty()) { 162 DP("Has pending ctors... call now\n"); 163 for (auto &entry : lib.second.PendingCtors) { 164 void *ctor = entry; 165 int rc = target(device_id, ctor, 0, NULL, NULL, NULL, 166 NULL, 1, 1, true /*team*/); 167 if (rc != OFFLOAD_SUCCESS) { 168 DP("Running ctor " DPxMOD " failed.\n", DPxPTR(ctor)); 169 Device.PendingGlobalsMtx.unlock(); 170 return OFFLOAD_FAIL; 171 } 172 } 173 // Clear the list to indicate that this device has been used 174 lib.second.PendingCtors.clear(); 175 DP("Done with pending ctors for lib " DPxMOD "\n", DPxPTR(lib.first)); 176 } 177 } 178 } 179 Device.HasPendingGlobals = false; 180 Device.PendingGlobalsMtx.unlock(); 181 182 return OFFLOAD_SUCCESS; 183 } 184 185 // Check whether a device has been initialized, global ctors have been 186 // executed and global data has been mapped; do so if not already done. 187 int CheckDeviceAndCtors(int64_t device_id) { 188 // Is device ready? 189 if (!device_is_ready(device_id)) { 190 DP("Device %" PRId64 " is not ready.\n", device_id); 191 return OFFLOAD_FAIL; 192 } 193 194 // Get device info. 195 DeviceTy &Device = Devices[device_id]; 196 197 // Check whether global data has been mapped for this device 198 Device.PendingGlobalsMtx.lock(); 199 bool hasPendingGlobals = Device.HasPendingGlobals; 200 Device.PendingGlobalsMtx.unlock(); 201 if (hasPendingGlobals && InitLibrary(Device) != OFFLOAD_SUCCESS) { 202 DP("Failed to init globals on device %" PRId64 "\n", device_id); 203 return OFFLOAD_FAIL; 204 } 205 206 return OFFLOAD_SUCCESS; 207 } 208 209 static int32_t member_of(int64_t type) { 210 return ((type & OMP_TGT_MAPTYPE_MEMBER_OF) >> 48) - 1; 211 } 212 213 /// Internal function to do the mapping and transfer the data to the device 214 int target_data_begin(DeviceTy &Device, int32_t arg_num, 215 void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) { 216 // process each input. 217 for (int32_t i = 0; i < arg_num; ++i) { 218 // Ignore private variables and arrays - there is no mapping for them. 219 if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) || 220 (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE)) 221 continue; 222 223 void *HstPtrBegin = args[i]; 224 void *HstPtrBase = args_base[i]; 225 int64_t data_size = arg_sizes[i]; 226 227 // Adjust for proper alignment if this is a combined entry (for structs). 228 // Look at the next argument - if that is MEMBER_OF this one, then this one 229 // is a combined entry. 230 int64_t padding = 0; 231 const int next_i = i+1; 232 if (member_of(arg_types[i]) < 0 && next_i < arg_num && 233 member_of(arg_types[next_i]) == i) { 234 padding = (int64_t)HstPtrBegin % alignment; 235 if (padding) { 236 DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD 237 "\n", padding, DPxPTR(HstPtrBegin)); 238 HstPtrBegin = (char *) HstPtrBegin - padding; 239 data_size += padding; 240 } 241 } 242 243 // Address of pointer on the host and device, respectively. 244 void *Pointer_HstPtrBegin, *Pointer_TgtPtrBegin; 245 bool IsNew, Pointer_IsNew; 246 bool IsImplicit = arg_types[i] & OMP_TGT_MAPTYPE_IMPLICIT; 247 // UpdateRef is based on MEMBER_OF instead of TARGET_PARAM because if we 248 // have reached this point via __tgt_target_data_begin and not __tgt_target 249 // then no argument is marked as TARGET_PARAM ("omp target data map" is not 250 // associated with a target region, so there are no target parameters). This 251 // may be considered a hack, we could revise the scheme in the future. 252 bool UpdateRef = !(arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF); 253 if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) { 254 DP("Has a pointer entry: \n"); 255 // base is address of pointer. 256 Pointer_TgtPtrBegin = Device.getOrAllocTgtPtr(HstPtrBase, HstPtrBase, 257 sizeof(void *), Pointer_IsNew, IsImplicit, UpdateRef); 258 if (!Pointer_TgtPtrBegin) { 259 DP("Call to getOrAllocTgtPtr returned null pointer (device failure or " 260 "illegal mapping).\n"); 261 return OFFLOAD_FAIL; 262 } 263 DP("There are %zu bytes allocated at target address " DPxMOD " - is%s new" 264 "\n", sizeof(void *), DPxPTR(Pointer_TgtPtrBegin), 265 (Pointer_IsNew ? "" : " not")); 266 Pointer_HstPtrBegin = HstPtrBase; 267 // modify current entry. 268 HstPtrBase = *(void **)HstPtrBase; 269 UpdateRef = true; // subsequently update ref count of pointee 270 } 271 272 void *TgtPtrBegin = Device.getOrAllocTgtPtr(HstPtrBegin, HstPtrBase, 273 data_size, IsNew, IsImplicit, UpdateRef); 274 if (!TgtPtrBegin && data_size) { 275 // If data_size==0, then the argument could be a zero-length pointer to 276 // NULL, so getOrAlloc() returning NULL is not an error. 277 DP("Call to getOrAllocTgtPtr returned null pointer (device failure or " 278 "illegal mapping).\n"); 279 } 280 DP("There are %" PRId64 " bytes allocated at target address " DPxMOD 281 " - is%s new\n", data_size, DPxPTR(TgtPtrBegin), 282 (IsNew ? "" : " not")); 283 284 if (arg_types[i] & OMP_TGT_MAPTYPE_RETURN_PARAM) { 285 uintptr_t Delta = (uintptr_t)HstPtrBegin - (uintptr_t)HstPtrBase; 286 void *TgtPtrBase = (void *)((uintptr_t)TgtPtrBegin - Delta); 287 DP("Returning device pointer " DPxMOD "\n", DPxPTR(TgtPtrBase)); 288 args_base[i] = TgtPtrBase; 289 } 290 291 if (arg_types[i] & OMP_TGT_MAPTYPE_TO) { 292 bool copy = false; 293 if (IsNew || (arg_types[i] & OMP_TGT_MAPTYPE_ALWAYS)) { 294 copy = true; 295 } else if (arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) { 296 // Copy data only if the "parent" struct has RefCount==1. 297 int32_t parent_idx = member_of(arg_types[i]); 298 long parent_rc = Device.getMapEntryRefCnt(args[parent_idx]); 299 assert(parent_rc > 0 && "parent struct not found"); 300 if (parent_rc == 1) { 301 copy = true; 302 } 303 } 304 305 if (copy) { 306 DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n", 307 data_size, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin)); 308 int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, data_size); 309 if (rt != OFFLOAD_SUCCESS) { 310 DP("Copying data to device failed.\n"); 311 return OFFLOAD_FAIL; 312 } 313 } 314 } 315 316 if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) { 317 DP("Update pointer (" DPxMOD ") -> [" DPxMOD "]\n", 318 DPxPTR(Pointer_TgtPtrBegin), DPxPTR(TgtPtrBegin)); 319 uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase; 320 void *TgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta); 321 int rt = Device.data_submit(Pointer_TgtPtrBegin, &TgtPtrBase, 322 sizeof(void *)); 323 if (rt != OFFLOAD_SUCCESS) { 324 DP("Copying data to device failed.\n"); 325 return OFFLOAD_FAIL; 326 } 327 // create shadow pointers for this entry 328 Device.ShadowMtx.lock(); 329 Device.ShadowPtrMap[Pointer_HstPtrBegin] = {HstPtrBase, 330 Pointer_TgtPtrBegin, TgtPtrBase}; 331 Device.ShadowMtx.unlock(); 332 } 333 } 334 335 return OFFLOAD_SUCCESS; 336 } 337 338 /// Internal function to undo the mapping and retrieve the data from the device. 339 int target_data_end(DeviceTy &Device, int32_t arg_num, void **args_base, 340 void **args, int64_t *arg_sizes, int64_t *arg_types) { 341 // process each input. 342 for (int32_t i = arg_num - 1; i >= 0; --i) { 343 // Ignore private variables and arrays - there is no mapping for them. 344 // Also, ignore the use_device_ptr directive, it has no effect here. 345 if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) || 346 (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE)) 347 continue; 348 349 void *HstPtrBegin = args[i]; 350 int64_t data_size = arg_sizes[i]; 351 // Adjust for proper alignment if this is a combined entry (for structs). 352 // Look at the next argument - if that is MEMBER_OF this one, then this one 353 // is a combined entry. 354 int64_t padding = 0; 355 const int next_i = i+1; 356 if (member_of(arg_types[i]) < 0 && next_i < arg_num && 357 member_of(arg_types[next_i]) == i) { 358 padding = (int64_t)HstPtrBegin % alignment; 359 if (padding) { 360 DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD 361 "\n", padding, DPxPTR(HstPtrBegin)); 362 HstPtrBegin = (char *) HstPtrBegin - padding; 363 data_size += padding; 364 } 365 } 366 367 bool IsLast; 368 bool UpdateRef = !(arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) || 369 (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ); 370 bool ForceDelete = arg_types[i] & OMP_TGT_MAPTYPE_DELETE; 371 372 // If PTR_AND_OBJ, HstPtrBegin is address of pointee 373 void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, data_size, IsLast, 374 UpdateRef); 375 DP("There are %" PRId64 " bytes allocated at target address " DPxMOD 376 " - is%s last\n", data_size, DPxPTR(TgtPtrBegin), 377 (IsLast ? "" : " not")); 378 379 bool DelEntry = IsLast || ForceDelete; 380 381 if ((arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) && 382 !(arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) { 383 DelEntry = false; // protect parent struct from being deallocated 384 } 385 386 if ((arg_types[i] & OMP_TGT_MAPTYPE_FROM) || DelEntry) { 387 // Move data back to the host 388 if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) { 389 bool Always = arg_types[i] & OMP_TGT_MAPTYPE_ALWAYS; 390 bool CopyMember = false; 391 if ((arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) && 392 !(arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) { 393 // Copy data only if the "parent" struct has RefCount==1. 394 int32_t parent_idx = member_of(arg_types[i]); 395 long parent_rc = Device.getMapEntryRefCnt(args[parent_idx]); 396 assert(parent_rc > 0 && "parent struct not found"); 397 if (parent_rc == 1) { 398 CopyMember = true; 399 } 400 } 401 402 if (DelEntry || Always || CopyMember) { 403 DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n", 404 data_size, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin)); 405 int rt = Device.data_retrieve(HstPtrBegin, TgtPtrBegin, data_size); 406 if (rt != OFFLOAD_SUCCESS) { 407 DP("Copying data from device failed.\n"); 408 return OFFLOAD_FAIL; 409 } 410 } 411 } 412 413 // If we copied back to the host a struct/array containing pointers, we 414 // need to restore the original host pointer values from their shadow 415 // copies. If the struct is going to be deallocated, remove any remaining 416 // shadow pointer entries for this struct. 417 uintptr_t lb = (uintptr_t) HstPtrBegin; 418 uintptr_t ub = (uintptr_t) HstPtrBegin + data_size; 419 Device.ShadowMtx.lock(); 420 for (ShadowPtrListTy::iterator it = Device.ShadowPtrMap.begin(); 421 it != Device.ShadowPtrMap.end();) { 422 void **ShadowHstPtrAddr = (void**) it->first; 423 424 // An STL map is sorted on its keys; use this property 425 // to quickly determine when to break out of the loop. 426 if ((uintptr_t) ShadowHstPtrAddr < lb) { 427 ++it; 428 continue; 429 } 430 if ((uintptr_t) ShadowHstPtrAddr >= ub) 431 break; 432 433 // If we copied the struct to the host, we need to restore the pointer. 434 if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) { 435 DP("Restoring original host pointer value " DPxMOD " for host " 436 "pointer " DPxMOD "\n", DPxPTR(it->second.HstPtrVal), 437 DPxPTR(ShadowHstPtrAddr)); 438 *ShadowHstPtrAddr = it->second.HstPtrVal; 439 } 440 // If the struct is to be deallocated, remove the shadow entry. 441 if (DelEntry) { 442 DP("Removing shadow pointer " DPxMOD "\n", DPxPTR(ShadowHstPtrAddr)); 443 it = Device.ShadowPtrMap.erase(it); 444 } else { 445 ++it; 446 } 447 } 448 Device.ShadowMtx.unlock(); 449 450 // Deallocate map 451 if (DelEntry) { 452 int rt = Device.deallocTgtPtr(HstPtrBegin, data_size, ForceDelete); 453 if (rt != OFFLOAD_SUCCESS) { 454 DP("Deallocating data from device failed.\n"); 455 return OFFLOAD_FAIL; 456 } 457 } 458 } 459 } 460 461 return OFFLOAD_SUCCESS; 462 } 463 464 /// Internal function to pass data to/from the target. 465 int target_data_update(DeviceTy &Device, int32_t arg_num, 466 void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) { 467 // process each input. 468 for (int32_t i = 0; i < arg_num; ++i) { 469 if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) || 470 (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE)) 471 continue; 472 473 void *HstPtrBegin = args[i]; 474 int64_t MapSize = arg_sizes[i]; 475 bool IsLast; 476 void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, MapSize, IsLast, 477 false); 478 if (!TgtPtrBegin) { 479 DP("hst data:" DPxMOD " not found, becomes a noop\n", DPxPTR(HstPtrBegin)); 480 continue; 481 } 482 483 if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) { 484 DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n", 485 arg_sizes[i], DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin)); 486 int rt = Device.data_retrieve(HstPtrBegin, TgtPtrBegin, MapSize); 487 if (rt != OFFLOAD_SUCCESS) { 488 DP("Copying data from device failed.\n"); 489 return OFFLOAD_FAIL; 490 } 491 492 uintptr_t lb = (uintptr_t) HstPtrBegin; 493 uintptr_t ub = (uintptr_t) HstPtrBegin + MapSize; 494 Device.ShadowMtx.lock(); 495 for (ShadowPtrListTy::iterator it = Device.ShadowPtrMap.begin(); 496 it != Device.ShadowPtrMap.end(); ++it) { 497 void **ShadowHstPtrAddr = (void**) it->first; 498 if ((uintptr_t) ShadowHstPtrAddr < lb) 499 continue; 500 if ((uintptr_t) ShadowHstPtrAddr >= ub) 501 break; 502 DP("Restoring original host pointer value " DPxMOD " for host pointer " 503 DPxMOD "\n", DPxPTR(it->second.HstPtrVal), 504 DPxPTR(ShadowHstPtrAddr)); 505 *ShadowHstPtrAddr = it->second.HstPtrVal; 506 } 507 Device.ShadowMtx.unlock(); 508 } 509 510 if (arg_types[i] & OMP_TGT_MAPTYPE_TO) { 511 DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n", 512 arg_sizes[i], DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin)); 513 int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, MapSize); 514 if (rt != OFFLOAD_SUCCESS) { 515 DP("Copying data to device failed.\n"); 516 return OFFLOAD_FAIL; 517 } 518 uintptr_t lb = (uintptr_t) HstPtrBegin; 519 uintptr_t ub = (uintptr_t) HstPtrBegin + MapSize; 520 Device.ShadowMtx.lock(); 521 for (ShadowPtrListTy::iterator it = Device.ShadowPtrMap.begin(); 522 it != Device.ShadowPtrMap.end(); ++it) { 523 void **ShadowHstPtrAddr = (void**) it->first; 524 if ((uintptr_t) ShadowHstPtrAddr < lb) 525 continue; 526 if ((uintptr_t) ShadowHstPtrAddr >= ub) 527 break; 528 DP("Restoring original target pointer value " DPxMOD " for target " 529 "pointer " DPxMOD "\n", DPxPTR(it->second.TgtPtrVal), 530 DPxPTR(it->second.TgtPtrAddr)); 531 rt = Device.data_submit(it->second.TgtPtrAddr, 532 &it->second.TgtPtrVal, sizeof(void *)); 533 if (rt != OFFLOAD_SUCCESS) { 534 DP("Copying data to device failed.\n"); 535 Device.ShadowMtx.unlock(); 536 return OFFLOAD_FAIL; 537 } 538 } 539 Device.ShadowMtx.unlock(); 540 } 541 } 542 return OFFLOAD_SUCCESS; 543 } 544 545 static const unsigned LambdaMapping = OMP_TGT_MAPTYPE_PTR_AND_OBJ | 546 OMP_TGT_MAPTYPE_LITERAL | 547 OMP_TGT_MAPTYPE_IMPLICIT; 548 static bool isLambdaMapping(int64_t Mapping) { 549 return (Mapping & LambdaMapping) == LambdaMapping; 550 } 551 552 /// performs the same actions as data_begin in case arg_num is 553 /// non-zero and initiates run of the offloaded region on the target platform; 554 /// if arg_num is non-zero after the region execution is done it also 555 /// performs the same action as data_update and data_end above. This function 556 /// returns 0 if it was able to transfer the execution to a target and an 557 /// integer different from zero otherwise. 558 int target(int64_t device_id, void *host_ptr, int32_t arg_num, 559 void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types, 560 int32_t team_num, int32_t thread_limit, int IsTeamConstruct) { 561 DeviceTy &Device = Devices[device_id]; 562 563 // Find the table information in the map or look it up in the translation 564 // tables. 565 TableMap *TM = 0; 566 TblMapMtx.lock(); 567 HostPtrToTableMapTy::iterator TableMapIt = HostPtrToTableMap.find(host_ptr); 568 if (TableMapIt == HostPtrToTableMap.end()) { 569 // We don't have a map. So search all the registered libraries. 570 TrlTblMtx.lock(); 571 for (HostEntriesBeginToTransTableTy::iterator 572 ii = HostEntriesBeginToTransTable.begin(), 573 ie = HostEntriesBeginToTransTable.end(); 574 !TM && ii != ie; ++ii) { 575 // get the translation table (which contains all the good info). 576 TranslationTable *TransTable = &ii->second; 577 // iterate over all the host table entries to see if we can locate the 578 // host_ptr. 579 __tgt_offload_entry *begin = TransTable->HostTable.EntriesBegin; 580 __tgt_offload_entry *end = TransTable->HostTable.EntriesEnd; 581 __tgt_offload_entry *cur = begin; 582 for (uint32_t i = 0; cur < end; ++cur, ++i) { 583 if (cur->addr != host_ptr) 584 continue; 585 // we got a match, now fill the HostPtrToTableMap so that we 586 // may avoid this search next time. 587 TM = &HostPtrToTableMap[host_ptr]; 588 TM->Table = TransTable; 589 TM->Index = i; 590 break; 591 } 592 } 593 TrlTblMtx.unlock(); 594 } else { 595 TM = &TableMapIt->second; 596 } 597 TblMapMtx.unlock(); 598 599 // No map for this host pointer found! 600 if (!TM) { 601 DP("Host ptr " DPxMOD " does not have a matching target pointer.\n", 602 DPxPTR(host_ptr)); 603 return OFFLOAD_FAIL; 604 } 605 606 // get target table. 607 TrlTblMtx.lock(); 608 assert(TM->Table->TargetsTable.size() > (size_t)device_id && 609 "Not expecting a device ID outside the table's bounds!"); 610 __tgt_target_table *TargetTable = TM->Table->TargetsTable[device_id]; 611 TrlTblMtx.unlock(); 612 assert(TargetTable && "Global data has not been mapped\n"); 613 614 // Move data to device. 615 int rc = target_data_begin(Device, arg_num, args_base, args, arg_sizes, 616 arg_types); 617 if (rc != OFFLOAD_SUCCESS) { 618 DP("Call to target_data_begin failed, abort target.\n"); 619 return OFFLOAD_FAIL; 620 } 621 622 std::vector<void *> tgt_args; 623 std::vector<ptrdiff_t> tgt_offsets; 624 625 // List of (first-)private arrays allocated for this target region 626 std::vector<void *> fpArrays; 627 std::vector<int> tgtArgsPositions(arg_num, -1); 628 629 for (int32_t i = 0; i < arg_num; ++i) { 630 if (!(arg_types[i] & OMP_TGT_MAPTYPE_TARGET_PARAM)) { 631 // This is not a target parameter, do not push it into tgt_args. 632 // Check for lambda mapping. 633 if (isLambdaMapping(arg_types[i])) { 634 assert((arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) && 635 "PTR_AND_OBJ must be also MEMBER_OF."); 636 unsigned idx = member_of(arg_types[i]); 637 int tgtIdx = tgtArgsPositions[idx]; 638 assert(tgtIdx != -1 && "Base address must be translated already."); 639 // The parent lambda must be processed already and it must be the last 640 // in tgt_args and tgt_offsets arrays. 641 void *HstPtrVal = args[i]; 642 void *HstPtrBegin = args_base[i]; 643 void *HstPtrBase = args[idx]; 644 bool IsLast; // unused. 645 void *TgtPtrBase = 646 (void *)((intptr_t)tgt_args[tgtIdx] + tgt_offsets[tgtIdx]); 647 DP("Parent lambda base " DPxMOD "\n", DPxPTR(TgtPtrBase)); 648 uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase; 649 void *TgtPtrBegin = (void *)((uintptr_t)TgtPtrBase + Delta); 650 void *Pointer_TgtPtrBegin = 651 Device.getTgtPtrBegin(HstPtrVal, arg_sizes[i], IsLast, false); 652 if (!Pointer_TgtPtrBegin) { 653 DP("No lambda captured variable mapped (" DPxMOD ") - ignored\n", 654 DPxPTR(HstPtrVal)); 655 continue; 656 } 657 DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n", 658 DPxPTR(Pointer_TgtPtrBegin), DPxPTR(TgtPtrBegin)); 659 int rt = Device.data_submit(TgtPtrBegin, &Pointer_TgtPtrBegin, 660 sizeof(void *)); 661 if (rt != OFFLOAD_SUCCESS) { 662 DP("Copying data to device failed.\n"); 663 return OFFLOAD_FAIL; 664 } 665 } 666 continue; 667 } 668 void *HstPtrBegin = args[i]; 669 void *HstPtrBase = args_base[i]; 670 void *TgtPtrBegin; 671 ptrdiff_t TgtBaseOffset; 672 bool IsLast; // unused. 673 if (arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) { 674 DP("Forwarding first-private value " DPxMOD " to the target construct\n", 675 DPxPTR(HstPtrBase)); 676 TgtPtrBegin = HstPtrBase; 677 TgtBaseOffset = 0; 678 } else if (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE) { 679 // Allocate memory for (first-)private array 680 TgtPtrBegin = Device.RTL->data_alloc(Device.RTLDeviceID, 681 arg_sizes[i], HstPtrBegin); 682 if (!TgtPtrBegin) { 683 DP ("Data allocation for %sprivate array " DPxMOD " failed, " 684 "abort target.\n", 685 (arg_types[i] & OMP_TGT_MAPTYPE_TO ? "first-" : ""), 686 DPxPTR(HstPtrBegin)); 687 return OFFLOAD_FAIL; 688 } 689 fpArrays.push_back(TgtPtrBegin); 690 TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin; 691 #ifdef OMPTARGET_DEBUG 692 void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset); 693 DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD " for " 694 "%sprivate array " DPxMOD " - pushing target argument " DPxMOD "\n", 695 arg_sizes[i], DPxPTR(TgtPtrBegin), 696 (arg_types[i] & OMP_TGT_MAPTYPE_TO ? "first-" : ""), 697 DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBase)); 698 #endif 699 // If first-private, copy data from host 700 if (arg_types[i] & OMP_TGT_MAPTYPE_TO) { 701 int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, arg_sizes[i]); 702 if (rt != OFFLOAD_SUCCESS) { 703 DP ("Copying data to device failed, failed.\n"); 704 return OFFLOAD_FAIL; 705 } 706 } 707 } else if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) { 708 TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBase, sizeof(void *), IsLast, 709 false); 710 TgtBaseOffset = 0; // no offset for ptrs. 711 DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD " to " 712 "object " DPxMOD "\n", DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBase), 713 DPxPTR(HstPtrBase)); 714 } else { 715 TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, arg_sizes[i], IsLast, 716 false); 717 TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin; 718 #ifdef OMPTARGET_DEBUG 719 void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset); 720 DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD "\n", 721 DPxPTR(TgtPtrBase), DPxPTR(HstPtrBegin)); 722 #endif 723 } 724 tgtArgsPositions[i] = tgt_args.size(); 725 tgt_args.push_back(TgtPtrBegin); 726 tgt_offsets.push_back(TgtBaseOffset); 727 } 728 729 assert(tgt_args.size() == tgt_offsets.size() && 730 "Size mismatch in arguments and offsets"); 731 732 // Pop loop trip count 733 uint64_t ltc = Device.loopTripCnt; 734 Device.loopTripCnt = 0; 735 736 // Launch device execution. 737 DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n", 738 TargetTable->EntriesBegin[TM->Index].name, 739 DPxPTR(TargetTable->EntriesBegin[TM->Index].addr), TM->Index); 740 if (IsTeamConstruct) { 741 rc = Device.run_team_region(TargetTable->EntriesBegin[TM->Index].addr, 742 &tgt_args[0], &tgt_offsets[0], tgt_args.size(), team_num, 743 thread_limit, ltc); 744 } else { 745 rc = Device.run_region(TargetTable->EntriesBegin[TM->Index].addr, 746 &tgt_args[0], &tgt_offsets[0], tgt_args.size()); 747 } 748 if (rc != OFFLOAD_SUCCESS) { 749 DP ("Executing target region abort target.\n"); 750 return OFFLOAD_FAIL; 751 } 752 753 // Deallocate (first-)private arrays 754 for (auto it : fpArrays) { 755 int rt = Device.RTL->data_delete(Device.RTLDeviceID, it); 756 if (rt != OFFLOAD_SUCCESS) { 757 DP("Deallocation of (first-)private arrays failed.\n"); 758 return OFFLOAD_FAIL; 759 } 760 } 761 762 // Move data from device. 763 int rt = target_data_end(Device, arg_num, args_base, args, arg_sizes, 764 arg_types); 765 if (rt != OFFLOAD_SUCCESS) { 766 DP("Call to target_data_end failed, abort targe.\n"); 767 return OFFLOAD_FAIL; 768 } 769 770 return OFFLOAD_SUCCESS; 771 } 772