1 //===------ omptarget.cpp - Target independent OpenMP target RTL -- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // Implementation of the interface to be used by Clang during the codegen of a 10 // target region. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include <omptarget.h> 15 16 #include "device.h" 17 #include "private.h" 18 #include "rtl.h" 19 20 #include <cassert> 21 #include <vector> 22 23 #ifdef OMPTARGET_DEBUG 24 int DebugLevel = 0; 25 #endif // OMPTARGET_DEBUG 26 27 28 29 /* All begin addresses for partially mapped structs must be 8-aligned in order 30 * to ensure proper alignment of members. E.g. 31 * 32 * struct S { 33 * int a; // 4-aligned 34 * int b; // 4-aligned 35 * int *p; // 8-aligned 36 * } s1; 37 * ... 38 * #pragma omp target map(tofrom: s1.b, s1.p[0:N]) 39 * { 40 * s1.b = 5; 41 * for (int i...) s1.p[i] = ...; 42 * } 43 * 44 * Here we are mapping s1 starting from member b, so BaseAddress=&s1=&s1.a and 45 * BeginAddress=&s1.b. Let's assume that the struct begins at address 0x100, 46 * then &s1.a=0x100, &s1.b=0x104, &s1.p=0x108. Each member obeys the alignment 47 * requirements for its type. Now, when we allocate memory on the device, in 48 * CUDA's case cuMemAlloc() returns an address which is at least 256-aligned. 49 * This means that the chunk of the struct on the device will start at a 50 * 256-aligned address, let's say 0x200. Then the address of b will be 0x200 and 51 * address of p will be a misaligned 0x204 (on the host there was no need to add 52 * padding between b and p, so p comes exactly 4 bytes after b). If the device 53 * kernel tries to access s1.p, a misaligned address error occurs (as reported 54 * by the CUDA plugin). By padding the begin address down to a multiple of 8 and 55 * extending the size of the allocated chuck accordingly, the chuck on the 56 * device will start at 0x200 with the padding (4 bytes), then &s1.b=0x204 and 57 * &s1.p=0x208, as they should be to satisfy the alignment requirements. 58 */ 59 static const int64_t alignment = 8; 60 61 /// Map global data and execute pending ctors 62 static int InitLibrary(DeviceTy& Device) { 63 /* 64 * Map global data 65 */ 66 int32_t device_id = Device.DeviceID; 67 int rc = OFFLOAD_SUCCESS; 68 69 Device.PendingGlobalsMtx.lock(); 70 TrlTblMtx.lock(); 71 for (HostEntriesBeginToTransTableTy::iterator 72 ii = HostEntriesBeginToTransTable.begin(); 73 ii != HostEntriesBeginToTransTable.end(); ++ii) { 74 TranslationTable *TransTable = &ii->second; 75 if (TransTable->TargetsTable[device_id] != 0) { 76 // Library entries have already been processed 77 continue; 78 } 79 80 // 1) get image. 81 assert(TransTable->TargetsImages.size() > (size_t)device_id && 82 "Not expecting a device ID outside the table's bounds!"); 83 __tgt_device_image *img = TransTable->TargetsImages[device_id]; 84 if (!img) { 85 DP("No image loaded for device id %d.\n", device_id); 86 rc = OFFLOAD_FAIL; 87 break; 88 } 89 // 2) load image into the target table. 90 __tgt_target_table *TargetTable = 91 TransTable->TargetsTable[device_id] = Device.load_binary(img); 92 // Unable to get table for this image: invalidate image and fail. 93 if (!TargetTable) { 94 DP("Unable to generate entries table for device id %d.\n", device_id); 95 TransTable->TargetsImages[device_id] = 0; 96 rc = OFFLOAD_FAIL; 97 break; 98 } 99 100 // Verify whether the two table sizes match. 101 size_t hsize = 102 TransTable->HostTable.EntriesEnd - TransTable->HostTable.EntriesBegin; 103 size_t tsize = TargetTable->EntriesEnd - TargetTable->EntriesBegin; 104 105 // Invalid image for these host entries! 106 if (hsize != tsize) { 107 DP("Host and Target tables mismatch for device id %d [%zx != %zx].\n", 108 device_id, hsize, tsize); 109 TransTable->TargetsImages[device_id] = 0; 110 TransTable->TargetsTable[device_id] = 0; 111 rc = OFFLOAD_FAIL; 112 break; 113 } 114 115 // process global data that needs to be mapped. 116 Device.DataMapMtx.lock(); 117 __tgt_target_table *HostTable = &TransTable->HostTable; 118 for (__tgt_offload_entry *CurrDeviceEntry = TargetTable->EntriesBegin, 119 *CurrHostEntry = HostTable->EntriesBegin, 120 *EntryDeviceEnd = TargetTable->EntriesEnd; 121 CurrDeviceEntry != EntryDeviceEnd; 122 CurrDeviceEntry++, CurrHostEntry++) { 123 if (CurrDeviceEntry->size != 0) { 124 // has data. 125 assert(CurrDeviceEntry->size == CurrHostEntry->size && 126 "data size mismatch"); 127 128 // Fortran may use multiple weak declarations for the same symbol, 129 // therefore we must allow for multiple weak symbols to be loaded from 130 // the fat binary. Treat these mappings as any other "regular" mapping. 131 // Add entry to map. 132 if (Device.getTgtPtrBegin(CurrHostEntry->addr, CurrHostEntry->size)) 133 continue; 134 DP("Add mapping from host " DPxMOD " to device " DPxMOD " with size %zu" 135 "\n", DPxPTR(CurrHostEntry->addr), DPxPTR(CurrDeviceEntry->addr), 136 CurrDeviceEntry->size); 137 Device.HostDataToTargetMap.push_front(HostDataToTargetTy( 138 (uintptr_t)CurrHostEntry->addr /*HstPtrBase*/, 139 (uintptr_t)CurrHostEntry->addr /*HstPtrBegin*/, 140 (uintptr_t)CurrHostEntry->addr + CurrHostEntry->size /*HstPtrEnd*/, 141 (uintptr_t)CurrDeviceEntry->addr /*TgtPtrBegin*/, 142 INF_REF_CNT /*RefCount*/)); 143 } 144 } 145 Device.DataMapMtx.unlock(); 146 } 147 TrlTblMtx.unlock(); 148 149 if (rc != OFFLOAD_SUCCESS) { 150 Device.PendingGlobalsMtx.unlock(); 151 return rc; 152 } 153 154 /* 155 * Run ctors for static objects 156 */ 157 if (!Device.PendingCtorsDtors.empty()) { 158 // Call all ctors for all libraries registered so far 159 for (auto &lib : Device.PendingCtorsDtors) { 160 if (!lib.second.PendingCtors.empty()) { 161 DP("Has pending ctors... call now\n"); 162 for (auto &entry : lib.second.PendingCtors) { 163 void *ctor = entry; 164 int rc = target(device_id, ctor, 0, NULL, NULL, NULL, 165 NULL, 1, 1, true /*team*/); 166 if (rc != OFFLOAD_SUCCESS) { 167 DP("Running ctor " DPxMOD " failed.\n", DPxPTR(ctor)); 168 Device.PendingGlobalsMtx.unlock(); 169 return OFFLOAD_FAIL; 170 } 171 } 172 // Clear the list to indicate that this device has been used 173 lib.second.PendingCtors.clear(); 174 DP("Done with pending ctors for lib " DPxMOD "\n", DPxPTR(lib.first)); 175 } 176 } 177 } 178 Device.HasPendingGlobals = false; 179 Device.PendingGlobalsMtx.unlock(); 180 181 return OFFLOAD_SUCCESS; 182 } 183 184 // Check whether a device has been initialized, global ctors have been 185 // executed and global data has been mapped; do so if not already done. 186 int CheckDeviceAndCtors(int64_t device_id) { 187 // Is device ready? 188 if (!device_is_ready(device_id)) { 189 DP("Device %" PRId64 " is not ready.\n", device_id); 190 return OFFLOAD_FAIL; 191 } 192 193 // Get device info. 194 DeviceTy &Device = Devices[device_id]; 195 196 // Check whether global data has been mapped for this device 197 Device.PendingGlobalsMtx.lock(); 198 bool hasPendingGlobals = Device.HasPendingGlobals; 199 Device.PendingGlobalsMtx.unlock(); 200 if (hasPendingGlobals && InitLibrary(Device) != OFFLOAD_SUCCESS) { 201 DP("Failed to init globals on device %" PRId64 "\n", device_id); 202 return OFFLOAD_FAIL; 203 } 204 205 return OFFLOAD_SUCCESS; 206 } 207 208 static int32_t member_of(int64_t type) { 209 return ((type & OMP_TGT_MAPTYPE_MEMBER_OF) >> 48) - 1; 210 } 211 212 /// Internal function to do the mapping and transfer the data to the device 213 int target_data_begin(DeviceTy &Device, int32_t arg_num, 214 void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) { 215 // process each input. 216 for (int32_t i = 0; i < arg_num; ++i) { 217 // Ignore private variables and arrays - there is no mapping for them. 218 if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) || 219 (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE)) 220 continue; 221 222 void *HstPtrBegin = args[i]; 223 void *HstPtrBase = args_base[i]; 224 int64_t data_size = arg_sizes[i]; 225 226 // Adjust for proper alignment if this is a combined entry (for structs). 227 // Look at the next argument - if that is MEMBER_OF this one, then this one 228 // is a combined entry. 229 int64_t padding = 0; 230 const int next_i = i+1; 231 if (member_of(arg_types[i]) < 0 && next_i < arg_num && 232 member_of(arg_types[next_i]) == i) { 233 padding = (int64_t)HstPtrBegin % alignment; 234 if (padding) { 235 DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD 236 "\n", padding, DPxPTR(HstPtrBegin)); 237 HstPtrBegin = (char *) HstPtrBegin - padding; 238 data_size += padding; 239 } 240 } 241 242 // Address of pointer on the host and device, respectively. 243 void *Pointer_HstPtrBegin, *Pointer_TgtPtrBegin; 244 bool IsNew, Pointer_IsNew; 245 bool IsHostPtr = false; 246 bool IsImplicit = arg_types[i] & OMP_TGT_MAPTYPE_IMPLICIT; 247 // Force the creation of a device side copy of the data when: 248 // a close map modifier was associated with a map that contained a to. 249 bool HasCloseModifier = arg_types[i] & OMP_TGT_MAPTYPE_CLOSE; 250 // UpdateRef is based on MEMBER_OF instead of TARGET_PARAM because if we 251 // have reached this point via __tgt_target_data_begin and not __tgt_target 252 // then no argument is marked as TARGET_PARAM ("omp target data map" is not 253 // associated with a target region, so there are no target parameters). This 254 // may be considered a hack, we could revise the scheme in the future. 255 bool UpdateRef = !(arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF); 256 if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) { 257 DP("Has a pointer entry: \n"); 258 // base is address of pointer. 259 Pointer_TgtPtrBegin = Device.getOrAllocTgtPtr(HstPtrBase, HstPtrBase, 260 sizeof(void *), Pointer_IsNew, IsHostPtr, IsImplicit, UpdateRef, 261 HasCloseModifier); 262 if (!Pointer_TgtPtrBegin) { 263 DP("Call to getOrAllocTgtPtr returned null pointer (device failure or " 264 "illegal mapping).\n"); 265 return OFFLOAD_FAIL; 266 } 267 DP("There are %zu bytes allocated at target address " DPxMOD " - is%s new" 268 "\n", sizeof(void *), DPxPTR(Pointer_TgtPtrBegin), 269 (Pointer_IsNew ? "" : " not")); 270 Pointer_HstPtrBegin = HstPtrBase; 271 // modify current entry. 272 HstPtrBase = *(void **)HstPtrBase; 273 UpdateRef = true; // subsequently update ref count of pointee 274 } 275 276 void *TgtPtrBegin = Device.getOrAllocTgtPtr(HstPtrBegin, HstPtrBase, 277 data_size, IsNew, IsHostPtr, IsImplicit, UpdateRef, HasCloseModifier); 278 if (!TgtPtrBegin && data_size) { 279 // If data_size==0, then the argument could be a zero-length pointer to 280 // NULL, so getOrAlloc() returning NULL is not an error. 281 DP("Call to getOrAllocTgtPtr returned null pointer (device failure or " 282 "illegal mapping).\n"); 283 } 284 DP("There are %" PRId64 " bytes allocated at target address " DPxMOD 285 " - is%s new\n", data_size, DPxPTR(TgtPtrBegin), 286 (IsNew ? "" : " not")); 287 288 if (arg_types[i] & OMP_TGT_MAPTYPE_RETURN_PARAM) { 289 uintptr_t Delta = (uintptr_t)HstPtrBegin - (uintptr_t)HstPtrBase; 290 void *TgtPtrBase = (void *)((uintptr_t)TgtPtrBegin - Delta); 291 DP("Returning device pointer " DPxMOD "\n", DPxPTR(TgtPtrBase)); 292 args_base[i] = TgtPtrBase; 293 } 294 295 if (arg_types[i] & OMP_TGT_MAPTYPE_TO) { 296 bool copy = false; 297 if (!(RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) || 298 HasCloseModifier) { 299 if (IsNew || (arg_types[i] & OMP_TGT_MAPTYPE_ALWAYS)) { 300 copy = true; 301 } else if (arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) { 302 // Copy data only if the "parent" struct has RefCount==1. 303 int32_t parent_idx = member_of(arg_types[i]); 304 long parent_rc = Device.getMapEntryRefCnt(args[parent_idx]); 305 assert(parent_rc > 0 && "parent struct not found"); 306 if (parent_rc == 1) { 307 copy = true; 308 } 309 } 310 } 311 312 if (copy && !IsHostPtr) { 313 DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n", 314 data_size, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin)); 315 int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, data_size); 316 if (rt != OFFLOAD_SUCCESS) { 317 DP("Copying data to device failed.\n"); 318 return OFFLOAD_FAIL; 319 } 320 } 321 } 322 323 if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ && !IsHostPtr) { 324 DP("Update pointer (" DPxMOD ") -> [" DPxMOD "]\n", 325 DPxPTR(Pointer_TgtPtrBegin), DPxPTR(TgtPtrBegin)); 326 uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase; 327 void *TgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta); 328 int rt = Device.data_submit(Pointer_TgtPtrBegin, &TgtPtrBase, 329 sizeof(void *)); 330 if (rt != OFFLOAD_SUCCESS) { 331 DP("Copying data to device failed.\n"); 332 return OFFLOAD_FAIL; 333 } 334 // create shadow pointers for this entry 335 Device.ShadowMtx.lock(); 336 Device.ShadowPtrMap[Pointer_HstPtrBegin] = {HstPtrBase, 337 Pointer_TgtPtrBegin, TgtPtrBase}; 338 Device.ShadowMtx.unlock(); 339 } 340 } 341 342 return OFFLOAD_SUCCESS; 343 } 344 345 /// Internal function to undo the mapping and retrieve the data from the device. 346 int target_data_end(DeviceTy &Device, int32_t arg_num, void **args_base, 347 void **args, int64_t *arg_sizes, int64_t *arg_types) { 348 // process each input. 349 for (int32_t i = arg_num - 1; i >= 0; --i) { 350 // Ignore private variables and arrays - there is no mapping for them. 351 // Also, ignore the use_device_ptr directive, it has no effect here. 352 if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) || 353 (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE)) 354 continue; 355 356 void *HstPtrBegin = args[i]; 357 int64_t data_size = arg_sizes[i]; 358 // Adjust for proper alignment if this is a combined entry (for structs). 359 // Look at the next argument - if that is MEMBER_OF this one, then this one 360 // is a combined entry. 361 int64_t padding = 0; 362 const int next_i = i+1; 363 if (member_of(arg_types[i]) < 0 && next_i < arg_num && 364 member_of(arg_types[next_i]) == i) { 365 padding = (int64_t)HstPtrBegin % alignment; 366 if (padding) { 367 DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD 368 "\n", padding, DPxPTR(HstPtrBegin)); 369 HstPtrBegin = (char *) HstPtrBegin - padding; 370 data_size += padding; 371 } 372 } 373 374 bool IsLast, IsHostPtr; 375 bool UpdateRef = !(arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) || 376 (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ); 377 bool ForceDelete = arg_types[i] & OMP_TGT_MAPTYPE_DELETE; 378 bool HasCloseModifier = arg_types[i] & OMP_TGT_MAPTYPE_CLOSE; 379 380 // If PTR_AND_OBJ, HstPtrBegin is address of pointee 381 void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, data_size, IsLast, 382 UpdateRef, IsHostPtr); 383 DP("There are %" PRId64 " bytes allocated at target address " DPxMOD 384 " - is%s last\n", data_size, DPxPTR(TgtPtrBegin), 385 (IsLast ? "" : " not")); 386 387 bool DelEntry = IsLast || ForceDelete; 388 389 if ((arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) && 390 !(arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) { 391 DelEntry = false; // protect parent struct from being deallocated 392 } 393 394 if ((arg_types[i] & OMP_TGT_MAPTYPE_FROM) || DelEntry) { 395 // Move data back to the host 396 if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) { 397 bool Always = arg_types[i] & OMP_TGT_MAPTYPE_ALWAYS; 398 bool CopyMember = false; 399 if (!(RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) || 400 HasCloseModifier) { 401 if ((arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) && 402 !(arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) { 403 // Copy data only if the "parent" struct has RefCount==1. 404 int32_t parent_idx = member_of(arg_types[i]); 405 long parent_rc = Device.getMapEntryRefCnt(args[parent_idx]); 406 assert(parent_rc > 0 && "parent struct not found"); 407 if (parent_rc == 1) { 408 CopyMember = true; 409 } 410 } 411 } 412 413 if ((DelEntry || Always || CopyMember) && 414 !(RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && 415 TgtPtrBegin == HstPtrBegin)) { 416 DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n", 417 data_size, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin)); 418 int rt = Device.data_retrieve(HstPtrBegin, TgtPtrBegin, data_size); 419 if (rt != OFFLOAD_SUCCESS) { 420 DP("Copying data from device failed.\n"); 421 return OFFLOAD_FAIL; 422 } 423 } 424 } 425 426 // If we copied back to the host a struct/array containing pointers, we 427 // need to restore the original host pointer values from their shadow 428 // copies. If the struct is going to be deallocated, remove any remaining 429 // shadow pointer entries for this struct. 430 uintptr_t lb = (uintptr_t) HstPtrBegin; 431 uintptr_t ub = (uintptr_t) HstPtrBegin + data_size; 432 Device.ShadowMtx.lock(); 433 for (ShadowPtrListTy::iterator it = Device.ShadowPtrMap.begin(); 434 it != Device.ShadowPtrMap.end();) { 435 void **ShadowHstPtrAddr = (void**) it->first; 436 437 // An STL map is sorted on its keys; use this property 438 // to quickly determine when to break out of the loop. 439 if ((uintptr_t) ShadowHstPtrAddr < lb) { 440 ++it; 441 continue; 442 } 443 if ((uintptr_t) ShadowHstPtrAddr >= ub) 444 break; 445 446 // If we copied the struct to the host, we need to restore the pointer. 447 if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) { 448 DP("Restoring original host pointer value " DPxMOD " for host " 449 "pointer " DPxMOD "\n", DPxPTR(it->second.HstPtrVal), 450 DPxPTR(ShadowHstPtrAddr)); 451 *ShadowHstPtrAddr = it->second.HstPtrVal; 452 } 453 // If the struct is to be deallocated, remove the shadow entry. 454 if (DelEntry) { 455 DP("Removing shadow pointer " DPxMOD "\n", DPxPTR(ShadowHstPtrAddr)); 456 it = Device.ShadowPtrMap.erase(it); 457 } else { 458 ++it; 459 } 460 } 461 Device.ShadowMtx.unlock(); 462 463 // Deallocate map 464 if (DelEntry) { 465 int rt = Device.deallocTgtPtr(HstPtrBegin, data_size, ForceDelete, 466 HasCloseModifier); 467 if (rt != OFFLOAD_SUCCESS) { 468 DP("Deallocating data from device failed.\n"); 469 return OFFLOAD_FAIL; 470 } 471 } 472 } 473 } 474 475 return OFFLOAD_SUCCESS; 476 } 477 478 /// Internal function to pass data to/from the target. 479 int target_data_update(DeviceTy &Device, int32_t arg_num, 480 void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) { 481 // process each input. 482 for (int32_t i = 0; i < arg_num; ++i) { 483 if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) || 484 (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE)) 485 continue; 486 487 void *HstPtrBegin = args[i]; 488 int64_t MapSize = arg_sizes[i]; 489 bool IsLast, IsHostPtr; 490 void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, MapSize, IsLast, 491 false, IsHostPtr); 492 if (!TgtPtrBegin) { 493 DP("hst data:" DPxMOD " not found, becomes a noop\n", DPxPTR(HstPtrBegin)); 494 continue; 495 } 496 497 if (RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && 498 TgtPtrBegin == HstPtrBegin) { 499 DP("hst data:" DPxMOD " unified and shared, becomes a noop\n", 500 DPxPTR(HstPtrBegin)); 501 continue; 502 } 503 504 if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) { 505 DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n", 506 arg_sizes[i], DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin)); 507 int rt = Device.data_retrieve(HstPtrBegin, TgtPtrBegin, MapSize); 508 if (rt != OFFLOAD_SUCCESS) { 509 DP("Copying data from device failed.\n"); 510 return OFFLOAD_FAIL; 511 } 512 513 uintptr_t lb = (uintptr_t) HstPtrBegin; 514 uintptr_t ub = (uintptr_t) HstPtrBegin + MapSize; 515 Device.ShadowMtx.lock(); 516 for (ShadowPtrListTy::iterator it = Device.ShadowPtrMap.begin(); 517 it != Device.ShadowPtrMap.end(); ++it) { 518 void **ShadowHstPtrAddr = (void**) it->first; 519 if ((uintptr_t) ShadowHstPtrAddr < lb) 520 continue; 521 if ((uintptr_t) ShadowHstPtrAddr >= ub) 522 break; 523 DP("Restoring original host pointer value " DPxMOD " for host pointer " 524 DPxMOD "\n", DPxPTR(it->second.HstPtrVal), 525 DPxPTR(ShadowHstPtrAddr)); 526 *ShadowHstPtrAddr = it->second.HstPtrVal; 527 } 528 Device.ShadowMtx.unlock(); 529 } 530 531 if (arg_types[i] & OMP_TGT_MAPTYPE_TO) { 532 DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n", 533 arg_sizes[i], DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin)); 534 int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, MapSize); 535 if (rt != OFFLOAD_SUCCESS) { 536 DP("Copying data to device failed.\n"); 537 return OFFLOAD_FAIL; 538 } 539 540 uintptr_t lb = (uintptr_t) HstPtrBegin; 541 uintptr_t ub = (uintptr_t) HstPtrBegin + MapSize; 542 Device.ShadowMtx.lock(); 543 for (ShadowPtrListTy::iterator it = Device.ShadowPtrMap.begin(); 544 it != Device.ShadowPtrMap.end(); ++it) { 545 void **ShadowHstPtrAddr = (void**) it->first; 546 if ((uintptr_t) ShadowHstPtrAddr < lb) 547 continue; 548 if ((uintptr_t) ShadowHstPtrAddr >= ub) 549 break; 550 DP("Restoring original target pointer value " DPxMOD " for target " 551 "pointer " DPxMOD "\n", DPxPTR(it->second.TgtPtrVal), 552 DPxPTR(it->second.TgtPtrAddr)); 553 rt = Device.data_submit(it->second.TgtPtrAddr, 554 &it->second.TgtPtrVal, sizeof(void *)); 555 if (rt != OFFLOAD_SUCCESS) { 556 DP("Copying data to device failed.\n"); 557 Device.ShadowMtx.unlock(); 558 return OFFLOAD_FAIL; 559 } 560 } 561 Device.ShadowMtx.unlock(); 562 } 563 } 564 return OFFLOAD_SUCCESS; 565 } 566 567 static const unsigned LambdaMapping = OMP_TGT_MAPTYPE_PTR_AND_OBJ | 568 OMP_TGT_MAPTYPE_LITERAL | 569 OMP_TGT_MAPTYPE_IMPLICIT; 570 static bool isLambdaMapping(int64_t Mapping) { 571 return (Mapping & LambdaMapping) == LambdaMapping; 572 } 573 574 /// performs the same actions as data_begin in case arg_num is 575 /// non-zero and initiates run of the offloaded region on the target platform; 576 /// if arg_num is non-zero after the region execution is done it also 577 /// performs the same action as data_update and data_end above. This function 578 /// returns 0 if it was able to transfer the execution to a target and an 579 /// integer different from zero otherwise. 580 int target(int64_t device_id, void *host_ptr, int32_t arg_num, 581 void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types, 582 int32_t team_num, int32_t thread_limit, int IsTeamConstruct) { 583 DeviceTy &Device = Devices[device_id]; 584 585 // Find the table information in the map or look it up in the translation 586 // tables. 587 TableMap *TM = 0; 588 TblMapMtx.lock(); 589 HostPtrToTableMapTy::iterator TableMapIt = HostPtrToTableMap.find(host_ptr); 590 if (TableMapIt == HostPtrToTableMap.end()) { 591 // We don't have a map. So search all the registered libraries. 592 TrlTblMtx.lock(); 593 for (HostEntriesBeginToTransTableTy::iterator 594 ii = HostEntriesBeginToTransTable.begin(), 595 ie = HostEntriesBeginToTransTable.end(); 596 !TM && ii != ie; ++ii) { 597 // get the translation table (which contains all the good info). 598 TranslationTable *TransTable = &ii->second; 599 // iterate over all the host table entries to see if we can locate the 600 // host_ptr. 601 __tgt_offload_entry *begin = TransTable->HostTable.EntriesBegin; 602 __tgt_offload_entry *end = TransTable->HostTable.EntriesEnd; 603 __tgt_offload_entry *cur = begin; 604 for (uint32_t i = 0; cur < end; ++cur, ++i) { 605 if (cur->addr != host_ptr) 606 continue; 607 // we got a match, now fill the HostPtrToTableMap so that we 608 // may avoid this search next time. 609 TM = &HostPtrToTableMap[host_ptr]; 610 TM->Table = TransTable; 611 TM->Index = i; 612 break; 613 } 614 } 615 TrlTblMtx.unlock(); 616 } else { 617 TM = &TableMapIt->second; 618 } 619 TblMapMtx.unlock(); 620 621 // No map for this host pointer found! 622 if (!TM) { 623 DP("Host ptr " DPxMOD " does not have a matching target pointer.\n", 624 DPxPTR(host_ptr)); 625 return OFFLOAD_FAIL; 626 } 627 628 // get target table. 629 TrlTblMtx.lock(); 630 assert(TM->Table->TargetsTable.size() > (size_t)device_id && 631 "Not expecting a device ID outside the table's bounds!"); 632 __tgt_target_table *TargetTable = TM->Table->TargetsTable[device_id]; 633 TrlTblMtx.unlock(); 634 assert(TargetTable && "Global data has not been mapped\n"); 635 636 // Move data to device. 637 int rc = target_data_begin(Device, arg_num, args_base, args, arg_sizes, 638 arg_types); 639 if (rc != OFFLOAD_SUCCESS) { 640 DP("Call to target_data_begin failed, abort target.\n"); 641 return OFFLOAD_FAIL; 642 } 643 644 std::vector<void *> tgt_args; 645 std::vector<ptrdiff_t> tgt_offsets; 646 647 // List of (first-)private arrays allocated for this target region 648 std::vector<void *> fpArrays; 649 std::vector<int> tgtArgsPositions(arg_num, -1); 650 651 for (int32_t i = 0; i < arg_num; ++i) { 652 if (!(arg_types[i] & OMP_TGT_MAPTYPE_TARGET_PARAM)) { 653 // This is not a target parameter, do not push it into tgt_args. 654 // Check for lambda mapping. 655 if (isLambdaMapping(arg_types[i])) { 656 assert((arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) && 657 "PTR_AND_OBJ must be also MEMBER_OF."); 658 unsigned idx = member_of(arg_types[i]); 659 int tgtIdx = tgtArgsPositions[idx]; 660 assert(tgtIdx != -1 && "Base address must be translated already."); 661 // The parent lambda must be processed already and it must be the last 662 // in tgt_args and tgt_offsets arrays. 663 void *HstPtrVal = args[i]; 664 void *HstPtrBegin = args_base[i]; 665 void *HstPtrBase = args[idx]; 666 bool IsLast, IsHostPtr; // unused. 667 void *TgtPtrBase = 668 (void *)((intptr_t)tgt_args[tgtIdx] + tgt_offsets[tgtIdx]); 669 DP("Parent lambda base " DPxMOD "\n", DPxPTR(TgtPtrBase)); 670 uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase; 671 void *TgtPtrBegin = (void *)((uintptr_t)TgtPtrBase + Delta); 672 void *Pointer_TgtPtrBegin = 673 Device.getTgtPtrBegin(HstPtrVal, arg_sizes[i], IsLast, false, 674 IsHostPtr); 675 if (!Pointer_TgtPtrBegin) { 676 DP("No lambda captured variable mapped (" DPxMOD ") - ignored\n", 677 DPxPTR(HstPtrVal)); 678 continue; 679 } 680 if (RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && 681 TgtPtrBegin == HstPtrBegin) { 682 DP("Unified memory is active, no need to map lambda captured" 683 "variable (" DPxMOD ")\n", DPxPTR(HstPtrVal)); 684 continue; 685 } 686 DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n", 687 DPxPTR(Pointer_TgtPtrBegin), DPxPTR(TgtPtrBegin)); 688 int rt = Device.data_submit(TgtPtrBegin, &Pointer_TgtPtrBegin, 689 sizeof(void *)); 690 if (rt != OFFLOAD_SUCCESS) { 691 DP("Copying data to device failed.\n"); 692 return OFFLOAD_FAIL; 693 } 694 } 695 continue; 696 } 697 void *HstPtrBegin = args[i]; 698 void *HstPtrBase = args_base[i]; 699 void *TgtPtrBegin; 700 ptrdiff_t TgtBaseOffset; 701 bool IsLast, IsHostPtr; // unused. 702 if (arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) { 703 DP("Forwarding first-private value " DPxMOD " to the target construct\n", 704 DPxPTR(HstPtrBase)); 705 TgtPtrBegin = HstPtrBase; 706 TgtBaseOffset = 0; 707 } else if (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE) { 708 // Allocate memory for (first-)private array 709 TgtPtrBegin = Device.RTL->data_alloc(Device.RTLDeviceID, 710 arg_sizes[i], HstPtrBegin); 711 if (!TgtPtrBegin) { 712 DP ("Data allocation for %sprivate array " DPxMOD " failed, " 713 "abort target.\n", 714 (arg_types[i] & OMP_TGT_MAPTYPE_TO ? "first-" : ""), 715 DPxPTR(HstPtrBegin)); 716 return OFFLOAD_FAIL; 717 } 718 fpArrays.push_back(TgtPtrBegin); 719 TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin; 720 #ifdef OMPTARGET_DEBUG 721 void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset); 722 DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD " for " 723 "%sprivate array " DPxMOD " - pushing target argument " DPxMOD "\n", 724 arg_sizes[i], DPxPTR(TgtPtrBegin), 725 (arg_types[i] & OMP_TGT_MAPTYPE_TO ? "first-" : ""), 726 DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBase)); 727 #endif 728 // If first-private, copy data from host 729 if (arg_types[i] & OMP_TGT_MAPTYPE_TO) { 730 int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, arg_sizes[i]); 731 if (rt != OFFLOAD_SUCCESS) { 732 DP ("Copying data to device failed, failed.\n"); 733 return OFFLOAD_FAIL; 734 } 735 } 736 } else if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) { 737 TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBase, sizeof(void *), IsLast, 738 false, IsHostPtr); 739 TgtBaseOffset = 0; // no offset for ptrs. 740 DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD " to " 741 "object " DPxMOD "\n", DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBase), 742 DPxPTR(HstPtrBase)); 743 } else { 744 TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, arg_sizes[i], IsLast, 745 false, IsHostPtr); 746 TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin; 747 #ifdef OMPTARGET_DEBUG 748 void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset); 749 DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD "\n", 750 DPxPTR(TgtPtrBase), DPxPTR(HstPtrBegin)); 751 #endif 752 } 753 tgtArgsPositions[i] = tgt_args.size(); 754 tgt_args.push_back(TgtPtrBegin); 755 tgt_offsets.push_back(TgtBaseOffset); 756 } 757 758 assert(tgt_args.size() == tgt_offsets.size() && 759 "Size mismatch in arguments and offsets"); 760 761 // Pop loop trip count 762 uint64_t ltc = 0; 763 TblMapMtx.lock(); 764 auto I = Device.LoopTripCnt.find(__kmpc_global_thread_num(NULL)); 765 if (I != Device.LoopTripCnt.end()) { 766 ltc = I->second; 767 Device.LoopTripCnt.erase(I); 768 DP("loop trip count is %lu.\n", ltc); 769 } 770 TblMapMtx.unlock(); 771 772 // Launch device execution. 773 DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n", 774 TargetTable->EntriesBegin[TM->Index].name, 775 DPxPTR(TargetTable->EntriesBegin[TM->Index].addr), TM->Index); 776 if (IsTeamConstruct) { 777 rc = Device.run_team_region(TargetTable->EntriesBegin[TM->Index].addr, 778 &tgt_args[0], &tgt_offsets[0], tgt_args.size(), team_num, 779 thread_limit, ltc); 780 } else { 781 rc = Device.run_region(TargetTable->EntriesBegin[TM->Index].addr, 782 &tgt_args[0], &tgt_offsets[0], tgt_args.size()); 783 } 784 if (rc != OFFLOAD_SUCCESS) { 785 DP ("Executing target region abort target.\n"); 786 return OFFLOAD_FAIL; 787 } 788 789 // Deallocate (first-)private arrays 790 for (auto it : fpArrays) { 791 int rt = Device.RTL->data_delete(Device.RTLDeviceID, it); 792 if (rt != OFFLOAD_SUCCESS) { 793 DP("Deallocation of (first-)private arrays failed.\n"); 794 return OFFLOAD_FAIL; 795 } 796 } 797 798 // Move data from device. 799 int rt = target_data_end(Device, arg_num, args_base, args, arg_sizes, 800 arg_types); 801 if (rt != OFFLOAD_SUCCESS) { 802 DP("Call to target_data_end failed, abort targe.\n"); 803 return OFFLOAD_FAIL; 804 } 805 806 return OFFLOAD_SUCCESS; 807 } 808