//===----------- device.h - Target independent OpenMP target RTL ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Declarations for managing devices that are handled by RTL plugins.
//
//===----------------------------------------------------------------------===//

#ifndef _OMPTARGET_DEVICE_H
#define _OMPTARGET_DEVICE_H

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <list>
#include <map>
#include <memory>
#include <mutex>
#include <set>
#include <thread>
#include <vector>

#include "ExclusiveAccess.h"
#include "omptarget.h"
#include "rtl.h"

// Forward declarations.
struct RTLInfoTy;
struct __tgt_bin_desc;
struct __tgt_target_table;

using map_var_info_t = void *;

// enum for OMP_TARGET_OFFLOAD; keep in sync with kmp.h definition
enum kmp_target_offload_kind {
  tgt_disabled = 0,
  tgt_default = 1,
  tgt_mandatory = 2
};
typedef enum kmp_target_offload_kind kmp_target_offload_kind_t;

/// Map between host data and target data.
struct HostDataToTargetTy {
  const uintptr_t HstPtrBase; // host info.
  const uintptr_t HstPtrBegin;
  const uintptr_t HstPtrEnd;       // non-inclusive.
  const map_var_info_t HstPtrName; // Optional source name of mapped variable.

  const uintptr_t TgtPtrBegin; // target info.

private:
  static const uint64_t INFRefCount = ~(uint64_t)0;
  static std::string refCountToStr(uint64_t RefCount) {
    return RefCount == INFRefCount ? "INF" : std::to_string(RefCount);
  }

  struct StatesTy {
    StatesTy(uint64_t DRC, uint64_t HRC)
        : DynRefCount(DRC), HoldRefCount(HRC),
          MayContainAttachedPointers(false),
          DeleteThreadId(std::thread::id()) {}
    /// The dynamic reference count is the standard reference count as of
    /// OpenMP 4.5. The hold reference count is an OpenMP extension for the
    /// sake of OpenACC support.
    ///
    /// The 'ompx_hold' map type modifier is permitted only on "omp target" and
    /// "omp target data", and "delete" is permitted only on "omp target exit
    /// data" and associated runtime library routines. As a result, we really
    /// need to implement "reset" functionality only for the dynamic reference
    /// counter. Likewise, only the dynamic reference count can be infinite
    /// because, for example, omp_target_associate_ptr and "omp declare target
    /// link" operate only on it. Nevertheless, it's actually easier to follow
    /// the code (and it requires fewer assertions for special cases) when we
    /// just implement these features generally across both reference counters
    /// here. Thus, it's the users of this class that impose those
    /// restrictions.
    ///
    uint64_t DynRefCount;
    uint64_t HoldRefCount;
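    // Illustrative sketch (not part of the upstream header): a plain map
    // affects only the dynamic count, whereas a map with the 'ompx_hold'
    // modifier affects only the hold count. Assuming a hypothetical mapped
    // variable A that is not yet present on the device:
    //
    //   #pragma omp target enter data map(to : A)          // DynRefCount  0 -> 1
    //   #pragma omp target data map(ompx_hold, tofrom : A) // HoldRefCount 0 -> 1
    //   { /* ... */ }                                       // HoldRefCount 1 -> 0
    //   #pragma omp target exit data map(from : A)         // DynRefCount  1 -> 0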
    /// Boolean flag to remember if any subpart of the mapped region might be
    /// an attached pointer.
    bool MayContainAttachedPointers;

    /// This mutex will be locked when data movement is issued. For targets
    /// that don't support async data movement, this mutex can guarantee that
    /// after it is released, the memory region on the target is up to date.
    /// For targets that support async data movement, this can guarantee that
    /// data movement has been issued. This mutex *must* be locked right before
    /// releasing the mapping table lock.
    std::mutex UpdateMtx;
    /// Pointer to the event corresponding to the data update of this map.
    /// Note: At present this event is created when the first data transfer
    /// from host to device is issued, and it is only used for H2D. It is not
    /// used for data transfer in the other direction (device to host). It is
    /// still unclear whether we need it for D2H. If in the future we need a
    /// similar mechanism for D2H, and if the event cannot be shared between
    /// them, Event should be written as <tt>void *Event[2]</tt>.
    void *Event = nullptr;

    /// The id of the thread responsible for deleting this entry. This thread
    /// set the reference count to zero *last*. Other threads might reuse the
    /// entry while it is marked for deletion but not yet deleted (e.g., the
    /// data is still being moved back). If another thread reuses the entry we
    /// will have a non-zero reference count *or* the thread will have changed
    /// this id, effectively taking over deletion responsibility.
    std::thread::id DeleteThreadId;
  };
  // When HostDataToTargetTy is used by std::set, std::set::iterator is const;
  // use a unique_ptr to make States mutable.
  const std::unique_ptr<StatesTy> States;

public:
  HostDataToTargetTy(uintptr_t BP, uintptr_t B, uintptr_t E, uintptr_t TB,
                     bool UseHoldRefCount, map_var_info_t Name = nullptr,
                     bool IsINF = false)
      : HstPtrBase(BP), HstPtrBegin(B), HstPtrEnd(E), HstPtrName(Name),
        TgtPtrBegin(TB),
        States(std::make_unique<StatesTy>(UseHoldRefCount ? 0
                                          : IsINF         ? INFRefCount
                                                          : 1,
                                          !UseHoldRefCount ? 0
                                          : IsINF          ? INFRefCount
                                                           : 1)) {}
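  /// Illustrative note (not part of the upstream header): the nested
  /// conditionals above encode the initial counter values. For hypothetical
  /// addresses Base, Begin, End and TgtBegin:
  /// \code
  ///   // Dynamic mapping:            DynRefCount = 1,   HoldRefCount = 0
  ///   HostDataToTargetTy E1(Base, Begin, End, TgtBegin,
  ///                         /*UseHoldRefCount=*/false);
  ///   // 'ompx_hold' mapping:        DynRefCount = 0,   HoldRefCount = 1
  ///   HostDataToTargetTy E2(Base, Begin, End, TgtBegin,
  ///                         /*UseHoldRefCount=*/true);
  ///   // "Infinite" dynamic mapping: DynRefCount = INF, HoldRefCount = 0
  ///   HostDataToTargetTy E3(Base, Begin, End, TgtBegin,
  ///                         /*UseHoldRefCount=*/false, /*Name=*/nullptr,
  ///                         /*IsINF=*/true);
  /// \endcode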
  /// Get the total reference count. This is smarter than just
  /// getDynRefCount() + getHoldRefCount() because it handles the case where
  /// at least one is infinity and the other is non-zero.
  uint64_t getTotalRefCount() const {
    if (States->DynRefCount == INFRefCount ||
        States->HoldRefCount == INFRefCount)
      return INFRefCount;
    return States->DynRefCount + States->HoldRefCount;
  }

  /// Get the dynamic reference count.
  uint64_t getDynRefCount() const { return States->DynRefCount; }

  /// Get the hold reference count.
  uint64_t getHoldRefCount() const { return States->HoldRefCount; }

  /// Get the event bound to this data map.
  void *getEvent() const { return States->Event; }

  /// Add a new event, if necessary.
  /// Returns OFFLOAD_FAIL if something went wrong, OFFLOAD_SUCCESS otherwise.
  int addEventIfNecessary(DeviceTy &Device, AsyncInfoTy &AsyncInfo) const;

  /// Indicate that the current thread expects to delete this entry.
  void setDeleteThreadId() const {
    States->DeleteThreadId = std::this_thread::get_id();
  }

  /// Return the id of the thread expected to delete this entry.
  std::thread::id getDeleteThreadId() const { return States->DeleteThreadId; }

  /// Set the event bound to this data map.
  void setEvent(void *Event) const { States->Event = Event; }

  /// Reset the specified reference count unless it's infinity. Reset to 1
  /// (even if currently 0) so it can be followed by a decrement.
  void resetRefCount(bool UseHoldRefCount) const {
    uint64_t &ThisRefCount =
        UseHoldRefCount ? States->HoldRefCount : States->DynRefCount;
    if (ThisRefCount != INFRefCount)
      ThisRefCount = 1;
  }

  /// Increment the specified reference count unless it's infinity.
  void incRefCount(bool UseHoldRefCount) const {
    uint64_t &ThisRefCount =
        UseHoldRefCount ? States->HoldRefCount : States->DynRefCount;
    if (ThisRefCount != INFRefCount) {
      ++ThisRefCount;
      assert(ThisRefCount < INFRefCount && "refcount overflow");
    }
  }

  /// Decrement the specified reference count unless it's infinity or zero, and
  /// return the total reference count.
  uint64_t decRefCount(bool UseHoldRefCount) const {
    uint64_t &ThisRefCount =
        UseHoldRefCount ? States->HoldRefCount : States->DynRefCount;
    uint64_t OtherRefCount =
        UseHoldRefCount ? States->DynRefCount : States->HoldRefCount;
    (void)OtherRefCount;
    if (ThisRefCount != INFRefCount) {
      if (ThisRefCount > 0)
        --ThisRefCount;
      else
        assert(OtherRefCount >= 0 && "total refcount underflow");
    }
    return getTotalRefCount();
  }

  /// Is the dynamic (and thus the total) reference count infinite?
  bool isDynRefCountInf() const { return States->DynRefCount == INFRefCount; }

  /// Convert the dynamic reference count to a debug string.
  std::string dynRefCountToStr() const {
    return refCountToStr(States->DynRefCount);
  }

  /// Convert the hold reference count to a debug string.
  std::string holdRefCountToStr() const {
    return refCountToStr(States->HoldRefCount);
  }

  /// Should one decrement of the specified reference count (after resetting it
  /// if \c AfterReset) remove this mapping?
  bool decShouldRemove(bool UseHoldRefCount, bool AfterReset = false) const {
    uint64_t ThisRefCount =
        UseHoldRefCount ? States->HoldRefCount : States->DynRefCount;
    uint64_t OtherRefCount =
        UseHoldRefCount ? States->DynRefCount : States->HoldRefCount;
    if (OtherRefCount > 0)
      return false;
    if (AfterReset)
      return ThisRefCount != INFRefCount;
    return ThisRefCount == 1;
  }

  void setMayContainAttachedPointers() const {
    States->MayContainAttachedPointers = true;
  }
  bool getMayContainAttachedPointers() const {
    return States->MayContainAttachedPointers;
  }

  void lock() const { States->UpdateMtx.lock(); }

  void unlock() const { States->UpdateMtx.unlock(); }
};
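/// Illustrative sketch (not part of the upstream header) of how the two
/// reference counts of an entry evolve under the public interface above.
/// Base, Begin, End and TgtBegin are hypothetical addresses; the entry is
/// created as a dynamic mapping (UseHoldRefCount == false):
/// \code
///   HostDataToTargetTy Entry(Base, Begin, End, TgtBegin,
///                            /*UseHoldRefCount=*/false); // Dyn=1, Hold=0
///   Entry.incRefCount(/*UseHoldRefCount=*/true);          // Dyn=1, Hold=1
///   Entry.getTotalRefCount();                             // == 2
///   Entry.decShouldRemove(/*UseHoldRefCount=*/false);     // false: Hold > 0
///   Entry.decRefCount(/*UseHoldRefCount=*/true);          // Dyn=1, Hold=0; returns 1
///   Entry.decShouldRemove(/*UseHoldRefCount=*/false);     // true: Dyn == 1, Hold == 0
///   Entry.decRefCount(/*UseHoldRefCount=*/false);         // returns 0; entry removable
/// \endcode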
/// Wrapper around the HostDataToTargetTy to be used in the HDTT map. In
/// addition to the HDTT pointer we store the key value explicitly. This
/// allows the set to inspect (sort/search/...) this entry without an
/// additional load of HDTT. HDTT is a pointer to allow the modification of
/// the set without invalidating HDTT entries which can now be inspected at
/// the same time.
struct HostDataToTargetMapKeyTy {
  uintptr_t KeyValue;

  HostDataToTargetMapKeyTy(void *Key) : KeyValue(uintptr_t(Key)) {}
  HostDataToTargetMapKeyTy(HostDataToTargetTy *HDTT)
      : KeyValue(HDTT->HstPtrBegin), HDTT(HDTT) {}
  HostDataToTargetTy *HDTT;
};
inline bool operator<(const HostDataToTargetMapKeyTy &LHS,
                      const uintptr_t &RHS) {
  return LHS.KeyValue < RHS;
}
inline bool operator<(const uintptr_t &LHS,
                      const HostDataToTargetMapKeyTy &RHS) {
  return LHS < RHS.KeyValue;
}
inline bool operator<(const HostDataToTargetMapKeyTy &LHS,
                      const HostDataToTargetMapKeyTy &RHS) {
  return LHS.KeyValue < RHS.KeyValue;
}
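/// Illustrative sketch (not part of the upstream header): together with
/// std::less<>, the heterogeneous comparisons above let the HDTT set be
/// searched by a raw host address without materializing a full entry.
/// NewEntry and HstPtrBegin are hypothetical:
/// \code
///   std::set<HostDataToTargetMapKeyTy, std::less<>> Map;
///   Map.emplace(NewEntry);                 // keyed by NewEntry->HstPtrBegin
///   auto It = Map.lower_bound((uintptr_t)HstPtrBegin); // no temporary key
/// \endcode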
struct LookupResult {
  struct {
    unsigned IsContained : 1;
    unsigned ExtendsBefore : 1;
    unsigned ExtendsAfter : 1;
  } Flags;

  /// The corresponding map table entry which is stable.
  HostDataToTargetTy *Entry = nullptr;

  LookupResult() : Flags({0, 0, 0}), Entry() {}
};

/// This struct will be returned by \p DeviceTy::getTargetPointer, which
/// provides more data than just a target pointer.
struct TargetPointerResultTy {
  struct {
    /// If the map table entry is just created
    unsigned IsNewEntry : 1;
    /// If the pointer is actually a host pointer (when unified memory enabled)
    unsigned IsHostPointer : 1;
  } Flags = {0, 0};

  /// The corresponding map table entry which is stable.
  HostDataToTargetTy *Entry = nullptr;

  /// The corresponding target pointer
  void *TargetPointer = nullptr;
};

/// Map for shadow pointers
struct ShadowPtrValTy {
  void *HstPtrVal;
  void *TgtPtrAddr;
  void *TgtPtrVal;
};
typedef std::map<void *, ShadowPtrValTy> ShadowPtrListTy;

/// Pending constructor and destructor lists for a single binary descriptor.
struct PendingCtorDtorListsTy {
  std::list<void *> PendingCtors;
  std::list<void *> PendingDtors;
};
typedef std::map<__tgt_bin_desc *, PendingCtorDtorListsTy>
    PendingCtorsDtorsPerLibrary;

struct DeviceTy {
  int32_t DeviceID;
  RTLInfoTy *RTL;
  int32_t RTLDeviceID;

  bool IsInit;
  std::once_flag InitFlag;
  bool HasPendingGlobals;

  /// Host data to device map type with a wrapper key indirection that allows
  /// concurrent modification of the entries without invalidating the
  /// underlying entries.
  using HostDataToTargetListTy =
      std::set<HostDataToTargetMapKeyTy, std::less<>>;

  /// The HDTTMap is a protected object that can only be accessed by one
  /// thread at a time.
  ProtectedObj<HostDataToTargetListTy> HostDataToTargetMap;

  /// The type used to access the HDTT map.
  using HDTTMapAccessorTy = decltype(HostDataToTargetMap)::AccessorTy;

  PendingCtorsDtorsPerLibrary PendingCtorsDtors;

  ShadowPtrListTy ShadowPtrMap;

  std::mutex PendingGlobalsMtx, ShadowMtx;

  // NOTE: Once libomp gains full target-task support, this state should be
  // moved into the target task in libomp.
  std::map<int32_t, uint64_t> LoopTripCnt;

  DeviceTy(RTLInfoTy *RTL);
  // DeviceTy is not copyable.
  DeviceTy(const DeviceTy &D) = delete;
  DeviceTy &operator=(const DeviceTy &D) = delete;

  ~DeviceTy();

  // Return true if data can be copied to DstDevice directly.
  bool isDataExchangable(const DeviceTy &DstDevice);

  /// Lookup the mapping of \p HstPtrBegin in \p HDTTMap. The accessor ensures
  /// exclusive access to the HDTT map.
  LookupResult lookupMapping(HDTTMapAccessorTy &HDTTMap, void *HstPtrBegin,
                             int64_t Size);

  /// Get the target pointer based on host pointer begin and base. If the
  /// mapping already exists, the target pointer will be returned directly. In
  /// addition, if required, the memory region pointed to by \p HstPtrBegin of
  /// size \p Size will also be transferred to the device. If the mapping
  /// doesn't exist, and if unified shared memory is not enabled, a new mapping
  /// will be created and the data will also be transferred accordingly.
  /// nullptr will be returned for any of the following reasons:
  /// - Data allocation failed;
  /// - The user tried to do an illegal mapping;
  /// - Issuing the data transfer failed.
  TargetPointerResultTy
  getTargetPointer(void *HstPtrBegin, void *HstPtrBase, int64_t Size,
                   map_var_info_t HstPtrName, bool HasFlagTo,
                   bool HasFlagAlways, bool IsImplicit, bool UpdateRefCount,
                   bool HasCloseModifier, bool HasPresentModifier,
                   bool HasHoldModifier, AsyncInfoTy &AsyncInfo);

  /// Return the target pointer for \p HstPtrBegin in \p HDTTMap. The accessor
  /// ensures exclusive access to the HDTT map.
  void *getTgtPtrBegin(HDTTMapAccessorTy &HDTTMap, void *HstPtrBegin,
                       int64_t Size);

  TargetPointerResultTy getTgtPtrBegin(void *HstPtrBegin, int64_t Size,
                                       bool &IsLast, bool UpdateRefCount,
                                       bool UseHoldRefCount, bool &IsHostPtr,
                                       bool MustContain = false,
                                       bool ForceDelete = false);

  /// Deallocate \p LR and remove the entry. Assume the total reference count
  /// is zero and the calling thread is the deleting thread for \p LR.
  /// \p HDTTMap ensures the caller holds exclusive access and can modify the
  /// map. Return \c OFFLOAD_SUCCESS if the map entry existed, and return
  /// \c OFFLOAD_FAIL if not. It is the caller's responsibility to skip calling
  /// this function if the map entry is not expected to exist because
  /// \p HstPtrBegin uses shared memory.
  int deallocTgtPtr(HDTTMapAccessorTy &HDTTMap, LookupResult LR, int64_t Size);
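  /// Illustrative sketch (not part of the upstream header) of how the
  /// accessor-taking interfaces above are meant to be used, assuming
  /// ProtectedObj exposes getExclusiveAccessor() (see ExclusiveAccess.h);
  /// Device, HstPtrBegin and Size are hypothetical:
  /// \code
  ///   // Acquire exclusive access to the map for the lifetime of HDTTMap.
  ///   DeviceTy::HDTTMapAccessorTy HDTTMap =
  ///       Device.HostDataToTargetMap.getExclusiveAccessor();
  ///   LookupResult LR = Device.lookupMapping(HDTTMap, HstPtrBegin, Size);
  ///   if (LR.Flags.IsContained) {
  ///     void *TgtPtr = Device.getTgtPtrBegin(HDTTMap, HstPtrBegin, Size);
  ///     // ... use TgtPtr while still holding the accessor ...
  ///   }
  ///   // Exclusive access ends when HDTTMap goes out of scope.
  /// \endcode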
  int associatePtr(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size);
  int disassociatePtr(void *HstPtrBegin);

  // Calls to the RTL.
  int32_t initOnce();
  __tgt_target_table *loadBinary(void *Img);

  // Device memory allocation/deallocation routines.
  /// Allocates \p Size bytes on the device, host or shared memory space
  /// (depending on \p Kind) and returns the address on success or nullptr on
  /// failure. \p HstPtr is an address of the host data with which the
  /// allocated target data will be associated; if it is unknown, the default
  /// value of \p HstPtr is nullptr. Note: this function doesn't do pointer
  /// association. In fact, all the __tgt_rtl_data_alloc implementations
  /// ignore \p HstPtr. \p Kind dictates what allocator should be used (host,
  /// shared, device).
  void *allocData(int64_t Size, void *HstPtr = nullptr,
                  int32_t Kind = TARGET_ALLOC_DEFAULT);
  /// Deallocates the memory that \p TgtPtrBegin points at and returns
  /// OFFLOAD_SUCCESS on success or OFFLOAD_FAIL on failure.
  int32_t deleteData(void *TgtPtrBegin);

  // Data transfer. When \p AsyncInfo does not provide an asynchronous queue,
  // the transfer will be synchronous.
  // Copy data from host to device.
  int32_t submitData(void *TgtPtrBegin, void *HstPtrBegin, int64_t Size,
                     AsyncInfoTy &AsyncInfo);
  // Copy data from device back to host.
  int32_t retrieveData(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size,
                       AsyncInfoTy &AsyncInfo);
  // Copy data from the current device directly to the destination device.
  int32_t dataExchange(void *SrcPtr, DeviceTy &DstDev, void *DstPtr,
                       int64_t Size, AsyncInfoTy &AsyncInfo);

  int32_t runRegion(void *TgtEntryPtr, void **TgtVarsPtr, ptrdiff_t *TgtOffsets,
                    int32_t TgtVarsSize, AsyncInfoTy &AsyncInfo);
  int32_t runTeamRegion(void *TgtEntryPtr, void **TgtVarsPtr,
                        ptrdiff_t *TgtOffsets, int32_t TgtVarsSize,
                        int32_t NumTeams, int32_t ThreadLimit,
                        uint64_t LoopTripCount, AsyncInfoTy &AsyncInfo);

  /// Synchronize the device/queue/event based on \p AsyncInfo and return
  /// OFFLOAD_SUCCESS on success or OFFLOAD_FAIL on failure.
  int32_t synchronize(AsyncInfoTy &AsyncInfo);

  /// Calls the corresponding print routine in the \p RTLDevID device RTL to
  /// obtain the information of the specific device.
  bool printDeviceInfo(int32_t RTLDevID);

  /// Event related interfaces.
  /// {
  /// Create an event.
  int32_t createEvent(void **Event);

  /// Record the event based on the status of \p AsyncInfo's queue at the
  /// moment the function is called.
  int32_t recordEvent(void *Event, AsyncInfoTy &AsyncInfo);

  /// Wait for an event. This function can be blocking or non-blocking,
  /// depending on the implementation. It is expected to set a dependence on
  /// the event such that corresponding operations shall only start once the
  /// event is fulfilled.
  int32_t waitEvent(void *Event, AsyncInfoTy &AsyncInfo);

  /// Synchronize the event. It is expected to block the thread.
  int32_t syncEvent(void *Event);

  /// Destroy the event.
  int32_t destroyEvent(void *Event);
  /// }
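  /// Illustrative sketch (not part of the upstream header) of the intended
  /// event flow through the interfaces above, with error handling omitted;
  /// Dev and AsyncInfo are hypothetical, already set up objects:
  /// \code
  ///   void *Event = nullptr;
  ///   Dev.createEvent(&Event);            // allocate a plugin event
  ///   Dev.recordEvent(Event, AsyncInfo);  // snapshot the queue's state
  ///   Dev.waitEvent(Event, AsyncInfo);    // make later operations depend on it
  ///   Dev.syncEvent(Event);               // or block until it is fulfilled
  ///   Dev.destroyEvent(Event);            // release it
  /// \endcode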
private:
  // Call to the RTL.
  void init(); // To be called only via DeviceTy::initOnce()

  /// Deinitialize the device (and plugin).
  void deinit();
};

extern bool deviceIsReady(int DeviceNum);

/// Struct for the data required to handle plugins
struct PluginManager {
  PluginManager(bool UseEventsForAtomicTransfers)
      : UseEventsForAtomicTransfers(UseEventsForAtomicTransfers) {}

  /// RTLs identified on the host
  RTLsTy RTLs;

  /// Executable images and information extracted from the input images passed
  /// to the runtime.
  std::list<std::pair<__tgt_device_image, __tgt_image_info>> Images;

  /// Devices associated with RTLs
  std::vector<std::unique_ptr<DeviceTy>> Devices;
  std::mutex RTLsMtx; ///< For RTLs and Devices

  /// Translation table retrieved from the binary
  HostEntriesBeginToTransTableTy HostEntriesBeginToTransTable;
  std::mutex TrlTblMtx; ///< For Translation Table
  /// Host offload entries in order of image registration
  std::vector<__tgt_offload_entry *> HostEntriesBeginRegistrationOrder;

  /// Map from host pointers to an entry in the Translation Table
  HostPtrToTableMapTy HostPtrToTableMap;
  std::mutex TblMapMtx; ///< For HostPtrToTableMap

  // Store target policy (disabled, mandatory, default)
  kmp_target_offload_kind_t TargetOffloadPolicy = tgt_default;
  std::mutex TargetOffloadMtx; ///< For TargetOffloadPolicy

  /// Flag to indicate whether we use events to ensure the atomicity of
  /// map clauses. Can be modified with an environment variable.
  const bool UseEventsForAtomicTransfers;
};

extern PluginManager *PM;

#endif