1 //===----------- device.h - Target independent OpenMP target RTL ----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // Declarations for managing devices that are handled by RTL plugins.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #ifndef _OMPTARGET_DEVICE_H
14 #define _OMPTARGET_DEVICE_H
15 
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <list>
#include <map>
#include <memory>
#include <mutex>
#include <set>
#include <string>
#include <thread>
#include <utility>
#include <vector>
26 
27 #include "ExclusiveAccess.h"
28 #include "omptarget.h"
29 #include "rtl.h"
30 
// Forward declarations. This header only uses pointers to these types
// (RTLInfoTy *, __tgt_bin_desc *, __tgt_target_table *), so their full
// definitions are not required here.
struct RTLInfoTy;
struct __tgt_bin_desc;
struct __tgt_target_table;

// Opaque handle used for the optional source name of a mapped variable (see
// HostDataToTargetTy::HstPtrName).
using map_var_info_t = void *;
37 
// Possible settings of OMP_TARGET_OFFLOAD. The numeric values are part of the
// contract: keep them in sync with the definition in kmp.h.
enum kmp_target_offload_kind {
  tgt_disabled = 0,
  tgt_default = 1,
  tgt_mandatory = 2
};
// Convenience alias so the enum can be named without the `enum` keyword.
using kmp_target_offload_kind_t = kmp_target_offload_kind;
45 
46 /// Map between host data and target data.
47 struct HostDataToTargetTy {
48   const uintptr_t HstPtrBase; // host info.
49   const uintptr_t HstPtrBegin;
50   const uintptr_t HstPtrEnd;       // non-inclusive.
51   const map_var_info_t HstPtrName; // Optional source name of mapped variable.
52 
53   const uintptr_t TgtPtrBegin; // target info.
54 
55 private:
56   static const uint64_t INFRefCount = ~(uint64_t)0;
refCountToStrHostDataToTargetTy57   static std::string refCountToStr(uint64_t RefCount) {
58     return RefCount == INFRefCount ? "INF" : std::to_string(RefCount);
59   }
60 
61   struct StatesTy {
StatesTyHostDataToTargetTy::StatesTy62     StatesTy(uint64_t DRC, uint64_t HRC)
63         : DynRefCount(DRC), HoldRefCount(HRC),
64           MayContainAttachedPointers(false), DeleteThreadId(std::thread::id()) {
65     }
66     /// The dynamic reference count is the standard reference count as of OpenMP
67     /// 4.5.  The hold reference count is an OpenMP extension for the sake of
68     /// OpenACC support.
69     ///
70     /// The 'ompx_hold' map type modifier is permitted only on "omp target" and
71     /// "omp target data", and "delete" is permitted only on "omp target exit
72     /// data" and associated runtime library routines.  As a result, we really
73     /// need to implement "reset" functionality only for the dynamic reference
74     /// counter.  Likewise, only the dynamic reference count can be infinite
75     /// because, for example, omp_target_associate_ptr and "omp declare target
76     /// link" operate only on it.  Nevertheless, it's actually easier to follow
77     /// the code (and requires less assertions for special cases) when we just
78     /// implement these features generally across both reference counters here.
79     /// Thus, it's the users of this class that impose those restrictions.
80     ///
81     uint64_t DynRefCount;
82     uint64_t HoldRefCount;
83 
84     /// Boolean flag to remember if any subpart of the mapped region might be
85     /// an attached pointer.
86     bool MayContainAttachedPointers;
87 
88     /// This mutex will be locked when data movement is issued. For targets that
89     /// doesn't support async data movement, this mutex can guarantee that after
90     /// it is released, memory region on the target is update to date. For
91     /// targets that support async data movement, this can guarantee that data
92     /// movement has been issued. This mutex *must* be locked right before
93     /// releasing the mapping table lock.
94     std::mutex UpdateMtx;
95     /// Pointer to the event corresponding to the data update of this map.
96     /// Note: At present this event is created when the first data transfer from
97     /// host to device is issued, and only being used for H2D. It is not used
98     /// for data transfer in another direction (device to host). It is still
99     /// unclear whether we need it for D2H. If in the future we need similar
100     /// mechanism for D2H, and if the event cannot be shared between them, Event
101     /// should be written as <tt>void *Event[2]</tt>.
102     void *Event = nullptr;
103 
104     /// The id of the thread responsible for deleting this entry. This thread
105     /// set the reference count to zero *last*. Other threads might reuse the
106     /// entry while it is marked for deletion but not yet deleted (e.g., the
107     /// data is still being moved back). If another thread reuses the entry we
108     /// will have a non-zero reference count *or* the thread will have changed
109     /// this id, effectively taking over deletion responsibility.
110     std::thread::id DeleteThreadId;
111   };
112   // When HostDataToTargetTy is used by std::set, std::set::iterator is const
113   // use unique_ptr to make States mutable.
114   const std::unique_ptr<StatesTy> States;
115 
116 public:
117   HostDataToTargetTy(uintptr_t BP, uintptr_t B, uintptr_t E, uintptr_t TB,
118                      bool UseHoldRefCount, map_var_info_t Name = nullptr,
119                      bool IsINF = false)
HstPtrBaseHostDataToTargetTy120       : HstPtrBase(BP), HstPtrBegin(B), HstPtrEnd(E), HstPtrName(Name),
121         TgtPtrBegin(TB), States(std::make_unique<StatesTy>(UseHoldRefCount ? 0
122                                                            : IsINF ? INFRefCount
123                                                                    : 1,
124                                                            !UseHoldRefCount ? 0
125                                                            : IsINF ? INFRefCount
126                                                                    : 1)) {}
127 
128   /// Get the total reference count.  This is smarter than just getDynRefCount()
129   /// + getHoldRefCount() because it handles the case where at least one is
130   /// infinity and the other is non-zero.
getTotalRefCountHostDataToTargetTy131   uint64_t getTotalRefCount() const {
132     if (States->DynRefCount == INFRefCount ||
133         States->HoldRefCount == INFRefCount)
134       return INFRefCount;
135     return States->DynRefCount + States->HoldRefCount;
136   }
137 
138   /// Get the dynamic reference count.
getDynRefCountHostDataToTargetTy139   uint64_t getDynRefCount() const { return States->DynRefCount; }
140 
141   /// Get the hold reference count.
getHoldRefCountHostDataToTargetTy142   uint64_t getHoldRefCount() const { return States->HoldRefCount; }
143 
144   /// Get the event bound to this data map.
getEventHostDataToTargetTy145   void *getEvent() const { return States->Event; }
146 
147   /// Add a new event, if necessary.
148   /// Returns OFFLOAD_FAIL if something went wrong, OFFLOAD_SUCCESS otherwise.
149   int addEventIfNecessary(DeviceTy &Device, AsyncInfoTy &AsyncInfo) const;
150 
151   /// Indicate that the current thread expected to delete this entry.
setDeleteThreadIdHostDataToTargetTy152   void setDeleteThreadId() const {
153     States->DeleteThreadId = std::this_thread::get_id();
154   }
155 
156   /// Return the thread id of the thread expected to delete this entry.
getDeleteThreadIdHostDataToTargetTy157   std::thread::id getDeleteThreadId() const { return States->DeleteThreadId; }
158 
159   /// Set the event bound to this data map.
setEventHostDataToTargetTy160   void setEvent(void *Event) const { States->Event = Event; }
161 
162   /// Reset the specified reference count unless it's infinity.  Reset to 1
163   /// (even if currently 0) so it can be followed by a decrement.
resetRefCountHostDataToTargetTy164   void resetRefCount(bool UseHoldRefCount) const {
165     uint64_t &ThisRefCount =
166         UseHoldRefCount ? States->HoldRefCount : States->DynRefCount;
167     if (ThisRefCount != INFRefCount)
168       ThisRefCount = 1;
169   }
170 
171   /// Increment the specified reference count unless it's infinity.
incRefCountHostDataToTargetTy172   void incRefCount(bool UseHoldRefCount) const {
173     uint64_t &ThisRefCount =
174         UseHoldRefCount ? States->HoldRefCount : States->DynRefCount;
175     if (ThisRefCount != INFRefCount) {
176       ++ThisRefCount;
177       assert(ThisRefCount < INFRefCount && "refcount overflow");
178     }
179   }
180 
181   /// Decrement the specified reference count unless it's infinity or zero, and
182   /// return the total reference count.
decRefCountHostDataToTargetTy183   uint64_t decRefCount(bool UseHoldRefCount) const {
184     uint64_t &ThisRefCount =
185         UseHoldRefCount ? States->HoldRefCount : States->DynRefCount;
186     uint64_t OtherRefCount =
187         UseHoldRefCount ? States->DynRefCount : States->HoldRefCount;
188     (void)OtherRefCount;
189     if (ThisRefCount != INFRefCount) {
190       if (ThisRefCount > 0)
191         --ThisRefCount;
192       else
193         assert(OtherRefCount >= 0 && "total refcount underflow");
194     }
195     return getTotalRefCount();
196   }
197 
198   /// Is the dynamic (and thus the total) reference count infinite?
isDynRefCountInfHostDataToTargetTy199   bool isDynRefCountInf() const { return States->DynRefCount == INFRefCount; }
200 
201   /// Convert the dynamic reference count to a debug string.
dynRefCountToStrHostDataToTargetTy202   std::string dynRefCountToStr() const {
203     return refCountToStr(States->DynRefCount);
204   }
205 
206   /// Convert the hold reference count to a debug string.
holdRefCountToStrHostDataToTargetTy207   std::string holdRefCountToStr() const {
208     return refCountToStr(States->HoldRefCount);
209   }
210 
211   /// Should one decrement of the specified reference count (after resetting it
212   /// if \c AfterReset) remove this mapping?
213   bool decShouldRemove(bool UseHoldRefCount, bool AfterReset = false) const {
214     uint64_t ThisRefCount =
215         UseHoldRefCount ? States->HoldRefCount : States->DynRefCount;
216     uint64_t OtherRefCount =
217         UseHoldRefCount ? States->DynRefCount : States->HoldRefCount;
218     if (OtherRefCount > 0)
219       return false;
220     if (AfterReset)
221       return ThisRefCount != INFRefCount;
222     return ThisRefCount == 1;
223   }
224 
setMayContainAttachedPointersHostDataToTargetTy225   void setMayContainAttachedPointers() const {
226     States->MayContainAttachedPointers = true;
227   }
getMayContainAttachedPointersHostDataToTargetTy228   bool getMayContainAttachedPointers() const {
229     return States->MayContainAttachedPointers;
230   }
231 
lockHostDataToTargetTy232   void lock() const { States->UpdateMtx.lock(); }
233 
unlockHostDataToTargetTy234   void unlock() const { States->UpdateMtx.unlock(); }
235 };
236 
237 /// Wrapper around the HostDataToTargetTy to be used in the HDTT map. In
238 /// addition to the HDTT pointer we store the key value explicitly. This
239 /// allows the set to inspect (sort/search/...) this entry without an additional
240 /// load of HDTT. HDTT is a pointer to allow the modification of the set without
241 /// invalidating HDTT entries which can now be inspected at the same time.
242 struct HostDataToTargetMapKeyTy {
243   uintptr_t KeyValue;
244 
HostDataToTargetMapKeyTyHostDataToTargetMapKeyTy245   HostDataToTargetMapKeyTy(void *Key) : KeyValue(uintptr_t(Key)) {}
HostDataToTargetMapKeyTyHostDataToTargetMapKeyTy246   HostDataToTargetMapKeyTy(HostDataToTargetTy *HDTT)
247       : KeyValue(HDTT->HstPtrBegin), HDTT(HDTT) {}
248   HostDataToTargetTy *HDTT;
249 };
250 inline bool operator<(const HostDataToTargetMapKeyTy &LHS,
251                       const uintptr_t &RHS) {
252   return LHS.KeyValue < RHS;
253 }
254 inline bool operator<(const uintptr_t &LHS,
255                       const HostDataToTargetMapKeyTy &RHS) {
256   return LHS < RHS.KeyValue;
257 }
258 inline bool operator<(const HostDataToTargetMapKeyTy &LHS,
259                       const HostDataToTargetMapKeyTy &RHS) {
260   return LHS.KeyValue < RHS.KeyValue;
261 }
262 
263 struct LookupResult {
264   struct {
265     unsigned IsContained : 1;
266     unsigned ExtendsBefore : 1;
267     unsigned ExtendsAfter : 1;
268   } Flags;
269 
270   /// The corresponding map table entry which is stable.
271   HostDataToTargetTy *Entry = nullptr;
272 
LookupResultLookupResult273   LookupResult() : Flags({0, 0, 0}), Entry() {}
274 };
275 
276 /// This struct will be returned by \p DeviceTy::getTargetPointer which provides
277 /// more data than just a target pointer.
278 struct TargetPointerResultTy {
279   struct {
280     /// If the map table entry is just created
281     unsigned IsNewEntry : 1;
282     /// If the pointer is actually a host pointer (when unified memory enabled)
283     unsigned IsHostPointer : 1;
284   } Flags = {0, 0};
285 
286   /// The corresponding map table entry which is stable.
287   HostDataToTargetTy *Entry = nullptr;
288 
289   /// The corresponding target pointer
290   void *TargetPointer = nullptr;
291 };
292 
/// Value type of the shadow pointer map.
// NOTE(review): judging by the names, the fields hold the pointer value on the
// host, the address of the pointer on the target, and the pointer value on the
// target -- confirm against the code that fills the map.
struct ShadowPtrValTy {
  void *HstPtrVal;
  void *TgtPtrAddr;
  void *TgtPtrVal;
};
/// Map for shadow pointers, keyed by a raw pointer.
using ShadowPtrListTy = std::map<void *, ShadowPtrValTy>;
300 
301 ///
302 struct PendingCtorDtorListsTy {
303   std::list<void *> PendingCtors;
304   std::list<void *> PendingDtors;
305 };
306 typedef std::map<__tgt_bin_desc *, PendingCtorDtorListsTy>
307     PendingCtorsDtorsPerLibrary;
308 
/// State kept for one device: which RTL plugin handles it, its host-to-target
/// mapping table, and the wrappers used to talk to the plugin.
struct DeviceTy {
  // NOTE(review): presumably DeviceID is the runtime-wide device number and
  // RTLDeviceID the index of this device within its RTL -- confirm.
  int32_t DeviceID;
  /// The RTL plugin this device belongs to.
  RTLInfoTy *RTL;
  int32_t RTLDeviceID;

  // NOTE(review): InitFlag presumably makes initOnce() run init() exactly
  // once, with IsInit recording the outcome -- confirm in the implementation.
  bool IsInit;
  std::once_flag InitFlag;
  bool HasPendingGlobals;

  /// Host data to device map type with a wrapper key indirection that allows
  /// concurrent modification of the entries without invalidating the underlying
  /// entries.
  using HostDataToTargetListTy =
      std::set<HostDataToTargetMapKeyTy, std::less<>>;

  /// The HDTTMap is a protected object that can only be accessed by one thread
  /// at a time.
  ProtectedObj<HostDataToTargetListTy> HostDataToTargetMap;

  /// The type used to access the HDTT map.
  using HDTTMapAccessorTy = decltype(HostDataToTargetMap)::AccessorTy;

  /// Pending constructors/destructors, grouped per binary descriptor.
  PendingCtorsDtorsPerLibrary PendingCtorsDtors;

  /// Map for shadow pointers.
  ShadowPtrListTy ShadowPtrMap;

  // NOTE(review): presumably PendingGlobalsMtx guards the pending
  // ctor/dtor state and ShadowMtx guards ShadowPtrMap -- confirm.
  std::mutex PendingGlobalsMtx, ShadowMtx;

  // NOTE: Once libomp gains full target-task support, this state should be
  // moved into the target task in libomp.
  std::map<int32_t, uint64_t> LoopTripCnt;

  DeviceTy(RTLInfoTy *RTL);
  // DeviceTy is not copyable
  DeviceTy(const DeviceTy &D) = delete;
  DeviceTy &operator=(const DeviceTy &D) = delete;

  ~DeviceTy();

  // Return true if data can be copied to DstDevice directly
  bool isDataExchangable(const DeviceTy &DstDevice);

  /// Lookup the mapping of \p HstPtrBegin in \p HDTTMap. The accessor ensures
  /// exclusive access to the HDTT map.
  LookupResult lookupMapping(HDTTMapAccessorTy &HDTTMap, void *HstPtrBegin,
                             int64_t Size);

  /// Get the target pointer based on host pointer begin and base. If the
  /// mapping already exists, the target pointer will be returned directly. In
  /// addition, if required, the memory region pointed by \p HstPtrBegin of size
  /// \p Size will also be transferred to the device. If the mapping doesn't
  /// exist, and if unified shared memory is not enabled, a new mapping will be
  /// created and the data will also be transferred accordingly. nullptr will be
  /// returned because of any of following reasons:
  /// - Data allocation failed;
  /// - The user tried to do an illegal mapping;
  /// - Data transfer issue fails.
  TargetPointerResultTy
  getTargetPointer(void *HstPtrBegin, void *HstPtrBase, int64_t Size,
                   map_var_info_t HstPtrName, bool HasFlagTo,
                   bool HasFlagAlways, bool IsImplicit, bool UpdateRefCount,
                   bool HasCloseModifier, bool HasPresentModifier,
                   bool HasHoldModifier, AsyncInfoTy &AsyncInfo);

  /// Return the target pointer for \p HstPtrBegin in \p HDTTMap. The accessor
  /// ensures exclusive access to the HDTT map.
  void *getTgtPtrBegin(HDTTMapAccessorTy &HDTTMap, void *HstPtrBegin,
                       int64_t Size);

  /// Overload that takes no accessor and reports additional information
  /// through the \p IsLast and \p IsHostPtr out-parameters.
  // NOTE(review): the exact meaning of IsLast, MustContain and ForceDelete is
  // not visible in this header -- confirm against the implementation.
  TargetPointerResultTy getTgtPtrBegin(void *HstPtrBegin, int64_t Size,
                                       bool &IsLast, bool UpdateRefCount,
                                       bool UseHoldRefCount, bool &IsHostPtr,
                                       bool MustContain = false,
                                       bool ForceDelete = false);

  /// Deallocate \p LR and remove the entry. Assume the total reference count is
  /// zero and the calling thread is the deleting thread for \p LR. \p HDTTMap
  /// ensures the caller holds exclusive access and can modify the map. Return
  /// \c OFFLOAD_SUCCESS if the map entry existed, and return \c OFFLOAD_FAIL if
  /// not. It is the caller's responsibility to skip calling this function if
  /// the map entry is not expected to exist because \p HstPtrBegin uses shared
  /// memory.
  int deallocTgtPtr(HDTTMapAccessorTy &HDTTMap, LookupResult LR, int64_t Size);

  // NOTE(review): presumably these back omp_target_associate_ptr and
  // omp_target_disassociate_ptr -- confirm.
  int associatePtr(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size);
  int disassociatePtr(void *HstPtrBegin);

  // calls to RTL
  int32_t initOnce();
  __tgt_target_table *loadBinary(void *Img);

  // device memory allocation/deallocation routines
  /// Allocates \p Size bytes on the device, host or shared memory space
  /// (depending on \p Kind) and returns the address/nullptr when
  /// succeeds/fails. \p HstPtr is an address of the host data which the
  /// allocated target data will be associated with. If it is unknown, the
  /// default value of \p HstPtr is nullptr. Note: this function doesn't do
  /// pointer association. Actually, all the __tgt_rtl_data_alloc
  /// implementations ignore \p HstPtr. \p Kind dictates what allocator should
  /// be used (host, shared, device).
  void *allocData(int64_t Size, void *HstPtr = nullptr,
                  int32_t Kind = TARGET_ALLOC_DEFAULT);
  /// Deallocates memory which \p TgtPtrBegin points at and returns
  /// OFFLOAD_SUCCESS/OFFLOAD_FAIL when succeeds/fails.
  int32_t deleteData(void *TgtPtrBegin);

  // Data transfer.
  // NOTE(review): an earlier wording said the transfer is synchronous "when
  // AsyncInfo is nullptr"; AsyncInfo is taken by reference here and so cannot
  // be null. Confirm how synchronous transfers are requested.
  // Copy data from host to device
  int32_t submitData(void *TgtPtrBegin, void *HstPtrBegin, int64_t Size,
                     AsyncInfoTy &AsyncInfo);
  // Copy data from device back to host
  int32_t retrieveData(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size,
                       AsyncInfoTy &AsyncInfo);
  // Copy data from current device to destination device directly
  int32_t dataExchange(void *SrcPtr, DeviceTy &DstDev, void *DstPtr,
                       int64_t Size, AsyncInfoTy &AsyncInfo);

  // Kernel launch wrappers. runTeamRegion additionally takes the number of
  // teams, the thread limit and the loop trip count.
  int32_t runRegion(void *TgtEntryPtr, void **TgtVarsPtr, ptrdiff_t *TgtOffsets,
                    int32_t TgtVarsSize, AsyncInfoTy &AsyncInfo);
  int32_t runTeamRegion(void *TgtEntryPtr, void **TgtVarsPtr,
                        ptrdiff_t *TgtOffsets, int32_t TgtVarsSize,
                        int32_t NumTeams, int32_t ThreadLimit,
                        uint64_t LoopTripCount, AsyncInfoTy &AsyncInfo);

  /// Synchronize device/queue/event based on \p AsyncInfo and return
  /// OFFLOAD_SUCCESS/OFFLOAD_FAIL when succeeds/fails.
  int32_t synchronize(AsyncInfoTy &AsyncInfo);

  /// Calls the corresponding print in the \p RTLDevID
  /// device RTL to obtain the information of the specific device.
  bool printDeviceInfo(int32_t RTLDevID);

  /// Event related interfaces.
  /// {
  /// Create an event.
  int32_t createEvent(void **Event);

  /// Record the event based on status in AsyncInfo->Queue at the moment the
  /// function is called.
  int32_t recordEvent(void *Event, AsyncInfoTy &AsyncInfo);

  /// Wait for an event. This function can be blocking or non-blocking,
  /// depending on the implementation. It is expected to set a dependence on the
  /// event such that corresponding operations shall only start once the event
  /// is fulfilled.
  int32_t waitEvent(void *Event, AsyncInfoTy &AsyncInfo);

  /// Synchronize the event. It is expected to block the thread.
  int32_t syncEvent(void *Event);

  /// Destroy the event.
  int32_t destroyEvent(void *Event);
  /// }

private:
  // Call to RTL
  void init(); // To be called only via DeviceTy::initOnce()

  /// Deinitialize the device (and plugin).
  void deinit();
};
471 
/// Declared here, defined in another translation unit.
// NOTE(review): presumably returns true when device \p DeviceNum has been
// initialized and can be used -- confirm against the definition.
extern bool deviceIsReady(int DeviceNum);
473 
/// Struct for the data required to handle plugins. Each mutex below is
/// annotated with the member(s) it is meant to protect; the struct itself does
/// no locking.
struct PluginManager {
  /// \param UseEventsForAtomicTransfers stored in the const member of the same
  ///        name; it cannot be changed after construction.
  PluginManager(bool UseEventsForAtomicTransfers)
      : UseEventsForAtomicTransfers(UseEventsForAtomicTransfers) {}

  /// RTLs identified on the host
  RTLsTy RTLs;

  /// Executable images and information extracted from the input images passed
  /// to the runtime.
  std::list<std::pair<__tgt_device_image, __tgt_image_info>> Images;

  /// Devices associated with RTLs
  std::vector<std::unique_ptr<DeviceTy>> Devices;
  std::mutex RTLsMtx; ///< For RTLs and Devices

  /// Translation table retrieved from the binary
  HostEntriesBeginToTransTableTy HostEntriesBeginToTransTable;
  std::mutex TrlTblMtx; ///< For Translation Table
  /// Host offload entries in order of image registration
  std::vector<__tgt_offload_entry *> HostEntriesBeginRegistrationOrder;

  /// Map from ptrs on the host to an entry in the Translation Table
  HostPtrToTableMapTy HostPtrToTableMap;
  std::mutex TblMapMtx; ///< For HostPtrToTableMap

  // Store target policy (disabled, mandatory, default)
  kmp_target_offload_kind_t TargetOffloadPolicy = tgt_default;
  std::mutex TargetOffloadMtx; ///< For TargetOffloadPolicy

  /// Flag to indicate if we use events to ensure the atomicity of
  /// map clauses or not. Can be modified with an environment variable.
  const bool UseEventsForAtomicTransfers;
};
508 
/// Pointer to the plugin manager; only declared here, the definition lives in
/// another translation unit.
// NOTE(review): who creates and destroys the object is not visible in this
// header -- confirm before relying on its lifetime.
extern PluginManager *PM;
510 
511 #endif
512