1 //===--- amdgpu/impl/impl.cpp ------------------------------------- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 #include "rt.h"
9 #include <memory>
10 
11 /*
12  * Data
13  */
14 
15 // host pointer (either src or dest) must be locked via hsa_amd_memory_lock
16 static hsa_status_t invoke_hsa_copy(hsa_signal_t signal, void *dest,
17                                     hsa_agent_t agent, const void *src,
18                                     size_t size) {
19   const hsa_signal_value_t init = 1;
20   const hsa_signal_value_t success = 0;
21   hsa_signal_store_screlease(signal, init);
22 
23   hsa_status_t err = hsa_amd_memory_async_copy(dest, agent, src, agent, size, 0,
24                                                nullptr, signal);
25   if (err != HSA_STATUS_SUCCESS)
26     return err;
27 
28   // async_copy reports success by decrementing and failure by setting to < 0
29   hsa_signal_value_t got = init;
30   while (got == init)
31     got = hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_NE, init,
32                                     UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
33 
34   if (got != success)
35     return HSA_STATUS_ERROR;
36 
37   return err;
38 }
39 
40 struct implFreePtrDeletor {
41   void operator()(void *p) {
42     core::Runtime::Memfree(p); // ignore failure to free
43   }
44 };
45 
// Direction of a locked copy. With H2D the locked pointer is the source of
// the copy; with D2H the locked pointer is the destination.
enum CopyDirection {
  H2D = 0,
  D2H = 1
};
47 
48 static hsa_status_t locking_async_memcpy(enum CopyDirection direction,
49                                          hsa_signal_t signal, void *dest,
50                                          hsa_agent_t agent, void *src,
51                                          void *lockingPtr, size_t size) {
52   hsa_status_t err;
53 
54   void *lockedPtr = nullptr;
55   err = hsa_amd_memory_lock(lockingPtr, size, nullptr, 0, (void **)&lockedPtr);
56   if (err != HSA_STATUS_SUCCESS)
57     return err;
58 
59   switch (direction) {
60   case H2D:
61     err = invoke_hsa_copy(signal, dest, agent, lockedPtr, size);
62     break;
63   case D2H:
64     err = invoke_hsa_copy(signal, lockedPtr, agent, src, size);
65     break;
66   default:
67     err = HSA_STATUS_ERROR; // fall into unlock before returning
68   }
69 
70   if (err != HSA_STATUS_SUCCESS) {
71     // do not leak locked host pointers, but discard potential error message
72     hsa_amd_memory_unlock(lockingPtr);
73     return err;
74   }
75 
76   err = hsa_amd_memory_unlock(lockingPtr);
77   if (err != HSA_STATUS_SUCCESS)
78     return err;
79 
80   return HSA_STATUS_SUCCESS;
81 }
82 
83 hsa_status_t impl_memcpy_h2d(hsa_signal_t signal, void *deviceDest,
84                              void *hostSrc, size_t size,
85                              hsa_agent_t device_agent,
86                              hsa_amd_memory_pool_t MemoryPool) {
87   hsa_status_t err;
88 
89   err = locking_async_memcpy(CopyDirection::H2D, signal, deviceDest,
90                              device_agent, hostSrc, hostSrc, size);
91 
92   if (err == HSA_STATUS_SUCCESS)
93     return err;
94 
95   // async memcpy sometimes fails in situations where
96   // allocate + copy succeeds. Looks like it might be related to
97   // locking part of a read only segment. Fall back for now.
98   void *tempHostPtr;
99   hsa_status_t ret = core::Runtime::HostMalloc(&tempHostPtr, size, MemoryPool);
100   if (ret != HSA_STATUS_SUCCESS) {
101     DP("HostMalloc: Unable to alloc %zu bytes for temp scratch\n", size);
102     return ret;
103   }
104   std::unique_ptr<void, implFreePtrDeletor> del(tempHostPtr);
105   memcpy(tempHostPtr, hostSrc, size);
106 
107   return locking_async_memcpy(CopyDirection::H2D, signal, deviceDest,
108                               device_agent, tempHostPtr, tempHostPtr, size);
109 }
110 
111 hsa_status_t impl_memcpy_d2h(hsa_signal_t signal, void *hostDest,
112                              void *deviceSrc, size_t size,
113                              hsa_agent_t deviceAgent,
114                              hsa_amd_memory_pool_t MemoryPool) {
115   hsa_status_t err;
116 
117   // device has always visibility over both pointers, so use that
118   err = locking_async_memcpy(CopyDirection::D2H, signal, hostDest, deviceAgent,
119                              deviceSrc, hostDest, size);
120 
121   if (err == HSA_STATUS_SUCCESS)
122     return err;
123 
124   // hsa_memory_copy sometimes fails in situations where
125   // allocate + copy succeeds. Looks like it might be related to
126   // locking part of a read only segment. Fall back for now.
127   void *tempHostPtr;
128   hsa_status_t ret = core::Runtime::HostMalloc(&tempHostPtr, size, MemoryPool);
129   if (ret != HSA_STATUS_SUCCESS) {
130     DP("HostMalloc: Unable to alloc %zu bytes for temp scratch\n", size);
131     return ret;
132   }
133   std::unique_ptr<void, implFreePtrDeletor> del(tempHostPtr);
134 
135   err = locking_async_memcpy(CopyDirection::D2H, signal, tempHostPtr,
136                              deviceAgent, deviceSrc, tempHostPtr, size);
137   if (err != HSA_STATUS_SUCCESS)
138     return HSA_STATUS_ERROR;
139 
140   memcpy(hostDest, tempHostPtr, size);
141   return HSA_STATUS_SUCCESS;
142 }
143