1 //===--- amdgpu/impl/impl.cpp ------------------------------------- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 #include "rt.h"
9 #include <memory>
10
11 /*
12 * Data
13 */
14
15 // host pointer (either src or dest) must be locked via hsa_amd_memory_lock
invoke_hsa_copy(hsa_signal_t signal,void * dest,hsa_agent_t agent,const void * src,size_t size)16 static hsa_status_t invoke_hsa_copy(hsa_signal_t signal, void *dest,
17 hsa_agent_t agent, const void *src,
18 size_t size) {
19 const hsa_signal_value_t init = 1;
20 const hsa_signal_value_t success = 0;
21 hsa_signal_store_screlease(signal, init);
22
23 hsa_status_t err = hsa_amd_memory_async_copy(dest, agent, src, agent, size, 0,
24 nullptr, signal);
25 if (err != HSA_STATUS_SUCCESS)
26 return err;
27
28 // async_copy reports success by decrementing and failure by setting to < 0
29 hsa_signal_value_t got = init;
30 while (got == init)
31 got = hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_NE, init,
32 UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
33
34 if (got != success)
35 return HSA_STATUS_ERROR;
36
37 return err;
38 }
39
40 struct implFreePtrDeletor {
operator ()implFreePtrDeletor41 void operator()(void *p) {
42 core::Runtime::Memfree(p); // ignore failure to free
43 }
44 };
45
// Selects which side of a copy is the locked host buffer in
// locking_async_memcpy.
enum CopyDirection {
  H2D, // host to device: the locked host pointer is the copy source
  D2H  // device to host: the locked host pointer is the copy destination
};
47
locking_async_memcpy(enum CopyDirection direction,hsa_signal_t signal,void * dest,hsa_agent_t agent,void * src,void * lockingPtr,size_t size)48 static hsa_status_t locking_async_memcpy(enum CopyDirection direction,
49 hsa_signal_t signal, void *dest,
50 hsa_agent_t agent, void *src,
51 void *lockingPtr, size_t size) {
52 hsa_status_t err;
53
54 void *lockedPtr = nullptr;
55 err = hsa_amd_memory_lock(lockingPtr, size, nullptr, 0, (void **)&lockedPtr);
56 if (err != HSA_STATUS_SUCCESS)
57 return err;
58
59 switch (direction) {
60 case H2D:
61 err = invoke_hsa_copy(signal, dest, agent, lockedPtr, size);
62 break;
63 case D2H:
64 err = invoke_hsa_copy(signal, lockedPtr, agent, src, size);
65 break;
66 default:
67 err = HSA_STATUS_ERROR; // fall into unlock before returning
68 }
69
70 if (err != HSA_STATUS_SUCCESS) {
71 // do not leak locked host pointers, but discard potential error message
72 hsa_amd_memory_unlock(lockingPtr);
73 return err;
74 }
75
76 err = hsa_amd_memory_unlock(lockingPtr);
77 if (err != HSA_STATUS_SUCCESS)
78 return err;
79
80 return HSA_STATUS_SUCCESS;
81 }
82
impl_memcpy_h2d(hsa_signal_t signal,void * deviceDest,void * hostSrc,size_t size,hsa_agent_t device_agent,hsa_amd_memory_pool_t MemoryPool)83 hsa_status_t impl_memcpy_h2d(hsa_signal_t signal, void *deviceDest,
84 void *hostSrc, size_t size,
85 hsa_agent_t device_agent,
86 hsa_amd_memory_pool_t MemoryPool) {
87 hsa_status_t err;
88
89 err = locking_async_memcpy(CopyDirection::H2D, signal, deviceDest,
90 device_agent, hostSrc, hostSrc, size);
91
92 if (err == HSA_STATUS_SUCCESS)
93 return err;
94
95 // async memcpy sometimes fails in situations where
96 // allocate + copy succeeds. Looks like it might be related to
97 // locking part of a read only segment. Fall back for now.
98 void *tempHostPtr;
99 hsa_status_t ret = core::Runtime::HostMalloc(&tempHostPtr, size, MemoryPool);
100 if (ret != HSA_STATUS_SUCCESS) {
101 DP("HostMalloc: Unable to alloc %zu bytes for temp scratch\n", size);
102 return ret;
103 }
104 std::unique_ptr<void, implFreePtrDeletor> del(tempHostPtr);
105 memcpy(tempHostPtr, hostSrc, size);
106
107 return locking_async_memcpy(CopyDirection::H2D, signal, deviceDest,
108 device_agent, tempHostPtr, tempHostPtr, size);
109 }
110
impl_memcpy_d2h(hsa_signal_t signal,void * hostDest,void * deviceSrc,size_t size,hsa_agent_t deviceAgent,hsa_amd_memory_pool_t MemoryPool)111 hsa_status_t impl_memcpy_d2h(hsa_signal_t signal, void *hostDest,
112 void *deviceSrc, size_t size,
113 hsa_agent_t deviceAgent,
114 hsa_amd_memory_pool_t MemoryPool) {
115 hsa_status_t err;
116
117 // device has always visibility over both pointers, so use that
118 err = locking_async_memcpy(CopyDirection::D2H, signal, hostDest, deviceAgent,
119 deviceSrc, hostDest, size);
120
121 if (err == HSA_STATUS_SUCCESS)
122 return err;
123
124 // hsa_memory_copy sometimes fails in situations where
125 // allocate + copy succeeds. Looks like it might be related to
126 // locking part of a read only segment. Fall back for now.
127 void *tempHostPtr;
128 hsa_status_t ret = core::Runtime::HostMalloc(&tempHostPtr, size, MemoryPool);
129 if (ret != HSA_STATUS_SUCCESS) {
130 DP("HostMalloc: Unable to alloc %zu bytes for temp scratch\n", size);
131 return ret;
132 }
133 std::unique_ptr<void, implFreePtrDeletor> del(tempHostPtr);
134
135 err = locking_async_memcpy(CopyDirection::D2H, signal, tempHostPtr,
136 deviceAgent, deviceSrc, tempHostPtr, size);
137 if (err != HSA_STATUS_SUCCESS)
138 return HSA_STATUS_ERROR;
139
140 memcpy(hostDest, tempHostPtr, size);
141 return HSA_STATUS_SUCCESS;
142 }
143