//===--- amdgpu/impl/impl.cpp ------------------------------------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
868ab93f4SJon Chesterfield #include "rt.h"
968ab93f4SJon Chesterfield #include <memory>
1068ab93f4SJon Chesterfield
1168ab93f4SJon Chesterfield /*
1268ab93f4SJon Chesterfield * Data
1368ab93f4SJon Chesterfield */
1468ab93f4SJon Chesterfield
15*cc8dc5e2SCarlo Bertolli // host pointer (either src or dest) must be locked via hsa_amd_memory_lock
invoke_hsa_copy(hsa_signal_t signal,void * dest,hsa_agent_t agent,const void * src,size_t size)16*cc8dc5e2SCarlo Bertolli static hsa_status_t invoke_hsa_copy(hsa_signal_t signal, void *dest,
17*cc8dc5e2SCarlo Bertolli hsa_agent_t agent, const void *src,
18*cc8dc5e2SCarlo Bertolli size_t size) {
1968ab93f4SJon Chesterfield const hsa_signal_value_t init = 1;
2068ab93f4SJon Chesterfield const hsa_signal_value_t success = 0;
21*cc8dc5e2SCarlo Bertolli hsa_signal_store_screlease(signal, init);
2268ab93f4SJon Chesterfield
23*cc8dc5e2SCarlo Bertolli hsa_status_t err = hsa_amd_memory_async_copy(dest, agent, src, agent, size, 0,
24*cc8dc5e2SCarlo Bertolli nullptr, signal);
25*cc8dc5e2SCarlo Bertolli if (err != HSA_STATUS_SUCCESS)
2668ab93f4SJon Chesterfield return err;
2768ab93f4SJon Chesterfield
2868ab93f4SJon Chesterfield // async_copy reports success by decrementing and failure by setting to < 0
2968ab93f4SJon Chesterfield hsa_signal_value_t got = init;
30*cc8dc5e2SCarlo Bertolli while (got == init)
31*cc8dc5e2SCarlo Bertolli got = hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_NE, init,
323153bdd5SJon Chesterfield UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
3368ab93f4SJon Chesterfield
34*cc8dc5e2SCarlo Bertolli if (got != success)
3568ab93f4SJon Chesterfield return HSA_STATUS_ERROR;
3668ab93f4SJon Chesterfield
3768ab93f4SJon Chesterfield return err;
3868ab93f4SJon Chesterfield }
3968ab93f4SJon Chesterfield
403d853429SJon Chesterfield struct implFreePtrDeletor {
operator ()implFreePtrDeletor4168ab93f4SJon Chesterfield void operator()(void *p) {
4268ab93f4SJon Chesterfield core::Runtime::Memfree(p); // ignore failure to free
4368ab93f4SJon Chesterfield }
4468ab93f4SJon Chesterfield };
4568ab93f4SJon Chesterfield
// Direction of a transfer handled by locking_async_memcpy.
enum CopyDirection {
  H2D, // host memory to device memory
  D2H  // device memory to host memory
};
4768ab93f4SJon Chesterfield
locking_async_memcpy(enum CopyDirection direction,hsa_signal_t signal,void * dest,hsa_agent_t agent,void * src,void * lockingPtr,size_t size)48*cc8dc5e2SCarlo Bertolli static hsa_status_t locking_async_memcpy(enum CopyDirection direction,
49*cc8dc5e2SCarlo Bertolli hsa_signal_t signal, void *dest,
50*cc8dc5e2SCarlo Bertolli hsa_agent_t agent, void *src,
51*cc8dc5e2SCarlo Bertolli void *lockingPtr, size_t size) {
52*cc8dc5e2SCarlo Bertolli hsa_status_t err;
53*cc8dc5e2SCarlo Bertolli
54*cc8dc5e2SCarlo Bertolli void *lockedPtr = nullptr;
55*cc8dc5e2SCarlo Bertolli err = hsa_amd_memory_lock(lockingPtr, size, nullptr, 0, (void **)&lockedPtr);
56*cc8dc5e2SCarlo Bertolli if (err != HSA_STATUS_SUCCESS)
57*cc8dc5e2SCarlo Bertolli return err;
58*cc8dc5e2SCarlo Bertolli
59*cc8dc5e2SCarlo Bertolli switch (direction) {
60*cc8dc5e2SCarlo Bertolli case H2D:
61*cc8dc5e2SCarlo Bertolli err = invoke_hsa_copy(signal, dest, agent, lockedPtr, size);
62*cc8dc5e2SCarlo Bertolli break;
63*cc8dc5e2SCarlo Bertolli case D2H:
64*cc8dc5e2SCarlo Bertolli err = invoke_hsa_copy(signal, lockedPtr, agent, src, size);
65*cc8dc5e2SCarlo Bertolli break;
66*cc8dc5e2SCarlo Bertolli default:
67*cc8dc5e2SCarlo Bertolli err = HSA_STATUS_ERROR; // fall into unlock before returning
68*cc8dc5e2SCarlo Bertolli }
69*cc8dc5e2SCarlo Bertolli
70*cc8dc5e2SCarlo Bertolli if (err != HSA_STATUS_SUCCESS) {
71*cc8dc5e2SCarlo Bertolli // do not leak locked host pointers, but discard potential error message
72*cc8dc5e2SCarlo Bertolli hsa_amd_memory_unlock(lockingPtr);
73*cc8dc5e2SCarlo Bertolli return err;
74*cc8dc5e2SCarlo Bertolli }
75*cc8dc5e2SCarlo Bertolli
76*cc8dc5e2SCarlo Bertolli err = hsa_amd_memory_unlock(lockingPtr);
77*cc8dc5e2SCarlo Bertolli if (err != HSA_STATUS_SUCCESS)
78*cc8dc5e2SCarlo Bertolli return err;
79*cc8dc5e2SCarlo Bertolli
8068ab93f4SJon Chesterfield return HSA_STATUS_SUCCESS;
8168ab93f4SJon Chesterfield }
8268ab93f4SJon Chesterfield
impl_memcpy_h2d(hsa_signal_t signal,void * deviceDest,void * hostSrc,size_t size,hsa_agent_t device_agent,hsa_amd_memory_pool_t MemoryPool)83*cc8dc5e2SCarlo Bertolli hsa_status_t impl_memcpy_h2d(hsa_signal_t signal, void *deviceDest,
84*cc8dc5e2SCarlo Bertolli void *hostSrc, size_t size,
85*cc8dc5e2SCarlo Bertolli hsa_agent_t device_agent,
86*cc8dc5e2SCarlo Bertolli hsa_amd_memory_pool_t MemoryPool) {
87*cc8dc5e2SCarlo Bertolli hsa_status_t err;
88*cc8dc5e2SCarlo Bertolli
89*cc8dc5e2SCarlo Bertolli err = locking_async_memcpy(CopyDirection::H2D, signal, deviceDest,
90*cc8dc5e2SCarlo Bertolli device_agent, hostSrc, hostSrc, size);
91*cc8dc5e2SCarlo Bertolli
92*cc8dc5e2SCarlo Bertolli if (err == HSA_STATUS_SUCCESS)
93*cc8dc5e2SCarlo Bertolli return err;
94*cc8dc5e2SCarlo Bertolli
95*cc8dc5e2SCarlo Bertolli // async memcpy sometimes fails in situations where
96*cc8dc5e2SCarlo Bertolli // allocate + copy succeeds. Looks like it might be related to
97*cc8dc5e2SCarlo Bertolli // locking part of a read only segment. Fall back for now.
9868ab93f4SJon Chesterfield void *tempHostPtr;
9968ab93f4SJon Chesterfield hsa_status_t ret = core::Runtime::HostMalloc(&tempHostPtr, size, MemoryPool);
10068ab93f4SJon Chesterfield if (ret != HSA_STATUS_SUCCESS) {
1013153bdd5SJon Chesterfield DP("HostMalloc: Unable to alloc %zu bytes for temp scratch\n", size);
10268ab93f4SJon Chesterfield return ret;
10368ab93f4SJon Chesterfield }
1043d853429SJon Chesterfield std::unique_ptr<void, implFreePtrDeletor> del(tempHostPtr);
10568ab93f4SJon Chesterfield memcpy(tempHostPtr, hostSrc, size);
10668ab93f4SJon Chesterfield
107*cc8dc5e2SCarlo Bertolli return locking_async_memcpy(CopyDirection::H2D, signal, deviceDest,
108*cc8dc5e2SCarlo Bertolli device_agent, tempHostPtr, tempHostPtr, size);
10968ab93f4SJon Chesterfield }
11068ab93f4SJon Chesterfield
impl_memcpy_d2h(hsa_signal_t signal,void * hostDest,void * deviceSrc,size_t size,hsa_agent_t deviceAgent,hsa_amd_memory_pool_t MemoryPool)111*cc8dc5e2SCarlo Bertolli hsa_status_t impl_memcpy_d2h(hsa_signal_t signal, void *hostDest,
112*cc8dc5e2SCarlo Bertolli void *deviceSrc, size_t size,
113*cc8dc5e2SCarlo Bertolli hsa_agent_t deviceAgent,
11468ab93f4SJon Chesterfield hsa_amd_memory_pool_t MemoryPool) {
115*cc8dc5e2SCarlo Bertolli hsa_status_t err;
116*cc8dc5e2SCarlo Bertolli
117*cc8dc5e2SCarlo Bertolli // device has always visibility over both pointers, so use that
118*cc8dc5e2SCarlo Bertolli err = locking_async_memcpy(CopyDirection::D2H, signal, hostDest, deviceAgent,
119*cc8dc5e2SCarlo Bertolli deviceSrc, hostDest, size);
120*cc8dc5e2SCarlo Bertolli
121*cc8dc5e2SCarlo Bertolli if (err == HSA_STATUS_SUCCESS)
122*cc8dc5e2SCarlo Bertolli return err;
12368ab93f4SJon Chesterfield
12468ab93f4SJon Chesterfield // hsa_memory_copy sometimes fails in situations where
12568ab93f4SJon Chesterfield // allocate + copy succeeds. Looks like it might be related to
12668ab93f4SJon Chesterfield // locking part of a read only segment. Fall back for now.
12768ab93f4SJon Chesterfield void *tempHostPtr;
12868ab93f4SJon Chesterfield hsa_status_t ret = core::Runtime::HostMalloc(&tempHostPtr, size, MemoryPool);
12968ab93f4SJon Chesterfield if (ret != HSA_STATUS_SUCCESS) {
1303153bdd5SJon Chesterfield DP("HostMalloc: Unable to alloc %zu bytes for temp scratch\n", size);
13168ab93f4SJon Chesterfield return ret;
13268ab93f4SJon Chesterfield }
1333d853429SJon Chesterfield std::unique_ptr<void, implFreePtrDeletor> del(tempHostPtr);
13468ab93f4SJon Chesterfield
135*cc8dc5e2SCarlo Bertolli err = locking_async_memcpy(CopyDirection::D2H, signal, tempHostPtr,
136*cc8dc5e2SCarlo Bertolli deviceAgent, deviceSrc, tempHostPtr, size);
137*cc8dc5e2SCarlo Bertolli if (err != HSA_STATUS_SUCCESS)
13868ab93f4SJon Chesterfield return HSA_STATUS_ERROR;
13968ab93f4SJon Chesterfield
140*cc8dc5e2SCarlo Bertolli memcpy(hostDest, tempHostPtr, size);
14168ab93f4SJon Chesterfield return HSA_STATUS_SUCCESS;
14268ab93f4SJon Chesterfield }
143