168ab93f4SJon Chesterfield //===--- amdgpu/impl/impl.cpp ------------------------------------- C++ -*-===//
268ab93f4SJon Chesterfield //
368ab93f4SJon Chesterfield // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
468ab93f4SJon Chesterfield // See https://llvm.org/LICENSE.txt for license information.
568ab93f4SJon Chesterfield // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
668ab93f4SJon Chesterfield //
768ab93f4SJon Chesterfield //===----------------------------------------------------------------------===//
868ab93f4SJon Chesterfield #include "rt.h"
968ab93f4SJon Chesterfield #include <memory>
1068ab93f4SJon Chesterfield 
1168ab93f4SJon Chesterfield /*
1268ab93f4SJon Chesterfield  * Data
1368ab93f4SJon Chesterfield  */
1468ab93f4SJon Chesterfield 
15*cc8dc5e2SCarlo Bertolli // host pointer (either src or dest) must be locked via hsa_amd_memory_lock
invoke_hsa_copy(hsa_signal_t signal,void * dest,hsa_agent_t agent,const void * src,size_t size)16*cc8dc5e2SCarlo Bertolli static hsa_status_t invoke_hsa_copy(hsa_signal_t signal, void *dest,
17*cc8dc5e2SCarlo Bertolli                                     hsa_agent_t agent, const void *src,
18*cc8dc5e2SCarlo Bertolli                                     size_t size) {
1968ab93f4SJon Chesterfield   const hsa_signal_value_t init = 1;
2068ab93f4SJon Chesterfield   const hsa_signal_value_t success = 0;
21*cc8dc5e2SCarlo Bertolli   hsa_signal_store_screlease(signal, init);
2268ab93f4SJon Chesterfield 
23*cc8dc5e2SCarlo Bertolli   hsa_status_t err = hsa_amd_memory_async_copy(dest, agent, src, agent, size, 0,
24*cc8dc5e2SCarlo Bertolli                                                nullptr, signal);
25*cc8dc5e2SCarlo Bertolli   if (err != HSA_STATUS_SUCCESS)
2668ab93f4SJon Chesterfield     return err;
2768ab93f4SJon Chesterfield 
2868ab93f4SJon Chesterfield   // async_copy reports success by decrementing and failure by setting to < 0
2968ab93f4SJon Chesterfield   hsa_signal_value_t got = init;
30*cc8dc5e2SCarlo Bertolli   while (got == init)
31*cc8dc5e2SCarlo Bertolli     got = hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_NE, init,
323153bdd5SJon Chesterfield                                     UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
3368ab93f4SJon Chesterfield 
34*cc8dc5e2SCarlo Bertolli   if (got != success)
3568ab93f4SJon Chesterfield     return HSA_STATUS_ERROR;
3668ab93f4SJon Chesterfield 
3768ab93f4SJon Chesterfield   return err;
3868ab93f4SJon Chesterfield }
3968ab93f4SJon Chesterfield 
403d853429SJon Chesterfield struct implFreePtrDeletor {
operator ()implFreePtrDeletor4168ab93f4SJon Chesterfield   void operator()(void *p) {
4268ab93f4SJon Chesterfield     core::Runtime::Memfree(p); // ignore failure to free
4368ab93f4SJon Chesterfield   }
4468ab93f4SJon Chesterfield };
4568ab93f4SJon Chesterfield 
// Direction of a copy performed by locking_async_memcpy.
enum CopyDirection {
  H2D, // host source, device destination
  D2H  // device source, host destination
};
4768ab93f4SJon Chesterfield 
locking_async_memcpy(enum CopyDirection direction,hsa_signal_t signal,void * dest,hsa_agent_t agent,void * src,void * lockingPtr,size_t size)48*cc8dc5e2SCarlo Bertolli static hsa_status_t locking_async_memcpy(enum CopyDirection direction,
49*cc8dc5e2SCarlo Bertolli                                          hsa_signal_t signal, void *dest,
50*cc8dc5e2SCarlo Bertolli                                          hsa_agent_t agent, void *src,
51*cc8dc5e2SCarlo Bertolli                                          void *lockingPtr, size_t size) {
52*cc8dc5e2SCarlo Bertolli   hsa_status_t err;
53*cc8dc5e2SCarlo Bertolli 
54*cc8dc5e2SCarlo Bertolli   void *lockedPtr = nullptr;
55*cc8dc5e2SCarlo Bertolli   err = hsa_amd_memory_lock(lockingPtr, size, nullptr, 0, (void **)&lockedPtr);
56*cc8dc5e2SCarlo Bertolli   if (err != HSA_STATUS_SUCCESS)
57*cc8dc5e2SCarlo Bertolli     return err;
58*cc8dc5e2SCarlo Bertolli 
59*cc8dc5e2SCarlo Bertolli   switch (direction) {
60*cc8dc5e2SCarlo Bertolli   case H2D:
61*cc8dc5e2SCarlo Bertolli     err = invoke_hsa_copy(signal, dest, agent, lockedPtr, size);
62*cc8dc5e2SCarlo Bertolli     break;
63*cc8dc5e2SCarlo Bertolli   case D2H:
64*cc8dc5e2SCarlo Bertolli     err = invoke_hsa_copy(signal, lockedPtr, agent, src, size);
65*cc8dc5e2SCarlo Bertolli     break;
66*cc8dc5e2SCarlo Bertolli   default:
67*cc8dc5e2SCarlo Bertolli     err = HSA_STATUS_ERROR; // fall into unlock before returning
68*cc8dc5e2SCarlo Bertolli   }
69*cc8dc5e2SCarlo Bertolli 
70*cc8dc5e2SCarlo Bertolli   if (err != HSA_STATUS_SUCCESS) {
71*cc8dc5e2SCarlo Bertolli     // do not leak locked host pointers, but discard potential error message
72*cc8dc5e2SCarlo Bertolli     hsa_amd_memory_unlock(lockingPtr);
73*cc8dc5e2SCarlo Bertolli     return err;
74*cc8dc5e2SCarlo Bertolli   }
75*cc8dc5e2SCarlo Bertolli 
76*cc8dc5e2SCarlo Bertolli   err = hsa_amd_memory_unlock(lockingPtr);
77*cc8dc5e2SCarlo Bertolli   if (err != HSA_STATUS_SUCCESS)
78*cc8dc5e2SCarlo Bertolli     return err;
79*cc8dc5e2SCarlo Bertolli 
8068ab93f4SJon Chesterfield   return HSA_STATUS_SUCCESS;
8168ab93f4SJon Chesterfield }
8268ab93f4SJon Chesterfield 
impl_memcpy_h2d(hsa_signal_t signal,void * deviceDest,void * hostSrc,size_t size,hsa_agent_t device_agent,hsa_amd_memory_pool_t MemoryPool)83*cc8dc5e2SCarlo Bertolli hsa_status_t impl_memcpy_h2d(hsa_signal_t signal, void *deviceDest,
84*cc8dc5e2SCarlo Bertolli                              void *hostSrc, size_t size,
85*cc8dc5e2SCarlo Bertolli                              hsa_agent_t device_agent,
86*cc8dc5e2SCarlo Bertolli                              hsa_amd_memory_pool_t MemoryPool) {
87*cc8dc5e2SCarlo Bertolli   hsa_status_t err;
88*cc8dc5e2SCarlo Bertolli 
89*cc8dc5e2SCarlo Bertolli   err = locking_async_memcpy(CopyDirection::H2D, signal, deviceDest,
90*cc8dc5e2SCarlo Bertolli                              device_agent, hostSrc, hostSrc, size);
91*cc8dc5e2SCarlo Bertolli 
92*cc8dc5e2SCarlo Bertolli   if (err == HSA_STATUS_SUCCESS)
93*cc8dc5e2SCarlo Bertolli     return err;
94*cc8dc5e2SCarlo Bertolli 
95*cc8dc5e2SCarlo Bertolli   // async memcpy sometimes fails in situations where
96*cc8dc5e2SCarlo Bertolli   // allocate + copy succeeds. Looks like it might be related to
97*cc8dc5e2SCarlo Bertolli   // locking part of a read only segment. Fall back for now.
9868ab93f4SJon Chesterfield   void *tempHostPtr;
9968ab93f4SJon Chesterfield   hsa_status_t ret = core::Runtime::HostMalloc(&tempHostPtr, size, MemoryPool);
10068ab93f4SJon Chesterfield   if (ret != HSA_STATUS_SUCCESS) {
1013153bdd5SJon Chesterfield     DP("HostMalloc: Unable to alloc %zu bytes for temp scratch\n", size);
10268ab93f4SJon Chesterfield     return ret;
10368ab93f4SJon Chesterfield   }
1043d853429SJon Chesterfield   std::unique_ptr<void, implFreePtrDeletor> del(tempHostPtr);
10568ab93f4SJon Chesterfield   memcpy(tempHostPtr, hostSrc, size);
10668ab93f4SJon Chesterfield 
107*cc8dc5e2SCarlo Bertolli   return locking_async_memcpy(CopyDirection::H2D, signal, deviceDest,
108*cc8dc5e2SCarlo Bertolli                               device_agent, tempHostPtr, tempHostPtr, size);
10968ab93f4SJon Chesterfield }
11068ab93f4SJon Chesterfield 
impl_memcpy_d2h(hsa_signal_t signal,void * hostDest,void * deviceSrc,size_t size,hsa_agent_t deviceAgent,hsa_amd_memory_pool_t MemoryPool)111*cc8dc5e2SCarlo Bertolli hsa_status_t impl_memcpy_d2h(hsa_signal_t signal, void *hostDest,
112*cc8dc5e2SCarlo Bertolli                              void *deviceSrc, size_t size,
113*cc8dc5e2SCarlo Bertolli                              hsa_agent_t deviceAgent,
11468ab93f4SJon Chesterfield                              hsa_amd_memory_pool_t MemoryPool) {
115*cc8dc5e2SCarlo Bertolli   hsa_status_t err;
116*cc8dc5e2SCarlo Bertolli 
117*cc8dc5e2SCarlo Bertolli   // device has always visibility over both pointers, so use that
118*cc8dc5e2SCarlo Bertolli   err = locking_async_memcpy(CopyDirection::D2H, signal, hostDest, deviceAgent,
119*cc8dc5e2SCarlo Bertolli                              deviceSrc, hostDest, size);
120*cc8dc5e2SCarlo Bertolli 
121*cc8dc5e2SCarlo Bertolli   if (err == HSA_STATUS_SUCCESS)
122*cc8dc5e2SCarlo Bertolli     return err;
12368ab93f4SJon Chesterfield 
12468ab93f4SJon Chesterfield   // hsa_memory_copy sometimes fails in situations where
12568ab93f4SJon Chesterfield   // allocate + copy succeeds. Looks like it might be related to
12668ab93f4SJon Chesterfield   // locking part of a read only segment. Fall back for now.
12768ab93f4SJon Chesterfield   void *tempHostPtr;
12868ab93f4SJon Chesterfield   hsa_status_t ret = core::Runtime::HostMalloc(&tempHostPtr, size, MemoryPool);
12968ab93f4SJon Chesterfield   if (ret != HSA_STATUS_SUCCESS) {
1303153bdd5SJon Chesterfield     DP("HostMalloc: Unable to alloc %zu bytes for temp scratch\n", size);
13168ab93f4SJon Chesterfield     return ret;
13268ab93f4SJon Chesterfield   }
1333d853429SJon Chesterfield   std::unique_ptr<void, implFreePtrDeletor> del(tempHostPtr);
13468ab93f4SJon Chesterfield 
135*cc8dc5e2SCarlo Bertolli   err = locking_async_memcpy(CopyDirection::D2H, signal, tempHostPtr,
136*cc8dc5e2SCarlo Bertolli                              deviceAgent, deviceSrc, tempHostPtr, size);
137*cc8dc5e2SCarlo Bertolli   if (err != HSA_STATUS_SUCCESS)
13868ab93f4SJon Chesterfield     return HSA_STATUS_ERROR;
13968ab93f4SJon Chesterfield 
140*cc8dc5e2SCarlo Bertolli   memcpy(hostDest, tempHostPtr, size);
14168ab93f4SJon Chesterfield   return HSA_STATUS_SUCCESS;
14268ab93f4SJon Chesterfield }
143