1 //===--- amdgpu/src/rtl.cpp --------------------------------------- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // RTL for AMD HSA machines
10 //
11 //===----------------------------------------------------------------------===//
12
13 #include <algorithm>
14 #include <assert.h>
15 #include <cstdio>
16 #include <cstdlib>
17 #include <cstring>
18 #include <functional>
19 #include <libelf.h>
20 #include <list>
21 #include <memory>
22 #include <mutex>
23 #include <shared_mutex>
24 #include <unordered_map>
25 #include <vector>
26
27 #include "impl_runtime.h"
28 #include "interop_hsa.h"
29
30 #include "internal.h"
31 #include "rt.h"
32
33 #include "DeviceEnvironment.h"
34 #include "get_elf_mach_gfx_name.h"
35 #include "omptargetplugin.h"
36 #include "print_tracing.h"
37
38 #include "llvm/ADT/StringMap.h"
39 #include "llvm/ADT/StringRef.h"
40 #include "llvm/Frontend/OpenMP/OMPConstants.h"
41 #include "llvm/Frontend/OpenMP/OMPGridValues.h"
42
43 using namespace llvm;
44
45 // hostrpc interface. FIXME: consider moving to its own include. These are
46 // statically linked into the amdgpu plugin, if present, from
47 // hostrpc_services.a, linked as --whole-archive to override the weak symbols
48 // used as a fallback for toolchains that do not yet have a hostrpc library.
49 extern "C" {
50 uint64_t hostrpc_assign_buffer(hsa_agent_t Agent, hsa_queue_t *ThisQ,
51 uint32_t DeviceId);
52 hsa_status_t hostrpc_init();
53 hsa_status_t hostrpc_terminate();
54
55 __attribute__((weak)) hsa_status_t hostrpc_init() { return HSA_STATUS_SUCCESS; }
56 __attribute__((weak)) hsa_status_t hostrpc_terminate() {
57 return HSA_STATUS_SUCCESS;
58 }
59 __attribute__((weak)) uint64_t hostrpc_assign_buffer(hsa_agent_t, hsa_queue_t *,
60 uint32_t DeviceId) {
61 DP("Warning: Attempting to assign hostrpc to device %u, but hostrpc library "
62 "missing\n",
63 DeviceId);
64 return 0;
65 }
66 }
67
68 // Heuristic parameters used for kernel launch
69 // Number of teams per CU to allow scheduling flexibility
70 static const unsigned DefaultTeamsPerCU = 4;
71
72 int print_kernel_trace;
73
74 #ifdef OMPTARGET_DEBUG
75 #define check(msg, status) \
76 if (status != HSA_STATUS_SUCCESS) { \
77 DP(#msg " failed\n"); \
78 } else { \
79 DP(#msg " succeeded\n"); \
80 }
81 #else
82 #define check(msg, status) \
83 {}
84 #endif
85
86 #include "elf_common.h"
87
88 namespace hsa {
89 template <typename C> hsa_status_t iterate_agents(C Cb) {
90 auto L = [](hsa_agent_t Agent, void *Data) -> hsa_status_t {
91 C *Unwrapped = static_cast<C *>(Data);
92 return (*Unwrapped)(Agent);
93 };
94 return hsa_iterate_agents(L, static_cast<void *>(&Cb));
95 }
96
97 template <typename C>
98 hsa_status_t amd_agent_iterate_memory_pools(hsa_agent_t Agent, C Cb) {
99 auto L = [](hsa_amd_memory_pool_t MemoryPool, void *Data) -> hsa_status_t {
100 C *Unwrapped = static_cast<C *>(Data);
101 return (*Unwrapped)(MemoryPool);
102 };
103
104 return hsa_amd_agent_iterate_memory_pools(Agent, L, static_cast<void *>(&Cb));
105 }
106
107 } // namespace hsa
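// Illustrative note on the wrappers above: they let any callable that takes
// the HSA handle type and returns hsa_status_t be used directly, without
// hand-writing the C callback + void* plumbing, e.g. (hypothetical lambda):
//   hsa::iterate_agents(
//       [&](hsa_agent_t Agent) -> hsa_status_t { return HSA_STATUS_SUCCESS; });
// findAgents() further below uses exactly this pattern.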
108
109 /// Keep entries table per device
110 struct FuncOrGblEntryTy {
111 __tgt_target_table Table;
112 std::vector<__tgt_offload_entry> Entries;
113 };
114
115 struct KernelArgPool {
116 private:
117 static pthread_mutex_t Mutex;
118
119 public:
120 uint32_t KernargSegmentSize;
121 void *KernargRegion = nullptr;
122 std::queue<int> FreeKernargSegments;
123
124 uint32_t kernargSizeIncludingImplicit() {
125 return KernargSegmentSize + sizeof(impl_implicit_args_t);
126 }
127
128 ~KernelArgPool() {
129 if (KernargRegion) {
130 auto R = hsa_amd_memory_pool_free(KernargRegion);
131 if (R != HSA_STATUS_SUCCESS) {
132 DP("hsa_amd_memory_pool_free failed: %s\n", get_error_string(R));
133 }
134 }
135 }
136
137 // Can't really copy or move a mutex
138 KernelArgPool() = default;
139 KernelArgPool(const KernelArgPool &) = delete;
140 KernelArgPool(KernelArgPool &&) = delete;
141
142 KernelArgPool(uint32_t KernargSegmentSize, hsa_amd_memory_pool_t &MemoryPool)
143 : KernargSegmentSize(KernargSegmentSize) {
144
145 // impl uses one pool per kernel for all GPUs, with a fixed upper size;
146 // preserve that exact scheme here, including the queue<int>.
147
148 hsa_status_t Err = hsa_amd_memory_pool_allocate(
149 MemoryPool, kernargSizeIncludingImplicit() * MAX_NUM_KERNELS, 0,
150 &KernargRegion);
151
152 if (Err != HSA_STATUS_SUCCESS) {
153 DP("hsa_amd_memory_pool_allocate failed: %s\n", get_error_string(Err));
154 KernargRegion = nullptr; // paranoid
155 return;
156 }
157
158 Err = core::allow_access_to_all_gpu_agents(KernargRegion);
159 if (Err != HSA_STATUS_SUCCESS) {
160 DP("hsa allow_access_to_all_gpu_agents failed: %s\n",
161 get_error_string(Err));
162 auto R = hsa_amd_memory_pool_free(KernargRegion);
163 if (R != HSA_STATUS_SUCCESS) {
164 // if free failed, can't do anything more to resolve it
165 DP("hsa memory poll free failed: %s\n", get_error_string(Err));
166 }
167 KernargRegion = nullptr;
168 return;
169 }
170
171 for (int I = 0; I < MAX_NUM_KERNELS; I++) {
172 FreeKernargSegments.push(I);
173 }
174 }
175
176 void *allocate(uint64_t ArgNum) {
177 assert((ArgNum * sizeof(void *)) == KernargSegmentSize);
178 Lock L(&Mutex);
179 void *Res = nullptr;
180 if (!FreeKernargSegments.empty()) {
181
182 int FreeIdx = FreeKernargSegments.front();
183 Res = static_cast<void *>(static_cast<char *>(KernargRegion) +
184 (FreeIdx * kernargSizeIncludingImplicit()));
185 assert(FreeIdx == pointerToIndex(Res));
186 FreeKernargSegments.pop();
187 }
188 return Res;
189 }
190
191 void deallocate(void *Ptr) {
192 Lock L(&Mutex);
193 int Idx = pointerToIndex(Ptr);
194 FreeKernargSegments.push(Idx);
195 }
196
197 private:
198 int pointerToIndex(void *Ptr) {
199 ptrdiff_t Bytes =
200 static_cast<char *>(Ptr) - static_cast<char *>(KernargRegion);
201 assert(Bytes >= 0);
202 assert(Bytes % kernargSizeIncludingImplicit() == 0);
203 return Bytes / kernargSizeIncludingImplicit();
204 }
205 struct Lock {
206 Lock(pthread_mutex_t *M) : M(M) { pthread_mutex_lock(M); }
207 ~Lock() { pthread_mutex_unlock(M); }
208 pthread_mutex_t *M;
209 };
210 };
211 pthread_mutex_t KernelArgPool::Mutex = PTHREAD_MUTEX_INITIALIZER;
212
213 std::unordered_map<std::string /*kernel*/, std::unique_ptr<KernelArgPool>>
214 KernelArgPoolMap;
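// Note on the pool layout above: the region is carved into MAX_NUM_KERNELS
// fixed-size slots of kernargSizeIncludingImplicit() bytes each, so slot I
// starts at
//   KernargRegion + I * (KernargSegmentSize + sizeof(impl_implicit_args_t)).
// For example (sizes illustrative only): with 4 pointer args
// (KernargSegmentSize == 32) and a 24-byte implicit block, slot 2 would begin
// at byte offset 112. pointerToIndex() simply inverts this arithmetic.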
215
216 /// Use a single entity to encode a kernel and a set of flags
217 struct KernelTy {
218 llvm::omp::OMPTgtExecModeFlags ExecutionMode;
219 int16_t ConstWGSize;
220 int32_t DeviceId;
221 void *CallStackAddr = nullptr;
222 const char *Name;
223
224 KernelTy(llvm::omp::OMPTgtExecModeFlags ExecutionMode, int16_t ConstWgSize,
225 int32_t DeviceId, void *CallStackAddr, const char *Name,
226 uint32_t KernargSegmentSize,
227 hsa_amd_memory_pool_t &KernArgMemoryPool)
228 : ExecutionMode(ExecutionMode), ConstWGSize(ConstWgSize),
229 DeviceId(DeviceId), CallStackAddr(CallStackAddr), Name(Name) {
230 DP("Construct kernelinfo: ExecMode %d\n", ExecutionMode);
231
232 std::string N(Name);
233 if (KernelArgPoolMap.find(N) == KernelArgPoolMap.end()) {
234 KernelArgPoolMap.insert(
235 std::make_pair(N, std::unique_ptr<KernelArgPool>(new KernelArgPool(
236 KernargSegmentSize, KernArgMemoryPool))));
237 }
238 }
239 };
240
241 /// List that contains all the kernels.
242 /// FIXME: we may need this to be per device and per library.
243 std::list<KernelTy> KernelsList;
244
245 template <typename Callback> static hsa_status_t findAgents(Callback CB) {
246
247 hsa_status_t Err =
248 hsa::iterate_agents([&](hsa_agent_t Agent) -> hsa_status_t {
249 hsa_device_type_t DeviceType;
250 // get_info fails iff HSA runtime not yet initialized
251 hsa_status_t Err =
252 hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &DeviceType);
253
254 if (Err != HSA_STATUS_SUCCESS) {
255 if (print_kernel_trace > 0)
256 DP("rtl.cpp: err %s\n", get_error_string(Err));
257
258 return Err;
259 }
260
261 CB(DeviceType, Agent);
262 return HSA_STATUS_SUCCESS;
263 });
264
265 // iterate_agents fails iff HSA runtime not yet initialized
266 if (print_kernel_trace > 0 && Err != HSA_STATUS_SUCCESS) {
267 DP("rtl.cpp: err %s\n", get_error_string(Err));
268 }
269
270 return Err;
271 }
272
273 static void callbackQueue(hsa_status_t Status, hsa_queue_t *Source,
274 void *Data) {
275 if (Status != HSA_STATUS_SUCCESS) {
276 const char *StatusString;
277 if (hsa_status_string(Status, &StatusString) != HSA_STATUS_SUCCESS) {
278 StatusString = "unavailable";
279 }
280 DP("[%s:%d] GPU error in queue %p %d (%s)\n", __FILE__, __LINE__, Source,
281 Status, StatusString);
282 abort();
283 }
284 }
285
286 namespace core {
287 namespace {
288
289 bool checkResult(hsa_status_t Err, const char *ErrMsg) {
290 if (Err == HSA_STATUS_SUCCESS)
291 return true;
292
293 REPORT("%s", ErrMsg);
294 REPORT("%s", get_error_string(Err));
295 return false;
296 }
297
298 void packetStoreRelease(uint32_t *Packet, uint16_t Header, uint16_t Rest) {
299 __atomic_store_n(Packet, Header | (Rest << 16), __ATOMIC_RELEASE);
300 }
301
302 uint16_t createHeader() {
303 uint16_t Header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
304 Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
305 Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
306 return Header;
307 }
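// Note on the two helpers above: createHeader() builds a kernel-dispatch
// header with system-scope acquire/release fences, and packetStoreRelease()
// publishes Header | (Rest << 16) as a single 32-bit atomic release store, so
// the packet processor can never observe a half-written header word.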
308
309 hsa_status_t isValidMemoryPool(hsa_amd_memory_pool_t MemoryPool) {
310 bool AllocAllowed = false;
311 hsa_status_t Err = hsa_amd_memory_pool_get_info(
312 MemoryPool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED,
313 &AllocAllowed);
314 if (Err != HSA_STATUS_SUCCESS) {
315 DP("Alloc allowed in memory pool check failed: %s\n",
316 get_error_string(Err));
317 return Err;
318 }
319
320 size_t Size = 0;
321 Err = hsa_amd_memory_pool_get_info(MemoryPool, HSA_AMD_MEMORY_POOL_INFO_SIZE,
322 &Size);
323 if (Err != HSA_STATUS_SUCCESS) {
324 DP("Get memory pool size failed: %s\n", get_error_string(Err));
325 return Err;
326 }
327
328 return (AllocAllowed && Size > 0) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR;
329 }
330
331 hsa_status_t addMemoryPool(hsa_amd_memory_pool_t MemoryPool, void *Data) {
332 std::vector<hsa_amd_memory_pool_t> *Result =
333 static_cast<std::vector<hsa_amd_memory_pool_t> *>(Data);
334
335 hsa_status_t Err;
336 if ((Err = isValidMemoryPool(MemoryPool)) != HSA_STATUS_SUCCESS) {
337 return Err;
338 }
339
340 Result->push_back(MemoryPool);
341 return HSA_STATUS_SUCCESS;
342 }
343
344 } // namespace
345 } // namespace core
346
347 struct EnvironmentVariables {
348 int NumTeams;
349 int TeamLimit;
350 int TeamThreadLimit;
351 int MaxTeamsDefault;
352 int DynamicMemSize;
353 };
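// These fields are populated in the RTLDeviceInfoTy constructor below from
// OMP_NUM_TEAMS, OMP_TEAM_LIMIT, OMP_TEAMS_THREAD_LIMIT, OMP_MAX_TEAMS_DEFAULT
// and LIBOMPTARGET_SHARED_MEMORY_SIZE; an unset variable yields -1
// (0 for DynamicMemSize), which the launch heuristics treat as "not specified".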
354
355 template <uint32_t wavesize>
356 static constexpr const llvm::omp::GV &getGridValue() {
357 return llvm::omp::getAMDGPUGridValues<wavesize>();
358 }
359
360 struct HSALifetime {
361 // Wrapper around HSA, used to ensure it is constructed before other types
362 // and destructed after them, so those types can use RAII for cleanup
363 // without risking running outside of the lifetime of HSA.
364 const hsa_status_t S;
365
366 bool HSAInitSuccess() { return S == HSA_STATUS_SUCCESS; }
367 HSALifetime() : S(hsa_init()) {}
368
369 ~HSALifetime() {
370 if (S == HSA_STATUS_SUCCESS) {
371 hsa_status_t Err = hsa_shut_down();
372 if (Err != HSA_STATUS_SUCCESS) {
373 // Can't call into HSA to get a string from the integer
374 DP("Shutting down HSA failed: %d\n", Err);
375 }
376 }
377 }
378 };
379
380 // Handles scheduling of multiple HSA queues per device across
381 // multiple threads (one scheduler per device).
382 class HSAQueueScheduler {
383 public:
384 HSAQueueScheduler() : Current(0) {}
385
386 HSAQueueScheduler(const HSAQueueScheduler &) = delete;
387
388 HSAQueueScheduler(HSAQueueScheduler &&Q) {
389 Current = Q.Current.load();
390 for (uint8_t I = 0; I < NUM_QUEUES_PER_DEVICE; I++) {
391 HSAQueues[I] = Q.HSAQueues[I];
392 Q.HSAQueues[I] = nullptr;
393 }
394 }
395
396 // \return false if any HSA queue creation fails
397 bool createQueues(hsa_agent_t HSAAgent, uint32_t QueueSize) {
398 for (uint8_t I = 0; I < NUM_QUEUES_PER_DEVICE; I++) {
399 hsa_queue_t *Q = nullptr;
400 hsa_status_t Rc =
401 hsa_queue_create(HSAAgent, QueueSize, HSA_QUEUE_TYPE_MULTI,
402 callbackQueue, NULL, UINT32_MAX, UINT32_MAX, &Q);
403 if (Rc != HSA_STATUS_SUCCESS) {
404 DP("Failed to create HSA queue %d\n", I);
405 return false;
406 }
407 HSAQueues[I] = Q;
408 }
409 return true;
410 }
411
412 ~HSAQueueScheduler() {
413 for (uint8_t I = 0; I < NUM_QUEUES_PER_DEVICE; I++) {
414 if (HSAQueues[I]) {
415 hsa_status_t Err = hsa_queue_destroy(HSAQueues[I]);
416 if (Err != HSA_STATUS_SUCCESS)
417 DP("Error destroying HSA queue");
418 }
419 }
420 }
421
422 // \return next queue to use for device
423 hsa_queue_t *next() {
424 return HSAQueues[(Current.fetch_add(1, std::memory_order_relaxed)) %
425 NUM_QUEUES_PER_DEVICE];
426 }
427
428 private:
429 // Number of queues per device
430 enum : uint8_t { NUM_QUEUES_PER_DEVICE = 4 };
431 hsa_queue_t *HSAQueues[NUM_QUEUES_PER_DEVICE] = {};
432 std::atomic<uint8_t> Current;
433 };
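// next() above hands out the queues round-robin: a relaxed fetch_add on the
// uint8_t counter yields 0, 1, 2, 3, 0, 1, ... modulo NUM_QUEUES_PER_DEVICE.
// The counter wrapping at 256 keeps the rotation even because 256 is a
// multiple of NUM_QUEUES_PER_DEVICE (4).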
434
435 /// Class containing all the device information
436 class RTLDeviceInfoTy : HSALifetime {
437 std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries;
438
439 struct QueueDeleter {
440 void operator()(hsa_queue_t *Q) {
441 if (Q) {
442 hsa_status_t Err = hsa_queue_destroy(Q);
443 if (Err != HSA_STATUS_SUCCESS) {
444 DP("Error destroying hsa queue: %s\n", get_error_string(Err));
445 }
446 }
447 }
448 };
449
450 public:
451 bool ConstructionSucceeded = false;
452
453 // Loading a binary populates the symbol tables and mutates various global
454 // state; running a kernel reads those symbol tables.
455 std::shared_timed_mutex LoadRunLock;
456
457 int NumberOfDevices = 0;
458
459 // GPU devices
460 std::vector<hsa_agent_t> HSAAgents;
461 std::vector<HSAQueueScheduler> HSAQueueSchedulers; // one per gpu
462
463 // CPUs
464 std::vector<hsa_agent_t> CPUAgents;
465
466 // Device properties
467 std::vector<int> ComputeUnits;
468 std::vector<int> GroupsPerDevice;
469 std::vector<int> ThreadsPerGroup;
470 std::vector<int> WarpSize;
471 std::vector<std::string> GPUName;
472 std::vector<std::string> TargetID;
473
474 // OpenMP properties
475 std::vector<int> NumTeams;
476 std::vector<int> NumThreads;
477
478 // OpenMP Environment properties
479 EnvironmentVariables Env;
480
481 // OpenMP Requires Flags
482 int64_t RequiresFlags;
483
484 // Resource pools
485 SignalPoolT FreeSignalPool;
486
487 bool HostcallRequired = false;
488
489 std::vector<hsa_executable_t> HSAExecutables;
490
491 std::vector<std::map<std::string, atl_kernel_info_t>> KernelInfoTable;
492 std::vector<std::map<std::string, atl_symbol_info_t>> SymbolInfoTable;
493
494 hsa_amd_memory_pool_t KernArgPool;
495
496 // fine grained memory pool for host allocations
497 hsa_amd_memory_pool_t HostFineGrainedMemoryPool;
498
499 // fine and coarse-grained memory pools per offloading device
500 std::vector<hsa_amd_memory_pool_t> DeviceFineGrainedMemoryPools;
501 std::vector<hsa_amd_memory_pool_t> DeviceCoarseGrainedMemoryPools;
502
503 struct ImplFreePtrDeletor {
504 void operator()(void *P) {
505 core::Runtime::Memfree(P); // ignore failure to free
506 }
507 };
508
509 // device_State shared across loaded binaries, error if inconsistent size
510 std::vector<std::pair<std::unique_ptr<void, ImplFreePtrDeletor>, uint64_t>>
511 DeviceStateStore;
512
513 static const unsigned HardTeamLimit =
514 (1 << 16) - 1; // 64K - 1, so the value fits in a uint16_t
515 static const int DefaultNumTeams = 128;
516
517 // These need to be per-device since different devices can have different
518 // wave sizes, but are currently the same number for each so that refactor
519 // can be postponed.
520 static_assert(getGridValue<32>().GV_Max_Teams ==
521 getGridValue<64>().GV_Max_Teams,
522 "");
523 static const int MaxTeams = getGridValue<64>().GV_Max_Teams;
524
525 static_assert(getGridValue<32>().GV_Max_WG_Size ==
526 getGridValue<64>().GV_Max_WG_Size,
527 "");
528 static const int MaxWgSize = getGridValue<64>().GV_Max_WG_Size;
529
530 static_assert(getGridValue<32>().GV_Default_WG_Size ==
531 getGridValue<64>().GV_Default_WG_Size,
532 "");
533 static const int DefaultWgSize = getGridValue<64>().GV_Default_WG_Size;
534
535 using MemcpyFunc = hsa_status_t (*)(hsa_signal_t, void *, void *, size_t Size,
536 hsa_agent_t, hsa_amd_memory_pool_t);
537 hsa_status_t freesignalpoolMemcpy(void *Dest, void *Src, size_t Size,
538 MemcpyFunc Func, int32_t DeviceId) {
539 hsa_agent_t Agent = HSAAgents[DeviceId];
540 hsa_signal_t S = FreeSignalPool.pop();
541 if (S.handle == 0) {
542 return HSA_STATUS_ERROR;
543 }
544 hsa_status_t R = Func(S, Dest, Src, Size, Agent, HostFineGrainedMemoryPool);
545 FreeSignalPool.push(S);
546 return R;
547 }
548
549 hsa_status_t freesignalpoolMemcpyD2H(void *Dest, void *Src, size_t Size,
550 int32_t DeviceId) {
551 return freesignalpoolMemcpy(Dest, Src, Size, impl_memcpy_d2h, DeviceId);
552 }
553
554 hsa_status_t freesignalpoolMemcpyH2D(void *Dest, void *Src, size_t Size,
555 int32_t DeviceId) {
556 return freesignalpoolMemcpy(Dest, Src, Size, impl_memcpy_h2d, DeviceId);
557 }
558
559 static void printDeviceInfo(int32_t DeviceId, hsa_agent_t Agent) {
560 char TmpChar[1000];
561 uint16_t Major, Minor;
562 uint32_t TmpUInt;
563 uint32_t TmpUInt2;
564 uint32_t CacheSize[4];
565 bool TmpBool;
566 uint16_t WorkgroupMaxDim[3];
567 hsa_dim3_t GridMaxDim;
568
569 // Getting basic information about HSA and Device
570 core::checkResult(
571 hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MAJOR, &Major),
572 "Error from hsa_system_get_info when obtaining "
573 "HSA_SYSTEM_INFO_VERSION_MAJOR\n");
574 core::checkResult(
575 hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MINOR, &Minor),
576 "Error from hsa_system_get_info when obtaining "
577 "HSA_SYSTEM_INFO_VERSION_MINOR\n");
578 printf(" HSA Runtime Version: \t\t%u.%u \n", Major, Minor);
579 printf(" HSA OpenMP Device Number: \t\t%d \n", DeviceId);
580 core::checkResult(
581 hsa_agent_get_info(
582 Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_PRODUCT_NAME, TmpChar),
583 "Error returned from hsa_agent_get_info when obtaining "
584 "HSA_AMD_AGENT_INFO_PRODUCT_NAME\n");
585 printf(" Product Name: \t\t\t%s \n", TmpChar);
586 core::checkResult(hsa_agent_get_info(Agent, HSA_AGENT_INFO_NAME, TmpChar),
587 "Error returned from hsa_agent_get_info when obtaining "
588 "HSA_AGENT_INFO_NAME\n");
589 printf(" Device Name: \t\t\t%s \n", TmpChar);
590 core::checkResult(
591 hsa_agent_get_info(Agent, HSA_AGENT_INFO_VENDOR_NAME, TmpChar),
592 "Error returned from hsa_agent_get_info when obtaining "
593 "HSA_AGENT_INFO_NAME\n");
594 printf(" Vendor Name: \t\t\t%s \n", TmpChar);
595 hsa_device_type_t DevType;
596 core::checkResult(
597 hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &DevType),
598 "Error returned from hsa_agent_get_info when obtaining "
599 "HSA_AGENT_INFO_DEVICE\n");
600 printf(" Device Type: \t\t\t%s \n",
601 DevType == HSA_DEVICE_TYPE_CPU
602 ? "CPU"
603 : (DevType == HSA_DEVICE_TYPE_GPU
604 ? "GPU"
605 : (DevType == HSA_DEVICE_TYPE_DSP ? "DSP" : "UNKNOWN")));
606 core::checkResult(
607 hsa_agent_get_info(Agent, HSA_AGENT_INFO_QUEUES_MAX, &TmpUInt),
608 "Error returned from hsa_agent_get_info when obtaining "
609 "HSA_AGENT_INFO_QUEUES_MAX\n");
610 printf(" Max Queues: \t\t\t%u \n", TmpUInt);
611 core::checkResult(
612 hsa_agent_get_info(Agent, HSA_AGENT_INFO_QUEUE_MIN_SIZE, &TmpUInt),
613 "Error returned from hsa_agent_get_info when obtaining "
614 "HSA_AGENT_INFO_QUEUE_MIN_SIZE\n");
615 printf(" Queue Min Size: \t\t\t%u \n", TmpUInt);
616 core::checkResult(
617 hsa_agent_get_info(Agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &TmpUInt),
618 "Error returned from hsa_agent_get_info when obtaining "
619 "HSA_AGENT_INFO_QUEUE_MAX_SIZE\n");
620 printf(" Queue Max Size: \t\t\t%u \n", TmpUInt);
621
622 // Getting cache information
623 printf(" Cache:\n");
624
625 // FIXME: This is deprecated according to HSA documentation. But using
626 // hsa_agent_iterate_caches and hsa_cache_get_info breaks execution during
627 // runtime.
628 core::checkResult(
629 hsa_agent_get_info(Agent, HSA_AGENT_INFO_CACHE_SIZE, CacheSize),
630 "Error returned from hsa_agent_get_info when obtaining "
631 "HSA_AGENT_INFO_CACHE_SIZE\n");
632
633 for (int I = 0; I < 4; I++) {
634 if (CacheSize[I]) {
635 printf(" L%u: \t\t\t\t%u bytes\n", I, CacheSize[I]);
636 }
637 }
638
639 core::checkResult(
640 hsa_agent_get_info(Agent,
641 (hsa_agent_info_t)HSA_AMD_AGENT_INFO_CACHELINE_SIZE,
642 &TmpUInt),
643 "Error returned from hsa_agent_get_info when obtaining "
644 "HSA_AMD_AGENT_INFO_CACHELINE_SIZE\n");
645 printf(" Cacheline Size: \t\t\t%u \n", TmpUInt);
646 core::checkResult(
647 hsa_agent_get_info(
648 Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY,
649 &TmpUInt),
650 "Error returned from hsa_agent_get_info when obtaining "
651 "HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY\n");
652 printf(" Max Clock Freq(MHz): \t\t%u \n", TmpUInt);
653 core::checkResult(
654 hsa_agent_get_info(
655 Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT,
656 &TmpUInt),
657 "Error returned from hsa_agent_get_info when obtaining "
658 "HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT\n");
659 printf(" Compute Units: \t\t\t%u \n", TmpUInt);
660 core::checkResult(hsa_agent_get_info(
661 Agent,
662 (hsa_agent_info_t)HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU,
663 &TmpUInt),
664 "Error returned from hsa_agent_get_info when obtaining "
665 "HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU\n");
666 printf(" SIMD per CU: \t\t\t%u \n", TmpUInt);
667 core::checkResult(
668 hsa_agent_get_info(Agent, HSA_AGENT_INFO_FAST_F16_OPERATION, &TmpBool),
669 "Error returned from hsa_agent_get_info when obtaining "
670 "HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU\n");
671 printf(" Fast F16 Operation: \t\t%s \n", (TmpBool ? "TRUE" : "FALSE"));
672 core::checkResult(
673 hsa_agent_get_info(Agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &TmpUInt2),
674 "Error returned from hsa_agent_get_info when obtaining "
675 "HSA_AGENT_INFO_WAVEFRONT_SIZE\n");
676 printf(" Wavefront Size: \t\t\t%u \n", TmpUInt2);
677 core::checkResult(
678 hsa_agent_get_info(Agent, HSA_AGENT_INFO_WORKGROUP_MAX_SIZE, &TmpUInt),
679 "Error returned from hsa_agent_get_info when obtaining "
680 "HSA_AGENT_INFO_WORKGROUP_MAX_SIZE\n");
681 printf(" Workgroup Max Size: \t\t%u \n", TmpUInt);
682 core::checkResult(hsa_agent_get_info(Agent,
683 HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
684 WorkgroupMaxDim),
685 "Error returned from hsa_agent_get_info when obtaining "
686 "HSA_AGENT_INFO_WORKGROUP_MAX_DIM\n");
687 printf(" Workgroup Max Size per Dimension:\n");
688 printf(" x: \t\t\t\t%u\n", WorkgroupMaxDim[0]);
689 printf(" y: \t\t\t\t%u\n", WorkgroupMaxDim[1]);
690 printf(" z: \t\t\t\t%u\n", WorkgroupMaxDim[2]);
691 core::checkResult(hsa_agent_get_info(
692 Agent,
693 (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU,
694 &TmpUInt),
695 "Error returned from hsa_agent_get_info when obtaining "
696 "HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU\n");
697 printf(" Max Waves Per CU: \t\t\t%u \n", TmpUInt);
698 printf(" Max Work-item Per CU: \t\t%u \n", TmpUInt * TmpUInt2);
699 core::checkResult(
700 hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_SIZE, &TmpUInt),
701 "Error returned from hsa_agent_get_info when obtaining "
702 "HSA_AGENT_INFO_GRID_MAX_SIZE\n");
703 printf(" Grid Max Size: \t\t\t%u \n", TmpUInt);
704 core::checkResult(
705 hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_DIM, &GridMaxDim),
706 "Error returned from hsa_agent_get_info when obtaining "
707 "HSA_AGENT_INFO_GRID_MAX_DIM\n");
708 printf(" Grid Max Size per Dimension: \t\t\n");
709 printf(" x: \t\t\t\t%u\n", GridMaxDim.x);
710 printf(" y: \t\t\t\t%u\n", GridMaxDim.y);
711 printf(" z: \t\t\t\t%u\n", GridMaxDim.z);
712 core::checkResult(
713 hsa_agent_get_info(Agent, HSA_AGENT_INFO_FBARRIER_MAX_SIZE, &TmpUInt),
714 "Error returned from hsa_agent_get_info when obtaining "
715 "HSA_AGENT_INFO_FBARRIER_MAX_SIZE\n");
716 printf(" Max fbarriers/Workgrp: \t\t%u\n", TmpUInt);
717
718 printf(" Memory Pools:\n");
719 auto CbMem = [](hsa_amd_memory_pool_t Region, void *Data) -> hsa_status_t {
720 std::string TmpStr;
721 size_t Size;
722 bool Alloc, Access;
723 hsa_amd_segment_t Segment;
724 hsa_amd_memory_pool_global_flag_t GlobalFlags;
725 core::checkResult(
726 hsa_amd_memory_pool_get_info(
727 Region, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags),
728 "Error returned from hsa_amd_memory_pool_get_info when obtaining "
729 "HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS\n");
730 core::checkResult(hsa_amd_memory_pool_get_info(
731 Region, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &Segment),
732 "Error returned from hsa_amd_memory_pool_get_info when "
733 "obtaining HSA_AMD_MEMORY_POOL_INFO_SEGMENT\n");
734
735 switch (Segment) {
736 case HSA_AMD_SEGMENT_GLOBAL:
737 TmpStr = "GLOBAL; FLAGS: ";
738 if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT & GlobalFlags)
739 TmpStr += "KERNARG, ";
740 if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED & GlobalFlags)
741 TmpStr += "FINE GRAINED, ";
742 if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED & GlobalFlags)
743 TmpStr += "COARSE GRAINED, ";
744 break;
745 case HSA_AMD_SEGMENT_READONLY:
746 TmpStr = "READONLY";
747 break;
748 case HSA_AMD_SEGMENT_PRIVATE:
749 TmpStr = "PRIVATE";
750 break;
751 case HSA_AMD_SEGMENT_GROUP:
752 TmpStr = "GROUP";
753 break;
754 }
755 printf(" Pool %s: \n", TmpStr.c_str());
756
757 core::checkResult(hsa_amd_memory_pool_get_info(
758 Region, HSA_AMD_MEMORY_POOL_INFO_SIZE, &Size),
759 "Error returned from hsa_amd_memory_pool_get_info when "
760 "obtaining HSA_AMD_MEMORY_POOL_INFO_SIZE\n");
761 printf(" Size: \t\t\t\t %zu bytes\n", Size);
762 core::checkResult(
763 hsa_amd_memory_pool_get_info(
764 Region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, &Alloc),
765 "Error returned from hsa_amd_memory_pool_get_info when obtaining "
766 "HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED\n");
767 printf(" Allocatable: \t\t\t %s\n", (Alloc ? "TRUE" : "FALSE"));
768 core::checkResult(
769 hsa_amd_memory_pool_get_info(
770 Region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE, &Size),
771 "Error returned from hsa_amd_memory_pool_get_info when obtaining "
772 "HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE\n");
773 printf(" Runtime Alloc Granule: \t\t %zu bytes\n", Size);
774 core::checkResult(
775 hsa_amd_memory_pool_get_info(
776 Region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT, &Size),
777 "Error returned from hsa_amd_memory_pool_get_info when obtaining "
778 "HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT\n");
779 printf(" Runtime Alloc alignment: \t %zu bytes\n", Size);
780 core::checkResult(
781 hsa_amd_memory_pool_get_info(
782 Region, HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL, &Access),
783 "Error returned from hsa_amd_memory_pool_get_info when obtaining "
784 "HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL\n");
785 printf(" Accessable by all: \t\t %s\n",
786 (Access ? "TRUE" : "FALSE"));
787
788 return HSA_STATUS_SUCCESS;
789 };
790 // Iterate over all the memory regions for this agent. Get the memory region
791 // type and size
792 hsa_amd_agent_iterate_memory_pools(Agent, CbMem, nullptr);
793
794 printf(" ISAs:\n");
795 auto CBIsas = [](hsa_isa_t Isa, void *Data) -> hsa_status_t {
796 char TmpChar[1000];
797 core::checkResult(hsa_isa_get_info_alt(Isa, HSA_ISA_INFO_NAME, TmpChar),
798 "Error returned from hsa_isa_get_info_alt when "
799 "obtaining HSA_ISA_INFO_NAME\n");
800 printf(" Name: \t\t\t\t %s\n", TmpChar);
801
802 return HSA_STATUS_SUCCESS;
803 };
804 // Iterate over all the ISAs supported by this agent and print the name of
805 // each one.
806 hsa_agent_iterate_isas(Agent, CBIsas, nullptr);
807 }
808
809 // Record entry point associated with device
810 void addOffloadEntry(int32_t DeviceId, __tgt_offload_entry Entry) {
811 assert(DeviceId < (int32_t)FuncGblEntries.size() &&
812 "Unexpected device id!");
813 FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
814
815 E.Entries.push_back(Entry);
816 }
817
818 // Return true if the entry is associated with device
819 bool findOffloadEntry(int32_t DeviceId, void *Addr) {
820 assert(DeviceId < (int32_t)FuncGblEntries.size() &&
821 "Unexpected device id!");
822 FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
823
824 for (auto &It : E.Entries) {
825 if (It.addr == Addr)
826 return true;
827 }
828
829 return false;
830 }
831
832 // Return the pointer to the target entries table
833 __tgt_target_table *getOffloadEntriesTable(int32_t DeviceId) {
834 assert(DeviceId < (int32_t)FuncGblEntries.size() &&
835 "Unexpected device id!");
836 FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
837
838 int32_t Size = E.Entries.size();
839
840 // Table is empty
841 if (!Size)
842 return 0;
843
844 __tgt_offload_entry *Begin = &E.Entries[0];
845 __tgt_offload_entry *End = &E.Entries[Size - 1];
846
847 // Update table info according to the entries and return the pointer
848 E.Table.EntriesBegin = Begin;
849 E.Table.EntriesEnd = ++End;
850
851 return &E.Table;
852 }
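// The table returned above is the half-open range [EntriesBegin, EntriesEnd):
// EntriesEnd points one past the last entry, so callers can iterate it like a
// standard begin/end pair.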
853
854 // Clear entries table for a device
855 void clearOffloadEntriesTable(int DeviceId) {
856 assert(DeviceId < (int32_t)FuncGblEntries.size() &&
857 "Unexpected device id!");
858 FuncGblEntries[DeviceId].emplace_back();
859 FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
860 // KernelArgPoolMap.clear();
861 E.Entries.clear();
862 E.Table.EntriesBegin = E.Table.EntriesEnd = 0;
863 }
864
865 hsa_status_t addDeviceMemoryPool(hsa_amd_memory_pool_t MemoryPool,
866 unsigned int DeviceId) {
867 assert(DeviceId < DeviceFineGrainedMemoryPools.size() && "Error here.");
868 uint32_t GlobalFlags = 0;
869 hsa_status_t Err = hsa_amd_memory_pool_get_info(
870 MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags);
871
872 if (Err != HSA_STATUS_SUCCESS) {
873 return Err;
874 }
875
876 if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) {
877 DeviceFineGrainedMemoryPools[DeviceId] = MemoryPool;
878 } else if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) {
879 DeviceCoarseGrainedMemoryPools[DeviceId] = MemoryPool;
880 }
881
882 return HSA_STATUS_SUCCESS;
883 }
884
setupDevicePools(const std::vector<hsa_agent_t> & Agents)885 hsa_status_t setupDevicePools(const std::vector<hsa_agent_t> &Agents) {
886 for (unsigned int DeviceId = 0; DeviceId < Agents.size(); DeviceId++) {
887 hsa_status_t Err = hsa::amd_agent_iterate_memory_pools(
888 Agents[DeviceId], [&](hsa_amd_memory_pool_t MemoryPool) {
889 hsa_status_t ValidStatus = core::isValidMemoryPool(MemoryPool);
890 if (ValidStatus != HSA_STATUS_SUCCESS) {
891 DP("Alloc allowed in memory pool check failed: %s\n",
892 get_error_string(ValidStatus));
893 return HSA_STATUS_SUCCESS;
894 }
895 return addDeviceMemoryPool(MemoryPool, DeviceId);
896 });
897
898 if (Err != HSA_STATUS_SUCCESS) {
899 DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
900 "Iterate all memory pools", get_error_string(Err));
901 return Err;
902 }
903 }
904 return HSA_STATUS_SUCCESS;
905 }
906
setupHostMemoryPools(std::vector<hsa_agent_t> & Agents)907 hsa_status_t setupHostMemoryPools(std::vector<hsa_agent_t> &Agents) {
908 std::vector<hsa_amd_memory_pool_t> HostPools;
909
910 // collect all the "valid" pools for all the given agents.
911 for (const auto &Agent : Agents) {
912 hsa_status_t Err = hsa_amd_agent_iterate_memory_pools(
913 Agent, core::addMemoryPool, static_cast<void *>(&HostPools));
914 if (Err != HSA_STATUS_SUCCESS) {
915 DP("addMemoryPool returned %s, continuing\n", get_error_string(Err));
916 }
917 }
918
919 // We need two fine-grained pools.
920 // 1. One with kernarg flag set for storing kernel arguments
921 // 2. Another for general host allocations
922 bool FineGrainedMemoryPoolSet = false;
923 bool KernArgPoolSet = false;
924 for (const auto &MemoryPool : HostPools) {
925 hsa_status_t Err = HSA_STATUS_SUCCESS;
926 uint32_t GlobalFlags = 0;
927 Err = hsa_amd_memory_pool_get_info(
928 MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags);
929 if (Err != HSA_STATUS_SUCCESS) {
930 DP("Get memory pool info failed: %s\n", get_error_string(Err));
931 return Err;
932 }
933
934 if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) {
935 if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT) {
936 KernArgPool = MemoryPool;
937 KernArgPoolSet = true;
938 }
939 HostFineGrainedMemoryPool = MemoryPool;
940 FineGrainedMemoryPoolSet = true;
941 }
942 }
943
944 if (FineGrainedMemoryPoolSet && KernArgPoolSet)
945 return HSA_STATUS_SUCCESS;
946
947 return HSA_STATUS_ERROR;
948 }
949
950 hsa_amd_memory_pool_t getDeviceMemoryPool(unsigned int DeviceId) {
951 assert(DeviceId >= 0 && DeviceId < DeviceCoarseGrainedMemoryPools.size() &&
952 "Invalid device Id");
953 return DeviceCoarseGrainedMemoryPools[DeviceId];
954 }
955
956 hsa_amd_memory_pool_t getHostMemoryPool() {
957 return HostFineGrainedMemoryPool;
958 }
959
960 static int readEnv(const char *Env, int Default = -1) {
961 const char *EnvStr = getenv(Env);
962 int Res = Default;
963 if (EnvStr) {
964 Res = std::stoi(EnvStr);
965 DP("Parsed %s=%d\n", Env, Res);
966 }
967 return Res;
968 }
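// Caveat for readEnv() above: std::stoi throws std::invalid_argument (or
// std::out_of_range) for malformed values, so something like OMP_NUM_TEAMS=abc
// would terminate the process; well-formed numeric values are assumed.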
969
970 RTLDeviceInfoTy() {
971 DP("Start initializing " GETNAME(TARGET_NAME) "\n");
972
973 // LIBOMPTARGET_KERNEL_TRACE provides a kernel launch trace to stderr at
974 // runtime; it does not require a debug build of the library.
975 // 0 => no tracing
976 // 1 => tracing dispatch only
977 // >1 => verbosity increase
978
979 if (!HSAInitSuccess()) {
980 DP("Error when initializing HSA in " GETNAME(TARGET_NAME) "\n");
981 return;
982 }
983
984 if (char *EnvStr = getenv("LIBOMPTARGET_KERNEL_TRACE"))
985 print_kernel_trace = atoi(EnvStr);
986 else
987 print_kernel_trace = 0;
988
989 hsa_status_t Err = core::atl_init_gpu_context();
990 if (Err != HSA_STATUS_SUCCESS) {
991 DP("Error when initializing " GETNAME(TARGET_NAME) "\n");
992 return;
993 }
994
995 // Init hostcall soon after initializing hsa
996 hostrpc_init();
997
998 Err = findAgents([&](hsa_device_type_t DeviceType, hsa_agent_t Agent) {
999 if (DeviceType == HSA_DEVICE_TYPE_CPU) {
1000 CPUAgents.push_back(Agent);
1001 } else {
1002 HSAAgents.push_back(Agent);
1003 }
1004 });
1005 if (Err != HSA_STATUS_SUCCESS)
1006 return;
1007
1008 NumberOfDevices = (int)HSAAgents.size();
1009
1010 if (NumberOfDevices == 0) {
1011 DP("There are no devices supporting HSA.\n");
1012 return;
1013 }
1014 DP("There are %d devices supporting HSA.\n", NumberOfDevices);
1015
1016 // Init the device info
1017 HSAQueueSchedulers.reserve(NumberOfDevices);
1018 FuncGblEntries.resize(NumberOfDevices);
1019 ThreadsPerGroup.resize(NumberOfDevices);
1020 ComputeUnits.resize(NumberOfDevices);
1021 GPUName.resize(NumberOfDevices);
1022 GroupsPerDevice.resize(NumberOfDevices);
1023 WarpSize.resize(NumberOfDevices);
1024 NumTeams.resize(NumberOfDevices);
1025 NumThreads.resize(NumberOfDevices);
1026 DeviceStateStore.resize(NumberOfDevices);
1027 KernelInfoTable.resize(NumberOfDevices);
1028 SymbolInfoTable.resize(NumberOfDevices);
1029 DeviceCoarseGrainedMemoryPools.resize(NumberOfDevices);
1030 DeviceFineGrainedMemoryPools.resize(NumberOfDevices);
1031
1032 Err = setupDevicePools(HSAAgents);
1033 if (Err != HSA_STATUS_SUCCESS) {
1034 DP("Setup for Device Memory Pools failed\n");
1035 return;
1036 }
1037
1038 Err = setupHostMemoryPools(CPUAgents);
1039 if (Err != HSA_STATUS_SUCCESS) {
1040 DP("Setup for Host Memory Pools failed\n");
1041 return;
1042 }
1043
1044 for (int I = 0; I < NumberOfDevices; I++) {
1045 uint32_t QueueSize = 0;
1046 {
1047 hsa_status_t Err = hsa_agent_get_info(
1048 HSAAgents[I], HSA_AGENT_INFO_QUEUE_MAX_SIZE, &QueueSize);
1049 if (Err != HSA_STATUS_SUCCESS) {
1050 DP("HSA query QUEUE_MAX_SIZE failed for agent %d\n", I);
1051 return;
1052 }
1053 enum { MaxQueueSize = 4096 };
1054 if (QueueSize > MaxQueueSize) {
1055 QueueSize = MaxQueueSize;
1056 }
1057 }
1058
1059 {
1060 HSAQueueScheduler QSched;
1061 if (!QSched.createQueues(HSAAgents[I], QueueSize))
1062 return;
1063 HSAQueueSchedulers.emplace_back(std::move(QSched));
1064 }
1065
1066 DeviceStateStore[I] = {nullptr, 0};
1067 }
1068
1069 for (int I = 0; I < NumberOfDevices; I++) {
1070 ThreadsPerGroup[I] = RTLDeviceInfoTy::DefaultWgSize;
1071 GroupsPerDevice[I] = RTLDeviceInfoTy::DefaultNumTeams;
1072 ComputeUnits[I] = 1;
1073 DP("Device %d: Initial groupsPerDevice %d & threadsPerGroup %d\n", I,
1074 GroupsPerDevice[I], ThreadsPerGroup[I]);
1075 }
1076
1077 // Get environment variables regarding teams
1078 Env.TeamLimit = readEnv("OMP_TEAM_LIMIT");
1079 Env.NumTeams = readEnv("OMP_NUM_TEAMS");
1080 Env.MaxTeamsDefault = readEnv("OMP_MAX_TEAMS_DEFAULT");
1081 Env.TeamThreadLimit = readEnv("OMP_TEAMS_THREAD_LIMIT");
1082 Env.DynamicMemSize = readEnv("LIBOMPTARGET_SHARED_MEMORY_SIZE", 0);
1083
1084 // Default state.
1085 RequiresFlags = OMP_REQ_UNDEFINED;
1086
1087 ConstructionSucceeded = true;
1088 }
1089
1090 ~RTLDeviceInfoTy() {
1091 DP("Finalizing the " GETNAME(TARGET_NAME) " DeviceInfo.\n");
1092 if (!HSAInitSuccess()) {
1093 // Then none of these can have been set up and they can't be torn down
1094 return;
1095 }
1096 // Run destructors on types that use HSA before
1097 // impl_finalize removes access to it
1098 DeviceStateStore.clear();
1099 KernelArgPoolMap.clear();
1100 // Terminate hostrpc before finalizing hsa
1101 hostrpc_terminate();
1102
1103 hsa_status_t Err;
1104 for (uint32_t I = 0; I < HSAExecutables.size(); I++) {
1105 Err = hsa_executable_destroy(HSAExecutables[I]);
1106 if (Err != HSA_STATUS_SUCCESS) {
1107 DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
1108 "Destroying executable", get_error_string(Err));
1109 }
1110 }
1111 }
1112 };
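// Because RTLDeviceInfoTy derives from HSALifetime, hsa_init() runs before any
// of the members above are constructed and hsa_shut_down() runs after they are
// all destroyed, which is what lets those members manage HSA resources with
// RAII.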
1113
1114 pthread_mutex_t SignalPoolT::mutex = PTHREAD_MUTEX_INITIALIZER;
1115
1116 // Accesses to the DeviceInfo global go through a function call, in
1117 // preparation for switching to init_plugin/deinit_plugin calls.
1118 static RTLDeviceInfoTy DeviceInfoState;
1119 static RTLDeviceInfoTy &DeviceInfo() { return DeviceInfoState; }
1120
1121 namespace {
1122
1123 int32_t dataRetrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, int64_t Size,
1124 __tgt_async_info *AsyncInfo) {
1125 assert(AsyncInfo && "AsyncInfo is nullptr");
1126 assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
1127 // Return success if we are not copying back to host from target.
1128 if (!HstPtr)
1129 return OFFLOAD_SUCCESS;
1130 hsa_status_t Err;
1131 DP("Retrieve data %ld bytes, (tgt:%016llx) -> (hst:%016llx).\n", Size,
1132 (long long unsigned)(Elf64_Addr)TgtPtr,
1133 (long long unsigned)(Elf64_Addr)HstPtr);
1134
1135 Err = DeviceInfo().freesignalpoolMemcpyD2H(HstPtr, TgtPtr, (size_t)Size,
1136 DeviceId);
1137
1138 if (Err != HSA_STATUS_SUCCESS) {
1139 DP("Error when copying data from device to host. Pointers: "
1140 "host = 0x%016lx, device = 0x%016lx, size = %lld\n",
1141 (Elf64_Addr)HstPtr, (Elf64_Addr)TgtPtr, (unsigned long long)Size);
1142 return OFFLOAD_FAIL;
1143 }
1144 DP("DONE Retrieve data %ld bytes, (tgt:%016llx) -> (hst:%016llx).\n", Size,
1145 (long long unsigned)(Elf64_Addr)TgtPtr,
1146 (long long unsigned)(Elf64_Addr)HstPtr);
1147 return OFFLOAD_SUCCESS;
1148 }
1149
1150 int32_t dataSubmit(int32_t DeviceId, void *TgtPtr, void *HstPtr, int64_t Size,
1151 __tgt_async_info *AsyncInfo) {
1152 assert(AsyncInfo && "AsyncInfo is nullptr");
1153 hsa_status_t Err;
1154 assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
1155 // Return success if we are not doing host to target.
1156 if (!HstPtr)
1157 return OFFLOAD_SUCCESS;
1158
1159 DP("Submit data %ld bytes, (hst:%016llx) -> (tgt:%016llx).\n", Size,
1160 (long long unsigned)(Elf64_Addr)HstPtr,
1161 (long long unsigned)(Elf64_Addr)TgtPtr);
1162 Err = DeviceInfo().freesignalpoolMemcpyH2D(TgtPtr, HstPtr, (size_t)Size,
1163 DeviceId);
1164 if (Err != HSA_STATUS_SUCCESS) {
1165 DP("Error when copying data from host to device. Pointers: "
1166 "host = 0x%016lx, device = 0x%016lx, size = %lld\n",
1167 (Elf64_Addr)HstPtr, (Elf64_Addr)TgtPtr, (unsigned long long)Size);
1168 return OFFLOAD_FAIL;
1169 }
1170 return OFFLOAD_SUCCESS;
1171 }
1172
1173 // Async.
1174 // The implementation was written with cuda streams in mind. The semantics of
1175 // that are to execute kernels on a queue in order of insertion. A synchronise
1176 // call then makes writes visible between host and device. This means a series
1177 // of N data_submit_async calls is expected to execute serially. HSA offers
1178 // various options to run the data copies concurrently; exploiting them may
1179 // require changes to libomptarget.
1180
1181 // __tgt_async_info* contains a void * Queue. Queue = 0 is used to indicate that
1182 // there are no outstanding kernels that need to be synchronized. Any async call
1183 // may be passed a Queue==0, at which point the cuda implementation will set it
1184 // to non-null (see getStream). The cuda streams are per-device. Upstream may
1185 // change this interface to explicitly initialize the AsyncInfo_pointer, but
1186 // until then hsa lazily initializes it as well.
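// Sketch of the resulting lifecycle (illustrative, using the helpers below):
//   __tgt_async_info AI{};   // Queue starts out null: nothing to synchronize
//   initAsyncInfo(&AI);      // Queue set to a non-null sentinel while in use
//   ...data submits / kernel launch...
//   finiAsyncInfo(&AI);      // Queue reset to null once everything is visible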
1187
1188 void initAsyncInfo(__tgt_async_info *AsyncInfo) {
1189 // set non-null while using async calls, return to null to indicate completion
1190 assert(AsyncInfo);
1191 if (!AsyncInfo->Queue) {
1192 AsyncInfo->Queue = reinterpret_cast<void *>(UINT64_MAX);
1193 }
1194 }
1195 void finiAsyncInfo(__tgt_async_info *AsyncInfo) {
1196 assert(AsyncInfo);
1197 assert(AsyncInfo->Queue);
1198 AsyncInfo->Queue = 0;
1199 }
1200
1201 // Determine launch values for kernel.
1202 struct LaunchVals {
1203 int WorkgroupSize;
1204 int GridSize;
1205 };
1206 LaunchVals getLaunchVals(int WarpSize, EnvironmentVariables Env,
1207 int ConstWGSize,
1208 llvm::omp::OMPTgtExecModeFlags ExecutionMode,
1209 int NumTeams, int ThreadLimit, uint64_t LoopTripcount,
1210 int DeviceNumTeams) {
1211
1212 int ThreadsPerGroup = RTLDeviceInfoTy::DefaultWgSize;
1213 int NumGroups = 0;
1214
1215 int MaxTeams = Env.MaxTeamsDefault > 0 ? Env.MaxTeamsDefault : DeviceNumTeams;
1216 if (MaxTeams > static_cast<int>(RTLDeviceInfoTy::HardTeamLimit))
1217 MaxTeams = RTLDeviceInfoTy::HardTeamLimit;
1218
1219 if (print_kernel_trace & STARTUP_DETAILS) {
1220 DP("RTLDeviceInfoTy::Max_Teams: %d\n", RTLDeviceInfoTy::MaxTeams);
1221 DP("Max_Teams: %d\n", MaxTeams);
1222 DP("RTLDeviceInfoTy::Warp_Size: %d\n", WarpSize);
1223 DP("RTLDeviceInfoTy::Max_WG_Size: %d\n", RTLDeviceInfoTy::MaxWgSize);
1224 DP("RTLDeviceInfoTy::Default_WG_Size: %d\n",
1225 RTLDeviceInfoTy::DefaultWgSize);
1226 DP("thread_limit: %d\n", ThreadLimit);
1227 DP("threadsPerGroup: %d\n", ThreadsPerGroup);
1228 DP("ConstWGSize: %d\n", ConstWGSize);
1229 }
1230 // check for thread_limit() clause
1231 if (ThreadLimit > 0) {
1232 ThreadsPerGroup = ThreadLimit;
1233 DP("Setting threads per block to requested %d\n", ThreadLimit);
1234 // Add master warp for GENERIC
1235 if (ExecutionMode ==
1236 llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC) {
1237 ThreadsPerGroup += WarpSize;
1238 DP("Adding master wavefront: +%d threads\n", WarpSize);
1239 }
1240 if (ThreadsPerGroup > RTLDeviceInfoTy::MaxWgSize) { // limit to max
1241 ThreadsPerGroup = RTLDeviceInfoTy::MaxWgSize;
1242 DP("Setting threads per block to maximum %d\n", ThreadsPerGroup);
1243 }
1244 }
1245 // check flat_max_work_group_size attr here
1246 if (ThreadsPerGroup > ConstWGSize) {
1247 ThreadsPerGroup = ConstWGSize;
1248 DP("Reduced threadsPerGroup to flat-attr-group-size limit %d\n",
1249 ThreadsPerGroup);
1250 }
1251 if (print_kernel_trace & STARTUP_DETAILS)
1252 DP("threadsPerGroup: %d\n", ThreadsPerGroup);
1253 DP("Preparing %d threads\n", ThreadsPerGroup);
1254
1255 // Set default num_groups (teams)
1256 if (Env.TeamLimit > 0)
1257 NumGroups = (MaxTeams < Env.TeamLimit) ? MaxTeams : Env.TeamLimit;
1258 else
1259 NumGroups = MaxTeams;
1260 DP("Set default num of groups %d\n", NumGroups);
1261
1262 if (print_kernel_trace & STARTUP_DETAILS) {
1263 DP("num_groups: %d\n", NumGroups);
1264 DP("num_teams: %d\n", NumTeams);
1265 }
1266
1267 // Reduce num_groups if threadsPerGroup exceeds RTLDeviceInfoTy::DefaultWgSize.
1268 // This reduction is typical for the default case (no thread_limit clause)
1269 // or when the user requests a very large num_teams.
1270 // FIXME: We can't distinguish between a constant and a variable thread limit,
1271 // so we only handle constant thread_limits.
1272 if (ThreadsPerGroup >
1273 RTLDeviceInfoTy::DefaultWgSize) // 256 < threadsPerGroup <= 1024
1274 // Should we round threadsPerGroup up to nearest WarpSize
1275 // here?
1276 NumGroups = (MaxTeams * RTLDeviceInfoTy::MaxWgSize) / ThreadsPerGroup;
1277
1278 // check for num_teams() clause
1279 if (NumTeams > 0) {
1280 NumGroups = (NumTeams < NumGroups) ? NumTeams : NumGroups;
1281 }
1282 if (print_kernel_trace & STARTUP_DETAILS) {
1283 DP("num_groups: %d\n", NumGroups);
1284 DP("Env.NumTeams %d\n", Env.NumTeams);
1285 DP("Env.TeamLimit %d\n", Env.TeamLimit);
1286 }
1287
1288 if (Env.NumTeams > 0) {
1289 NumGroups = (Env.NumTeams < NumGroups) ? Env.NumTeams : NumGroups;
1290 DP("Modifying teams based on Env.NumTeams %d\n", Env.NumTeams);
1291 } else if (Env.TeamLimit > 0) {
1292 NumGroups = (Env.TeamLimit < NumGroups) ? Env.TeamLimit : NumGroups;
1293 DP("Modifying teams based on Env.TeamLimit%d\n", Env.TeamLimit);
1294 } else {
1295 if (NumTeams <= 0) {
1296 if (LoopTripcount > 0) {
1297 if (ExecutionMode ==
1298 llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD) {
1299 // round up to the nearest integer
1300 NumGroups = ((LoopTripcount - 1) / ThreadsPerGroup) + 1;
1301 } else if (ExecutionMode ==
1302 llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC) {
1303 NumGroups = LoopTripcount;
1304 } else /* OMP_TGT_EXEC_MODE_GENERIC_SPMD */ {
1305 // This is a generic kernel that was transformed to use SPMD-mode
1306 // execution but uses Generic-mode semantics for scheduling.
1307 NumGroups = LoopTripcount;
1308 }
1309 DP("Using %d teams due to loop trip count %" PRIu64 " and number of "
1310 "threads per block %d\n",
1311 NumGroups, LoopTripcount, ThreadsPerGroup);
1312 }
1313 } else {
1314 NumGroups = NumTeams;
1315 }
1316 if (NumGroups > MaxTeams) {
1317 NumGroups = MaxTeams;
1318 if (print_kernel_trace & STARTUP_DETAILS)
1319 DP("Limiting num_groups %d to Max_Teams %d \n", NumGroups, MaxTeams);
1320 }
1321 if (NumGroups > NumTeams && NumTeams > 0) {
1322 NumGroups = NumTeams;
1323 if (print_kernel_trace & STARTUP_DETAILS)
1324 DP("Limiting num_groups %d to clause num_teams %d \n", NumGroups,
1325 NumTeams);
1326 }
1327 }
1328
1329 // The num_teams clause is always honored, capped only by Env.MaxTeamsDefault when set.
1330 if (NumTeams > 0) {
1331 NumGroups = NumTeams;
1332 // Cap num_groups to EnvMaxTeamsDefault if set.
1333 if (Env.MaxTeamsDefault > 0 && NumGroups > Env.MaxTeamsDefault)
1334 NumGroups = Env.MaxTeamsDefault;
1335 }
1336 if (print_kernel_trace & STARTUP_DETAILS) {
1337 DP("threadsPerGroup: %d\n", ThreadsPerGroup);
1338 DP("num_groups: %d\n", NumGroups);
1339 DP("loop_tripcount: %ld\n", LoopTripcount);
1340 }
1341 DP("Final %d num_groups and %d threadsPerGroup\n", NumGroups,
1342 ThreadsPerGroup);
1343
1344 LaunchVals Res;
1345 Res.WorkgroupSize = ThreadsPerGroup;
1346 Res.GridSize = ThreadsPerGroup * NumGroups;
1347 return Res;
1348 }
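// Worked example for getLaunchVals() (hypothetical inputs): WarpSize = 64,
// ConstWGSize = 256, SPMD mode, no num_teams/thread_limit clauses, no
// environment overrides, LoopTripcount = 10000, DeviceNumTeams = 128:
//   ThreadsPerGroup stays at DefaultWgSize (256),
//   NumGroups = (10000 - 1) / 256 + 1 = 40,
// so the dispatch uses WorkgroupSize = 256 and GridSize = 40 * 256 = 10240.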
1349
1350 static uint64_t acquireAvailablePacketId(hsa_queue_t *Queue) {
1351 uint64_t PacketId = hsa_queue_add_write_index_relaxed(Queue, 1);
1352 bool Full = true;
1353 while (Full) {
1354 Full =
1355 PacketId >= (Queue->size + hsa_queue_load_read_index_scacquire(Queue));
1356 }
1357 return PacketId;
1358 }
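// The loop above implements back-pressure on the ring buffer: the write index
// is reserved unconditionally, then we spin until the read index has advanced
// to within Queue->size packets of it, i.e. until the reserved slot is free.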
1359
1360 int32_t runRegionLocked(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs,
1361 ptrdiff_t *TgtOffsets, int32_t ArgNum, int32_t NumTeams,
1362 int32_t ThreadLimit, uint64_t LoopTripcount) {
1363 // Set the context we are using
1364 // update thread limit content in gpu memory if un-initialized or specified
1365 // from host
1366
1367 DP("Run target team region thread_limit %d\n", ThreadLimit);
1368
1369 // All args are references.
1370 std::vector<void *> Args(ArgNum);
1371 std::vector<void *> Ptrs(ArgNum);
1372
1373 DP("Arg_num: %d\n", ArgNum);
1374 for (int32_t I = 0; I < ArgNum; ++I) {
1375 Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
1376 Args[I] = &Ptrs[I];
1377 DP("Offseted base: arg[%d]:" DPxMOD "\n", I, DPxPTR(Ptrs[I]));
1378 }
1379
1380 KernelTy *KernelInfo = (KernelTy *)TgtEntryPtr;
1381
1382 std::string KernelName = std::string(KernelInfo->Name);
1383 auto &KernelInfoTable = DeviceInfo().KernelInfoTable;
1384 if (KernelInfoTable[DeviceId].find(KernelName) ==
1385 KernelInfoTable[DeviceId].end()) {
1386 DP("Kernel %s not found\n", KernelName.c_str());
1387 return OFFLOAD_FAIL;
1388 }
1389
1390 const atl_kernel_info_t KernelInfoEntry =
1391 KernelInfoTable[DeviceId][KernelName];
1392 const uint32_t GroupSegmentSize =
1393 KernelInfoEntry.group_segment_size + DeviceInfo().Env.DynamicMemSize;
1394 const uint32_t SgprCount = KernelInfoEntry.sgpr_count;
1395 const uint32_t VgprCount = KernelInfoEntry.vgpr_count;
1396 const uint32_t SgprSpillCount = KernelInfoEntry.sgpr_spill_count;
1397 const uint32_t VgprSpillCount = KernelInfoEntry.vgpr_spill_count;
1398
1399 assert(ArgNum == (int)KernelInfoEntry.explicit_argument_count);
1400
1401 /*
1402 * Set limit based on ThreadsPerGroup and GroupsPerDevice
1403 */
1404 LaunchVals LV =
1405 getLaunchVals(DeviceInfo().WarpSize[DeviceId], DeviceInfo().Env,
1406 KernelInfo->ConstWGSize, KernelInfo->ExecutionMode,
1407 NumTeams, // From run_region arg
1408 ThreadLimit, // From run_region arg
1409 LoopTripcount, // From run_region arg
1410 DeviceInfo().NumTeams[KernelInfo->DeviceId]);
1411 const int GridSize = LV.GridSize;
1412 const int WorkgroupSize = LV.WorkgroupSize;
1413
1414 if (print_kernel_trace >= LAUNCH) {
1415 int NumGroups = GridSize / WorkgroupSize;
1416 // enum modes are SPMD, GENERIC, NONE 0,1,2
1417 // if doing rtl timing, print to stderr, unless stdout requested.
1418 bool TraceToStdout = print_kernel_trace & (RTL_TO_STDOUT | RTL_TIMING);
1419 fprintf(TraceToStdout ? stdout : stderr,
1420 "DEVID:%2d SGN:%1d ConstWGSize:%-4d args:%2d teamsXthrds:(%4dX%4d) "
1421 "reqd:(%4dX%4d) lds_usage:%uB sgpr_count:%u vgpr_count:%u "
1422 "sgpr_spill_count:%u vgpr_spill_count:%u tripcount:%lu n:%s\n",
1423 DeviceId, KernelInfo->ExecutionMode, KernelInfo->ConstWGSize,
1424 ArgNum, NumGroups, WorkgroupSize, NumTeams, ThreadLimit,
1425 GroupSegmentSize, SgprCount, VgprCount, SgprSpillCount,
1426 VgprSpillCount, LoopTripcount, KernelInfo->Name);
1427 }
1428
1429 // Run on the device.
1430 {
1431 hsa_queue_t *Queue = DeviceInfo().HSAQueueSchedulers[DeviceId].next();
1432 if (!Queue) {
1433 return OFFLOAD_FAIL;
1434 }
1435 uint64_t PacketId = acquireAvailablePacketId(Queue);
1436
1437 const uint32_t Mask = Queue->size - 1; // size is a power of 2
1438 hsa_kernel_dispatch_packet_t *Packet =
1439 (hsa_kernel_dispatch_packet_t *)Queue->base_address + (PacketId & Mask);
1440
1441 // packet->header is written last
1442 Packet->setup = UINT16_C(1) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
1443 Packet->workgroup_size_x = WorkgroupSize;
1444 Packet->workgroup_size_y = 1;
1445 Packet->workgroup_size_z = 1;
1446 Packet->reserved0 = 0;
1447 Packet->grid_size_x = GridSize;
1448 Packet->grid_size_y = 1;
1449 Packet->grid_size_z = 1;
1450 Packet->private_segment_size = KernelInfoEntry.private_segment_size;
1451 Packet->group_segment_size = GroupSegmentSize;
1452 Packet->kernel_object = KernelInfoEntry.kernel_object;
1453 Packet->kernarg_address = 0; // use the block allocator
1454 Packet->reserved2 = 0; // impl writes id_ here
1455 Packet->completion_signal = {0}; // may want a pool of signals
1456
1457 KernelArgPool *ArgPool = nullptr;
1458 void *KernArg = nullptr;
1459 {
1460 auto It = KernelArgPoolMap.find(std::string(KernelInfo->Name));
1461 if (It != KernelArgPoolMap.end()) {
1462 ArgPool = (It->second).get();
1463 }
1464 }
1465 if (!ArgPool) {
1466 DP("Warning: No ArgPool for %s on device %d\n", KernelInfo->Name,
1467 DeviceId);
1468 }
1469 {
1470 if (ArgPool) {
1471 assert(ArgPool->KernargSegmentSize == (ArgNum * sizeof(void *)));
1472 KernArg = ArgPool->allocate(ArgNum);
1473 }
1474 if (!KernArg) {
1475 DP("Allocate kernarg failed\n");
1476 return OFFLOAD_FAIL;
1477 }
1478
1479 // Copy explicit arguments
1480 for (int I = 0; I < ArgNum; I++) {
1481 memcpy((char *)KernArg + sizeof(void *) * I, Args[I], sizeof(void *));
1482 }
1483
1484 // Initialize implicit arguments. TODO: Which of these can be dropped
1485 impl_implicit_args_t *ImplArgs = reinterpret_cast<impl_implicit_args_t *>(
1486 static_cast<char *>(KernArg) + ArgPool->KernargSegmentSize);
1487 memset(ImplArgs, 0,
1488 sizeof(impl_implicit_args_t)); // may not be necessary
1489 ImplArgs->offset_x = 0;
1490 ImplArgs->offset_y = 0;
1491 ImplArgs->offset_z = 0;
1492
1493 // assign a hostcall buffer for the selected Q
1494 if (__atomic_load_n(&DeviceInfo().HostcallRequired, __ATOMIC_ACQUIRE)) {
1495 // hostrpc_assign_buffer is not thread safe, and this function is
1496 // under a multiple reader lock, not a writer lock.
1497 static pthread_mutex_t HostcallInitLock = PTHREAD_MUTEX_INITIALIZER;
1498 pthread_mutex_lock(&HostcallInitLock);
1499 uint64_t Buffer = hostrpc_assign_buffer(DeviceInfo().HSAAgents[DeviceId],
1500 Queue, DeviceId);
1501 pthread_mutex_unlock(&HostcallInitLock);
1502 if (!Buffer) {
1503 DP("hostrpc_assign_buffer failed, gpu would dereference null and "
1504 "error\n");
1505 return OFFLOAD_FAIL;
1506 }
1507
1508 DP("Implicit argument count: %d\n",
1509 KernelInfoEntry.implicit_argument_count);
1510 if (KernelInfoEntry.implicit_argument_count >= 4) {
1511 // Initialise pointer for implicit_argument_count != 0 ABI
1512 // Guess that the right implicit argument is at offset 24 after
1513 // the explicit arguments. In the future, should be able to read
1514 // the offset from msgpack. Clang is not annotating it at present.
1515 uint64_t Offset =
1516 sizeof(void *) * (KernelInfoEntry.explicit_argument_count + 3);
1517 if ((Offset + 8) > ArgPool->kernargSizeIncludingImplicit()) {
1518 DP("Bad offset of hostcall: %lu, exceeds kernarg size w/ implicit "
1519 "args: %d\n",
1520 Offset + 8, ArgPool->kernargSizeIncludingImplicit());
1521 } else {
1522 memcpy(static_cast<char *>(KernArg) + Offset, &Buffer, 8);
1523 }
1524 }
1525
1526 // initialise pointer for implicit_argument_count == 0 ABI
1527 ImplArgs->hostcall_ptr = Buffer;
1528 }
1529
1530 Packet->kernarg_address = KernArg;
1531 }
1532
1533 hsa_signal_t S = DeviceInfo().FreeSignalPool.pop();
1534 if (S.handle == 0) {
1535 DP("Failed to get signal instance\n");
1536 return OFFLOAD_FAIL;
1537 }
1538 Packet->completion_signal = S;
1539 hsa_signal_store_relaxed(Packet->completion_signal, 1);
1540
1541 // Publish the packet indicating it is ready to be processed
1542 core::packetStoreRelease(reinterpret_cast<uint32_t *>(Packet),
1543 core::createHeader(), Packet->setup);
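// Descriptive note: packetStoreRelease writes the header with release
// semantics, so every packet field written above becomes visible to the
// packet processor no later than the doorbell store below. This is why the
// header is written last.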
1544
1545 // Since the packet is already published, its contents must not be
1546 // accessed any more
1547 hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId);
1548
1549 while (hsa_signal_wait_scacquire(S, HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX,
1550 HSA_WAIT_STATE_BLOCKED) != 0)
1551 ;
1552
1553 assert(ArgPool);
1554 ArgPool->deallocate(KernArg);
1555 DeviceInfo().FreeSignalPool.push(S);
1556 }
1557
1558 DP("Kernel completed\n");
1559 return OFFLOAD_SUCCESS;
1560 }
1561
1562 bool elfMachineIdIsAmdgcn(__tgt_device_image *Image) {
1563 const uint16_t AmdgcnMachineID = 224; // EM_AMDGPU may not be in system elf.h
1564 int32_t R = elf_check_machine(Image, AmdgcnMachineID);
1565 if (!R) {
1566 DP("Supported machine ID not found\n");
1567 }
1568 return R;
1569 }
1570
1571 uint32_t elfEFlags(__tgt_device_image *Image) {
1572 char *ImgBegin = (char *)Image->ImageStart;
1573 size_t ImgSize = (char *)Image->ImageEnd - ImgBegin;
1574
1575 Elf *E = elf_memory(ImgBegin, ImgSize);
1576 if (!E) {
1577 DP("Unable to get ELF handle: %s!\n", elf_errmsg(-1));
1578 return 0;
1579 }
1580
1581 Elf64_Ehdr *Eh64 = elf64_getehdr(E);
1582
1583 if (!Eh64) {
1584 DP("Unable to get machine ID from ELF file!\n");
1585 elf_end(E);
1586 return 0;
1587 }
1588
1589 uint32_t Flags = Eh64->e_flags;
1590
1591 elf_end(E);
1592 DP("ELF Flags: 0x%x\n", Flags);
1593 return Flags;
1594 }
1595
1596 template <typename T> bool enforceUpperBound(T *Value, T Upper) {
1597 bool Changed = *Value > Upper;
1598 if (Changed) {
1599 *Value = Upper;
1600 }
1601 return Changed;
1602 }
1603
1604 Elf64_Shdr *findOnlyShtHash(Elf *Elf) {
1605 size_t N;
1606 int Rc = elf_getshdrnum(Elf, &N);
1607 if (Rc != 0) {
1608 return nullptr;
1609 }
1610
1611 Elf64_Shdr *Result = nullptr;
1612 for (size_t I = 0; I < N; I++) {
1613 Elf_Scn *Scn = elf_getscn(Elf, I);
1614 if (Scn) {
1615 Elf64_Shdr *Shdr = elf64_getshdr(Scn);
1616 if (Shdr) {
1617 if (Shdr->sh_type == SHT_HASH) {
1618 if (Result == nullptr) {
1619 Result = Shdr;
1620 } else {
1621 // multiple SHT_HASH sections not handled
1622 return nullptr;
1623 }
1624 }
1625 }
1626 }
1627 }
1628 return Result;
1629 }
1630
1631 const Elf64_Sym *elfLookup(Elf *Elf, char *Base, Elf64_Shdr *SectionHash,
1632 const char *Symname) {
1633
1634 assert(SectionHash);
1635 size_t SectionSymtabIndex = SectionHash->sh_link;
1636 Elf64_Shdr *SectionSymtab =
1637 elf64_getshdr(elf_getscn(Elf, SectionSymtabIndex));
1638 size_t SectionStrtabIndex = SectionSymtab->sh_link;
1639
1640 const Elf64_Sym *Symtab =
1641 reinterpret_cast<const Elf64_Sym *>(Base + SectionSymtab->sh_offset);
1642
1643 const uint32_t *Hashtab =
1644 reinterpret_cast<const uint32_t *>(Base + SectionHash->sh_offset);
1645
1646 // Layout:
1647 // nbucket
1648 // nchain
1649 // bucket[nbucket]
1650 // chain[nchain]
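// Illustrative walk (hypothetical table with nbucket == 3): a symbol whose
// elf_hash(name) % 3 == 1 is found by starting at Bucket[1] and following
// Chain[] until the symbol name matches or index 0 (STN_UNDEF) ends the
// search, which is exactly the loop below.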
1651 uint32_t Nbucket = Hashtab[0];
1652 const uint32_t *Bucket = &Hashtab[2];
1653 const uint32_t *Chain = &Hashtab[Nbucket + 2];
1654
1655 const size_t Max = strlen(Symname) + 1;
1656 const uint32_t Hash = elf_hash(Symname);
1657 for (uint32_t I = Bucket[Hash % Nbucket]; I != 0; I = Chain[I]) {
1658 char *N = elf_strptr(Elf, SectionStrtabIndex, Symtab[I].st_name);
1659 if (strncmp(Symname, N, Max) == 0) {
1660 return &Symtab[I];
1661 }
1662 }
1663
1664 return nullptr;
1665 }
1666
1667 struct SymbolInfo {
1668 void *Addr = nullptr;
1669 uint32_t Size = UINT32_MAX;
1670 uint32_t ShType = SHT_NULL;
1671 };
1672
1673 int getSymbolInfoWithoutLoading(Elf *Elf, char *Base, const char *Symname,
1674 SymbolInfo *Res) {
1675 if (elf_kind(Elf) != ELF_K_ELF) {
1676 return 1;
1677 }
1678
1679 Elf64_Shdr *SectionHash = findOnlyShtHash(Elf);
1680 if (!SectionHash) {
1681 return 1;
1682 }
1683
1684 const Elf64_Sym *Sym = elfLookup(Elf, Base, SectionHash, Symname);
1685 if (!Sym) {
1686 return 1;
1687 }
1688
1689 if (Sym->st_size > UINT32_MAX) {
1690 return 1;
1691 }
1692
1693 if (Sym->st_shndx == SHN_UNDEF) {
1694 return 1;
1695 }
1696
1697 Elf_Scn *Section = elf_getscn(Elf, Sym->st_shndx);
1698 if (!Section) {
1699 return 1;
1700 }
1701
1702 Elf64_Shdr *Header = elf64_getshdr(Section);
1703 if (!Header) {
1704 return 1;
1705 }
1706
1707 Res->Addr = Sym->st_value + Base;
1708 Res->Size = static_cast<uint32_t>(Sym->st_size);
1709 Res->ShType = Header->sh_type;
1710 return 0;
1711 }
1712
1713 int getSymbolInfoWithoutLoading(char *Base, size_t ImgSize, const char *Symname,
1714 SymbolInfo *Res) {
1715 Elf *Elf = elf_memory(Base, ImgSize);
1716 if (Elf) {
1717 int Rc = getSymbolInfoWithoutLoading(Elf, Base, Symname, Res);
1718 elf_end(Elf);
1719 return Rc;
1720 }
1721 return 1;
1722 }
1723
1724 hsa_status_t interopGetSymbolInfo(char *Base, size_t ImgSize,
1725 const char *SymName, void **VarAddr,
1726 uint32_t *VarSize) {
1727 SymbolInfo SI;
1728 int Rc = getSymbolInfoWithoutLoading(Base, ImgSize, SymName, &SI);
1729 if (Rc == 0) {
1730 *VarAddr = SI.Addr;
1731 *VarSize = SI.Size;
1732 return HSA_STATUS_SUCCESS;
1733 }
1734 return HSA_STATUS_ERROR;
1735 }
1736
1737 template <typename C>
1738 hsa_status_t moduleRegisterFromMemoryToPlace(
1739 std::map<std::string, atl_kernel_info_t> &KernelInfoTable,
1740 std::map<std::string, atl_symbol_info_t> &SymbolInfoTable,
1741 void *ModuleBytes, size_t ModuleSize, int DeviceId, C Cb,
1742 std::vector<hsa_executable_t> &HSAExecutables) {
1743 auto L = [](void *Data, size_t Size, void *CbState) -> hsa_status_t {
1744 C *Unwrapped = static_cast<C *>(CbState);
1745 return (*Unwrapped)(Data, Size);
1746 };
1747 return core::RegisterModuleFromMemory(
1748 KernelInfoTable, SymbolInfoTable, ModuleBytes, ModuleSize,
1749 DeviceInfo().HSAAgents[DeviceId], L, static_cast<void *>(&Cb),
1750 HSAExecutables);
1751 }
1752
1753 uint64_t getDeviceStateBytes(char *ImageStart, size_t ImgSize) {
1754 uint64_t DeviceStateBytes = 0;
1755 {
1756 // If this is the deviceRTL, get the state variable size
1757 SymbolInfo SizeSi;
1758 int Rc = getSymbolInfoWithoutLoading(
1759 ImageStart, ImgSize, "omptarget_nvptx_device_State_size", &SizeSi);
1760
1761 if (Rc == 0) {
1762 if (SizeSi.Size != sizeof(uint64_t)) {
1763 DP("Found device_State_size variable with wrong size\n");
1764 return 0;
1765 }
1766
1767 // Read number of bytes directly from the elf
1768 memcpy(&DeviceStateBytes, SizeSi.Addr, sizeof(uint64_t));
1769 }
1770 }
1771 return DeviceStateBytes;
1772 }
1773
1774 struct DeviceEnvironment {
1775   // Initializes a DeviceEnvironmentTy in the deviceRTL.
1776   // Patches around differences in the deviceRTL between trunk, aomp and
1777   // rocmcc. Over time these differences will tend to zero and this class
1778   // can be simplified.
1779   // The symbol may be in .data or .bss, and may be missing fields. TODO:
1780   // review aomp/trunk/rocm and simplify the following.
1781
1782 // The symbol may also have been deadstripped because the device side
1783 // accessors were unused.
1784
1785 // If the symbol is in .data (aomp, rocm) it can be written directly.
1786 // If it is in .bss, we must wait for it to be allocated space on the
1787 // gpu (trunk) and initialize after loading.
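// In other words (descriptive sketch of the class below): a .data symbol is
// patched directly in the image bytes via beforeLoading(), while a .bss
// symbol is written after loading via afterLoading() and
// freesignalpoolMemcpyH2D.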
1788   const char *sym() { return "omptarget_device_environment"; }
1789
1790 DeviceEnvironmentTy HostDeviceEnv;
1791 SymbolInfo SI;
1792 bool Valid = false;
1793
1794 __tgt_device_image *Image;
1795 const size_t ImgSize;
1796
1797   DeviceEnvironment(int DeviceId, int NumberDevices, int DynamicMemSize,
1798 __tgt_device_image *Image, const size_t ImgSize)
1799 : Image(Image), ImgSize(ImgSize) {
1800
1801 HostDeviceEnv.NumDevices = NumberDevices;
1802 HostDeviceEnv.DeviceNum = DeviceId;
1803 HostDeviceEnv.DebugKind = 0;
1804 HostDeviceEnv.DynamicMemSize = DynamicMemSize;
1805 if (char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG"))
1806 HostDeviceEnv.DebugKind = std::stoi(EnvStr);
1807
1808 int Rc = getSymbolInfoWithoutLoading((char *)Image->ImageStart, ImgSize,
1809 sym(), &SI);
1810 if (Rc != 0) {
1811 DP("Finding global device environment '%s' - symbol missing.\n", sym());
1812 return;
1813 }
1814
1815 if (SI.Size > sizeof(HostDeviceEnv)) {
1816 DP("Symbol '%s' has size %u, expected at most %zu.\n", sym(), SI.Size,
1817 sizeof(HostDeviceEnv));
1818 return;
1819 }
1820
1821 Valid = true;
1822 }
1823
1824   bool inImage() { return SI.ShType != SHT_NOBITS; }
1825
1826   hsa_status_t beforeLoading(void *Data, size_t Size) {
1827 if (Valid) {
1828 if (inImage()) {
1829 DP("Setting global device environment before load (%u bytes)\n",
1830 SI.Size);
1831 uint64_t Offset = (char *)SI.Addr - (char *)Image->ImageStart;
1832 void *Pos = (char *)Data + Offset;
1833 memcpy(Pos, &HostDeviceEnv, SI.Size);
1834 }
1835 }
1836 return HSA_STATUS_SUCCESS;
1837 }
1838
1839   hsa_status_t afterLoading() {
1840 if (Valid) {
1841 if (!inImage()) {
1842 DP("Setting global device environment after load (%u bytes)\n",
1843 SI.Size);
1844 int DeviceId = HostDeviceEnv.DeviceNum;
1845 auto &SymbolInfo = DeviceInfo().SymbolInfoTable[DeviceId];
1846 void *StatePtr;
1847 uint32_t StatePtrSize;
1848 hsa_status_t Err = interop_hsa_get_symbol_info(
1849 SymbolInfo, DeviceId, sym(), &StatePtr, &StatePtrSize);
1850 if (Err != HSA_STATUS_SUCCESS) {
1851 DP("failed to find %s in loaded image\n", sym());
1852 return Err;
1853 }
1854
1855 if (StatePtrSize != SI.Size) {
1856 DP("Symbol had size %u before loading, %u after\n", StatePtrSize,
1857 SI.Size);
1858 return HSA_STATUS_ERROR;
1859 }
1860
1861 return DeviceInfo().freesignalpoolMemcpyH2D(StatePtr, &HostDeviceEnv,
1862 StatePtrSize, DeviceId);
1863 }
1864 }
1865 return HSA_STATUS_SUCCESS;
1866 }
1867 };
1868
1869 hsa_status_t implCalloc(void **RetPtr, size_t Size, int DeviceId) {
1870 uint64_t Rounded = 4 * ((Size + 3) / 4);
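// e.g. Size == 10 rounds up to Rounded == 12, so the word-granular
// hsa_amd_memory_fill below covers the entire allocation.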
1871 void *Ptr;
1872 hsa_amd_memory_pool_t MemoryPool = DeviceInfo().getDeviceMemoryPool(DeviceId);
1873 hsa_status_t Err = hsa_amd_memory_pool_allocate(MemoryPool, Rounded, 0, &Ptr);
1874 if (Err != HSA_STATUS_SUCCESS) {
1875 return Err;
1876 }
1877
1878 hsa_status_t Rc = hsa_amd_memory_fill(Ptr, 0, Rounded / 4);
1879 if (Rc != HSA_STATUS_SUCCESS) {
1880 DP("zero fill device_state failed with %u\n", Rc);
1881 core::Runtime::Memfree(Ptr);
1882 return HSA_STATUS_ERROR;
1883 }
1884
1885 *RetPtr = Ptr;
1886 return HSA_STATUS_SUCCESS;
1887 }
1888
1889 bool imageContainsSymbol(void *Data, size_t Size, const char *Sym) {
1890 SymbolInfo SI;
1891 int Rc = getSymbolInfoWithoutLoading((char *)Data, Size, Sym, &SI);
1892 return (Rc == 0) && (SI.Addr != nullptr);
1893 }
1894
1895 } // namespace
1896
1897 namespace core {
1898 hsa_status_t allow_access_to_all_gpu_agents(void *Ptr) {
1899 return hsa_amd_agents_allow_access(DeviceInfo().HSAAgents.size(),
1900 &DeviceInfo().HSAAgents[0], NULL, Ptr);
1901 }
1902 } // namespace core
1903
1904 static hsa_status_t GetIsaInfo(hsa_isa_t isa, void *data) {
1905 hsa_status_t err;
1906 uint32_t name_len;
1907 err = hsa_isa_get_info_alt(isa, HSA_ISA_INFO_NAME_LENGTH, &name_len);
1908 if (err != HSA_STATUS_SUCCESS) {
1909 DP("Error getting ISA info length\n");
1910 return err;
1911 }
1912
1913 char TargetID[name_len];
1914 err = hsa_isa_get_info_alt(isa, HSA_ISA_INFO_NAME, TargetID);
1915 if (err != HSA_STATUS_SUCCESS) {
1916 DP("Error getting ISA info name\n");
1917 return err;
1918 }
1919
1920 auto TripleTargetID = llvm::StringRef(TargetID);
1921 if (TripleTargetID.consume_front("amdgcn-amd-amdhsa")) {
1922 DeviceInfo().TargetID.push_back(TripleTargetID.ltrim('-').str());
1923 }
1924 return HSA_STATUS_SUCCESS;
1925 }
1926
1927 /// Parse a TargetID to get processor arch and feature map.
1928 /// Returns processor subarch.
1929 /// Returns TargetID features in \p FeatureMap argument.
1930 /// If the \p TargetID contains feature+, map it to true in \p FeatureMap.
1931 /// If the \p TargetID contains feature-, map it to false in \p FeatureMap.
1932 /// If the \p TargetID does not mention a feature (the default), do not map it.
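/// For example (illustrative): parseTargetID("gfx90a:sramecc+:xnack-", Map)
/// returns "gfx90a" and maps "sramecc" to true and "xnack" to false.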
1933 StringRef parseTargetID(StringRef TargetID, StringMap<bool> &FeatureMap) {
1934 if (TargetID.empty())
1935 return llvm::StringRef();
1936
1937 auto ArchFeature = TargetID.split(":");
1938 auto Arch = ArchFeature.first;
1939 auto Features = ArchFeature.second;
1940 if (Features.empty())
1941 return Arch;
1942
1943 if (Features.contains("sramecc+")) {
1944 FeatureMap.insert(std::pair<std::string, bool>("sramecc", true));
1945 } else if (Features.contains("sramecc-")) {
1946 FeatureMap.insert(std::pair<std::string, bool>("sramecc", false));
1947 }
1948 if (Features.contains("xnack+")) {
1949 FeatureMap.insert(std::pair<std::string, bool>("xnack", true));
1950 } else if (Features.contains("xnack-")) {
1951 FeatureMap.insert(std::pair<std::string, bool>("xnack", false));
1952 }
1953
1954 return Arch;
1955 }
1956
1957 /// Checks if an image \p ImgInfo is compatible with the current
1958 /// system's environment \p EnvInfo.
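/// Illustrative cases: an image "gfx906" (no features, i.e. ANY) is
/// compatible with an environment "gfx906:xnack+", whereas an image
/// "gfx906:xnack+" is incompatible with both "gfx906" (fewer features in the
/// environment) and "gfx906:xnack-" (conflicting feature sign).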
1959 bool IsImageCompatibleWithEnv(const char *ImgInfo, std::string EnvInfo) {
1960 llvm::StringRef ImgTID(ImgInfo), EnvTID(EnvInfo);
1961
1962 // Compatible in case of exact match
1963 if (ImgTID == EnvTID) {
1964 DP("Compatible: Exact match \t[Image: %s]\t:\t[Environment: %s]\n",
1965 ImgTID.data(), EnvTID.data());
1966 return true;
1967 }
1968
1969 // Incompatible if Archs mismatch.
1970 StringMap<bool> ImgMap, EnvMap;
1971 StringRef ImgArch = parseTargetID(ImgTID, ImgMap);
1972 StringRef EnvArch = parseTargetID(EnvTID, EnvMap);
1973
1974 // Both EnvArch and ImgArch can't be empty here.
1975 if (EnvArch.empty() || ImgArch.empty() || !ImgArch.contains(EnvArch)) {
1976 DP("Incompatible: Processor mismatch \t[Image: %s]\t:\t[Environment: %s]\n",
1977 ImgTID.data(), EnvTID.data());
1978 return false;
1979 }
1980
1981 // Incompatible if image has more features than the environment, irrespective
1982 // of type or sign of features.
1983 if (ImgMap.size() > EnvMap.size()) {
1984 DP("Incompatible: Image has more features than the environment \t[Image: "
1985 "%s]\t:\t[Environment: %s]\n",
1986 ImgTID.data(), EnvTID.data());
1987 return false;
1988 }
1989
1990 // Compatible if each target feature specified by the environment is
1991 // compatible with target feature of the image. The target feature is
1992   // compatible if the image does not specify it (meaning Any), or if it
1993 // specifies it with the same value (meaning On or Off).
1994 for (const auto &ImgFeature : ImgMap) {
1995 auto EnvFeature = EnvMap.find(ImgFeature.first());
1996 if (EnvFeature == EnvMap.end()) {
1997 DP("Incompatible: Value of Image's non-ANY feature is not matching with "
1998 "the Environment feature's ANY value \t[Image: %s]\t:\t[Environment: "
1999 "%s]\n",
2000 ImgTID.data(), EnvTID.data());
2001 return false;
2002 } else if (EnvFeature->first() == ImgFeature.first() &&
2003 EnvFeature->second != ImgFeature.second) {
2004 DP("Incompatible: Value of Image's non-ANY feature is not matching with "
2005 "the Environment feature's non-ANY value \t[Image: "
2006 "%s]\t:\t[Environment: %s]\n",
2007 ImgTID.data(), EnvTID.data());
2008 return false;
2009 }
2010 }
2011
2012 // Image is compatible if all features of Environment are:
2013 // - either, present in the Image's features map with the same sign,
2014 // - or, the feature is missing from Image's features map i.e. it is
2015 // set to ANY
2016 DP("Compatible: Target IDs are compatible \t[Image: %s]\t:\t[Environment: "
2017 "%s]\n",
2018 ImgTID.data(), EnvTID.data());
2019 return true;
2020 }
2021
2022 extern "C" {
2023 int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image) {
2024 return elfMachineIdIsAmdgcn(Image);
2025 }
2026
2027 int32_t __tgt_rtl_is_valid_binary_info(__tgt_device_image *image,
2028 __tgt_image_info *info) {
2029 if (!__tgt_rtl_is_valid_binary(image))
2030 return false;
2031
2032 // A subarchitecture was not specified. Assume it is compatible.
2033 if (!info->Arch)
2034 return true;
2035
2036 int32_t NumberOfDevices = __tgt_rtl_number_of_devices();
2037
2038 for (int32_t DeviceId = 0; DeviceId < NumberOfDevices; ++DeviceId) {
2039 __tgt_rtl_init_device(DeviceId);
2040 hsa_agent_t agent = DeviceInfo().HSAAgents[DeviceId];
2041 hsa_status_t err = hsa_agent_iterate_isas(agent, GetIsaInfo, &DeviceId);
2042 if (err != HSA_STATUS_SUCCESS) {
2043 DP("Error iterating ISAs\n");
2044 return false;
2045 }
2046 if (!IsImageCompatibleWithEnv(info->Arch, DeviceInfo().TargetID[DeviceId]))
2047 return false;
2048 }
2049 DP("Image has Target ID compatible with the current environment: %s\n",
2050 info->Arch);
2051 return true;
2052 }
2053
2054 int32_t __tgt_rtl_init_plugin() { return OFFLOAD_SUCCESS; }
2055 int32_t __tgt_rtl_deinit_plugin() { return OFFLOAD_SUCCESS; }
2056
2057 int __tgt_rtl_number_of_devices() {
2058 // If the construction failed, no methods are safe to call
2059 if (DeviceInfo().ConstructionSucceeded) {
2060 return DeviceInfo().NumberOfDevices;
2061 }
2062 DP("AMDGPU plugin construction failed. Zero devices available\n");
2063 return 0;
2064 }
2065
2066 int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
2067 DP("Init requires flags to %ld\n", RequiresFlags);
2068 DeviceInfo().RequiresFlags = RequiresFlags;
2069 return RequiresFlags;
2070 }
2071
2072 int32_t __tgt_rtl_init_device(int DeviceId) {
2073 hsa_status_t Err = hsa_init();
2074 if (Err != HSA_STATUS_SUCCESS) {
2075 DP("HSA Initialization Failed.\n");
2076 return HSA_STATUS_ERROR;
2077 }
2078 // this is per device id init
2079 DP("Initialize the device id: %d\n", DeviceId);
2080
2081 hsa_agent_t Agent = DeviceInfo().HSAAgents[DeviceId];
2082
2083 // Get number of Compute Unit
2084 uint32_t ComputeUnits = 0;
2085 Err = hsa_agent_get_info(
2086 Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT,
2087 &ComputeUnits);
2088 if (Err != HSA_STATUS_SUCCESS) {
2089 DeviceInfo().ComputeUnits[DeviceId] = 1;
2090 DP("Error getting compute units : settiing to 1\n");
2091 } else {
2092 DeviceInfo().ComputeUnits[DeviceId] = ComputeUnits;
2093 DP("Using %d compute unis per grid\n", DeviceInfo().ComputeUnits[DeviceId]);
2094 }
2095
2096 char GetInfoName[64]; // 64 max size returned by get info
2097 Err = hsa_agent_get_info(Agent, (hsa_agent_info_t)HSA_AGENT_INFO_NAME,
2098 (void *)GetInfoName);
2099 if (Err)
2100 DeviceInfo().GPUName[DeviceId] = "--unknown gpu--";
2101 else {
2102 DeviceInfo().GPUName[DeviceId] = GetInfoName;
2103 }
2104
2105 if (print_kernel_trace & STARTUP_DETAILS)
2106 DP("Device#%-2d CU's: %2d %s\n", DeviceId,
2107 DeviceInfo().ComputeUnits[DeviceId], DeviceInfo().GPUName[DeviceId].c_str());
2108
2109 // Query attributes to determine number of threads/block and blocks/grid.
2110 uint16_t WorkgroupMaxDim[3];
2111 Err = hsa_agent_get_info(Agent, HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
2112 &WorkgroupMaxDim);
2113 if (Err != HSA_STATUS_SUCCESS) {
2114 DeviceInfo().GroupsPerDevice[DeviceId] = RTLDeviceInfoTy::DefaultNumTeams;
2115 DP("Error getting grid dims: num groups : %d\n",
2116 RTLDeviceInfoTy::DefaultNumTeams);
2117 } else if (WorkgroupMaxDim[0] <= RTLDeviceInfoTy::HardTeamLimit) {
2118 DeviceInfo().GroupsPerDevice[DeviceId] = WorkgroupMaxDim[0];
2119 DP("Using %d ROCm blocks per grid\n", DeviceInfo().GroupsPerDevice[DeviceId]);
2120 } else {
2121 DeviceInfo().GroupsPerDevice[DeviceId] = RTLDeviceInfoTy::HardTeamLimit;
2122 DP("Max ROCm blocks per grid %d exceeds the hard team limit %d, capping "
2123 "at the hard limit\n",
2124 WorkgroupMaxDim[0], RTLDeviceInfoTy::HardTeamLimit);
2125 }
2126
2127 // Get thread limit
2128 hsa_dim3_t GridMaxDim;
2129 Err = hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_DIM, &GridMaxDim);
2130 if (Err == HSA_STATUS_SUCCESS) {
2131 DeviceInfo().ThreadsPerGroup[DeviceId] =
2132 reinterpret_cast<uint32_t *>(&GridMaxDim)[0] /
2133 DeviceInfo().GroupsPerDevice[DeviceId];
2134
2135 if (DeviceInfo().ThreadsPerGroup[DeviceId] == 0) {
2136 DeviceInfo().ThreadsPerGroup[DeviceId] = RTLDeviceInfoTy::MaxWgSize;
2137 DP("Default thread limit: %d\n", RTLDeviceInfoTy::MaxWgSize);
2138 } else if (enforceUpperBound(&DeviceInfo().ThreadsPerGroup[DeviceId],
2139 RTLDeviceInfoTy::MaxWgSize)) {
2140 DP("Capped thread limit: %d\n", RTLDeviceInfoTy::MaxWgSize);
2141 } else {
2142 DP("Using ROCm Queried thread limit: %d\n",
2143 DeviceInfo().ThreadsPerGroup[DeviceId]);
2144 }
2145 } else {
2146 DeviceInfo().ThreadsPerGroup[DeviceId] = RTLDeviceInfoTy::MaxWgSize;
2147 DP("Error getting max block dimension, use default:%d \n",
2148 RTLDeviceInfoTy::MaxWgSize);
2149 }
2150
2151 // Get wavefront size
2152 uint32_t WavefrontSize = 0;
2153 Err =
2154 hsa_agent_get_info(Agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &WavefrontSize);
2155 if (Err == HSA_STATUS_SUCCESS) {
2156 DP("Queried wavefront size: %d\n", WavefrontSize);
2157 DeviceInfo().WarpSize[DeviceId] = WavefrontSize;
2158 } else {
2159 // TODO: Burn the wavefront size into the code object
2160 DP("Warning: Unknown wavefront size, assuming 64\n");
2161 DeviceInfo().WarpSize[DeviceId] = 64;
2162 }
2163
2164 // Adjust teams to the env variables
2165
2166 if (DeviceInfo().Env.TeamLimit > 0 &&
2167 (enforceUpperBound(&DeviceInfo().GroupsPerDevice[DeviceId],
2168 DeviceInfo().Env.TeamLimit))) {
2169 DP("Capping max groups per device to OMP_TEAM_LIMIT=%d\n",
2170 DeviceInfo().Env.TeamLimit);
2171 }
2172
2173 // Set default number of teams
2174 if (DeviceInfo().Env.NumTeams > 0) {
2175 DeviceInfo().NumTeams[DeviceId] = DeviceInfo().Env.NumTeams;
2176 DP("Default number of teams set according to environment %d\n",
2177 DeviceInfo().Env.NumTeams);
2178 } else {
2179 char *TeamsPerCUEnvStr = getenv("OMP_TARGET_TEAMS_PER_PROC");
2180 int TeamsPerCU = DefaultTeamsPerCU;
2181 if (TeamsPerCUEnvStr) {
2182 TeamsPerCU = std::stoi(TeamsPerCUEnvStr);
2183 }
2184
2185 DeviceInfo().NumTeams[DeviceId] =
2186 TeamsPerCU * DeviceInfo().ComputeUnits[DeviceId];
2187 DP("Default number of teams = %d * number of compute units %d\n",
2188 TeamsPerCU, DeviceInfo().ComputeUnits[DeviceId]);
2189 }
2190
2191 if (enforceUpperBound(&DeviceInfo().NumTeams[DeviceId],
2192 DeviceInfo().GroupsPerDevice[DeviceId])) {
2193 DP("Default number of teams exceeds device limit, capping at %d\n",
2194 DeviceInfo().GroupsPerDevice[DeviceId]);
2195 }
2196
2197 // Adjust threads to the env variables
2198 if (DeviceInfo().Env.TeamThreadLimit > 0 &&
2199 (enforceUpperBound(&DeviceInfo().NumThreads[DeviceId],
2200 DeviceInfo().Env.TeamThreadLimit))) {
2201 DP("Capping max number of threads to OMP_TEAMS_THREAD_LIMIT=%d\n",
2202 DeviceInfo().Env.TeamThreadLimit);
2203 }
2204
2205 // Set default number of threads
2206 DeviceInfo().NumThreads[DeviceId] = RTLDeviceInfoTy::DefaultWgSize;
2207 DP("Default number of threads set according to library's default %d\n",
2208 RTLDeviceInfoTy::DefaultWgSize);
2209 if (enforceUpperBound(&DeviceInfo().NumThreads[DeviceId],
2210 DeviceInfo().ThreadsPerGroup[DeviceId])) {
2211 DP("Default number of threads exceeds device limit, capping at %d\n",
2212 DeviceInfo().ThreadsPerGroup[DeviceId]);
2213 }
2214
2215 DP("Device %d: default limit for groupsPerDevice %d & threadsPerGroup %d\n",
2216 DeviceId, DeviceInfo().GroupsPerDevice[DeviceId],
2217 DeviceInfo().ThreadsPerGroup[DeviceId]);
2218
2219 DP("Device %d: wavefront size %d, total threads %d x %d = %d\n", DeviceId,
2220 DeviceInfo().WarpSize[DeviceId], DeviceInfo().ThreadsPerGroup[DeviceId],
2221 DeviceInfo().GroupsPerDevice[DeviceId],
2222 DeviceInfo().GroupsPerDevice[DeviceId] *
2223 DeviceInfo().ThreadsPerGroup[DeviceId]);
2224
2225 return OFFLOAD_SUCCESS;
2226 }
2227
2228 static __tgt_target_table *
2229 __tgt_rtl_load_binary_locked(int32_t DeviceId, __tgt_device_image *Image);
2230
2231 __tgt_target_table *__tgt_rtl_load_binary(int32_t DeviceId,
2232 __tgt_device_image *Image) {
2233 DeviceInfo().LoadRunLock.lock();
2234 __tgt_target_table *Res = __tgt_rtl_load_binary_locked(DeviceId, Image);
2235 DeviceInfo().LoadRunLock.unlock();
2236 return Res;
2237 }
2238
2239 __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId,
2240 __tgt_device_image *Image) {
2241 // This function loads the device image onto gpu[DeviceId] and does other
2242 // per-image initialization work. Specifically:
2243 //
2244 // - Initialize an DeviceEnvironmentTy instance embedded in the
2245 // image at the symbol "omptarget_device_environment"
2246 // Fields DebugKind, DeviceNum, NumDevices. Used by the deviceRTL.
2247 //
2248 // - Allocate a large array per-gpu (could be moved to init_device)
2249 // - Read a uint64_t at symbol omptarget_nvptx_device_State_size
2250 // - Allocate at least that many bytes of gpu memory
2251 // - Zero initialize it
2252 // - Write the pointer to the symbol omptarget_nvptx_device_State
2253 //
2254 // - Pulls some per-kernel information together from various sources and
2255 // records it in the KernelsList for quicker access later
2256 //
2257 // The initialization can be done before or after loading the image onto the
2258 // gpu. This function presently does a mixture. Using the hsa api to get/set
2259 // the information is simpler to implement, in exchange for more complicated
2260 // runtime behaviour. E.g. launching a kernel or using dma to get eight bytes
2261 // back from the gpu vs a hashtable lookup on the host.
2262
2263 const size_t ImgSize = (char *)Image->ImageEnd - (char *)Image->ImageStart;
2264
2265 DeviceInfo().clearOffloadEntriesTable(DeviceId);
2266
2267 // We do not need to set the ELF version because the caller of this function
2268 // had to do that to decide the right runtime to use
2269
2270 if (!elfMachineIdIsAmdgcn(Image))
2271 return NULL;
2272
2273 {
2274 auto Env = DeviceEnvironment(DeviceId, DeviceInfo().NumberOfDevices,
2275 DeviceInfo().Env.DynamicMemSize, Image, ImgSize);
2276
2277 auto &KernelInfo = DeviceInfo().KernelInfoTable[DeviceId];
2278 auto &SymbolInfo = DeviceInfo().SymbolInfoTable[DeviceId];
2279 hsa_status_t Err = moduleRegisterFromMemoryToPlace(
2280 KernelInfo, SymbolInfo, (void *)Image->ImageStart, ImgSize, DeviceId,
2281 [&](void *Data, size_t Size) {
2282 if (imageContainsSymbol(Data, Size, "needs_hostcall_buffer")) {
2283 __atomic_store_n(&DeviceInfo().HostcallRequired, true,
2284 __ATOMIC_RELEASE);
2285 }
2286 return Env.beforeLoading(Data, Size);
2287 },
2288 DeviceInfo().HSAExecutables);
2289
2290 check("Module registering", Err);
2291 if (Err != HSA_STATUS_SUCCESS) {
2292 const char *DeviceName = DeviceInfo().GPUName[DeviceId].c_str();
2293 const char *ElfName = get_elf_mach_gfx_name(elfEFlags(Image));
2294
2295 if (strcmp(DeviceName, ElfName) != 0) {
2296 DP("Possible gpu arch mismatch: device:%s, image:%s please check"
2297 " compiler flag: -march=<gpu>\n",
2298 DeviceName, ElfName);
2299 } else {
2300 DP("Error loading image onto GPU: %s\n", get_error_string(Err));
2301 }
2302
2303 return NULL;
2304 }
2305
2306 Err = Env.afterLoading();
2307 if (Err != HSA_STATUS_SUCCESS) {
2308 return NULL;
2309 }
2310 }
2311
2312 DP("AMDGPU module successfully loaded!\n");
2313
2314 {
2315     // The device_State array is either a large array in .bss or a void* that
2316     // needs to be assigned a pointer to an array of device_state_bytes bytes.
2317 // If absent, it has been deadstripped and needs no setup.
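// Sketch of the two cases handled below: if StatePtrSize == sizeof(void*),
// allocate DeviceStateBytes with implCalloc and write the resulting device
// pointer into StatePtr; if StatePtrSize > sizeof(void*), the array itself
// lives in .bss and needs no further setup.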
2318
2319 void *StatePtr;
2320 uint32_t StatePtrSize;
2321 auto &SymbolInfoMap = DeviceInfo().SymbolInfoTable[DeviceId];
2322 hsa_status_t Err = interop_hsa_get_symbol_info(
2323 SymbolInfoMap, DeviceId, "omptarget_nvptx_device_State", &StatePtr,
2324 &StatePtrSize);
2325
2326 if (Err != HSA_STATUS_SUCCESS) {
2327 DP("No device_state symbol found, skipping initialization\n");
2328 } else {
2329 if (StatePtrSize < sizeof(void *)) {
2330 DP("unexpected size of state_ptr %u != %zu\n", StatePtrSize,
2331 sizeof(void *));
2332 return NULL;
2333 }
2334
2335 // if it's larger than a void*, assume it's a bss array and no further
2336 // initialization is required. Only try to set up a pointer for
2337 // sizeof(void*)
2338 if (StatePtrSize == sizeof(void *)) {
2339 uint64_t DeviceStateBytes =
2340 getDeviceStateBytes((char *)Image->ImageStart, ImgSize);
2341 if (DeviceStateBytes == 0) {
2342 DP("Can't initialize device_State, missing size information\n");
2343 return NULL;
2344 }
2345
2346 auto &DSS = DeviceInfo().DeviceStateStore[DeviceId];
2347 if (DSS.first.get() == nullptr) {
2348 assert(DSS.second == 0);
2349 void *Ptr = NULL;
2350 hsa_status_t Err = implCalloc(&Ptr, DeviceStateBytes, DeviceId);
2351 if (Err != HSA_STATUS_SUCCESS) {
2352 DP("Failed to allocate device_state array\n");
2353 return NULL;
2354 }
2355 DSS = {
2356 std::unique_ptr<void, RTLDeviceInfoTy::ImplFreePtrDeletor>{Ptr},
2357 DeviceStateBytes,
2358 };
2359 }
2360
2361 void *Ptr = DSS.first.get();
2362 if (DeviceStateBytes != DSS.second) {
2363 DP("Inconsistent sizes of device_State unsupported\n");
2364 return NULL;
2365 }
2366
2367 // write ptr to device memory so it can be used by later kernels
2368 Err = DeviceInfo().freesignalpoolMemcpyH2D(StatePtr, &Ptr,
2369 sizeof(void *), DeviceId);
2370 if (Err != HSA_STATUS_SUCCESS) {
2371 DP("memcpy install of state_ptr failed\n");
2372 return NULL;
2373 }
2374 }
2375 }
2376 }
2377
2378   // Here, we take advantage of the data that is appended after img_end to get
2379   // the names of the symbols we need to load. This data consists of the host
2380   // entries begin and end as well as the target name (see the offloading
2381   // linker script creation in the clang compiler).
2382
2383   // Find the symbols in the module by name. The name can be obtained by
2384   // concatenating the host entry name with the target name.
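// For example, a host entry named "foo" leads to lookups of device symbols
// such as "foo_exec_mode", "foo_wg_size" and "foo_kern_desc" further below.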
2385
2386 __tgt_offload_entry *HostBegin = Image->EntriesBegin;
2387 __tgt_offload_entry *HostEnd = Image->EntriesEnd;
2388
2389 for (__tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) {
2390
2391 if (!E->addr) {
2392 // The host should have always something in the address to
2393 // uniquely identify the target region.
2394 DP("Analyzing host entry '<null>' (size = %lld)...\n",
2395 (unsigned long long)E->size);
2396 return NULL;
2397 }
2398
2399 if (E->size) {
2400 __tgt_offload_entry Entry = *E;
2401
2402 void *Varptr;
2403 uint32_t Varsize;
2404
2405 auto &SymbolInfoMap = DeviceInfo().SymbolInfoTable[DeviceId];
2406 hsa_status_t Err = interop_hsa_get_symbol_info(
2407 SymbolInfoMap, DeviceId, E->name, &Varptr, &Varsize);
2408
2409 if (Err != HSA_STATUS_SUCCESS) {
2410 // Inform the user what symbol prevented offloading
2411 DP("Loading global '%s' (Failed)\n", E->name);
2412 return NULL;
2413 }
2414
2415 if (Varsize != E->size) {
2416 DP("Loading global '%s' - size mismatch (%u != %lu)\n", E->name,
2417 Varsize, E->size);
2418 return NULL;
2419 }
2420
2421 DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
2422 DPxPTR(E - HostBegin), E->name, DPxPTR(Varptr));
2423 Entry.addr = (void *)Varptr;
2424
2425 DeviceInfo().addOffloadEntry(DeviceId, Entry);
2426
2427 if (DeviceInfo().RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
2428 E->flags & OMP_DECLARE_TARGET_LINK) {
2429 // If unified memory is present any target link variables
2430 // can access host addresses directly. There is no longer a
2431 // need for device copies.
2432 Err = DeviceInfo().freesignalpoolMemcpyH2D(Varptr, E->addr, sizeof(void *),
2433 DeviceId);
2434 if (Err != HSA_STATUS_SUCCESS)
2435 DP("Error when copying USM\n");
2436 DP("Copy linked variable host address (" DPxMOD ")"
2437 "to device address (" DPxMOD ")\n",
2438 DPxPTR(*((void **)E->addr)), DPxPTR(Varptr));
2439 }
2440
2441 continue;
2442 }
2443
2444 DP("to find the kernel name: %s size: %lu\n", E->name, strlen(E->name));
2445
2446     // Errors in kernarg_segment_size were previously treated as 0 (or as undefined).
2447 uint32_t KernargSegmentSize = 0;
2448 auto &KernelInfoMap = DeviceInfo().KernelInfoTable[DeviceId];
2449 hsa_status_t Err = HSA_STATUS_SUCCESS;
2450 if (!E->name) {
2451 Err = HSA_STATUS_ERROR;
2452 } else {
2453 std::string KernelStr = std::string(E->name);
2454 auto It = KernelInfoMap.find(KernelStr);
2455 if (It != KernelInfoMap.end()) {
2456 atl_kernel_info_t Info = It->second;
2457 KernargSegmentSize = Info.kernel_segment_size;
2458 } else {
2459 Err = HSA_STATUS_ERROR;
2460 }
2461 }
2462
2463     // Default value GENERIC (in case the symbol is missing from the code object).
2464 llvm::omp::OMPTgtExecModeFlags ExecModeVal =
2465 llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC;
2466
2467 // get flat group size if present, else Default_WG_Size
2468 int16_t WGSizeVal = RTLDeviceInfoTy::DefaultWgSize;
2469
2470 // get Kernel Descriptor if present.
2471     // Keep struct in sync with getTgtAttributeStructQTy in CGOpenMPRuntime.cpp
2472 struct KernDescValType {
2473 uint16_t Version;
2474 uint16_t TSize;
2475 uint16_t WGSize;
2476 };
2477 struct KernDescValType KernDescVal;
2478 std::string KernDescNameStr(E->name);
2479 KernDescNameStr += "_kern_desc";
2480 const char *KernDescName = KernDescNameStr.c_str();
2481
2482 void *KernDescPtr;
2483 uint32_t KernDescSize;
2484 void *CallStackAddr = nullptr;
2485 Err = interopGetSymbolInfo((char *)Image->ImageStart, ImgSize, KernDescName,
2486 &KernDescPtr, &KernDescSize);
2487
2488 if (Err == HSA_STATUS_SUCCESS) {
2489 if ((size_t)KernDescSize != sizeof(KernDescVal))
2490 DP("Loading global computation properties '%s' - size mismatch (%u != "
2491 "%lu)\n",
2492 KernDescName, KernDescSize, sizeof(KernDescVal));
2493
2494 memcpy(&KernDescVal, KernDescPtr, (size_t)KernDescSize);
2495
2496 // Check structure size against recorded size.
2497 if ((size_t)KernDescSize != KernDescVal.TSize)
2498 DP("KernDescVal size %lu does not match advertized size %d for '%s'\n",
2499 sizeof(KernDescVal), KernDescVal.TSize, KernDescName);
2500
2501 DP("After loading global for %s KernDesc \n", KernDescName);
2502 DP("KernDesc: Version: %d\n", KernDescVal.Version);
2503 DP("KernDesc: TSize: %d\n", KernDescVal.TSize);
2504 DP("KernDesc: WG_Size: %d\n", KernDescVal.WGSize);
2505
2506 if (KernDescVal.WGSize == 0) {
2507 KernDescVal.WGSize = RTLDeviceInfoTy::DefaultWgSize;
2508 DP("Setting KernDescVal.WG_Size to default %d\n", KernDescVal.WGSize);
2509 }
2510 WGSizeVal = KernDescVal.WGSize;
2511 DP("WGSizeVal %d\n", WGSizeVal);
2512 check("Loading KernDesc computation property", Err);
2513 } else {
2514 DP("Warning: Loading KernDesc '%s' - symbol not found, ", KernDescName);
2515
2516 // Flat group size
2517 std::string WGSizeNameStr(E->name);
2518 WGSizeNameStr += "_wg_size";
2519 const char *WGSizeName = WGSizeNameStr.c_str();
2520
2521 void *WGSizePtr;
2522 uint32_t WGSize;
2523 Err = interopGetSymbolInfo((char *)Image->ImageStart, ImgSize, WGSizeName,
2524 &WGSizePtr, &WGSize);
2525
2526 if (Err == HSA_STATUS_SUCCESS) {
2527 if ((size_t)WGSize != sizeof(int16_t)) {
2528 DP("Loading global computation properties '%s' - size mismatch (%u "
2529 "!= "
2530 "%lu)\n",
2531 WGSizeName, WGSize, sizeof(int16_t));
2532 return NULL;
2533 }
2534
2535 memcpy(&WGSizeVal, WGSizePtr, (size_t)WGSize);
2536
2537 DP("After loading global for %s WGSize = %d\n", WGSizeName, WGSizeVal);
2538
2539 if (WGSizeVal < RTLDeviceInfoTy::DefaultWgSize ||
2540 WGSizeVal > RTLDeviceInfoTy::MaxWgSize) {
2541 DP("Error wrong WGSize value specified in HSA code object file: "
2542 "%d\n",
2543 WGSizeVal);
2544 WGSizeVal = RTLDeviceInfoTy::DefaultWgSize;
2545 }
2546 } else {
2547 DP("Warning: Loading WGSize '%s' - symbol not found, "
2548 "using default value %d\n",
2549 WGSizeName, WGSizeVal);
2550 }
2551
2552 check("Loading WGSize computation property", Err);
2553 }
2554
2555 // Read execution mode from global in binary
2556 std::string ExecModeNameStr(E->name);
2557 ExecModeNameStr += "_exec_mode";
2558 const char *ExecModeName = ExecModeNameStr.c_str();
2559
2560 void *ExecModePtr;
2561 uint32_t VarSize;
2562 Err = interopGetSymbolInfo((char *)Image->ImageStart, ImgSize, ExecModeName,
2563 &ExecModePtr, &VarSize);
2564
2565 if (Err == HSA_STATUS_SUCCESS) {
2566 if ((size_t)VarSize != sizeof(llvm::omp::OMPTgtExecModeFlags)) {
2567 DP("Loading global computation properties '%s' - size mismatch(%u != "
2568 "%lu)\n",
2569 ExecModeName, VarSize, sizeof(llvm::omp::OMPTgtExecModeFlags));
2570 return NULL;
2571 }
2572
2573 memcpy(&ExecModeVal, ExecModePtr, (size_t)VarSize);
2574
2575 DP("After loading global for %s ExecMode = %d\n", ExecModeName,
2576 ExecModeVal);
2577
2578 if (ExecModeVal < 0 ||
2579 ExecModeVal > llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD) {
2580 DP("Error wrong exec_mode value specified in HSA code object file: "
2581 "%d\n",
2582 ExecModeVal);
2583 return NULL;
2584 }
2585 } else {
2586 DP("Loading global exec_mode '%s' - symbol missing, using default "
2587 "value "
2588 "GENERIC (1)\n",
2589 ExecModeName);
2590 }
2591 check("Loading computation property", Err);
2592
2593 KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, DeviceId,
2594 CallStackAddr, E->name, KernargSegmentSize,
2595 DeviceInfo().KernArgPool));
2596 __tgt_offload_entry Entry = *E;
2597 Entry.addr = (void *)&KernelsList.back();
2598 DeviceInfo().addOffloadEntry(DeviceId, Entry);
2599 DP("Entry point %ld maps to %s\n", E - HostBegin, E->name);
2600 }
2601
2602 return DeviceInfo().getOffloadEntriesTable(DeviceId);
2603 }
2604
2605 void *__tgt_rtl_data_alloc(int DeviceId, int64_t Size, void *, int32_t Kind) {
2606 void *Ptr = NULL;
2607 assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
2608
2609 if (Kind != TARGET_ALLOC_DEFAULT) {
2610 REPORT("Invalid target data allocation kind or requested allocator not "
2611 "implemented yet\n");
2612 return NULL;
2613 }
2614
2615 hsa_amd_memory_pool_t MemoryPool = DeviceInfo().getDeviceMemoryPool(DeviceId);
2616 hsa_status_t Err = hsa_amd_memory_pool_allocate(MemoryPool, Size, 0, &Ptr);
2617 DP("Tgt alloc data %ld bytes, (tgt:%016llx).\n", Size,
2618 (long long unsigned)(Elf64_Addr)Ptr);
2619 Ptr = (Err == HSA_STATUS_SUCCESS) ? Ptr : NULL;
2620 return Ptr;
2621 }
2622
2623 int32_t __tgt_rtl_data_submit(int DeviceId, void *TgtPtr, void *HstPtr,
2624 int64_t Size) {
2625 assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
2626 __tgt_async_info AsyncInfo;
2627 int32_t Rc = dataSubmit(DeviceId, TgtPtr, HstPtr, Size, &AsyncInfo);
2628 if (Rc != OFFLOAD_SUCCESS)
2629 return OFFLOAD_FAIL;
2630
2631 return __tgt_rtl_synchronize(DeviceId, &AsyncInfo);
2632 }
2633
2634 int32_t __tgt_rtl_data_submit_async(int DeviceId, void *TgtPtr, void *HstPtr,
2635 int64_t Size, __tgt_async_info *AsyncInfo) {
2636 assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
2637 if (AsyncInfo) {
2638 initAsyncInfo(AsyncInfo);
2639 return dataSubmit(DeviceId, TgtPtr, HstPtr, Size, AsyncInfo);
2640 }
2641 return __tgt_rtl_data_submit(DeviceId, TgtPtr, HstPtr, Size);
2642 }
2643
2644 int32_t __tgt_rtl_data_retrieve(int DeviceId, void *HstPtr, void *TgtPtr,
2645 int64_t Size) {
2646 assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
2647 __tgt_async_info AsyncInfo;
2648 int32_t Rc = dataRetrieve(DeviceId, HstPtr, TgtPtr, Size, &AsyncInfo);
2649 if (Rc != OFFLOAD_SUCCESS)
2650 return OFFLOAD_FAIL;
2651
2652 return __tgt_rtl_synchronize(DeviceId, &AsyncInfo);
2653 }
2654
2655 int32_t __tgt_rtl_data_retrieve_async(int DeviceId, void *HstPtr, void *TgtPtr,
2656 int64_t Size,
2657 __tgt_async_info *AsyncInfo) {
2658 assert(AsyncInfo && "AsyncInfo is nullptr");
2659 assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
2660 initAsyncInfo(AsyncInfo);
2661 return dataRetrieve(DeviceId, HstPtr, TgtPtr, Size, AsyncInfo);
2662 }
2663
2664 int32_t __tgt_rtl_data_delete(int DeviceId, void *TgtPtr) {
2665 assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
2666 hsa_status_t Err;
2667 DP("Tgt free data (tgt:%016llx).\n", (long long unsigned)(Elf64_Addr)TgtPtr);
2668 Err = core::Runtime::Memfree(TgtPtr);
2669 if (Err != HSA_STATUS_SUCCESS) {
2670 DP("Error when freeing CUDA memory\n");
2671 return OFFLOAD_FAIL;
2672 }
2673 return OFFLOAD_SUCCESS;
2674 }
2675
2676 int32_t __tgt_rtl_run_target_team_region(int32_t DeviceId, void *TgtEntryPtr,
2677 void **TgtArgs, ptrdiff_t *TgtOffsets,
2678 int32_t ArgNum, int32_t NumTeams,
2679 int32_t ThreadLimit,
2680 uint64_t LoopTripcount) {
2681
2682 DeviceInfo().LoadRunLock.lock_shared();
2683 int32_t Res = runRegionLocked(DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets,
2684 ArgNum, NumTeams, ThreadLimit, LoopTripcount);
2685
2686 DeviceInfo().LoadRunLock.unlock_shared();
2687 return Res;
2688 }
2689
2690 int32_t __tgt_rtl_run_target_region(int32_t DeviceId, void *TgtEntryPtr,
2691 void **TgtArgs, ptrdiff_t *TgtOffsets,
2692 int32_t ArgNum) {
2693   // Use a single team; ThreadLimit == 0 requests the default
2694   // number of threads.
2695 int32_t TeamNum = 1;
2696 int32_t ThreadLimit = 0; // use default
2697 return __tgt_rtl_run_target_team_region(DeviceId, TgtEntryPtr, TgtArgs,
2698 TgtOffsets, ArgNum, TeamNum,
2699 ThreadLimit, 0);
2700 }
2701
2702 int32_t __tgt_rtl_run_target_team_region_async(
2703 int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs, ptrdiff_t *TgtOffsets,
2704 int32_t ArgNum, int32_t NumTeams, int32_t ThreadLimit,
2705 uint64_t LoopTripcount, __tgt_async_info *AsyncInfo) {
2706 assert(AsyncInfo && "AsyncInfo is nullptr");
2707 initAsyncInfo(AsyncInfo);
2708
2709 DeviceInfo().LoadRunLock.lock_shared();
2710 int32_t Res = runRegionLocked(DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets,
2711 ArgNum, NumTeams, ThreadLimit, LoopTripcount);
2712
2713 DeviceInfo().LoadRunLock.unlock_shared();
2714 return Res;
2715 }
2716
2717 int32_t __tgt_rtl_run_target_region_async(int32_t DeviceId, void *TgtEntryPtr,
2718 void **TgtArgs, ptrdiff_t *TgtOffsets,
2719 int32_t ArgNum,
2720 __tgt_async_info *AsyncInfo) {
2721   // Use a single team; ThreadLimit == 0 requests the default
2722   // number of threads.
2723 int32_t TeamNum = 1;
2724 int32_t ThreadLimit = 0; // use default
2725 return __tgt_rtl_run_target_team_region_async(DeviceId, TgtEntryPtr, TgtArgs,
2726 TgtOffsets, ArgNum, TeamNum,
2727 ThreadLimit, 0, AsyncInfo);
2728 }
2729
2730 int32_t __tgt_rtl_synchronize(int32_t DeviceId, __tgt_async_info *AsyncInfo) {
2731 assert(AsyncInfo && "AsyncInfo is nullptr");
2732
2733 // Cuda asserts that AsyncInfo->Queue is non-null, but this invariant
2734 // is not ensured by devices.cpp for amdgcn
2735 // assert(AsyncInfo->Queue && "AsyncInfo->Queue is nullptr");
2736 if (AsyncInfo->Queue) {
2737 finiAsyncInfo(AsyncInfo);
2738 }
2739 return OFFLOAD_SUCCESS;
2740 }
2741
2742 void __tgt_rtl_print_device_info(int32_t DeviceId) {
2743 // TODO: Assertion to see if DeviceId is correct
2744 // NOTE: We don't need to set context for print device info.
2745
2746 DeviceInfo().printDeviceInfo(DeviceId, DeviceInfo().HSAAgents[DeviceId]);
2747 }
2748
2749 } // extern "C"
2750