//===--- amdgpu/src/rtl.cpp --------------------------------------- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // RTL for AMD hsa machine // //===----------------------------------------------------------------------===// #include #include #include #include #include #include #include #include #include #include #include #include #include #include "impl_runtime.h" #include "interop_hsa.h" #include "internal.h" #include "rt.h" #include "DeviceEnvironment.h" #include "get_elf_mach_gfx_name.h" #include "omptargetplugin.h" #include "print_tracing.h" #include "llvm/Frontend/OpenMP/OMPConstants.h" #include "llvm/Frontend/OpenMP/OMPGridValues.h" // hostrpc interface, FIXME: consider moving to its own include these are // statically linked into amdgpu/plugin if present from hostrpc_services.a, // linked as --whole-archive to override the weak symbols that are used to // implement a fallback for toolchains that do not yet have a hostrpc library. extern "C" { uint64_t hostrpc_assign_buffer(hsa_agent_t Agent, hsa_queue_t *ThisQ, uint32_t DeviceId); hsa_status_t hostrpc_init(); hsa_status_t hostrpc_terminate(); __attribute__((weak)) hsa_status_t hostrpc_init() { return HSA_STATUS_SUCCESS; } __attribute__((weak)) hsa_status_t hostrpc_terminate() { return HSA_STATUS_SUCCESS; } __attribute__((weak)) uint64_t hostrpc_assign_buffer(hsa_agent_t, hsa_queue_t *, uint32_t DeviceId) { DP("Warning: Attempting to assign hostrpc to device %u, but hostrpc library " "missing\n", DeviceId); return 0; } } // Heuristic parameters used for kernel launch // Number of teams per CU to allow scheduling flexibility static const unsigned DefaultTeamsPerCU = 4; int print_kernel_trace; #ifdef OMPTARGET_DEBUG #define check(msg, status) \ if (status != HSA_STATUS_SUCCESS) { \ DP(#msg " failed\n"); \ } else { \ DP(#msg " succeeded\n"); \ } #else #define check(msg, status) \ {} #endif #include "elf_common.h" namespace hsa { template hsa_status_t iterate_agents(C Cb) { auto L = [](hsa_agent_t Agent, void *Data) -> hsa_status_t { C *Unwrapped = static_cast(Data); return (*Unwrapped)(Agent); }; return hsa_iterate_agents(L, static_cast(&Cb)); } template hsa_status_t amd_agent_iterate_memory_pools(hsa_agent_t Agent, C Cb) { auto L = [](hsa_amd_memory_pool_t MemoryPool, void *Data) -> hsa_status_t { C *Unwrapped = static_cast(Data); return (*Unwrapped)(MemoryPool); }; return hsa_amd_agent_iterate_memory_pools(Agent, L, static_cast(&Cb)); } } // namespace hsa /// Keep entries table per device struct FuncOrGblEntryTy { __tgt_target_table Table; std::vector<__tgt_offload_entry> Entries; }; struct KernelArgPool { private: static pthread_mutex_t Mutex; public: uint32_t KernargSegmentSize; void *KernargRegion = nullptr; std::queue FreeKernargSegments; uint32_t kernargSizeIncludingImplicit() { return KernargSegmentSize + sizeof(impl_implicit_args_t); } ~KernelArgPool() { if (KernargRegion) { auto R = hsa_amd_memory_pool_free(KernargRegion); if (R != HSA_STATUS_SUCCESS) { DP("hsa_amd_memory_pool_free failed: %s\n", get_error_string(R)); } } } // Can't really copy or move a mutex KernelArgPool() = default; KernelArgPool(const KernelArgPool &) = delete; KernelArgPool(KernelArgPool &&) = delete; KernelArgPool(uint32_t KernargSegmentSize, hsa_amd_memory_pool_t &MemoryPool) : KernargSegmentSize(KernargSegmentSize) { // impl uses one pool per kernel for all gpus, with a fixed upper size // preserving that exact scheme here, including the queue hsa_status_t Err = hsa_amd_memory_pool_allocate( MemoryPool, kernargSizeIncludingImplicit() * MAX_NUM_KERNELS, 0, &KernargRegion); if (Err != HSA_STATUS_SUCCESS) { DP("hsa_amd_memory_pool_allocate failed: %s\n", get_error_string(Err)); KernargRegion = nullptr; // paranoid return; } Err = core::allow_access_to_all_gpu_agents(KernargRegion); if (Err != HSA_STATUS_SUCCESS) { DP("hsa allow_access_to_all_gpu_agents failed: %s\n", get_error_string(Err)); auto R = hsa_amd_memory_pool_free(KernargRegion); if (R != HSA_STATUS_SUCCESS) { // if free failed, can't do anything more to resolve it DP("hsa memory poll free failed: %s\n", get_error_string(Err)); } KernargRegion = nullptr; return; } for (int I = 0; I < MAX_NUM_KERNELS; I++) { FreeKernargSegments.push(I); } } void *allocate(uint64_t ArgNum) { assert((ArgNum * sizeof(void *)) == KernargSegmentSize); Lock L(&Mutex); void *Res = nullptr; if (!FreeKernargSegments.empty()) { int FreeIdx = FreeKernargSegments.front(); Res = static_cast(static_cast(KernargRegion) + (FreeIdx * kernargSizeIncludingImplicit())); assert(FreeIdx == pointerToIndex(Res)); FreeKernargSegments.pop(); } return Res; } void deallocate(void *Ptr) { Lock L(&Mutex); int Idx = pointerToIndex(Ptr); FreeKernargSegments.push(Idx); } private: int pointerToIndex(void *Ptr) { ptrdiff_t Bytes = static_cast(Ptr) - static_cast(KernargRegion); assert(Bytes >= 0); assert(Bytes % kernargSizeIncludingImplicit() == 0); return Bytes / kernargSizeIncludingImplicit(); } struct Lock { Lock(pthread_mutex_t *M) : M(M) { pthread_mutex_lock(M); } ~Lock() { pthread_mutex_unlock(M); } pthread_mutex_t *M; }; }; pthread_mutex_t KernelArgPool::Mutex = PTHREAD_MUTEX_INITIALIZER; std::unordered_map> KernelArgPoolMap; /// Use a single entity to encode a kernel and a set of flags struct KernelTy { llvm::omp::OMPTgtExecModeFlags ExecutionMode; int16_t ConstWGSize; int32_t DeviceId; void *CallStackAddr = nullptr; const char *Name; KernelTy(llvm::omp::OMPTgtExecModeFlags ExecutionMode, int16_t ConstWgSize, int32_t DeviceId, void *CallStackAddr, const char *Name, uint32_t KernargSegmentSize, hsa_amd_memory_pool_t &KernArgMemoryPool) : ExecutionMode(ExecutionMode), ConstWGSize(ConstWgSize), DeviceId(DeviceId), CallStackAddr(CallStackAddr), Name(Name) { DP("Construct kernelinfo: ExecMode %d\n", ExecutionMode); std::string N(Name); if (KernelArgPoolMap.find(N) == KernelArgPoolMap.end()) { KernelArgPoolMap.insert( std::make_pair(N, std::unique_ptr(new KernelArgPool( KernargSegmentSize, KernArgMemoryPool)))); } } }; /// List that contains all the kernels. /// FIXME: we may need this to be per device and per library. std::list KernelsList; template static hsa_status_t findAgents(Callback CB) { hsa_status_t Err = hsa::iterate_agents([&](hsa_agent_t Agent) -> hsa_status_t { hsa_device_type_t DeviceType; // get_info fails iff HSA runtime not yet initialized hsa_status_t Err = hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &DeviceType); if (Err != HSA_STATUS_SUCCESS) { if (print_kernel_trace > 0) DP("rtl.cpp: err %s\n", get_error_string(Err)); return Err; } CB(DeviceType, Agent); return HSA_STATUS_SUCCESS; }); // iterate_agents fails iff HSA runtime not yet initialized if (print_kernel_trace > 0 && Err != HSA_STATUS_SUCCESS) { DP("rtl.cpp: err %s\n", get_error_string(Err)); } return Err; } static void callbackQueue(hsa_status_t Status, hsa_queue_t *Source, void *Data) { if (Status != HSA_STATUS_SUCCESS) { const char *StatusString; if (hsa_status_string(Status, &StatusString) != HSA_STATUS_SUCCESS) { StatusString = "unavailable"; } DP("[%s:%d] GPU error in queue %p %d (%s)\n", __FILE__, __LINE__, Source, Status, StatusString); abort(); } } namespace core { namespace { bool checkResult(hsa_status_t Err, const char *ErrMsg) { if (Err == HSA_STATUS_SUCCESS) return true; REPORT("%s", ErrMsg); REPORT("%s", get_error_string(Err)); return false; } void packetStoreRelease(uint32_t *Packet, uint16_t Header, uint16_t Rest) { __atomic_store_n(Packet, Header | (Rest << 16), __ATOMIC_RELEASE); } uint16_t createHeader() { uint16_t Header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE; Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE; Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE; return Header; } hsa_status_t isValidMemoryPool(hsa_amd_memory_pool_t MemoryPool) { bool AllocAllowed = false; hsa_status_t Err = hsa_amd_memory_pool_get_info( MemoryPool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, &AllocAllowed); if (Err != HSA_STATUS_SUCCESS) { DP("Alloc allowed in memory pool check failed: %s\n", get_error_string(Err)); return Err; } size_t Size = 0; Err = hsa_amd_memory_pool_get_info(MemoryPool, HSA_AMD_MEMORY_POOL_INFO_SIZE, &Size); if (Err != HSA_STATUS_SUCCESS) { DP("Get memory pool size failed: %s\n", get_error_string(Err)); return Err; } return (AllocAllowed && Size > 0) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR; } hsa_status_t addMemoryPool(hsa_amd_memory_pool_t MemoryPool, void *Data) { std::vector *Result = static_cast *>(Data); hsa_status_t Err; if ((Err = isValidMemoryPool(MemoryPool)) != HSA_STATUS_SUCCESS) { return Err; } Result->push_back(MemoryPool); return HSA_STATUS_SUCCESS; } } // namespace } // namespace core struct EnvironmentVariables { int NumTeams; int TeamLimit; int TeamThreadLimit; int MaxTeamsDefault; int DynamicMemSize; }; template static constexpr const llvm::omp::GV &getGridValue() { return llvm::omp::getAMDGPUGridValues(); } struct HSALifetime { // Wrapper around HSA used to ensure it is constructed before other types // and destructed after, which means said other types can use raii for // cleanup without risking running outside of the lifetime of HSA const hsa_status_t S; bool HSAInitSuccess() { return S == HSA_STATUS_SUCCESS; } HSALifetime() : S(hsa_init()) {} ~HSALifetime() { if (S == HSA_STATUS_SUCCESS) { hsa_status_t Err = hsa_shut_down(); if (Err != HSA_STATUS_SUCCESS) { // Can't call into HSA to get a string from the integer DP("Shutting down HSA failed: %d\n", Err); } } } }; // Handle scheduling of multiple hsa_queue's per device to // multiple threads (one scheduler per device) class HSAQueueScheduler { public: HSAQueueScheduler() : Current(0) {} HSAQueueScheduler(const HSAQueueScheduler &) = delete; HSAQueueScheduler(HSAQueueScheduler &&Q) { Current = Q.Current.load(); for (uint8_t I = 0; I < NUM_QUEUES_PER_DEVICE; I++) { HSAQueues[I] = Q.HSAQueues[I]; Q.HSAQueues[I] = nullptr; } } // \return false if any HSA queue creation fails bool createQueues(hsa_agent_t HSAAgent, uint32_t QueueSize) { for (uint8_t I = 0; I < NUM_QUEUES_PER_DEVICE; I++) { hsa_queue_t *Q = nullptr; hsa_status_t Rc = hsa_queue_create(HSAAgent, QueueSize, HSA_QUEUE_TYPE_MULTI, callbackQueue, NULL, UINT32_MAX, UINT32_MAX, &Q); if (Rc != HSA_STATUS_SUCCESS) { DP("Failed to create HSA queue %d\n", I); return false; } HSAQueues[I] = Q; } return true; } ~HSAQueueScheduler() { for (uint8_t I = 0; I < NUM_QUEUES_PER_DEVICE; I++) { if (HSAQueues[I]) { hsa_status_t Err = hsa_queue_destroy(HSAQueues[I]); if (Err != HSA_STATUS_SUCCESS) DP("Error destroying HSA queue"); } } } // \return next queue to use for device hsa_queue_t *next() { return HSAQueues[(Current.fetch_add(1, std::memory_order_relaxed)) % NUM_QUEUES_PER_DEVICE]; } private: // Number of queues per device enum : uint8_t { NUM_QUEUES_PER_DEVICE = 4 }; hsa_queue_t *HSAQueues[NUM_QUEUES_PER_DEVICE] = {}; std::atomic Current; }; /// Class containing all the device information class RTLDeviceInfoTy : HSALifetime { std::vector> FuncGblEntries; struct QueueDeleter { void operator()(hsa_queue_t *Q) { if (Q) { hsa_status_t Err = hsa_queue_destroy(Q); if (Err != HSA_STATUS_SUCCESS) { DP("Error destroying hsa queue: %s\n", get_error_string(Err)); } } } }; public: bool ConstructionSucceeded = false; // load binary populates symbol tables and mutates various global state // run uses those symbol tables std::shared_timed_mutex LoadRunLock; int NumberOfDevices = 0; // GPU devices std::vector HSAAgents; std::vector HSAQueueSchedulers; // one per gpu // CPUs std::vector CPUAgents; // Device properties std::vector ComputeUnits; std::vector GroupsPerDevice; std::vector ThreadsPerGroup; std::vector WarpSize; std::vector GPUName; // OpenMP properties std::vector NumTeams; std::vector NumThreads; // OpenMP Environment properties EnvironmentVariables Env; // OpenMP Requires Flags int64_t RequiresFlags; // Resource pools SignalPoolT FreeSignalPool; bool HostcallRequired = false; std::vector HSAExecutables; std::vector> KernelInfoTable; std::vector> SymbolInfoTable; hsa_amd_memory_pool_t KernArgPool; // fine grained memory pool for host allocations hsa_amd_memory_pool_t HostFineGrainedMemoryPool; // fine and coarse-grained memory pools per offloading device std::vector DeviceFineGrainedMemoryPools; std::vector DeviceCoarseGrainedMemoryPools; struct ImplFreePtrDeletor { void operator()(void *P) { core::Runtime::Memfree(P); // ignore failure to free } }; // device_State shared across loaded binaries, error if inconsistent size std::vector, uint64_t>> DeviceStateStore; static const unsigned HardTeamLimit = (1 << 16) - 1; // 64K needed to fit in uint16 static const int DefaultNumTeams = 128; // These need to be per-device since different devices can have different // wave sizes, but are currently the same number for each so that refactor // can be postponed. static_assert(getGridValue<32>().GV_Max_Teams == getGridValue<64>().GV_Max_Teams, ""); static const int MaxTeams = getGridValue<64>().GV_Max_Teams; static_assert(getGridValue<32>().GV_Max_WG_Size == getGridValue<64>().GV_Max_WG_Size, ""); static const int MaxWgSize = getGridValue<64>().GV_Max_WG_Size; static_assert(getGridValue<32>().GV_Default_WG_Size == getGridValue<64>().GV_Default_WG_Size, ""); static const int DefaultWgSize = getGridValue<64>().GV_Default_WG_Size; using MemcpyFunc = hsa_status_t (*)(hsa_signal_t, void *, void *, size_t Size, hsa_agent_t, hsa_amd_memory_pool_t); hsa_status_t freesignalpoolMemcpy(void *Dest, void *Src, size_t Size, MemcpyFunc Func, int32_t DeviceId) { hsa_agent_t Agent = HSAAgents[DeviceId]; hsa_signal_t S = FreeSignalPool.pop(); if (S.handle == 0) { return HSA_STATUS_ERROR; } hsa_status_t R = Func(S, Dest, Src, Size, Agent, HostFineGrainedMemoryPool); FreeSignalPool.push(S); return R; } hsa_status_t freesignalpoolMemcpyD2H(void *Dest, void *Src, size_t Size, int32_t DeviceId) { return freesignalpoolMemcpy(Dest, Src, Size, impl_memcpy_d2h, DeviceId); } hsa_status_t freesignalpoolMemcpyH2D(void *Dest, void *Src, size_t Size, int32_t DeviceId) { return freesignalpoolMemcpy(Dest, Src, Size, impl_memcpy_h2d, DeviceId); } static void printDeviceInfo(int32_t DeviceId, hsa_agent_t Agent) { char TmpChar[1000]; uint16_t Major, Minor; uint32_t TmpUInt; uint32_t TmpUInt2; uint32_t CacheSize[4]; bool TmpBool; uint16_t WorkgroupMaxDim[3]; hsa_dim3_t GridMaxDim; // Getting basic information about HSA and Device core::checkResult( hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MAJOR, &Major), "Error from hsa_system_get_info when obtaining " "HSA_SYSTEM_INFO_VERSION_MAJOR\n"); core::checkResult( hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MINOR, &Minor), "Error from hsa_system_get_info when obtaining " "HSA_SYSTEM_INFO_VERSION_MINOR\n"); printf(" HSA Runtime Version: \t\t%u.%u \n", Major, Minor); printf(" HSA OpenMP Device Number: \t\t%d \n", DeviceId); core::checkResult( hsa_agent_get_info( Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_PRODUCT_NAME, TmpChar), "Error returned from hsa_agent_get_info when obtaining " "HSA_AMD_AGENT_INFO_PRODUCT_NAME\n"); printf(" Product Name: \t\t\t%s \n", TmpChar); core::checkResult(hsa_agent_get_info(Agent, HSA_AGENT_INFO_NAME, TmpChar), "Error returned from hsa_agent_get_info when obtaining " "HSA_AGENT_INFO_NAME\n"); printf(" Device Name: \t\t\t%s \n", TmpChar); core::checkResult( hsa_agent_get_info(Agent, HSA_AGENT_INFO_VENDOR_NAME, TmpChar), "Error returned from hsa_agent_get_info when obtaining " "HSA_AGENT_INFO_NAME\n"); printf(" Vendor Name: \t\t\t%s \n", TmpChar); hsa_device_type_t DevType; core::checkResult( hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &DevType), "Error returned from hsa_agent_get_info when obtaining " "HSA_AGENT_INFO_DEVICE\n"); printf(" Device Type: \t\t\t%s \n", DevType == HSA_DEVICE_TYPE_CPU ? "CPU" : (DevType == HSA_DEVICE_TYPE_GPU ? "GPU" : (DevType == HSA_DEVICE_TYPE_DSP ? "DSP" : "UNKNOWN"))); core::checkResult( hsa_agent_get_info(Agent, HSA_AGENT_INFO_QUEUES_MAX, &TmpUInt), "Error returned from hsa_agent_get_info when obtaining " "HSA_AGENT_INFO_QUEUES_MAX\n"); printf(" Max Queues: \t\t\t%u \n", TmpUInt); core::checkResult( hsa_agent_get_info(Agent, HSA_AGENT_INFO_QUEUE_MIN_SIZE, &TmpUInt), "Error returned from hsa_agent_get_info when obtaining " "HSA_AGENT_INFO_QUEUE_MIN_SIZE\n"); printf(" Queue Min Size: \t\t\t%u \n", TmpUInt); core::checkResult( hsa_agent_get_info(Agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &TmpUInt), "Error returned from hsa_agent_get_info when obtaining " "HSA_AGENT_INFO_QUEUE_MAX_SIZE\n"); printf(" Queue Max Size: \t\t\t%u \n", TmpUInt); // Getting cache information printf(" Cache:\n"); // FIXME: This is deprecated according to HSA documentation. But using // hsa_agent_iterate_caches and hsa_cache_get_info breaks execution during // runtime. core::checkResult( hsa_agent_get_info(Agent, HSA_AGENT_INFO_CACHE_SIZE, CacheSize), "Error returned from hsa_agent_get_info when obtaining " "HSA_AGENT_INFO_CACHE_SIZE\n"); for (int I = 0; I < 4; I++) { if (CacheSize[I]) { printf(" L%u: \t\t\t\t%u bytes\n", I, CacheSize[I]); } } core::checkResult( hsa_agent_get_info(Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_CACHELINE_SIZE, &TmpUInt), "Error returned from hsa_agent_get_info when obtaining " "HSA_AMD_AGENT_INFO_CACHELINE_SIZE\n"); printf(" Cacheline Size: \t\t\t%u \n", TmpUInt); core::checkResult( hsa_agent_get_info( Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY, &TmpUInt), "Error returned from hsa_agent_get_info when obtaining " "HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY\n"); printf(" Max Clock Freq(MHz): \t\t%u \n", TmpUInt); core::checkResult( hsa_agent_get_info( Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &TmpUInt), "Error returned from hsa_agent_get_info when obtaining " "HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT\n"); printf(" Compute Units: \t\t\t%u \n", TmpUInt); core::checkResult(hsa_agent_get_info( Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU, &TmpUInt), "Error returned from hsa_agent_get_info when obtaining " "HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU\n"); printf(" SIMD per CU: \t\t\t%u \n", TmpUInt); core::checkResult( hsa_agent_get_info(Agent, HSA_AGENT_INFO_FAST_F16_OPERATION, &TmpBool), "Error returned from hsa_agent_get_info when obtaining " "HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU\n"); printf(" Fast F16 Operation: \t\t%s \n", (TmpBool ? "TRUE" : "FALSE")); core::checkResult( hsa_agent_get_info(Agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &TmpUInt2), "Error returned from hsa_agent_get_info when obtaining " "HSA_AGENT_INFO_WAVEFRONT_SIZE\n"); printf(" Wavefront Size: \t\t\t%u \n", TmpUInt2); core::checkResult( hsa_agent_get_info(Agent, HSA_AGENT_INFO_WORKGROUP_MAX_SIZE, &TmpUInt), "Error returned from hsa_agent_get_info when obtaining " "HSA_AGENT_INFO_WORKGROUP_MAX_SIZE\n"); printf(" Workgroup Max Size: \t\t%u \n", TmpUInt); core::checkResult(hsa_agent_get_info(Agent, HSA_AGENT_INFO_WORKGROUP_MAX_DIM, WorkgroupMaxDim), "Error returned from hsa_agent_get_info when obtaining " "HSA_AGENT_INFO_WORKGROUP_MAX_DIM\n"); printf(" Workgroup Max Size per Dimension:\n"); printf(" x: \t\t\t\t%u\n", WorkgroupMaxDim[0]); printf(" y: \t\t\t\t%u\n", WorkgroupMaxDim[1]); printf(" z: \t\t\t\t%u\n", WorkgroupMaxDim[2]); core::checkResult(hsa_agent_get_info( Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU, &TmpUInt), "Error returned from hsa_agent_get_info when obtaining " "HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU\n"); printf(" Max Waves Per CU: \t\t\t%u \n", TmpUInt); printf(" Max Work-item Per CU: \t\t%u \n", TmpUInt * TmpUInt2); core::checkResult( hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_SIZE, &TmpUInt), "Error returned from hsa_agent_get_info when obtaining " "HSA_AGENT_INFO_GRID_MAX_SIZE\n"); printf(" Grid Max Size: \t\t\t%u \n", TmpUInt); core::checkResult( hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_DIM, &GridMaxDim), "Error returned from hsa_agent_get_info when obtaining " "HSA_AGENT_INFO_GRID_MAX_DIM\n"); printf(" Grid Max Size per Dimension: \t\t\n"); printf(" x: \t\t\t\t%u\n", GridMaxDim.x); printf(" y: \t\t\t\t%u\n", GridMaxDim.y); printf(" z: \t\t\t\t%u\n", GridMaxDim.z); core::checkResult( hsa_agent_get_info(Agent, HSA_AGENT_INFO_FBARRIER_MAX_SIZE, &TmpUInt), "Error returned from hsa_agent_get_info when obtaining " "HSA_AGENT_INFO_FBARRIER_MAX_SIZE\n"); printf(" Max fbarriers/Workgrp: \t\t%u\n", TmpUInt); printf(" Memory Pools:\n"); auto CbMem = [](hsa_amd_memory_pool_t Region, void *Data) -> hsa_status_t { std::string TmpStr; size_t Size; bool Alloc, Access; hsa_amd_segment_t Segment; hsa_amd_memory_pool_global_flag_t GlobalFlags; core::checkResult( hsa_amd_memory_pool_get_info( Region, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags), "Error returned from hsa_amd_memory_pool_get_info when obtaining " "HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS\n"); core::checkResult(hsa_amd_memory_pool_get_info( Region, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &Segment), "Error returned from hsa_amd_memory_pool_get_info when " "obtaining HSA_AMD_MEMORY_POOL_INFO_SEGMENT\n"); switch (Segment) { case HSA_AMD_SEGMENT_GLOBAL: TmpStr = "GLOBAL; FLAGS: "; if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT & GlobalFlags) TmpStr += "KERNARG, "; if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED & GlobalFlags) TmpStr += "FINE GRAINED, "; if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED & GlobalFlags) TmpStr += "COARSE GRAINED, "; break; case HSA_AMD_SEGMENT_READONLY: TmpStr = "READONLY"; break; case HSA_AMD_SEGMENT_PRIVATE: TmpStr = "PRIVATE"; break; case HSA_AMD_SEGMENT_GROUP: TmpStr = "GROUP"; break; } printf(" Pool %s: \n", TmpStr.c_str()); core::checkResult(hsa_amd_memory_pool_get_info( Region, HSA_AMD_MEMORY_POOL_INFO_SIZE, &Size), "Error returned from hsa_amd_memory_pool_get_info when " "obtaining HSA_AMD_MEMORY_POOL_INFO_SIZE\n"); printf(" Size: \t\t\t\t %zu bytes\n", Size); core::checkResult( hsa_amd_memory_pool_get_info( Region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, &Alloc), "Error returned from hsa_amd_memory_pool_get_info when obtaining " "HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED\n"); printf(" Allocatable: \t\t\t %s\n", (Alloc ? "TRUE" : "FALSE")); core::checkResult( hsa_amd_memory_pool_get_info( Region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE, &Size), "Error returned from hsa_amd_memory_pool_get_info when obtaining " "HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE\n"); printf(" Runtime Alloc Granule: \t\t %zu bytes\n", Size); core::checkResult( hsa_amd_memory_pool_get_info( Region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT, &Size), "Error returned from hsa_amd_memory_pool_get_info when obtaining " "HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT\n"); printf(" Runtime Alloc alignment: \t %zu bytes\n", Size); core::checkResult( hsa_amd_memory_pool_get_info( Region, HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL, &Access), "Error returned from hsa_amd_memory_pool_get_info when obtaining " "HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL\n"); printf(" Accessable by all: \t\t %s\n", (Access ? "TRUE" : "FALSE")); return HSA_STATUS_SUCCESS; }; // Iterate over all the memory regions for this agent. Get the memory region // type and size hsa_amd_agent_iterate_memory_pools(Agent, CbMem, nullptr); printf(" ISAs:\n"); auto CBIsas = [](hsa_isa_t Isa, void *Data) -> hsa_status_t { char TmpChar[1000]; core::checkResult(hsa_isa_get_info_alt(Isa, HSA_ISA_INFO_NAME, TmpChar), "Error returned from hsa_isa_get_info_alt when " "obtaining HSA_ISA_INFO_NAME\n"); printf(" Name: \t\t\t\t %s\n", TmpChar); return HSA_STATUS_SUCCESS; }; // Iterate over all the memory regions for this agent. Get the memory region // type and size hsa_agent_iterate_isas(Agent, CBIsas, nullptr); } // Record entry point associated with device void addOffloadEntry(int32_t DeviceId, __tgt_offload_entry Entry) { assert(DeviceId < (int32_t)FuncGblEntries.size() && "Unexpected device id!"); FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back(); E.Entries.push_back(Entry); } // Return true if the entry is associated with device bool findOffloadEntry(int32_t DeviceId, void *Addr) { assert(DeviceId < (int32_t)FuncGblEntries.size() && "Unexpected device id!"); FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back(); for (auto &It : E.Entries) { if (It.addr == Addr) return true; } return false; } // Return the pointer to the target entries table __tgt_target_table *getOffloadEntriesTable(int32_t DeviceId) { assert(DeviceId < (int32_t)FuncGblEntries.size() && "Unexpected device id!"); FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back(); int32_t Size = E.Entries.size(); // Table is empty if (!Size) return 0; __tgt_offload_entry *Begin = &E.Entries[0]; __tgt_offload_entry *End = &E.Entries[Size - 1]; // Update table info according to the entries and return the pointer E.Table.EntriesBegin = Begin; E.Table.EntriesEnd = ++End; return &E.Table; } // Clear entries table for a device void clearOffloadEntriesTable(int DeviceId) { assert(DeviceId < (int32_t)FuncGblEntries.size() && "Unexpected device id!"); FuncGblEntries[DeviceId].emplace_back(); FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back(); // KernelArgPoolMap.clear(); E.Entries.clear(); E.Table.EntriesBegin = E.Table.EntriesEnd = 0; } hsa_status_t addDeviceMemoryPool(hsa_amd_memory_pool_t MemoryPool, int DeviceId) { assert(DeviceId < DeviceFineGrainedMemoryPools.size() && "Error here."); uint32_t GlobalFlags = 0; hsa_status_t Err = hsa_amd_memory_pool_get_info( MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags); if (Err != HSA_STATUS_SUCCESS) { return Err; } if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) { DeviceFineGrainedMemoryPools[DeviceId] = MemoryPool; } else if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) { DeviceCoarseGrainedMemoryPools[DeviceId] = MemoryPool; } return HSA_STATUS_SUCCESS; } hsa_status_t setupDevicePools(const std::vector &Agents) { for (int DeviceId = 0; DeviceId < Agents.size(); DeviceId++) { hsa_status_t Err = hsa::amd_agent_iterate_memory_pools( Agents[DeviceId], [&](hsa_amd_memory_pool_t MemoryPool) { hsa_status_t ValidStatus = core::isValidMemoryPool(MemoryPool); if (ValidStatus != HSA_STATUS_SUCCESS) { DP("Alloc allowed in memory pool check failed: %s\n", get_error_string(ValidStatus)); return HSA_STATUS_SUCCESS; } return addDeviceMemoryPool(MemoryPool, DeviceId); }); if (Err != HSA_STATUS_SUCCESS) { DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Iterate all memory pools", get_error_string(Err)); return Err; } } return HSA_STATUS_SUCCESS; } hsa_status_t setupHostMemoryPools(std::vector &Agents) { std::vector HostPools; // collect all the "valid" pools for all the given agents. for (const auto &Agent : Agents) { hsa_status_t Err = hsa_amd_agent_iterate_memory_pools( Agent, core::addMemoryPool, static_cast(&HostPools)); if (Err != HSA_STATUS_SUCCESS) { DP("addMemoryPool returned %s, continuing\n", get_error_string(Err)); } } // We need two fine-grained pools. // 1. One with kernarg flag set for storing kernel arguments // 2. Second for host allocations bool FineGrainedMemoryPoolSet = false; bool KernArgPoolSet = false; for (const auto &MemoryPool : HostPools) { hsa_status_t Err = HSA_STATUS_SUCCESS; uint32_t GlobalFlags = 0; Err = hsa_amd_memory_pool_get_info( MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags); if (Err != HSA_STATUS_SUCCESS) { DP("Get memory pool info failed: %s\n", get_error_string(Err)); return Err; } if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) { if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT) { KernArgPool = MemoryPool; KernArgPoolSet = true; } HostFineGrainedMemoryPool = MemoryPool; FineGrainedMemoryPoolSet = true; } } if (FineGrainedMemoryPoolSet && KernArgPoolSet) return HSA_STATUS_SUCCESS; return HSA_STATUS_ERROR; } hsa_amd_memory_pool_t getDeviceMemoryPool(int DeviceId) { assert(DeviceId >= 0 && DeviceId < DeviceCoarseGrainedMemoryPools.size() && "Invalid device Id"); return DeviceCoarseGrainedMemoryPools[DeviceId]; } hsa_amd_memory_pool_t getHostMemoryPool() { return HostFineGrainedMemoryPool; } static int readEnv(const char *Env, int Default = -1) { const char *EnvStr = getenv(Env); int Res = Default; if (EnvStr) { Res = std::stoi(EnvStr); DP("Parsed %s=%d\n", Env, Res); } return Res; } RTLDeviceInfoTy() { DP("Start initializing " GETNAME(TARGET_NAME) "\n"); // LIBOMPTARGET_KERNEL_TRACE provides a kernel launch trace to stderr // anytime. You do not need a debug library build. // 0 => no tracing // 1 => tracing dispatch only // >1 => verbosity increase if (!HSAInitSuccess()) { DP("Error when initializing HSA in " GETNAME(TARGET_NAME) "\n"); return; } if (char *EnvStr = getenv("LIBOMPTARGET_KERNEL_TRACE")) print_kernel_trace = atoi(EnvStr); else print_kernel_trace = 0; hsa_status_t Err = core::atl_init_gpu_context(); if (Err != HSA_STATUS_SUCCESS) { DP("Error when initializing " GETNAME(TARGET_NAME) "\n"); return; } // Init hostcall soon after initializing hsa hostrpc_init(); Err = findAgents([&](hsa_device_type_t DeviceType, hsa_agent_t Agent) { if (DeviceType == HSA_DEVICE_TYPE_CPU) { CPUAgents.push_back(Agent); } else { HSAAgents.push_back(Agent); } }); if (Err != HSA_STATUS_SUCCESS) return; NumberOfDevices = (int)HSAAgents.size(); if (NumberOfDevices == 0) { DP("There are no devices supporting HSA.\n"); return; } DP("There are %d devices supporting HSA.\n", NumberOfDevices); // Init the device info HSAQueueSchedulers.reserve(NumberOfDevices); FuncGblEntries.resize(NumberOfDevices); ThreadsPerGroup.resize(NumberOfDevices); ComputeUnits.resize(NumberOfDevices); GPUName.resize(NumberOfDevices); GroupsPerDevice.resize(NumberOfDevices); WarpSize.resize(NumberOfDevices); NumTeams.resize(NumberOfDevices); NumThreads.resize(NumberOfDevices); DeviceStateStore.resize(NumberOfDevices); KernelInfoTable.resize(NumberOfDevices); SymbolInfoTable.resize(NumberOfDevices); DeviceCoarseGrainedMemoryPools.resize(NumberOfDevices); DeviceFineGrainedMemoryPools.resize(NumberOfDevices); Err = setupDevicePools(HSAAgents); if (Err != HSA_STATUS_SUCCESS) { DP("Setup for Device Memory Pools failed\n"); return; } Err = setupHostMemoryPools(CPUAgents); if (Err != HSA_STATUS_SUCCESS) { DP("Setup for Host Memory Pools failed\n"); return; } for (int I = 0; I < NumberOfDevices; I++) { uint32_t QueueSize = 0; { hsa_status_t Err = hsa_agent_get_info( HSAAgents[I], HSA_AGENT_INFO_QUEUE_MAX_SIZE, &QueueSize); if (Err != HSA_STATUS_SUCCESS) { DP("HSA query QUEUE_MAX_SIZE failed for agent %d\n", I); return; } enum { MaxQueueSize = 4096 }; if (QueueSize > MaxQueueSize) { QueueSize = MaxQueueSize; } } { HSAQueueScheduler QSched; if (!QSched.createQueues(HSAAgents[I], QueueSize)) return; HSAQueueSchedulers.emplace_back(std::move(QSched)); } DeviceStateStore[I] = {nullptr, 0}; } for (int I = 0; I < NumberOfDevices; I++) { ThreadsPerGroup[I] = RTLDeviceInfoTy::DefaultWgSize; GroupsPerDevice[I] = RTLDeviceInfoTy::DefaultNumTeams; ComputeUnits[I] = 1; DP("Device %d: Initial groupsPerDevice %d & threadsPerGroup %d\n", I, GroupsPerDevice[I], ThreadsPerGroup[I]); } // Get environment variables regarding teams Env.TeamLimit = readEnv("OMP_TEAM_LIMIT"); Env.NumTeams = readEnv("OMP_NUM_TEAMS"); Env.MaxTeamsDefault = readEnv("OMP_MAX_TEAMS_DEFAULT"); Env.TeamThreadLimit = readEnv("OMP_TEAMS_THREAD_LIMIT"); Env.DynamicMemSize = readEnv("LIBOMPTARGET_SHARED_MEMORY_SIZE", 0); // Default state. RequiresFlags = OMP_REQ_UNDEFINED; ConstructionSucceeded = true; } ~RTLDeviceInfoTy() { DP("Finalizing the " GETNAME(TARGET_NAME) " DeviceInfo.\n"); if (!HSAInitSuccess()) { // Then none of these can have been set up and they can't be torn down return; } // Run destructors on types that use HSA before // impl_finalize removes access to it DeviceStateStore.clear(); KernelArgPoolMap.clear(); // Terminate hostrpc before finalizing hsa hostrpc_terminate(); hsa_status_t Err; for (uint32_t I = 0; I < HSAExecutables.size(); I++) { Err = hsa_executable_destroy(HSAExecutables[I]); if (Err != HSA_STATUS_SUCCESS) { DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, "Destroying executable", get_error_string(Err)); } } } }; pthread_mutex_t SignalPoolT::mutex = PTHREAD_MUTEX_INITIALIZER; static RTLDeviceInfoTy DeviceInfo; namespace { int32_t dataRetrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, int64_t Size, __tgt_async_info *AsyncInfo) { assert(AsyncInfo && "AsyncInfo is nullptr"); assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large"); // Return success if we are not copying back to host from target. if (!HstPtr) return OFFLOAD_SUCCESS; hsa_status_t Err; DP("Retrieve data %ld bytes, (tgt:%016llx) -> (hst:%016llx).\n", Size, (long long unsigned)(Elf64_Addr)TgtPtr, (long long unsigned)(Elf64_Addr)HstPtr); Err = DeviceInfo.freesignalpoolMemcpyD2H(HstPtr, TgtPtr, (size_t)Size, DeviceId); if (Err != HSA_STATUS_SUCCESS) { DP("Error when copying data from device to host. Pointers: " "host = 0x%016lx, device = 0x%016lx, size = %lld\n", (Elf64_Addr)HstPtr, (Elf64_Addr)TgtPtr, (unsigned long long)Size); return OFFLOAD_FAIL; } DP("DONE Retrieve data %ld bytes, (tgt:%016llx) -> (hst:%016llx).\n", Size, (long long unsigned)(Elf64_Addr)TgtPtr, (long long unsigned)(Elf64_Addr)HstPtr); return OFFLOAD_SUCCESS; } int32_t dataSubmit(int32_t DeviceId, void *TgtPtr, void *HstPtr, int64_t Size, __tgt_async_info *AsyncInfo) { assert(AsyncInfo && "AsyncInfo is nullptr"); hsa_status_t Err; assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large"); // Return success if we are not doing host to target. if (!HstPtr) return OFFLOAD_SUCCESS; DP("Submit data %ld bytes, (hst:%016llx) -> (tgt:%016llx).\n", Size, (long long unsigned)(Elf64_Addr)HstPtr, (long long unsigned)(Elf64_Addr)TgtPtr); Err = DeviceInfo.freesignalpoolMemcpyH2D(TgtPtr, HstPtr, (size_t)Size, DeviceId); if (Err != HSA_STATUS_SUCCESS) { DP("Error when copying data from host to device. Pointers: " "host = 0x%016lx, device = 0x%016lx, size = %lld\n", (Elf64_Addr)HstPtr, (Elf64_Addr)TgtPtr, (unsigned long long)Size); return OFFLOAD_FAIL; } return OFFLOAD_SUCCESS; } // Async. // The implementation was written with cuda streams in mind. The semantics of // that are to execute kernels on a queue in order of insertion. A synchronise // call then makes writes visible between host and device. This means a series // of N data_submit_async calls are expected to execute serially. HSA offers // various options to run the data copies concurrently. This may require changes // to libomptarget. // __tgt_async_info* contains a void * Queue. Queue = 0 is used to indicate that // there are no outstanding kernels that need to be synchronized. Any async call // may be passed a Queue==0, at which point the cuda implementation will set it // to non-null (see getStream). The cuda streams are per-device. Upstream may // change this interface to explicitly initialize the AsyncInfo_pointer, but // until then hsa lazily initializes it as well. void initAsyncInfo(__tgt_async_info *AsyncInfo) { // set non-null while using async calls, return to null to indicate completion assert(AsyncInfo); if (!AsyncInfo->Queue) { AsyncInfo->Queue = reinterpret_cast(UINT64_MAX); } } void finiAsyncInfo(__tgt_async_info *AsyncInfo) { assert(AsyncInfo); assert(AsyncInfo->Queue); AsyncInfo->Queue = 0; } // Determine launch values for kernel. struct LaunchVals { int WorkgroupSize; int GridSize; }; LaunchVals getLaunchVals(int WarpSize, EnvironmentVariables Env, int ConstWGSize, llvm::omp::OMPTgtExecModeFlags ExecutionMode, int NumTeams, int ThreadLimit, uint64_t LoopTripcount, int DeviceNumTeams) { int ThreadsPerGroup = RTLDeviceInfoTy::DefaultWgSize; int NumGroups = 0; int MaxTeams = Env.MaxTeamsDefault > 0 ? Env.MaxTeamsDefault : DeviceNumTeams; if (MaxTeams > RTLDeviceInfoTy::HardTeamLimit) MaxTeams = RTLDeviceInfoTy::HardTeamLimit; if (print_kernel_trace & STARTUP_DETAILS) { DP("RTLDeviceInfoTy::Max_Teams: %d\n", RTLDeviceInfoTy::MaxTeams); DP("Max_Teams: %d\n", MaxTeams); DP("RTLDeviceInfoTy::Warp_Size: %d\n", WarpSize); DP("RTLDeviceInfoTy::Max_WG_Size: %d\n", RTLDeviceInfoTy::MaxWgSize); DP("RTLDeviceInfoTy::Default_WG_Size: %d\n", RTLDeviceInfoTy::DefaultWgSize); DP("thread_limit: %d\n", ThreadLimit); DP("threadsPerGroup: %d\n", ThreadsPerGroup); DP("ConstWGSize: %d\n", ConstWGSize); } // check for thread_limit() clause if (ThreadLimit > 0) { ThreadsPerGroup = ThreadLimit; DP("Setting threads per block to requested %d\n", ThreadLimit); // Add master warp for GENERIC if (ExecutionMode == llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC) { ThreadsPerGroup += WarpSize; DP("Adding master wavefront: +%d threads\n", WarpSize); } if (ThreadsPerGroup > RTLDeviceInfoTy::MaxWgSize) { // limit to max ThreadsPerGroup = RTLDeviceInfoTy::MaxWgSize; DP("Setting threads per block to maximum %d\n", ThreadsPerGroup); } } // check flat_max_work_group_size attr here if (ThreadsPerGroup > ConstWGSize) { ThreadsPerGroup = ConstWGSize; DP("Reduced threadsPerGroup to flat-attr-group-size limit %d\n", ThreadsPerGroup); } if (print_kernel_trace & STARTUP_DETAILS) DP("threadsPerGroup: %d\n", ThreadsPerGroup); DP("Preparing %d threads\n", ThreadsPerGroup); // Set default num_groups (teams) if (Env.TeamLimit > 0) NumGroups = (MaxTeams < Env.TeamLimit) ? MaxTeams : Env.TeamLimit; else NumGroups = MaxTeams; DP("Set default num of groups %d\n", NumGroups); if (print_kernel_trace & STARTUP_DETAILS) { DP("num_groups: %d\n", NumGroups); DP("num_teams: %d\n", NumTeams); } // Reduce num_groups if threadsPerGroup exceeds RTLDeviceInfoTy::Max_WG_Size // This reduction is typical for default case (no thread_limit clause). // or when user goes crazy with num_teams clause. // FIXME: We cant distinguish between a constant or variable thread limit. // So we only handle constant thread_limits. if (ThreadsPerGroup > RTLDeviceInfoTy::DefaultWgSize) // 256 < threadsPerGroup <= 1024 // Should we round threadsPerGroup up to nearest WarpSize // here? NumGroups = (MaxTeams * RTLDeviceInfoTy::MaxWgSize) / ThreadsPerGroup; // check for num_teams() clause if (NumTeams > 0) { NumGroups = (NumTeams < NumGroups) ? NumTeams : NumGroups; } if (print_kernel_trace & STARTUP_DETAILS) { DP("num_groups: %d\n", NumGroups); DP("Env.NumTeams %d\n", Env.NumTeams); DP("Env.TeamLimit %d\n", Env.TeamLimit); } if (Env.NumTeams > 0) { NumGroups = (Env.NumTeams < NumGroups) ? Env.NumTeams : NumGroups; DP("Modifying teams based on Env.NumTeams %d\n", Env.NumTeams); } else if (Env.TeamLimit > 0) { NumGroups = (Env.TeamLimit < NumGroups) ? Env.TeamLimit : NumGroups; DP("Modifying teams based on Env.TeamLimit%d\n", Env.TeamLimit); } else { if (NumTeams <= 0) { if (LoopTripcount > 0) { if (ExecutionMode == llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD) { // round up to the nearest integer NumGroups = ((LoopTripcount - 1) / ThreadsPerGroup) + 1; } else if (ExecutionMode == llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC) { NumGroups = LoopTripcount; } else /* OMP_TGT_EXEC_MODE_GENERIC_SPMD */ { // This is a generic kernel that was transformed to use SPMD-mode // execution but uses Generic-mode semantics for scheduling. NumGroups = LoopTripcount; } DP("Using %d teams due to loop trip count %" PRIu64 " and number of " "threads per block %d\n", NumGroups, LoopTripcount, ThreadsPerGroup); } } else { NumGroups = NumTeams; } if (NumGroups > MaxTeams) { NumGroups = MaxTeams; if (print_kernel_trace & STARTUP_DETAILS) DP("Limiting num_groups %d to Max_Teams %d \n", NumGroups, MaxTeams); } if (NumGroups > NumTeams && NumTeams > 0) { NumGroups = NumTeams; if (print_kernel_trace & STARTUP_DETAILS) DP("Limiting num_groups %d to clause num_teams %d \n", NumGroups, NumTeams); } } // num_teams clause always honored, no matter what, unless DEFAULT is active. if (NumTeams > 0) { NumGroups = NumTeams; // Cap num_groups to EnvMaxTeamsDefault if set. if (Env.MaxTeamsDefault > 0 && NumGroups > Env.MaxTeamsDefault) NumGroups = Env.MaxTeamsDefault; } if (print_kernel_trace & STARTUP_DETAILS) { DP("threadsPerGroup: %d\n", ThreadsPerGroup); DP("num_groups: %d\n", NumGroups); DP("loop_tripcount: %ld\n", LoopTripcount); } DP("Final %d num_groups and %d threadsPerGroup\n", NumGroups, ThreadsPerGroup); LaunchVals Res; Res.WorkgroupSize = ThreadsPerGroup; Res.GridSize = ThreadsPerGroup * NumGroups; return Res; } static uint64_t acquireAvailablePacketId(hsa_queue_t *Queue) { uint64_t PacketId = hsa_queue_add_write_index_relaxed(Queue, 1); bool Full = true; while (Full) { Full = PacketId >= (Queue->size + hsa_queue_load_read_index_scacquire(Queue)); } return PacketId; } int32_t runRegionLocked(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs, ptrdiff_t *TgtOffsets, int32_t ArgNum, int32_t NumTeams, int32_t ThreadLimit, uint64_t LoopTripcount) { // Set the context we are using // update thread limit content in gpu memory if un-initialized or specified // from host DP("Run target team region thread_limit %d\n", ThreadLimit); // All args are references. std::vector Args(ArgNum); std::vector Ptrs(ArgNum); DP("Arg_num: %d\n", ArgNum); for (int32_t I = 0; I < ArgNum; ++I) { Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]); Args[I] = &Ptrs[I]; DP("Offseted base: arg[%d]:" DPxMOD "\n", I, DPxPTR(Ptrs[I])); } KernelTy *KernelInfo = (KernelTy *)TgtEntryPtr; std::string KernelName = std::string(KernelInfo->Name); auto &KernelInfoTable = DeviceInfo.KernelInfoTable; if (KernelInfoTable[DeviceId].find(KernelName) == KernelInfoTable[DeviceId].end()) { DP("Kernel %s not found\n", KernelName.c_str()); return OFFLOAD_FAIL; } const atl_kernel_info_t KernelInfoEntry = KernelInfoTable[DeviceId][KernelName]; const uint32_t GroupSegmentSize = KernelInfoEntry.group_segment_size + DeviceInfo.Env.DynamicMemSize; const uint32_t SgprCount = KernelInfoEntry.sgpr_count; const uint32_t VgprCount = KernelInfoEntry.vgpr_count; const uint32_t SgprSpillCount = KernelInfoEntry.sgpr_spill_count; const uint32_t VgprSpillCount = KernelInfoEntry.vgpr_spill_count; assert(ArgNum == (int)KernelInfoEntry.explicit_argument_count); /* * Set limit based on ThreadsPerGroup and GroupsPerDevice */ LaunchVals LV = getLaunchVals(DeviceInfo.WarpSize[DeviceId], DeviceInfo.Env, KernelInfo->ConstWGSize, KernelInfo->ExecutionMode, NumTeams, // From run_region arg ThreadLimit, // From run_region arg LoopTripcount, // From run_region arg DeviceInfo.NumTeams[KernelInfo->DeviceId]); const int GridSize = LV.GridSize; const int WorkgroupSize = LV.WorkgroupSize; if (print_kernel_trace >= LAUNCH) { int NumGroups = GridSize / WorkgroupSize; // enum modes are SPMD, GENERIC, NONE 0,1,2 // if doing rtl timing, print to stderr, unless stdout requested. bool TraceToStdout = print_kernel_trace & (RTL_TO_STDOUT | RTL_TIMING); fprintf(TraceToStdout ? stdout : stderr, "DEVID:%2d SGN:%1d ConstWGSize:%-4d args:%2d teamsXthrds:(%4dX%4d) " "reqd:(%4dX%4d) lds_usage:%uB sgpr_count:%u vgpr_count:%u " "sgpr_spill_count:%u vgpr_spill_count:%u tripcount:%lu n:%s\n", DeviceId, KernelInfo->ExecutionMode, KernelInfo->ConstWGSize, ArgNum, NumGroups, WorkgroupSize, NumTeams, ThreadLimit, GroupSegmentSize, SgprCount, VgprCount, SgprSpillCount, VgprSpillCount, LoopTripcount, KernelInfo->Name); } // Run on the device. { hsa_queue_t *Queue = DeviceInfo.HSAQueueSchedulers[DeviceId].next(); if (!Queue) { return OFFLOAD_FAIL; } uint64_t PacketId = acquireAvailablePacketId(Queue); const uint32_t Mask = Queue->size - 1; // size is a power of 2 hsa_kernel_dispatch_packet_t *Packet = (hsa_kernel_dispatch_packet_t *)Queue->base_address + (PacketId & Mask); // packet->header is written last Packet->setup = UINT16_C(1) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; Packet->workgroup_size_x = WorkgroupSize; Packet->workgroup_size_y = 1; Packet->workgroup_size_z = 1; Packet->reserved0 = 0; Packet->grid_size_x = GridSize; Packet->grid_size_y = 1; Packet->grid_size_z = 1; Packet->private_segment_size = KernelInfoEntry.private_segment_size; Packet->group_segment_size = GroupSegmentSize; Packet->kernel_object = KernelInfoEntry.kernel_object; Packet->kernarg_address = 0; // use the block allocator Packet->reserved2 = 0; // impl writes id_ here Packet->completion_signal = {0}; // may want a pool of signals KernelArgPool *ArgPool = nullptr; void *KernArg = nullptr; { auto It = KernelArgPoolMap.find(std::string(KernelInfo->Name)); if (It != KernelArgPoolMap.end()) { ArgPool = (It->second).get(); } } if (!ArgPool) { DP("Warning: No ArgPool for %s on device %d\n", KernelInfo->Name, DeviceId); } { if (ArgPool) { assert(ArgPool->KernargSegmentSize == (ArgNum * sizeof(void *))); KernArg = ArgPool->allocate(ArgNum); } if (!KernArg) { DP("Allocate kernarg failed\n"); return OFFLOAD_FAIL; } // Copy explicit arguments for (int I = 0; I < ArgNum; I++) { memcpy((char *)KernArg + sizeof(void *) * I, Args[I], sizeof(void *)); } // Initialize implicit arguments. TODO: Which of these can be dropped impl_implicit_args_t *ImplArgs = reinterpret_cast( static_cast(KernArg) + ArgPool->KernargSegmentSize); memset(ImplArgs, 0, sizeof(impl_implicit_args_t)); // may not be necessary ImplArgs->offset_x = 0; ImplArgs->offset_y = 0; ImplArgs->offset_z = 0; // assign a hostcall buffer for the selected Q if (__atomic_load_n(&DeviceInfo.HostcallRequired, __ATOMIC_ACQUIRE)) { // hostrpc_assign_buffer is not thread safe, and this function is // under a multiple reader lock, not a writer lock. static pthread_mutex_t HostcallInitLock = PTHREAD_MUTEX_INITIALIZER; pthread_mutex_lock(&HostcallInitLock); uint64_t Buffer = hostrpc_assign_buffer(DeviceInfo.HSAAgents[DeviceId], Queue, DeviceId); pthread_mutex_unlock(&HostcallInitLock); if (!Buffer) { DP("hostrpc_assign_buffer failed, gpu would dereference null and " "error\n"); return OFFLOAD_FAIL; } DP("Implicit argument count: %d\n", KernelInfoEntry.implicit_argument_count); if (KernelInfoEntry.implicit_argument_count >= 4) { // Initialise pointer for implicit_argument_count != 0 ABI // Guess that the right implicit argument is at offset 24 after // the explicit arguments. In the future, should be able to read // the offset from msgpack. Clang is not annotating it at present. uint64_t Offset = sizeof(void *) * (KernelInfoEntry.explicit_argument_count + 3); if ((Offset + 8) > ArgPool->kernargSizeIncludingImplicit()) { DP("Bad offset of hostcall: %lu, exceeds kernarg size w/ implicit " "args: %d\n", Offset + 8, ArgPool->kernargSizeIncludingImplicit()); } else { memcpy(static_cast(KernArg) + Offset, &Buffer, 8); } } // initialise pointer for implicit_argument_count == 0 ABI ImplArgs->hostcall_ptr = Buffer; } Packet->kernarg_address = KernArg; } hsa_signal_t S = DeviceInfo.FreeSignalPool.pop(); if (S.handle == 0) { DP("Failed to get signal instance\n"); return OFFLOAD_FAIL; } Packet->completion_signal = S; hsa_signal_store_relaxed(Packet->completion_signal, 1); // Publish the packet indicating it is ready to be processed core::packetStoreRelease(reinterpret_cast(Packet), core::createHeader(), Packet->setup); // Since the packet is already published, its contents must not be // accessed any more hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId); while (hsa_signal_wait_scacquire(S, HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, HSA_WAIT_STATE_BLOCKED) != 0) ; assert(ArgPool); ArgPool->deallocate(KernArg); DeviceInfo.FreeSignalPool.push(S); } DP("Kernel completed\n"); return OFFLOAD_SUCCESS; } bool elfMachineIdIsAmdgcn(__tgt_device_image *Image) { const uint16_t AmdgcnMachineID = 224; // EM_AMDGPU may not be in system elf.h int32_t R = elf_check_machine(Image, AmdgcnMachineID); if (!R) { DP("Supported machine ID not found\n"); } return R; } uint32_t elfEFlags(__tgt_device_image *Image) { char *ImgBegin = (char *)Image->ImageStart; size_t ImgSize = (char *)Image->ImageEnd - ImgBegin; Elf *E = elf_memory(ImgBegin, ImgSize); if (!E) { DP("Unable to get ELF handle: %s!\n", elf_errmsg(-1)); return 0; } Elf64_Ehdr *Eh64 = elf64_getehdr(E); if (!Eh64) { DP("Unable to get machine ID from ELF file!\n"); elf_end(E); return 0; } uint32_t Flags = Eh64->e_flags; elf_end(E); DP("ELF Flags: 0x%x\n", Flags); return Flags; } template bool enforceUpperBound(T *Value, T Upper) { bool Changed = *Value > Upper; if (Changed) { *Value = Upper; } return Changed; } Elf64_Shdr *findOnlyShtHash(Elf *Elf) { size_t N; int Rc = elf_getshdrnum(Elf, &N); if (Rc != 0) { return nullptr; } Elf64_Shdr *Result = nullptr; for (size_t I = 0; I < N; I++) { Elf_Scn *Scn = elf_getscn(Elf, I); if (Scn) { Elf64_Shdr *Shdr = elf64_getshdr(Scn); if (Shdr) { if (Shdr->sh_type == SHT_HASH) { if (Result == nullptr) { Result = Shdr; } else { // multiple SHT_HASH sections not handled return nullptr; } } } } } return Result; } const Elf64_Sym *elfLookup(Elf *Elf, char *Base, Elf64_Shdr *SectionHash, const char *Symname) { assert(SectionHash); size_t SectionSymtabIndex = SectionHash->sh_link; Elf64_Shdr *SectionSymtab = elf64_getshdr(elf_getscn(Elf, SectionSymtabIndex)); size_t SectionStrtabIndex = SectionSymtab->sh_link; const Elf64_Sym *Symtab = reinterpret_cast(Base + SectionSymtab->sh_offset); const uint32_t *Hashtab = reinterpret_cast(Base + SectionHash->sh_offset); // Layout: // nbucket // nchain // bucket[nbucket] // chain[nchain] uint32_t Nbucket = Hashtab[0]; const uint32_t *Bucket = &Hashtab[2]; const uint32_t *Chain = &Hashtab[Nbucket + 2]; const size_t Max = strlen(Symname) + 1; const uint32_t Hash = elf_hash(Symname); for (uint32_t I = Bucket[Hash % Nbucket]; I != 0; I = Chain[I]) { char *N = elf_strptr(Elf, SectionStrtabIndex, Symtab[I].st_name); if (strncmp(Symname, N, Max) == 0) { return &Symtab[I]; } } return nullptr; } struct SymbolInfo { void *Addr = nullptr; uint32_t Size = UINT32_MAX; uint32_t ShType = SHT_NULL; }; int getSymbolInfoWithoutLoading(Elf *Elf, char *Base, const char *Symname, SymbolInfo *Res) { if (elf_kind(Elf) != ELF_K_ELF) { return 1; } Elf64_Shdr *SectionHash = findOnlyShtHash(Elf); if (!SectionHash) { return 1; } const Elf64_Sym *Sym = elfLookup(Elf, Base, SectionHash, Symname); if (!Sym) { return 1; } if (Sym->st_size > UINT32_MAX) { return 1; } if (Sym->st_shndx == SHN_UNDEF) { return 1; } Elf_Scn *Section = elf_getscn(Elf, Sym->st_shndx); if (!Section) { return 1; } Elf64_Shdr *Header = elf64_getshdr(Section); if (!Header) { return 1; } Res->Addr = Sym->st_value + Base; Res->Size = static_cast(Sym->st_size); Res->ShType = Header->sh_type; return 0; } int getSymbolInfoWithoutLoading(char *Base, size_t ImgSize, const char *Symname, SymbolInfo *Res) { Elf *Elf = elf_memory(Base, ImgSize); if (Elf) { int Rc = getSymbolInfoWithoutLoading(Elf, Base, Symname, Res); elf_end(Elf); return Rc; } return 1; } hsa_status_t interopGetSymbolInfo(char *Base, size_t ImgSize, const char *SymName, void **VarAddr, uint32_t *VarSize) { SymbolInfo SI; int Rc = getSymbolInfoWithoutLoading(Base, ImgSize, SymName, &SI); if (Rc == 0) { *VarAddr = SI.Addr; *VarSize = SI.Size; return HSA_STATUS_SUCCESS; } return HSA_STATUS_ERROR; } template hsa_status_t moduleRegisterFromMemoryToPlace( std::map &KernelInfoTable, std::map &SymbolInfoTable, void *ModuleBytes, size_t ModuleSize, int DeviceId, C Cb, std::vector &HSAExecutables) { auto L = [](void *Data, size_t Size, void *CbState) -> hsa_status_t { C *Unwrapped = static_cast(CbState); return (*Unwrapped)(Data, Size); }; return core::RegisterModuleFromMemory( KernelInfoTable, SymbolInfoTable, ModuleBytes, ModuleSize, DeviceInfo.HSAAgents[DeviceId], L, static_cast(&Cb), HSAExecutables); } uint64_t getDeviceStateBytes(char *ImageStart, size_t ImgSize) { uint64_t DeviceStateBytes = 0; { // If this is the deviceRTL, get the state variable size SymbolInfo SizeSi; int Rc = getSymbolInfoWithoutLoading( ImageStart, ImgSize, "omptarget_nvptx_device_State_size", &SizeSi); if (Rc == 0) { if (SizeSi.Size != sizeof(uint64_t)) { DP("Found device_State_size variable with wrong size\n"); return 0; } // Read number of bytes directly from the elf memcpy(&DeviceStateBytes, SizeSi.Addr, sizeof(uint64_t)); } } return DeviceStateBytes; } struct DeviceEnvironment { // initialise an DeviceEnvironmentTy in the deviceRTL // patches around differences in the deviceRTL between trunk, aomp, // rocmcc. Over time these differences will tend to zero and this class // simplified. // Symbol may be in .data or .bss, and may be missing fields, todo: // review aomp/trunk/rocm and simplify the following // The symbol may also have been deadstripped because the device side // accessors were unused. // If the symbol is in .data (aomp, rocm) it can be written directly. // If it is in .bss, we must wait for it to be allocated space on the // gpu (trunk) and initialize after loading. const char *sym() { return "omptarget_device_environment"; } DeviceEnvironmentTy HostDeviceEnv; SymbolInfo SI; bool Valid = false; __tgt_device_image *Image; const size_t ImgSize; DeviceEnvironment(int DeviceId, int NumberDevices, int DynamicMemSize, __tgt_device_image *Image, const size_t ImgSize) : Image(Image), ImgSize(ImgSize) { HostDeviceEnv.NumDevices = NumberDevices; HostDeviceEnv.DeviceNum = DeviceId; HostDeviceEnv.DebugKind = 0; HostDeviceEnv.DynamicMemSize = DynamicMemSize; if (char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) HostDeviceEnv.DebugKind = std::stoi(EnvStr); int Rc = getSymbolInfoWithoutLoading((char *)Image->ImageStart, ImgSize, sym(), &SI); if (Rc != 0) { DP("Finding global device environment '%s' - symbol missing.\n", sym()); return; } if (SI.Size > sizeof(HostDeviceEnv)) { DP("Symbol '%s' has size %u, expected at most %zu.\n", sym(), SI.Size, sizeof(HostDeviceEnv)); return; } Valid = true; } bool inImage() { return SI.ShType != SHT_NOBITS; } hsa_status_t beforeLoading(void *Data, size_t Size) { if (Valid) { if (inImage()) { DP("Setting global device environment before load (%u bytes)\n", SI.Size); uint64_t Offset = (char *)SI.Addr - (char *)Image->ImageStart; void *Pos = (char *)Data + Offset; memcpy(Pos, &HostDeviceEnv, SI.Size); } } return HSA_STATUS_SUCCESS; } hsa_status_t afterLoading() { if (Valid) { if (!inImage()) { DP("Setting global device environment after load (%u bytes)\n", SI.Size); int DeviceId = HostDeviceEnv.DeviceNum; auto &SymbolInfo = DeviceInfo.SymbolInfoTable[DeviceId]; void *StatePtr; uint32_t StatePtrSize; hsa_status_t Err = interop_hsa_get_symbol_info( SymbolInfo, DeviceId, sym(), &StatePtr, &StatePtrSize); if (Err != HSA_STATUS_SUCCESS) { DP("failed to find %s in loaded image\n", sym()); return Err; } if (StatePtrSize != SI.Size) { DP("Symbol had size %u before loading, %u after\n", StatePtrSize, SI.Size); return HSA_STATUS_ERROR; } return DeviceInfo.freesignalpoolMemcpyH2D(StatePtr, &HostDeviceEnv, StatePtrSize, DeviceId); } } return HSA_STATUS_SUCCESS; } }; hsa_status_t implCalloc(void **RetPtr, size_t Size, int DeviceId) { uint64_t Rounded = 4 * ((Size + 3) / 4); void *Ptr; hsa_amd_memory_pool_t MemoryPool = DeviceInfo.getDeviceMemoryPool(DeviceId); hsa_status_t Err = hsa_amd_memory_pool_allocate(MemoryPool, Rounded, 0, &Ptr); if (Err != HSA_STATUS_SUCCESS) { return Err; } hsa_status_t Rc = hsa_amd_memory_fill(Ptr, 0, Rounded / 4); if (Rc != HSA_STATUS_SUCCESS) { DP("zero fill device_state failed with %u\n", Rc); core::Runtime::Memfree(Ptr); return HSA_STATUS_ERROR; } *RetPtr = Ptr; return HSA_STATUS_SUCCESS; } bool imageContainsSymbol(void *Data, size_t Size, const char *Sym) { SymbolInfo SI; int Rc = getSymbolInfoWithoutLoading((char *)Data, Size, Sym, &SI); return (Rc == 0) && (SI.Addr != nullptr); } } // namespace namespace core { hsa_status_t allow_access_to_all_gpu_agents(void *Ptr) { return hsa_amd_agents_allow_access(DeviceInfo.HSAAgents.size(), &DeviceInfo.HSAAgents[0], NULL, Ptr); } } // namespace core extern "C" { int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image) { return elfMachineIdIsAmdgcn(Image); } int __tgt_rtl_number_of_devices() { // If the construction failed, no methods are safe to call if (DeviceInfo.ConstructionSucceeded) { return DeviceInfo.NumberOfDevices; } DP("AMDGPU plugin construction failed. Zero devices available\n"); return 0; } int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) { DP("Init requires flags to %ld\n", RequiresFlags); DeviceInfo.RequiresFlags = RequiresFlags; return RequiresFlags; } int32_t __tgt_rtl_init_device(int DeviceId) { hsa_status_t Err; // this is per device id init DP("Initialize the device id: %d\n", DeviceId); hsa_agent_t Agent = DeviceInfo.HSAAgents[DeviceId]; // Get number of Compute Unit uint32_t ComputeUnits = 0; Err = hsa_agent_get_info( Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &ComputeUnits); if (Err != HSA_STATUS_SUCCESS) { DeviceInfo.ComputeUnits[DeviceId] = 1; DP("Error getting compute units : settiing to 1\n"); } else { DeviceInfo.ComputeUnits[DeviceId] = ComputeUnits; DP("Using %d compute unis per grid\n", DeviceInfo.ComputeUnits[DeviceId]); } char GetInfoName[64]; // 64 max size returned by get info Err = hsa_agent_get_info(Agent, (hsa_agent_info_t)HSA_AGENT_INFO_NAME, (void *)GetInfoName); if (Err) DeviceInfo.GPUName[DeviceId] = "--unknown gpu--"; else { DeviceInfo.GPUName[DeviceId] = GetInfoName; } if (print_kernel_trace & STARTUP_DETAILS) DP("Device#%-2d CU's: %2d %s\n", DeviceId, DeviceInfo.ComputeUnits[DeviceId], DeviceInfo.GPUName[DeviceId].c_str()); // Query attributes to determine number of threads/block and blocks/grid. uint16_t WorkgroupMaxDim[3]; Err = hsa_agent_get_info(Agent, HSA_AGENT_INFO_WORKGROUP_MAX_DIM, &WorkgroupMaxDim); if (Err != HSA_STATUS_SUCCESS) { DeviceInfo.GroupsPerDevice[DeviceId] = RTLDeviceInfoTy::DefaultNumTeams; DP("Error getting grid dims: num groups : %d\n", RTLDeviceInfoTy::DefaultNumTeams); } else if (WorkgroupMaxDim[0] <= RTLDeviceInfoTy::HardTeamLimit) { DeviceInfo.GroupsPerDevice[DeviceId] = WorkgroupMaxDim[0]; DP("Using %d ROCm blocks per grid\n", DeviceInfo.GroupsPerDevice[DeviceId]); } else { DeviceInfo.GroupsPerDevice[DeviceId] = RTLDeviceInfoTy::HardTeamLimit; DP("Max ROCm blocks per grid %d exceeds the hard team limit %d, capping " "at the hard limit\n", WorkgroupMaxDim[0], RTLDeviceInfoTy::HardTeamLimit); } // Get thread limit hsa_dim3_t GridMaxDim; Err = hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_DIM, &GridMaxDim); if (Err == HSA_STATUS_SUCCESS) { DeviceInfo.ThreadsPerGroup[DeviceId] = reinterpret_cast(&GridMaxDim)[0] / DeviceInfo.GroupsPerDevice[DeviceId]; if (DeviceInfo.ThreadsPerGroup[DeviceId] == 0) { DeviceInfo.ThreadsPerGroup[DeviceId] = RTLDeviceInfoTy::MaxWgSize; DP("Default thread limit: %d\n", RTLDeviceInfoTy::MaxWgSize); } else if (enforceUpperBound(&DeviceInfo.ThreadsPerGroup[DeviceId], RTLDeviceInfoTy::MaxWgSize)) { DP("Capped thread limit: %d\n", RTLDeviceInfoTy::MaxWgSize); } else { DP("Using ROCm Queried thread limit: %d\n", DeviceInfo.ThreadsPerGroup[DeviceId]); } } else { DeviceInfo.ThreadsPerGroup[DeviceId] = RTLDeviceInfoTy::MaxWgSize; DP("Error getting max block dimension, use default:%d \n", RTLDeviceInfoTy::MaxWgSize); } // Get wavefront size uint32_t WavefrontSize = 0; Err = hsa_agent_get_info(Agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &WavefrontSize); if (Err == HSA_STATUS_SUCCESS) { DP("Queried wavefront size: %d\n", WavefrontSize); DeviceInfo.WarpSize[DeviceId] = WavefrontSize; } else { // TODO: Burn the wavefront size into the code object DP("Warning: Unknown wavefront size, assuming 64\n"); DeviceInfo.WarpSize[DeviceId] = 64; } // Adjust teams to the env variables if (DeviceInfo.Env.TeamLimit > 0 && (enforceUpperBound(&DeviceInfo.GroupsPerDevice[DeviceId], DeviceInfo.Env.TeamLimit))) { DP("Capping max groups per device to OMP_TEAM_LIMIT=%d\n", DeviceInfo.Env.TeamLimit); } // Set default number of teams if (DeviceInfo.Env.NumTeams > 0) { DeviceInfo.NumTeams[DeviceId] = DeviceInfo.Env.NumTeams; DP("Default number of teams set according to environment %d\n", DeviceInfo.Env.NumTeams); } else { char *TeamsPerCUEnvStr = getenv("OMP_TARGET_TEAMS_PER_PROC"); int TeamsPerCU = DefaultTeamsPerCU; if (TeamsPerCUEnvStr) { TeamsPerCU = std::stoi(TeamsPerCUEnvStr); } DeviceInfo.NumTeams[DeviceId] = TeamsPerCU * DeviceInfo.ComputeUnits[DeviceId]; DP("Default number of teams = %d * number of compute units %d\n", TeamsPerCU, DeviceInfo.ComputeUnits[DeviceId]); } if (enforceUpperBound(&DeviceInfo.NumTeams[DeviceId], DeviceInfo.GroupsPerDevice[DeviceId])) { DP("Default number of teams exceeds device limit, capping at %d\n", DeviceInfo.GroupsPerDevice[DeviceId]); } // Adjust threads to the env variables if (DeviceInfo.Env.TeamThreadLimit > 0 && (enforceUpperBound(&DeviceInfo.NumThreads[DeviceId], DeviceInfo.Env.TeamThreadLimit))) { DP("Capping max number of threads to OMP_TEAMS_THREAD_LIMIT=%d\n", DeviceInfo.Env.TeamThreadLimit); } // Set default number of threads DeviceInfo.NumThreads[DeviceId] = RTLDeviceInfoTy::DefaultWgSize; DP("Default number of threads set according to library's default %d\n", RTLDeviceInfoTy::DefaultWgSize); if (enforceUpperBound(&DeviceInfo.NumThreads[DeviceId], DeviceInfo.ThreadsPerGroup[DeviceId])) { DP("Default number of threads exceeds device limit, capping at %d\n", DeviceInfo.ThreadsPerGroup[DeviceId]); } DP("Device %d: default limit for groupsPerDevice %d & threadsPerGroup %d\n", DeviceId, DeviceInfo.GroupsPerDevice[DeviceId], DeviceInfo.ThreadsPerGroup[DeviceId]); DP("Device %d: wavefront size %d, total threads %d x %d = %d\n", DeviceId, DeviceInfo.WarpSize[DeviceId], DeviceInfo.ThreadsPerGroup[DeviceId], DeviceInfo.GroupsPerDevice[DeviceId], DeviceInfo.GroupsPerDevice[DeviceId] * DeviceInfo.ThreadsPerGroup[DeviceId]); return OFFLOAD_SUCCESS; } static __tgt_target_table * __tgt_rtl_load_binary_locked(int32_t DeviceId, __tgt_device_image *Image); __tgt_target_table *__tgt_rtl_load_binary(int32_t DeviceId, __tgt_device_image *Image) { DeviceInfo.LoadRunLock.lock(); __tgt_target_table *Res = __tgt_rtl_load_binary_locked(DeviceId, Image); DeviceInfo.LoadRunLock.unlock(); return Res; } __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId, __tgt_device_image *Image) { // This function loads the device image onto gpu[DeviceId] and does other // per-image initialization work. Specifically: // // - Initialize an DeviceEnvironmentTy instance embedded in the // image at the symbol "omptarget_device_environment" // Fields DebugKind, DeviceNum, NumDevices. Used by the deviceRTL. // // - Allocate a large array per-gpu (could be moved to init_device) // - Read a uint64_t at symbol omptarget_nvptx_device_State_size // - Allocate at least that many bytes of gpu memory // - Zero initialize it // - Write the pointer to the symbol omptarget_nvptx_device_State // // - Pulls some per-kernel information together from various sources and // records it in the KernelsList for quicker access later // // The initialization can be done before or after loading the image onto the // gpu. This function presently does a mixture. Using the hsa api to get/set // the information is simpler to implement, in exchange for more complicated // runtime behaviour. E.g. launching a kernel or using dma to get eight bytes // back from the gpu vs a hashtable lookup on the host. const size_t ImgSize = (char *)Image->ImageEnd - (char *)Image->ImageStart; DeviceInfo.clearOffloadEntriesTable(DeviceId); // We do not need to set the ELF version because the caller of this function // had to do that to decide the right runtime to use if (!elfMachineIdIsAmdgcn(Image)) return NULL; { auto Env = DeviceEnvironment(DeviceId, DeviceInfo.NumberOfDevices, DeviceInfo.Env.DynamicMemSize, Image, ImgSize); auto &KernelInfo = DeviceInfo.KernelInfoTable[DeviceId]; auto &SymbolInfo = DeviceInfo.SymbolInfoTable[DeviceId]; hsa_status_t Err = moduleRegisterFromMemoryToPlace( KernelInfo, SymbolInfo, (void *)Image->ImageStart, ImgSize, DeviceId, [&](void *Data, size_t Size) { if (imageContainsSymbol(Data, Size, "needs_hostcall_buffer")) { __atomic_store_n(&DeviceInfo.HostcallRequired, true, __ATOMIC_RELEASE); } return Env.beforeLoading(Data, Size); }, DeviceInfo.HSAExecutables); check("Module registering", Err); if (Err != HSA_STATUS_SUCCESS) { const char *DeviceName = DeviceInfo.GPUName[DeviceId].c_str(); const char *ElfName = get_elf_mach_gfx_name(elfEFlags(Image)); if (strcmp(DeviceName, ElfName) != 0) { DP("Possible gpu arch mismatch: device:%s, image:%s please check" " compiler flag: -march=\n", DeviceName, ElfName); } else { DP("Error loading image onto GPU: %s\n", get_error_string(Err)); } return NULL; } Err = Env.afterLoading(); if (Err != HSA_STATUS_SUCCESS) { return NULL; } } DP("AMDGPU module successfully loaded!\n"); { // the device_State array is either large value in bss or a void* that // needs to be assigned to a pointer to an array of size device_state_bytes // If absent, it has been deadstripped and needs no setup. void *StatePtr; uint32_t StatePtrSize; auto &SymbolInfoMap = DeviceInfo.SymbolInfoTable[DeviceId]; hsa_status_t Err = interop_hsa_get_symbol_info( SymbolInfoMap, DeviceId, "omptarget_nvptx_device_State", &StatePtr, &StatePtrSize); if (Err != HSA_STATUS_SUCCESS) { DP("No device_state symbol found, skipping initialization\n"); } else { if (StatePtrSize < sizeof(void *)) { DP("unexpected size of state_ptr %u != %zu\n", StatePtrSize, sizeof(void *)); return NULL; } // if it's larger than a void*, assume it's a bss array and no further // initialization is required. Only try to set up a pointer for // sizeof(void*) if (StatePtrSize == sizeof(void *)) { uint64_t DeviceStateBytes = getDeviceStateBytes((char *)Image->ImageStart, ImgSize); if (DeviceStateBytes == 0) { DP("Can't initialize device_State, missing size information\n"); return NULL; } auto &DSS = DeviceInfo.DeviceStateStore[DeviceId]; if (DSS.first.get() == nullptr) { assert(DSS.second == 0); void *Ptr = NULL; hsa_status_t Err = implCalloc(&Ptr, DeviceStateBytes, DeviceId); if (Err != HSA_STATUS_SUCCESS) { DP("Failed to allocate device_state array\n"); return NULL; } DSS = { std::unique_ptr{Ptr}, DeviceStateBytes, }; } void *Ptr = DSS.first.get(); if (DeviceStateBytes != DSS.second) { DP("Inconsistent sizes of device_State unsupported\n"); return NULL; } // write ptr to device memory so it can be used by later kernels Err = DeviceInfo.freesignalpoolMemcpyH2D(StatePtr, &Ptr, sizeof(void *), DeviceId); if (Err != HSA_STATUS_SUCCESS) { DP("memcpy install of state_ptr failed\n"); return NULL; } } } } // Here, we take advantage of the data that is appended after img_end to get // the symbols' name we need to load. This data consist of the host entries // begin and end as well as the target name (see the offloading linker script // creation in clang compiler). // Find the symbols in the module by name. The name can be obtain by // concatenating the host entry name with the target name __tgt_offload_entry *HostBegin = Image->EntriesBegin; __tgt_offload_entry *HostEnd = Image->EntriesEnd; for (__tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) { if (!E->addr) { // The host should have always something in the address to // uniquely identify the target region. DP("Analyzing host entry '' (size = %lld)...\n", (unsigned long long)E->size); return NULL; } if (E->size) { __tgt_offload_entry Entry = *E; void *Varptr; uint32_t Varsize; auto &SymbolInfoMap = DeviceInfo.SymbolInfoTable[DeviceId]; hsa_status_t Err = interop_hsa_get_symbol_info( SymbolInfoMap, DeviceId, E->name, &Varptr, &Varsize); if (Err != HSA_STATUS_SUCCESS) { // Inform the user what symbol prevented offloading DP("Loading global '%s' (Failed)\n", E->name); return NULL; } if (Varsize != E->size) { DP("Loading global '%s' - size mismatch (%u != %lu)\n", E->name, Varsize, E->size); return NULL; } DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n", DPxPTR(E - HostBegin), E->name, DPxPTR(Varptr)); Entry.addr = (void *)Varptr; DeviceInfo.addOffloadEntry(DeviceId, Entry); if (DeviceInfo.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && E->flags & OMP_DECLARE_TARGET_LINK) { // If unified memory is present any target link variables // can access host addresses directly. There is no longer a // need for device copies. Err = DeviceInfo.freesignalpoolMemcpyH2D(Varptr, E->addr, sizeof(void *), DeviceId); if (Err != HSA_STATUS_SUCCESS) DP("Error when copying USM\n"); DP("Copy linked variable host address (" DPxMOD ")" "to device address (" DPxMOD ")\n", DPxPTR(*((void **)E->addr)), DPxPTR(Varptr)); } continue; } DP("to find the kernel name: %s size: %lu\n", E->name, strlen(E->name)); // errors in kernarg_segment_size previously treated as = 0 (or as undef) uint32_t KernargSegmentSize = 0; auto &KernelInfoMap = DeviceInfo.KernelInfoTable[DeviceId]; hsa_status_t Err = HSA_STATUS_SUCCESS; if (!E->name) { Err = HSA_STATUS_ERROR; } else { std::string KernelStr = std::string(E->name); auto It = KernelInfoMap.find(KernelStr); if (It != KernelInfoMap.end()) { atl_kernel_info_t Info = It->second; KernargSegmentSize = Info.kernel_segment_size; } else { Err = HSA_STATUS_ERROR; } } // default value GENERIC (in case symbol is missing from cubin file) llvm::omp::OMPTgtExecModeFlags ExecModeVal = llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC; // get flat group size if present, else Default_WG_Size int16_t WGSizeVal = RTLDeviceInfoTy::DefaultWgSize; // get Kernel Descriptor if present. // Keep struct in sync wih getTgtAttributeStructQTy in CGOpenMPRuntime.cpp struct KernDescValType { uint16_t Version; uint16_t TSize; uint16_t WGSize; }; struct KernDescValType KernDescVal; std::string KernDescNameStr(E->name); KernDescNameStr += "_kern_desc"; const char *KernDescName = KernDescNameStr.c_str(); void *KernDescPtr; uint32_t KernDescSize; void *CallStackAddr = nullptr; Err = interopGetSymbolInfo((char *)Image->ImageStart, ImgSize, KernDescName, &KernDescPtr, &KernDescSize); if (Err == HSA_STATUS_SUCCESS) { if ((size_t)KernDescSize != sizeof(KernDescVal)) DP("Loading global computation properties '%s' - size mismatch (%u != " "%lu)\n", KernDescName, KernDescSize, sizeof(KernDescVal)); memcpy(&KernDescVal, KernDescPtr, (size_t)KernDescSize); // Check structure size against recorded size. if ((size_t)KernDescSize != KernDescVal.TSize) DP("KernDescVal size %lu does not match advertized size %d for '%s'\n", sizeof(KernDescVal), KernDescVal.TSize, KernDescName); DP("After loading global for %s KernDesc \n", KernDescName); DP("KernDesc: Version: %d\n", KernDescVal.Version); DP("KernDesc: TSize: %d\n", KernDescVal.TSize); DP("KernDesc: WG_Size: %d\n", KernDescVal.WGSize); if (KernDescVal.WGSize == 0) { KernDescVal.WGSize = RTLDeviceInfoTy::DefaultWgSize; DP("Setting KernDescVal.WG_Size to default %d\n", KernDescVal.WGSize); } WGSizeVal = KernDescVal.WGSize; DP("WGSizeVal %d\n", WGSizeVal); check("Loading KernDesc computation property", Err); } else { DP("Warning: Loading KernDesc '%s' - symbol not found, ", KernDescName); // Flat group size std::string WGSizeNameStr(E->name); WGSizeNameStr += "_wg_size"; const char *WGSizeName = WGSizeNameStr.c_str(); void *WGSizePtr; uint32_t WGSize; Err = interopGetSymbolInfo((char *)Image->ImageStart, ImgSize, WGSizeName, &WGSizePtr, &WGSize); if (Err == HSA_STATUS_SUCCESS) { if ((size_t)WGSize != sizeof(int16_t)) { DP("Loading global computation properties '%s' - size mismatch (%u " "!= " "%lu)\n", WGSizeName, WGSize, sizeof(int16_t)); return NULL; } memcpy(&WGSizeVal, WGSizePtr, (size_t)WGSize); DP("After loading global for %s WGSize = %d\n", WGSizeName, WGSizeVal); if (WGSizeVal < RTLDeviceInfoTy::DefaultWgSize || WGSizeVal > RTLDeviceInfoTy::MaxWgSize) { DP("Error wrong WGSize value specified in HSA code object file: " "%d\n", WGSizeVal); WGSizeVal = RTLDeviceInfoTy::DefaultWgSize; } } else { DP("Warning: Loading WGSize '%s' - symbol not found, " "using default value %d\n", WGSizeName, WGSizeVal); } check("Loading WGSize computation property", Err); } // Read execution mode from global in binary std::string ExecModeNameStr(E->name); ExecModeNameStr += "_exec_mode"; const char *ExecModeName = ExecModeNameStr.c_str(); void *ExecModePtr; uint32_t VarSize; Err = interopGetSymbolInfo((char *)Image->ImageStart, ImgSize, ExecModeName, &ExecModePtr, &VarSize); if (Err == HSA_STATUS_SUCCESS) { if ((size_t)VarSize != sizeof(llvm::omp::OMPTgtExecModeFlags)) { DP("Loading global computation properties '%s' - size mismatch(%u != " "%lu)\n", ExecModeName, VarSize, sizeof(llvm::omp::OMPTgtExecModeFlags)); return NULL; } memcpy(&ExecModeVal, ExecModePtr, (size_t)VarSize); DP("After loading global for %s ExecMode = %d\n", ExecModeName, ExecModeVal); if (ExecModeVal < 0 || ExecModeVal > llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD) { DP("Error wrong exec_mode value specified in HSA code object file: " "%d\n", ExecModeVal); return NULL; } } else { DP("Loading global exec_mode '%s' - symbol missing, using default " "value " "GENERIC (1)\n", ExecModeName); } check("Loading computation property", Err); KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, DeviceId, CallStackAddr, E->name, KernargSegmentSize, DeviceInfo.KernArgPool)); __tgt_offload_entry Entry = *E; Entry.addr = (void *)&KernelsList.back(); DeviceInfo.addOffloadEntry(DeviceId, Entry); DP("Entry point %ld maps to %s\n", E - HostBegin, E->name); } return DeviceInfo.getOffloadEntriesTable(DeviceId); } void *__tgt_rtl_data_alloc(int DeviceId, int64_t Size, void *, int32_t Kind) { void *Ptr = NULL; assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large"); if (Kind != TARGET_ALLOC_DEFAULT) { REPORT("Invalid target data allocation kind or requested allocator not " "implemented yet\n"); return NULL; } hsa_amd_memory_pool_t MemoryPool = DeviceInfo.getDeviceMemoryPool(DeviceId); hsa_status_t Err = hsa_amd_memory_pool_allocate(MemoryPool, Size, 0, &Ptr); DP("Tgt alloc data %ld bytes, (tgt:%016llx).\n", Size, (long long unsigned)(Elf64_Addr)Ptr); Ptr = (Err == HSA_STATUS_SUCCESS) ? Ptr : NULL; return Ptr; } int32_t __tgt_rtl_data_submit(int DeviceId, void *TgtPtr, void *HstPtr, int64_t Size) { assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large"); __tgt_async_info AsyncInfo; int32_t Rc = dataSubmit(DeviceId, TgtPtr, HstPtr, Size, &AsyncInfo); if (Rc != OFFLOAD_SUCCESS) return OFFLOAD_FAIL; return __tgt_rtl_synchronize(DeviceId, &AsyncInfo); } int32_t __tgt_rtl_data_submit_async(int DeviceId, void *TgtPtr, void *HstPtr, int64_t Size, __tgt_async_info *AsyncInfo) { assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large"); if (AsyncInfo) { initAsyncInfo(AsyncInfo); return dataSubmit(DeviceId, TgtPtr, HstPtr, Size, AsyncInfo); } return __tgt_rtl_data_submit(DeviceId, TgtPtr, HstPtr, Size); } int32_t __tgt_rtl_data_retrieve(int DeviceId, void *HstPtr, void *TgtPtr, int64_t Size) { assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large"); __tgt_async_info AsyncInfo; int32_t Rc = dataRetrieve(DeviceId, HstPtr, TgtPtr, Size, &AsyncInfo); if (Rc != OFFLOAD_SUCCESS) return OFFLOAD_FAIL; return __tgt_rtl_synchronize(DeviceId, &AsyncInfo); } int32_t __tgt_rtl_data_retrieve_async(int DeviceId, void *HstPtr, void *TgtPtr, int64_t Size, __tgt_async_info *AsyncInfo) { assert(AsyncInfo && "AsyncInfo is nullptr"); assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large"); initAsyncInfo(AsyncInfo); return dataRetrieve(DeviceId, HstPtr, TgtPtr, Size, AsyncInfo); } int32_t __tgt_rtl_data_delete(int DeviceId, void *TgtPtr) { assert(DeviceId < DeviceInfo.NumberOfDevices && "Device ID too large"); hsa_status_t Err; DP("Tgt free data (tgt:%016llx).\n", (long long unsigned)(Elf64_Addr)TgtPtr); Err = core::Runtime::Memfree(TgtPtr); if (Err != HSA_STATUS_SUCCESS) { DP("Error when freeing CUDA memory\n"); return OFFLOAD_FAIL; } return OFFLOAD_SUCCESS; } int32_t __tgt_rtl_run_target_team_region(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs, ptrdiff_t *TgtOffsets, int32_t ArgNum, int32_t NumTeams, int32_t ThreadLimit, uint64_t LoopTripcount) { DeviceInfo.LoadRunLock.lock_shared(); int32_t Res = runRegionLocked(DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets, ArgNum, NumTeams, ThreadLimit, LoopTripcount); DeviceInfo.LoadRunLock.unlock_shared(); return Res; } int32_t __tgt_rtl_run_target_region(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs, ptrdiff_t *TgtOffsets, int32_t ArgNum) { // use one team and one thread // fix thread num int32_t TeamNum = 1; int32_t ThreadLimit = 0; // use default return __tgt_rtl_run_target_team_region(DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets, ArgNum, TeamNum, ThreadLimit, 0); } int32_t __tgt_rtl_run_target_team_region_async( int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs, ptrdiff_t *TgtOffsets, int32_t ArgNum, int32_t NumTeams, int32_t ThreadLimit, uint64_t LoopTripcount, __tgt_async_info *AsyncInfo) { assert(AsyncInfo && "AsyncInfo is nullptr"); initAsyncInfo(AsyncInfo); DeviceInfo.LoadRunLock.lock_shared(); int32_t Res = runRegionLocked(DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets, ArgNum, NumTeams, ThreadLimit, LoopTripcount); DeviceInfo.LoadRunLock.unlock_shared(); return Res; } int32_t __tgt_rtl_run_target_region_async(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs, ptrdiff_t *TgtOffsets, int32_t ArgNum, __tgt_async_info *AsyncInfo) { // use one team and one thread // fix thread num int32_t TeamNum = 1; int32_t ThreadLimit = 0; // use default return __tgt_rtl_run_target_team_region_async(DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets, ArgNum, TeamNum, ThreadLimit, 0, AsyncInfo); } int32_t __tgt_rtl_synchronize(int32_t DeviceId, __tgt_async_info *AsyncInfo) { assert(AsyncInfo && "AsyncInfo is nullptr"); // Cuda asserts that AsyncInfo->Queue is non-null, but this invariant // is not ensured by devices.cpp for amdgcn // assert(AsyncInfo->Queue && "AsyncInfo->Queue is nullptr"); if (AsyncInfo->Queue) { finiAsyncInfo(AsyncInfo); } return OFFLOAD_SUCCESS; } void __tgt_rtl_print_device_info(int32_t DeviceId) { // TODO: Assertion to see if DeviceId is correct // NOTE: We don't need to set context for print device info. DeviceInfo.printDeviceInfo(DeviceId, DeviceInfo.HSAAgents[DeviceId]); } } // extern "C"