//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// RTL for CUDA machine
//
//===----------------------------------------------------------------------===//

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cuda.h>
#include <list>
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>
#include <vector>

#include "Debug.h"
#include "DeviceEnvironment.h"
#include "omptargetplugin.h"

#define TARGET_NAME CUDA
#define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"

#include "MemoryManager.h"

#include "llvm/Frontend/OpenMP/OMPConstants.h"

// Utility for retrieving and printing CUDA error string.
#ifdef OMPTARGET_DEBUG
#define CUDA_ERR_STRING(err)                                                   \
  do {                                                                         \
    if (getDebugLevel() > 0) {                                                 \
      const char *errStr = nullptr;                                            \
      CUresult errStr_status = cuGetErrorString(err, &errStr);                 \
      if (errStr_status == CUDA_ERROR_INVALID_VALUE)                           \
        REPORT("Unrecognized CUDA error code: %d\n", err);                     \
      else if (errStr_status == CUDA_SUCCESS)                                  \
        REPORT("CUDA error is: %s\n", errStr);                                 \
      else {                                                                   \
        REPORT("Unresolved CUDA error code: %d\n", err);                       \
        REPORT("Unsuccessful cuGetErrorString return status: %d\n",            \
               errStr_status);                                                 \
      }                                                                        \
    } else {                                                                   \
      const char *errStr = nullptr;                                            \
      CUresult errStr_status = cuGetErrorString(err, &errStr);                 \
      if (errStr_status == CUDA_SUCCESS)                                       \
        REPORT("%s \n", errStr);                                               \
    }                                                                          \
  } while (false)
#else // OMPTARGET_DEBUG
#define CUDA_ERR_STRING(err)                                                   \
  do {                                                                         \
    const char *errStr = nullptr;                                              \
    CUresult errStr_status = cuGetErrorString(err, &errStr);                   \
    if (errStr_status == CUDA_SUCCESS)                                         \
      REPORT("%s \n", errStr);                                                 \
  } while (false)
#endif // OMPTARGET_DEBUG

#define BOOL2TEXT(b) ((b) ? "Yes" : "No")

#include "elf_common.h"

/// Keep entries table per device.
struct FuncOrGblEntryTy {
  __tgt_target_table Table;
  std::vector<__tgt_offload_entry> Entries;
};

/// Use a single entity to encode a kernel and a set of flags.
struct KernelTy {
  CUfunction Func;

  // Execution mode of the kernel.
  llvm::omp::OMPTgtExecModeFlags ExecutionMode;

  /// Maximal number of threads per block for this kernel.
  int MaxThreadsPerBlock = 0;

  KernelTy(CUfunction _Func, llvm::omp::OMPTgtExecModeFlags _ExecutionMode)
      : Func(_Func), ExecutionMode(_ExecutionMode) {}
};

namespace {
bool checkResult(CUresult Err, const char *ErrMsg) {
  if (Err == CUDA_SUCCESS)
    return true;

  REPORT("%s", ErrMsg);
  CUDA_ERR_STRING(Err);
  return false;
}

int memcpyDtoD(const void *SrcPtr, void *DstPtr, int64_t Size,
               CUstream Stream) {
  CUresult Err =
      cuMemcpyDtoDAsync((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, Size, Stream);

  if (Err != CUDA_SUCCESS) {
    DP("Error when copying data from device to device. Pointers: src "
       "= " DPxMOD ", dst = " DPxMOD ", size = %" PRId64 "\n",
       DPxPTR(SrcPtr), DPxPTR(DstPtr), Size);
    CUDA_ERR_STRING(Err);
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

int createEvent(void **P) {
  CUevent Event = nullptr;

  CUresult Err = cuEventCreate(&Event, CU_EVENT_DEFAULT);
  if (Err != CUDA_SUCCESS) {
    DP("Error when creating event. Event = " DPxMOD "\n", DPxPTR(Event));
    CUDA_ERR_STRING(Err);
    return OFFLOAD_FAIL;
  }

  *P = Event;

  return OFFLOAD_SUCCESS;
}
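// The event helpers below operate on the opaque void * handles used by the
// libomptarget plugin interface and cast them back to CUevent/CUstream. They
// back the __tgt_rtl_create_event, __tgt_rtl_record_event,
// __tgt_rtl_sync_event, and __tgt_rtl_destroy_event entry points at the end
// of this file.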
stream = " DPxMOD ", event = " DPxMOD "\n", DPxPTR(Stream), DPxPTR(Event)); CUDA_ERR_STRING(Err); return OFFLOAD_FAIL; } return OFFLOAD_SUCCESS; } int syncEvent(void *EventPtr) { CUevent Event = reinterpret_cast(EventPtr); CUresult Err = cuEventSynchronize(Event); if (Err != CUDA_SUCCESS) { DP("Error when syncing event = " DPxMOD "\n", DPxPTR(Event)); CUDA_ERR_STRING(Err); return OFFLOAD_FAIL; } return OFFLOAD_SUCCESS; } int destroyEvent(void *EventPtr) { CUevent Event = reinterpret_cast(EventPtr); CUresult Err = cuEventDestroy(Event); if (Err != CUDA_SUCCESS) { DP("Error when destroying event = " DPxMOD "\n", DPxPTR(Event)); CUDA_ERR_STRING(Err); return OFFLOAD_FAIL; } return OFFLOAD_SUCCESS; } // Structure contains per-device data struct DeviceDataTy { /// List that contains all the kernels. std::list KernelsList; std::list FuncGblEntries; CUcontext Context = nullptr; // Device properties int ThreadsPerBlock = 0; int BlocksPerGrid = 0; int WarpSize = 0; // OpenMP properties int NumTeams = 0; int NumThreads = 0; }; class StreamManagerTy { int NumberOfDevices; // The initial size of stream pool int EnvNumInitialStreams; // Per-device stream mutex std::vector> StreamMtx; // Per-device stream Id indicates the next available stream in the pool std::vector NextStreamId; // Per-device stream pool std::vector> StreamPool; // Reference to per-device data std::vector &DeviceData; // If there is no CUstream left in the pool, we will resize the pool to // allocate more CUstream. This function should be called with device mutex, // and we do not resize to smaller one. void resizeStreamPool(const int DeviceId, const size_t NewSize) { std::vector &Pool = StreamPool[DeviceId]; const size_t CurrentSize = Pool.size(); assert(NewSize > CurrentSize && "new size is not larger than current size"); CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) { // We will return if cannot switch to the right context in case of // creating bunch of streams that are not corresponding to the right // device. The offloading will fail later because selected CUstream is // nullptr. return; } Pool.resize(NewSize, nullptr); for (size_t I = CurrentSize; I < NewSize; ++I) { checkResult(cuStreamCreate(&Pool[I], CU_STREAM_NON_BLOCKING), "Error returned from cuStreamCreate\n"); } } public: StreamManagerTy(const int NumberOfDevices, std::vector &DeviceData) : NumberOfDevices(NumberOfDevices), EnvNumInitialStreams(32), DeviceData(DeviceData) { StreamPool.resize(NumberOfDevices); NextStreamId.resize(NumberOfDevices); StreamMtx.resize(NumberOfDevices); if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS")) EnvNumInitialStreams = std::stoi(EnvStr); // Initialize the next stream id std::fill(NextStreamId.begin(), NextStreamId.end(), 0); // Initialize stream mutex for (std::unique_ptr &Ptr : StreamMtx) Ptr = std::make_unique(); } ~StreamManagerTy() { // Destroy streams for (int I = 0; I < NumberOfDevices; ++I) { checkResult(cuCtxSetCurrent(DeviceData[I].Context), "Error returned from cuCtxSetCurrent\n"); for (CUstream &S : StreamPool[I]) { if (S) checkResult(cuStreamDestroy(S), "Error returned from cuStreamDestroy\n"); } } } // Get a CUstream from pool. Per-device next stream id always points to the // next available CUstream. That means, CUstreams [0, id-1] have been // assigned, and [id,] are still available. If there is no CUstream left, we // will ask more CUstreams from CUDA RT. Each time a CUstream is assigned, // the id will increase one. 
  // Get a CUstream from the pool. The per-device next stream id always points
  // to the next available CUstream: CUstreams [0, id-1] have been handed out,
  // and [id, ...] are still available. If there is no CUstream left, we ask
  // the CUDA RT for more. Each time a CUstream is assigned, the id increases
  // by one.
  // xxxxxs+++++++++
  //      ^
  //      id
  // After assignment, the pool becomes the following and s is assigned.
  // xxxxxs+++++++++
  //       ^
  //       id
  CUstream getStream(const int DeviceId) {
    const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
    int &Id = NextStreamId[DeviceId];
    // No CUstream left in the pool, we need to request more from the CUDA RT.
    if (Id == static_cast<int>(StreamPool[DeviceId].size())) {
      // By default we double the stream pool every time.
      resizeStreamPool(DeviceId, Id * 2);
    }
    return StreamPool[DeviceId][Id++];
  }

  // Return a CUstream back to the pool. As mentioned above, the per-device
  // next stream id always points to the next available CUstream, so when we
  // return a CUstream, we first decrease the id and then copy the CUstream
  // back.
  // It is worth noting that the order in which streams are returned may differ
  // from the order in which they were assigned, so at some point there might
  // be two identical CUstreams:
  // xxax+a+++++
  //     ^
  //     id
  // However, it doesn't matter, because they're always on the two sides of
  // id. The left one will eventually be overwritten by another CUstream.
  // Therefore, after several executions, the order of the pool might differ
  // from its initial state.
  void returnStream(const int DeviceId, CUstream Stream) {
    const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
    int &Id = NextStreamId[DeviceId];
    assert(Id > 0 && "Wrong stream ID");
    StreamPool[DeviceId][--Id] = Stream;
  }

  bool initializeDeviceStreamPool(const int DeviceId) {
    assert(StreamPool[DeviceId].empty() && "stream pool has been initialized");

    resizeStreamPool(DeviceId, EnvNumInitialStreams);

    // Check the size of the stream pool.
    if (static_cast<int>(StreamPool[DeviceId].size()) != EnvNumInitialStreams)
      return false;

    // Check whether each stream is valid.
    for (CUstream &S : StreamPool[DeviceId])
      if (!S)
        return false;

    return true;
  }
};

class DeviceRTLTy {
  int NumberOfDevices;
  // OpenMP environment properties
  int EnvNumTeams;
  int EnvTeamLimit;
  int EnvTeamThreadLimit;
  // OpenMP requires flags
  int64_t RequiresFlags;
  // Amount of dynamic shared memory to use at launch.
  uint64_t DynamicMemorySize;

  static constexpr const int HardTeamLimit = 1U << 16U; // 64k
  static constexpr const int HardThreadLimit = 1024;
  static constexpr const int DefaultNumTeams = 128;
  static constexpr const int DefaultNumThreads = 128;

  std::unique_ptr<StreamManagerTy> StreamManager;
  std::vector<DeviceDataTy> DeviceData;
  std::vector<CUmodule> Modules;

  /// A class responsible for interacting with the device native runtime
  /// library to allocate and free memory.
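  /// It implements the DeviceAllocatorTy interface used by MemoryManagerTy:
  /// TARGET_ALLOC_DEFAULT/DEVICE requests go through cuMemAlloc,
  /// TARGET_ALLOC_HOST through cuMemAllocHost (pinned host memory), and
  /// TARGET_ALLOC_SHARED through cuMemAllocManaged. Host-pinned allocations
  /// are remembered so that free() can release them with cuMemFreeHost
  /// instead of cuMemFree.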
  class CUDADeviceAllocatorTy : public DeviceAllocatorTy {
    const int DeviceId;
    const std::vector<DeviceDataTy> &DeviceData;
    std::unordered_map<void *, TargetAllocTy> HostPinnedAllocs;

  public:
    CUDADeviceAllocatorTy(int DeviceId, std::vector<DeviceDataTy> &DeviceData)
        : DeviceId(DeviceId), DeviceData(DeviceData) {}

    void *allocate(size_t Size, void *, TargetAllocTy Kind) override {
      if (Size == 0)
        return nullptr;

      CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
      if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
        return nullptr;

      void *MemAlloc = nullptr;
      switch (Kind) {
      case TARGET_ALLOC_DEFAULT:
      case TARGET_ALLOC_DEVICE:
        CUdeviceptr DevicePtr;
        Err = cuMemAlloc(&DevicePtr, Size);
        MemAlloc = (void *)DevicePtr;
        if (!checkResult(Err, "Error returned from cuMemAlloc\n"))
          return nullptr;
        break;
      case TARGET_ALLOC_HOST:
        void *HostPtr;
        Err = cuMemAllocHost(&HostPtr, Size);
        MemAlloc = HostPtr;
        if (!checkResult(Err, "Error returned from cuMemAllocHost\n"))
          return nullptr;
        HostPinnedAllocs[MemAlloc] = Kind;
        break;
      case TARGET_ALLOC_SHARED:
        CUdeviceptr SharedPtr;
        Err = cuMemAllocManaged(&SharedPtr, Size, CU_MEM_ATTACH_GLOBAL);
        MemAlloc = (void *)SharedPtr;
        if (!checkResult(Err, "Error returned from cuMemAllocManaged\n"))
          return nullptr;
        break;
      }

      return MemAlloc;
    }

    int free(void *TgtPtr) override {
      CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
      if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
        return OFFLOAD_FAIL;

      // Host pinned memory must be freed differently.
      TargetAllocTy Kind =
          (HostPinnedAllocs.find(TgtPtr) == HostPinnedAllocs.end())
              ? TARGET_ALLOC_DEFAULT
              : TARGET_ALLOC_HOST;
      switch (Kind) {
      case TARGET_ALLOC_DEFAULT:
      case TARGET_ALLOC_DEVICE:
      case TARGET_ALLOC_SHARED:
        Err = cuMemFree((CUdeviceptr)TgtPtr);
        if (!checkResult(Err, "Error returned from cuMemFree\n"))
          return OFFLOAD_FAIL;
        break;
      case TARGET_ALLOC_HOST:
        Err = cuMemFreeHost(TgtPtr);
        if (!checkResult(Err, "Error returned from cuMemFreeHost\n"))
          return OFFLOAD_FAIL;
        break;
      }

      return OFFLOAD_SUCCESS;
    }
  };

  /// A vector of device allocators.
  std::vector<CUDADeviceAllocatorTy> DeviceAllocators;

  /// A vector of memory managers. Since the memory manager is non-copyable and
  /// non-removable, we wrap them into std::unique_ptr.
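  /// Each element is a std::unique_ptr<MemoryManagerTy>, one per device,
  /// created in the DeviceRTLTy constructor when the size threshold returned
  /// by MemoryManagerTy::getSizeThresholdFromEnv() enables it. dataAlloc() and
  /// dataDelete() route TARGET_ALLOC_DEFAULT and TARGET_ALLOC_DEVICE requests
  /// through the memory manager so device allocations below the threshold can
  /// be recycled instead of always calling cuMemAlloc/cuMemFree.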
std::vector> MemoryManagers; /// Whether use memory manager bool UseMemoryManager = true; // Record entry point associated with device void addOffloadEntry(const int DeviceId, const __tgt_offload_entry entry) { FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back(); E.Entries.push_back(entry); } // Return a pointer to the entry associated with the pointer const __tgt_offload_entry *getOffloadEntry(const int DeviceId, const void *Addr) const { for (const __tgt_offload_entry &Itr : DeviceData[DeviceId].FuncGblEntries.back().Entries) if (Itr.addr == Addr) return &Itr; return nullptr; } // Return the pointer to the target entries table __tgt_target_table *getOffloadEntriesTable(const int DeviceId) { FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back(); if (E.Entries.empty()) return nullptr; // Update table info according to the entries and return the pointer E.Table.EntriesBegin = E.Entries.data(); E.Table.EntriesEnd = E.Entries.data() + E.Entries.size(); return &E.Table; } // Clear entries table for a device void clearOffloadEntriesTable(const int DeviceId) { DeviceData[DeviceId].FuncGblEntries.emplace_back(); FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back(); E.Entries.clear(); E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr; } CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfo) const { assert(AsyncInfo && "AsyncInfo is nullptr"); if (!AsyncInfo->Queue) AsyncInfo->Queue = StreamManager->getStream(DeviceId); return reinterpret_cast(AsyncInfo->Queue); } public: // This class should not be copied DeviceRTLTy(const DeviceRTLTy &) = delete; DeviceRTLTy(DeviceRTLTy &&) = delete; DeviceRTLTy() : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1), EnvTeamThreadLimit(-1), RequiresFlags(OMP_REQ_UNDEFINED), DynamicMemorySize(0) { DP("Start initializing CUDA\n"); CUresult Err = cuInit(0); if (Err == CUDA_ERROR_INVALID_HANDLE) { // Can't call cuGetErrorString if dlsym failed DP("Failed to load CUDA shared library\n"); return; } if (!checkResult(Err, "Error returned from cuInit\n")) { return; } Err = cuDeviceGetCount(&NumberOfDevices); if (!checkResult(Err, "Error returned from cuDeviceGetCount\n")) return; if (NumberOfDevices == 0) { DP("There are no devices supporting CUDA.\n"); return; } DeviceData.resize(NumberOfDevices); // Get environment variables regarding teams if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) { // OMP_TEAM_LIMIT has been set EnvTeamLimit = std::stoi(EnvStr); DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit); } if (const char *EnvStr = getenv("OMP_TEAMS_THREAD_LIMIT")) { // OMP_TEAMS_THREAD_LIMIT has been set EnvTeamThreadLimit = std::stoi(EnvStr); DP("Parsed OMP_TEAMS_THREAD_LIMIT=%d\n", EnvTeamThreadLimit); } if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) { // OMP_NUM_TEAMS has been set EnvNumTeams = std::stoi(EnvStr); DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams); } if (const char *EnvStr = getenv("LIBOMPTARGET_SHARED_MEMORY_SIZE")) { // LIBOMPTARGET_SHARED_MEMORY_SIZE has been set DynamicMemorySize = std::stoi(EnvStr); DP("Parsed LIBOMPTARGET_SHARED_MEMORY_SIZE = %" PRIu64 "\n", DynamicMemorySize); } StreamManager = std::make_unique(NumberOfDevices, DeviceData); for (int I = 0; I < NumberOfDevices; ++I) DeviceAllocators.emplace_back(I, DeviceData); // Get the size threshold from environment variable std::pair Res = MemoryManagerTy::getSizeThresholdFromEnv(); UseMemoryManager = Res.second; size_t MemoryManagerThreshold = Res.first; if (UseMemoryManager) for (int I = 0; I < NumberOfDevices; ++I) 
MemoryManagers.emplace_back(std::make_unique( DeviceAllocators[I], MemoryManagerThreshold)); } ~DeviceRTLTy() { // We first destruct memory managers in case that its dependent data are // destroyed before it. for (auto &M : MemoryManagers) M.release(); StreamManager = nullptr; for (CUmodule &M : Modules) // Close module if (M) checkResult(cuModuleUnload(M), "Error returned from cuModuleUnload\n"); for (DeviceDataTy &D : DeviceData) { // Destroy context if (D.Context) { checkResult(cuCtxSetCurrent(D.Context), "Error returned from cuCtxSetCurrent\n"); CUdevice Device; checkResult(cuCtxGetDevice(&Device), "Error returned from cuCtxGetDevice\n"); checkResult(cuDevicePrimaryCtxRelease(Device), "Error returned from cuDevicePrimaryCtxRelease\n"); } } } // Check whether a given DeviceId is valid bool isValidDeviceId(const int DeviceId) const { return DeviceId >= 0 && DeviceId < NumberOfDevices; } int getNumOfDevices() const { return NumberOfDevices; } void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; } int initDevice(const int DeviceId) { CUdevice Device; DP("Getting device %d\n", DeviceId); CUresult Err = cuDeviceGet(&Device, DeviceId); if (!checkResult(Err, "Error returned from cuDeviceGet\n")) return OFFLOAD_FAIL; // Query the current flags of the primary context and set its flags if // it is inactive unsigned int FormerPrimaryCtxFlags = 0; int FormerPrimaryCtxIsActive = 0; Err = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags, &FormerPrimaryCtxIsActive); if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxGetState\n")) return OFFLOAD_FAIL; if (FormerPrimaryCtxIsActive) { DP("The primary context is active, no change to its flags\n"); if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) != CU_CTX_SCHED_BLOCKING_SYNC) DP("Warning the current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n"); } else { DP("The primary context is inactive, set its flags to " "CU_CTX_SCHED_BLOCKING_SYNC\n"); Err = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC); if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxSetFlags\n")) return OFFLOAD_FAIL; } // Retain the per device primary context and save it to use whenever this // device is selected. Err = cuDevicePrimaryCtxRetain(&DeviceData[DeviceId].Context, Device); if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxRetain\n")) return OFFLOAD_FAIL; Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) return OFFLOAD_FAIL; // Initialize stream pool if (!StreamManager->initializeDeviceStreamPool(DeviceId)) return OFFLOAD_FAIL; // Query attributes to determine number of threads/block and blocks/grid. int MaxGridDimX; Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, Device); if (Err != CUDA_SUCCESS) { DP("Error getting max grid dimension, use default value %d\n", DeviceRTLTy::DefaultNumTeams); DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::DefaultNumTeams; } else if (MaxGridDimX <= DeviceRTLTy::HardTeamLimit) { DP("Using %d CUDA blocks per grid\n", MaxGridDimX); DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX; } else { DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping " "at the hard limit\n", MaxGridDimX, DeviceRTLTy::HardTeamLimit); DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::HardTeamLimit; } // We are only exploiting threads along the x axis. 
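    // The per-block thread count reported by the device is clamped twice
    // below: first to OMP_TEAMS_THREAD_LIMIT (if set), then to the plugin's
    // HardThreadLimit of 1024 threads per block.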
int MaxBlockDimX; Err = cuDeviceGetAttribute(&MaxBlockDimX, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device); if (Err != CUDA_SUCCESS) { DP("Error getting max block dimension, use default value %d\n", DeviceRTLTy::DefaultNumThreads); DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads; } else { DP("Using %d CUDA threads per block\n", MaxBlockDimX); DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX; if (EnvTeamThreadLimit > 0 && DeviceData[DeviceId].ThreadsPerBlock > EnvTeamThreadLimit) { DP("Max CUDA threads per block %d exceeds the thread limit %d set by " "OMP_TEAMS_THREAD_LIMIT, capping at the limit\n", DeviceData[DeviceId].ThreadsPerBlock, EnvTeamThreadLimit); DeviceData[DeviceId].ThreadsPerBlock = EnvTeamThreadLimit; } if (DeviceData[DeviceId].ThreadsPerBlock > DeviceRTLTy::HardThreadLimit) { DP("Max CUDA threads per block %d exceeds the hard thread limit %d, " "capping at the hard limit\n", DeviceData[DeviceId].ThreadsPerBlock, DeviceRTLTy::HardThreadLimit); DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit; } } // Get and set warp size int WarpSize; Err = cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device); if (Err != CUDA_SUCCESS) { DP("Error getting warp size, assume default value 32\n"); DeviceData[DeviceId].WarpSize = 32; } else { DP("Using warp size %d\n", WarpSize); DeviceData[DeviceId].WarpSize = WarpSize; } // Adjust teams to the env variables if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) { DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n", EnvTeamLimit); DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit; } size_t StackLimit; size_t HeapLimit; if (const char *EnvStr = getenv("LIBOMPTARGET_STACK_SIZE")) { StackLimit = std::stol(EnvStr); if (cuCtxSetLimit(CU_LIMIT_STACK_SIZE, StackLimit) != CUDA_SUCCESS) return OFFLOAD_FAIL; } else { if (cuCtxGetLimit(&StackLimit, CU_LIMIT_STACK_SIZE) != CUDA_SUCCESS) return OFFLOAD_FAIL; } if (const char *EnvStr = getenv("LIBOMPTARGET_HEAP_SIZE")) { HeapLimit = std::stol(EnvStr); if (cuCtxSetLimit(CU_LIMIT_MALLOC_HEAP_SIZE, HeapLimit) != CUDA_SUCCESS) return OFFLOAD_FAIL; } else { if (cuCtxGetLimit(&HeapLimit, CU_LIMIT_MALLOC_HEAP_SIZE) != CUDA_SUCCESS) return OFFLOAD_FAIL; } INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, "Device supports up to %d CUDA blocks and %d threads with a " "warp size of %d\n", DeviceData[DeviceId].BlocksPerGrid, DeviceData[DeviceId].ThreadsPerBlock, DeviceData[DeviceId].WarpSize); INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, "Device heap size is %d Bytes, device stack size is %d Bytes per " "thread\n", (int)HeapLimit, (int)StackLimit); // Set default number of teams if (EnvNumTeams > 0) { DP("Default number of teams set according to environment %d\n", EnvNumTeams); DeviceData[DeviceId].NumTeams = EnvNumTeams; } else { DeviceData[DeviceId].NumTeams = DeviceRTLTy::DefaultNumTeams; DP("Default number of teams set according to library's default %d\n", DeviceRTLTy::DefaultNumTeams); } if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) { DP("Default number of teams exceeds device limit, capping at %d\n", DeviceData[DeviceId].BlocksPerGrid); DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid; } // Set default number of threads DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads; DP("Default number of threads set according to library's default %d\n", DeviceRTLTy::DefaultNumThreads); if (DeviceData[DeviceId].NumThreads > DeviceData[DeviceId].ThreadsPerBlock) { DP("Default number of threads 
exceeds device limit, capping at %d\n", DeviceData[DeviceId].ThreadsPerBlock); DeviceData[DeviceId].NumThreads = DeviceData[DeviceId].ThreadsPerBlock; } return OFFLOAD_SUCCESS; } __tgt_target_table *loadBinary(const int DeviceId, const __tgt_device_image *Image) { // Set the context we are using CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) return nullptr; // Clear the offload table as we are going to create a new one. clearOffloadEntriesTable(DeviceId); // Create the module and extract the function pointers. CUmodule Module; DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart)); Err = cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr); if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n")) return nullptr; DP("CUDA module successfully loaded!\n"); Modules.push_back(Module); // Find the symbols in the module by name. const __tgt_offload_entry *HostBegin = Image->EntriesBegin; const __tgt_offload_entry *HostEnd = Image->EntriesEnd; std::list &KernelsList = DeviceData[DeviceId].KernelsList; for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) { if (!E->addr) { // We return nullptr when something like this happens, the host should // have always something in the address to uniquely identify the target // region. DP("Invalid binary: host entry '' (size = %zd)...\n", E->size); return nullptr; } if (E->size) { __tgt_offload_entry Entry = *E; CUdeviceptr CUPtr; size_t CUSize; Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name); // We keep this style here because we need the name if (Err != CUDA_SUCCESS) { REPORT("Loading global '%s' Failed\n", E->name); CUDA_ERR_STRING(Err); return nullptr; } if (CUSize != E->size) { DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name, CUSize, E->size); return nullptr; } DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n", DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr)); Entry.addr = (void *)(CUPtr); // Note: In the current implementation declare target variables // can either be link or to. This means that once unified // memory is activated via the requires directive, the variable // can be used directly from the host in both cases. // TODO: when variables types other than to or link are added, // the below condition should be changed to explicitly // check for to and link variables types: // (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags & // OMP_DECLARE_TARGET_LINK || e->flags == OMP_DECLARE_TARGET_TO)) if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) { // If unified memory is present any target link or to variables // can access host addresses directly. There is no longer a // need for device copies. 
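          // The cuMemcpyHtoD below therefore patches the device-side global so
          // that it holds the host address itself; device code then
          // dereferences the host pointer directly instead of using a separate
          // device copy.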
cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *)); DP("Copy linked variable host address (" DPxMOD ") to device address (" DPxMOD ")\n", DPxPTR(*((void **)E->addr)), DPxPTR(CUPtr)); } addOffloadEntry(DeviceId, Entry); continue; } CUfunction Func; Err = cuModuleGetFunction(&Func, Module, E->name); // We keep this style here because we need the name if (Err != CUDA_SUCCESS) { REPORT("Loading '%s' Failed\n", E->name); CUDA_ERR_STRING(Err); return nullptr; } DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n", DPxPTR(E - HostBegin), E->name, DPxPTR(Func)); // default value GENERIC (in case symbol is missing from cubin file) llvm::omp::OMPTgtExecModeFlags ExecModeVal; std::string ExecModeNameStr(E->name); ExecModeNameStr += "_exec_mode"; const char *ExecModeName = ExecModeNameStr.c_str(); CUdeviceptr ExecModePtr; size_t CUSize; Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName); if (Err == CUDA_SUCCESS) { if (CUSize != sizeof(llvm::omp::OMPTgtExecModeFlags)) { DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n", ExecModeName, CUSize, sizeof(llvm::omp::OMPTgtExecModeFlags)); return nullptr; } Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize); if (Err != CUDA_SUCCESS) { REPORT("Error when copying data from device to host. Pointers: " "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n", DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize); CUDA_ERR_STRING(Err); return nullptr; } } else { DP("Loading global exec_mode '%s' - symbol missing, using default " "value GENERIC (1)\n", ExecModeName); } KernelsList.emplace_back(Func, ExecModeVal); __tgt_offload_entry Entry = *E; Entry.addr = &KernelsList.back(); addOffloadEntry(DeviceId, Entry); } // send device environment data to the device { // TODO: The device ID used here is not the real device ID used by OpenMP. DeviceEnvironmentTy DeviceEnv{0, static_cast(NumberOfDevices), static_cast(DeviceId), static_cast(DynamicMemorySize)}; if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) DeviceEnv.DebugKind = std::stoi(EnvStr); const char *DeviceEnvName = "omptarget_device_environment"; CUdeviceptr DeviceEnvPtr; size_t CUSize; Err = cuModuleGetGlobal(&DeviceEnvPtr, &CUSize, Module, DeviceEnvName); if (Err == CUDA_SUCCESS) { if (CUSize != sizeof(DeviceEnv)) { REPORT( "Global device_environment '%s' - size mismatch (%zu != %zu)\n", DeviceEnvName, CUSize, sizeof(int32_t)); CUDA_ERR_STRING(Err); return nullptr; } Err = cuMemcpyHtoD(DeviceEnvPtr, &DeviceEnv, CUSize); if (Err != CUDA_SUCCESS) { REPORT("Error when copying data from host to device. 
Pointers: " "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n", DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize); CUDA_ERR_STRING(Err); return nullptr; } DP("Sending global device environment data %zu bytes\n", CUSize); } else { DP("Finding global device environment '%s' - symbol missing.\n", DeviceEnvName); DP("Continue, considering this is a device RTL which does not accept " "environment setting.\n"); } } return getOffloadEntriesTable(DeviceId); } void *dataAlloc(const int DeviceId, const int64_t Size, const TargetAllocTy Kind) { switch (Kind) { case TARGET_ALLOC_DEFAULT: case TARGET_ALLOC_DEVICE: if (UseMemoryManager) return MemoryManagers[DeviceId]->allocate(Size, nullptr); else return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind); case TARGET_ALLOC_HOST: case TARGET_ALLOC_SHARED: return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind); } REPORT("Invalid target data allocation kind or requested allocator not " "implemented yet\n"); return nullptr; } int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr, const int64_t Size, __tgt_async_info *AsyncInfo) const { assert(AsyncInfo && "AsyncInfo is nullptr"); CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) return OFFLOAD_FAIL; CUstream Stream = getStream(DeviceId, AsyncInfo); Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream); if (Err != CUDA_SUCCESS) { DP("Error when copying data from host to device. Pointers: host " "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n", DPxPTR(HstPtr), DPxPTR(TgtPtr), Size); CUDA_ERR_STRING(Err); return OFFLOAD_FAIL; } return OFFLOAD_SUCCESS; } int dataRetrieve(const int DeviceId, void *HstPtr, const void *TgtPtr, const int64_t Size, __tgt_async_info *AsyncInfo) const { assert(AsyncInfo && "AsyncInfo is nullptr"); CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) return OFFLOAD_FAIL; CUstream Stream = getStream(DeviceId, AsyncInfo); Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream); if (Err != CUDA_SUCCESS) { DP("Error when copying data from device to host. Pointers: host " "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n", DPxPTR(HstPtr), DPxPTR(TgtPtr), Size); CUDA_ERR_STRING(Err); return OFFLOAD_FAIL; } return OFFLOAD_SUCCESS; } int dataExchange(int SrcDevId, const void *SrcPtr, int DstDevId, void *DstPtr, int64_t Size, __tgt_async_info *AsyncInfo) const { assert(AsyncInfo && "AsyncInfo is nullptr"); CUresult Err = cuCtxSetCurrent(DeviceData[SrcDevId].Context); if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) return OFFLOAD_FAIL; CUstream Stream = getStream(SrcDevId, AsyncInfo); // If they are two devices, we try peer to peer copy first if (SrcDevId != DstDevId) { int CanAccessPeer = 0; Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId); if (Err != CUDA_SUCCESS) { REPORT("Error returned from cuDeviceCanAccessPeer. src = %" PRId32 ", dst = %" PRId32 "\n", SrcDevId, DstDevId); CUDA_ERR_STRING(Err); return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); } if (!CanAccessPeer) { DP("P2P memcpy not supported so fall back to D2D memcpy"); return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); } Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0); if (Err != CUDA_SUCCESS) { REPORT("Error returned from cuCtxEnablePeerAccess. 
src = %" PRId32 ", dst = %" PRId32 "\n", SrcDevId, DstDevId); CUDA_ERR_STRING(Err); return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); } Err = cuMemcpyPeerAsync((CUdeviceptr)DstPtr, DeviceData[DstDevId].Context, (CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context, Size, Stream); if (Err == CUDA_SUCCESS) return OFFLOAD_SUCCESS; DP("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD ", src_id =%" PRId32 ", dst_ptr = " DPxMOD ", dst_id =%" PRId32 "\n", DPxPTR(SrcPtr), SrcDevId, DPxPTR(DstPtr), DstDevId); CUDA_ERR_STRING(Err); } return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); } int dataDelete(const int DeviceId, void *TgtPtr) { if (UseMemoryManager) return MemoryManagers[DeviceId]->free(TgtPtr); return DeviceAllocators[DeviceId].free(TgtPtr); } int runTargetTeamRegion(const int DeviceId, void *TgtEntryPtr, void **TgtArgs, ptrdiff_t *TgtOffsets, const int ArgNum, const int TeamNum, const int ThreadLimit, const unsigned int LoopTripCount, __tgt_async_info *AsyncInfo) const { CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) return OFFLOAD_FAIL; // All args are references. std::vector Args(ArgNum); std::vector Ptrs(ArgNum); for (int I = 0; I < ArgNum; ++I) { Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]); Args[I] = &Ptrs[I]; } KernelTy *KernelInfo = reinterpret_cast(TgtEntryPtr); const bool IsSPMDGenericMode = KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD; const bool IsSPMDMode = KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_SPMD; const bool IsGenericMode = KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC; int CudaThreadsPerBlock; if (ThreadLimit > 0) { DP("Setting CUDA threads per block to requested %d\n", ThreadLimit); CudaThreadsPerBlock = ThreadLimit; // Add master warp if necessary if (IsGenericMode) { DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize); CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize; } } else { DP("Setting CUDA threads per block to default %d\n", DeviceData[DeviceId].NumThreads); CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads; } if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) { DP("Threads per block capped at device limit %d\n", DeviceData[DeviceId].ThreadsPerBlock); CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock; } if (!KernelInfo->MaxThreadsPerBlock) { Err = cuFuncGetAttribute(&KernelInfo->MaxThreadsPerBlock, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, KernelInfo->Func); if (!checkResult(Err, "Error returned from cuFuncGetAttribute\n")) return OFFLOAD_FAIL; } if (KernelInfo->MaxThreadsPerBlock < CudaThreadsPerBlock) { DP("Threads per block capped at kernel limit %d\n", KernelInfo->MaxThreadsPerBlock); CudaThreadsPerBlock = KernelInfo->MaxThreadsPerBlock; } unsigned int CudaBlocksPerGrid; if (TeamNum <= 0) { if (LoopTripCount > 0 && EnvNumTeams < 0) { if (IsSPMDGenericMode) { // If we reach this point, then we are executing a kernel that was // transformed from Generic-mode to SPMD-mode. This kernel has // SPMD-mode execution, but needs its blocks to be scheduled // differently because the current loop trip count only applies to the // `teams distribute` region and will create var too few blocks using // the regular SPMD-mode method. CudaBlocksPerGrid = LoopTripCount; } else if (IsSPMDMode) { // We have a combined construct, i.e. `target teams distribute // parallel for [simd]`. We launch so many teams so that each thread // will execute one iteration of the loop. 
round up to the nearest // integer CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1; } else if (IsGenericMode) { // If we reach this point, then we have a non-combined construct, i.e. // `teams distribute` with a nested `parallel for` and each team is // assigned one iteration of the `distribute` loop. E.g.: // // #pragma omp target teams distribute // for(...loop_tripcount...) { // #pragma omp parallel for // for(...) {} // } // // Threads within a team will execute the iterations of the `parallel` // loop. CudaBlocksPerGrid = LoopTripCount; } else { REPORT("Unknown execution mode: %d\n", static_cast(KernelInfo->ExecutionMode)); return OFFLOAD_FAIL; } DP("Using %d teams due to loop trip count %" PRIu32 " and number of threads per block %d\n", CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock); } else { DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams); CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams; } } else if (TeamNum > DeviceData[DeviceId].BlocksPerGrid) { DP("Capping number of teams to team limit %d\n", DeviceData[DeviceId].BlocksPerGrid); CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid; } else { DP("Using requested number of teams %d\n", TeamNum); CudaBlocksPerGrid = TeamNum; } INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, "Launching kernel %s with %d blocks and %d threads in %s mode\n", (getOffloadEntry(DeviceId, TgtEntryPtr)) ? getOffloadEntry(DeviceId, TgtEntryPtr)->name : "(null)", CudaBlocksPerGrid, CudaThreadsPerBlock, (!IsSPMDMode ? (IsGenericMode ? "Generic" : "SPMD-Generic") : "SPMD")); CUstream Stream = getStream(DeviceId, AsyncInfo); Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1, /* gridDimZ */ 1, CudaThreadsPerBlock, /* blockDimY */ 1, /* blockDimZ */ 1, DynamicMemorySize, Stream, &Args[0], nullptr); if (!checkResult(Err, "Error returned from cuLaunchKernel\n")) return OFFLOAD_FAIL; DP("Launch of entry point at " DPxMOD " successful!\n", DPxPTR(TgtEntryPtr)); return OFFLOAD_SUCCESS; } int synchronize(const int DeviceId, __tgt_async_info *AsyncInfo) const { CUstream Stream = reinterpret_cast(AsyncInfo->Queue); CUresult Err = cuStreamSynchronize(Stream); // Once the stream is synchronized, return it to stream pool and reset // AsyncInfo. This is to make sure the synchronization only works for its // own tasks. StreamManager->returnStream(DeviceId, reinterpret_cast(AsyncInfo->Queue)); AsyncInfo->Queue = nullptr; if (Err != CUDA_SUCCESS) { DP("Error when synchronizing stream. stream = " DPxMOD ", async info ptr = " DPxMOD "\n", DPxPTR(Stream), DPxPTR(AsyncInfo)); CUDA_ERR_STRING(Err); } return (Err == CUDA_SUCCESS) ? 
OFFLOAD_SUCCESS : OFFLOAD_FAIL; } void printDeviceInfo(int32_t device_id) { char TmpChar[1000]; std::string TmpStr; size_t TmpSt; int TmpInt, TmpInt2, TmpInt3; CUdevice Device; checkResult(cuDeviceGet(&Device, device_id), "Error returned from cuCtxGetDevice\n"); cuDriverGetVersion(&TmpInt); printf(" CUDA Driver Version: \t\t%d \n", TmpInt); printf(" CUDA Device Number: \t\t%d \n", device_id); checkResult(cuDeviceGetName(TmpChar, 1000, Device), "Error returned from cuDeviceGetName\n"); printf(" Device Name: \t\t\t%s \n", TmpChar); checkResult(cuDeviceTotalMem(&TmpSt, Device), "Error returned from cuDeviceTotalMem\n"); printf(" Global Memory Size: \t\t%zu bytes \n", TmpSt); checkResult(cuDeviceGetAttribute( &TmpInt, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, Device), "Error returned from cuDeviceGetAttribute\n"); printf(" Number of Multiprocessors: \t\t%d \n", TmpInt); checkResult( cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, Device), "Error returned from cuDeviceGetAttribute\n"); printf(" Concurrent Copy and Execution: \t%s \n", BOOL2TEXT(TmpInt)); checkResult(cuDeviceGetAttribute( &TmpInt, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, Device), "Error returned from cuDeviceGetAttribute\n"); printf(" Total Constant Memory: \t\t%d bytes\n", TmpInt); checkResult( cuDeviceGetAttribute( &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, Device), "Error returned from cuDeviceGetAttribute\n"); printf(" Max Shared Memory per Block: \t%d bytes \n", TmpInt); checkResult( cuDeviceGetAttribute( &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, Device), "Error returned from cuDeviceGetAttribute\n"); printf(" Registers per Block: \t\t%d \n", TmpInt); checkResult( cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device), "Error returned from cuDeviceGetAttribute\n"); printf(" Warp Size: \t\t\t\t%d Threads \n", TmpInt); checkResult(cuDeviceGetAttribute( &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, Device), "Error returned from cuDeviceGetAttribute\n"); printf(" Maximum Threads per Block: \t\t%d \n", TmpInt); checkResult(cuDeviceGetAttribute( &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device), "Error returned from cuDeviceGetAttribute\n"); checkResult(cuDeviceGetAttribute( &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, Device), "Error returned from cuDeviceGetAttribute\n"); checkResult(cuDeviceGetAttribute( &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, Device), "Error returned from cuDeviceGetAttribute\n"); printf(" Maximum Block Dimensions: \t\t%d, %d, %d \n", TmpInt, TmpInt2, TmpInt3); checkResult(cuDeviceGetAttribute( &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, Device), "Error returned from cuDeviceGetAttribute\n"); checkResult(cuDeviceGetAttribute( &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, Device), "Error returned from cuDeviceGetAttribute\n"); checkResult(cuDeviceGetAttribute( &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, Device), "Error returned from cuDeviceGetAttribute\n"); printf(" Maximum Grid Dimensions: \t\t%d x %d x %d \n", TmpInt, TmpInt2, TmpInt3); checkResult( cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_PITCH, Device), "Error returned from cuDeviceGetAttribute\n"); printf(" Maximum Memory Pitch: \t\t%d bytes \n", TmpInt); checkResult(cuDeviceGetAttribute( &TmpInt, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, Device), "Error returned from cuDeviceGetAttribute\n"); printf(" Texture Alignment: \t\t\t%d bytes \n", TmpInt); checkResult( cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, Device), "Error returned from 
cuDeviceGetAttribute\n"); printf(" Clock Rate: \t\t\t%d kHz\n", TmpInt); checkResult(cuDeviceGetAttribute( &TmpInt, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, Device), "Error returned from cuDeviceGetAttribute\n"); printf(" Execution Timeout: \t\t\t%s \n", BOOL2TEXT(TmpInt)); checkResult( cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_INTEGRATED, Device), "Error returned from cuDeviceGetAttribute\n"); printf(" Integrated Device: \t\t\t%s \n", BOOL2TEXT(TmpInt)); checkResult(cuDeviceGetAttribute( &TmpInt, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, Device), "Error returned from cuDeviceGetAttribute\n"); printf(" Can Map Host Memory: \t\t%s \n", BOOL2TEXT(TmpInt)); checkResult( cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, Device), "Error returned from cuDeviceGetAttribute\n"); if (TmpInt == CU_COMPUTEMODE_DEFAULT) TmpStr = "DEFAULT"; else if (TmpInt == CU_COMPUTEMODE_PROHIBITED) TmpStr = "PROHIBITED"; else if (TmpInt == CU_COMPUTEMODE_EXCLUSIVE_PROCESS) TmpStr = "EXCLUSIVE PROCESS"; else TmpStr = "unknown"; printf(" Compute Mode: \t\t\t%s \n", TmpStr.c_str()); checkResult(cuDeviceGetAttribute( &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, Device), "Error returned from cuDeviceGetAttribute\n"); printf(" Concurrent Kernels: \t\t%s \n", BOOL2TEXT(TmpInt)); checkResult( cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, Device), "Error returned from cuDeviceGetAttribute\n"); printf(" ECC Enabled: \t\t\t%s \n", BOOL2TEXT(TmpInt)); checkResult(cuDeviceGetAttribute( &TmpInt, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, Device), "Error returned from cuDeviceGetAttribute\n"); printf(" Memory Clock Rate: \t\t\t%d kHz\n", TmpInt); checkResult( cuDeviceGetAttribute( &TmpInt, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, Device), "Error returned from cuDeviceGetAttribute\n"); printf(" Memory Bus Width: \t\t\t%d bits\n", TmpInt); checkResult(cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, Device), "Error returned from cuDeviceGetAttribute\n"); printf(" L2 Cache Size: \t\t\t%d bytes \n", TmpInt); checkResult(cuDeviceGetAttribute( &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, Device), "Error returned from cuDeviceGetAttribute\n"); printf(" Max Threads Per SMP: \t\t%d \n", TmpInt); checkResult(cuDeviceGetAttribute( &TmpInt, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, Device), "Error returned from cuDeviceGetAttribute\n"); printf(" Async Engines: \t\t\t%s (%d) \n", BOOL2TEXT(TmpInt), TmpInt); checkResult(cuDeviceGetAttribute( &TmpInt, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, Device), "Error returned from cuDeviceGetAttribute\n"); printf(" Unified Addressing: \t\t%s \n", BOOL2TEXT(TmpInt)); checkResult(cuDeviceGetAttribute( &TmpInt, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, Device), "Error returned from cuDeviceGetAttribute\n"); printf(" Managed Memory: \t\t\t%s \n", BOOL2TEXT(TmpInt)); checkResult( cuDeviceGetAttribute( &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, Device), "Error returned from cuDeviceGetAttribute\n"); printf(" Concurrent Managed Memory: \t\t%s \n", BOOL2TEXT(TmpInt)); checkResult( cuDeviceGetAttribute( &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, Device), "Error returned from cuDeviceGetAttribute\n"); printf(" Preemption Supported: \t\t%s \n", BOOL2TEXT(TmpInt)); checkResult(cuDeviceGetAttribute( &TmpInt, CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, Device), "Error returned from cuDeviceGetAttribute\n"); printf(" Cooperative Launch: \t\t%s \n", BOOL2TEXT(TmpInt)); checkResult(cuDeviceGetAttribute( &TmpInt, 
CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD, Device), "Error returned from cuDeviceGetAttribute\n"); printf(" Multi-Device Boars: \t\t%s \n", BOOL2TEXT(TmpInt)); checkResult( cuDeviceGetAttribute( &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, Device), "Error returned from cuDeviceGetAttribute\n"); checkResult( cuDeviceGetAttribute( &TmpInt2, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, Device), "Error returned from cuDeviceGetAttribute\n"); printf(" Compute Capabilities: \t\t%d%d \n", TmpInt, TmpInt2); } int waitEvent(const int DeviceId, __tgt_async_info *AsyncInfo, void *EventPtr) const { CUstream Stream = getStream(DeviceId, AsyncInfo); CUevent Event = reinterpret_cast(EventPtr); // We don't use CU_EVENT_WAIT_DEFAULT here as it is only available from // specific CUDA version, and defined as 0x0. In previous version, per CUDA // API document, that argument has to be 0x0. CUresult Err = cuStreamWaitEvent(Stream, Event, 0); if (Err != CUDA_SUCCESS) { DP("Error when waiting event. stream = " DPxMOD ", event = " DPxMOD "\n", DPxPTR(Stream), DPxPTR(Event)); CUDA_ERR_STRING(Err); return OFFLOAD_FAIL; } return OFFLOAD_SUCCESS; } }; DeviceRTLTy DeviceRTL; } // namespace // Exposed library API function #ifdef __cplusplus extern "C" { #endif int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) { return elf_check_machine(image, /* EM_CUDA */ 190); } int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); } int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) { DP("Init requires flags to %" PRId64 "\n", RequiresFlags); DeviceRTL.setRequiresFlag(RequiresFlags); return RequiresFlags; } int32_t __tgt_rtl_is_data_exchangable(int32_t src_dev_id, int dst_dev_id) { if (DeviceRTL.isValidDeviceId(src_dev_id) && DeviceRTL.isValidDeviceId(dst_dev_id)) return 1; return 0; } int32_t __tgt_rtl_init_device(int32_t device_id) { assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); return DeviceRTL.initDevice(device_id); } __tgt_target_table *__tgt_rtl_load_binary(int32_t device_id, __tgt_device_image *image) { assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); return DeviceRTL.loadBinary(device_id, image); } void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *, int32_t kind) { assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); return DeviceRTL.dataAlloc(device_id, size, (TargetAllocTy)kind); } int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr, int64_t size) { assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); __tgt_async_info AsyncInfo; const int32_t rc = __tgt_rtl_data_submit_async(device_id, tgt_ptr, hst_ptr, size, &AsyncInfo); if (rc != OFFLOAD_SUCCESS) return OFFLOAD_FAIL; return __tgt_rtl_synchronize(device_id, &AsyncInfo); } int32_t __tgt_rtl_data_submit_async(int32_t device_id, void *tgt_ptr, void *hst_ptr, int64_t size, __tgt_async_info *async_info_ptr) { assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); assert(async_info_ptr && "async_info_ptr is nullptr"); return DeviceRTL.dataSubmit(device_id, tgt_ptr, hst_ptr, size, async_info_ptr); } int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr, int64_t size) { assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); __tgt_async_info AsyncInfo; const int32_t rc = __tgt_rtl_data_retrieve_async(device_id, hst_ptr, tgt_ptr, size, &AsyncInfo); if (rc != OFFLOAD_SUCCESS) return OFFLOAD_FAIL; return __tgt_rtl_synchronize(device_id, &AsyncInfo); } 
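// Note: each synchronous entry point above and below is a thin wrapper that
// creates a local __tgt_async_info, forwards to its *_async counterpart, and
// then calls __tgt_rtl_synchronize, which waits on the underlying CUstream and
// returns it to the stream pool.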
int32_t __tgt_rtl_data_retrieve_async(int32_t device_id, void *hst_ptr, void *tgt_ptr, int64_t size, __tgt_async_info *async_info_ptr) { assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); assert(async_info_ptr && "async_info_ptr is nullptr"); return DeviceRTL.dataRetrieve(device_id, hst_ptr, tgt_ptr, size, async_info_ptr); } int32_t __tgt_rtl_data_exchange_async(int32_t src_dev_id, void *src_ptr, int dst_dev_id, void *dst_ptr, int64_t size, __tgt_async_info *AsyncInfo) { assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid"); assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid"); assert(AsyncInfo && "AsyncInfo is nullptr"); return DeviceRTL.dataExchange(src_dev_id, src_ptr, dst_dev_id, dst_ptr, size, AsyncInfo); } int32_t __tgt_rtl_data_exchange(int32_t src_dev_id, void *src_ptr, int32_t dst_dev_id, void *dst_ptr, int64_t size) { assert(DeviceRTL.isValidDeviceId(src_dev_id) && "src_dev_id is invalid"); assert(DeviceRTL.isValidDeviceId(dst_dev_id) && "dst_dev_id is invalid"); __tgt_async_info AsyncInfo; const int32_t rc = __tgt_rtl_data_exchange_async( src_dev_id, src_ptr, dst_dev_id, dst_ptr, size, &AsyncInfo); if (rc != OFFLOAD_SUCCESS) return OFFLOAD_FAIL; return __tgt_rtl_synchronize(src_dev_id, &AsyncInfo); } int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) { assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); return DeviceRTL.dataDelete(device_id, tgt_ptr); } int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr, void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num, int32_t thread_limit, uint64_t loop_tripcount) { assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); __tgt_async_info AsyncInfo; const int32_t rc = __tgt_rtl_run_target_team_region_async( device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num, thread_limit, loop_tripcount, &AsyncInfo); if (rc != OFFLOAD_SUCCESS) return OFFLOAD_FAIL; return __tgt_rtl_synchronize(device_id, &AsyncInfo); } int32_t __tgt_rtl_run_target_team_region_async( int32_t device_id, void *tgt_entry_ptr, void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num, int32_t thread_limit, uint64_t loop_tripcount, __tgt_async_info *async_info_ptr) { assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); return DeviceRTL.runTargetTeamRegion( device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num, thread_limit, loop_tripcount, async_info_ptr); } int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr, void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num) { assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); __tgt_async_info AsyncInfo; const int32_t rc = __tgt_rtl_run_target_region_async( device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, &AsyncInfo); if (rc != OFFLOAD_SUCCESS) return OFFLOAD_FAIL; return __tgt_rtl_synchronize(device_id, &AsyncInfo); } int32_t __tgt_rtl_run_target_region_async(int32_t device_id, void *tgt_entry_ptr, void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num, __tgt_async_info *async_info_ptr) { assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); return __tgt_rtl_run_target_team_region_async( device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, /* team num*/ 1, /* thread_limit */ 1, /* loop_tripcount */ 0, async_info_ptr); } int32_t __tgt_rtl_synchronize(int32_t device_id, __tgt_async_info *async_info_ptr) { 
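  // The caller must provide a non-null queue (asserted below). After the
  // plugin-side synchronize() returns, the CUstream has been handed back to
  // the stream pool and async_info_ptr->Queue has been reset to nullptr, so
  // the handle must not be reused for further synchronization.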
assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); assert(async_info_ptr && "async_info_ptr is nullptr"); assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr"); return DeviceRTL.synchronize(device_id, async_info_ptr); } void __tgt_rtl_set_info_flag(uint32_t NewInfoLevel) { std::atomic &InfoLevel = getInfoLevelInternal(); InfoLevel.store(NewInfoLevel); } void __tgt_rtl_print_device_info(int32_t device_id) { assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); DeviceRTL.printDeviceInfo(device_id); } int32_t __tgt_rtl_create_event(int32_t device_id, void **event) { assert(event && "event is nullptr"); return createEvent(event); } int32_t __tgt_rtl_record_event(int32_t device_id, void *event_ptr, __tgt_async_info *async_info_ptr) { assert(async_info_ptr && "async_info_ptr is nullptr"); assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr"); assert(event_ptr && "event_ptr is nullptr"); return recordEvent(event_ptr, async_info_ptr); } int32_t __tgt_rtl_wait_event(int32_t device_id, void *event_ptr, __tgt_async_info *async_info_ptr) { assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); assert(async_info_ptr && "async_info_ptr is nullptr"); assert(event_ptr && "event is nullptr"); return DeviceRTL.waitEvent(device_id, async_info_ptr, event_ptr); } int32_t __tgt_rtl_sync_event(int32_t device_id, void *event_ptr) { assert(event_ptr && "event is nullptr"); return syncEvent(event_ptr); } int32_t __tgt_rtl_destroy_event(int32_t device_id, void *event_ptr) { assert(event_ptr && "event is nullptr"); return destroyEvent(event_ptr); } #ifdef __cplusplus } #endif
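// Environment variables read by this plugin (all parsed above):
//   LIBOMPTARGET_NUM_INITIAL_STREAMS - initial per-device stream pool size
//   LIBOMPTARGET_SHARED_MEMORY_SIZE  - dynamic shared memory per kernel launch
//   LIBOMPTARGET_STACK_SIZE          - CU_LIMIT_STACK_SIZE for the context
//   LIBOMPTARGET_HEAP_SIZE           - CU_LIMIT_MALLOC_HEAP_SIZE for the context
//   LIBOMPTARGET_DEVICE_RTL_DEBUG    - debug kind forwarded to the device RTL
//   OMP_NUM_TEAMS, OMP_TEAM_LIMIT, OMP_TEAMS_THREAD_LIMIT - team/thread defaults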