1 //===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // RTL for CUDA machine
10 //
11 //===----------------------------------------------------------------------===//
12
13 #include "llvm/ADT/StringRef.h"
14
15 #include <algorithm>
16 #include <cassert>
17 #include <cstddef>
18 #include <cuda.h>
19 #include <list>
20 #include <memory>
21 #include <mutex>
22 #include <string>
23 #include <unordered_map>
24 #include <vector>
25
26 #include "Debug.h"
27 #include "DeviceEnvironment.h"
28 #include "omptarget.h"
29 #include "omptargetplugin.h"
30
31 #define TARGET_NAME CUDA
32 #define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"
33
34 #include "MemoryManager.h"
35
36 #include "llvm/Frontend/OpenMP/OMPConstants.h"
37
38 using namespace llvm;
39
40 // Utility for retrieving and printing CUDA error string.
41 #ifdef OMPTARGET_DEBUG
42 #define CUDA_ERR_STRING(err) \
43 do { \
44 if (getDebugLevel() > 0) { \
45 const char *errStr = nullptr; \
46 CUresult errStr_status = cuGetErrorString(err, &errStr); \
47 if (errStr_status == CUDA_ERROR_INVALID_VALUE) \
48 REPORT("Unrecognized CUDA error code: %d\n", err); \
49 else if (errStr_status == CUDA_SUCCESS) \
50 REPORT("CUDA error is: %s\n", errStr); \
51 else { \
52 REPORT("Unresolved CUDA error code: %d\n", err); \
53 REPORT("Unsuccessful cuGetErrorString return status: %d\n", \
54 errStr_status); \
55 } \
56 } else { \
57 const char *errStr = nullptr; \
58 CUresult errStr_status = cuGetErrorString(err, &errStr); \
59 if (errStr_status == CUDA_SUCCESS) \
60 REPORT("%s \n", errStr); \
61 } \
62 } while (false)
63 #else // OMPTARGET_DEBUG
64 #define CUDA_ERR_STRING(err) \
65 do { \
66 const char *errStr = nullptr; \
67 CUresult errStr_status = cuGetErrorString(err, &errStr); \
68 if (errStr_status == CUDA_SUCCESS) \
69 REPORT("%s \n", errStr); \
70 } while (false)
71 #endif // OMPTARGET_DEBUG
72
73 #define BOOL2TEXT(b) ((b) ? "Yes" : "No")
74
75 #include "elf_common.h"
76
77 /// Keep entries table per device.
78 struct FuncOrGblEntryTy {
79 __tgt_target_table Table;
80 std::vector<__tgt_offload_entry> Entries;
81 };
82
83 /// Use a single entity to encode a kernel and a set of flags.
84 struct KernelTy {
85 CUfunction Func;
86
87 // execution mode of kernel
88 llvm::omp::OMPTgtExecModeFlags ExecutionMode;
89
90 /// Maximal number of threads per block for this kernel.
91 int MaxThreadsPerBlock = 0;
92
93 KernelTy(CUfunction Func, llvm::omp::OMPTgtExecModeFlags ExecutionMode)
94 : Func(Func), ExecutionMode(ExecutionMode) {}
95 };
96
97 namespace {
98 bool checkResult(CUresult Err, const char *ErrMsg) {
99 if (Err == CUDA_SUCCESS)
100 return true;
101
102 REPORT("%s", ErrMsg);
103 CUDA_ERR_STRING(Err);
104 return false;
105 }
106
107 int memcpyDtoD(const void *SrcPtr, void *DstPtr, int64_t Size,
108 CUstream Stream) {
109 CUresult Err =
110 cuMemcpyDtoDAsync((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, Size, Stream);
111
112 if (Err != CUDA_SUCCESS) {
113 DP("Error when copying data from device to device. Pointers: src "
114 "= " DPxMOD ", dst = " DPxMOD ", size = %" PRId64 "\n",
115 DPxPTR(SrcPtr), DPxPTR(DstPtr), Size);
116 CUDA_ERR_STRING(Err);
117 return OFFLOAD_FAIL;
118 }
119
120 return OFFLOAD_SUCCESS;
121 }
122
123 int recordEvent(void *EventPtr, __tgt_async_info *AsyncInfo) {
124 CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue);
125 CUevent Event = reinterpret_cast<CUevent>(EventPtr);
126
127 CUresult Err = cuEventRecord(Event, Stream);
128 if (Err != CUDA_SUCCESS) {
129 DP("Error when recording event. stream = " DPxMOD ", event = " DPxMOD "\n",
130 DPxPTR(Stream), DPxPTR(Event));
131 CUDA_ERR_STRING(Err);
132 return OFFLOAD_FAIL;
133 }
134
135 return OFFLOAD_SUCCESS;
136 }
137
138 int syncEvent(void *EventPtr) {
139 CUevent Event = reinterpret_cast<CUevent>(EventPtr);
140
141 CUresult Err = cuEventSynchronize(Event);
142 if (Err != CUDA_SUCCESS) {
143 DP("Error when syncing event = " DPxMOD "\n", DPxPTR(Event));
144 CUDA_ERR_STRING(Err);
145 return OFFLOAD_FAIL;
146 }
147
148 return OFFLOAD_SUCCESS;
149 }
150
151 namespace {
152
153 // Structure contains per-device data
154 struct DeviceDataTy {
155 /// List that contains all the kernels.
156 std::list<KernelTy> KernelsList;
157
158 std::list<FuncOrGblEntryTy> FuncGblEntries;
159
160 CUcontext Context = nullptr;
161 // Device properties
162 unsigned int ThreadsPerBlock = 0;
163 unsigned int BlocksPerGrid = 0;
164 unsigned int WarpSize = 0;
165 // OpenMP properties
166 int NumTeams = 0;
167 int NumThreads = 0;
168 };
169
170 /// Resource allocator where \p T is the resource type.
171 /// Functions \p create and \p destroy return OFFLOAD_SUCCESS on success and
172 /// OFFLOAD_FAIL on failure. The implementation should not raise any exception.
173 template <typename T> struct AllocatorTy {
174 using ElementTy = T;
175 virtual ~AllocatorTy() {}
176
177 /// Create a resource and assign to R.
178 virtual int create(T &R) noexcept = 0;
179 /// Destroy the resource.
180 virtual int destroy(T) noexcept = 0;
181 };
182
183 /// Allocator for CUstream.
184 struct StreamAllocatorTy final : public AllocatorTy<CUstream> {
185 /// See AllocatorTy<T>::create.
186 int create(CUstream &Stream) noexcept override {
187 if (!checkResult(cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING),
188 "Error returned from cuStreamCreate\n"))
189 return OFFLOAD_FAIL;
190
191 return OFFLOAD_SUCCESS;
192 }
193
194 /// See AllocatorTy<T>::destroy.
195 int destroy(CUstream Stream) noexcept override {
196 if (!checkResult(cuStreamDestroy(Stream),
197 "Error returned from cuStreamDestroy\n"))
198 return OFFLOAD_FAIL;
199
200 return OFFLOAD_SUCCESS;
201 }
202 };
203
204 /// Allocator for CUevent.
205 struct EventAllocatorTy final : public AllocatorTy<CUevent> {
206 /// See AllocatorTy<T>::create.
207 int create(CUevent &Event) noexcept override {
208 if (!checkResult(cuEventCreate(&Event, CU_EVENT_DEFAULT),
209 "Error returned from cuEventCreate\n"))
210 return OFFLOAD_FAIL;
211
212 return OFFLOAD_SUCCESS;
213 }
214
215 /// See AllocatorTy<T>::destroy.
216 int destroy(CUevent Event) noexcept override {
217 if (!checkResult(cuEventDestroy(Event),
218 "Error returned from cuEventDestroy\n"))
219 return OFFLOAD_FAIL;
220
221 return OFFLOAD_SUCCESS;
222 }
223 };
224
225 /// A generic pool of resources, where the resource type is the \p ElementTy
226 /// of the allocator \p AllocTy. It must be copyable as it is stored in \p std::vector .
227 template <typename AllocTy> class ResourcePoolTy {
228 using ElementTy = typename AllocTy::ElementTy;
229 /// Index of the next available resource.
230 size_t Next = 0;
231 /// Mutex to guard the pool.
232 std::mutex Mutex;
233 /// Pool of resources. The difference between \p Resources and \p Pool is that
234 /// acquiring and releasing a resource only touches \p Resources, while a batch
235 /// of newly created resources is added to both \p Resources and \p Pool.
236 /// Because resources are not always released, \p Resources may end up holding
237 /// duplicate entries, and destroying from it could double free. \p Pool
238 /// therefore always holds exactly one entry for every resource allocated from
239 /// the device and is the list used when releasing them.
240 std::vector<ElementTy> Resources;
241 std::vector<ElementTy> Pool;
242 /// A reference to the corresponding allocator.
243 AllocTy Allocator;
244
245 /// If `Resources` is used up, we will fill in more resources. It assumes that
246 /// the new size `Size` is always larger than the current size.
247 bool resize(size_t Size) {
248 assert(Resources.size() == Pool.size() && "size mismatch");
249 auto CurSize = Resources.size();
250 assert(Size > CurSize && "Unexpected smaller size");
251 Pool.reserve(Size);
252 Resources.reserve(Size);
253 for (auto I = CurSize; I < Size; ++I) {
254 ElementTy NewItem;
255 int Ret = Allocator.create(NewItem);
256 if (Ret != OFFLOAD_SUCCESS)
257 return false;
258 Pool.push_back(NewItem);
259 Resources.push_back(NewItem);
260 }
261 return true;
262 }
263
264 public:
265 ResourcePoolTy(AllocTy &&A, size_t Size = 0) noexcept
266 : Allocator(std::move(A)) {
267 if (Size)
268 (void)resize(Size);
269 }
270
271 ~ResourcePoolTy() noexcept { clear(); }
272
273 /// Get a resource from the pool. `Next` always points to the next available
274 /// resource. That means `[0, Next-1]` have been assigned and `[Next, ...]` are
275 /// still available. If there is no resource left, we will ask for more. Each
276 /// time a resource is assigned, `Next` increases by one.
277 /// xxxxxs+++++++++
278 ///      ^
279 ///      Next
280 /// After assignment, the pool becomes the following and s is assigned.
281 /// xxxxxs+++++++++
282 ///       ^
283 ///       Next
284 int acquire(ElementTy &R) noexcept {
285 std::lock_guard<std::mutex> LG(Mutex);
286 if (Next == Resources.size()) {
287 auto NewSize = Resources.size() ? Resources.size() * 2 : 1;
288 if (!resize(NewSize))
289 return OFFLOAD_FAIL;
290 }
291
292 assert(Next < Resources.size());
293
294 R = Resources[Next++];
295
296 return OFFLOAD_SUCCESS;
297 }
298
299 /// Return the resource back to the pool. When we return a resource, we need
300 /// to first decrease `Next` and then copy the resource back. It is worth
301 /// noting that the order in which resources are returned may differ from the
302 /// order in which they were assigned; that is, at some point there may be two
303 /// identical entries in the pool:
304 /// xxax+a+++++
305 ///     ^
306 ///     Next
307 /// However, it doesn't matter, because they're always on the two sides of
308 /// `Next`. The left one will in the end be overwritten by another resource.
309 /// Therefore, after several executions, the order of the pool might differ
310 /// from its initial state.
311 void release(ElementTy R) noexcept {
312 std::lock_guard<std::mutex> LG(Mutex);
313 Resources[--Next] = R;
314 }
315
316 /// Release all stored resources and clear the pool.
317 /// Note: This function is not thread safe. Be sure to guard it if necessary.
318 void clear() noexcept {
319 for (auto &R : Pool)
320 (void)Allocator.destroy(R);
321 Pool.clear();
322 Resources.clear();
323 }
324 };
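// Illustrative sketch only (not part of the upstream source): the plugin uses
// the pool roughly like this, mirroring getStream() and synchronize() below.
//   CUstream S;
//   if (StreamPool[DeviceId]->acquire(S) == OFFLOAD_SUCCESS) {
//     /* ... enqueue work on S ... */
//     StreamPool[DeviceId]->release(S);
//   }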
325
326 } // namespace
327
328 class DeviceRTLTy {
329 int NumberOfDevices;
330 // OpenMP environment properties
331 int EnvNumTeams;
332 int EnvTeamLimit;
333 int EnvTeamThreadLimit;
334 // OpenMP requires flags
335 int64_t RequiresFlags;
336 // Amount of dynamic shared memory to use at launch.
337 uint64_t DynamicMemorySize;
338
339 /// Number of initial streams for each device.
340 int NumInitialStreams = 32;
341
342 /// Number of initial events for each device.
343 int NumInitialEvents = 8;
344
345 static constexpr const int32_t HardThreadLimit = 1024;
346 static constexpr const int32_t DefaultNumTeams = 128;
347 static constexpr const int32_t DefaultNumThreads = 128;
348
349 using StreamPoolTy = ResourcePoolTy<StreamAllocatorTy>;
350 std::vector<std::unique_ptr<StreamPoolTy>> StreamPool;
351
352 using EventPoolTy = ResourcePoolTy<EventAllocatorTy>;
353 std::vector<std::unique_ptr<EventPoolTy>> EventPool;
354
355 std::vector<DeviceDataTy> DeviceData;
356 std::vector<std::vector<CUmodule>> Modules;
357
358 /// Vector of flags indicating the initialization status of all associated
359 /// devices.
360 std::vector<bool> InitializedFlags;
361
362 enum class PeerAccessState : uint8_t { Unknown, Yes, No };
363 std::vector<std::vector<PeerAccessState>> PeerAccessMatrix;
364 std::mutex PeerAccessMatrixLock;
365
366 /// A class responsible for interacting with device native runtime library to
367 /// allocate and free memory.
368 class CUDADeviceAllocatorTy : public DeviceAllocatorTy {
369 std::unordered_map<void *, TargetAllocTy> HostPinnedAllocs;
370
371 public:
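/// Allocate \p Size bytes of the requested kind: device memory via
/// cuMemAlloc, pinned host memory via cuMemAllocHost, or managed (shared)
/// memory via cuMemAllocManaged. Returns nullptr on failure or when \p Size
/// is zero.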
372 void *allocate(size_t Size, void *, TargetAllocTy Kind) override {
373 if (Size == 0)
374 return nullptr;
375
376 void *MemAlloc = nullptr;
377 CUresult Err;
378 switch (Kind) {
379 case TARGET_ALLOC_DEFAULT:
380 case TARGET_ALLOC_DEVICE:
381 CUdeviceptr DevicePtr;
382 Err = cuMemAlloc(&DevicePtr, Size);
383 MemAlloc = (void *)DevicePtr;
384 if (!checkResult(Err, "Error returned from cuMemAlloc\n"))
385 return nullptr;
386 break;
387 case TARGET_ALLOC_HOST:
388 void *HostPtr;
389 Err = cuMemAllocHost(&HostPtr, Size);
390 MemAlloc = HostPtr;
391 if (!checkResult(Err, "Error returned from cuMemAllocHost\n"))
392 return nullptr;
393 HostPinnedAllocs[MemAlloc] = Kind;
394 break;
395 case TARGET_ALLOC_SHARED:
396 CUdeviceptr SharedPtr;
397 Err = cuMemAllocManaged(&SharedPtr, Size, CU_MEM_ATTACH_GLOBAL);
398 MemAlloc = (void *)SharedPtr;
399 if (!checkResult(Err, "Error returned from cuMemAllocManaged\n"))
400 return nullptr;
401 break;
402 }
403
404 return MemAlloc;
405 }
406
407 int free(void *TgtPtr) override {
408 CUresult Err;
409 // Host pinned memory must be freed differently.
410 TargetAllocTy Kind =
411 (HostPinnedAllocs.find(TgtPtr) == HostPinnedAllocs.end())
412 ? TARGET_ALLOC_DEFAULT
413 : TARGET_ALLOC_HOST;
414 switch (Kind) {
415 case TARGET_ALLOC_DEFAULT:
416 case TARGET_ALLOC_DEVICE:
417 case TARGET_ALLOC_SHARED:
418 Err = cuMemFree((CUdeviceptr)TgtPtr);
419 if (!checkResult(Err, "Error returned from cuMemFree\n"))
420 return OFFLOAD_FAIL;
421 break;
422 case TARGET_ALLOC_HOST:
423 Err = cuMemFreeHost(TgtPtr);
424 if (!checkResult(Err, "Error returned from cuMemFreeHost\n"))
425 return OFFLOAD_FAIL;
426 break;
427 }
428
429 return OFFLOAD_SUCCESS;
430 }
431 };
432
433 /// A vector of device allocators
434 std::vector<CUDADeviceAllocatorTy> DeviceAllocators;
435
436 /// A vector of memory managers. Since the memory manager is non-copyable and
437 /// non-removable, we wrap them into std::unique_ptr.
438 std::vector<std::unique_ptr<MemoryManagerTy>> MemoryManagers;
439
440 /// Whether to use the memory manager.
441 bool UseMemoryManager = true;
442
443 // Record entry point associated with device
444 void addOffloadEntry(const int DeviceId, const __tgt_offload_entry Entry) {
445 FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
446 E.Entries.push_back(Entry);
447 }
448
449 // Return a pointer to the entry associated with the pointer
450 const __tgt_offload_entry *getOffloadEntry(const int DeviceId,
451 const void *Addr) const {
452 for (const __tgt_offload_entry &Itr :
453 DeviceData[DeviceId].FuncGblEntries.back().Entries)
454 if (Itr.addr == Addr)
455 return &Itr;
456
457 return nullptr;
458 }
459
460 // Return the pointer to the target entries table
461 __tgt_target_table *getOffloadEntriesTable(const int DeviceId) {
462 FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
463
464 if (E.Entries.empty())
465 return nullptr;
466
467 // Update table info according to the entries and return the pointer
468 E.Table.EntriesBegin = E.Entries.data();
469 E.Table.EntriesEnd = E.Entries.data() + E.Entries.size();
470
471 return &E.Table;
472 }
473
474 // Clear entries table for a device
475 void clearOffloadEntriesTable(const int DeviceId) {
476 DeviceData[DeviceId].FuncGblEntries.emplace_back();
477 FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
478 E.Entries.clear();
479 E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr;
480 }
481
482 public:
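/// Return the CUDA stream associated with \p AsyncInfo, lazily acquiring one
/// from the per-device stream pool on first use.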
483 CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfo) const {
484 assert(AsyncInfo && "AsyncInfo is nullptr");
485
486 if (!AsyncInfo->Queue) {
487 CUstream S;
488 if (StreamPool[DeviceId]->acquire(S) != OFFLOAD_SUCCESS)
489 return nullptr;
490
491 AsyncInfo->Queue = S;
492 }
493
494 return reinterpret_cast<CUstream>(AsyncInfo->Queue);
495 }
496
497 // This class should not be copied
498 DeviceRTLTy(const DeviceRTLTy &) = delete;
499 DeviceRTLTy(DeviceRTLTy &&) = delete;
500
501 DeviceRTLTy()
502 : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1),
503 EnvTeamThreadLimit(-1), RequiresFlags(OMP_REQ_UNDEFINED),
504 DynamicMemorySize(0) {
505
506 DP("Start initializing CUDA\n");
507
508 CUresult Err = cuInit(0);
509 if (Err == CUDA_ERROR_INVALID_HANDLE) {
510 // Can't call cuGetErrorString if dlsym failed
511 DP("Failed to load CUDA shared library\n");
512 return;
513 }
514 if (Err == CUDA_ERROR_NO_DEVICE) {
515 DP("There are no devices supporting CUDA.\n");
516 return;
517 }
518 if (!checkResult(Err, "Error returned from cuInit\n")) {
519 return;
520 }
521
522 Err = cuDeviceGetCount(&NumberOfDevices);
523 if (!checkResult(Err, "Error returned from cuDeviceGetCount\n"))
524 return;
525
526 if (NumberOfDevices == 0) {
527 DP("There are no devices supporting CUDA.\n");
528 return;
529 }
530
531 DeviceData.resize(NumberOfDevices);
532 Modules.resize(NumberOfDevices);
533 StreamPool.resize(NumberOfDevices);
534 EventPool.resize(NumberOfDevices);
535 PeerAccessMatrix.resize(NumberOfDevices);
536 for (auto &V : PeerAccessMatrix)
537 V.resize(NumberOfDevices, PeerAccessState::Unknown);
538
539 // Get environment variables regarding teams
540 if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) {
541 // OMP_TEAM_LIMIT has been set
542 EnvTeamLimit = std::stoi(EnvStr);
543 DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
544 }
545 if (const char *EnvStr = getenv("OMP_TEAMS_THREAD_LIMIT")) {
546 // OMP_TEAMS_THREAD_LIMIT has been set
547 EnvTeamThreadLimit = std::stoi(EnvStr);
548 DP("Parsed OMP_TEAMS_THREAD_LIMIT=%d\n", EnvTeamThreadLimit);
549 }
550 if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) {
551 // OMP_NUM_TEAMS has been set
552 EnvNumTeams = std::stoi(EnvStr);
553 DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
554 }
555 if (const char *EnvStr = getenv("LIBOMPTARGET_SHARED_MEMORY_SIZE")) {
556 // LIBOMPTARGET_SHARED_MEMORY_SIZE has been set
557 DynamicMemorySize = std::stoi(EnvStr);
558 DP("Parsed LIBOMPTARGET_SHARED_MEMORY_SIZE = %" PRIu64 "\n",
559 DynamicMemorySize);
560 }
561 if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS")) {
562 // LIBOMPTARGET_NUM_INITIAL_STREAMS has been set
563 NumInitialStreams = std::stoi(EnvStr);
564 DP("Parsed LIBOMPTARGET_NUM_INITIAL_STREAMS=%d\n", NumInitialStreams);
565 }
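// Illustrative example (assumption: a typical shell invocation), e.g.
//   OMP_NUM_TEAMS=256 LIBOMPTARGET_SHARED_MEMORY_SIZE=4096 ./app
// would set EnvNumTeams to 256 and DynamicMemorySize to 4096 here.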
566
567 for (int I = 0; I < NumberOfDevices; ++I)
568 DeviceAllocators.emplace_back();
569
570 // Get the size threshold from environment variable
571 std::pair<size_t, bool> Res = MemoryManagerTy::getSizeThresholdFromEnv();
572 UseMemoryManager = Res.second;
573 size_t MemoryManagerThreshold = Res.first;
574
575 if (UseMemoryManager)
576 for (int I = 0; I < NumberOfDevices; ++I)
577 MemoryManagers.emplace_back(std::make_unique<MemoryManagerTy>(
578 DeviceAllocators[I], MemoryManagerThreshold));
579
580 // We lazily initialize all devices later.
581 InitializedFlags.assign(NumberOfDevices, false);
582 }
583
584 ~DeviceRTLTy() {
585 for (int DeviceId = 0; DeviceId < NumberOfDevices; ++DeviceId)
586 deinitDevice(DeviceId);
587 }
588
589 // Check whether a given DeviceId is valid
590 bool isValidDeviceId(const int DeviceId) const {
591 return DeviceId >= 0 && DeviceId < NumberOfDevices;
592 }
593
594 int getNumOfDevices() const { return NumberOfDevices; }
595
596 void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; }
597
598 int initDevice(const int DeviceId) {
599 CUdevice Device;
600
601 DP("Getting device %d\n", DeviceId);
602 CUresult Err = cuDeviceGet(&Device, DeviceId);
603 if (!checkResult(Err, "Error returned from cuDeviceGet\n"))
604 return OFFLOAD_FAIL;
605
606 assert(InitializedFlags[DeviceId] == false && "Reinitializing device!");
607 InitializedFlags[DeviceId] = true;
608
609 // Query the current flags of the primary context and set its flags if
610 // it is inactive
611 unsigned int FormerPrimaryCtxFlags = 0;
612 int FormerPrimaryCtxIsActive = 0;
613 Err = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags,
614 &FormerPrimaryCtxIsActive);
615 if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxGetState\n"))
616 return OFFLOAD_FAIL;
617
618 if (FormerPrimaryCtxIsActive) {
619 DP("The primary context is active, no change to its flags\n");
620 if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) !=
621 CU_CTX_SCHED_BLOCKING_SYNC)
622 DP("Warning the current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n");
623 } else {
624 DP("The primary context is inactive, set its flags to "
625 "CU_CTX_SCHED_BLOCKING_SYNC\n");
626 Err = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC);
627 if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxSetFlags\n"))
628 return OFFLOAD_FAIL;
629 }
630
631 // Retain the per device primary context and save it to use whenever this
632 // device is selected.
633 Err = cuDevicePrimaryCtxRetain(&DeviceData[DeviceId].Context, Device);
634 if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxRetain\n"))
635 return OFFLOAD_FAIL;
636
637 Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
638 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
639 return OFFLOAD_FAIL;
640
641 // Initialize the stream pool.
642 if (!StreamPool[DeviceId])
643 StreamPool[DeviceId] = std::make_unique<StreamPoolTy>(StreamAllocatorTy(),
644 NumInitialStreams);
645
646 // Initialize the event pool.
647 if (!EventPool[DeviceId])
648 EventPool[DeviceId] =
649 std::make_unique<EventPoolTy>(EventAllocatorTy(), NumInitialEvents);
650
651 // Query attributes to determine number of threads/block and blocks/grid.
652 int MaxGridDimX;
653 Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
654 Device);
655 if (Err != CUDA_SUCCESS) {
656 DP("Error getting max grid dimension, use default value %d\n",
657 DeviceRTLTy::DefaultNumTeams);
658 DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::DefaultNumTeams;
659 } else {
660 DP("Using %d CUDA blocks per grid\n", MaxGridDimX);
661 DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX;
662 }
663
664 // We are only exploiting threads along the x axis.
665 int MaxBlockDimX;
666 Err = cuDeviceGetAttribute(&MaxBlockDimX,
667 CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device);
668 if (Err != CUDA_SUCCESS) {
669 DP("Error getting max block dimension, use default value %d\n",
670 DeviceRTLTy::DefaultNumThreads);
671 DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads;
672 } else {
673 DP("Using %d CUDA threads per block\n", MaxBlockDimX);
674 DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX;
675
676 if (EnvTeamThreadLimit > 0 &&
677 DeviceData[DeviceId].ThreadsPerBlock > EnvTeamThreadLimit) {
678 DP("Max CUDA threads per block %d exceeds the thread limit %d set by "
679 "OMP_TEAMS_THREAD_LIMIT, capping at the limit\n",
680 DeviceData[DeviceId].ThreadsPerBlock, EnvTeamThreadLimit);
681 DeviceData[DeviceId].ThreadsPerBlock = EnvTeamThreadLimit;
682 }
683 if (DeviceData[DeviceId].ThreadsPerBlock > DeviceRTLTy::HardThreadLimit) {
684 DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
685 "capping at the hard limit\n",
686 DeviceData[DeviceId].ThreadsPerBlock, DeviceRTLTy::HardThreadLimit);
687 DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit;
688 }
689 }
690
691 // Get and set warp size
692 int WarpSize;
693 Err =
694 cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device);
695 if (Err != CUDA_SUCCESS) {
696 DP("Error getting warp size, assume default value 32\n");
697 DeviceData[DeviceId].WarpSize = 32;
698 } else {
699 DP("Using warp size %d\n", WarpSize);
700 DeviceData[DeviceId].WarpSize = WarpSize;
701 }
702
703 // Adjust teams to the env variables
704 if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) {
705 DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
706 EnvTeamLimit);
707 DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit;
708 }
709
710 size_t StackLimit;
711 size_t HeapLimit;
712 if (const char *EnvStr = getenv("LIBOMPTARGET_STACK_SIZE")) {
713 StackLimit = std::stol(EnvStr);
714 if (cuCtxSetLimit(CU_LIMIT_STACK_SIZE, StackLimit) != CUDA_SUCCESS)
715 return OFFLOAD_FAIL;
716 } else {
717 if (cuCtxGetLimit(&StackLimit, CU_LIMIT_STACK_SIZE) != CUDA_SUCCESS)
718 return OFFLOAD_FAIL;
719 }
720 if (const char *EnvStr = getenv("LIBOMPTARGET_HEAP_SIZE")) {
721 HeapLimit = std::stol(EnvStr);
722 if (cuCtxSetLimit(CU_LIMIT_MALLOC_HEAP_SIZE, HeapLimit) != CUDA_SUCCESS)
723 return OFFLOAD_FAIL;
724 } else {
725 if (cuCtxGetLimit(&HeapLimit, CU_LIMIT_MALLOC_HEAP_SIZE) != CUDA_SUCCESS)
726 return OFFLOAD_FAIL;
727 }
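// For example (illustrative; CU_LIMIT_STACK_SIZE is a per-thread size in
// bytes), LIBOMPTARGET_STACK_SIZE=8192 requests an 8 KiB per-thread stack via
// cuCtxSetLimit(CU_LIMIT_STACK_SIZE, ...).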
728
729 INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
730 "Device supports up to %d CUDA blocks and %d threads with a "
731 "warp size of %d\n",
732 DeviceData[DeviceId].BlocksPerGrid,
733 DeviceData[DeviceId].ThreadsPerBlock, DeviceData[DeviceId].WarpSize);
734 INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
735 "Device heap size is %d Bytes, device stack size is %d Bytes per "
736 "thread\n",
737 (int)HeapLimit, (int)StackLimit);
738
739 // Set default number of teams
740 if (EnvNumTeams > 0) {
741 DP("Default number of teams set according to environment %d\n",
742 EnvNumTeams);
743 DeviceData[DeviceId].NumTeams = EnvNumTeams;
744 } else {
745 DeviceData[DeviceId].NumTeams = DeviceRTLTy::DefaultNumTeams;
746 DP("Default number of teams set according to library's default %d\n",
747 DeviceRTLTy::DefaultNumTeams);
748 }
749
750 if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) {
751 DP("Default number of teams exceeds device limit, capping at %d\n",
752 DeviceData[DeviceId].BlocksPerGrid);
753 DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid;
754 }
755
756 // Set default number of threads
757 DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads;
758 DP("Default number of threads set according to library's default %d\n",
759 DeviceRTLTy::DefaultNumThreads);
760 if (DeviceData[DeviceId].NumThreads >
761 DeviceData[DeviceId].ThreadsPerBlock) {
762 DP("Default number of threads exceeds device limit, capping at %d\n",
763 DeviceData[DeviceId].ThreadsPerBlock);
764 DeviceData[DeviceId].NumThreads = DeviceData[DeviceId].ThreadsPerBlock;
765 }
766
767 return OFFLOAD_SUCCESS;
768 }
769
770 int deinitDevice(const int DeviceId) {
771 auto IsInitialized = InitializedFlags[DeviceId];
772 if (!IsInitialized)
773 return OFFLOAD_SUCCESS;
774 InitializedFlags[DeviceId] = false;
775
776 if (UseMemoryManager)
777 MemoryManagers[DeviceId].release();
778
779 StreamPool[DeviceId].reset();
780 EventPool[DeviceId].reset();
781
782 DeviceDataTy &D = DeviceData[DeviceId];
783 if (!checkResult(cuCtxSetCurrent(D.Context),
784 "Error returned from cuCtxSetCurrent\n"))
785 return OFFLOAD_FAIL;
786
787 // Unload all modules.
788 for (auto &M : Modules[DeviceId])
789 if (!checkResult(cuModuleUnload(M),
790 "Error returned from cuModuleUnload\n"))
791 return OFFLOAD_FAIL;
792
793 // Destroy context.
794 CUdevice Device;
795 if (!checkResult(cuCtxGetDevice(&Device),
796 "Error returned from cuCtxGetDevice\n"))
797 return OFFLOAD_FAIL;
798
799 if (!checkResult(cuDevicePrimaryCtxRelease(Device),
800 "Error returned from cuDevicePrimaryCtxRelease\n"))
801 return OFFLOAD_FAIL;
802
803 return OFFLOAD_SUCCESS;
804 }
805
806 __tgt_target_table *loadBinary(const int DeviceId,
807 const __tgt_device_image *Image) {
808 // Clear the offload table as we are going to create a new one.
809 clearOffloadEntriesTable(DeviceId);
810
811 // Create the module and extract the function pointers.
812 CUmodule Module;
813 DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart));
814 CUresult Err =
815 cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr);
816 if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n"))
817 return nullptr;
818
819 DP("CUDA module successfully loaded!\n");
820
821 Modules[DeviceId].push_back(Module);
822
823 // Find the symbols in the module by name.
824 const __tgt_offload_entry *HostBegin = Image->EntriesBegin;
825 const __tgt_offload_entry *HostEnd = Image->EntriesEnd;
826
827 std::list<KernelTy> &KernelsList = DeviceData[DeviceId].KernelsList;
828 for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) {
829 if (!E->addr) {
830 // We return nullptr when something like this happens; the host should
831 // always have something in the address to uniquely identify the target
832 // region.
833 DP("Invalid binary: host entry '<null>' (size = %zd)...\n", E->size);
834 return nullptr;
835 }
836
837 if (E->size) {
838 __tgt_offload_entry Entry = *E;
839 CUdeviceptr CUPtr;
840 size_t CUSize;
841 Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name);
842 // We keep this style here because we need the name
843 if (Err != CUDA_SUCCESS) {
844 REPORT("Loading global '%s' Failed\n", E->name);
845 CUDA_ERR_STRING(Err);
846 return nullptr;
847 }
848
849 if (CUSize != E->size) {
850 DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name,
851 CUSize, E->size);
852 return nullptr;
853 }
854
855 DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
856 DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr));
857
858 Entry.addr = (void *)(CUPtr);
859
860 // Note: In the current implementation declare target variables
861 // can either be link or to. This means that once unified
862 // memory is activated via the requires directive, the variable
863 // can be used directly from the host in both cases.
864 // TODO: when variables types other than to or link are added,
865 // the below condition should be changed to explicitly
866 // check for to and link variables types:
867 // (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags &
868 // OMP_DECLARE_TARGET_LINK || e->flags == OMP_DECLARE_TARGET_TO))
869 if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
870 // If unified memory is present any target link or to variables
871 // can access host addresses directly. There is no longer a
872 // need for device copies.
873 cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *));
874 DP("Copy linked variable host address (" DPxMOD
875 ") to device address (" DPxMOD ")\n",
876 DPxPTR(*((void **)E->addr)), DPxPTR(CUPtr));
877 }
878
879 addOffloadEntry(DeviceId, Entry);
880
881 continue;
882 }
883
884 CUfunction Func;
885 Err = cuModuleGetFunction(&Func, Module, E->name);
886 // We keep this style here because we need the name
887 if (Err != CUDA_SUCCESS) {
888 REPORT("Loading '%s' Failed\n", E->name);
889 CUDA_ERR_STRING(Err);
890 return nullptr;
891 }
892
893 DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
894 DPxPTR(E - HostBegin), E->name, DPxPTR(Func));
895
896 // default value GENERIC (in case symbol is missing from cubin file)
897 llvm::omp::OMPTgtExecModeFlags ExecModeVal = llvm::omp::OMP_TGT_EXEC_MODE_GENERIC;
898 std::string ExecModeNameStr(E->name);
899 ExecModeNameStr += "_exec_mode";
900 const char *ExecModeName = ExecModeNameStr.c_str();
901
902 CUdeviceptr ExecModePtr;
903 size_t CUSize;
904 Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName);
905 if (Err == CUDA_SUCCESS) {
906 if (CUSize != sizeof(llvm::omp::OMPTgtExecModeFlags)) {
907 DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
908 ExecModeName, CUSize, sizeof(llvm::omp::OMPTgtExecModeFlags));
909 return nullptr;
910 }
911
912 Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize);
913 if (Err != CUDA_SUCCESS) {
914 REPORT("Error when copying data from device to host. Pointers: "
915 "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
916 DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize);
917 CUDA_ERR_STRING(Err);
918 return nullptr;
919 }
920 } else {
921 DP("Loading global exec_mode '%s' - symbol missing, using default "
922 "value GENERIC (1)\n",
923 ExecModeName);
924 }
925
926 KernelsList.emplace_back(Func, ExecModeVal);
927
928 __tgt_offload_entry Entry = *E;
929 Entry.addr = &KernelsList.back();
930 addOffloadEntry(DeviceId, Entry);
931 }
932
933 // send device environment data to the device
934 {
935 // TODO: The device ID used here is not the real device ID used by OpenMP.
936 DeviceEnvironmentTy DeviceEnv{0, static_cast<uint32_t>(NumberOfDevices),
937 static_cast<uint32_t>(DeviceId),
938 static_cast<uint32_t>(DynamicMemorySize)};
939
940 if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG"))
941 DeviceEnv.DebugKind = std::stoi(EnvStr);
942
943 const char *DeviceEnvName = "omptarget_device_environment";
944 CUdeviceptr DeviceEnvPtr;
945 size_t CUSize;
946
947 Err = cuModuleGetGlobal(&DeviceEnvPtr, &CUSize, Module, DeviceEnvName);
948 if (Err == CUDA_SUCCESS) {
949 if (CUSize != sizeof(DeviceEnv)) {
950 REPORT(
951 "Global device_environment '%s' - size mismatch (%zu != %zu)\n",
952 DeviceEnvName, CUSize, sizeof(DeviceEnv));
953 CUDA_ERR_STRING(Err);
954 return nullptr;
955 }
956
957 Err = cuMemcpyHtoD(DeviceEnvPtr, &DeviceEnv, CUSize);
958 if (Err != CUDA_SUCCESS) {
959 REPORT("Error when copying data from host to device. Pointers: "
960 "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
961 DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize);
962 CUDA_ERR_STRING(Err);
963 return nullptr;
964 }
965
966 DP("Sending global device environment data %zu bytes\n", CUSize);
967 } else {
968 DP("Finding global device environment '%s' - symbol missing.\n",
969 DeviceEnvName);
970 DP("Continue, considering this is a device RTL which does not accept "
971 "environment setting.\n");
972 }
973 }
974
975 return getOffloadEntriesTable(DeviceId);
976 }
977
978 void *dataAlloc(const int DeviceId, const int64_t Size,
979 const TargetAllocTy Kind) {
980 switch (Kind) {
981 case TARGET_ALLOC_DEFAULT:
982 case TARGET_ALLOC_DEVICE:
983 if (UseMemoryManager)
984 return MemoryManagers[DeviceId]->allocate(Size, nullptr);
985 else
986 return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
987 case TARGET_ALLOC_HOST:
988 case TARGET_ALLOC_SHARED:
989 return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
990 }
991
992 REPORT("Invalid target data allocation kind or requested allocator not "
993 "implemented yet\n");
994
995 return nullptr;
996 }
997
998 int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr,
999 const int64_t Size, __tgt_async_info *AsyncInfo) const {
1000 assert(AsyncInfo && "AsyncInfo is nullptr");
1001
1002 CUstream Stream = getStream(DeviceId, AsyncInfo);
1003 CUresult Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream);
1004 if (Err != CUDA_SUCCESS) {
1005 DP("Error when copying data from host to device. Pointers: host "
1006 "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
1007 DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
1008 CUDA_ERR_STRING(Err);
1009 return OFFLOAD_FAIL;
1010 }
1011
1012 return OFFLOAD_SUCCESS;
1013 }
1014
1015 int dataRetrieve(const int DeviceId, void *HstPtr, const void *TgtPtr,
1016 const int64_t Size, __tgt_async_info *AsyncInfo) const {
1017 assert(AsyncInfo && "AsyncInfo is nullptr");
1018
1019 CUstream Stream = getStream(DeviceId, AsyncInfo);
1020 CUresult Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
1021 if (Err != CUDA_SUCCESS) {
1022 DP("Error when copying data from device to host. Pointers: host "
1023 "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
1024 DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
1025 CUDA_ERR_STRING(Err);
1026 return OFFLOAD_FAIL;
1027 }
1028
1029 return OFFLOAD_SUCCESS;
1030 }
1031
1032 int dataExchange(int SrcDevId, const void *SrcPtr, int DstDevId, void *DstPtr,
1033 int64_t Size, __tgt_async_info *AsyncInfo) {
1034 assert(AsyncInfo && "AsyncInfo is nullptr");
1035
1036 CUresult Err;
1037 CUstream Stream = getStream(SrcDevId, AsyncInfo);
1038
1039 // If source and destination are different devices, try a peer-to-peer copy first
1040 if (SrcDevId != DstDevId) {
1041 std::lock_guard<std::mutex> LG(PeerAccessMatrixLock);
1042
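// PeerAccessMatrix caches, per (Src, Dst) device pair, whether peer access
// has already been enabled, so the CUDA queries below run at most once.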
1043 switch (PeerAccessMatrix[SrcDevId][DstDevId]) {
1044 case PeerAccessState::No: {
1045 REPORT("Peer access from %" PRId32 " to %" PRId32
1046 " is not supported. Fall back to D2D memcpy.\n",
1047 SrcDevId, DstDevId);
1048 return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
1049 }
1050 case PeerAccessState::Unknown: {
1051 int CanAccessPeer = 0;
1052 Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId);
1053 if (Err != CUDA_SUCCESS) {
1054 REPORT("Error returned from cuDeviceCanAccessPeer. src = %" PRId32
1055 ", dst = %" PRId32 ". Fall back to D2D memcpy.\n",
1056 SrcDevId, DstDevId);
1057 CUDA_ERR_STRING(Err);
1058 PeerAccessMatrix[SrcDevId][DstDevId] = PeerAccessState::No;
1059 return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
1060 }
1061
1062 if (!CanAccessPeer) {
1063 REPORT("P2P access from %d to %d is not supported. Fall back to D2D "
1064 "memcpy.\n",
1065 SrcDevId, DstDevId);
1066 PeerAccessMatrix[SrcDevId][DstDevId] = PeerAccessState::No;
1067 return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
1068 }
1069
1070 Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0);
1071 if (Err != CUDA_SUCCESS) {
1072 REPORT("Error returned from cuCtxEnablePeerAccess. src = %" PRId32
1073 ", dst = %" PRId32 ". Fall back to D2D memcpy.\n",
1074 SrcDevId, DstDevId);
1075 CUDA_ERR_STRING(Err);
1076 PeerAccessMatrix[SrcDevId][DstDevId] = PeerAccessState::No;
1077 return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
1078 }
1079
1080 PeerAccessMatrix[SrcDevId][DstDevId] = PeerAccessState::Yes;
1081
1082 LLVM_FALLTHROUGH;
1083 }
1084 case PeerAccessState::Yes: {
1085 Err = cuMemcpyPeerAsync(
1086 (CUdeviceptr)DstPtr, DeviceData[DstDevId].Context,
1087 (CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context, Size, Stream);
1088 if (Err == CUDA_SUCCESS)
1089 return OFFLOAD_SUCCESS;
1090
1091 DP("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD
1092 ", src_id =%" PRId32 ", dst_ptr = " DPxMOD ", dst_id =%" PRId32
1093 ". Fall back to D2D memcpy.\n",
1094 DPxPTR(SrcPtr), SrcDevId, DPxPTR(DstPtr), DstDevId);
1095 CUDA_ERR_STRING(Err);
1096
1097 return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
1098 }
1099 }
1100 }
1101
1102 return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
1103 }
1104
1105 int dataDelete(const int DeviceId, void *TgtPtr) {
1106 if (UseMemoryManager)
1107 return MemoryManagers[DeviceId]->free(TgtPtr);
1108
1109 return DeviceAllocators[DeviceId].free(TgtPtr);
1110 }
1111
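/// Launch the kernel behind \p TgtEntryPtr. The number of blocks and threads
/// is derived from the requested team/thread counts, the kernel execution
/// mode, the loop trip count, and the device and kernel limits.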
1112 int runTargetTeamRegion(const int DeviceId, void *TgtEntryPtr, void **TgtArgs,
1113 ptrdiff_t *TgtOffsets, const int ArgNum,
1114 const int TeamNum, const int ThreadLimit,
1115 const unsigned int LoopTripCount,
1116 __tgt_async_info *AsyncInfo) const {
1117 // All args are references.
1118 std::vector<void *> Args(ArgNum);
1119 std::vector<void *> Ptrs(ArgNum);
1120
1121 for (int I = 0; I < ArgNum; ++I) {
1122 Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
1123 Args[I] = &Ptrs[I];
1124 }
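// cuLaunchKernel expects an array of pointers, one per kernel parameter;
// Ptrs holds each argument value (target pointer plus offset) and Args holds
// the address of each entry in Ptrs.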
1125
1126 KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr);
1127
1128 const bool IsSPMDGenericMode =
1129 KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD;
1130 const bool IsSPMDMode =
1131 KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_SPMD;
1132 const bool IsGenericMode =
1133 KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC;
1134
1135 int CudaThreadsPerBlock;
1136 if (ThreadLimit > 0) {
1137 DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
1138 CudaThreadsPerBlock = ThreadLimit;
1139 // Add master warp if necessary
1140 if (IsGenericMode) {
1141 DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize);
1142 CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
1143 }
1144 } else {
1145 DP("Setting CUDA threads per block to default %d\n",
1146 DeviceData[DeviceId].NumThreads);
1147 CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads;
1148 }
1149
1150 if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) {
1151 DP("Threads per block capped at device limit %d\n",
1152 DeviceData[DeviceId].ThreadsPerBlock);
1153 CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock;
1154 }
1155
1156 CUresult Err;
1157 if (!KernelInfo->MaxThreadsPerBlock) {
1158 Err = cuFuncGetAttribute(&KernelInfo->MaxThreadsPerBlock,
1159 CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
1160 KernelInfo->Func);
1161 if (!checkResult(Err, "Error returned from cuFuncGetAttribute\n"))
1162 return OFFLOAD_FAIL;
1163 }
1164
1165 if (KernelInfo->MaxThreadsPerBlock < CudaThreadsPerBlock) {
1166 DP("Threads per block capped at kernel limit %d\n",
1167 KernelInfo->MaxThreadsPerBlock);
1168 CudaThreadsPerBlock = KernelInfo->MaxThreadsPerBlock;
1169 }
1170
1171 unsigned int CudaBlocksPerGrid;
1172 if (TeamNum <= 0) {
1173 if (LoopTripCount > 0 && EnvNumTeams < 0) {
1174 if (IsSPMDGenericMode) {
1175 // If we reach this point, then we are executing a kernel that was
1176 // transformed from Generic-mode to SPMD-mode. This kernel has
1177 // SPMD-mode execution, but needs its blocks to be scheduled
1178 // differently because the current loop trip count only applies to the
1179 // `teams distribute` region and will create far too few blocks using
1180 // the regular SPMD-mode method.
1181 CudaBlocksPerGrid = LoopTripCount;
1182 } else if (IsSPMDMode) {
1183 // We have a combined construct, i.e. `target teams distribute
1184 // parallel for [simd]`. We launch as many teams as needed so that each
1185 // thread executes one iteration of the loop, rounding up to the nearest
1186 // integer.
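// For example (illustrative numbers), LoopTripCount = 1000 with
// CudaThreadsPerBlock = 128 yields ((1000 - 1) / 128) + 1 = 8 blocks.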
1187 CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1;
1188 } else if (IsGenericMode) {
1189 // If we reach this point, then we have a non-combined construct, i.e.
1190 // `teams distribute` with a nested `parallel for` and each team is
1191 // assigned one iteration of the `distribute` loop. E.g.:
1192 //
1193 // #pragma omp target teams distribute
1194 // for(...loop_tripcount...) {
1195 // #pragma omp parallel for
1196 // for(...) {}
1197 // }
1198 //
1199 // Threads within a team will execute the iterations of the `parallel`
1200 // loop.
1201 CudaBlocksPerGrid = LoopTripCount;
1202 } else {
1203 REPORT("Unknown execution mode: %d\n",
1204 static_cast<int8_t>(KernelInfo->ExecutionMode));
1205 return OFFLOAD_FAIL;
1206 }
1207 DP("Using %d teams due to loop trip count %" PRIu32
1208 " and number of threads per block %d\n",
1209 CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock);
1210 } else {
1211 DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams);
1212 CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams;
1213 }
1214 } else {
1215 DP("Using requested number of teams %d\n", TeamNum);
1216 CudaBlocksPerGrid = TeamNum;
1217 }
1218
1219 if (CudaBlocksPerGrid > DeviceData[DeviceId].BlocksPerGrid) {
1220 DP("Capping number of teams to team limit %d\n",
1221 DeviceData[DeviceId].BlocksPerGrid);
1222 CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid;
1223 }
1224
1225 INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
1226 "Launching kernel %s with %d blocks and %d threads in %s mode\n",
1227 (getOffloadEntry(DeviceId, TgtEntryPtr))
1228 ? getOffloadEntry(DeviceId, TgtEntryPtr)->name
1229 : "(null)",
1230 CudaBlocksPerGrid, CudaThreadsPerBlock,
1231 (!IsSPMDMode ? (IsGenericMode ? "Generic" : "SPMD-Generic") : "SPMD"));
1232
1233 CUstream Stream = getStream(DeviceId, AsyncInfo);
1234 Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1,
1235 /* gridDimZ */ 1, CudaThreadsPerBlock,
1236 /* blockDimY */ 1, /* blockDimZ */ 1,
1237 DynamicMemorySize, Stream, &Args[0], nullptr);
1238 if (!checkResult(Err, "Error returned from cuLaunchKernel\n"))
1239 return OFFLOAD_FAIL;
1240
1241 DP("Launch of entry point at " DPxMOD " successful!\n",
1242 DPxPTR(TgtEntryPtr));
1243
1244 return OFFLOAD_SUCCESS;
1245 }
1246
1247 int synchronize(const int DeviceId, __tgt_async_info *AsyncInfo) const {
1248 CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue);
1249 CUresult Err = cuStreamSynchronize(Stream);
1250
1251 // Once the stream is synchronized, return it to stream pool and reset
1252 // AsyncInfo. This is to make sure the synchronization only works for its
1253 // own tasks.
1254 StreamPool[DeviceId]->release(reinterpret_cast<CUstream>(AsyncInfo->Queue));
1255 AsyncInfo->Queue = nullptr;
1256
1257 if (Err != CUDA_SUCCESS) {
1258 DP("Error when synchronizing stream. stream = " DPxMOD
1259 ", async info ptr = " DPxMOD "\n",
1260 DPxPTR(Stream), DPxPTR(AsyncInfo));
1261 CUDA_ERR_STRING(Err);
1262 }
1263 return (Err == CUDA_SUCCESS) ? OFFLOAD_SUCCESS : OFFLOAD_FAIL;
1264 }
1265
1266 void printDeviceInfo(int32_t DeviceId) {
1267 char TmpChar[1000];
1268 std::string TmpStr;
1269 size_t TmpSt;
1270 int TmpInt, TmpInt2, TmpInt3;
1271
1272 CUdevice Device;
1273 checkResult(cuDeviceGet(&Device, DeviceId),
1274 "Error returned from cuCtxGetDevice\n");
1275
1276 cuDriverGetVersion(&TmpInt);
1277 printf(" CUDA Driver Version: \t\t%d \n", TmpInt);
1278 printf(" CUDA Device Number: \t\t%d \n", DeviceId);
1279 checkResult(cuDeviceGetName(TmpChar, 1000, Device),
1280 "Error returned from cuDeviceGetName\n");
1281 printf(" Device Name: \t\t\t%s \n", TmpChar);
1282 checkResult(cuDeviceTotalMem(&TmpSt, Device),
1283 "Error returned from cuDeviceTotalMem\n");
1284 printf(" Global Memory Size: \t\t%zu bytes \n", TmpSt);
1285 checkResult(cuDeviceGetAttribute(
1286 &TmpInt, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, Device),
1287 "Error returned from cuDeviceGetAttribute\n");
1288 printf(" Number of Multiprocessors: \t\t%d \n", TmpInt);
1289 checkResult(
1290 cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, Device),
1291 "Error returned from cuDeviceGetAttribute\n");
1292 printf(" Concurrent Copy and Execution: \t%s \n", BOOL2TEXT(TmpInt));
1293 checkResult(cuDeviceGetAttribute(
1294 &TmpInt, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, Device),
1295 "Error returned from cuDeviceGetAttribute\n");
1296 printf(" Total Constant Memory: \t\t%d bytes\n", TmpInt);
1297 checkResult(
1298 cuDeviceGetAttribute(
1299 &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, Device),
1300 "Error returned from cuDeviceGetAttribute\n");
1301 printf(" Max Shared Memory per Block: \t%d bytes \n", TmpInt);
1302 checkResult(
1303 cuDeviceGetAttribute(
1304 &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, Device),
1305 "Error returned from cuDeviceGetAttribute\n");
1306 printf(" Registers per Block: \t\t%d \n", TmpInt);
1307 checkResult(
1308 cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device),
1309 "Error returned from cuDeviceGetAttribute\n");
1310 printf(" Warp Size: \t\t\t\t%d Threads \n", TmpInt);
1311 checkResult(cuDeviceGetAttribute(
1312 &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, Device),
1313 "Error returned from cuDeviceGetAttribute\n");
1314 printf(" Maximum Threads per Block: \t\t%d \n", TmpInt);
1315 checkResult(cuDeviceGetAttribute(
1316 &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device),
1317 "Error returned from cuDeviceGetAttribute\n");
1318 checkResult(cuDeviceGetAttribute(
1319 &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, Device),
1320 "Error returned from cuDeviceGetAttribute\n");
1321 checkResult(cuDeviceGetAttribute(
1322 &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, Device),
1323 "Error returned from cuDeviceGetAttribute\n");
1324 printf(" Maximum Block Dimensions: \t\t%d, %d, %d \n", TmpInt, TmpInt2,
1325 TmpInt3);
1326 checkResult(cuDeviceGetAttribute(
1327 &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, Device),
1328 "Error returned from cuDeviceGetAttribute\n");
1329 checkResult(cuDeviceGetAttribute(
1330 &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, Device),
1331 "Error returned from cuDeviceGetAttribute\n");
1332 checkResult(cuDeviceGetAttribute(
1333 &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, Device),
1334 "Error returned from cuDeviceGetAttribute\n");
1335 printf(" Maximum Grid Dimensions: \t\t%d x %d x %d \n", TmpInt, TmpInt2,
1336 TmpInt3);
1337 checkResult(
1338 cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_PITCH, Device),
1339 "Error returned from cuDeviceGetAttribute\n");
1340 printf(" Maximum Memory Pitch: \t\t%d bytes \n", TmpInt);
1341 checkResult(cuDeviceGetAttribute(
1342 &TmpInt, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, Device),
1343 "Error returned from cuDeviceGetAttribute\n");
1344 printf(" Texture Alignment: \t\t\t%d bytes \n", TmpInt);
1345 checkResult(
1346 cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, Device),
1347 "Error returned from cuDeviceGetAttribute\n");
1348 printf(" Clock Rate: \t\t\t%d kHz\n", TmpInt);
1349 checkResult(cuDeviceGetAttribute(
1350 &TmpInt, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, Device),
1351 "Error returned from cuDeviceGetAttribute\n");
1352 printf(" Execution Timeout: \t\t\t%s \n", BOOL2TEXT(TmpInt));
1353 checkResult(
1354 cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_INTEGRATED, Device),
1355 "Error returned from cuDeviceGetAttribute\n");
1356 printf(" Integrated Device: \t\t\t%s \n", BOOL2TEXT(TmpInt));
1357 checkResult(cuDeviceGetAttribute(
1358 &TmpInt, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, Device),
1359 "Error returned from cuDeviceGetAttribute\n");
1360 printf(" Can Map Host Memory: \t\t%s \n", BOOL2TEXT(TmpInt));
1361 checkResult(
1362 cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, Device),
1363 "Error returned from cuDeviceGetAttribute\n");
1364 if (TmpInt == CU_COMPUTEMODE_DEFAULT)
1365 TmpStr = "DEFAULT";
1366 else if (TmpInt == CU_COMPUTEMODE_PROHIBITED)
1367 TmpStr = "PROHIBITED";
1368 else if (TmpInt == CU_COMPUTEMODE_EXCLUSIVE_PROCESS)
1369 TmpStr = "EXCLUSIVE PROCESS";
1370 else
1371 TmpStr = "unknown";
1372 printf(" Compute Mode: \t\t\t%s \n", TmpStr.c_str());
1373 checkResult(cuDeviceGetAttribute(
1374 &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, Device),
1375 "Error returned from cuDeviceGetAttribute\n");
1376 printf(" Concurrent Kernels: \t\t%s \n", BOOL2TEXT(TmpInt));
1377 checkResult(
1378 cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, Device),
1379 "Error returned from cuDeviceGetAttribute\n");
1380 printf(" ECC Enabled: \t\t\t%s \n", BOOL2TEXT(TmpInt));
1381 checkResult(cuDeviceGetAttribute(
1382 &TmpInt, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, Device),
1383 "Error returned from cuDeviceGetAttribute\n");
1384 printf(" Memory Clock Rate: \t\t\t%d kHz\n", TmpInt);
1385 checkResult(
1386 cuDeviceGetAttribute(
1387 &TmpInt, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, Device),
1388 "Error returned from cuDeviceGetAttribute\n");
1389 printf(" Memory Bus Width: \t\t\t%d bits\n", TmpInt);
1390 checkResult(cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE,
1391 Device),
1392 "Error returned from cuDeviceGetAttribute\n");
1393 printf(" L2 Cache Size: \t\t\t%d bytes \n", TmpInt);
1394 checkResult(cuDeviceGetAttribute(
1395 &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
1396 Device),
1397 "Error returned from cuDeviceGetAttribute\n");
1398 printf(" Max Threads Per SMP: \t\t%d \n", TmpInt);
1399 checkResult(cuDeviceGetAttribute(
1400 &TmpInt, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, Device),
1401 "Error returned from cuDeviceGetAttribute\n");
1402 printf(" Async Engines: \t\t\t%s (%d) \n", BOOL2TEXT(TmpInt), TmpInt);
1403 checkResult(cuDeviceGetAttribute(
1404 &TmpInt, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, Device),
1405 "Error returned from cuDeviceGetAttribute\n");
1406 printf(" Unified Addressing: \t\t%s \n", BOOL2TEXT(TmpInt));
1407 checkResult(cuDeviceGetAttribute(
1408 &TmpInt, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, Device),
1409 "Error returned from cuDeviceGetAttribute\n");
1410 printf(" Managed Memory: \t\t\t%s \n", BOOL2TEXT(TmpInt));
1411 checkResult(
1412 cuDeviceGetAttribute(
1413 &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, Device),
1414 "Error returned from cuDeviceGetAttribute\n");
1415 printf(" Concurrent Managed Memory: \t\t%s \n", BOOL2TEXT(TmpInt));
1416 checkResult(
1417 cuDeviceGetAttribute(
1418 &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, Device),
1419 "Error returned from cuDeviceGetAttribute\n");
1420 printf(" Preemption Supported: \t\t%s \n", BOOL2TEXT(TmpInt));
1421 checkResult(cuDeviceGetAttribute(
1422 &TmpInt, CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, Device),
1423 "Error returned from cuDeviceGetAttribute\n");
1424 printf(" Cooperative Launch: \t\t%s \n", BOOL2TEXT(TmpInt));
1425 checkResult(cuDeviceGetAttribute(
1426 &TmpInt, CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD, Device),
1427 "Error returned from cuDeviceGetAttribute\n");
1428 printf(" Multi-Device Boars: \t\t%s \n", BOOL2TEXT(TmpInt));
1429 checkResult(
1430 cuDeviceGetAttribute(
1431 &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, Device),
1432 "Error returned from cuDeviceGetAttribute\n");
1433 checkResult(
1434 cuDeviceGetAttribute(
1435 &TmpInt2, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, Device),
1436 "Error returned from cuDeviceGetAttribute\n");
1437 printf(" Compute Capabilities: \t\t%d%d \n", TmpInt, TmpInt2);
1438 }
1439
1440 int createEvent(int DeviceId, void **P) {
1441 CUevent Event = nullptr;
1442 if (EventPool[DeviceId]->acquire(Event) != OFFLOAD_SUCCESS)
1443 return OFFLOAD_FAIL;
1444 *P = Event;
1445 return OFFLOAD_SUCCESS;
1446 }
1447
1448 int destroyEvent(int DeviceId, void *EventPtr) {
1449 EventPool[DeviceId]->release(reinterpret_cast<CUevent>(EventPtr));
1450 return OFFLOAD_SUCCESS;
1451 }
1452
1453 int waitEvent(const int DeviceId, __tgt_async_info *AsyncInfo,
1454 void *EventPtr) const {
1455 CUstream Stream = getStream(DeviceId, AsyncInfo);
1456 CUevent Event = reinterpret_cast<CUevent>(EventPtr);
1457
1458 // We don't use CU_EVENT_WAIT_DEFAULT here as it is only available from a
1459 // specific CUDA version onward, where it is defined as 0x0. In earlier
1460 // versions, per the CUDA API documentation, that argument has to be 0x0.
1461 CUresult Err = cuStreamWaitEvent(Stream, Event, 0);
1462 if (Err != CUDA_SUCCESS) {
1463 DP("Error when waiting event. stream = " DPxMOD ", event = " DPxMOD "\n",
1464 DPxPTR(Stream), DPxPTR(Event));
1465 CUDA_ERR_STRING(Err);
1466 return OFFLOAD_FAIL;
1467 }
1468
1469 return OFFLOAD_SUCCESS;
1470 }
1471
1472 int releaseAsyncInfo(int DeviceId, __tgt_async_info *AsyncInfo) const {
1473 if (AsyncInfo->Queue) {
1474 StreamPool[DeviceId]->release(
1475 reinterpret_cast<CUstream>(AsyncInfo->Queue));
1476 AsyncInfo->Queue = nullptr;
1477 }
1478
1479 return OFFLOAD_SUCCESS;
1480 }
1481
1482 int initAsyncInfo(int DeviceId, __tgt_async_info **AsyncInfo) const {
1483 *AsyncInfo = new __tgt_async_info;
1484 getStream(DeviceId, *AsyncInfo);
1485 return OFFLOAD_SUCCESS;
1486 }
1487
1488 int initDeviceInfo(int DeviceId, __tgt_device_info *DeviceInfo,
1489 const char **ErrStr) const {
1490 assert(DeviceInfo && "DeviceInfo is nullptr");
1491
1492 if (!DeviceInfo->Context)
1493 DeviceInfo->Context = DeviceData[DeviceId].Context;
1494 if (!DeviceInfo->Device) {
1495 CUdevice Dev;
1496 CUresult Err = cuDeviceGet(&Dev, DeviceId);
1497 if (Err == CUDA_SUCCESS) {
1498 DeviceInfo->Device = reinterpret_cast<void *>(Dev);
1499 } else {
1500 cuGetErrorString(Err, ErrStr);
1501 return OFFLOAD_FAIL;
1502 }
1503 }
1504 return OFFLOAD_SUCCESS;
1505 }
1506
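/// Make the CUDA context of \p DeviceId current on the calling host thread.
/// Most entry points below call this before touching the CUDA driver API.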
1507 int setContext(int DeviceId) {
1508 assert(InitializedFlags[DeviceId] && "Device is not initialized");
1509
1510 CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
1511 if (!checkResult(Err, "error returned from cuCtxSetCurrent"))
1512 return OFFLOAD_FAIL;
1513
1514 return OFFLOAD_SUCCESS;
1515 }
1516 };
1517
1518 DeviceRTLTy DeviceRTL;
1519 } // namespace
1520
1521 // Exposed library API function
1522 #ifdef __cplusplus
1523 extern "C" {
1524 #endif
1525
1526 int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image) {
1527 return elf_check_machine(Image, /* EM_CUDA */ 190);
1528 }
1529
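// Illustrative example of the check below: an image built for sm_70 carries
// info->Arch == "sm_70", so ImageMajor = 7 and ImageMinor = 0. A device of
// compute capability 7.5 is accepted (same major, higher minor), whereas
// devices of compute capability 6.1 or 8.0 are rejected.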
1530 int32_t __tgt_rtl_is_valid_binary_info(__tgt_device_image *image,
1531 __tgt_image_info *info) {
1532 if (!__tgt_rtl_is_valid_binary(image))
1533 return false;
1534
1535 // A subarchitecture was not specified. Assume it is compatible.
1536 if (!info || !info->Arch)
1537 return true;
1538
1539 int32_t NumberOfDevices = 0;
1540 if (cuDeviceGetCount(&NumberOfDevices) != CUDA_SUCCESS)
1541 return false;
1542
1543 StringRef ArchStr = StringRef(info->Arch).drop_front(sizeof("sm_") - 1);
1544 for (int32_t DeviceId = 0; DeviceId < NumberOfDevices; ++DeviceId) {
1545 CUdevice Device;
1546 if (cuDeviceGet(&Device, DeviceId) != CUDA_SUCCESS)
1547 return false;
1548
1549 int32_t Major, Minor;
1550 if (cuDeviceGetAttribute(&Major,
1551 CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
1552 Device) != CUDA_SUCCESS)
1553 return false;
1554 if (cuDeviceGetAttribute(&Minor,
1555 CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
1556 Device) != CUDA_SUCCESS)
1557 return false;
1558
1559 // A cubin generated for a certain compute capability can run on any GPU
1560 // with the same major revision and the same or a higher minor revision.
1561 int32_t ImageMajor = ArchStr[0] - '0';
1562 int32_t ImageMinor = ArchStr[1] - '0';
1563 if (Major != ImageMajor || Minor < ImageMinor)
1564 return false;
1565 }
1566
1567 DP("Image has compatible compute capability: %s\n", info->Arch);
1568 return true;
1569 }
1570
1571 int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); }
1572
1573 int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
1574 DP("Init requires flags to %" PRId64 "\n", RequiresFlags);
1575 DeviceRTL.setRequiresFlag(RequiresFlags);
1576 return RequiresFlags;
1577 }
1578
1579 int32_t __tgt_rtl_is_data_exchangable(int32_t SrcDevId, int DstDevId) {
1580 if (DeviceRTL.isValidDeviceId(SrcDevId) &&
1581 DeviceRTL.isValidDeviceId(DstDevId))
1582 return 1;
1583
1584 return 0;
1585 }
1586
1587 int32_t __tgt_rtl_init_device(int32_t DeviceId) {
1588 assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1589 // The context is set when the device is initialized.
1590
1591 return DeviceRTL.initDevice(DeviceId);
1592 }
1593
1594 int32_t __tgt_rtl_deinit_device(int32_t DeviceId) {
1595 assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1596 // The context is set when the device is deinitialized.
1597
1598 return DeviceRTL.deinitDevice(DeviceId);
1599 }
1600
1601 __tgt_target_table *__tgt_rtl_load_binary(int32_t DeviceId,
1602 __tgt_device_image *Image) {
1603 assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1604
1605 if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
1606 return nullptr;
1607
1608 return DeviceRTL.loadBinary(DeviceId, Image);
1609 }
1610
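// Note: the unnamed void * parameter is unused by this plugin; Kind is cast
// to TargetAllocTy below to select the kind of memory to allocate.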
1611 void *__tgt_rtl_data_alloc(int32_t DeviceId, int64_t Size, void *,
1612 int32_t Kind) {
1613 assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1614
1615 if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
1616 return nullptr;
1617
1618 return DeviceRTL.dataAlloc(DeviceId, Size, (TargetAllocTy)Kind);
1619 }
1620
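// The synchronous entry points below (data_submit, data_retrieve,
// data_exchange, and the run_target_* variants) share one pattern: build a
// local __tgt_async_info, forward to the *_async counterpart, and block in
// __tgt_rtl_synchronize so the work has completed before returning.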
1621 int32_t __tgt_rtl_data_submit(int32_t DeviceId, void *TgtPtr, void *HstPtr,
1622 int64_t Size) {
1623 assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1624 // Context is set in __tgt_rtl_data_submit_async.
1625
1626 __tgt_async_info AsyncInfo;
1627 const int32_t Rc =
1628 __tgt_rtl_data_submit_async(DeviceId, TgtPtr, HstPtr, Size, &AsyncInfo);
1629 if (Rc != OFFLOAD_SUCCESS)
1630 return OFFLOAD_FAIL;
1631
1632 return __tgt_rtl_synchronize(DeviceId, &AsyncInfo);
1633 }
1634
1635 int32_t __tgt_rtl_data_submit_async(int32_t DeviceId, void *TgtPtr,
1636 void *HstPtr, int64_t Size,
1637 __tgt_async_info *AsyncInfoPtr) {
1638 assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1639 assert(AsyncInfoPtr && "async_info_ptr is nullptr");
1640
1641 if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
1642 return OFFLOAD_FAIL;
1643
1644 return DeviceRTL.dataSubmit(DeviceId, TgtPtr, HstPtr, Size, AsyncInfoPtr);
1645 }
1646
1647 int32_t __tgt_rtl_data_retrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr,
1648 int64_t Size) {
1649 assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1650 // Context is set in __tgt_rtl_data_retrieve_async.
1651
1652 __tgt_async_info AsyncInfo;
1653 const int32_t Rc =
1654 __tgt_rtl_data_retrieve_async(DeviceId, HstPtr, TgtPtr, Size, &AsyncInfo);
1655 if (Rc != OFFLOAD_SUCCESS)
1656 return OFFLOAD_FAIL;
1657
1658 return __tgt_rtl_synchronize(DeviceId, &AsyncInfo);
1659 }
1660
1661 int32_t __tgt_rtl_data_retrieve_async(int32_t DeviceId, void *HstPtr,
1662 void *TgtPtr, int64_t Size,
1663 __tgt_async_info *AsyncInfoPtr) {
1664 assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1665 assert(AsyncInfoPtr && "async_info_ptr is nullptr");
1666
1667 if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
1668 return OFFLOAD_FAIL;
1669
1670 return DeviceRTL.dataRetrieve(DeviceId, HstPtr, TgtPtr, Size, AsyncInfoPtr);
1671 }
1672
1673 int32_t __tgt_rtl_data_exchange_async(int32_t SrcDevId, void *SrcPtr,
1674 int DstDevId, void *DstPtr, int64_t Size,
1675 __tgt_async_info *AsyncInfo) {
1676 assert(DeviceRTL.isValidDeviceId(SrcDevId) && "src_dev_id is invalid");
1677 assert(DeviceRTL.isValidDeviceId(DstDevId) && "dst_dev_id is invalid");
1678 assert(AsyncInfo && "AsyncInfo is nullptr");
1679
1680 if (DeviceRTL.setContext(SrcDevId) != OFFLOAD_SUCCESS)
1681 return OFFLOAD_FAIL;
1682
1683 return DeviceRTL.dataExchange(SrcDevId, SrcPtr, DstDevId, DstPtr, Size,
1684 AsyncInfo);
1685 }
1686
1687 int32_t __tgt_rtl_data_exchange(int32_t SrcDevId, void *SrcPtr,
1688 int32_t DstDevId, void *DstPtr, int64_t Size) {
1689 assert(DeviceRTL.isValidDeviceId(SrcDevId) && "src_dev_id is invalid");
1690 assert(DeviceRTL.isValidDeviceId(DstDevId) && "dst_dev_id is invalid");
1691 // Context is set in __tgt_rtl_data_exchange_async.
1692
1693 __tgt_async_info AsyncInfo;
1694 const int32_t Rc = __tgt_rtl_data_exchange_async(SrcDevId, SrcPtr, DstDevId,
1695 DstPtr, Size, &AsyncInfo);
1696 if (Rc != OFFLOAD_SUCCESS)
1697 return OFFLOAD_FAIL;
1698
1699 return __tgt_rtl_synchronize(SrcDevId, &AsyncInfo);
1700 }
1701
1702 int32_t __tgt_rtl_data_delete(int32_t DeviceId, void *TgtPtr) {
1703 assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1704
1705 if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
1706 return OFFLOAD_FAIL;
1707
1708 return DeviceRTL.dataDelete(DeviceId, TgtPtr);
1709 }
1710
1711 int32_t __tgt_rtl_run_target_team_region(int32_t DeviceId, void *TgtEntryPtr,
1712 void **TgtArgs, ptrdiff_t *TgtOffsets,
1713 int32_t ArgNum, int32_t TeamNum,
1714 int32_t ThreadLimit,
1715 uint64_t LoopTripcount) {
1716 assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1717 // Context is set in __tgt_rtl_run_target_team_region_async.
1718
1719 __tgt_async_info AsyncInfo;
1720 const int32_t Rc = __tgt_rtl_run_target_team_region_async(
1721 DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets, ArgNum, TeamNum, ThreadLimit,
1722 LoopTripcount, &AsyncInfo);
1723 if (Rc != OFFLOAD_SUCCESS)
1724 return OFFLOAD_FAIL;
1725
1726 return __tgt_rtl_synchronize(DeviceId, &AsyncInfo);
1727 }
1728
1729 int32_t __tgt_rtl_run_target_team_region_async(
1730 int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs, ptrdiff_t *TgtOffsets,
1731 int32_t ArgNum, int32_t TeamNum, int32_t ThreadLimit,
1732 uint64_t LoopTripcount, __tgt_async_info *AsyncInfoPtr) {
1733 assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1734
1735 if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
1736 return OFFLOAD_FAIL;
1737
1738 return DeviceRTL.runTargetTeamRegion(DeviceId, TgtEntryPtr, TgtArgs,
1739 TgtOffsets, ArgNum, TeamNum, ThreadLimit,
1740 LoopTripcount, AsyncInfoPtr);
1741 }
1742
1743 int32_t __tgt_rtl_run_target_region(int32_t DeviceId, void *TgtEntryPtr,
1744 void **TgtArgs, ptrdiff_t *TgtOffsets,
1745 int32_t ArgNum) {
1746 assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1747 // Context is set in __tgt_rtl_run_target_region_async.
1748
1749 __tgt_async_info AsyncInfo;
1750 const int32_t Rc = __tgt_rtl_run_target_region_async(
1751 DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets, ArgNum, &AsyncInfo);
1752 if (Rc != OFFLOAD_SUCCESS)
1753 return OFFLOAD_FAIL;
1754
1755 return __tgt_rtl_synchronize(DeviceId, &AsyncInfo);
1756 }
1757
1758 int32_t __tgt_rtl_run_target_region_async(int32_t DeviceId, void *TgtEntryPtr,
1759 void **TgtArgs, ptrdiff_t *TgtOffsets,
1760 int32_t ArgNum,
1761 __tgt_async_info *AsyncInfoPtr) {
1762 assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1763 // Context is set in __tgt_rtl_run_target_team_region_async.
1764 return __tgt_rtl_run_target_team_region_async(
1765 DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets, ArgNum,
1766 /* team_num */ 1, /* thread_limit */ 1, /* loop_tripcount */ 0,
1767 AsyncInfoPtr);
1768 }
1769
1770 int32_t __tgt_rtl_synchronize(int32_t DeviceId,
1771 __tgt_async_info *AsyncInfoPtr) {
1772 assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1773 assert(AsyncInfoPtr && "async_info_ptr is nullptr");
1774 assert(AsyncInfoPtr->Queue && "async_info_ptr->Queue is nullptr");
1775 // NOTE: We don't need to set context for stream sync.
1776 return DeviceRTL.synchronize(DeviceId, AsyncInfoPtr);
1777 }
1778
1779 void __tgt_rtl_set_info_flag(uint32_t NewInfoLevel) {
1780 std::atomic<uint32_t> &InfoLevel = getInfoLevelInternal();
1781 InfoLevel.store(NewInfoLevel);
1782 }
1783
1784 void __tgt_rtl_print_device_info(int32_t DeviceId) {
1785 assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1786 // NOTE: We don't need to set context for print device info.
1787 DeviceRTL.printDeviceInfo(DeviceId);
1788 }
1789
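// Event API: a minimal host-side sketch of how these entry points compose
// (hypothetical caller, error handling omitted):
//
//   void *Ev = nullptr;
//   __tgt_rtl_create_event(DeviceId, &Ev);
//   __tgt_rtl_record_event(DeviceId, Ev, AsyncInfo);    // mark a point in the stream
//   __tgt_rtl_wait_event(DeviceId, Ev, OtherAsyncInfo); // order another stream after it
//   __tgt_rtl_sync_event(DeviceId, Ev);                 // or block the host on it
//   __tgt_rtl_destroy_event(DeviceId, Ev);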
1790 int32_t __tgt_rtl_create_event(int32_t DeviceId, void **Event) {
1791 assert(Event && "event is nullptr");
1792
1793 if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
1794 return OFFLOAD_FAIL;
1795
1796 return DeviceRTL.createEvent(DeviceId, Event);
1797 }
1798
1799 int32_t __tgt_rtl_record_event(int32_t DeviceId, void *EventPtr,
1800 __tgt_async_info *AsyncInfoPtr) {
1801 assert(AsyncInfoPtr && "async_info_ptr is nullptr");
1802 assert(AsyncInfoPtr->Queue && "async_info_ptr->Queue is nullptr");
1803 assert(EventPtr && "event_ptr is nullptr");
1804 // NOTE: We might not need to set context for event record.
1805 return recordEvent(EventPtr, AsyncInfoPtr);
1806 }
1807
1808 int32_t __tgt_rtl_wait_event(int32_t DeviceId, void *EventPtr,
1809 __tgt_async_info *AsyncInfoPtr) {
1810 assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1811 assert(AsyncInfoPtr && "async_info_ptr is nullptr");
1812 assert(EventPtr && "event is nullptr");
1813 // If we don't have a queue we need to set the context.
1814 if (!AsyncInfoPtr->Queue && DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
1815 return OFFLOAD_FAIL;
1816 return DeviceRTL.waitEvent(DeviceId, AsyncInfoPtr, EventPtr);
1817 }
1818
1819 int32_t __tgt_rtl_sync_event(int32_t DeviceId, void *EventPtr) {
1820 assert(EventPtr && "event is nullptr");
1821 // NOTE: We might not need to set context for event sync.
1822 return syncEvent(EventPtr);
1823 }
1824
1825 int32_t __tgt_rtl_destroy_event(int32_t DeviceId, void *EventPtr) {
1826 assert(EventPtr && "event is nullptr");
1827
1828 if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
1829 return OFFLOAD_FAIL;
1830
1831 return DeviceRTL.destroyEvent(DeviceId, EventPtr);
1832 }
1833
1834 int32_t __tgt_rtl_release_async_info(int32_t DeviceId,
1835 __tgt_async_info *AsyncInfo) {
1836 assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1837 assert(AsyncInfo && "async_info is nullptr");
1838
1839 if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
1840 return OFFLOAD_FAIL;
1841
1842 return DeviceRTL.releaseAsyncInfo(DeviceId, AsyncInfo);
1843 }
1844
1845 int32_t __tgt_rtl_init_async_info(int32_t DeviceId,
1846 __tgt_async_info **AsyncInfo) {
1847 assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1848 assert(AsyncInfo && "async_info is nullptr");
1849
1850 if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
1851 return OFFLOAD_FAIL;
1852
1853 return DeviceRTL.initAsyncInfo(DeviceId, AsyncInfo);
1854 }
1855
1856 int32_t __tgt_rtl_init_device_info(int32_t DeviceId,
1857 __tgt_device_info *DeviceInfoPtr,
1858 const char **ErrStr) {
1859 assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
1860 assert(DeviceInfoPtr && "device_info_ptr is nullptr");
1861
1862 if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
1863 return OFFLOAD_FAIL;
1864
1865 return DeviceRTL.initDeviceInfo(DeviceId, DeviceInfoPtr, ErrStr);
1866 }
1867
1868 #ifdef __cplusplus
1869 }
1870 #endif
1871