//===- CudaRuntimeWrappers.cpp - MLIR CUDA API wrapper library ------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Implements C wrappers around the CUDA library for easy linking in ORC jit.
// Also adds some debugging helpers that are helpful when writing MLIR code to
// run on GPUs.
//
//===----------------------------------------------------------------------===//
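
// For orientation, a sketch of the call sequence that lowered MLIR code
// typically makes against these wrappers (identifiers below are illustrative,
// not symbols defined in this file):
//
//   CUmodule module = mgpuModuleLoad(cubinData);
//   CUfunction func = mgpuModuleGetFunction(module, "kernel");
//   CUstream stream = mgpuStreamCreate();
//   mgpuLaunchKernel(func, gridX, 1, 1, blockX, 1, 1, /*smem=*/0, stream,
//                    params, /*extra=*/nullptr);
//   mgpuStreamSynchronize(stream);
//   mgpuStreamDestroy(stream);
//   mgpuModuleUnload(module);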

#include "mlir/ExecutionEngine/CRunnerUtils.h"

#include <cassert>
#include <stdio.h>

#ifdef _WIN32
#include <malloc.h> // provides alloca
#else
#include <alloca.h>
#endif

#include "cuda.h"

#ifdef _WIN32
#define MLIR_CUDA_WRAPPERS_EXPORT __declspec(dllexport)
#else
#define MLIR_CUDA_WRAPPERS_EXPORT
#endif // _WIN32

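// Reports the error name to stderr if `expr` does not return CUDA_SUCCESS.
// The immediately-invoked lambda ensures `expr` is evaluated exactly once;
// errors are only reported, never fatal.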
#define CUDA_REPORT_IF_ERROR(expr)                                            \
  [](CUresult result) {                                                       \
    if (!result)                                                              \
      return;                                                                 \
    const char *name = nullptr;                                               \
    cuGetErrorName(result, &name);                                            \
    if (!name)                                                                \
      name = "<unknown>";                                                     \
    fprintf(stderr, "'%s' failed with '%s'\n", #expr, name);                  \
  }(expr)

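// Device ordinal whose primary context the wrappers run in. It is only
// consulted when ScopedContext first initializes its static context, so
// mgpuSetDefaultDevice() only takes effect before the first CUDA call.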
thread_local static int32_t defaultDevice = 0;

// Make the primary context of the current default device current for the
// duration of the instance and restore the previous context on destruction.
class ScopedContext {
public:
  ScopedContext() {
    // Static reference to CUDA primary context for device ordinal
    // defaultDevice.
    static CUcontext context = [] {
      CUDA_REPORT_IF_ERROR(cuInit(/*flags=*/0));
      CUdevice device;
      CUDA_REPORT_IF_ERROR(cuDeviceGet(&device, /*ordinal=*/defaultDevice));
      CUcontext ctx;
      // Note: this does not affect the current context.
      CUDA_REPORT_IF_ERROR(cuDevicePrimaryCtxRetain(&ctx, device));
      return ctx;
    }();

    CUDA_REPORT_IF_ERROR(cuCtxPushCurrent(context));
  }

  ~ScopedContext() { CUDA_REPORT_IF_ERROR(cuCtxPopCurrent(nullptr)); }
};

extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUmodule mgpuModuleLoad(void *data) {
  ScopedContext scopedContext;
  CUmodule module = nullptr;
  CUDA_REPORT_IF_ERROR(cuModuleLoadData(&module, data));
  return module;
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuModuleUnload(CUmodule module) {
  CUDA_REPORT_IF_ERROR(cuModuleUnload(module));
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUfunction
mgpuModuleGetFunction(CUmodule module, const char *name) {
  CUfunction function = nullptr;
  CUDA_REPORT_IF_ERROR(cuModuleGetFunction(&function, module, name));
  return function;
}

// The wrapper uses intptr_t instead of CUDA's unsigned int to match
// the type of MLIR's index type. This avoids the need for casts in the
// generated MLIR code.
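// For illustration only (hypothetical kernel and variable names): to launch a
// kernel `kernel(float *buf, int n)`, the caller packs a pointer to each
// kernel argument into `params`, e.g.
//   void *params[] = {&devBuf, &n};
//   mgpuLaunchKernel(fn, gx, 1, 1, bx, 1, 1, /*smem=*/0, stream, params,
//                    /*extra=*/nullptr);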
extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuLaunchKernel(CUfunction function, intptr_t gridX, intptr_t gridY,
                 intptr_t gridZ, intptr_t blockX, intptr_t blockY,
                 intptr_t blockZ, int32_t smem, CUstream stream, void **params,
                 void **extra) {
  ScopedContext scopedContext;
  CUDA_REPORT_IF_ERROR(cuLaunchKernel(function, gridX, gridY, gridZ, blockX,
                                      blockY, blockZ, smem, stream, params,
                                      extra));
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUstream mgpuStreamCreate() {
  ScopedContext scopedContext;
  CUstream stream = nullptr;
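  // A non-blocking stream does not synchronize implicitly with the legacy
  // default (NULL) stream.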
  CUDA_REPORT_IF_ERROR(cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING));
  return stream;
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuStreamDestroy(CUstream stream) {
  CUDA_REPORT_IF_ERROR(cuStreamDestroy(stream));
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuStreamSynchronize(CUstream stream) {
  CUDA_REPORT_IF_ERROR(cuStreamSynchronize(stream));
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuStreamWaitEvent(CUstream stream,
                                                              CUevent event) {
  CUDA_REPORT_IF_ERROR(cuStreamWaitEvent(stream, event, /*flags=*/0));
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUevent mgpuEventCreate() {
  ScopedContext scopedContext;
  CUevent event = nullptr;
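  // Timing is disabled: synchronization-only events are cheaper to record.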
  CUDA_REPORT_IF_ERROR(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING));
  return event;
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuEventDestroy(CUevent event) {
  CUDA_REPORT_IF_ERROR(cuEventDestroy(event));
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuEventSynchronize(CUevent event) {
  CUDA_REPORT_IF_ERROR(cuEventSynchronize(event));
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuEventRecord(CUevent event,
                                                          CUstream stream) {
  CUDA_REPORT_IF_ERROR(cuEventRecord(event, stream));
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *
mgpuMemAlloc(uint64_t sizeBytes, CUstream /*stream*/) {
  ScopedContext scopedContext;
  CUdeviceptr ptr;
  CUDA_REPORT_IF_ERROR(cuMemAlloc(&ptr, sizeBytes));
  return reinterpret_cast<void *>(ptr);
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuMemFree(void *ptr,
                                                      CUstream /*stream*/) {
  CUDA_REPORT_IF_ERROR(cuMemFree(reinterpret_cast<CUdeviceptr>(ptr)));
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuMemcpy(void *dst, void *src, size_t sizeBytes, CUstream stream) {
  CUDA_REPORT_IF_ERROR(cuMemcpyAsync(reinterpret_cast<CUdeviceptr>(dst),
                                     reinterpret_cast<CUdeviceptr>(src),
                                     sizeBytes, stream));
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuMemset32(void *dst, unsigned int value, size_t count, CUstream stream) {
  CUDA_REPORT_IF_ERROR(cuMemsetD32Async(reinterpret_cast<CUdeviceptr>(dst),
                                        value, count, stream));
}

/// Helper functions for writing MLIR example code

// Registers a byte array with the CUDA runtime. Helpful until we have
// transfer functions implemented.
extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuMemHostRegister(void *ptr, uint64_t sizeBytes) {
  ScopedContext scopedContext;
  CUDA_REPORT_IF_ERROR(cuMemHostRegister(ptr, sizeBytes, /*flags=*/0));
}

/// Registers a memref with the CUDA runtime. `descriptor` is a pointer to a
/// ranked memref descriptor struct of rank `rank`. Helpful until we have
/// transfer functions implemented.
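/// For example, a densely packed rank-3 memref with sizes [2, 3, 4] has dense
/// strides [12, 4, 1], so sizeBytes below is 2 * 12 * elementSizeBytes.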
extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuMemHostRegisterMemRef(int64_t rank, StridedMemRefType<char, 1> *descriptor,
                          int64_t elementSizeBytes) {
  // Only densely packed tensors are currently supported.
  int64_t *denseStrides = (int64_t *)alloca(rank * sizeof(int64_t));
  int64_t *sizes = descriptor->sizes;
  for (int64_t i = rank - 1, runningStride = 1; i >= 0; i--) {
    denseStrides[i] = runningStride;
    runningStride *= sizes[i];
  }
  uint64_t sizeBytes = sizes[0] * denseStrides[0] * elementSizeBytes;
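  // The descriptor is templated on rank 1 but describes a rank-`rank` memref;
  // its strides array starts right after the `rank` sizes in memory.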
  int64_t *strides = &sizes[rank];
  (void)strides;
  for (unsigned i = 0; i < rank; ++i)
    assert(strides[i] == denseStrides[i] &&
           "Mismatch in computed dense strides");

  auto *ptr = descriptor->data + descriptor->offset * elementSizeBytes;
  mgpuMemHostRegister(ptr, sizeBytes);
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSetDefaultDevice(int32_t device) {
  defaultDevice = device;
}