//===- CudaRuntimeWrappers.cpp - MLIR CUDA API wrapper library ------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Implements C wrappers around the CUDA library for easy linking in ORC jit.
// Also adds some debugging helpers that are helpful when writing MLIR code to
// run on GPUs.
//
//===----------------------------------------------------------------------===//

#include "mlir/ExecutionEngine/CRunnerUtils.h"

#include <assert.h>
#include <stdio.h>

#ifdef _WIN32
#include <malloc.h> // Provides alloca.
#else
#include <alloca.h>
#endif // _WIN32

#include "cuda.h"

// We need to know the CUDA version to determine how to map some of the runtime
// calls below.
#if !defined(CUDA_VERSION)
#error "cuda.h did not define CUDA_VERSION"
#endif

#ifdef _WIN32
#define MLIR_CUDA_WRAPPERS_EXPORT __declspec(dllexport)
#else
#define MLIR_CUDA_WRAPPERS_EXPORT
#endif // _WIN32

#define CUDA_REPORT_IF_ERROR(expr)                                            \
  [](CUresult result) {                                                       \
    if (!result)                                                              \
      return;                                                                 \
    const char *name = nullptr;                                               \
    cuGetErrorName(result, &name);                                            \
    if (!name)                                                                \
      name = "<unknown>";                                                     \
    fprintf(stderr, "'%s' failed with '%s'\n", #expr, name);                  \
  }(expr)

thread_local static int32_t defaultDevice = 0;

// Make the primary context of the current default device current for the
// duration of the instance and restore the previous context on destruction.
class ScopedContext {
public:
  ScopedContext() {
    // Static reference to CUDA primary context for device ordinal
    // defaultDevice.
    static CUcontext context = [] {
      CUDA_REPORT_IF_ERROR(cuInit(/*flags=*/0));
      CUdevice device;
      CUDA_REPORT_IF_ERROR(cuDeviceGet(&device, /*ordinal=*/defaultDevice));
      CUcontext ctx;
      // Note: this does not affect the current context.
      CUDA_REPORT_IF_ERROR(cuDevicePrimaryCtxRetain(&ctx, device));
      return ctx;
    }();

    CUDA_REPORT_IF_ERROR(cuCtxPushCurrent(context));
  }

  ~ScopedContext() { CUDA_REPORT_IF_ERROR(cuCtxPopCurrent(nullptr)); }
};

extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUmodule mgpuModuleLoad(void *data) {
  ScopedContext scopedContext;
  CUmodule module = nullptr;
  CUDA_REPORT_IF_ERROR(cuModuleLoadData(&module, data));
  return module;
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuModuleUnload(CUmodule module) {
  CUDA_REPORT_IF_ERROR(cuModuleUnload(module));
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUfunction
mgpuModuleGetFunction(CUmodule module, const char *name) {
  CUfunction function = nullptr;
  CUDA_REPORT_IF_ERROR(cuModuleGetFunction(&function, module, name));
  return function;
}
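
// Illustrative sketch (not part of the library): how a host program or JITed
// code might use the module wrappers above. `cubinData` and the kernel name
// "kernel" are placeholders for a caller-provided CUBIN image and entry point.
//
//   CUmodule module = mgpuModuleLoad(cubinData);
//   CUfunction function = mgpuModuleGetFunction(module, "kernel");
//   // ... launch `function`, synchronize ...
//   mgpuModuleUnload(module);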

// The wrapper uses intptr_t instead of CUDA's unsigned int to match
// the type of MLIR's index type. This avoids the need for casts in the
// generated MLIR code.
extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuLaunchKernel(CUfunction function, intptr_t gridX, intptr_t gridY,
                 intptr_t gridZ, intptr_t blockX, intptr_t blockY,
                 intptr_t blockZ, int32_t smem, CUstream stream, void **params,
                 void **extra) {
  ScopedContext scopedContext;
  CUDA_REPORT_IF_ERROR(cuLaunchKernel(function, gridX, gridY, gridZ, blockX,
                                      blockY, blockZ, smem, stream, params,
                                      extra));
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUstream mgpuStreamCreate() {
  ScopedContext scopedContext;
  CUstream stream = nullptr;
  CUDA_REPORT_IF_ERROR(cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING));
  return stream;
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuStreamDestroy(CUstream stream) {
  CUDA_REPORT_IF_ERROR(cuStreamDestroy(stream));
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuStreamSynchronize(CUstream stream) {
  CUDA_REPORT_IF_ERROR(cuStreamSynchronize(stream));
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuStreamWaitEvent(CUstream stream,
                                                              CUevent event) {
  CUDA_REPORT_IF_ERROR(cuStreamWaitEvent(stream, event, /*flags=*/0));
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUevent mgpuEventCreate() {
  ScopedContext scopedContext;
  CUevent event = nullptr;
  CUDA_REPORT_IF_ERROR(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING));
  return event;
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuEventDestroy(CUevent event) {
  CUDA_REPORT_IF_ERROR(cuEventDestroy(event));
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuEventSynchronize(CUevent event) {
  CUDA_REPORT_IF_ERROR(cuEventSynchronize(event));
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuEventRecord(CUevent event,
                                                          CUstream stream) {
  CUDA_REPORT_IF_ERROR(cuEventRecord(event, stream));
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *mgpuMemAlloc(uint64_t sizeBytes,
                                                        CUstream stream) {
  ScopedContext scopedContext;
  CUdeviceptr ptr;
#if CUDA_VERSION >= 11020
  // Use the stream-ordered version available since CUDA 11.2.
  CUDA_REPORT_IF_ERROR(cuMemAllocAsync(&ptr, sizeBytes, stream));
#else
  CUDA_REPORT_IF_ERROR(cuMemAlloc(&ptr, sizeBytes));
  (void)stream;
#endif
  return reinterpret_cast<void *>(ptr);
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuMemFree(void *ptr,
                                                      CUstream stream) {
#if CUDA_VERSION >= 11020
  // Use the stream-ordered version available since CUDA 11.2.
  CUDA_REPORT_IF_ERROR(
      cuMemFreeAsync(reinterpret_cast<CUdeviceptr>(ptr), stream));
#else
  CUDA_REPORT_IF_ERROR(cuMemFree(reinterpret_cast<CUdeviceptr>(ptr)));
  (void)stream;
#endif
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuMemcpy(void *dst, void *src, size_t sizeBytes, CUstream stream) {
  CUDA_REPORT_IF_ERROR(cuMemcpyAsync(reinterpret_cast<CUdeviceptr>(dst),
                                     reinterpret_cast<CUdeviceptr>(src),
                                     sizeBytes, stream));
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuMemset32(void *dst, unsigned int value, size_t count, CUstream stream) {
  CUDA_REPORT_IF_ERROR(cuMemsetD32Async(reinterpret_cast<CUdeviceptr>(dst),
                                        value, count, stream));
}
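
// Illustrative sketch (not part of the library): a stream-ordered
// alloc/copy/launch/free sequence built from the wrappers above. `function`,
// `hostBuffer`, `sizeBytes`, and `kernelParams` (an array of pointers to the
// kernel arguments, as cuLaunchKernel expects) are caller-provided
// placeholders.
//
//   CUstream stream = mgpuStreamCreate();
//   void *devPtr = mgpuMemAlloc(sizeBytes, stream);
//   mgpuMemcpy(devPtr, hostBuffer, sizeBytes, stream);
//   mgpuLaunchKernel(function, /*gridX=*/1, /*gridY=*/1, /*gridZ=*/1,
//                    /*blockX=*/256, /*blockY=*/1, /*blockZ=*/1, /*smem=*/0,
//                    stream, kernelParams, /*extra=*/nullptr);
//   mgpuMemcpy(hostBuffer, devPtr, sizeBytes, stream);
//   mgpuMemFree(devPtr, stream);
//   mgpuStreamSynchronize(stream);
//   mgpuStreamDestroy(stream);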

/// Helper functions for writing MLIR example code.

// Registers a byte array with the CUDA runtime. Helpful until we have
// transfer functions implemented.
extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuMemHostRegister(void *ptr, uint64_t sizeBytes) {
  ScopedContext scopedContext;
  CUDA_REPORT_IF_ERROR(cuMemHostRegister(ptr, sizeBytes, /*flags=*/0));
}

/// Registers a memref with the CUDA runtime. `descriptor` is a pointer to a
/// ranked memref descriptor struct of rank `rank`. Helpful until we have
/// transfer functions implemented.
extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuMemHostRegisterMemRef(int64_t rank, StridedMemRefType<char, 1> *descriptor,
                          int64_t elementSizeBytes) {
  // Only densely packed tensors are currently supported.
  int64_t *denseStrides =
      static_cast<int64_t *>(alloca(rank * sizeof(int64_t)));
  int64_t *sizes = descriptor->sizes;
  for (int64_t i = rank - 1, runningStride = 1; i >= 0; i--) {
    denseStrides[i] = runningStride;
    runningStride *= sizes[i];
  }
  uint64_t sizeBytes = sizes[0] * denseStrides[0] * elementSizeBytes;
  // The strides array immediately follows the sizes array in the descriptor.
  int64_t *strides = &sizes[rank];
  // Silence the unused-variable warning when asserts are compiled away.
  (void)strides;
  for (int64_t i = 0; i < rank; ++i)
    assert(strides[i] == denseStrides[i] &&
           "Mismatch in computed dense strides");

  auto *ptr = descriptor->data + descriptor->offset * elementSizeBytes;
  mgpuMemHostRegister(ptr, sizeBytes);
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSetDefaultDevice(int32_t device) {
  defaultDevice = device;
}
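
// Illustrative sketch (not part of the library): registering a densely packed
// 2x3 float memref via mgpuMemHostRegisterMemRef above. The descriptor layout
// follows StridedMemRefType from CRunnerUtils.h: {basePtr, data, offset,
// sizes[rank], strides[rank]}; the values below are made up for illustration.
//
//   float data[6] = {0, 1, 2, 3, 4, 5};
//   StridedMemRefType<float, 2> memref;
//   memref.basePtr = data;
//   memref.data = data;
//   memref.offset = 0;
//   memref.sizes[0] = 2;   // Dense row-major layout, so the stride of a
//   memref.sizes[1] = 3;   // dimension is the product of the sizes of all
//   memref.strides[0] = 3; // inner dimensions.
//   memref.strides[1] = 1;
//   mgpuMemHostRegisterMemRef(
//       /*rank=*/2, reinterpret_cast<StridedMemRefType<char, 1> *>(&memref),
//       /*elementSizeBytes=*/sizeof(float));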