1 //===- CudaRuntimeWrappers.cpp - MLIR CUDA API wrapper library ------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // Implements C wrappers around the CUDA library for easy linking in ORC jit. 10 // Also adds some debugging helpers that are helpful when writing MLIR code to 11 // run on GPUs. 12 // 13 //===----------------------------------------------------------------------===// 14 15 #include "mlir/ExecutionEngine/CRunnerUtils.h" 16 17 #include <stdio.h> 18 19 #include "cuda.h" 20 21 #ifdef _WIN32 22 #define MLIR_CUDA_WRAPPERS_EXPORT __declspec(dllexport) 23 #else 24 #define MLIR_CUDA_WRAPPERS_EXPORT 25 #endif // _WIN32 26 27 #define CUDA_REPORT_IF_ERROR(expr) \ 28 [](CUresult result) { \ 29 if (!result) \ 30 return; \ 31 const char *name = nullptr; \ 32 cuGetErrorName(result, &name); \ 33 if (!name) \ 34 name = "<unknown>"; \ 35 fprintf(stderr, "'%s' failed with '%s'\n", #expr, name); \ 36 }(expr) 37 38 thread_local static int32_t defaultDevice = 0; 39 40 // Make the primary context of the current default device current for the 41 // duration 42 // of the instance and restore the previous context on destruction. 43 class ScopedContext { 44 public: 45 ScopedContext() { 46 // Static reference to CUDA primary context for device ordinal 47 // defaultDevice. 48 static CUcontext context = [] { 49 CUDA_REPORT_IF_ERROR(cuInit(/*flags=*/0)); 50 CUdevice device; 51 CUDA_REPORT_IF_ERROR(cuDeviceGet(&device, /*ordinal=*/defaultDevice)); 52 CUcontext ctx; 53 // Note: this does not affect the current context. 54 CUDA_REPORT_IF_ERROR(cuDevicePrimaryCtxRetain(&ctx, device)); 55 return ctx; 56 }(); 57 58 CUDA_REPORT_IF_ERROR(cuCtxPushCurrent(context)); 59 } 60 61 ~ScopedContext() { CUDA_REPORT_IF_ERROR(cuCtxPopCurrent(nullptr)); } 62 }; 63 64 extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUmodule mgpuModuleLoad(void *data) { 65 ScopedContext scopedContext; 66 CUmodule module = nullptr; 67 CUDA_REPORT_IF_ERROR(cuModuleLoadData(&module, data)); 68 return module; 69 } 70 71 extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuModuleUnload(CUmodule module) { 72 CUDA_REPORT_IF_ERROR(cuModuleUnload(module)); 73 } 74 75 extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUfunction 76 mgpuModuleGetFunction(CUmodule module, const char *name) { 77 CUfunction function = nullptr; 78 CUDA_REPORT_IF_ERROR(cuModuleGetFunction(&function, module, name)); 79 return function; 80 } 81 82 // The wrapper uses intptr_t instead of CUDA's unsigned int to match 83 // the type of MLIR's index type. This avoids the need for casts in the 84 // generated MLIR code. 85 extern "C" MLIR_CUDA_WRAPPERS_EXPORT void 86 mgpuLaunchKernel(CUfunction function, intptr_t gridX, intptr_t gridY, 87 intptr_t gridZ, intptr_t blockX, intptr_t blockY, 88 intptr_t blockZ, int32_t smem, CUstream stream, void **params, 89 void **extra) { 90 ScopedContext scopedContext; 91 CUDA_REPORT_IF_ERROR(cuLaunchKernel(function, gridX, gridY, gridZ, blockX, 92 blockY, blockZ, smem, stream, params, 93 extra)); 94 } 95 96 extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUstream mgpuStreamCreate() { 97 ScopedContext scopedContext; 98 CUstream stream = nullptr; 99 CUDA_REPORT_IF_ERROR(cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING)); 100 return stream; 101 } 102 103 extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuStreamDestroy(CUstream stream) { 104 CUDA_REPORT_IF_ERROR(cuStreamDestroy(stream)); 105 } 106 107 extern "C" MLIR_CUDA_WRAPPERS_EXPORT void 108 mgpuStreamSynchronize(CUstream stream) { 109 CUDA_REPORT_IF_ERROR(cuStreamSynchronize(stream)); 110 } 111 112 extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuStreamWaitEvent(CUstream stream, 113 CUevent event) { 114 CUDA_REPORT_IF_ERROR(cuStreamWaitEvent(stream, event, /*flags=*/0)); 115 } 116 117 extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUevent mgpuEventCreate() { 118 ScopedContext scopedContext; 119 CUevent event = nullptr; 120 CUDA_REPORT_IF_ERROR(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING)); 121 return event; 122 } 123 124 extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuEventDestroy(CUevent event) { 125 CUDA_REPORT_IF_ERROR(cuEventDestroy(event)); 126 } 127 128 extern MLIR_CUDA_WRAPPERS_EXPORT "C" void mgpuEventSynchronize(CUevent event) { 129 CUDA_REPORT_IF_ERROR(cuEventSynchronize(event)); 130 } 131 132 extern MLIR_CUDA_WRAPPERS_EXPORT "C" void mgpuEventRecord(CUevent event, 133 CUstream stream) { 134 CUDA_REPORT_IF_ERROR(cuEventRecord(event, stream)); 135 } 136 137 extern "C" void *mgpuMemAlloc(uint64_t sizeBytes, CUstream /*stream*/) { 138 ScopedContext scopedContext; 139 CUdeviceptr ptr; 140 CUDA_REPORT_IF_ERROR(cuMemAlloc(&ptr, sizeBytes)); 141 return reinterpret_cast<void *>(ptr); 142 } 143 144 extern "C" void mgpuMemFree(void *ptr, CUstream /*stream*/) { 145 CUDA_REPORT_IF_ERROR(cuMemFree(reinterpret_cast<CUdeviceptr>(ptr))); 146 } 147 148 extern "C" void mgpuMemcpy(void *dst, void *src, size_t sizeBytes, 149 CUstream stream) { 150 CUDA_REPORT_IF_ERROR(cuMemcpyAsync(reinterpret_cast<CUdeviceptr>(dst), 151 reinterpret_cast<CUdeviceptr>(src), 152 sizeBytes, stream)); 153 } 154 155 extern "C" void mgpuMemset32(void *dst, unsigned int value, size_t count, 156 CUstream stream) { 157 CUDA_REPORT_IF_ERROR(cuMemsetD32Async(reinterpret_cast<CUdeviceptr>(dst), 158 value, count, stream)); 159 } 160 161 /// Helper functions for writing mlir example code 162 163 // Allows to register byte array with the CUDA runtime. Helpful until we have 164 // transfer functions implemented. 165 extern "C" MLIR_CUDA_WRAPPERS_EXPORT void 166 mgpuMemHostRegister(void *ptr, uint64_t sizeBytes) { 167 ScopedContext scopedContext; 168 CUDA_REPORT_IF_ERROR(cuMemHostRegister(ptr, sizeBytes, /*flags=*/0)); 169 } 170 171 /// Registers a memref with the CUDA runtime. `descriptor` is a pointer to a 172 /// ranked memref descriptor struct of rank `rank`. Helpful until we have 173 /// transfer functions implemented. 174 extern "C" MLIR_CUDA_WRAPPERS_EXPORT void 175 mgpuMemHostRegisterMemRef(int64_t rank, StridedMemRefType<char, 1> *descriptor, 176 int64_t elementSizeBytes) { 177 // Only densely packed tensors are currently supported. 178 int64_t *denseStrides = (int64_t *)alloca(rank * sizeof(int64_t)); 179 int64_t *sizes = descriptor->sizes; 180 for (int64_t i = rank - 1, runningStride = 1; i >= 0; i--) { 181 denseStrides[i] = runningStride; 182 runningStride *= sizes[i]; 183 } 184 uint64_t sizeBytes = sizes[0] * denseStrides[0] * elementSizeBytes; 185 int64_t *strides = &sizes[rank]; 186 (void)strides; 187 for (unsigned i = 0; i < rank; ++i) 188 assert(strides[i] == denseStrides[i] && 189 "Mismatch in computed dense strides"); 190 191 auto *ptr = descriptor->data + descriptor->offset * elementSizeBytes; 192 mgpuMemHostRegister(ptr, sizeBytes); 193 } 194 195 extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSetDefaultDevice(int32_t device) { 196 defaultDevice = device; 197 } 198