1 //===- LowerGPUToCUBIN.cpp - Convert GPU kernel to CUBIN blob -------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements a pass that serializes a gpu module into CUBIN blob and
10 // adds that blob as a string attribute of the module.
11 //
12 //===----------------------------------------------------------------------===//
13 #include "mlir/Dialect/GPU/Passes.h"
14 
15 #if MLIR_GPU_TO_CUBIN_PASS_ENABLE
#include "mlir/Pass/Pass.h"
#include "mlir/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.h"
#include "mlir/Target/LLVMIR/Export.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/Support/TargetSelect.h"

#include <cuda.h>
22 
23 using namespace mlir;
24 
25 static void emitCudaError(const llvm::Twine &expr, const char *buffer,
26                           CUresult result, Location loc) {
27   const char *error;
28   cuGetErrorString(result, &error);
29   emitError(loc, expr.concat(" failed with error code ")
30                      .concat(llvm::Twine{error})
31                      .concat("[")
32                      .concat(buffer)
33                      .concat("]"));
34 }
35 
// Evaluates `expr` (a CUDA driver call); on a non-zero CUresult, reports it
// via emitCudaError and returns a default-constructed value from the
// enclosing function. Expansion sites must have `jitErrorBuffer` (char array)
// and `loc` (Location) in scope. Note: early returns through this macro do
// not release CUDA resources acquired earlier in the function.
#define RETURN_ON_CUDA_ERROR(expr)                                             \
  do {                                                                         \
    if (auto status = (expr)) {                                                \
      emitCudaError(#expr, jitErrorBuffer, status, loc);                       \
      return {};                                                               \
    }                                                                          \
  } while (false)
43 
namespace {
/// Pass that serializes a GPU module to a CUBIN blob: the SerializeToBlobPass
/// base produces ISA (PTX) for the module, and this pass JIT-links it into a
/// CUBIN with the CUDA driver, which the base attaches to the module as a
/// string attribute.
class SerializeToCubinPass
    : public PassWrapper<SerializeToCubinPass, gpu::SerializeToBlobPass> {
public:
  // Fills in default triple/chip/features options when the user set none.
  SerializeToCubinPass();

private:
  // Registers the NVVM translation needed to lower the module to LLVM IR.
  void getDependentDialects(DialectRegistry &registry) const override;

  // Serializes PTX to CUBIN.
  std::unique_ptr<std::vector<char>>
  serializeISA(const std::string &isa) override;
};
} // namespace
58 
59 // Sets the 'option' to 'value' unless it already has a value.
60 static void maybeSetOption(Pass::Option<std::string> &option,
61                            const char *value) {
62   if (!option.hasValue())
63     option = value;
64 }
65 
66 SerializeToCubinPass::SerializeToCubinPass() {
67   maybeSetOption(this->triple, "nvptx64-nvidia-cuda");
68   maybeSetOption(this->chip, "sm_35");
69   maybeSetOption(this->features, "+ptx60");
70 }
71 
void SerializeToCubinPass::getDependentDialects(
    DialectRegistry &registry) const {
  // NVVM ops must be translatable to LLVM IR so the base class can produce
  // PTX for the module before serializeISA runs.
  registerNVVMDialectTranslation(registry);
  gpu::SerializeToBlobPass::getDependentDialects(registry);
}
77 
78 std::unique_ptr<std::vector<char>>
79 SerializeToCubinPass::serializeISA(const std::string &isa) {
80   Location loc = getOperation().getLoc();
81   char jitErrorBuffer[4096] = {0};
82 
83   RETURN_ON_CUDA_ERROR(cuInit(0));
84 
85   // Linking requires a device context.
86   CUdevice device;
87   RETURN_ON_CUDA_ERROR(cuDeviceGet(&device, 0));
88   CUcontext context;
89   RETURN_ON_CUDA_ERROR(cuCtxCreate(&context, 0, device));
90   CUlinkState linkState;
91 
92   CUjit_option jitOptions[] = {CU_JIT_ERROR_LOG_BUFFER,
93                                CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES};
94   void *jitOptionsVals[] = {jitErrorBuffer,
95                             reinterpret_cast<void *>(sizeof(jitErrorBuffer))};
96 
97   RETURN_ON_CUDA_ERROR(cuLinkCreate(2,              /* number of jit options */
98                                     jitOptions,     /* jit options */
99                                     jitOptionsVals, /* jit option values */
100                                     &linkState));
101 
102   auto kernelName = getOperation().getName().str();
103   RETURN_ON_CUDA_ERROR(cuLinkAddData(
104       linkState, CUjitInputType::CU_JIT_INPUT_PTX,
105       const_cast<void *>(static_cast<const void *>(isa.c_str())), isa.length(),
106       kernelName.c_str(), 0, /* number of jit options */
107       nullptr,               /* jit options */
108       nullptr                /* jit option values */
109       ));
110 
111   void *cubinData;
112   size_t cubinSize;
113   RETURN_ON_CUDA_ERROR(cuLinkComplete(linkState, &cubinData, &cubinSize));
114 
115   char *cubinAsChar = static_cast<char *>(cubinData);
116   auto result =
117       std::make_unique<std::vector<char>>(cubinAsChar, cubinAsChar + cubinSize);
118 
119   // This will also destroy the cubin data.
120   RETURN_ON_CUDA_ERROR(cuLinkDestroy(linkState));
121   RETURN_ON_CUDA_ERROR(cuCtxDestroy(context));
122 
123   return result;
124 }
125 
126 // Register pass to serialize GPU kernel functions to a CUBIN binary annotation.
127 void mlir::registerGpuSerializeToCubinPass() {
128   PassRegistration<SerializeToCubinPass> registerSerializeToCubin(
129       "gpu-to-cubin", "Lower GPU kernel function to CUBIN binary annotations",
130       [] {
131         // Initialize LLVM NVPTX backend.
132         LLVMInitializeNVPTXTarget();
133         LLVMInitializeNVPTXTargetInfo();
134         LLVMInitializeNVPTXTargetMC();
135         LLVMInitializeNVPTXAsmPrinter();
136 
137         return std::make_unique<SerializeToCubinPass>();
138       });
139 }
140 #else  // MLIR_GPU_TO_CUBIN_PASS_ENABLE
141 void mlir::registerGpuSerializeToCubinPass() {}
142 #endif // MLIR_GPU_TO_CUBIN_PASS_ENABLE
143