1 //===- LowerGPUToCUBIN.cpp - Convert GPU kernel to CUBIN blob -------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements a pass that serializes a gpu module into CUBIN blob and
10 // adds that blob as a string attribute of the module.
11 //
12 //===----------------------------------------------------------------------===//
13 #include "mlir/Dialect/GPU/Passes.h"
14 
15 #if MLIR_GPU_TO_CUBIN_PASS_ENABLE
16 #include "mlir/Pass/Pass.h"
17 #include "mlir/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.h"
18 #include "mlir/Target/LLVMIR/Export.h"
19 #include "llvm/Support/TargetSelect.h"
20 
21 #include <cuda.h>
22 
23 using namespace mlir;
24 
25 static void emitCudaError(const llvm::Twine &expr, const char *buffer,
26                           CUresult result, Location loc) {
27   const char *error;
28   cuGetErrorString(result, &error);
29   emitError(loc, expr.concat(" failed with error code ")
30                      .concat(llvm::Twine{error})
31                      .concat("[")
32                      .concat(buffer)
33                      .concat("]"));
34 }
35 
// Wraps a CUDA driver API call: if `expr` evaluates to a non-zero CUresult,
// reports a diagnostic via emitCudaError and returns a default-constructed
// value ({}) from the enclosing function. NOTE: relies on the enclosing scope
// providing `jitErrorBuffer` (char array) and `loc` (Location) by name.
#define RETURN_ON_CUDA_ERROR(expr)                                             \
  do {                                                                         \
    if (auto status = (expr)) {                                                \
      emitCudaError(#expr, jitErrorBuffer, status, loc);                       \
      return {};                                                               \
    }                                                                          \
  } while (false)
43 
namespace {
/// Pass that serializes a GPU module to a CUBIN blob: the inherited
/// SerializeToBlobPass machinery produces ISA (PTX — see serializeISA), and
/// this subclass turns that ISA into a CUBIN via the CUDA driver.
class SerializeToCubinPass
    : public PassWrapper<SerializeToCubinPass, gpu::SerializeToBlobPass> {
public:
  // Fills in default values for the inherited triple/chip/features options.
  SerializeToCubinPass();

  /// Command-line name under which the pass is registered.
  StringRef getArgument() const override { return "gpu-to-cubin"; }
  StringRef getDescription() const override {
    return "Lower GPU kernel function to CUBIN binary annotations";
  }

private:
  // Also registers the NVVM-to-LLVM-IR translation on top of the base
  // pass's dependent dialects.
  void getDependentDialects(DialectRegistry &registry) const override;

  // Serializes PTX to CUBIN.
  std::unique_ptr<std::vector<char>>
  serializeISA(const std::string &isa) override;
};
} // namespace
63 
// Assigns 'value' to 'option' only when the option was not already set.
static void maybeSetOption(Pass::Option<std::string> &option,
                           const char *value) {
  if (option.hasValue())
    return;
  option = value;
}
70 
SerializeToCubinPass::SerializeToCubinPass() {
  // Provide defaults for the serialization options inherited from
  // SerializeToBlobPass; maybeSetOption leaves any value the user already
  // supplied untouched.
  maybeSetOption(this->triple, "nvptx64-nvidia-cuda");
  maybeSetOption(this->chip, "sm_35");
  maybeSetOption(this->features, "+ptx60");
}
76 
void SerializeToCubinPass::getDependentDialects(
    DialectRegistry &registry) const {
  // The NVVM translation interface is needed so the base pass can translate
  // the GPU module to LLVM IR before ISA generation.
  registerNVVMDialectTranslation(registry);
  gpu::SerializeToBlobPass::getDependentDialects(registry);
}
82 
/// Serializes the given PTX `isa` string to a CUBIN blob by JIT-linking it
/// through the CUDA driver. Returns nullptr (via the error macro) on any
/// driver failure, after emitting a diagnostic that includes the JIT log.
std::unique_ptr<std::vector<char>>
SerializeToCubinPass::serializeISA(const std::string &isa) {
  Location loc = getOperation().getLoc();
  // Buffer the driver fills with human-readable JIT errors; referenced by
  // name from RETURN_ON_CUDA_ERROR.
  char jitErrorBuffer[4096] = {0};

  RETURN_ON_CUDA_ERROR(cuInit(0));

  // Linking requires a device context.
  CUdevice device;
  RETURN_ON_CUDA_ERROR(cuDeviceGet(&device, 0));
  CUcontext context;
  RETURN_ON_CUDA_ERROR(cuCtxCreate(&context, 0, device));
  // RAII guard so the context is released on *every* exit path, including the
  // early returns taken by RETURN_ON_CUDA_ERROR below — the previous explicit
  // cuCtxDestroy call at the end leaked the context whenever a later driver
  // call failed.
  struct ContextGuard {
    CUcontext context;
    ~ContextGuard() { cuCtxDestroy(context); }
  } contextGuard{context};

  CUjit_option jitOptions[] = {CU_JIT_ERROR_LOG_BUFFER,
                               CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES};
  void *jitOptionsVals[] = {jitErrorBuffer,
                            reinterpret_cast<void *>(sizeof(jitErrorBuffer))};

  CUlinkState linkState;
  RETURN_ON_CUDA_ERROR(cuLinkCreate(2,              /* number of jit options */
                                    jitOptions,     /* jit options */
                                    jitOptionsVals, /* jit option values */
                                    &linkState));
  // Guard declared after ContextGuard, so it destructs first: the link state
  // is destroyed before the context it was created in. Destroying the link
  // state also frees the cubin storage returned by cuLinkComplete, which is
  // why the result is copied out below before returning.
  struct LinkStateGuard {
    CUlinkState linkState;
    ~LinkStateGuard() { cuLinkDestroy(linkState); }
  } linkStateGuard{linkState};

  auto kernelName = getOperation().getName().str();
  RETURN_ON_CUDA_ERROR(cuLinkAddData(
      linkState, CUjitInputType::CU_JIT_INPUT_PTX,
      const_cast<void *>(static_cast<const void *>(isa.c_str())), isa.length(),
      kernelName.c_str(), 0, /* number of jit options */
      nullptr,               /* jit options */
      nullptr                /* jit option values */
      ));

  void *cubinData;
  size_t cubinSize;
  RETURN_ON_CUDA_ERROR(cuLinkComplete(linkState, &cubinData, &cubinSize));

  // Copy the cubin into an owned buffer; the guards then release the link
  // state (and with it the driver-owned cubin data) and the context.
  char *cubinAsChar = static_cast<char *>(cubinData);
  return std::make_unique<std::vector<char>>(cubinAsChar,
                                             cubinAsChar + cubinSize);
}
130 
// Register pass to serialize GPU kernel functions to a CUBIN binary annotation.
void mlir::registerGpuSerializeToCubinPass() {
  // The constructor callback runs once per pass instantiation; it makes sure
  // the LLVM NVPTX backend is initialized before the pass can be used.
  auto buildPass = [] {
    // Initialize LLVM NVPTX backend.
    LLVMInitializeNVPTXTarget();
    LLVMInitializeNVPTXTargetInfo();
    LLVMInitializeNVPTXTargetMC();
    LLVMInitializeNVPTXAsmPrinter();

    return std::make_unique<SerializeToCubinPass>();
  };
  PassRegistration<SerializeToCubinPass> registerSerializeToCubin(buildPass);
}
144 #else  // MLIR_GPU_TO_CUBIN_PASS_ENABLE
145 void mlir::registerGpuSerializeToCubinPass() {}
146 #endif // MLIR_GPU_TO_CUBIN_PASS_ENABLE
147