1 //===- LowerGPUToCUBIN.cpp - Convert GPU kernel to CUBIN blob -------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
// This file implements a pass that serializes a gpu module into a CUBIN blob
// and adds that blob as a string attribute of the module.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "mlir/Dialect/GPU/Transforms/Passes.h"
15 
16 #if MLIR_GPU_TO_CUBIN_PASS_ENABLE
17 #include "mlir/Pass/Pass.h"
18 #include "mlir/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.h"
19 #include "mlir/Target/LLVMIR/Export.h"
20 #include "llvm/Support/TargetSelect.h"
21 
22 #include <cuda.h>
23 
24 using namespace mlir;
25 
emitCudaError(const llvm::Twine & expr,const char * buffer,CUresult result,Location loc)26 static void emitCudaError(const llvm::Twine &expr, const char *buffer,
27                           CUresult result, Location loc) {
28   const char *error;
29   cuGetErrorString(result, &error);
30   emitError(loc, expr.concat(" failed with error code ")
31                      .concat(llvm::Twine{error})
32                      .concat("[")
33                      .concat(buffer)
34                      .concat("]"));
35 }
36 
// Evaluates the CUDA driver call `expr` once. If the call does not report
// CUDA_SUCCESS, the failure is forwarded to emitCudaError and the enclosing
// function returns a value-initialized result. The expansion site must have
// `jitErrorBuffer` and `loc` in scope.
#define RETURN_ON_CUDA_ERROR(expr)                                             \
  do {                                                                         \
    CUresult cudaStatus = (expr);                                              \
    if (cudaStatus != CUDA_SUCCESS) {                                          \
      emitCudaError(#expr, jitErrorBuffer, cudaStatus, loc);                   \
      return {};                                                               \
    }                                                                          \
  } while (false)
44 
45 namespace {
46 class SerializeToCubinPass
47     : public PassWrapper<SerializeToCubinPass, gpu::SerializeToBlobPass> {
48 public:
49   MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(SerializeToCubinPass)
50 
51   SerializeToCubinPass();
52 
getArgument() const53   StringRef getArgument() const override { return "gpu-to-cubin"; }
getDescription() const54   StringRef getDescription() const override {
55     return "Lower GPU kernel function to CUBIN binary annotations";
56   }
57 
58 private:
59   void getDependentDialects(DialectRegistry &registry) const override;
60 
61   // Serializes PTX to CUBIN.
62   std::unique_ptr<std::vector<char>>
63   serializeISA(const std::string &isa) override;
64 };
65 } // namespace
66 
67 // Sets the 'option' to 'value' unless it already has a value.
maybeSetOption(Pass::Option<std::string> & option,const char * value)68 static void maybeSetOption(Pass::Option<std::string> &option,
69                            const char *value) {
70   if (!option.hasValue())
71     option = value;
72 }
73 
SerializeToCubinPass()74 SerializeToCubinPass::SerializeToCubinPass() {
75   maybeSetOption(this->triple, "nvptx64-nvidia-cuda");
76   maybeSetOption(this->chip, "sm_35");
77   maybeSetOption(this->features, "+ptx60");
78 }
79 
getDependentDialects(DialectRegistry & registry) const80 void SerializeToCubinPass::getDependentDialects(
81     DialectRegistry &registry) const {
82   registerNVVMDialectTranslation(registry);
83   gpu::SerializeToBlobPass::getDependentDialects(registry);
84 }
85 
86 std::unique_ptr<std::vector<char>>
serializeISA(const std::string & isa)87 SerializeToCubinPass::serializeISA(const std::string &isa) {
88   Location loc = getOperation().getLoc();
89   char jitErrorBuffer[4096] = {0};
90 
91   RETURN_ON_CUDA_ERROR(cuInit(0));
92 
93   // Linking requires a device context.
94   CUdevice device;
95   RETURN_ON_CUDA_ERROR(cuDeviceGet(&device, 0));
96   CUcontext context;
97   RETURN_ON_CUDA_ERROR(cuCtxCreate(&context, 0, device));
98   CUlinkState linkState;
99 
100   CUjit_option jitOptions[] = {CU_JIT_ERROR_LOG_BUFFER,
101                                CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES};
102   void *jitOptionsVals[] = {jitErrorBuffer,
103                             reinterpret_cast<void *>(sizeof(jitErrorBuffer))};
104 
105   RETURN_ON_CUDA_ERROR(cuLinkCreate(2,              /* number of jit options */
106                                     jitOptions,     /* jit options */
107                                     jitOptionsVals, /* jit option values */
108                                     &linkState));
109 
110   auto kernelName = getOperation().getName().str();
111   RETURN_ON_CUDA_ERROR(cuLinkAddData(
112       linkState, CUjitInputType::CU_JIT_INPUT_PTX,
113       const_cast<void *>(static_cast<const void *>(isa.c_str())), isa.length(),
114       kernelName.c_str(), 0, /* number of jit options */
115       nullptr,               /* jit options */
116       nullptr                /* jit option values */
117       ));
118 
119   void *cubinData;
120   size_t cubinSize;
121   RETURN_ON_CUDA_ERROR(cuLinkComplete(linkState, &cubinData, &cubinSize));
122 
123   char *cubinAsChar = static_cast<char *>(cubinData);
124   auto result =
125       std::make_unique<std::vector<char>>(cubinAsChar, cubinAsChar + cubinSize);
126 
127   // This will also destroy the cubin data.
128   RETURN_ON_CUDA_ERROR(cuLinkDestroy(linkState));
129   RETURN_ON_CUDA_ERROR(cuCtxDestroy(context));
130 
131   return result;
132 }
133 
134 // Register pass to serialize GPU kernel functions to a CUBIN binary annotation.
registerGpuSerializeToCubinPass()135 void mlir::registerGpuSerializeToCubinPass() {
136   PassRegistration<SerializeToCubinPass> registerSerializeToCubin([] {
137     // Initialize LLVM NVPTX backend.
138     LLVMInitializeNVPTXTarget();
139     LLVMInitializeNVPTXTargetInfo();
140     LLVMInitializeNVPTXTargetMC();
141     LLVMInitializeNVPTXAsmPrinter();
142 
143     return std::make_unique<SerializeToCubinPass>();
144   });
145 }
146 #else  // MLIR_GPU_TO_CUBIN_PASS_ENABLE
registerGpuSerializeToCubinPass()147 void mlir::registerGpuSerializeToCubinPass() {}
148 #endif // MLIR_GPU_TO_CUBIN_PASS_ENABLE
149