1 //===----- CGCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This provides a class for CUDA code generation targeting the NVIDIA CUDA
11 // runtime library.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "CGCUDARuntime.h"
16 #include "CodeGenFunction.h"
17 #include "CodeGenModule.h"
18 #include "clang/AST/Decl.h"
19 #include "clang/CodeGen/ConstantInitBuilder.h"
20 #include "llvm/IR/BasicBlock.h"
21 #include "llvm/IR/CallSite.h"
22 #include "llvm/IR/Constants.h"
23 #include "llvm/IR/DerivedTypes.h"
24 #include "llvm/Support/Format.h"
25 
26 using namespace clang;
27 using namespace CodeGen;
28 
29 namespace {
30 
31 class CGNVCUDARuntime : public CGCUDARuntime {
32 
33 private:
34   llvm::IntegerType *IntTy, *SizeTy;
35   llvm::Type *VoidTy;
36   llvm::PointerType *CharPtrTy, *VoidPtrTy, *VoidPtrPtrTy;
37 
38   /// Convenience reference to LLVM Context
39   llvm::LLVMContext &Context;
40   /// Convenience reference to the current module
41   llvm::Module &TheModule;
42   /// Keeps track of kernel launch stubs emitted in this module
43   llvm::SmallVector<llvm::Function *, 16> EmittedKernels;
44   llvm::SmallVector<std::pair<llvm::GlobalVariable *, unsigned>, 16> DeviceVars;
45   /// Keeps track of variable containing handle of GPU binary. Populated by
46   /// ModuleCtorFunction() and used to create corresponding cleanup calls in
47   /// ModuleDtorFunction()
48   llvm::GlobalVariable *GpuBinaryHandle = nullptr;
49   /// Whether we generate relocatable device code.
50   bool RelocatableDeviceCode;
51 
52   llvm::Constant *getSetupArgumentFn() const;
53   llvm::Constant *getLaunchFn() const;
54 
55   llvm::FunctionType *getRegisterGlobalsFnTy() const;
56   llvm::FunctionType *getCallbackFnTy() const;
57   llvm::FunctionType *getRegisterLinkedBinaryFnTy() const;
58   std::string addPrefixToName(StringRef FuncName) const;
59   std::string addUnderscoredPrefixToName(StringRef FuncName) const;
60 
61   /// Creates a function to register all kernel stubs generated in this module.
62   llvm::Function *makeRegisterGlobalsFn();
63 
64   /// Helper function that generates a constant string and returns a pointer to
65   /// the start of the string.  The result of this function can be used anywhere
66   /// where the C code specifies const char*.
67   llvm::Constant *makeConstantString(const std::string &Str,
68                                      const std::string &Name = "",
69                                      const std::string &SectionName = "",
70                                      unsigned Alignment = 0) {
71     llvm::Constant *Zeros[] = {llvm::ConstantInt::get(SizeTy, 0),
72                                llvm::ConstantInt::get(SizeTy, 0)};
73     auto ConstStr = CGM.GetAddrOfConstantCString(Str, Name.c_str());
74     llvm::GlobalVariable *GV =
75         cast<llvm::GlobalVariable>(ConstStr.getPointer());
76     if (!SectionName.empty())
77       GV->setSection(SectionName);
78     if (Alignment)
79       GV->setAlignment(Alignment);
80 
81     return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(),
82                                                 ConstStr.getPointer(), Zeros);
83   }
84 
85   /// Helper function that generates an empty dummy function returning void.
86   llvm::Function *makeDummyFunction(llvm::FunctionType *FnTy) {
87     assert(FnTy->getReturnType()->isVoidTy() &&
88            "Can only generate dummy functions returning void!");
89     llvm::Function *DummyFunc = llvm::Function::Create(
90         FnTy, llvm::GlobalValue::InternalLinkage, "dummy", &TheModule);
91 
92     llvm::BasicBlock *DummyBlock =
93         llvm::BasicBlock::Create(Context, "", DummyFunc);
94     CGBuilderTy FuncBuilder(CGM, Context);
95     FuncBuilder.SetInsertPoint(DummyBlock);
96     FuncBuilder.CreateRetVoid();
97 
98     return DummyFunc;
99   }
100 
101   void emitDeviceStubBody(CodeGenFunction &CGF, FunctionArgList &Args);
102 
103 public:
104   CGNVCUDARuntime(CodeGenModule &CGM);
105 
106   void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override;
107   void registerDeviceVar(llvm::GlobalVariable &Var, unsigned Flags) override {
108     DeviceVars.push_back(std::make_pair(&Var, Flags));
109   }
110 
111   /// Creates module constructor function
112   llvm::Function *makeModuleCtorFunction() override;
113   /// Creates module destructor function
114   llvm::Function *makeModuleDtorFunction() override;
115 };
116 
117 }
118 
119 std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const {
120   if (CGM.getLangOpts().HIP)
121     return ((Twine("hip") + Twine(FuncName)).str());
122   return ((Twine("cuda") + Twine(FuncName)).str());
123 }
124 std::string
125 CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const {
126   if (CGM.getLangOpts().HIP)
127     return ((Twine("__hip") + Twine(FuncName)).str());
128   return ((Twine("__cuda") + Twine(FuncName)).str());
129 }
130 
131 CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
132     : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()),
133       TheModule(CGM.getModule()),
134       RelocatableDeviceCode(CGM.getLangOpts().CUDARelocatableDeviceCode) {
135   CodeGen::CodeGenTypes &Types = CGM.getTypes();
136   ASTContext &Ctx = CGM.getContext();
137 
138   IntTy = CGM.IntTy;
139   SizeTy = CGM.SizeTy;
140   VoidTy = CGM.VoidTy;
141 
142   CharPtrTy = llvm::PointerType::getUnqual(Types.ConvertType(Ctx.CharTy));
143   VoidPtrTy = cast<llvm::PointerType>(Types.ConvertType(Ctx.VoidPtrTy));
144   VoidPtrPtrTy = VoidPtrTy->getPointerTo();
145 }
146 
147 llvm::Constant *CGNVCUDARuntime::getSetupArgumentFn() const {
148   // cudaError_t cudaSetupArgument(void *, size_t, size_t)
149   llvm::Type *Params[] = {VoidPtrTy, SizeTy, SizeTy};
150   return CGM.CreateRuntimeFunction(
151       llvm::FunctionType::get(IntTy, Params, false),
152       addPrefixToName("SetupArgument"));
153 }
154 
155 llvm::Constant *CGNVCUDARuntime::getLaunchFn() const {
156   if (CGM.getLangOpts().HIP) {
157     // hipError_t hipLaunchByPtr(char *);
158     return CGM.CreateRuntimeFunction(
159         llvm::FunctionType::get(IntTy, CharPtrTy, false), "hipLaunchByPtr");
160   } else {
161     // cudaError_t cudaLaunch(char *);
162     return CGM.CreateRuntimeFunction(
163         llvm::FunctionType::get(IntTy, CharPtrTy, false), "cudaLaunch");
164   }
165 }
166 
167 llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy() const {
168   return llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false);
169 }
170 
171 llvm::FunctionType *CGNVCUDARuntime::getCallbackFnTy() const {
172   return llvm::FunctionType::get(VoidTy, VoidPtrTy, false);
173 }
174 
175 llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy() const {
176   auto CallbackFnTy = getCallbackFnTy();
177   auto RegisterGlobalsFnTy = getRegisterGlobalsFnTy();
178   llvm::Type *Params[] = {RegisterGlobalsFnTy->getPointerTo(), VoidPtrTy,
179                           VoidPtrTy, CallbackFnTy->getPointerTo()};
180   return llvm::FunctionType::get(VoidTy, Params, false);
181 }
182 
183 void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
184                                      FunctionArgList &Args) {
185   EmittedKernels.push_back(CGF.CurFn);
186   emitDeviceStubBody(CGF, Args);
187 }
188 
189 void CGNVCUDARuntime::emitDeviceStubBody(CodeGenFunction &CGF,
190                                          FunctionArgList &Args) {
191   // Emit a call to cudaSetupArgument for each arg in Args.
192   llvm::Constant *cudaSetupArgFn = getSetupArgumentFn();
193   llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
194   CharUnits Offset = CharUnits::Zero();
195   for (const VarDecl *A : Args) {
196     CharUnits TyWidth, TyAlign;
197     std::tie(TyWidth, TyAlign) =
198         CGM.getContext().getTypeInfoInChars(A->getType());
199     Offset = Offset.alignTo(TyAlign);
200     llvm::Value *Args[] = {
201         CGF.Builder.CreatePointerCast(CGF.GetAddrOfLocalVar(A).getPointer(),
202                                       VoidPtrTy),
203         llvm::ConstantInt::get(SizeTy, TyWidth.getQuantity()),
204         llvm::ConstantInt::get(SizeTy, Offset.getQuantity()),
205     };
206     llvm::CallSite CS = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args);
207     llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0);
208     llvm::Value *CSZero = CGF.Builder.CreateICmpEQ(CS.getInstruction(), Zero);
209     llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next");
210     CGF.Builder.CreateCondBr(CSZero, NextBlock, EndBlock);
211     CGF.EmitBlock(NextBlock);
212     Offset += TyWidth;
213   }
214 
215   // Emit the call to cudaLaunch
216   llvm::Constant *cudaLaunchFn = getLaunchFn();
217   llvm::Value *Arg = CGF.Builder.CreatePointerCast(CGF.CurFn, CharPtrTy);
218   CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg);
219   CGF.EmitBranch(EndBlock);
220 
221   CGF.EmitBlock(EndBlock);
222 }
223 
224 /// Creates a function that sets up state on the host side for CUDA objects that
225 /// have a presence on both the host and device sides. Specifically, registers
226 /// the host side of kernel functions and device global variables with the CUDA
227 /// runtime.
228 /// \code
229 /// void __cuda_register_globals(void** GpuBinaryHandle) {
230 ///    __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...);
231 ///    ...
232 ///    __cudaRegisterFunction(GpuBinaryHandle,KernelM,...);
233 ///    __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...);
234 ///    ...
235 ///    __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...);
236 /// }
237 /// \endcode
238 llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
239   // No need to register anything
240   if (EmittedKernels.empty() && DeviceVars.empty())
241     return nullptr;
242 
243   llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
244       getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage,
245       addUnderscoredPrefixToName("_register_globals"), &TheModule);
246   llvm::BasicBlock *EntryBB =
247       llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc);
248   CGBuilderTy Builder(CGM, Context);
249   Builder.SetInsertPoint(EntryBB);
250 
251   // void __cudaRegisterFunction(void **, const char *, char *, const char *,
252   //                             int, uint3*, uint3*, dim3*, dim3*, int*)
253   llvm::Type *RegisterFuncParams[] = {
254       VoidPtrPtrTy, CharPtrTy, CharPtrTy, CharPtrTy, IntTy,
255       VoidPtrTy,    VoidPtrTy, VoidPtrTy, VoidPtrTy, IntTy->getPointerTo()};
256   llvm::Constant *RegisterFunc = CGM.CreateRuntimeFunction(
257       llvm::FunctionType::get(IntTy, RegisterFuncParams, false),
258       addUnderscoredPrefixToName("RegisterFunction"));
259 
260   // Extract GpuBinaryHandle passed as the first argument passed to
261   // __cuda_register_globals() and generate __cudaRegisterFunction() call for
262   // each emitted kernel.
263   llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin();
264   for (llvm::Function *Kernel : EmittedKernels) {
265     llvm::Constant *KernelName = makeConstantString(Kernel->getName());
266     llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy);
267     llvm::Value *Args[] = {
268         &GpuBinaryHandlePtr, Builder.CreateBitCast(Kernel, VoidPtrTy),
269         KernelName, KernelName, llvm::ConstantInt::get(IntTy, -1), NullPtr,
270         NullPtr, NullPtr, NullPtr,
271         llvm::ConstantPointerNull::get(IntTy->getPointerTo())};
272     Builder.CreateCall(RegisterFunc, Args);
273   }
274 
275   // void __cudaRegisterVar(void **, char *, char *, const char *,
276   //                        int, int, int, int)
277   llvm::Type *RegisterVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy,
278                                      CharPtrTy,    IntTy,     IntTy,
279                                      IntTy,        IntTy};
280   llvm::Constant *RegisterVar = CGM.CreateRuntimeFunction(
281       llvm::FunctionType::get(IntTy, RegisterVarParams, false),
282       addUnderscoredPrefixToName("RegisterVar"));
283   for (auto &Pair : DeviceVars) {
284     llvm::GlobalVariable *Var = Pair.first;
285     unsigned Flags = Pair.second;
286     llvm::Constant *VarName = makeConstantString(Var->getName());
287     uint64_t VarSize =
288         CGM.getDataLayout().getTypeAllocSize(Var->getValueType());
289     llvm::Value *Args[] = {
290         &GpuBinaryHandlePtr,
291         Builder.CreateBitCast(Var, VoidPtrTy),
292         VarName,
293         VarName,
294         llvm::ConstantInt::get(IntTy, (Flags & ExternDeviceVar) ? 1 : 0),
295         llvm::ConstantInt::get(IntTy, VarSize),
296         llvm::ConstantInt::get(IntTy, (Flags & ConstantDeviceVar) ? 1 : 0),
297         llvm::ConstantInt::get(IntTy, 0)};
298     Builder.CreateCall(RegisterVar, Args);
299   }
300 
301   Builder.CreateRetVoid();
302   return RegisterKernelsFunc;
303 }
304 
305 /// Creates a global constructor function for the module:
306 /// \code
307 /// void __cuda_module_ctor(void*) {
308 ///     Handle = __cudaRegisterFatBinary(GpuBinaryBlob);
309 ///     __cuda_register_globals(Handle);
310 /// }
311 /// \endcode
312 llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
313   // No need to generate ctors/dtors if there is no GPU binary.
314   std::string GpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName;
315   if (GpuBinaryFileName.empty())
316     return nullptr;
317 
318   // void __cuda_register_globals(void* handle);
319   llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
320   // We always need a function to pass in as callback. Create a dummy
321   // implementation if we don't need to register anything.
322   if (RelocatableDeviceCode && !RegisterGlobalsFunc)
323     RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy());
324 
325   // void ** __cudaRegisterFatBinary(void *);
326   llvm::Constant *RegisterFatbinFunc = CGM.CreateRuntimeFunction(
327       llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false),
328       addUnderscoredPrefixToName("RegisterFatBinary"));
329   // struct { int magic, int version, void * gpu_binary, void * dont_care };
330   llvm::StructType *FatbinWrapperTy =
331       llvm::StructType::get(IntTy, IntTy, VoidPtrTy, VoidPtrTy);
332 
333   // Register GPU binary with the CUDA runtime, store returned handle in a
334   // global variable and save a reference in GpuBinaryHandle to be cleaned up
335   // in destructor on exit. Then associate all known kernels with the GPU binary
336   // handle so CUDA runtime can figure out what to call on the GPU side.
337   llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> GpuBinaryOrErr =
338       llvm::MemoryBuffer::getFileOrSTDIN(GpuBinaryFileName);
339   if (std::error_code EC = GpuBinaryOrErr.getError()) {
340     CGM.getDiags().Report(diag::err_cannot_open_file)
341         << GpuBinaryFileName << EC.message();
342     return nullptr;
343   }
344 
345   llvm::Function *ModuleCtorFunc = llvm::Function::Create(
346       llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
347       llvm::GlobalValue::InternalLinkage,
348       addUnderscoredPrefixToName("_module_ctor"), &TheModule);
349   llvm::BasicBlock *CtorEntryBB =
350       llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc);
351   CGBuilderTy CtorBuilder(CGM, Context);
352 
353   CtorBuilder.SetInsertPoint(CtorEntryBB);
354 
355   const char *FatbinConstantName;
356   if (RelocatableDeviceCode)
357     // TODO: Figure out how this is called on mac OS!
358     FatbinConstantName = "__nv_relfatbin";
359   else
360     FatbinConstantName =
361         CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
362   // NVIDIA's cuobjdump looks for fatbins in this section.
363   const char *FatbinSectionName =
364       CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";
365   // TODO: Figure out how this is called on mac OS!
366   const char *NVModuleIDSectionName = "__nv_module_id";
367 
368   // Create initialized wrapper structure that points to the loaded GPU binary
369   ConstantInitBuilder Builder(CGM);
370   auto Values = Builder.beginStruct(FatbinWrapperTy);
371   // Fatbin wrapper magic.
372   Values.addInt(IntTy, 0x466243b1);
373   // Fatbin version.
374   Values.addInt(IntTy, 1);
375   // Data.
376   Values.add(makeConstantString(GpuBinaryOrErr.get()->getBuffer(), "",
377                                 FatbinConstantName, 8));
378   // Unused in fatbin v1.
379   Values.add(llvm::ConstantPointerNull::get(VoidPtrTy));
380   llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal(
381       addUnderscoredPrefixToName("_fatbin_wrapper"), CGM.getPointerAlign(),
382       /*constant*/ true);
383   FatbinWrapper->setSection(FatbinSectionName);
384 
385   // Register binary with CUDA runtime. This is substantially different in
386   // default mode vs. separate compilation!
387   if (!RelocatableDeviceCode) {
388     // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
389     llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
390         RegisterFatbinFunc,
391         CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
392     GpuBinaryHandle = new llvm::GlobalVariable(
393         TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage,
394         llvm::ConstantPointerNull::get(VoidPtrPtrTy),
395         addUnderscoredPrefixToName("_gpubin_handle"));
396 
397     CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
398                                    CGM.getPointerAlign());
399 
400     // Call __cuda_register_globals(GpuBinaryHandle);
401     if (RegisterGlobalsFunc)
402       CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);
403   } else {
404     // Generate a unique module ID.
405     SmallString<64> NVModuleID;
406     llvm::raw_svector_ostream OS(NVModuleID);
407     OS << "__nv_" << llvm::format("%x", FatbinWrapper->getGUID());
408     llvm::Constant *NVModuleIDConstant =
409         makeConstantString(NVModuleID.str(), "", NVModuleIDSectionName, 32);
410 
411     // Create an alias for the FatbinWrapper that nvcc will look for.
412     llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage,
413                               Twine("__fatbinwrap") + NVModuleID,
414                               FatbinWrapper);
415 
416     // void __cudaRegisterLinkedBinary%NVModuleID%(void (*)(void *), void *,
417     // void *, void (*)(void **))
418     SmallString<128> RegisterLinkedBinaryName(
419         addUnderscoredPrefixToName("RegisterLinkedBinary"));
420     RegisterLinkedBinaryName += NVModuleID;
421     llvm::Constant *RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction(
422         getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName);
423 
424     assert(RegisterGlobalsFunc && "Expecting at least dummy function!");
425     llvm::Value *Args[] = {RegisterGlobalsFunc,
426                            CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy),
427                            NVModuleIDConstant,
428                            makeDummyFunction(getCallbackFnTy())};
429     CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args);
430   }
431 
432   CtorBuilder.CreateRetVoid();
433   return ModuleCtorFunc;
434 }
435 
436 /// Creates a global destructor function that unregisters the GPU code blob
437 /// registered by constructor.
438 /// \code
439 /// void __cuda_module_dtor(void*) {
440 ///     __cudaUnregisterFatBinary(Handle);
441 /// }
442 /// \endcode
443 llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
444   // No need for destructor if we don't have a handle to unregister.
445   if (!GpuBinaryHandle)
446     return nullptr;
447 
448   // void __cudaUnregisterFatBinary(void ** handle);
449   llvm::Constant *UnregisterFatbinFunc = CGM.CreateRuntimeFunction(
450       llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
451       addUnderscoredPrefixToName("UnregisterFatBinary"));
452 
453   llvm::Function *ModuleDtorFunc = llvm::Function::Create(
454       llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
455       llvm::GlobalValue::InternalLinkage,
456       addUnderscoredPrefixToName("_module_dtor"), &TheModule);
457 
458   llvm::BasicBlock *DtorEntryBB =
459       llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc);
460   CGBuilderTy DtorBuilder(CGM, Context);
461   DtorBuilder.SetInsertPoint(DtorEntryBB);
462 
463   auto HandleValue =
464       DtorBuilder.CreateAlignedLoad(GpuBinaryHandle, CGM.getPointerAlign());
465   DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
466 
467   DtorBuilder.CreateRetVoid();
468   return ModuleDtorFunc;
469 }
470 
471 CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) {
472   return new CGNVCUDARuntime(CGM);
473 }
474