//===----- CGCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This provides a class for CUDA code generation targeting the NVIDIA CUDA
// runtime library.
//
//===----------------------------------------------------------------------===//

#include "CGCUDARuntime.h"
#include "CodeGenFunction.h"
#include "CodeGenModule.h"
#include "clang/AST/Decl.h"
#include "clang/Basic/Cuda.h"
#include "clang/CodeGen/CodeGenABITypes.h"
#include "clang/CodeGen/ConstantInitBuilder.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/ReplaceConstant.h"
#include "llvm/Support/Format.h"

using namespace clang;
using namespace CodeGen;

namespace {
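// Fatbin wrapper magic numbers: the values the CUDA and HIP runtimes expect
// in the first field of the fatbin wrapper struct emitted by
// makeModuleCtorFunction() below.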
constexpr unsigned CudaFatMagic = 0x466243b1;
constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF"

class CGNVCUDARuntime : public CGCUDARuntime {

private:
  llvm::IntegerType *IntTy, *SizeTy;
  llvm::Type *VoidTy;
  llvm::PointerType *CharPtrTy, *VoidPtrTy, *VoidPtrPtrTy;

  /// Convenience reference to the LLVM context.
  llvm::LLVMContext &Context;
  /// Convenience reference to the current module.
  llvm::Module &TheModule;
  /// Keeps track of kernel launch stubs emitted in this module.
  struct KernelInfo {
    llvm::Function *Kernel;
    const Decl *D;
  };
  llvm::SmallVector<KernelInfo, 16> EmittedKernels;
  struct VarInfo {
    llvm::GlobalVariable *Var;
    const VarDecl *D;
    DeviceVarFlags Flags;
  };
  llvm::SmallVector<VarInfo, 16> DeviceVars;
  /// Keeps track of the variable holding the handle of the GPU binary.
  /// Populated by makeModuleCtorFunction() and used to create corresponding
  /// cleanup calls in makeModuleDtorFunction().
  llvm::GlobalVariable *GpuBinaryHandle = nullptr;
  /// Whether we generate relocatable device code.
  bool RelocatableDeviceCode;
  /// Mangle context for device.
  std::unique_ptr<MangleContext> DeviceMC;

  llvm::FunctionCallee getSetupArgumentFn() const;
  llvm::FunctionCallee getLaunchFn() const;

  llvm::FunctionType *getRegisterGlobalsFnTy() const;
  llvm::FunctionType *getCallbackFnTy() const;
  llvm::FunctionType *getRegisterLinkedBinaryFnTy() const;
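  /// Adds the "cuda"/"hip" (or "__cuda"/"__hip") prefix to \p FuncName,
  /// depending on whether we are compiling CUDA or HIP code.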
  std::string addPrefixToName(StringRef FuncName) const;
  std::string addUnderscoredPrefixToName(StringRef FuncName) const;

  /// Creates a function to register all kernel stubs generated in this module.
  llvm::Function *makeRegisterGlobalsFn();

  /// Helper function that generates a constant string and returns a pointer to
  /// the start of the string.  The result of this function can be used
  /// anywhere the C code specifies const char*.
  llvm::Constant *makeConstantString(const std::string &Str,
                                     const std::string &Name = "",
                                     const std::string &SectionName = "",
                                     unsigned Alignment = 0) {
    llvm::Constant *Zeros[] = {llvm::ConstantInt::get(SizeTy, 0),
                               llvm::ConstantInt::get(SizeTy, 0)};
    auto ConstStr = CGM.GetAddrOfConstantCString(Str, Name.c_str());
    llvm::GlobalVariable *GV =
        cast<llvm::GlobalVariable>(ConstStr.getPointer());
    if (!SectionName.empty()) {
      GV->setSection(SectionName);
      // Mark the address as used, which makes sure that this section isn't
      // merged and we really do get it in the object file.
      GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None);
    }
    if (Alignment)
      GV->setAlignment(llvm::Align(Alignment));

    return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(),
                                                ConstStr.getPointer(), Zeros);
  }

  /// Helper function that generates an empty dummy function returning void.
  llvm::Function *makeDummyFunction(llvm::FunctionType *FnTy) {
    assert(FnTy->getReturnType()->isVoidTy() &&
           "Can only generate dummy functions returning void!");
    llvm::Function *DummyFunc = llvm::Function::Create(
        FnTy, llvm::GlobalValue::InternalLinkage, "dummy", &TheModule);

    llvm::BasicBlock *DummyBlock =
        llvm::BasicBlock::Create(Context, "", DummyFunc);
    CGBuilderTy FuncBuilder(CGM, Context);
    FuncBuilder.SetInsertPoint(DummyBlock);
    FuncBuilder.CreateRetVoid();

    return DummyFunc;
  }

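  /// Emits a kernel launch stub body that uses the legacy
  /// cudaSetupArgument/cudaLaunch API (pre-CUDA-9.0).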
  void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args);
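  /// Emits a kernel launch stub body that uses the newer
  /// {cuda|hip}LaunchKernel API (CUDA 9.0+, or HIP with the new launch API).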
  void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args);
  std::string getDeviceSideName(const NamedDecl *ND) override;

public:
  CGNVCUDARuntime(CodeGenModule &CGM);

  void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override;
  void registerDeviceVar(const VarDecl *VD, llvm::GlobalVariable &Var,
                         bool Extern, bool Constant) override {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Variable, Extern, Constant,
                           VD->hasAttr<HIPManagedAttr>(),
                           /*Normalized*/ false, 0}});
  }
  void registerDeviceSurf(const VarDecl *VD, llvm::GlobalVariable &Var,
                          bool Extern, int Type) override {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Surface, Extern, /*Constant*/ false,
                           /*Managed*/ false,
                           /*Normalized*/ false, Type}});
  }
  void registerDeviceTex(const VarDecl *VD, llvm::GlobalVariable &Var,
                         bool Extern, int Type, bool Normalized) override {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Texture, Extern, /*Constant*/ false,
                           /*Managed*/ false, Normalized, Type}});
  }

  /// Creates module constructor function
  llvm::Function *makeModuleCtorFunction() override;
  /// Creates module destructor function
  llvm::Function *makeModuleDtorFunction() override;
};

}

std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const {
  if (CGM.getLangOpts().HIP)
    return ((Twine("hip") + Twine(FuncName)).str());
  return ((Twine("cuda") + Twine(FuncName)).str());
}
std::string
CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const {
  if (CGM.getLangOpts().HIP)
    return ((Twine("__hip") + Twine(FuncName)).str());
  return ((Twine("__cuda") + Twine(FuncName)).str());
}

CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
    : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()),
      TheModule(CGM.getModule()),
      RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode),
      DeviceMC(CGM.getContext().createMangleContext(
          CGM.getContext().getAuxTargetInfo())) {
  CodeGen::CodeGenTypes &Types = CGM.getTypes();
  ASTContext &Ctx = CGM.getContext();

  IntTy = CGM.IntTy;
  SizeTy = CGM.SizeTy;
  VoidTy = CGM.VoidTy;

  CharPtrTy = llvm::PointerType::getUnqual(Types.ConvertType(Ctx.CharTy));
  VoidPtrTy = cast<llvm::PointerType>(Types.ConvertType(Ctx.VoidPtrTy));
  VoidPtrPtrTy = VoidPtrTy->getPointerTo();
}

llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const {
  // cudaError_t cudaSetupArgument(void *, size_t, size_t)
  llvm::Type *Params[] = {VoidPtrTy, SizeTy, SizeTy};
  return CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy, Params, false),
      addPrefixToName("SetupArgument"));
}

llvm::FunctionCallee CGNVCUDARuntime::getLaunchFn() const {
  if (CGM.getLangOpts().HIP) {
    // hipError_t hipLaunchByPtr(char *);
    return CGM.CreateRuntimeFunction(
        llvm::FunctionType::get(IntTy, CharPtrTy, false), "hipLaunchByPtr");
  } else {
    // cudaError_t cudaLaunch(char *);
    return CGM.CreateRuntimeFunction(
        llvm::FunctionType::get(IntTy, CharPtrTy, false), "cudaLaunch");
  }
}

llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy() const {
  return llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false);
}

llvm::FunctionType *CGNVCUDARuntime::getCallbackFnTy() const {
  return llvm::FunctionType::get(VoidTy, VoidPtrTy, false);
}

llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy() const {
  auto CallbackFnTy = getCallbackFnTy();
  auto RegisterGlobalsFnTy = getRegisterGlobalsFnTy();
  llvm::Type *Params[] = {RegisterGlobalsFnTy->getPointerTo(), VoidPtrTy,
                          VoidPtrTy, CallbackFnTy->getPointerTo()};
  return llvm::FunctionType::get(VoidTy, Params, false);
}

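// Returns the name a declaration is known by on the device side: the name
// produced by the device-side mangle context if that context mangles the
// declaration, otherwise the plain identifier. Host and device entities are
// matched up by this name at registration time.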
std::string CGNVCUDARuntime::getDeviceSideName(const NamedDecl *ND) {
  GlobalDecl GD;
  // ND could be either a kernel or a variable.
  if (auto *FD = dyn_cast<FunctionDecl>(ND))
    GD = GlobalDecl(FD, KernelReferenceKind::Kernel);
  else
    GD = GlobalDecl(ND);
  std::string DeviceSideName;
  if (DeviceMC->shouldMangleDeclName(ND)) {
    SmallString<256> Buffer;
    llvm::raw_svector_ostream Out(Buffer);
    DeviceMC->mangleName(GD, Out);
    DeviceSideName = std::string(Out.str());
  } else
    DeviceSideName = std::string(ND->getIdentifier()->getName());
  return DeviceSideName;
}

void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
                                     FunctionArgList &Args) {
  EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl});
  if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
                         CudaFeature::CUDA_USES_NEW_LAUNCH) ||
      (CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI))
    emitDeviceStubBodyNew(CGF, Args);
  else
    emitDeviceStubBodyLegacy(CGF, Args);
}

// CUDA 9.0+ uses a new way to launch kernels: parameters are packed into a
// local array and the kernel is launched using cudaLaunchKernel().
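//
// The emitted stub roughly corresponds to the following pseudo-code (an
// illustrative sketch; names and the exact IR differ):
// \code
// void stub(T0 arg0, T1 arg1) {
//   void *kernel_args[] = {&arg0, &arg1};
//   dim3 grid_dim, block_dim;
//   size_t shmem_size;
//   cudaStream_t stream;
//   __cudaPopCallConfiguration(&grid_dim, &block_dim, &shmem_size, &stream);
//   cudaLaunchKernel((void *)stub, grid_dim, block_dim, kernel_args,
//                    shmem_size, stream);
// }
// \endcode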
void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
                                            FunctionArgList &Args) {
  // Calculate the amount of space we will need for all arguments.  If we
  // have no args, allocate a single pointer so we still have a valid pointer
  // to the argument array that we can pass to the runtime, even if it will
  // be unused.
  Address KernelArgs = CGF.CreateTempAlloca(
      VoidPtrTy, CharUnits::fromQuantity(16), "kernel_args",
      llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size())));
  // Store pointers to the arguments in the locally allocated kernel_args.
  for (unsigned i = 0; i < Args.size(); ++i) {
    llvm::Value *VarPtr = CGF.GetAddrOfLocalVar(Args[i]).getPointer();
    llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(VarPtr, VoidPtrTy);
    CGF.Builder.CreateDefaultAlignedStore(
        VoidVarPtr, CGF.Builder.CreateConstGEP1_32(KernelArgs.getPointer(), i));
  }

  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");

  // Look up the cudaLaunchKernel/hipLaunchKernel function.
  // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
  //                              void **args, size_t sharedMem,
  //                              cudaStream_t stream);
  // hipError_t hipLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
  //                            void **args, size_t sharedMem,
  //                            hipStream_t stream);
  TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl();
  DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl);
  auto LaunchKernelName = addPrefixToName("LaunchKernel");
  IdentifierInfo &cudaLaunchKernelII =
      CGM.getContext().Idents.get(LaunchKernelName);
  FunctionDecl *cudaLaunchKernelFD = nullptr;
  for (const auto &Result : DC->lookup(&cudaLaunchKernelII)) {
    if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Result))
      cudaLaunchKernelFD = FD;
  }

  if (cudaLaunchKernelFD == nullptr) {
    CGM.Error(CGF.CurFuncDecl->getLocation(),
              "Can't find declaration for " + LaunchKernelName);
    return;
  }
  // Create temporary dim3 grid_dim, block_dim.
  ParmVarDecl *GridDimParam = cudaLaunchKernelFD->getParamDecl(1);
  QualType Dim3Ty = GridDimParam->getType();
  Address GridDim =
      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "grid_dim");
  Address BlockDim =
      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim");
  Address ShmemSize =
      CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size");
  Address Stream =
      CGF.CreateTempAlloca(VoidPtrTy, CGM.getPointerAlign(), "stream");
  llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy,
                              {/*gridDim=*/GridDim.getType(),
                               /*blockDim=*/BlockDim.getType(),
                               /*ShmemSize=*/ShmemSize.getType(),
                               /*Stream=*/Stream.getType()},
                              /*isVarArg=*/false),
      addUnderscoredPrefixToName("PopCallConfiguration"));

  CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn,
                              {GridDim.getPointer(), BlockDim.getPointer(),
                               ShmemSize.getPointer(), Stream.getPointer()});

  // Emit the call to cudaLaunchKernel.
  llvm::Value *Kernel = CGF.Builder.CreatePointerCast(CGF.CurFn, VoidPtrTy);
  CallArgList LaunchKernelArgs;
  LaunchKernelArgs.add(RValue::get(Kernel),
                       cudaLaunchKernelFD->getParamDecl(0)->getType());
  LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty);
  LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty);
  LaunchKernelArgs.add(RValue::get(KernelArgs.getPointer()),
                       cudaLaunchKernelFD->getParamDecl(3)->getType());
  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(ShmemSize)),
                       cudaLaunchKernelFD->getParamDecl(4)->getType());
  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(Stream)),
                       cudaLaunchKernelFD->getParamDecl(5)->getType());

  QualType QT = cudaLaunchKernelFD->getType();
  QualType CQT = QT.getCanonicalType();
  llvm::Type *Ty = CGM.getTypes().ConvertType(CQT);
  llvm::FunctionType *FTy = dyn_cast<llvm::FunctionType>(Ty);

  const CGFunctionInfo &FI =
      CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD);
  llvm::FunctionCallee cudaLaunchKernelFn =
      CGM.CreateRuntimeFunction(FTy, LaunchKernelName);
  CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(),
               LaunchKernelArgs);
  CGF.EmitBranch(EndBlock);

  CGF.EmitBlock(EndBlock);
}

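// The legacy (pre-CUDA-9.0) launch protocol copies each argument into the
// pending launch configuration with cudaSetupArgument() and then launches
// through cudaLaunch()/hipLaunchByPtr(). The emitted stub is roughly (an
// illustrative sketch; argument names and offsets are examples):
// \code
// void stub(T0 arg0, T1 arg1) {
//   if (cudaSetupArgument(&arg0, sizeof(arg0), offset0) == 0)
//     if (cudaSetupArgument(&arg1, sizeof(arg1), offset1) == 0)
//       cudaLaunch((char *)stub);
// }
// \endcode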
void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF,
                                               FunctionArgList &Args) {
  // Emit a call to cudaSetupArgument for each arg in Args.
  llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn();
  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
  CharUnits Offset = CharUnits::Zero();
  for (const VarDecl *A : Args) {
    auto TInfo = CGM.getContext().getTypeInfoInChars(A->getType());
    Offset = Offset.alignTo(TInfo.Align);
    llvm::Value *Args[] = {
        CGF.Builder.CreatePointerCast(CGF.GetAddrOfLocalVar(A).getPointer(),
                                      VoidPtrTy),
        llvm::ConstantInt::get(SizeTy, TInfo.Width.getQuantity()),
        llvm::ConstantInt::get(SizeTy, Offset.getQuantity()),
    };
    llvm::CallBase *CB = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args);
    llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0);
    llvm::Value *CBZero = CGF.Builder.CreateICmpEQ(CB, Zero);
    llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next");
    CGF.Builder.CreateCondBr(CBZero, NextBlock, EndBlock);
    CGF.EmitBlock(NextBlock);
    Offset += TInfo.Width;
  }

  // Emit the call to cudaLaunch.
  llvm::FunctionCallee cudaLaunchFn = getLaunchFn();
  llvm::Value *Arg = CGF.Builder.CreatePointerCast(CGF.CurFn, CharPtrTy);
  CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg);
  CGF.EmitBranch(EndBlock);

  CGF.EmitBlock(EndBlock);
}

// Replace all uses of the original variable Var with the address loaded from
// the variable ManagedVar, which is populated by the HIP runtime.
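// For example, for a managed variable @var of type i32 (an illustrative
// sketch; the actual IR depends on the use being rewritten):
//   %v = load i32, i32* @var
// becomes
//   %ld.managed = load i32*, i32** @var.managed
//   %v = load i32, i32* %ld.managed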
static void replaceManagedVar(llvm::GlobalVariable *Var,
                              llvm::GlobalVariable *ManagedVar) {
  SmallVector<SmallVector<llvm::User *, 8>, 8> WorkList;
  for (auto &&VarUse : Var->uses()) {
    WorkList.push_back({VarUse.getUser()});
  }
  while (!WorkList.empty()) {
    auto &&WorkItem = WorkList.pop_back_val();
    auto *U = WorkItem.back();
    if (isa<llvm::ConstantExpr>(U)) {
      for (auto &&UU : U->uses()) {
        WorkItem.push_back(UU.getUser());
        WorkList.push_back(WorkItem);
        WorkItem.pop_back();
      }
      continue;
    }
    if (auto *I = dyn_cast<llvm::Instruction>(U)) {
      llvm::Value *OldV = Var;
      llvm::Instruction *NewV =
          new llvm::LoadInst(Var->getType(), ManagedVar, "ld.managed", false,
                             llvm::Align(Var->getAlignment()), I);
      WorkItem.pop_back();
      // Replace constant expressions directly or indirectly using the managed
      // variable with instructions.
      for (auto &&Op : WorkItem) {
        auto *CE = cast<llvm::ConstantExpr>(Op);
        auto *NewInst = llvm::createReplacementInstr(CE, I);
        NewInst->replaceUsesOfWith(OldV, NewV);
        OldV = CE;
        NewV = NewInst;
      }
      I->replaceUsesOfWith(OldV, NewV);
    } else {
      llvm_unreachable("Invalid use of managed variable");
    }
  }
}

/// Creates a function that sets up state on the host side for CUDA objects that
/// have a presence on both the host and device sides. Specifically, registers
/// the host side of kernel functions and device global variables with the CUDA
/// runtime.
/// \code
/// void __cuda_register_globals(void **GpuBinaryHandle) {
///    __cudaRegisterFunction(GpuBinaryHandle, Kernel0, ...);
///    ...
///    __cudaRegisterFunction(GpuBinaryHandle, KernelM, ...);
///    __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...);
///    ...
///    __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...);
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
  // No need to register anything.
  if (EmittedKernels.empty() && DeviceVars.empty())
    return nullptr;

  llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
      getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_register_globals"), &TheModule);
  llvm::BasicBlock *EntryBB =
      llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc);
  CGBuilderTy Builder(CGM, Context);
  Builder.SetInsertPoint(EntryBB);

  // void __cudaRegisterFunction(void **, const char *, char *, const char *,
  //                             int, uint3*, uint3*, dim3*, dim3*, int*)
  llvm::Type *RegisterFuncParams[] = {
      VoidPtrPtrTy, CharPtrTy, CharPtrTy, CharPtrTy, IntTy,
      VoidPtrTy,    VoidPtrTy, VoidPtrTy, VoidPtrTy, IntTy->getPointerTo()};
  llvm::FunctionCallee RegisterFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy, RegisterFuncParams, false),
      addUnderscoredPrefixToName("RegisterFunction"));

  // Extract the GpuBinaryHandle passed as the first argument to
  // __cuda_register_globals() and generate a __cudaRegisterFunction() call
  // for each emitted kernel.
  llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin();
  for (auto &&I : EmittedKernels) {
    llvm::Constant *KernelName =
        makeConstantString(getDeviceSideName(cast<NamedDecl>(I.D)));
    llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy);
    llvm::Value *Args[] = {
        &GpuBinaryHandlePtr,
        Builder.CreateBitCast(I.Kernel, VoidPtrTy),
        KernelName,
        KernelName,
        llvm::ConstantInt::get(IntTy, -1),
        NullPtr,
        NullPtr,
        NullPtr,
        NullPtr,
        llvm::ConstantPointerNull::get(IntTy->getPointerTo())};
    Builder.CreateCall(RegisterFunc, Args);
  }

  llvm::Type *VarSizeTy = IntTy;
  // For HIP or CUDA 9.0+, the device variable size argument is of type
  // size_t.
  if (CGM.getLangOpts().HIP ||
      ToCudaVersion(CGM.getTarget().getSDKVersion()) >= CudaVersion::CUDA_90)
    VarSizeTy = SizeTy;

  // void __cudaRegisterVar(void **, char *, char *, const char *,
  //                        int, int, int, int)
  llvm::Type *RegisterVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy,
                                     CharPtrTy,    IntTy,     VarSizeTy,
                                     IntTy,        IntTy};
  llvm::FunctionCallee RegisterVar = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, RegisterVarParams, false),
      addUnderscoredPrefixToName("RegisterVar"));
  // void __hipRegisterManagedVar(void **, char *, char *, const char *,
  //                              size_t, unsigned)
  llvm::Type *RegisterManagedVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy,
                                            CharPtrTy,    VarSizeTy, IntTy};
  llvm::FunctionCallee RegisterManagedVar = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, RegisterManagedVarParams, false),
      addUnderscoredPrefixToName("RegisterManagedVar"));
  // void __cudaRegisterSurface(void **, const struct surfaceReference *,
  //                            const void **, const char *, int, int);
  llvm::FunctionCallee RegisterSurf = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(
          VoidTy, {VoidPtrPtrTy, VoidPtrTy, CharPtrTy, CharPtrTy, IntTy, IntTy},
          false),
      addUnderscoredPrefixToName("RegisterSurface"));
  // void __cudaRegisterTexture(void **, const struct textureReference *,
  //                            const void **, const char *, int, int, int)
  llvm::FunctionCallee RegisterTex = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(
          VoidTy,
          {VoidPtrPtrTy, VoidPtrTy, CharPtrTy, CharPtrTy, IntTy, IntTy, IntTy},
          false),
      addUnderscoredPrefixToName("RegisterTexture"));
  for (auto &&Info : DeviceVars) {
    llvm::GlobalVariable *Var = Info.Var;
    llvm::Constant *VarName = makeConstantString(getDeviceSideName(Info.D));
    switch (Info.Flags.getKind()) {
    case DeviceVarFlags::Variable: {
      uint64_t VarSize =
          CGM.getDataLayout().getTypeAllocSize(Var->getValueType());
      if (Info.Flags.isManaged()) {
        auto ManagedVar = new llvm::GlobalVariable(
            CGM.getModule(), Var->getType(),
            /*isConstant=*/false, Var->getLinkage(),
            /*Init=*/llvm::ConstantPointerNull::get(Var->getType()),
            Twine(Var->getName() + ".managed"), /*InsertBefore=*/nullptr,
            llvm::GlobalVariable::NotThreadLocal);
        replaceManagedVar(Var, ManagedVar);
        llvm::Value *Args[] = {
            &GpuBinaryHandlePtr,
            Builder.CreateBitCast(ManagedVar, VoidPtrTy),
            Builder.CreateBitCast(Var, VoidPtrTy),
            VarName,
            llvm::ConstantInt::get(VarSizeTy, VarSize),
            llvm::ConstantInt::get(IntTy, Var->getAlignment())};
        Builder.CreateCall(RegisterManagedVar, Args);
      } else {
        llvm::Value *Args[] = {
            &GpuBinaryHandlePtr,
            Builder.CreateBitCast(Var, VoidPtrTy),
            VarName,
            VarName,
            llvm::ConstantInt::get(IntTy, Info.Flags.isExtern()),
            llvm::ConstantInt::get(VarSizeTy, VarSize),
            llvm::ConstantInt::get(IntTy, Info.Flags.isConstant()),
            llvm::ConstantInt::get(IntTy, 0)};
        Builder.CreateCall(RegisterVar, Args);
      }
      break;
    }
    case DeviceVarFlags::Surface:
      Builder.CreateCall(
          RegisterSurf,
          {&GpuBinaryHandlePtr, Builder.CreateBitCast(Var, VoidPtrTy), VarName,
           VarName, llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
      break;
    case DeviceVarFlags::Texture:
      Builder.CreateCall(
          RegisterTex,
          {&GpuBinaryHandlePtr, Builder.CreateBitCast(Var, VoidPtrTy), VarName,
           VarName, llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isNormalized()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
      break;
    }
  }

  Builder.CreateRetVoid();
  return RegisterKernelsFunc;
}

/// Creates a global constructor function for the module:
///
/// For CUDA:
/// \code
/// void __cuda_module_ctor(void*) {
///     Handle = __cudaRegisterFatBinary(GpuBinaryBlob);
///     __cuda_register_globals(Handle);
/// }
/// \endcode
///
/// For HIP:
/// \code
/// void __hip_module_ctor(void*) {
///     if (__hip_gpubin_handle == 0) {
///         __hip_gpubin_handle = __hipRegisterFatBinary(GpuBinaryBlob);
///         __hip_register_globals(__hip_gpubin_handle);
///     }
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
  bool IsHIP = CGM.getLangOpts().HIP;
  bool IsCUDA = CGM.getLangOpts().CUDA;
  // No need to generate ctors/dtors if there is no GPU binary.
  StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName;
  if (CudaGpuBinaryFileName.empty() && !IsHIP)
    return nullptr;
  if ((IsHIP || (IsCUDA && !RelocatableDeviceCode)) && EmittedKernels.empty() &&
      DeviceVars.empty())
    return nullptr;

  // void __{cuda|hip}_register_globals(void* handle);
  llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
  // We always need a function to pass in as callback. Create a dummy
  // implementation if we don't need to register anything.
  if (RelocatableDeviceCode && !RegisterGlobalsFunc)
    RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy());

  // void ** __{cuda|hip}RegisterFatBinary(void *);
  llvm::FunctionCallee RegisterFatbinFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false),
      addUnderscoredPrefixToName("RegisterFatBinary"));
  // struct { int magic; int version; void *gpu_binary; void *dont_care; };
  llvm::StructType *FatbinWrapperTy =
      llvm::StructType::get(IntTy, IntTy, VoidPtrTy, VoidPtrTy);

  // Register the GPU binary with the CUDA runtime, store the returned handle
  // in a global variable, and save a reference in GpuBinaryHandle to be
  // cleaned up in the destructor on exit. Then associate all known kernels
  // with the GPU binary handle so the CUDA runtime can figure out what to
  // call on the GPU side.
  std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary = nullptr;
  if (!CudaGpuBinaryFileName.empty()) {
    llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> CudaGpuBinaryOrErr =
        llvm::MemoryBuffer::getFileOrSTDIN(CudaGpuBinaryFileName);
    if (std::error_code EC = CudaGpuBinaryOrErr.getError()) {
      CGM.getDiags().Report(diag::err_cannot_open_file)
          << CudaGpuBinaryFileName << EC.message();
      return nullptr;
    }
    CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get());
  }

  llvm::Function *ModuleCtorFunc = llvm::Function::Create(
      llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
      llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_module_ctor"), &TheModule);
  llvm::BasicBlock *CtorEntryBB =
      llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc);
  CGBuilderTy CtorBuilder(CGM, Context);

  CtorBuilder.SetInsertPoint(CtorEntryBB);

  const char *FatbinConstantName;
  const char *FatbinSectionName;
  const char *ModuleIDSectionName;
  StringRef ModuleIDPrefix;
  llvm::Constant *FatBinStr;
  unsigned FatMagic;
  if (IsHIP) {
    FatbinConstantName = ".hip_fatbin";
    FatbinSectionName = ".hipFatBinSegment";

    ModuleIDSectionName = "__hip_module_id";
    ModuleIDPrefix = "__hip_";

    if (CudaGpuBinary) {
      // If the fatbin is available from early finalization, create a string
      // literal containing the fat binary loaded from the given file.
      const unsigned HIPCodeObjectAlign = 4096;
      FatBinStr =
          makeConstantString(std::string(CudaGpuBinary->getBuffer()), "",
                             FatbinConstantName, HIPCodeObjectAlign);
    } else {
      // If the fatbin is not available, create an external symbol
      // __hip_fatbin in section .hip_fatbin. The external symbol is supposed
      // to contain the fat binary but will be populated somewhere else,
      // e.g. by lld through a linker script.
      FatBinStr = new llvm::GlobalVariable(
          CGM.getModule(), CGM.Int8Ty,
          /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr,
          "__hip_fatbin", nullptr,
          llvm::GlobalVariable::NotThreadLocal);
      cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName);
    }

    FatMagic = HIPFatMagic;
  } else {
    if (RelocatableDeviceCode)
      FatbinConstantName = CGM.getTriple().isMacOSX()
                               ? "__NV_CUDA,__nv_relfatbin"
                               : "__nv_relfatbin";
    else
      FatbinConstantName =
          CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
    // NVIDIA's cuobjdump looks for fatbins in this section.
    FatbinSectionName =
        CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";

    ModuleIDSectionName = CGM.getTriple().isMacOSX()
                              ? "__NV_CUDA,__nv_module_id"
                              : "__nv_module_id";
    ModuleIDPrefix = "__nv_";

    // For CUDA, create a string literal containing the fat binary loaded from
    // the given file.
    FatBinStr = makeConstantString(std::string(CudaGpuBinary->getBuffer()), "",
                                   FatbinConstantName, 8);
    FatMagic = CudaFatMagic;
  }

  // Create an initialized wrapper structure that points to the loaded GPU
  // binary.
  ConstantInitBuilder Builder(CGM);
  auto Values = Builder.beginStruct(FatbinWrapperTy);
  // Fatbin wrapper magic.
  Values.addInt(IntTy, FatMagic);
  // Fatbin version.
  Values.addInt(IntTy, 1);
  // Data.
  Values.add(FatBinStr);
  // Unused in fatbin v1.
  Values.add(llvm::ConstantPointerNull::get(VoidPtrTy));
  llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal(
      addUnderscoredPrefixToName("_fatbin_wrapper"), CGM.getPointerAlign(),
      /*constant*/ true);
  FatbinWrapper->setSection(FatbinSectionName);

  // There is only one HIP fat binary per linked module, but there are
  // multiple constructor functions. Make sure the fat binary is registered
  // only once. The constructor functions are executed by the dynamic loader
  // before the program gains control. The dynamic loader cannot execute the
  // constructor functions concurrently, since doing so would not guarantee
  // thread safety of the loaded program. Therefore we can assume sequential
  // execution of constructor functions here.
  if (IsHIP) {
    auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage
                                 : llvm::GlobalValue::LinkOnceAnyLinkage;
    llvm::BasicBlock *IfBlock =
        llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc);
    llvm::BasicBlock *ExitBlock =
        llvm::BasicBlock::Create(Context, "exit", ModuleCtorFunc);
    // The name, size, and initialization pattern of this variable are part
    // of the HIP ABI.
    GpuBinaryHandle = new llvm::GlobalVariable(
        TheModule, VoidPtrPtrTy, /*isConstant=*/false,
        Linkage,
        /*Initializer=*/llvm::ConstantPointerNull::get(VoidPtrPtrTy),
        "__hip_gpubin_handle");
    GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
    // Prevent the weak symbol in different shared libraries from being
    // merged.
    if (Linkage != llvm::GlobalValue::InternalLinkage)
      GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility);
    Address GpuBinaryAddr(
        GpuBinaryHandle,
        CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
    {
      auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
      llvm::Constant *Zero =
          llvm::Constant::getNullValue(HandleValue->getType());
      llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue, Zero);
      CtorBuilder.CreateCondBr(EQZero, IfBlock, ExitBlock);
    }
    {
      CtorBuilder.SetInsertPoint(IfBlock);
      // GpuBinaryHandle = __hipRegisterFatBinary(&FatbinWrapper);
      llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
          RegisterFatbinFunc,
          CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
      CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryAddr);
      CtorBuilder.CreateBr(ExitBlock);
    }
    {
      CtorBuilder.SetInsertPoint(ExitBlock);
      // Call __hip_register_globals(GpuBinaryHandle);
      if (RegisterGlobalsFunc) {
        auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
        CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue);
      }
    }
  } else if (!RelocatableDeviceCode) {
    // Register the binary with the CUDA runtime. This is substantially
    // different in default mode vs. separate compilation.
    // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
    llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
        RegisterFatbinFunc,
        CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
    GpuBinaryHandle = new llvm::GlobalVariable(
        TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage,
        llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle");
    GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
    CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
                                   CGM.getPointerAlign());

    // Call __cuda_register_globals(GpuBinaryHandle);
    if (RegisterGlobalsFunc)
      CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);

    // Call __cudaRegisterFatBinaryEnd(Handle) if this CUDA version needs it.
    if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
                           CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) {
      // void __cudaRegisterFatBinaryEnd(void **);
      llvm::FunctionCallee RegisterFatbinEndFunc = CGM.CreateRuntimeFunction(
          llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
          "__cudaRegisterFatBinaryEnd");
      CtorBuilder.CreateCall(RegisterFatbinEndFunc, RegisterFatbinCall);
    }
  } else {
    // Generate a unique module ID.
    SmallString<64> ModuleID;
    llvm::raw_svector_ostream OS(ModuleID);
    OS << ModuleIDPrefix << llvm::format("%" PRIx64, FatbinWrapper->getGUID());
    llvm::Constant *ModuleIDConstant = makeConstantString(
        std::string(ModuleID.str()), "", ModuleIDSectionName, 32);

    // Create an alias for the FatbinWrapper that nvcc will look for.
    llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage,
                              Twine("__fatbinwrap") + ModuleID, FatbinWrapper);

    // void __cudaRegisterLinkedBinary%ModuleID%(void (*)(void *), void *,
    // void *, void (*)(void **))
    SmallString<128> RegisterLinkedBinaryName("__cudaRegisterLinkedBinary");
    RegisterLinkedBinaryName += ModuleID;
    llvm::FunctionCallee RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction(
        getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName);

    assert(RegisterGlobalsFunc && "Expecting at least dummy function!");
    llvm::Value *Args[] = {RegisterGlobalsFunc,
                           CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy),
                           ModuleIDConstant,
                           makeDummyFunction(getCallbackFnTy())};
    CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args);
  }

  // Create a destructor and register it with atexit() the way NVCC does it.
  // Doing it during the regular destructor phase worked in CUDA before 9.2,
  // but results in a double free in 9.2.
  if (llvm::Function *CleanupFn = makeModuleDtorFunction()) {
    // extern "C" int atexit(void (*f)(void));
    llvm::FunctionType *AtExitTy =
        llvm::FunctionType::get(IntTy, CleanupFn->getType(), false);
    llvm::FunctionCallee AtExitFunc =
        CGM.CreateRuntimeFunction(AtExitTy, "atexit", llvm::AttributeList(),
                                  /*Local=*/true);
    CtorBuilder.CreateCall(AtExitFunc, CleanupFn);
  }

  CtorBuilder.CreateRetVoid();
  return ModuleCtorFunc;
}

/// Creates a global destructor function that unregisters the GPU code blob
/// registered by the constructor.
///
/// For CUDA:
/// \code
/// void __cuda_module_dtor(void*) {
///     __cudaUnregisterFatBinary(Handle);
/// }
/// \endcode
///
/// For HIP:
/// \code
/// void __hip_module_dtor(void*) {
///     if (__hip_gpubin_handle) {
///         __hipUnregisterFatBinary(__hip_gpubin_handle);
///         __hip_gpubin_handle = 0;
///     }
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
  // No need for a destructor if we don't have a handle to unregister.
  if (!GpuBinaryHandle)
    return nullptr;

  // void __cudaUnregisterFatBinary(void ** handle);
  llvm::FunctionCallee UnregisterFatbinFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
      addUnderscoredPrefixToName("UnregisterFatBinary"));

  llvm::Function *ModuleDtorFunc = llvm::Function::Create(
      llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
      llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_module_dtor"), &TheModule);

  llvm::BasicBlock *DtorEntryBB =
      llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc);
  CGBuilderTy DtorBuilder(CGM, Context);
  DtorBuilder.SetInsertPoint(DtorEntryBB);

  Address GpuBinaryAddr(GpuBinaryHandle, CharUnits::fromQuantity(
                                             GpuBinaryHandle->getAlignment()));
  auto HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr);
  // There is only one HIP fat binary per linked module, but there are
  // multiple destructor functions. Make sure the fat binary is unregistered
  // only once.
  if (CGM.getLangOpts().HIP) {
    llvm::BasicBlock *IfBlock =
        llvm::BasicBlock::Create(Context, "if", ModuleDtorFunc);
    llvm::BasicBlock *ExitBlock =
        llvm::BasicBlock::Create(Context, "exit", ModuleDtorFunc);
    llvm::Constant *Zero = llvm::Constant::getNullValue(HandleValue->getType());
    llvm::Value *NEZero = DtorBuilder.CreateICmpNE(HandleValue, Zero);
    DtorBuilder.CreateCondBr(NEZero, IfBlock, ExitBlock);

    DtorBuilder.SetInsertPoint(IfBlock);
    DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
    DtorBuilder.CreateStore(Zero, GpuBinaryAddr);
    DtorBuilder.CreateBr(ExitBlock);

    DtorBuilder.SetInsertPoint(ExitBlock);
  } else {
    DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
  }
  DtorBuilder.CreateRetVoid();
  return ModuleDtorFunc;
}

CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) {
  return new CGNVCUDARuntime(CGM);
}