1 //===----- CGCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This provides a class for CUDA code generation targeting the NVIDIA CUDA
10 // runtime library.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "CGCUDARuntime.h"
15 #include "CodeGenFunction.h"
16 #include "CodeGenModule.h"
17 #include "clang/AST/Decl.h"
18 #include "clang/Basic/Cuda.h"
19 #include "clang/CodeGen/CodeGenABITypes.h"
20 #include "clang/CodeGen/ConstantInitBuilder.h"
21 #include "llvm/IR/BasicBlock.h"
22 #include "llvm/IR/Constants.h"
23 #include "llvm/IR/DerivedTypes.h"
24 #include "llvm/IR/ReplaceConstant.h"
25 #include "llvm/Support/Format.h"
26 
27 using namespace clang;
28 using namespace CodeGen;
29 
30 namespace {
31 constexpr unsigned CudaFatMagic = 0x466243b1;
32 constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF"
33 
class CGNVCUDARuntime : public CGCUDARuntime {

private:
  // Frequently used LLVM types, cached from CodeGenModule in the constructor.
  llvm::IntegerType *IntTy, *SizeTy;
  llvm::Type *VoidTy;
  llvm::PointerType *CharPtrTy, *VoidPtrTy, *VoidPtrPtrTy;

  /// Convenience reference to LLVM Context
  llvm::LLVMContext &Context;
  /// Convenience reference to the current module
  llvm::Module &TheModule;
  /// Keeps track of kernel launch stubs emitted in this module
  struct KernelInfo {
    llvm::Function *Kernel; // Host-side stub function emitted for the kernel.
    const Decl *D;          // The kernel's declaration.
  };
  llvm::SmallVector<KernelInfo, 16> EmittedKernels;
  /// A device-side global variable seen during host codegen, plus the flags
  /// needed to register it with the runtime (__cudaRegisterVar and friends).
  struct VarInfo {
    llvm::GlobalVariable *Var;
    const VarDecl *D;
    DeviceVarFlags Flags;
  };
  llvm::SmallVector<VarInfo, 16> DeviceVars;
  /// Keeps track of variable containing handle of GPU binary. Populated by
  /// ModuleCtorFunction() and used to create corresponding cleanup calls in
  /// ModuleDtorFunction()
  llvm::GlobalVariable *GpuBinaryHandle = nullptr;
  /// Whether we generate relocatable device code.
  bool RelocatableDeviceCode;
  /// Mangle context for device.
  std::unique_ptr<MangleContext> DeviceMC;

  // Runtime entry points used by the legacy launch path:
  // {cuda|hip}SetupArgument and cudaLaunch/hipLaunchByPtr.
  llvm::FunctionCallee getSetupArgumentFn() const;
  llvm::FunctionCallee getLaunchFn() const;

  llvm::FunctionType *getRegisterGlobalsFnTy() const;
  llvm::FunctionType *getCallbackFnTy() const;
  llvm::FunctionType *getRegisterLinkedBinaryFnTy() const;
  // Prepend "cuda"/"hip" (resp. "__cuda"/"__hip") to a runtime entry point
  // name, depending on whether we are compiling CUDA or HIP.
  std::string addPrefixToName(StringRef FuncName) const;
  std::string addUnderscoredPrefixToName(StringRef FuncName) const;

  /// Creates a function to register all kernel stubs generated in this module.
  llvm::Function *makeRegisterGlobalsFn();

  /// Helper function that generates a constant string and returns a pointer to
  /// the start of the string.  The result of this function can be used anywhere
  /// where the C code specifies const char*.
  llvm::Constant *makeConstantString(const std::string &Str,
                                     const std::string &Name = "",
                                     const std::string &SectionName = "",
                                     unsigned Alignment = 0) {
    llvm::Constant *Zeros[] = {llvm::ConstantInt::get(SizeTy, 0),
                               llvm::ConstantInt::get(SizeTy, 0)};
    auto ConstStr = CGM.GetAddrOfConstantCString(Str, Name.c_str());
    llvm::GlobalVariable *GV =
        cast<llvm::GlobalVariable>(ConstStr.getPointer());
    if (!SectionName.empty()) {
      GV->setSection(SectionName);
      // Mark the address as used which make sure that this section isn't
      // merged and we will really have it in the object file.
      GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None);
    }
    if (Alignment)
      GV->setAlignment(llvm::Align(Alignment));

    // GEP through the array to the first character, yielding a "const char *".
    return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(),
                                                ConstStr.getPointer(), Zeros);
  }

  /// Helper function that generates an empty dummy function returning void.
  llvm::Function *makeDummyFunction(llvm::FunctionType *FnTy) {
    assert(FnTy->getReturnType()->isVoidTy() &&
           "Can only generate dummy functions returning void!");
    llvm::Function *DummyFunc = llvm::Function::Create(
        FnTy, llvm::GlobalValue::InternalLinkage, "dummy", &TheModule);

    llvm::BasicBlock *DummyBlock =
        llvm::BasicBlock::Create(Context, "", DummyFunc);
    CGBuilderTy FuncBuilder(CGM, Context);
    FuncBuilder.SetInsertPoint(DummyBlock);
    FuncBuilder.CreateRetVoid();

    return DummyFunc;
  }

  void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args);
  void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args);
  std::string getDeviceSideName(const NamedDecl *ND) override;

  // Record a device __device__/__constant__ variable for later registration.
  void registerDeviceVar(const VarDecl *VD, llvm::GlobalVariable &Var,
                         bool Extern, bool Constant) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Variable, Extern, Constant,
                           VD->hasAttr<HIPManagedAttr>(),
                           /*Normalized*/ false, 0}});
  }
  // Record a surface reference for later registration.
  void registerDeviceSurf(const VarDecl *VD, llvm::GlobalVariable &Var,
                          bool Extern, int Type) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Surface, Extern, /*Constant*/ false,
                           /*Managed*/ false,
                           /*Normalized*/ false, Type}});
  }
  // Record a texture reference for later registration.
  void registerDeviceTex(const VarDecl *VD, llvm::GlobalVariable &Var,
                         bool Extern, int Type, bool Normalized) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Texture, Extern, /*Constant*/ false,
                           /*Managed*/ false, Normalized, Type}});
  }

  /// Creates module constructor function
  llvm::Function *makeModuleCtorFunction();
  /// Creates module destructor function
  llvm::Function *makeModuleDtorFunction();
  /// Transform managed variables for device compilation.
  void transformManagedVars();

public:
  CGNVCUDARuntime(CodeGenModule &CGM);

  void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override;
  void handleVarRegistration(const VarDecl *VD,
                             llvm::GlobalVariable &Var) override;
  void
  internalizeDeviceSideVar(const VarDecl *D,
                           llvm::GlobalValue::LinkageTypes &Linkage) override;

  llvm::Function *finalizeModule() override;
};
166 
167 }
168 
169 std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const {
170   if (CGM.getLangOpts().HIP)
171     return ((Twine("hip") + Twine(FuncName)).str());
172   return ((Twine("cuda") + Twine(FuncName)).str());
173 }
174 std::string
175 CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const {
176   if (CGM.getLangOpts().HIP)
177     return ((Twine("__hip") + Twine(FuncName)).str());
178   return ((Twine("__cuda") + Twine(FuncName)).str());
179 }
180 
// Caches commonly used LLVM types and sets up the device-side mangle context.
// The mangle context is created against the aux target when one exists (i.e.
// when host and device are compiled with different targets).
CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
    : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()),
      TheModule(CGM.getModule()),
      RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode),
      DeviceMC(CGM.getContext().createMangleContext(
          CGM.getContext().getAuxTargetInfo())) {
  CodeGen::CodeGenTypes &Types = CGM.getTypes();
  ASTContext &Ctx = CGM.getContext();

  // Cache frequently used LLVM types.
  IntTy = CGM.IntTy;
  SizeTy = CGM.SizeTy;
  VoidTy = CGM.VoidTy;

  CharPtrTy = llvm::PointerType::getUnqual(Types.ConvertType(Ctx.CharTy));
  VoidPtrTy = cast<llvm::PointerType>(Types.ConvertType(Ctx.VoidPtrTy));
  VoidPtrPtrTy = VoidPtrTy->getPointerTo();
  if (CGM.getContext().getAuxTargetInfo()) {
    // If the host and device have different C++ ABIs, mark it as the device
    // mangle context so that the mangling needs to retrieve the additional
    // device lambda mangling number instead of the regular host one.
    DeviceMC->setDeviceMangleContext(
        CGM.getContext().getTargetInfo().getCXXABI().isMicrosoft() &&
        CGM.getContext().getAuxTargetInfo()->getCXXABI().isItaniumFamily());
  }
}
206 
207 llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const {
208   // cudaError_t cudaSetupArgument(void *, size_t, size_t)
209   llvm::Type *Params[] = {VoidPtrTy, SizeTy, SizeTy};
210   return CGM.CreateRuntimeFunction(
211       llvm::FunctionType::get(IntTy, Params, false),
212       addPrefixToName("SetupArgument"));
213 }
214 
215 llvm::FunctionCallee CGNVCUDARuntime::getLaunchFn() const {
216   if (CGM.getLangOpts().HIP) {
217     // hipError_t hipLaunchByPtr(char *);
218     return CGM.CreateRuntimeFunction(
219         llvm::FunctionType::get(IntTy, CharPtrTy, false), "hipLaunchByPtr");
220   } else {
221     // cudaError_t cudaLaunch(char *);
222     return CGM.CreateRuntimeFunction(
223         llvm::FunctionType::get(IntTy, CharPtrTy, false), "cudaLaunch");
224   }
225 }
226 
227 llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy() const {
228   return llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false);
229 }
230 
231 llvm::FunctionType *CGNVCUDARuntime::getCallbackFnTy() const {
232   return llvm::FunctionType::get(VoidTy, VoidPtrTy, false);
233 }
234 
235 llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy() const {
236   auto CallbackFnTy = getCallbackFnTy();
237   auto RegisterGlobalsFnTy = getRegisterGlobalsFnTy();
238   llvm::Type *Params[] = {RegisterGlobalsFnTy->getPointerTo(), VoidPtrTy,
239                           VoidPtrTy, CallbackFnTy->getPointerTo()};
240   return llvm::FunctionType::get(VoidTy, Params, false);
241 }
242 
// Computes the name of the device-side counterpart of \p ND (a kernel or a
// device variable), using the device mangle context. For HIP static file-scope
// variables compiled with -fgpu-rdc and a CUID, a unique postfix is appended
// so names do not clash across translation units.
std::string CGNVCUDARuntime::getDeviceSideName(const NamedDecl *ND) {
  GlobalDecl GD;
  // D could be either a kernel or a variable.
  if (auto *FD = dyn_cast<FunctionDecl>(ND))
    GD = GlobalDecl(FD, KernelReferenceKind::Kernel);
  else
    GD = GlobalDecl(ND);
  std::string DeviceSideName;
  if (DeviceMC->shouldMangleDeclName(ND)) {
    SmallString<256> Buffer;
    llvm::raw_svector_ostream Out(Buffer);
    DeviceMC->mangleName(GD, Out);
    DeviceSideName = std::string(Out.str());
  } else
    // Unmangled (e.g. extern "C") names are used verbatim.
    DeviceSideName = std::string(ND->getIdentifier()->getName());

  // Make unique name for device side static file-scope variable for HIP.
  if (CGM.getContext().shouldExternalizeStaticVar(ND) &&
      CGM.getLangOpts().GPURelocatableDeviceCode &&
      !CGM.getLangOpts().CUID.empty()) {
    SmallString<256> Buffer;
    llvm::raw_svector_ostream Out(Buffer);
    Out << DeviceSideName;
    CGM.printPostfixForExternalizedStaticVar(Out);
    DeviceSideName = std::string(Out.str());
  }
  return DeviceSideName;
}
271 
272 void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
273                                      FunctionArgList &Args) {
274   EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl});
275   if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
276                          CudaFeature::CUDA_USES_NEW_LAUNCH) ||
277       (CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI))
278     emitDeviceStubBodyNew(CGF, Args);
279   else
280     emitDeviceStubBodyLegacy(CGF, Args);
281 }
282 
// CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local
// array and kernels are launched using cudaLaunchKernel().
void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
                                            FunctionArgList &Args) {
  // Calculate amount of space we will need for all arguments.  If we have no
  // args, allocate a single pointer so we still have a valid pointer to the
  // argument array that we can pass to runtime, even if it will be unused.
  Address KernelArgs = CGF.CreateTempAlloca(
      VoidPtrTy, CharUnits::fromQuantity(16), "kernel_args",
      llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size())));
  // Store pointers to the arguments in a locally allocated launch_args.
  for (unsigned i = 0; i < Args.size(); ++i) {
    llvm::Value* VarPtr = CGF.GetAddrOfLocalVar(Args[i]).getPointer();
    llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(VarPtr, VoidPtrTy);
    CGF.Builder.CreateDefaultAlignedStore(
        VoidVarPtr, CGF.Builder.CreateConstGEP1_32(KernelArgs.getPointer(), i));
  }

  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");

  // Lookup cudaLaunchKernel/hipLaunchKernel function.
  // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
  //                              void **args, size_t sharedMem,
  //                              cudaStream_t stream);
  // hipError_t hipLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
  //                            void **args, size_t sharedMem,
  //                            hipStream_t stream);
  TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl();
  DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl);
  auto LaunchKernelName = addPrefixToName("LaunchKernel");
  IdentifierInfo &cudaLaunchKernelII =
      CGM.getContext().Idents.get(LaunchKernelName);
  FunctionDecl *cudaLaunchKernelFD = nullptr;
  for (const auto &Result : DC->lookup(&cudaLaunchKernelII)) {
    if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Result))
      cudaLaunchKernelFD = FD;
  }

  // The launch API declaration normally comes from the CUDA/HIP headers; if
  // it is missing we cannot type the call correctly, so diagnose and bail.
  if (cudaLaunchKernelFD == nullptr) {
    CGM.Error(CGF.CurFuncDecl->getLocation(),
              "Can't find declaration for " + LaunchKernelName);
    return;
  }
  // Create temporary dim3 grid_dim, block_dim.
  ParmVarDecl *GridDimParam = cudaLaunchKernelFD->getParamDecl(1);
  QualType Dim3Ty = GridDimParam->getType();
  Address GridDim =
      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "grid_dim");
  Address BlockDim =
      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim");
  Address ShmemSize =
      CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size");
  Address Stream =
      CGF.CreateTempAlloca(VoidPtrTy, CGM.getPointerAlign(), "stream");
  // __{cuda|hip}PopCallConfiguration retrieves the launch configuration the
  // <<<...>>> expression pushed before calling this stub.
  llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy,
                              {/*gridDim=*/GridDim.getType(),
                               /*blockDim=*/BlockDim.getType(),
                               /*ShmemSize=*/ShmemSize.getType(),
                               /*Stream=*/Stream.getType()},
                              /*isVarArg=*/false),
      addUnderscoredPrefixToName("PopCallConfiguration"));

  CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn,
                              {GridDim.getPointer(), BlockDim.getPointer(),
                               ShmemSize.getPointer(), Stream.getPointer()});

  // Emit the call to cudaLaunch
  llvm::Value *Kernel = CGF.Builder.CreatePointerCast(CGF.CurFn, VoidPtrTy);
  CallArgList LaunchKernelArgs;
  LaunchKernelArgs.add(RValue::get(Kernel),
                       cudaLaunchKernelFD->getParamDecl(0)->getType());
  LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty);
  LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty);
  LaunchKernelArgs.add(RValue::get(KernelArgs.getPointer()),
                       cudaLaunchKernelFD->getParamDecl(3)->getType());
  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(ShmemSize)),
                       cudaLaunchKernelFD->getParamDecl(4)->getType());
  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(Stream)),
                       cudaLaunchKernelFD->getParamDecl(5)->getType());

  // Build the call using the ABI arrangement of the real declaration so the
  // aggregate dim3 arguments are passed exactly as the header expects.
  QualType QT = cudaLaunchKernelFD->getType();
  QualType CQT = QT.getCanonicalType();
  llvm::Type *Ty = CGM.getTypes().ConvertType(CQT);
  llvm::FunctionType *FTy = dyn_cast<llvm::FunctionType>(Ty);

  const CGFunctionInfo &FI =
      CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD);
  llvm::FunctionCallee cudaLaunchKernelFn =
      CGM.CreateRuntimeFunction(FTy, LaunchKernelName);
  CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(),
               LaunchKernelArgs);
  CGF.EmitBranch(EndBlock);

  CGF.EmitBlock(EndBlock);
}
381 
// Pre-CUDA-9.0 launch sequence: each kernel argument is copied into the launch
// staging area with {cuda|hip}SetupArgument, then the kernel is launched via
// cudaLaunch/hipLaunchByPtr, keyed by the address of this stub. Any non-zero
// (error) return from SetupArgument short-circuits to the end of the stub.
void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF,
                                               FunctionArgList &Args) {
  // Emit a call to cudaSetupArgument for each arg in Args.
  llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn();
  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
  CharUnits Offset = CharUnits::Zero();
  for (const VarDecl *A : Args) {
    auto TInfo = CGM.getContext().getTypeInfoInChars(A->getType());
    // Arguments are laid out sequentially, aligned to their natural alignment.
    Offset = Offset.alignTo(TInfo.Align);
    // NOTE: this local array intentionally shadows the function parameter.
    llvm::Value *Args[] = {
        CGF.Builder.CreatePointerCast(CGF.GetAddrOfLocalVar(A).getPointer(),
                                      VoidPtrTy),
        llvm::ConstantInt::get(SizeTy, TInfo.Width.getQuantity()),
        llvm::ConstantInt::get(SizeTy, Offset.getQuantity()),
    };
    llvm::CallBase *CB = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args);
    // Continue only while SetupArgument returns cudaSuccess (0).
    llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0);
    llvm::Value *CBZero = CGF.Builder.CreateICmpEQ(CB, Zero);
    llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next");
    CGF.Builder.CreateCondBr(CBZero, NextBlock, EndBlock);
    CGF.EmitBlock(NextBlock);
    Offset += TInfo.Width;
  }

  // Emit the call to cudaLaunch
  llvm::FunctionCallee cudaLaunchFn = getLaunchFn();
  llvm::Value *Arg = CGF.Builder.CreatePointerCast(CGF.CurFn, CharPtrTy);
  CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg);
  CGF.EmitBranch(EndBlock);

  CGF.EmitBlock(EndBlock);
}
414 
// Replace the original variable Var with the address loaded from variable
// ManagedVar populated by HIP runtime.
//
// Uses of Var may be nested inside constant expressions; each worklist item is
// the chain of users from a direct use of Var down to the instruction that
// ultimately consumes it. Constant expressions along the chain are rewritten
// into equivalent instructions so the loaded pointer can be substituted.
static void replaceManagedVar(llvm::GlobalVariable *Var,
                              llvm::GlobalVariable *ManagedVar) {
  SmallVector<SmallVector<llvm::User *, 8>, 8> WorkList;
  // Seed the worklist with one single-element chain per direct use of Var.
  for (auto &&VarUse : Var->uses()) {
    WorkList.push_back({VarUse.getUser()});
  }
  while (!WorkList.empty()) {
    auto &&WorkItem = WorkList.pop_back_val();
    auto *U = WorkItem.back();
    if (isa<llvm::ConstantExpr>(U)) {
      // Extend the chain through the constant expression: queue one new chain
      // per user of U, each ending at that user.
      for (auto &&UU : U->uses()) {
        WorkItem.push_back(UU.getUser());
        WorkList.push_back(WorkItem);
        WorkItem.pop_back();
      }
      continue;
    }
    if (auto *I = dyn_cast<llvm::Instruction>(U)) {
      llvm::Value *OldV = Var;
      // Load the real address that the HIP runtime stored in ManagedVar,
      // right before the consuming instruction.
      llvm::Instruction *NewV =
          new llvm::LoadInst(Var->getType(), ManagedVar, "ld.managed", false,
                             llvm::Align(Var->getAlignment()), I);
      WorkItem.pop_back();
      // Replace constant expressions directly or indirectly using the managed
      // variable with instructions.
      for (auto &&Op : WorkItem) {
        auto *CE = cast<llvm::ConstantExpr>(Op);
        auto *NewInst = llvm::createReplacementInstr(CE, I);
        NewInst->replaceUsesOfWith(OldV, NewV);
        OldV = CE;
        NewV = NewInst;
      }
      I->replaceUsesOfWith(OldV, NewV);
    } else {
      llvm_unreachable("Invalid use of managed variable");
    }
  }
}
455 
/// Creates a function that sets up state on the host side for CUDA objects that
/// have a presence on both the host and device sides. Specifically, registers
/// the host side of kernel functions and device global variables with the CUDA
/// runtime.
/// \code
/// void __cuda_register_globals(void** GpuBinaryHandle) {
///    __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...);
///    ...
///    __cudaRegisterFunction(GpuBinaryHandle,KernelM,...);
///    __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...);
///    ...
///    __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...);
/// }
/// \endcode
/// Returns nullptr when there is nothing to register.
llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
  // No need to register anything
  if (EmittedKernels.empty() && DeviceVars.empty())
    return nullptr;

  llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
      getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_register_globals"), &TheModule);
  llvm::BasicBlock *EntryBB =
      llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc);
  CGBuilderTy Builder(CGM, Context);
  Builder.SetInsertPoint(EntryBB);

  // void __cudaRegisterFunction(void **, const char *, char *, const char *,
  //                             int, uint3*, uint3*, dim3*, dim3*, int*)
  llvm::Type *RegisterFuncParams[] = {
      VoidPtrPtrTy, CharPtrTy, CharPtrTy, CharPtrTy, IntTy,
      VoidPtrTy,    VoidPtrTy, VoidPtrTy, VoidPtrTy, IntTy->getPointerTo()};
  llvm::FunctionCallee RegisterFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy, RegisterFuncParams, false),
      addUnderscoredPrefixToName("RegisterFunction"));

  // Extract GpuBinaryHandle passed as the first argument passed to
  // __cuda_register_globals() and generate __cudaRegisterFunction() call for
  // each emitted kernel.
  llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin();
  for (auto &&I : EmittedKernels) {
    llvm::Constant *KernelName =
        makeConstantString(getDeviceSideName(cast<NamedDecl>(I.D)));
    llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy);
    llvm::Value *Args[] = {
        &GpuBinaryHandlePtr,
        Builder.CreateBitCast(I.Kernel, VoidPtrTy),
        KernelName,
        KernelName,
        llvm::ConstantInt::get(IntTy, -1),
        NullPtr,
        NullPtr,
        NullPtr,
        NullPtr,
        llvm::ConstantPointerNull::get(IntTy->getPointerTo())};
    Builder.CreateCall(RegisterFunc, Args);
  }

  llvm::Type *VarSizeTy = IntTy;
  // For HIP or CUDA 9.0+, device variable size is type of `size_t`.
  if (CGM.getLangOpts().HIP ||
      ToCudaVersion(CGM.getTarget().getSDKVersion()) >= CudaVersion::CUDA_90)
    VarSizeTy = SizeTy;

  // void __cudaRegisterVar(void **, char *, char *, const char *,
  //                        int, int, int, int)
  llvm::Type *RegisterVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy,
                                     CharPtrTy,    IntTy,     VarSizeTy,
                                     IntTy,        IntTy};
  llvm::FunctionCallee RegisterVar = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, RegisterVarParams, false),
      addUnderscoredPrefixToName("RegisterVar"));
  // void __hipRegisterManagedVar(void **, char *, char *, const char *,
  //                              size_t, unsigned)
  llvm::Type *RegisterManagedVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy,
                                            CharPtrTy,    VarSizeTy, IntTy};
  llvm::FunctionCallee RegisterManagedVar = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, RegisterManagedVarParams, false),
      addUnderscoredPrefixToName("RegisterManagedVar"));
  // void __cudaRegisterSurface(void **, const struct surfaceReference *,
  //                            const void **, const char *, int, int);
  llvm::FunctionCallee RegisterSurf = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(
          VoidTy, {VoidPtrPtrTy, VoidPtrTy, CharPtrTy, CharPtrTy, IntTy, IntTy},
          false),
      addUnderscoredPrefixToName("RegisterSurface"));
  // void __cudaRegisterTexture(void **, const struct textureReference *,
  //                            const void **, const char *, int, int, int)
  llvm::FunctionCallee RegisterTex = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(
          VoidTy,
          {VoidPtrPtrTy, VoidPtrTy, CharPtrTy, CharPtrTy, IntTy, IntTy, IntTy},
          false),
      addUnderscoredPrefixToName("RegisterTexture"));
  for (auto &&Info : DeviceVars) {
    llvm::GlobalVariable *Var = Info.Var;
    assert((!Var->isDeclaration() || Info.Flags.isManaged()) &&
           "External variables should not show up here, except HIP managed "
           "variables");
    llvm::Constant *VarName = makeConstantString(getDeviceSideName(Info.D));
    switch (Info.Flags.getKind()) {
    case DeviceVarFlags::Variable: {
      uint64_t VarSize =
          CGM.getDataLayout().getTypeAllocSize(Var->getValueType());
      if (Info.Flags.isManaged()) {
        // A managed variable is registered via an indirection: a new global
        // (taking over the original name) holds a pointer that the HIP
        // runtime fills in, and all uses of the original variable are
        // rewritten to load through it.
        auto ManagedVar = new llvm::GlobalVariable(
            CGM.getModule(), Var->getType(),
            /*isConstant=*/false, Var->getLinkage(),
            /*Init=*/Var->isDeclaration()
                ? nullptr
                : llvm::ConstantPointerNull::get(Var->getType()),
            /*Name=*/"", /*InsertBefore=*/nullptr,
            llvm::GlobalVariable::NotThreadLocal);
        ManagedVar->setDSOLocal(Var->isDSOLocal());
        ManagedVar->setVisibility(Var->getVisibility());
        ManagedVar->setExternallyInitialized(true);
        ManagedVar->takeName(Var);
        Var->setName(Twine(ManagedVar->getName() + ".managed"));
        replaceManagedVar(Var, ManagedVar);
        llvm::Value *Args[] = {
            &GpuBinaryHandlePtr,
            Builder.CreateBitCast(ManagedVar, VoidPtrTy),
            Builder.CreateBitCast(Var, VoidPtrTy),
            VarName,
            llvm::ConstantInt::get(VarSizeTy, VarSize),
            llvm::ConstantInt::get(IntTy, Var->getAlignment())};
        if (!Var->isDeclaration())
          Builder.CreateCall(RegisterManagedVar, Args);
      } else {
        llvm::Value *Args[] = {
            &GpuBinaryHandlePtr,
            Builder.CreateBitCast(Var, VoidPtrTy),
            VarName,
            VarName,
            llvm::ConstantInt::get(IntTy, Info.Flags.isExtern()),
            llvm::ConstantInt::get(VarSizeTy, VarSize),
            llvm::ConstantInt::get(IntTy, Info.Flags.isConstant()),
            llvm::ConstantInt::get(IntTy, 0)};
        Builder.CreateCall(RegisterVar, Args);
      }
      break;
    }
    case DeviceVarFlags::Surface:
      Builder.CreateCall(
          RegisterSurf,
          {&GpuBinaryHandlePtr, Builder.CreateBitCast(Var, VoidPtrTy), VarName,
           VarName, llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
      break;
    case DeviceVarFlags::Texture:
      Builder.CreateCall(
          RegisterTex,
          {&GpuBinaryHandlePtr, Builder.CreateBitCast(Var, VoidPtrTy), VarName,
           VarName, llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isNormalized()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
      break;
    }
  }

  Builder.CreateRetVoid();
  return RegisterKernelsFunc;
}
619 
620 /// Creates a global constructor function for the module:
621 ///
622 /// For CUDA:
623 /// \code
624 /// void __cuda_module_ctor(void*) {
625 ///     Handle = __cudaRegisterFatBinary(GpuBinaryBlob);
626 ///     __cuda_register_globals(Handle);
627 /// }
628 /// \endcode
629 ///
630 /// For HIP:
631 /// \code
632 /// void __hip_module_ctor(void*) {
633 ///     if (__hip_gpubin_handle == 0) {
634 ///         __hip_gpubin_handle  = __hipRegisterFatBinary(GpuBinaryBlob);
635 ///         __hip_register_globals(__hip_gpubin_handle);
636 ///     }
637 /// }
638 /// \endcode
639 llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
640   bool IsHIP = CGM.getLangOpts().HIP;
641   bool IsCUDA = CGM.getLangOpts().CUDA;
642   // No need to generate ctors/dtors if there is no GPU binary.
643   StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName;
644   if (CudaGpuBinaryFileName.empty() && !IsHIP)
645     return nullptr;
646   if ((IsHIP || (IsCUDA && !RelocatableDeviceCode)) && EmittedKernels.empty() &&
647       DeviceVars.empty())
648     return nullptr;
649 
650   // void __{cuda|hip}_register_globals(void* handle);
651   llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
652   // We always need a function to pass in as callback. Create a dummy
653   // implementation if we don't need to register anything.
654   if (RelocatableDeviceCode && !RegisterGlobalsFunc)
655     RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy());
656 
657   // void ** __{cuda|hip}RegisterFatBinary(void *);
658   llvm::FunctionCallee RegisterFatbinFunc = CGM.CreateRuntimeFunction(
659       llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false),
660       addUnderscoredPrefixToName("RegisterFatBinary"));
661   // struct { int magic, int version, void * gpu_binary, void * dont_care };
662   llvm::StructType *FatbinWrapperTy =
663       llvm::StructType::get(IntTy, IntTy, VoidPtrTy, VoidPtrTy);
664 
665   // Register GPU binary with the CUDA runtime, store returned handle in a
666   // global variable and save a reference in GpuBinaryHandle to be cleaned up
667   // in destructor on exit. Then associate all known kernels with the GPU binary
668   // handle so CUDA runtime can figure out what to call on the GPU side.
669   std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary = nullptr;
670   if (!CudaGpuBinaryFileName.empty()) {
671     llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> CudaGpuBinaryOrErr =
672         llvm::MemoryBuffer::getFileOrSTDIN(CudaGpuBinaryFileName);
673     if (std::error_code EC = CudaGpuBinaryOrErr.getError()) {
674       CGM.getDiags().Report(diag::err_cannot_open_file)
675           << CudaGpuBinaryFileName << EC.message();
676       return nullptr;
677     }
678     CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get());
679   }
680 
681   llvm::Function *ModuleCtorFunc = llvm::Function::Create(
682       llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
683       llvm::GlobalValue::InternalLinkage,
684       addUnderscoredPrefixToName("_module_ctor"), &TheModule);
685   llvm::BasicBlock *CtorEntryBB =
686       llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc);
687   CGBuilderTy CtorBuilder(CGM, Context);
688 
689   CtorBuilder.SetInsertPoint(CtorEntryBB);
690 
691   const char *FatbinConstantName;
692   const char *FatbinSectionName;
693   const char *ModuleIDSectionName;
694   StringRef ModuleIDPrefix;
695   llvm::Constant *FatBinStr;
696   unsigned FatMagic;
697   if (IsHIP) {
698     FatbinConstantName = ".hip_fatbin";
699     FatbinSectionName = ".hipFatBinSegment";
700 
701     ModuleIDSectionName = "__hip_module_id";
702     ModuleIDPrefix = "__hip_";
703 
704     if (CudaGpuBinary) {
705       // If fatbin is available from early finalization, create a string
706       // literal containing the fat binary loaded from the given file.
707       const unsigned HIPCodeObjectAlign = 4096;
708       FatBinStr =
709           makeConstantString(std::string(CudaGpuBinary->getBuffer()), "",
710                              FatbinConstantName, HIPCodeObjectAlign);
711     } else {
712       // If fatbin is not available, create an external symbol
713       // __hip_fatbin in section .hip_fatbin. The external symbol is supposed
714       // to contain the fat binary but will be populated somewhere else,
715       // e.g. by lld through link script.
716       FatBinStr = new llvm::GlobalVariable(
717         CGM.getModule(), CGM.Int8Ty,
718         /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr,
719         "__hip_fatbin", nullptr,
720         llvm::GlobalVariable::NotThreadLocal);
721       cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName);
722     }
723 
724     FatMagic = HIPFatMagic;
725   } else {
726     if (RelocatableDeviceCode)
727       FatbinConstantName = CGM.getTriple().isMacOSX()
728                                ? "__NV_CUDA,__nv_relfatbin"
729                                : "__nv_relfatbin";
730     else
731       FatbinConstantName =
732           CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
733     // NVIDIA's cuobjdump looks for fatbins in this section.
734     FatbinSectionName =
735         CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";
736 
737     ModuleIDSectionName = CGM.getTriple().isMacOSX()
738                               ? "__NV_CUDA,__nv_module_id"
739                               : "__nv_module_id";
740     ModuleIDPrefix = "__nv_";
741 
742     // For CUDA, create a string literal containing the fat binary loaded from
743     // the given file.
744     FatBinStr = makeConstantString(std::string(CudaGpuBinary->getBuffer()), "",
745                                    FatbinConstantName, 8);
746     FatMagic = CudaFatMagic;
747   }
748 
749   // Create initialized wrapper structure that points to the loaded GPU binary
750   ConstantInitBuilder Builder(CGM);
751   auto Values = Builder.beginStruct(FatbinWrapperTy);
752   // Fatbin wrapper magic.
753   Values.addInt(IntTy, FatMagic);
754   // Fatbin version.
755   Values.addInt(IntTy, 1);
756   // Data.
757   Values.add(FatBinStr);
758   // Unused in fatbin v1.
759   Values.add(llvm::ConstantPointerNull::get(VoidPtrTy));
760   llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal(
761       addUnderscoredPrefixToName("_fatbin_wrapper"), CGM.getPointerAlign(),
762       /*constant*/ true);
763   FatbinWrapper->setSection(FatbinSectionName);
764 
765   // There is only one HIP fat binary per linked module, however there are
766   // multiple constructor functions. Make sure the fat binary is registered
767   // only once. The constructor functions are executed by the dynamic loader
768   // before the program gains control. The dynamic loader cannot execute the
769   // constructor functions concurrently since doing that would not guarantee
770   // thread safety of the loaded program. Therefore we can assume sequential
771   // execution of constructor functions here.
772   if (IsHIP) {
773     auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage :
774         llvm::GlobalValue::LinkOnceAnyLinkage;
775     llvm::BasicBlock *IfBlock =
776         llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc);
777     llvm::BasicBlock *ExitBlock =
778         llvm::BasicBlock::Create(Context, "exit", ModuleCtorFunc);
779     // The name, size, and initialization pattern of this variable is part
780     // of HIP ABI.
781     GpuBinaryHandle = new llvm::GlobalVariable(
782         TheModule, VoidPtrPtrTy, /*isConstant=*/false,
783         Linkage,
784         /*Initializer=*/llvm::ConstantPointerNull::get(VoidPtrPtrTy),
785         "__hip_gpubin_handle");
786     GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
787     // Prevent the weak symbol in different shared libraries being merged.
788     if (Linkage != llvm::GlobalValue::InternalLinkage)
789       GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility);
790     Address GpuBinaryAddr(
791         GpuBinaryHandle,
792         CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
793     {
794       auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
795       llvm::Constant *Zero =
796           llvm::Constant::getNullValue(HandleValue->getType());
797       llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue, Zero);
798       CtorBuilder.CreateCondBr(EQZero, IfBlock, ExitBlock);
799     }
800     {
801       CtorBuilder.SetInsertPoint(IfBlock);
802       // GpuBinaryHandle = __hipRegisterFatBinary(&FatbinWrapper);
803       llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
804           RegisterFatbinFunc,
805           CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
806       CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryAddr);
807       CtorBuilder.CreateBr(ExitBlock);
808     }
809     {
810       CtorBuilder.SetInsertPoint(ExitBlock);
811       // Call __hip_register_globals(GpuBinaryHandle);
812       if (RegisterGlobalsFunc) {
813         auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
814         CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue);
815       }
816     }
817   } else if (!RelocatableDeviceCode) {
818     // Register binary with CUDA runtime. This is substantially different in
819     // default mode vs. separate compilation!
820     // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
821     llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
822         RegisterFatbinFunc,
823         CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
824     GpuBinaryHandle = new llvm::GlobalVariable(
825         TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage,
826         llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle");
827     GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
828     CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
829                                    CGM.getPointerAlign());
830 
831     // Call __cuda_register_globals(GpuBinaryHandle);
832     if (RegisterGlobalsFunc)
833       CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);
834 
835     // Call __cudaRegisterFatBinaryEnd(Handle) if this CUDA version needs it.
836     if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
837                            CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) {
838       // void __cudaRegisterFatBinaryEnd(void **);
839       llvm::FunctionCallee RegisterFatbinEndFunc = CGM.CreateRuntimeFunction(
840           llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
841           "__cudaRegisterFatBinaryEnd");
842       CtorBuilder.CreateCall(RegisterFatbinEndFunc, RegisterFatbinCall);
843     }
844   } else {
845     // Generate a unique module ID.
846     SmallString<64> ModuleID;
847     llvm::raw_svector_ostream OS(ModuleID);
848     OS << ModuleIDPrefix << llvm::format("%" PRIx64, FatbinWrapper->getGUID());
849     llvm::Constant *ModuleIDConstant = makeConstantString(
850         std::string(ModuleID.str()), "", ModuleIDSectionName, 32);
851 
852     // Create an alias for the FatbinWrapper that nvcc will look for.
853     llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage,
854                               Twine("__fatbinwrap") + ModuleID, FatbinWrapper);
855 
856     // void __cudaRegisterLinkedBinary%ModuleID%(void (*)(void *), void *,
857     // void *, void (*)(void **))
858     SmallString<128> RegisterLinkedBinaryName("__cudaRegisterLinkedBinary");
859     RegisterLinkedBinaryName += ModuleID;
860     llvm::FunctionCallee RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction(
861         getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName);
862 
863     assert(RegisterGlobalsFunc && "Expecting at least dummy function!");
864     llvm::Value *Args[] = {RegisterGlobalsFunc,
865                            CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy),
866                            ModuleIDConstant,
867                            makeDummyFunction(getCallbackFnTy())};
868     CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args);
869   }
870 
871   // Create destructor and register it with atexit() the way NVCC does it. Doing
872   // it during regular destructor phase worked in CUDA before 9.2 but results in
873   // double-free in 9.2.
874   if (llvm::Function *CleanupFn = makeModuleDtorFunction()) {
875     // extern "C" int atexit(void (*f)(void));
876     llvm::FunctionType *AtExitTy =
877         llvm::FunctionType::get(IntTy, CleanupFn->getType(), false);
878     llvm::FunctionCallee AtExitFunc =
879         CGM.CreateRuntimeFunction(AtExitTy, "atexit", llvm::AttributeList(),
880                                   /*Local=*/true);
881     CtorBuilder.CreateCall(AtExitFunc, CleanupFn);
882   }
883 
884   CtorBuilder.CreateRetVoid();
885   return ModuleCtorFunc;
886 }
887 
888 /// Creates a global destructor function that unregisters the GPU code blob
889 /// registered by constructor.
890 ///
891 /// For CUDA:
892 /// \code
893 /// void __cuda_module_dtor(void*) {
894 ///     __cudaUnregisterFatBinary(Handle);
895 /// }
896 /// \endcode
897 ///
898 /// For HIP:
899 /// \code
900 /// void __hip_module_dtor(void*) {
901 ///     if (__hip_gpubin_handle) {
902 ///         __hipUnregisterFatBinary(__hip_gpubin_handle);
903 ///         __hip_gpubin_handle = 0;
904 ///     }
905 /// }
906 /// \endcode
907 llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
908   // No need for destructor if we don't have a handle to unregister.
909   if (!GpuBinaryHandle)
910     return nullptr;
911 
912   // void __cudaUnregisterFatBinary(void ** handle);
913   llvm::FunctionCallee UnregisterFatbinFunc = CGM.CreateRuntimeFunction(
914       llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
915       addUnderscoredPrefixToName("UnregisterFatBinary"));
916 
917   llvm::Function *ModuleDtorFunc = llvm::Function::Create(
918       llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
919       llvm::GlobalValue::InternalLinkage,
920       addUnderscoredPrefixToName("_module_dtor"), &TheModule);
921 
922   llvm::BasicBlock *DtorEntryBB =
923       llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc);
924   CGBuilderTy DtorBuilder(CGM, Context);
925   DtorBuilder.SetInsertPoint(DtorEntryBB);
926 
927   Address GpuBinaryAddr(GpuBinaryHandle, CharUnits::fromQuantity(
928                                              GpuBinaryHandle->getAlignment()));
929   auto HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr);
930   // There is only one HIP fat binary per linked module, however there are
931   // multiple destructor functions. Make sure the fat binary is unregistered
932   // only once.
933   if (CGM.getLangOpts().HIP) {
934     llvm::BasicBlock *IfBlock =
935         llvm::BasicBlock::Create(Context, "if", ModuleDtorFunc);
936     llvm::BasicBlock *ExitBlock =
937         llvm::BasicBlock::Create(Context, "exit", ModuleDtorFunc);
938     llvm::Constant *Zero = llvm::Constant::getNullValue(HandleValue->getType());
939     llvm::Value *NEZero = DtorBuilder.CreateICmpNE(HandleValue, Zero);
940     DtorBuilder.CreateCondBr(NEZero, IfBlock, ExitBlock);
941 
942     DtorBuilder.SetInsertPoint(IfBlock);
943     DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
944     DtorBuilder.CreateStore(Zero, GpuBinaryAddr);
945     DtorBuilder.CreateBr(ExitBlock);
946 
947     DtorBuilder.SetInsertPoint(ExitBlock);
948   } else {
949     DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
950   }
951   DtorBuilder.CreateRetVoid();
952   return ModuleDtorFunc;
953 }
954 
/// Factory entry point: creates the CGCUDARuntime implementation that targets
/// the NVIDIA CUDA (and HIP) runtime libraries. Caller takes ownership.
CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) {
  return new CGNVCUDARuntime(CGM);
}
958 
959 void CGNVCUDARuntime::internalizeDeviceSideVar(
960     const VarDecl *D, llvm::GlobalValue::LinkageTypes &Linkage) {
961   // For -fno-gpu-rdc, host-side shadows of external declarations of device-side
962   // global variables become internal definitions. These have to be internal in
963   // order to prevent name conflicts with global host variables with the same
964   // name in a different TUs.
965   //
966   // For -fgpu-rdc, the shadow variables should not be internalized because
967   // they may be accessed by different TU.
968   if (CGM.getLangOpts().GPURelocatableDeviceCode)
969     return;
970 
971   // __shared__ variables are odd. Shadows do get created, but
972   // they are not registered with the CUDA runtime, so they
973   // can't really be used to access their device-side
974   // counterparts. It's not clear yet whether it's nvcc's bug or
975   // a feature, but we've got to do the same for compatibility.
976   if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
977       D->hasAttr<CUDASharedAttr>() ||
978       D->getType()->isCUDADeviceBuiltinSurfaceType() ||
979       D->getType()->isCUDADeviceBuiltinTextureType()) {
980     Linkage = llvm::GlobalValue::InternalLinkage;
981   }
982 }
983 
984 void CGNVCUDARuntime::handleVarRegistration(const VarDecl *D,
985                                             llvm::GlobalVariable &GV) {
986   if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>()) {
987     // Shadow variables and their properties must be registered with CUDA
988     // runtime. Skip Extern global variables, which will be registered in
989     // the TU where they are defined.
990     //
991     // Don't register a C++17 inline variable. The local symbol can be
992     // discarded and referencing a discarded local symbol from outside the
993     // comdat (__cuda_register_globals) is disallowed by the ELF spec.
994     // TODO: Reject __device__ constexpr and __device__ inline in Sema.
995     // HIP managed variables need to be always recorded in device and host
996     // compilations for transformation.
997     if ((!D->hasExternalStorage() && !D->isInline()) ||
998         D->hasAttr<HIPManagedAttr>()) {
999       registerDeviceVar(D, GV, !D->hasDefinition(),
1000                         D->hasAttr<CUDAConstantAttr>());
1001     }
1002   } else if (D->getType()->isCUDADeviceBuiltinSurfaceType() ||
1003              D->getType()->isCUDADeviceBuiltinTextureType()) {
1004     // Builtin surfaces and textures and their template arguments are
1005     // also registered with CUDA runtime.
1006     const ClassTemplateSpecializationDecl *TD =
1007         cast<ClassTemplateSpecializationDecl>(
1008             D->getType()->getAs<RecordType>()->getDecl());
1009     const TemplateArgumentList &Args = TD->getTemplateArgs();
1010     if (TD->hasAttr<CUDADeviceBuiltinSurfaceTypeAttr>()) {
1011       assert(Args.size() == 2 &&
1012              "Unexpected number of template arguments of CUDA device "
1013              "builtin surface type.");
1014       auto SurfType = Args[1].getAsIntegral();
1015       if (!D->hasExternalStorage())
1016         registerDeviceSurf(D, GV, !D->hasDefinition(), SurfType.getSExtValue());
1017     } else {
1018       assert(Args.size() == 3 &&
1019              "Unexpected number of template arguments of CUDA device "
1020              "builtin texture type.");
1021       auto TexType = Args[1].getAsIntegral();
1022       auto Normalized = Args[2].getAsIntegral();
1023       if (!D->hasExternalStorage())
1024         registerDeviceTex(D, GV, !D->hasDefinition(), TexType.getSExtValue(),
1025                           Normalized.getZExtValue());
1026     }
1027   }
1028 }
1029 
1030 // Transform managed variables to pointers to managed variables in device code.
1031 // Each use of the original managed variable is replaced by a load from the
1032 // transformed managed variable. The transformed managed variable contains
1033 // the address of managed memory which will be allocated by the runtime.
1034 void CGNVCUDARuntime::transformManagedVars() {
1035   for (auto &&Info : DeviceVars) {
1036     llvm::GlobalVariable *Var = Info.Var;
1037     if (Info.Flags.getKind() == DeviceVarFlags::Variable &&
1038         Info.Flags.isManaged()) {
1039       auto ManagedVar = new llvm::GlobalVariable(
1040           CGM.getModule(), Var->getType(),
1041           /*isConstant=*/false, Var->getLinkage(),
1042           /*Init=*/Var->isDeclaration()
1043               ? nullptr
1044               : llvm::ConstantPointerNull::get(Var->getType()),
1045           /*Name=*/"", /*InsertBefore=*/nullptr,
1046           llvm::GlobalVariable::NotThreadLocal,
1047           CGM.getContext().getTargetAddressSpace(LangAS::cuda_device));
1048       ManagedVar->setDSOLocal(Var->isDSOLocal());
1049       ManagedVar->setVisibility(Var->getVisibility());
1050       ManagedVar->setExternallyInitialized(true);
1051       replaceManagedVar(Var, ManagedVar);
1052       ManagedVar->takeName(Var);
1053       Var->setName(Twine(ManagedVar->getName()) + ".managed");
1054       // Keep managed variables even if they are not used in device code since
1055       // they need to be allocated by the runtime.
1056       if (!Var->isDeclaration()) {
1057         assert(!ManagedVar->isDeclaration());
1058         CGM.addCompilerUsedGlobal(Var);
1059         CGM.addCompilerUsedGlobal(ManagedVar);
1060       }
1061     }
1062   }
1063 }
1064 
1065 // Returns module constructor to be added.
1066 llvm::Function *CGNVCUDARuntime::finalizeModule() {
1067   if (CGM.getLangOpts().CUDAIsDevice) {
1068     transformManagedVars();
1069     return nullptr;
1070   }
1071   return makeModuleCtorFunction();
1072 }
1073