1 //===----- CGCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This provides a class for CUDA code generation targeting the NVIDIA CUDA
10 // runtime library.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "CGCUDARuntime.h"
15 #include "CodeGenFunction.h"
16 #include "CodeGenModule.h"
17 #include "clang/AST/Decl.h"
18 #include "clang/Basic/Cuda.h"
19 #include "clang/CodeGen/CodeGenABITypes.h"
20 #include "clang/CodeGen/ConstantInitBuilder.h"
21 #include "llvm/IR/BasicBlock.h"
22 #include "llvm/IR/Constants.h"
23 #include "llvm/IR/DerivedTypes.h"
24 #include "llvm/IR/ReplaceConstant.h"
25 #include "llvm/Support/Format.h"
26 
27 using namespace clang;
28 using namespace CodeGen;
29 
30 namespace {
31 constexpr unsigned CudaFatMagic = 0x466243b1;
32 constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF"
33 
34 class CGNVCUDARuntime : public CGCUDARuntime {
35 
36 private:
37   llvm::IntegerType *IntTy, *SizeTy;
38   llvm::Type *VoidTy;
39   llvm::PointerType *CharPtrTy, *VoidPtrTy, *VoidPtrPtrTy;
40 
41   /// Convenience reference to LLVM Context
42   llvm::LLVMContext &Context;
43   /// Convenience reference to the current module
44   llvm::Module &TheModule;
45   /// Keeps track of kernel launch stubs emitted in this module
46   struct KernelInfo {
47     llvm::Function *Kernel;
48     const Decl *D;
49   };
50   llvm::SmallVector<KernelInfo, 16> EmittedKernels;
51   struct VarInfo {
52     llvm::GlobalVariable *Var;
53     const VarDecl *D;
54     DeviceVarFlags Flags;
55   };
56   llvm::SmallVector<VarInfo, 16> DeviceVars;
57   /// Keeps track of variable containing handle of GPU binary. Populated by
58   /// ModuleCtorFunction() and used to create corresponding cleanup calls in
59   /// ModuleDtorFunction()
60   llvm::GlobalVariable *GpuBinaryHandle = nullptr;
61   /// Whether we generate relocatable device code.
62   bool RelocatableDeviceCode;
63   /// Mangle context for device.
64   std::unique_ptr<MangleContext> DeviceMC;
65 
66   llvm::FunctionCallee getSetupArgumentFn() const;
67   llvm::FunctionCallee getLaunchFn() const;
68 
69   llvm::FunctionType *getRegisterGlobalsFnTy() const;
70   llvm::FunctionType *getCallbackFnTy() const;
71   llvm::FunctionType *getRegisterLinkedBinaryFnTy() const;
72   std::string addPrefixToName(StringRef FuncName) const;
73   std::string addUnderscoredPrefixToName(StringRef FuncName) const;
74 
75   /// Creates a function to register all kernel stubs generated in this module.
76   llvm::Function *makeRegisterGlobalsFn();
77 
78   /// Helper function that generates a constant string and returns a pointer to
79   /// the start of the string.  The result of this function can be used anywhere
80   /// where the C code specifies const char*.
81   llvm::Constant *makeConstantString(const std::string &Str,
82                                      const std::string &Name = "",
83                                      const std::string &SectionName = "",
84                                      unsigned Alignment = 0) {
85     llvm::Constant *Zeros[] = {llvm::ConstantInt::get(SizeTy, 0),
86                                llvm::ConstantInt::get(SizeTy, 0)};
87     auto ConstStr = CGM.GetAddrOfConstantCString(Str, Name.c_str());
88     llvm::GlobalVariable *GV =
89         cast<llvm::GlobalVariable>(ConstStr.getPointer());
90     if (!SectionName.empty()) {
91       GV->setSection(SectionName);
92       // Mark the address as used which make sure that this section isn't
93       // merged and we will really have it in the object file.
94       GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None);
95     }
96     if (Alignment)
97       GV->setAlignment(llvm::Align(Alignment));
98 
99     return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(),
100                                                 ConstStr.getPointer(), Zeros);
101   }
102 
103   /// Helper function that generates an empty dummy function returning void.
104   llvm::Function *makeDummyFunction(llvm::FunctionType *FnTy) {
105     assert(FnTy->getReturnType()->isVoidTy() &&
106            "Can only generate dummy functions returning void!");
107     llvm::Function *DummyFunc = llvm::Function::Create(
108         FnTy, llvm::GlobalValue::InternalLinkage, "dummy", &TheModule);
109 
110     llvm::BasicBlock *DummyBlock =
111         llvm::BasicBlock::Create(Context, "", DummyFunc);
112     CGBuilderTy FuncBuilder(CGM, Context);
113     FuncBuilder.SetInsertPoint(DummyBlock);
114     FuncBuilder.CreateRetVoid();
115 
116     return DummyFunc;
117   }
118 
119   void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args);
120   void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args);
121   std::string getDeviceSideName(const NamedDecl *ND) override;
122 
123   void registerDeviceVar(const VarDecl *VD, llvm::GlobalVariable &Var,
124                          bool Extern, bool Constant) {
125     DeviceVars.push_back({&Var,
126                           VD,
127                           {DeviceVarFlags::Variable, Extern, Constant,
128                            VD->hasAttr<HIPManagedAttr>(),
129                            /*Normalized*/ false, 0}});
130   }
131   void registerDeviceSurf(const VarDecl *VD, llvm::GlobalVariable &Var,
132                           bool Extern, int Type) {
133     DeviceVars.push_back({&Var,
134                           VD,
135                           {DeviceVarFlags::Surface, Extern, /*Constant*/ false,
136                            /*Managed*/ false,
137                            /*Normalized*/ false, Type}});
138   }
139   void registerDeviceTex(const VarDecl *VD, llvm::GlobalVariable &Var,
140                          bool Extern, int Type, bool Normalized) {
141     DeviceVars.push_back({&Var,
142                           VD,
143                           {DeviceVarFlags::Texture, Extern, /*Constant*/ false,
144                            /*Managed*/ false, Normalized, Type}});
145   }
146 
147   /// Creates module constructor function
148   llvm::Function *makeModuleCtorFunction();
149   /// Creates module destructor function
150   llvm::Function *makeModuleDtorFunction();
151   /// Transform managed variables for device compilation.
152   void transformManagedVars();
153 
154 public:
155   CGNVCUDARuntime(CodeGenModule &CGM);
156 
157   void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override;
158   void handleVarRegistration(const VarDecl *VD,
159                              llvm::GlobalVariable &Var) override;
160   void
161   internalizeDeviceSideVar(const VarDecl *D,
162                            llvm::GlobalValue::LinkageTypes &Linkage) override;
163 
164   llvm::Function *finalizeModule() override;
165 };
166 
167 }
168 
169 std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const {
170   if (CGM.getLangOpts().HIP)
171     return ((Twine("hip") + Twine(FuncName)).str());
172   return ((Twine("cuda") + Twine(FuncName)).str());
173 }
174 std::string
175 CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const {
176   if (CGM.getLangOpts().HIP)
177     return ((Twine("__hip") + Twine(FuncName)).str());
178   return ((Twine("__cuda") + Twine(FuncName)).str());
179 }
180 
181 CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
182     : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()),
183       TheModule(CGM.getModule()),
184       RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode),
185       DeviceMC(CGM.getContext().createMangleContext(
186           CGM.getContext().getAuxTargetInfo())) {
187   CodeGen::CodeGenTypes &Types = CGM.getTypes();
188   ASTContext &Ctx = CGM.getContext();
189 
190   IntTy = CGM.IntTy;
191   SizeTy = CGM.SizeTy;
192   VoidTy = CGM.VoidTy;
193 
194   CharPtrTy = llvm::PointerType::getUnqual(Types.ConvertType(Ctx.CharTy));
195   VoidPtrTy = cast<llvm::PointerType>(Types.ConvertType(Ctx.VoidPtrTy));
196   VoidPtrPtrTy = VoidPtrTy->getPointerTo();
197   if (CGM.getContext().getAuxTargetInfo()) {
198     // If the host and device have different C++ ABIs, mark it as the device
199     // mangle context so that the mangling needs to retrieve the additonal
200     // device lambda mangling number instead of the regular host one.
201     DeviceMC->setDeviceMangleContext(
202         CGM.getContext().getTargetInfo().getCXXABI().isMicrosoft() &&
203         CGM.getContext().getAuxTargetInfo()->getCXXABI().isItaniumFamily());
204   }
205 }
206 
207 llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const {
208   // cudaError_t cudaSetupArgument(void *, size_t, size_t)
209   llvm::Type *Params[] = {VoidPtrTy, SizeTy, SizeTy};
210   return CGM.CreateRuntimeFunction(
211       llvm::FunctionType::get(IntTy, Params, false),
212       addPrefixToName("SetupArgument"));
213 }
214 
215 llvm::FunctionCallee CGNVCUDARuntime::getLaunchFn() const {
216   if (CGM.getLangOpts().HIP) {
217     // hipError_t hipLaunchByPtr(char *);
218     return CGM.CreateRuntimeFunction(
219         llvm::FunctionType::get(IntTy, CharPtrTy, false), "hipLaunchByPtr");
220   } else {
221     // cudaError_t cudaLaunch(char *);
222     return CGM.CreateRuntimeFunction(
223         llvm::FunctionType::get(IntTy, CharPtrTy, false), "cudaLaunch");
224   }
225 }
226 
227 llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy() const {
228   return llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false);
229 }
230 
231 llvm::FunctionType *CGNVCUDARuntime::getCallbackFnTy() const {
232   return llvm::FunctionType::get(VoidTy, VoidPtrTy, false);
233 }
234 
235 llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy() const {
236   auto CallbackFnTy = getCallbackFnTy();
237   auto RegisterGlobalsFnTy = getRegisterGlobalsFnTy();
238   llvm::Type *Params[] = {RegisterGlobalsFnTy->getPointerTo(), VoidPtrTy,
239                           VoidPtrTy, CallbackFnTy->getPointerTo()};
240   return llvm::FunctionType::get(VoidTy, Params, false);
241 }
242 
243 std::string CGNVCUDARuntime::getDeviceSideName(const NamedDecl *ND) {
244   GlobalDecl GD;
245   // D could be either a kernel or a variable.
246   if (auto *FD = dyn_cast<FunctionDecl>(ND))
247     GD = GlobalDecl(FD, KernelReferenceKind::Kernel);
248   else
249     GD = GlobalDecl(ND);
250   std::string DeviceSideName;
251   if (DeviceMC->shouldMangleDeclName(ND)) {
252     SmallString<256> Buffer;
253     llvm::raw_svector_ostream Out(Buffer);
254     DeviceMC->mangleName(GD, Out);
255     DeviceSideName = std::string(Out.str());
256   } else
257     DeviceSideName = std::string(ND->getIdentifier()->getName());
258   return DeviceSideName;
259 }
260 
261 void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
262                                      FunctionArgList &Args) {
263   EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl});
264   if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
265                          CudaFeature::CUDA_USES_NEW_LAUNCH) ||
266       (CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI))
267     emitDeviceStubBodyNew(CGF, Args);
268   else
269     emitDeviceStubBodyLegacy(CGF, Args);
270 }
271 
272 // CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local
273 // array and kernels are launched using cudaLaunchKernel().
274 void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
275                                             FunctionArgList &Args) {
276   // Build the shadow stack entry at the very start of the function.
277 
278   // Calculate amount of space we will need for all arguments.  If we have no
279   // args, allocate a single pointer so we still have a valid pointer to the
280   // argument array that we can pass to runtime, even if it will be unused.
281   Address KernelArgs = CGF.CreateTempAlloca(
282       VoidPtrTy, CharUnits::fromQuantity(16), "kernel_args",
283       llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size())));
284   // Store pointers to the arguments in a locally allocated launch_args.
285   for (unsigned i = 0; i < Args.size(); ++i) {
286     llvm::Value* VarPtr = CGF.GetAddrOfLocalVar(Args[i]).getPointer();
287     llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(VarPtr, VoidPtrTy);
288     CGF.Builder.CreateDefaultAlignedStore(
289         VoidVarPtr, CGF.Builder.CreateConstGEP1_32(KernelArgs.getPointer(), i));
290   }
291 
292   llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
293 
294   // Lookup cudaLaunchKernel/hipLaunchKernel function.
295   // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
296   //                              void **args, size_t sharedMem,
297   //                              cudaStream_t stream);
298   // hipError_t hipLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
299   //                            void **args, size_t sharedMem,
300   //                            hipStream_t stream);
301   TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl();
302   DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl);
303   auto LaunchKernelName = addPrefixToName("LaunchKernel");
304   IdentifierInfo &cudaLaunchKernelII =
305       CGM.getContext().Idents.get(LaunchKernelName);
306   FunctionDecl *cudaLaunchKernelFD = nullptr;
307   for (const auto &Result : DC->lookup(&cudaLaunchKernelII)) {
308     if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Result))
309       cudaLaunchKernelFD = FD;
310   }
311 
312   if (cudaLaunchKernelFD == nullptr) {
313     CGM.Error(CGF.CurFuncDecl->getLocation(),
314               "Can't find declaration for " + LaunchKernelName);
315     return;
316   }
317   // Create temporary dim3 grid_dim, block_dim.
318   ParmVarDecl *GridDimParam = cudaLaunchKernelFD->getParamDecl(1);
319   QualType Dim3Ty = GridDimParam->getType();
320   Address GridDim =
321       CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "grid_dim");
322   Address BlockDim =
323       CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim");
324   Address ShmemSize =
325       CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size");
326   Address Stream =
327       CGF.CreateTempAlloca(VoidPtrTy, CGM.getPointerAlign(), "stream");
328   llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction(
329       llvm::FunctionType::get(IntTy,
330                               {/*gridDim=*/GridDim.getType(),
331                                /*blockDim=*/BlockDim.getType(),
332                                /*ShmemSize=*/ShmemSize.getType(),
333                                /*Stream=*/Stream.getType()},
334                               /*isVarArg=*/false),
335       addUnderscoredPrefixToName("PopCallConfiguration"));
336 
337   CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn,
338                               {GridDim.getPointer(), BlockDim.getPointer(),
339                                ShmemSize.getPointer(), Stream.getPointer()});
340 
341   // Emit the call to cudaLaunch
342   llvm::Value *Kernel = CGF.Builder.CreatePointerCast(CGF.CurFn, VoidPtrTy);
343   CallArgList LaunchKernelArgs;
344   LaunchKernelArgs.add(RValue::get(Kernel),
345                        cudaLaunchKernelFD->getParamDecl(0)->getType());
346   LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty);
347   LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty);
348   LaunchKernelArgs.add(RValue::get(KernelArgs.getPointer()),
349                        cudaLaunchKernelFD->getParamDecl(3)->getType());
350   LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(ShmemSize)),
351                        cudaLaunchKernelFD->getParamDecl(4)->getType());
352   LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(Stream)),
353                        cudaLaunchKernelFD->getParamDecl(5)->getType());
354 
355   QualType QT = cudaLaunchKernelFD->getType();
356   QualType CQT = QT.getCanonicalType();
357   llvm::Type *Ty = CGM.getTypes().ConvertType(CQT);
358   llvm::FunctionType *FTy = dyn_cast<llvm::FunctionType>(Ty);
359 
360   const CGFunctionInfo &FI =
361       CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD);
362   llvm::FunctionCallee cudaLaunchKernelFn =
363       CGM.CreateRuntimeFunction(FTy, LaunchKernelName);
364   CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(),
365                LaunchKernelArgs);
366   CGF.EmitBranch(EndBlock);
367 
368   CGF.EmitBlock(EndBlock);
369 }
370 
371 void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF,
372                                                FunctionArgList &Args) {
373   // Emit a call to cudaSetupArgument for each arg in Args.
374   llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn();
375   llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
376   CharUnits Offset = CharUnits::Zero();
377   for (const VarDecl *A : Args) {
378     auto TInfo = CGM.getContext().getTypeInfoInChars(A->getType());
379     Offset = Offset.alignTo(TInfo.Align);
380     llvm::Value *Args[] = {
381         CGF.Builder.CreatePointerCast(CGF.GetAddrOfLocalVar(A).getPointer(),
382                                       VoidPtrTy),
383         llvm::ConstantInt::get(SizeTy, TInfo.Width.getQuantity()),
384         llvm::ConstantInt::get(SizeTy, Offset.getQuantity()),
385     };
386     llvm::CallBase *CB = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args);
387     llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0);
388     llvm::Value *CBZero = CGF.Builder.CreateICmpEQ(CB, Zero);
389     llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next");
390     CGF.Builder.CreateCondBr(CBZero, NextBlock, EndBlock);
391     CGF.EmitBlock(NextBlock);
392     Offset += TInfo.Width;
393   }
394 
395   // Emit the call to cudaLaunch
396   llvm::FunctionCallee cudaLaunchFn = getLaunchFn();
397   llvm::Value *Arg = CGF.Builder.CreatePointerCast(CGF.CurFn, CharPtrTy);
398   CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg);
399   CGF.EmitBranch(EndBlock);
400 
401   CGF.EmitBlock(EndBlock);
402 }
403 
404 // Replace the original variable Var with the address loaded from variable
405 // ManagedVar populated by HIP runtime.
406 static void replaceManagedVar(llvm::GlobalVariable *Var,
407                               llvm::GlobalVariable *ManagedVar) {
408   SmallVector<SmallVector<llvm::User *, 8>, 8> WorkList;
409   for (auto &&VarUse : Var->uses()) {
410     WorkList.push_back({VarUse.getUser()});
411   }
412   while (!WorkList.empty()) {
413     auto &&WorkItem = WorkList.pop_back_val();
414     auto *U = WorkItem.back();
415     if (isa<llvm::ConstantExpr>(U)) {
416       for (auto &&UU : U->uses()) {
417         WorkItem.push_back(UU.getUser());
418         WorkList.push_back(WorkItem);
419         WorkItem.pop_back();
420       }
421       continue;
422     }
423     if (auto *I = dyn_cast<llvm::Instruction>(U)) {
424       llvm::Value *OldV = Var;
425       llvm::Instruction *NewV =
426           new llvm::LoadInst(Var->getType(), ManagedVar, "ld.managed", false,
427                              llvm::Align(Var->getAlignment()), I);
428       WorkItem.pop_back();
429       // Replace constant expressions directly or indirectly using the managed
430       // variable with instructions.
431       for (auto &&Op : WorkItem) {
432         auto *CE = cast<llvm::ConstantExpr>(Op);
433         auto *NewInst = llvm::createReplacementInstr(CE, I);
434         NewInst->replaceUsesOfWith(OldV, NewV);
435         OldV = CE;
436         NewV = NewInst;
437       }
438       I->replaceUsesOfWith(OldV, NewV);
439     } else {
440       llvm_unreachable("Invalid use of managed variable");
441     }
442   }
443 }
444 
445 /// Creates a function that sets up state on the host side for CUDA objects that
446 /// have a presence on both the host and device sides. Specifically, registers
447 /// the host side of kernel functions and device global variables with the CUDA
448 /// runtime.
449 /// \code
450 /// void __cuda_register_globals(void** GpuBinaryHandle) {
451 ///    __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...);
452 ///    ...
453 ///    __cudaRegisterFunction(GpuBinaryHandle,KernelM,...);
454 ///    __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...);
455 ///    ...
456 ///    __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...);
457 /// }
458 /// \endcode
459 llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
460   // No need to register anything
461   if (EmittedKernels.empty() && DeviceVars.empty())
462     return nullptr;
463 
464   llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
465       getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage,
466       addUnderscoredPrefixToName("_register_globals"), &TheModule);
467   llvm::BasicBlock *EntryBB =
468       llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc);
469   CGBuilderTy Builder(CGM, Context);
470   Builder.SetInsertPoint(EntryBB);
471 
472   // void __cudaRegisterFunction(void **, const char *, char *, const char *,
473   //                             int, uint3*, uint3*, dim3*, dim3*, int*)
474   llvm::Type *RegisterFuncParams[] = {
475       VoidPtrPtrTy, CharPtrTy, CharPtrTy, CharPtrTy, IntTy,
476       VoidPtrTy,    VoidPtrTy, VoidPtrTy, VoidPtrTy, IntTy->getPointerTo()};
477   llvm::FunctionCallee RegisterFunc = CGM.CreateRuntimeFunction(
478       llvm::FunctionType::get(IntTy, RegisterFuncParams, false),
479       addUnderscoredPrefixToName("RegisterFunction"));
480 
481   // Extract GpuBinaryHandle passed as the first argument passed to
482   // __cuda_register_globals() and generate __cudaRegisterFunction() call for
483   // each emitted kernel.
484   llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin();
485   for (auto &&I : EmittedKernels) {
486     llvm::Constant *KernelName =
487         makeConstantString(getDeviceSideName(cast<NamedDecl>(I.D)));
488     llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy);
489     llvm::Value *Args[] = {
490         &GpuBinaryHandlePtr,
491         Builder.CreateBitCast(I.Kernel, VoidPtrTy),
492         KernelName,
493         KernelName,
494         llvm::ConstantInt::get(IntTy, -1),
495         NullPtr,
496         NullPtr,
497         NullPtr,
498         NullPtr,
499         llvm::ConstantPointerNull::get(IntTy->getPointerTo())};
500     Builder.CreateCall(RegisterFunc, Args);
501   }
502 
503   llvm::Type *VarSizeTy = IntTy;
504   // For HIP or CUDA 9.0+, device variable size is type of `size_t`.
505   if (CGM.getLangOpts().HIP ||
506       ToCudaVersion(CGM.getTarget().getSDKVersion()) >= CudaVersion::CUDA_90)
507     VarSizeTy = SizeTy;
508 
509   // void __cudaRegisterVar(void **, char *, char *, const char *,
510   //                        int, int, int, int)
511   llvm::Type *RegisterVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy,
512                                      CharPtrTy,    IntTy,     VarSizeTy,
513                                      IntTy,        IntTy};
514   llvm::FunctionCallee RegisterVar = CGM.CreateRuntimeFunction(
515       llvm::FunctionType::get(VoidTy, RegisterVarParams, false),
516       addUnderscoredPrefixToName("RegisterVar"));
517   // void __hipRegisterManagedVar(void **, char *, char *, const char *,
518   //                              size_t, unsigned)
519   llvm::Type *RegisterManagedVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy,
520                                             CharPtrTy,    VarSizeTy, IntTy};
521   llvm::FunctionCallee RegisterManagedVar = CGM.CreateRuntimeFunction(
522       llvm::FunctionType::get(VoidTy, RegisterManagedVarParams, false),
523       addUnderscoredPrefixToName("RegisterManagedVar"));
524   // void __cudaRegisterSurface(void **, const struct surfaceReference *,
525   //                            const void **, const char *, int, int);
526   llvm::FunctionCallee RegisterSurf = CGM.CreateRuntimeFunction(
527       llvm::FunctionType::get(
528           VoidTy, {VoidPtrPtrTy, VoidPtrTy, CharPtrTy, CharPtrTy, IntTy, IntTy},
529           false),
530       addUnderscoredPrefixToName("RegisterSurface"));
531   // void __cudaRegisterTexture(void **, const struct textureReference *,
532   //                            const void **, const char *, int, int, int)
533   llvm::FunctionCallee RegisterTex = CGM.CreateRuntimeFunction(
534       llvm::FunctionType::get(
535           VoidTy,
536           {VoidPtrPtrTy, VoidPtrTy, CharPtrTy, CharPtrTy, IntTy, IntTy, IntTy},
537           false),
538       addUnderscoredPrefixToName("RegisterTexture"));
539   for (auto &&Info : DeviceVars) {
540     llvm::GlobalVariable *Var = Info.Var;
541     assert((!Var->isDeclaration() || Info.Flags.isManaged()) &&
542            "External variables should not show up here, except HIP managed "
543            "variables");
544     llvm::Constant *VarName = makeConstantString(getDeviceSideName(Info.D));
545     switch (Info.Flags.getKind()) {
546     case DeviceVarFlags::Variable: {
547       uint64_t VarSize =
548           CGM.getDataLayout().getTypeAllocSize(Var->getValueType());
549       if (Info.Flags.isManaged()) {
550         auto ManagedVar = new llvm::GlobalVariable(
551             CGM.getModule(), Var->getType(),
552             /*isConstant=*/false, Var->getLinkage(),
553             /*Init=*/Var->isDeclaration()
554                 ? nullptr
555                 : llvm::ConstantPointerNull::get(Var->getType()),
556             /*Name=*/"", /*InsertBefore=*/nullptr,
557             llvm::GlobalVariable::NotThreadLocal);
558         ManagedVar->setDSOLocal(Var->isDSOLocal());
559         ManagedVar->setVisibility(Var->getVisibility());
560         ManagedVar->setExternallyInitialized(true);
561         ManagedVar->takeName(Var);
562         Var->setName(Twine(ManagedVar->getName() + ".managed"));
563         replaceManagedVar(Var, ManagedVar);
564         llvm::Value *Args[] = {
565             &GpuBinaryHandlePtr,
566             Builder.CreateBitCast(ManagedVar, VoidPtrTy),
567             Builder.CreateBitCast(Var, VoidPtrTy),
568             VarName,
569             llvm::ConstantInt::get(VarSizeTy, VarSize),
570             llvm::ConstantInt::get(IntTy, Var->getAlignment())};
571         if (!Var->isDeclaration())
572           Builder.CreateCall(RegisterManagedVar, Args);
573       } else {
574         llvm::Value *Args[] = {
575             &GpuBinaryHandlePtr,
576             Builder.CreateBitCast(Var, VoidPtrTy),
577             VarName,
578             VarName,
579             llvm::ConstantInt::get(IntTy, Info.Flags.isExtern()),
580             llvm::ConstantInt::get(VarSizeTy, VarSize),
581             llvm::ConstantInt::get(IntTy, Info.Flags.isConstant()),
582             llvm::ConstantInt::get(IntTy, 0)};
583         Builder.CreateCall(RegisterVar, Args);
584       }
585       break;
586     }
587     case DeviceVarFlags::Surface:
588       Builder.CreateCall(
589           RegisterSurf,
590           {&GpuBinaryHandlePtr, Builder.CreateBitCast(Var, VoidPtrTy), VarName,
591            VarName, llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
592            llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
593       break;
594     case DeviceVarFlags::Texture:
595       Builder.CreateCall(
596           RegisterTex,
597           {&GpuBinaryHandlePtr, Builder.CreateBitCast(Var, VoidPtrTy), VarName,
598            VarName, llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
599            llvm::ConstantInt::get(IntTy, Info.Flags.isNormalized()),
600            llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
601       break;
602     }
603   }
604 
605   Builder.CreateRetVoid();
606   return RegisterKernelsFunc;
607 }
608 
609 /// Creates a global constructor function for the module:
610 ///
611 /// For CUDA:
612 /// \code
613 /// void __cuda_module_ctor(void*) {
614 ///     Handle = __cudaRegisterFatBinary(GpuBinaryBlob);
615 ///     __cuda_register_globals(Handle);
616 /// }
617 /// \endcode
618 ///
619 /// For HIP:
620 /// \code
621 /// void __hip_module_ctor(void*) {
622 ///     if (__hip_gpubin_handle == 0) {
623 ///         __hip_gpubin_handle  = __hipRegisterFatBinary(GpuBinaryBlob);
624 ///         __hip_register_globals(__hip_gpubin_handle);
625 ///     }
626 /// }
627 /// \endcode
628 llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
629   bool IsHIP = CGM.getLangOpts().HIP;
630   bool IsCUDA = CGM.getLangOpts().CUDA;
631   // No need to generate ctors/dtors if there is no GPU binary.
632   StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName;
633   if (CudaGpuBinaryFileName.empty() && !IsHIP)
634     return nullptr;
635   if ((IsHIP || (IsCUDA && !RelocatableDeviceCode)) && EmittedKernels.empty() &&
636       DeviceVars.empty())
637     return nullptr;
638 
639   // void __{cuda|hip}_register_globals(void* handle);
640   llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
641   // We always need a function to pass in as callback. Create a dummy
642   // implementation if we don't need to register anything.
643   if (RelocatableDeviceCode && !RegisterGlobalsFunc)
644     RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy());
645 
646   // void ** __{cuda|hip}RegisterFatBinary(void *);
647   llvm::FunctionCallee RegisterFatbinFunc = CGM.CreateRuntimeFunction(
648       llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false),
649       addUnderscoredPrefixToName("RegisterFatBinary"));
650   // struct { int magic, int version, void * gpu_binary, void * dont_care };
651   llvm::StructType *FatbinWrapperTy =
652       llvm::StructType::get(IntTy, IntTy, VoidPtrTy, VoidPtrTy);
653 
654   // Register GPU binary with the CUDA runtime, store returned handle in a
655   // global variable and save a reference in GpuBinaryHandle to be cleaned up
656   // in destructor on exit. Then associate all known kernels with the GPU binary
657   // handle so CUDA runtime can figure out what to call on the GPU side.
658   std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary = nullptr;
659   if (!CudaGpuBinaryFileName.empty()) {
660     llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> CudaGpuBinaryOrErr =
661         llvm::MemoryBuffer::getFileOrSTDIN(CudaGpuBinaryFileName);
662     if (std::error_code EC = CudaGpuBinaryOrErr.getError()) {
663       CGM.getDiags().Report(diag::err_cannot_open_file)
664           << CudaGpuBinaryFileName << EC.message();
665       return nullptr;
666     }
667     CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get());
668   }
669 
670   llvm::Function *ModuleCtorFunc = llvm::Function::Create(
671       llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
672       llvm::GlobalValue::InternalLinkage,
673       addUnderscoredPrefixToName("_module_ctor"), &TheModule);
674   llvm::BasicBlock *CtorEntryBB =
675       llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc);
676   CGBuilderTy CtorBuilder(CGM, Context);
677 
678   CtorBuilder.SetInsertPoint(CtorEntryBB);
679 
680   const char *FatbinConstantName;
681   const char *FatbinSectionName;
682   const char *ModuleIDSectionName;
683   StringRef ModuleIDPrefix;
684   llvm::Constant *FatBinStr;
685   unsigned FatMagic;
686   if (IsHIP) {
687     FatbinConstantName = ".hip_fatbin";
688     FatbinSectionName = ".hipFatBinSegment";
689 
690     ModuleIDSectionName = "__hip_module_id";
691     ModuleIDPrefix = "__hip_";
692 
693     if (CudaGpuBinary) {
694       // If fatbin is available from early finalization, create a string
695       // literal containing the fat binary loaded from the given file.
696       const unsigned HIPCodeObjectAlign = 4096;
697       FatBinStr =
698           makeConstantString(std::string(CudaGpuBinary->getBuffer()), "",
699                              FatbinConstantName, HIPCodeObjectAlign);
700     } else {
701       // If fatbin is not available, create an external symbol
702       // __hip_fatbin in section .hip_fatbin. The external symbol is supposed
703       // to contain the fat binary but will be populated somewhere else,
704       // e.g. by lld through link script.
705       FatBinStr = new llvm::GlobalVariable(
706         CGM.getModule(), CGM.Int8Ty,
707         /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr,
708         "__hip_fatbin", nullptr,
709         llvm::GlobalVariable::NotThreadLocal);
710       cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName);
711     }
712 
713     FatMagic = HIPFatMagic;
714   } else {
715     if (RelocatableDeviceCode)
716       FatbinConstantName = CGM.getTriple().isMacOSX()
717                                ? "__NV_CUDA,__nv_relfatbin"
718                                : "__nv_relfatbin";
719     else
720       FatbinConstantName =
721           CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
722     // NVIDIA's cuobjdump looks for fatbins in this section.
723     FatbinSectionName =
724         CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";
725 
726     ModuleIDSectionName = CGM.getTriple().isMacOSX()
727                               ? "__NV_CUDA,__nv_module_id"
728                               : "__nv_module_id";
729     ModuleIDPrefix = "__nv_";
730 
731     // For CUDA, create a string literal containing the fat binary loaded from
732     // the given file.
733     FatBinStr = makeConstantString(std::string(CudaGpuBinary->getBuffer()), "",
734                                    FatbinConstantName, 8);
735     FatMagic = CudaFatMagic;
736   }
737 
738   // Create initialized wrapper structure that points to the loaded GPU binary
739   ConstantInitBuilder Builder(CGM);
740   auto Values = Builder.beginStruct(FatbinWrapperTy);
741   // Fatbin wrapper magic.
742   Values.addInt(IntTy, FatMagic);
743   // Fatbin version.
744   Values.addInt(IntTy, 1);
745   // Data.
746   Values.add(FatBinStr);
747   // Unused in fatbin v1.
748   Values.add(llvm::ConstantPointerNull::get(VoidPtrTy));
749   llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal(
750       addUnderscoredPrefixToName("_fatbin_wrapper"), CGM.getPointerAlign(),
751       /*constant*/ true);
752   FatbinWrapper->setSection(FatbinSectionName);
753 
754   // There is only one HIP fat binary per linked module, however there are
755   // multiple constructor functions. Make sure the fat binary is registered
756   // only once. The constructor functions are executed by the dynamic loader
757   // before the program gains control. The dynamic loader cannot execute the
758   // constructor functions concurrently since doing that would not guarantee
759   // thread safety of the loaded program. Therefore we can assume sequential
760   // execution of constructor functions here.
761   if (IsHIP) {
762     auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage :
763         llvm::GlobalValue::LinkOnceAnyLinkage;
764     llvm::BasicBlock *IfBlock =
765         llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc);
766     llvm::BasicBlock *ExitBlock =
767         llvm::BasicBlock::Create(Context, "exit", ModuleCtorFunc);
768     // The name, size, and initialization pattern of this variable is part
769     // of HIP ABI.
770     GpuBinaryHandle = new llvm::GlobalVariable(
771         TheModule, VoidPtrPtrTy, /*isConstant=*/false,
772         Linkage,
773         /*Initializer=*/llvm::ConstantPointerNull::get(VoidPtrPtrTy),
774         "__hip_gpubin_handle");
775     GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
776     // Prevent the weak symbol in different shared libraries being merged.
777     if (Linkage != llvm::GlobalValue::InternalLinkage)
778       GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility);
779     Address GpuBinaryAddr(
780         GpuBinaryHandle,
781         CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
782     {
783       auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
784       llvm::Constant *Zero =
785           llvm::Constant::getNullValue(HandleValue->getType());
786       llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue, Zero);
787       CtorBuilder.CreateCondBr(EQZero, IfBlock, ExitBlock);
788     }
789     {
790       CtorBuilder.SetInsertPoint(IfBlock);
791       // GpuBinaryHandle = __hipRegisterFatBinary(&FatbinWrapper);
792       llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
793           RegisterFatbinFunc,
794           CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
795       CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryAddr);
796       CtorBuilder.CreateBr(ExitBlock);
797     }
798     {
799       CtorBuilder.SetInsertPoint(ExitBlock);
800       // Call __hip_register_globals(GpuBinaryHandle);
801       if (RegisterGlobalsFunc) {
802         auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
803         CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue);
804       }
805     }
806   } else if (!RelocatableDeviceCode) {
807     // Register binary with CUDA runtime. This is substantially different in
808     // default mode vs. separate compilation!
809     // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
810     llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
811         RegisterFatbinFunc,
812         CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
813     GpuBinaryHandle = new llvm::GlobalVariable(
814         TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage,
815         llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle");
816     GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
817     CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
818                                    CGM.getPointerAlign());
819 
820     // Call __cuda_register_globals(GpuBinaryHandle);
821     if (RegisterGlobalsFunc)
822       CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);
823 
824     // Call __cudaRegisterFatBinaryEnd(Handle) if this CUDA version needs it.
825     if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
826                            CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) {
827       // void __cudaRegisterFatBinaryEnd(void **);
828       llvm::FunctionCallee RegisterFatbinEndFunc = CGM.CreateRuntimeFunction(
829           llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
830           "__cudaRegisterFatBinaryEnd");
831       CtorBuilder.CreateCall(RegisterFatbinEndFunc, RegisterFatbinCall);
832     }
833   } else {
834     // Generate a unique module ID.
835     SmallString<64> ModuleID;
836     llvm::raw_svector_ostream OS(ModuleID);
837     OS << ModuleIDPrefix << llvm::format("%" PRIx64, FatbinWrapper->getGUID());
838     llvm::Constant *ModuleIDConstant = makeConstantString(
839         std::string(ModuleID.str()), "", ModuleIDSectionName, 32);
840 
841     // Create an alias for the FatbinWrapper that nvcc will look for.
842     llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage,
843                               Twine("__fatbinwrap") + ModuleID, FatbinWrapper);
844 
845     // void __cudaRegisterLinkedBinary%ModuleID%(void (*)(void *), void *,
846     // void *, void (*)(void **))
847     SmallString<128> RegisterLinkedBinaryName("__cudaRegisterLinkedBinary");
848     RegisterLinkedBinaryName += ModuleID;
849     llvm::FunctionCallee RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction(
850         getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName);
851 
852     assert(RegisterGlobalsFunc && "Expecting at least dummy function!");
853     llvm::Value *Args[] = {RegisterGlobalsFunc,
854                            CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy),
855                            ModuleIDConstant,
856                            makeDummyFunction(getCallbackFnTy())};
857     CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args);
858   }
859 
860   // Create destructor and register it with atexit() the way NVCC does it. Doing
861   // it during regular destructor phase worked in CUDA before 9.2 but results in
862   // double-free in 9.2.
863   if (llvm::Function *CleanupFn = makeModuleDtorFunction()) {
864     // extern "C" int atexit(void (*f)(void));
865     llvm::FunctionType *AtExitTy =
866         llvm::FunctionType::get(IntTy, CleanupFn->getType(), false);
867     llvm::FunctionCallee AtExitFunc =
868         CGM.CreateRuntimeFunction(AtExitTy, "atexit", llvm::AttributeList(),
869                                   /*Local=*/true);
870     CtorBuilder.CreateCall(AtExitFunc, CleanupFn);
871   }
872 
873   CtorBuilder.CreateRetVoid();
874   return ModuleCtorFunc;
875 }
876 
877 /// Creates a global destructor function that unregisters the GPU code blob
878 /// registered by constructor.
879 ///
880 /// For CUDA:
881 /// \code
882 /// void __cuda_module_dtor(void*) {
883 ///     __cudaUnregisterFatBinary(Handle);
884 /// }
885 /// \endcode
886 ///
887 /// For HIP:
888 /// \code
889 /// void __hip_module_dtor(void*) {
890 ///     if (__hip_gpubin_handle) {
891 ///         __hipUnregisterFatBinary(__hip_gpubin_handle);
892 ///         __hip_gpubin_handle = 0;
893 ///     }
894 /// }
895 /// \endcode
896 llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
897   // No need for destructor if we don't have a handle to unregister.
898   if (!GpuBinaryHandle)
899     return nullptr;
900 
901   // void __cudaUnregisterFatBinary(void ** handle);
902   llvm::FunctionCallee UnregisterFatbinFunc = CGM.CreateRuntimeFunction(
903       llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
904       addUnderscoredPrefixToName("UnregisterFatBinary"));
905 
906   llvm::Function *ModuleDtorFunc = llvm::Function::Create(
907       llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
908       llvm::GlobalValue::InternalLinkage,
909       addUnderscoredPrefixToName("_module_dtor"), &TheModule);
910 
911   llvm::BasicBlock *DtorEntryBB =
912       llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc);
913   CGBuilderTy DtorBuilder(CGM, Context);
914   DtorBuilder.SetInsertPoint(DtorEntryBB);
915 
916   Address GpuBinaryAddr(GpuBinaryHandle, CharUnits::fromQuantity(
917                                              GpuBinaryHandle->getAlignment()));
918   auto HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr);
919   // There is only one HIP fat binary per linked module, however there are
920   // multiple destructor functions. Make sure the fat binary is unregistered
921   // only once.
922   if (CGM.getLangOpts().HIP) {
923     llvm::BasicBlock *IfBlock =
924         llvm::BasicBlock::Create(Context, "if", ModuleDtorFunc);
925     llvm::BasicBlock *ExitBlock =
926         llvm::BasicBlock::Create(Context, "exit", ModuleDtorFunc);
927     llvm::Constant *Zero = llvm::Constant::getNullValue(HandleValue->getType());
928     llvm::Value *NEZero = DtorBuilder.CreateICmpNE(HandleValue, Zero);
929     DtorBuilder.CreateCondBr(NEZero, IfBlock, ExitBlock);
930 
931     DtorBuilder.SetInsertPoint(IfBlock);
932     DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
933     DtorBuilder.CreateStore(Zero, GpuBinaryAddr);
934     DtorBuilder.CreateBr(ExitBlock);
935 
936     DtorBuilder.SetInsertPoint(ExitBlock);
937   } else {
938     DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
939   }
940   DtorBuilder.CreateRetVoid();
941   return ModuleDtorFunc;
942 }
943 
944 CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) {
945   return new CGNVCUDARuntime(CGM);
946 }
947 
948 void CGNVCUDARuntime::internalizeDeviceSideVar(
949     const VarDecl *D, llvm::GlobalValue::LinkageTypes &Linkage) {
950   // For -fno-gpu-rdc, host-side shadows of external declarations of device-side
951   // global variables become internal definitions. These have to be internal in
952   // order to prevent name conflicts with global host variables with the same
953   // name in a different TUs.
954   //
955   // For -fgpu-rdc, the shadow variables should not be internalized because
956   // they may be accessed by different TU.
957   if (CGM.getLangOpts().GPURelocatableDeviceCode)
958     return;
959 
960   // __shared__ variables are odd. Shadows do get created, but
961   // they are not registered with the CUDA runtime, so they
962   // can't really be used to access their device-side
963   // counterparts. It's not clear yet whether it's nvcc's bug or
964   // a feature, but we've got to do the same for compatibility.
965   if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
966       D->hasAttr<CUDASharedAttr>() ||
967       D->getType()->isCUDADeviceBuiltinSurfaceType() ||
968       D->getType()->isCUDADeviceBuiltinTextureType()) {
969     Linkage = llvm::GlobalValue::InternalLinkage;
970   }
971 }
972 
973 void CGNVCUDARuntime::handleVarRegistration(const VarDecl *D,
974                                             llvm::GlobalVariable &GV) {
975   if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>()) {
976     // Shadow variables and their properties must be registered with CUDA
977     // runtime. Skip Extern global variables, which will be registered in
978     // the TU where they are defined.
979     //
980     // Don't register a C++17 inline variable. The local symbol can be
981     // discarded and referencing a discarded local symbol from outside the
982     // comdat (__cuda_register_globals) is disallowed by the ELF spec.
983     // TODO: Reject __device__ constexpr and __device__ inline in Sema.
984     // HIP managed variables need to be always recorded in device and host
985     // compilations for transformation.
986     if ((!D->hasExternalStorage() && !D->isInline()) ||
987         D->hasAttr<HIPManagedAttr>()) {
988       registerDeviceVar(D, GV, !D->hasDefinition(),
989                         D->hasAttr<CUDAConstantAttr>());
990     }
991   } else if (D->getType()->isCUDADeviceBuiltinSurfaceType() ||
992              D->getType()->isCUDADeviceBuiltinTextureType()) {
993     // Builtin surfaces and textures and their template arguments are
994     // also registered with CUDA runtime.
995     const ClassTemplateSpecializationDecl *TD =
996         cast<ClassTemplateSpecializationDecl>(
997             D->getType()->getAs<RecordType>()->getDecl());
998     const TemplateArgumentList &Args = TD->getTemplateArgs();
999     if (TD->hasAttr<CUDADeviceBuiltinSurfaceTypeAttr>()) {
1000       assert(Args.size() == 2 &&
1001              "Unexpected number of template arguments of CUDA device "
1002              "builtin surface type.");
1003       auto SurfType = Args[1].getAsIntegral();
1004       if (!D->hasExternalStorage())
1005         registerDeviceSurf(D, GV, !D->hasDefinition(), SurfType.getSExtValue());
1006     } else {
1007       assert(Args.size() == 3 &&
1008              "Unexpected number of template arguments of CUDA device "
1009              "builtin texture type.");
1010       auto TexType = Args[1].getAsIntegral();
1011       auto Normalized = Args[2].getAsIntegral();
1012       if (!D->hasExternalStorage())
1013         registerDeviceTex(D, GV, !D->hasDefinition(), TexType.getSExtValue(),
1014                           Normalized.getZExtValue());
1015     }
1016   }
1017 }
1018 
1019 // Transform managed variables to pointers to managed variables in device code.
1020 // Each use of the original managed variable is replaced by a load from the
1021 // transformed managed variable. The transformed managed variable contains
1022 // the address of managed memory which will be allocated by the runtime.
1023 void CGNVCUDARuntime::transformManagedVars() {
1024   for (auto &&Info : DeviceVars) {
1025     llvm::GlobalVariable *Var = Info.Var;
1026     if (Info.Flags.getKind() == DeviceVarFlags::Variable &&
1027         Info.Flags.isManaged()) {
1028       auto ManagedVar = new llvm::GlobalVariable(
1029           CGM.getModule(), Var->getType(),
1030           /*isConstant=*/false, Var->getLinkage(),
1031           /*Init=*/Var->isDeclaration()
1032               ? nullptr
1033               : llvm::ConstantPointerNull::get(Var->getType()),
1034           /*Name=*/"", /*InsertBefore=*/nullptr,
1035           llvm::GlobalVariable::NotThreadLocal,
1036           CGM.getContext().getTargetAddressSpace(LangAS::cuda_device));
1037       ManagedVar->setDSOLocal(Var->isDSOLocal());
1038       ManagedVar->setVisibility(Var->getVisibility());
1039       ManagedVar->setExternallyInitialized(true);
1040       replaceManagedVar(Var, ManagedVar);
1041       ManagedVar->takeName(Var);
1042       Var->setName(Twine(ManagedVar->getName()) + ".managed");
1043       // Keep managed variables even if they are not used in device code since
1044       // they need to be allocated by the runtime.
1045       if (!Var->isDeclaration()) {
1046         assert(!ManagedVar->isDeclaration());
1047         CGM.addCompilerUsedGlobal(Var);
1048         CGM.addCompilerUsedGlobal(ManagedVar);
1049       }
1050     }
1051   }
1052 }
1053 
1054 // Returns module constructor to be added.
1055 llvm::Function *CGNVCUDARuntime::finalizeModule() {
1056   if (CGM.getLangOpts().CUDAIsDevice) {
1057     transformManagedVars();
1058     return nullptr;
1059   }
1060   return makeModuleCtorFunction();
1061 }
1062