//===----- CGCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This provides a class for CUDA code generation targeting the NVIDIA CUDA
// runtime library.
//
//===----------------------------------------------------------------------===//

#include "CGCUDARuntime.h"
#include "CodeGenFunction.h"
#include "CodeGenModule.h"
#include "clang/AST/Decl.h"
#include "clang/Basic/Cuda.h"
#include "clang/CodeGen/CodeGenABITypes.h"
#include "clang/CodeGen/ConstantInitBuilder.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/ReplaceConstant.h"
#include "llvm/Support/Format.h"

using namespace clang;
using namespace CodeGen;

namespace {
constexpr unsigned CudaFatMagic = 0x466243b1;
constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF"

class CGNVCUDARuntime : public CGCUDARuntime {

private:
  llvm::IntegerType *IntTy, *SizeTy;
  llvm::Type *VoidTy;
  llvm::PointerType *CharPtrTy, *VoidPtrTy, *VoidPtrPtrTy;

  /// Convenience reference to LLVM Context
  llvm::LLVMContext &Context;
  /// Convenience reference to the current module
  llvm::Module &TheModule;
  /// Keeps track of kernel launch stubs emitted in this module
  struct KernelInfo {
    llvm::Function *Kernel;
    const Decl *D;
  };
  llvm::SmallVector<KernelInfo, 16> EmittedKernels;
  struct VarInfo {
    llvm::GlobalVariable *Var;
    const VarDecl *D;
    DeviceVarFlags Flags;
  };
  llvm::SmallVector<VarInfo, 16> DeviceVars;
  /// Keeps track of the variable containing the handle of the GPU binary.
  /// Populated by ModuleCtorFunction() and used to create corresponding
  /// cleanup calls in ModuleDtorFunction().
  llvm::GlobalVariable *GpuBinaryHandle = nullptr;
  /// Whether we generate relocatable device code.
  bool RelocatableDeviceCode;
  /// Mangle context for device.
  std::unique_ptr<MangleContext> DeviceMC;

  llvm::FunctionCallee getSetupArgumentFn() const;
  llvm::FunctionCallee getLaunchFn() const;

  llvm::FunctionType *getRegisterGlobalsFnTy() const;
  llvm::FunctionType *getCallbackFnTy() const;
  llvm::FunctionType *getRegisterLinkedBinaryFnTy() const;
  std::string addPrefixToName(StringRef FuncName) const;
  std::string addUnderscoredPrefixToName(StringRef FuncName) const;

  /// Creates a function to register all kernel stubs generated in this module.
  llvm::Function *makeRegisterGlobalsFn();

  /// Helper function that generates a constant string and returns a pointer to
  /// the start of the string.  The result of this function can be used anywhere
  /// where the C code specifies const char*.
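  ///
  /// A sketch of typical use (illustrative, not a call made at this point):
  /// \code
  ///   llvm::Constant *Str = makeConstantString("kernel"); // i8* into "kernel\0"
  /// \endcode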
  llvm::Constant *makeConstantString(const std::string &Str,
                                     const std::string &Name = "",
                                     const std::string &SectionName = "",
                                     unsigned Alignment = 0) {
    llvm::Constant *Zeros[] = {llvm::ConstantInt::get(SizeTy, 0),
                               llvm::ConstantInt::get(SizeTy, 0)};
    auto ConstStr = CGM.GetAddrOfConstantCString(Str, Name.c_str());
    llvm::GlobalVariable *GV =
        cast<llvm::GlobalVariable>(ConstStr.getPointer());
    if (!SectionName.empty()) {
      GV->setSection(SectionName);
      // Mark the address as used, which makes sure that this section isn't
      // merged and we will really have it in the object file.
      GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None);
    }
    if (Alignment)
      GV->setAlignment(llvm::Align(Alignment));

    return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(),
                                                ConstStr.getPointer(), Zeros);
  }

  /// Helper function that generates an empty dummy function returning void.
  llvm::Function *makeDummyFunction(llvm::FunctionType *FnTy) {
    assert(FnTy->getReturnType()->isVoidTy() &&
           "Can only generate dummy functions returning void!");
    llvm::Function *DummyFunc = llvm::Function::Create(
        FnTy, llvm::GlobalValue::InternalLinkage, "dummy", &TheModule);

    llvm::BasicBlock *DummyBlock =
        llvm::BasicBlock::Create(Context, "", DummyFunc);
    CGBuilderTy FuncBuilder(CGM, Context);
    FuncBuilder.SetInsertPoint(DummyBlock);
    FuncBuilder.CreateRetVoid();

    return DummyFunc;
  }

  void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args);
  void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args);
  std::string getDeviceSideName(const NamedDecl *ND) override;

  void registerDeviceVar(const VarDecl *VD, llvm::GlobalVariable &Var,
                         bool Extern, bool Constant) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Variable, Extern, Constant,
                           VD->hasAttr<HIPManagedAttr>(),
                           /*Normalized*/ false, 0}});
  }
  void registerDeviceSurf(const VarDecl *VD, llvm::GlobalVariable &Var,
                          bool Extern, int Type) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Surface, Extern, /*Constant*/ false,
                           /*Managed*/ false,
                           /*Normalized*/ false, Type}});
  }
  void registerDeviceTex(const VarDecl *VD, llvm::GlobalVariable &Var,
                         bool Extern, int Type, bool Normalized) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Texture, Extern, /*Constant*/ false,
                           /*Managed*/ false, Normalized, Type}});
  }

public:
  CGNVCUDARuntime(CodeGenModule &CGM);

  void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override;
  void handleVarRegistration(const VarDecl *VD,
                             llvm::GlobalVariable &Var) override;

  /// Creates module constructor function
  llvm::Function *makeModuleCtorFunction() override;
  /// Creates module destructor function
  llvm::Function *makeModuleDtorFunction() override;
  void
  internalizeDeviceSideVar(const VarDecl *D,
                           llvm::GlobalValue::LinkageTypes &Linkage) override;
};

}

std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const {
  if (CGM.getLangOpts().HIP)
    return ((Twine("hip") + Twine(FuncName)).str());
  return ((Twine("cuda") + Twine(FuncName)).str());
}
std::string
CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const {
  if (CGM.getLangOpts().HIP)
    return ((Twine("__hip") + Twine(FuncName)).str());
  return ((Twine("__cuda") + Twine(FuncName)).str());
}

CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
    : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()),
      TheModule(CGM.getModule()),
      RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode),
      DeviceMC(CGM.getContext().createMangleContext(
          CGM.getContext().getAuxTargetInfo())) {
  CodeGen::CodeGenTypes &Types = CGM.getTypes();
  ASTContext &Ctx = CGM.getContext();

  IntTy = CGM.IntTy;
  SizeTy = CGM.SizeTy;
  VoidTy = CGM.VoidTy;

  CharPtrTy = llvm::PointerType::getUnqual(Types.ConvertType(Ctx.CharTy));
  VoidPtrTy = cast<llvm::PointerType>(Types.ConvertType(Ctx.VoidPtrTy));
  VoidPtrPtrTy = VoidPtrTy->getPointerTo();
  if (CGM.getContext().getAuxTargetInfo()) {
    // If the host and device have different C++ ABIs, mark it as the device
    // mangle context so that name mangling retrieves the additional device
    // lambda mangling number instead of the regular host one.
    DeviceMC->setDeviceMangleContext(
        CGM.getContext().getTargetInfo().getCXXABI().isMicrosoft() &&
        CGM.getContext().getAuxTargetInfo()->getCXXABI().isItaniumFamily());
  }
}

llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const {
  // cudaError_t cudaSetupArgument(void *, size_t, size_t)
  llvm::Type *Params[] = {VoidPtrTy, SizeTy, SizeTy};
  return CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy, Params, false),
      addPrefixToName("SetupArgument"));
}

llvm::FunctionCallee CGNVCUDARuntime::getLaunchFn() const {
  if (CGM.getLangOpts().HIP) {
    // hipError_t hipLaunchByPtr(char *);
    return CGM.CreateRuntimeFunction(
        llvm::FunctionType::get(IntTy, CharPtrTy, false), "hipLaunchByPtr");
  } else {
    // cudaError_t cudaLaunch(char *);
    return CGM.CreateRuntimeFunction(
        llvm::FunctionType::get(IntTy, CharPtrTy, false), "cudaLaunch");
  }
}

llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy() const {
  return llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false);
}

llvm::FunctionType *CGNVCUDARuntime::getCallbackFnTy() const {
  return llvm::FunctionType::get(VoidTy, VoidPtrTy, false);
}

llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy() const {
  auto CallbackFnTy = getCallbackFnTy();
  auto RegisterGlobalsFnTy = getRegisterGlobalsFnTy();
  llvm::Type *Params[] = {RegisterGlobalsFnTy->getPointerTo(), VoidPtrTy,
                          VoidPtrTy, CallbackFnTy->getPointerTo()};
  return llvm::FunctionType::get(VoidTy, Params, false);
}

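// Returns the device-side mangled name for a kernel or device variable; for
// example, `__global__ void kernel(int)` yields "_Z6kerneli" under the
// Itanium ABI.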
std::string CGNVCUDARuntime::getDeviceSideName(const NamedDecl *ND) {
  GlobalDecl GD;
  // ND could be either a kernel or a variable.
  if (auto *FD = dyn_cast<FunctionDecl>(ND))
    GD = GlobalDecl(FD, KernelReferenceKind::Kernel);
  else
    GD = GlobalDecl(ND);
  std::string DeviceSideName;
  if (DeviceMC->shouldMangleDeclName(ND)) {
    SmallString<256> Buffer;
    llvm::raw_svector_ostream Out(Buffer);
    DeviceMC->mangleName(GD, Out);
    DeviceSideName = std::string(Out.str());
  } else
    DeviceSideName = std::string(ND->getIdentifier()->getName());
  return DeviceSideName;
}

void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
                                     FunctionArgList &Args) {
  EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl});
  if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
                         CudaFeature::CUDA_USES_NEW_LAUNCH) ||
      (CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI))
    emitDeviceStubBodyNew(CGF, Args);
  else
    emitDeviceStubBodyLegacy(CGF, Args);
}

// CUDA 9.0+ uses a new way to launch kernels: parameters are packed in a local
// array and the kernel is launched using cudaLaunchKernel().
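//
// A sketch of the stub body this emits (parameter and stub names are
// illustrative):
// \code
// void __device_stub__kernel(int a, float b) {
//   void *kernel_args[] = {&a, &b};
//   dim3 grid_dim, block_dim;
//   size_t shmem_size;
//   void *stream;
//   __cudaPopCallConfiguration(&grid_dim, &block_dim, &shmem_size, &stream);
//   cudaLaunchKernel((const void *)__device_stub__kernel, grid_dim, block_dim,
//                    kernel_args, shmem_size, (cudaStream_t)stream);
// }
// \endcode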
void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
                                            FunctionArgList &Args) {
  // Build the shadow stack entry at the very start of the function.

  // Calculate the amount of space we will need for all arguments.  If we have
  // no args, allocate a single pointer so we still have a valid pointer to the
  // argument array that we can pass to the runtime, even if it will be unused.
  Address KernelArgs = CGF.CreateTempAlloca(
      VoidPtrTy, CharUnits::fromQuantity(16), "kernel_args",
      llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size())));
  // Store pointers to the arguments in the locally allocated kernel_args.
  for (unsigned i = 0; i < Args.size(); ++i) {
    llvm::Value *VarPtr = CGF.GetAddrOfLocalVar(Args[i]).getPointer();
    llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(VarPtr, VoidPtrTy);
    CGF.Builder.CreateDefaultAlignedStore(
        VoidVarPtr, CGF.Builder.CreateConstGEP1_32(KernelArgs.getPointer(), i));
  }

  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");

  // Lookup cudaLaunchKernel/hipLaunchKernel function.
  // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
  //                              void **args, size_t sharedMem,
  //                              cudaStream_t stream);
  // hipError_t hipLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
  //                            void **args, size_t sharedMem,
  //                            hipStream_t stream);
  TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl();
  DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl);
  auto LaunchKernelName = addPrefixToName("LaunchKernel");
  IdentifierInfo &cudaLaunchKernelII =
      CGM.getContext().Idents.get(LaunchKernelName);
  FunctionDecl *cudaLaunchKernelFD = nullptr;
  for (const auto &Result : DC->lookup(&cudaLaunchKernelII)) {
    if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Result))
      cudaLaunchKernelFD = FD;
  }

  if (cudaLaunchKernelFD == nullptr) {
    CGM.Error(CGF.CurFuncDecl->getLocation(),
              "Can't find declaration for " + LaunchKernelName);
    return;
  }
  // Create temporary dim3 grid_dim, block_dim.
  ParmVarDecl *GridDimParam = cudaLaunchKernelFD->getParamDecl(1);
  QualType Dim3Ty = GridDimParam->getType();
  Address GridDim =
      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "grid_dim");
  Address BlockDim =
      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim");
  Address ShmemSize =
      CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size");
  Address Stream =
      CGF.CreateTempAlloca(VoidPtrTy, CGM.getPointerAlign(), "stream");
  llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy,
                              {/*gridDim=*/GridDim.getType(),
                               /*blockDim=*/BlockDim.getType(),
                               /*ShmemSize=*/ShmemSize.getType(),
                               /*Stream=*/Stream.getType()},
                              /*isVarArg=*/false),
      addUnderscoredPrefixToName("PopCallConfiguration"));

  CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn,
                              {GridDim.getPointer(), BlockDim.getPointer(),
                               ShmemSize.getPointer(), Stream.getPointer()});
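  // The matching __cudaPushCallConfiguration() (or __hipPushCallConfiguration()
  // for HIP) call is emitted at the kernel<<<...>>> call site; the pop above
  // retrieves the launch configuration recorded there.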

  // Emit the call to cudaLaunchKernel/hipLaunchKernel.
  llvm::Value *Kernel = CGF.Builder.CreatePointerCast(CGF.CurFn, VoidPtrTy);
  CallArgList LaunchKernelArgs;
  LaunchKernelArgs.add(RValue::get(Kernel),
                       cudaLaunchKernelFD->getParamDecl(0)->getType());
  LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty);
  LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty);
  LaunchKernelArgs.add(RValue::get(KernelArgs.getPointer()),
                       cudaLaunchKernelFD->getParamDecl(3)->getType());
  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(ShmemSize)),
                       cudaLaunchKernelFD->getParamDecl(4)->getType());
  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(Stream)),
                       cudaLaunchKernelFD->getParamDecl(5)->getType());

  QualType QT = cudaLaunchKernelFD->getType();
  QualType CQT = QT.getCanonicalType();
  llvm::Type *Ty = CGM.getTypes().ConvertType(CQT);
  llvm::FunctionType *FTy = dyn_cast<llvm::FunctionType>(Ty);

  const CGFunctionInfo &FI =
      CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD);
  llvm::FunctionCallee cudaLaunchKernelFn =
      CGM.CreateRuntimeFunction(FTy, LaunchKernelName);
  CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(),
               LaunchKernelArgs);
  CGF.EmitBranch(EndBlock);

  CGF.EmitBlock(EndBlock);
}

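// Legacy (pre-CUDA 9.0) launch: each argument is copied into the launch buffer
// with cudaSetupArgument() at its aligned offset, then cudaLaunch() launches
// the kernel. A sketch of the generated stub (names and offsets illustrative):
// \code
// void __device_stub__kernel(int a, float b) {
//   if (cudaSetupArgument(&a, sizeof(a), 0) == 0)
//     if (cudaSetupArgument(&b, sizeof(b), 4) == 0)
//       cudaLaunch((char *)__device_stub__kernel);
// }
// \endcode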
void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF,
                                               FunctionArgList &Args) {
  // Emit a call to cudaSetupArgument for each arg in Args.
  llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn();
  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
  CharUnits Offset = CharUnits::Zero();
  for (const VarDecl *A : Args) {
    auto TInfo = CGM.getContext().getTypeInfoInChars(A->getType());
    Offset = Offset.alignTo(TInfo.Align);
    llvm::Value *Args[] = {
        CGF.Builder.CreatePointerCast(CGF.GetAddrOfLocalVar(A).getPointer(),
                                      VoidPtrTy),
        llvm::ConstantInt::get(SizeTy, TInfo.Width.getQuantity()),
        llvm::ConstantInt::get(SizeTy, Offset.getQuantity()),
    };
    llvm::CallBase *CB = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args);
    llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0);
    llvm::Value *CBZero = CGF.Builder.CreateICmpEQ(CB, Zero);
    llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next");
    CGF.Builder.CreateCondBr(CBZero, NextBlock, EndBlock);
    CGF.EmitBlock(NextBlock);
    Offset += TInfo.Width;
  }

  // Emit the call to cudaLaunch
  llvm::FunctionCallee cudaLaunchFn = getLaunchFn();
  llvm::Value *Arg = CGF.Builder.CreatePointerCast(CGF.CurFn, CharPtrTy);
  CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg);
  CGF.EmitBranch(EndBlock);

  CGF.EmitBlock(EndBlock);
}

// Replace the original variable Var with the address loaded from the variable
// ManagedVar, which is populated by the HIP runtime.
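//
// For example, a direct use `%v = load i32, i32* @var` becomes (a sketch):
// \code
//   %ld.managed = load i32*, i32** @var.managed
//   %v = load i32, i32* %ld.managed
// \endcode
// Uses inside constant expressions are first rewritten as instructions so the
// inserted load can replace the variable there too.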
static void replaceManagedVar(llvm::GlobalVariable *Var,
                              llvm::GlobalVariable *ManagedVar) {
  SmallVector<SmallVector<llvm::User *, 8>, 8> WorkList;
  for (auto &&VarUse : Var->uses()) {
    WorkList.push_back({VarUse.getUser()});
  }
  while (!WorkList.empty()) {
    auto &&WorkItem = WorkList.pop_back_val();
    auto *U = WorkItem.back();
    if (isa<llvm::ConstantExpr>(U)) {
      for (auto &&UU : U->uses()) {
        WorkItem.push_back(UU.getUser());
        WorkList.push_back(WorkItem);
        WorkItem.pop_back();
      }
      continue;
    }
    if (auto *I = dyn_cast<llvm::Instruction>(U)) {
      llvm::Value *OldV = Var;
      llvm::Instruction *NewV =
          new llvm::LoadInst(Var->getType(), ManagedVar, "ld.managed", false,
                             llvm::Align(Var->getAlignment()), I);
      WorkItem.pop_back();
      // Replace constant expressions directly or indirectly using the managed
      // variable with instructions.
      for (auto &&Op : WorkItem) {
        auto *CE = cast<llvm::ConstantExpr>(Op);
        auto *NewInst = llvm::createReplacementInstr(CE, I);
        NewInst->replaceUsesOfWith(OldV, NewV);
        OldV = CE;
        NewV = NewInst;
      }
      I->replaceUsesOfWith(OldV, NewV);
    } else {
      llvm_unreachable("Invalid use of managed variable");
    }
  }
}

/// Creates a function that sets up state on the host side for CUDA objects that
/// have a presence on both the host and device sides. Specifically, registers
/// the host side of kernel functions and device global variables with the CUDA
/// runtime.
/// \code
/// void __cuda_register_globals(void** GpuBinaryHandle) {
///    __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...);
///    ...
///    __cudaRegisterFunction(GpuBinaryHandle,KernelM,...);
///    __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...);
///    ...
///    __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...);
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
  // No need to register anything
  if (EmittedKernels.empty() && DeviceVars.empty())
    return nullptr;

  llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
      getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_register_globals"), &TheModule);
  llvm::BasicBlock *EntryBB =
      llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc);
  CGBuilderTy Builder(CGM, Context);
  Builder.SetInsertPoint(EntryBB);

  // void __cudaRegisterFunction(void **, const char *, char *, const char *,
  //                             int, uint3*, uint3*, dim3*, dim3*, int*)
  llvm::Type *RegisterFuncParams[] = {
      VoidPtrPtrTy, CharPtrTy, CharPtrTy, CharPtrTy, IntTy,
      VoidPtrTy,    VoidPtrTy, VoidPtrTy, VoidPtrTy, IntTy->getPointerTo()};
  llvm::FunctionCallee RegisterFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy, RegisterFuncParams, false),
      addUnderscoredPrefixToName("RegisterFunction"));

  // Extract GpuBinaryHandle, passed as the first argument to
  // __cuda_register_globals(), and generate a __cudaRegisterFunction() call
  // for each emitted kernel.
  llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin();
  for (auto &&I : EmittedKernels) {
    llvm::Constant *KernelName =
        makeConstantString(getDeviceSideName(cast<NamedDecl>(I.D)));
    llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy);
    llvm::Value *Args[] = {
        &GpuBinaryHandlePtr,
        Builder.CreateBitCast(I.Kernel, VoidPtrTy),
        KernelName,
        KernelName,
        llvm::ConstantInt::get(IntTy, -1),
        NullPtr,
        NullPtr,
        NullPtr,
        NullPtr,
        llvm::ConstantPointerNull::get(IntTy->getPointerTo())};
    Builder.CreateCall(RegisterFunc, Args);
  }

  llvm::Type *VarSizeTy = IntTy;
  // For HIP and CUDA 9.0+, the device variable size has type `size_t`.
  if (CGM.getLangOpts().HIP ||
      ToCudaVersion(CGM.getTarget().getSDKVersion()) >= CudaVersion::CUDA_90)
    VarSizeTy = SizeTy;

  // void __cudaRegisterVar(void **, char *, char *, const char *,
  //                        int, int, int, int)
  llvm::Type *RegisterVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy,
                                     CharPtrTy,    IntTy,     VarSizeTy,
                                     IntTy,        IntTy};
  llvm::FunctionCallee RegisterVar = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, RegisterVarParams, false),
      addUnderscoredPrefixToName("RegisterVar"));
  // void __hipRegisterManagedVar(void **, char *, char *, const char *,
  //                              size_t, unsigned)
  llvm::Type *RegisterManagedVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy,
                                            CharPtrTy,    VarSizeTy, IntTy};
  llvm::FunctionCallee RegisterManagedVar = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, RegisterManagedVarParams, false),
      addUnderscoredPrefixToName("RegisterManagedVar"));
  // void __cudaRegisterSurface(void **, const struct surfaceReference *,
  //                            const void **, const char *, int, int);
  llvm::FunctionCallee RegisterSurf = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(
          VoidTy, {VoidPtrPtrTy, VoidPtrTy, CharPtrTy, CharPtrTy, IntTy, IntTy},
          false),
      addUnderscoredPrefixToName("RegisterSurface"));
  // void __cudaRegisterTexture(void **, const struct textureReference *,
  //                            const void **, const char *, int, int, int)
  llvm::FunctionCallee RegisterTex = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(
          VoidTy,
          {VoidPtrPtrTy, VoidPtrTy, CharPtrTy, CharPtrTy, IntTy, IntTy, IntTy},
          false),
      addUnderscoredPrefixToName("RegisterTexture"));
  for (auto &&Info : DeviceVars) {
    llvm::GlobalVariable *Var = Info.Var;
    llvm::Constant *VarName = makeConstantString(getDeviceSideName(Info.D));
    switch (Info.Flags.getKind()) {
    case DeviceVarFlags::Variable: {
      uint64_t VarSize =
          CGM.getDataLayout().getTypeAllocSize(Var->getValueType());
      if (Info.Flags.isManaged()) {
        auto ManagedVar = new llvm::GlobalVariable(
            CGM.getModule(), Var->getType(),
            /*isConstant=*/false, Var->getLinkage(),
            /*Init=*/llvm::ConstantPointerNull::get(Var->getType()),
            Twine(Var->getName() + ".managed"), /*InsertBefore=*/nullptr,
            llvm::GlobalVariable::NotThreadLocal);
        ManagedVar->setDSOLocal(Var->isDSOLocal());
        ManagedVar->setVisibility(Var->getVisibility());
        replaceManagedVar(Var, ManagedVar);
        llvm::Value *Args[] = {
            &GpuBinaryHandlePtr,
            Builder.CreateBitCast(ManagedVar, VoidPtrTy),
            Builder.CreateBitCast(Var, VoidPtrTy),
            VarName,
            llvm::ConstantInt::get(VarSizeTy, VarSize),
            llvm::ConstantInt::get(IntTy, Var->getAlignment())};
        Builder.CreateCall(RegisterManagedVar, Args);
      } else {
        llvm::Value *Args[] = {
            &GpuBinaryHandlePtr,
            Builder.CreateBitCast(Var, VoidPtrTy),
            VarName,
            VarName,
            llvm::ConstantInt::get(IntTy, Info.Flags.isExtern()),
            llvm::ConstantInt::get(VarSizeTy, VarSize),
            llvm::ConstantInt::get(IntTy, Info.Flags.isConstant()),
            llvm::ConstantInt::get(IntTy, 0)};
        Builder.CreateCall(RegisterVar, Args);
      }
      break;
    }
    case DeviceVarFlags::Surface:
      Builder.CreateCall(
          RegisterSurf,
          {&GpuBinaryHandlePtr, Builder.CreateBitCast(Var, VoidPtrTy), VarName,
           VarName, llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
      break;
    case DeviceVarFlags::Texture:
      Builder.CreateCall(
          RegisterTex,
          {&GpuBinaryHandlePtr, Builder.CreateBitCast(Var, VoidPtrTy), VarName,
           VarName, llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isNormalized()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
      break;
    }
  }

  Builder.CreateRetVoid();
  return RegisterKernelsFunc;
}

/// Creates a global constructor function for the module:
///
/// For CUDA:
/// \code
/// void __cuda_module_ctor(void*) {
///     Handle = __cudaRegisterFatBinary(GpuBinaryBlob);
///     __cuda_register_globals(Handle);
/// }
/// \endcode
///
/// For HIP:
/// \code
/// void __hip_module_ctor(void*) {
///     if (__hip_gpubin_handle == 0) {
///         __hip_gpubin_handle = __hipRegisterFatBinary(GpuBinaryBlob);
///         __hip_register_globals(__hip_gpubin_handle);
///     }
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
  bool IsHIP = CGM.getLangOpts().HIP;
  bool IsCUDA = CGM.getLangOpts().CUDA;
  // No need to generate ctors/dtors if there is no GPU binary.
  StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName;
  if (CudaGpuBinaryFileName.empty() && !IsHIP)
    return nullptr;
  if ((IsHIP || (IsCUDA && !RelocatableDeviceCode)) && EmittedKernels.empty() &&
      DeviceVars.empty())
    return nullptr;

  // void __{cuda|hip}_register_globals(void* handle);
  llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
  // We always need a function to pass in as a callback. Create a dummy
  // implementation if we don't need to register anything.
  if (RelocatableDeviceCode && !RegisterGlobalsFunc)
    RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy());

  // void ** __{cuda|hip}RegisterFatBinary(void *);
  llvm::FunctionCallee RegisterFatbinFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false),
      addUnderscoredPrefixToName("RegisterFatBinary"));
  // struct { int magic; int version; void *gpu_binary; void *dont_care; };
  llvm::StructType *FatbinWrapperTy =
      llvm::StructType::get(IntTy, IntTy, VoidPtrTy, VoidPtrTy);

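  // For CUDA this lowers to IR along the lines of (a sketch; the fatbin data
  // symbol name is illustrative):
  // \code
  // @__cuda_fatbin_wrapper = internal constant { i32, i32, i8*, i8* }
  //   { i32 1180844977, i32 1, i8* @.fatbin, i8* null }, section ".nvFatBinSegment"
  // \endcode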
  // Register the GPU binary with the CUDA runtime, store the returned handle
  // in a global variable, and save a reference in GpuBinaryHandle to be
  // cleaned up in the destructor on exit. Then associate all known kernels
  // with the GPU binary handle so the CUDA runtime can figure out what to call
  // on the GPU side.
  std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary = nullptr;
  if (!CudaGpuBinaryFileName.empty()) {
    llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> CudaGpuBinaryOrErr =
        llvm::MemoryBuffer::getFileOrSTDIN(CudaGpuBinaryFileName);
    if (std::error_code EC = CudaGpuBinaryOrErr.getError()) {
      CGM.getDiags().Report(diag::err_cannot_open_file)
          << CudaGpuBinaryFileName << EC.message();
      return nullptr;
    }
    CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get());
  }

  llvm::Function *ModuleCtorFunc = llvm::Function::Create(
      llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
      llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_module_ctor"), &TheModule);
  llvm::BasicBlock *CtorEntryBB =
      llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc);
  CGBuilderTy CtorBuilder(CGM, Context);

  CtorBuilder.SetInsertPoint(CtorEntryBB);

  const char *FatbinConstantName;
  const char *FatbinSectionName;
  const char *ModuleIDSectionName;
  StringRef ModuleIDPrefix;
  llvm::Constant *FatBinStr;
  unsigned FatMagic;
  if (IsHIP) {
    FatbinConstantName = ".hip_fatbin";
    FatbinSectionName = ".hipFatBinSegment";

    ModuleIDSectionName = "__hip_module_id";
    ModuleIDPrefix = "__hip_";

    if (CudaGpuBinary) {
      // If fatbin is available from early finalization, create a string
      // literal containing the fat binary loaded from the given file.
      const unsigned HIPCodeObjectAlign = 4096;
      FatBinStr =
          makeConstantString(std::string(CudaGpuBinary->getBuffer()), "",
                             FatbinConstantName, HIPCodeObjectAlign);
    } else {
      // If the fatbin is not available, create an external symbol
      // __hip_fatbin in section .hip_fatbin. The external symbol is supposed
      // to contain the fat binary but will be populated elsewhere, e.g. by
      // lld through a linker script.
      FatBinStr = new llvm::GlobalVariable(
        CGM.getModule(), CGM.Int8Ty,
        /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr,
        "__hip_fatbin", nullptr,
        llvm::GlobalVariable::NotThreadLocal);
      cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName);
    }

    FatMagic = HIPFatMagic;
  } else {
    if (RelocatableDeviceCode)
      FatbinConstantName = CGM.getTriple().isMacOSX()
                               ? "__NV_CUDA,__nv_relfatbin"
                               : "__nv_relfatbin";
    else
      FatbinConstantName =
          CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
    // NVIDIA's cuobjdump looks for fatbins in this section.
    FatbinSectionName =
        CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";

    ModuleIDSectionName = CGM.getTriple().isMacOSX()
                              ? "__NV_CUDA,__nv_module_id"
                              : "__nv_module_id";
    ModuleIDPrefix = "__nv_";

    // For CUDA, create a string literal containing the fat binary loaded from
    // the given file.
    FatBinStr = makeConstantString(std::string(CudaGpuBinary->getBuffer()), "",
                                   FatbinConstantName, 8);
    FatMagic = CudaFatMagic;
  }

  // Create initialized wrapper structure that points to the loaded GPU binary
  ConstantInitBuilder Builder(CGM);
  auto Values = Builder.beginStruct(FatbinWrapperTy);
  // Fatbin wrapper magic.
  Values.addInt(IntTy, FatMagic);
  // Fatbin version.
  Values.addInt(IntTy, 1);
  // Data.
  Values.add(FatBinStr);
  // Unused in fatbin v1.
  Values.add(llvm::ConstantPointerNull::get(VoidPtrTy));
  llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal(
      addUnderscoredPrefixToName("_fatbin_wrapper"), CGM.getPointerAlign(),
      /*constant*/ true);
  FatbinWrapper->setSection(FatbinSectionName);

  // There is only one HIP fat binary per linked module, however there are
  // multiple constructor functions. Make sure the fat binary is registered
  // only once. The constructor functions are executed by the dynamic loader
  // before the program gains control. The dynamic loader cannot execute the
  // constructor functions concurrently since doing that would not guarantee
  // thread safety of the loaded program. Therefore we can assume sequential
  // execution of constructor functions here.
  if (IsHIP) {
    auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage :
        llvm::GlobalValue::LinkOnceAnyLinkage;
    llvm::BasicBlock *IfBlock =
        llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc);
    llvm::BasicBlock *ExitBlock =
        llvm::BasicBlock::Create(Context, "exit", ModuleCtorFunc);
    // The name, size, and initialization pattern of this variable are part
    // of the HIP ABI.
    GpuBinaryHandle = new llvm::GlobalVariable(
        TheModule, VoidPtrPtrTy, /*isConstant=*/false,
        Linkage,
        /*Initializer=*/llvm::ConstantPointerNull::get(VoidPtrPtrTy),
        "__hip_gpubin_handle");
    GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
    // Prevent the weak symbol in different shared libraries from being merged.
    if (Linkage != llvm::GlobalValue::InternalLinkage)
      GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility);
    Address GpuBinaryAddr(
        GpuBinaryHandle,
        CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
    {
      auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
      llvm::Constant *Zero =
          llvm::Constant::getNullValue(HandleValue->getType());
      llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue, Zero);
      CtorBuilder.CreateCondBr(EQZero, IfBlock, ExitBlock);
    }
    {
      CtorBuilder.SetInsertPoint(IfBlock);
      // GpuBinaryHandle = __hipRegisterFatBinary(&FatbinWrapper);
      llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
          RegisterFatbinFunc,
          CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
      CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryAddr);
      CtorBuilder.CreateBr(ExitBlock);
    }
    {
      CtorBuilder.SetInsertPoint(ExitBlock);
      // Call __hip_register_globals(GpuBinaryHandle);
      if (RegisterGlobalsFunc) {
        auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
        CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue);
      }
    }
  } else if (!RelocatableDeviceCode) {
    // Register binary with CUDA runtime. This is substantially different in
    // default mode vs. separate compilation!
    // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
    llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
        RegisterFatbinFunc,
        CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
    GpuBinaryHandle = new llvm::GlobalVariable(
        TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage,
        llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle");
    GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
    CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
                                   CGM.getPointerAlign());

    // Call __cuda_register_globals(GpuBinaryHandle);
    if (RegisterGlobalsFunc)
      CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);

    // Call __cudaRegisterFatBinaryEnd(Handle) if this CUDA version needs it.
    if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
                           CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) {
      // void __cudaRegisterFatBinaryEnd(void **);
      llvm::FunctionCallee RegisterFatbinEndFunc = CGM.CreateRuntimeFunction(
          llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
          "__cudaRegisterFatBinaryEnd");
      CtorBuilder.CreateCall(RegisterFatbinEndFunc, RegisterFatbinCall);
    }
  } else {
    // Generate a unique module ID.
    SmallString<64> ModuleID;
    llvm::raw_svector_ostream OS(ModuleID);
    OS << ModuleIDPrefix << llvm::format("%" PRIx64, FatbinWrapper->getGUID());
    llvm::Constant *ModuleIDConstant = makeConstantString(
        std::string(ModuleID.str()), "", ModuleIDSectionName, 32);

    // Create an alias for the FatbinWrapper that nvcc will look for.
    llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage,
                              Twine("__fatbinwrap") + ModuleID, FatbinWrapper);

    // void __cudaRegisterLinkedBinary%ModuleID%(void (*)(void *), void *,
    // void *, void (*)(void **))
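    //
    // A sketch of the resulting registration (the module ID hash is
    // illustrative):
    // \code
    // __cudaRegisterLinkedBinary__nv_cafef00d(__cuda_register_globals,
    //                                         &__cuda_fatbin_wrapper,
    //                                         "__nv_cafef00d", dummy_callback);
    // \endcode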
    SmallString<128> RegisterLinkedBinaryName("__cudaRegisterLinkedBinary");
    RegisterLinkedBinaryName += ModuleID;
    llvm::FunctionCallee RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction(
        getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName);

    assert(RegisterGlobalsFunc && "Expecting at least dummy function!");
    llvm::Value *Args[] = {RegisterGlobalsFunc,
                           CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy),
                           ModuleIDConstant,
                           makeDummyFunction(getCallbackFnTy())};
    CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args);
  }

  // Create a destructor and register it with atexit() the way NVCC does it.
  // Doing it during the regular destructor phase worked in CUDA before 9.2,
  // but results in a double-free in 9.2.
  if (llvm::Function *CleanupFn = makeModuleDtorFunction()) {
    // extern "C" int atexit(void (*f)(void));
    llvm::FunctionType *AtExitTy =
        llvm::FunctionType::get(IntTy, CleanupFn->getType(), false);
    llvm::FunctionCallee AtExitFunc =
        CGM.CreateRuntimeFunction(AtExitTy, "atexit", llvm::AttributeList(),
                                  /*Local=*/true);
    CtorBuilder.CreateCall(AtExitFunc, CleanupFn);
  }

  CtorBuilder.CreateRetVoid();
  return ModuleCtorFunc;
}

/// Creates a global destructor function that unregisters the GPU code blob
/// registered by constructor.
///
/// For CUDA:
/// \code
/// void __cuda_module_dtor(void*) {
///     __cudaUnregisterFatBinary(Handle);
/// }
/// \endcode
///
/// For HIP:
/// \code
/// void __hip_module_dtor(void*) {
///     if (__hip_gpubin_handle) {
///         __hipUnregisterFatBinary(__hip_gpubin_handle);
///         __hip_gpubin_handle = 0;
///     }
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
  // No need for destructor if we don't have a handle to unregister.
  if (!GpuBinaryHandle)
    return nullptr;

  // void __cudaUnregisterFatBinary(void ** handle);
  llvm::FunctionCallee UnregisterFatbinFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
      addUnderscoredPrefixToName("UnregisterFatBinary"));

  llvm::Function *ModuleDtorFunc = llvm::Function::Create(
      llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
      llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_module_dtor"), &TheModule);

  llvm::BasicBlock *DtorEntryBB =
      llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc);
  CGBuilderTy DtorBuilder(CGM, Context);
  DtorBuilder.SetInsertPoint(DtorEntryBB);

  Address GpuBinaryAddr(GpuBinaryHandle, CharUnits::fromQuantity(
                                             GpuBinaryHandle->getAlignment()));
  auto HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr);
  // There is only one HIP fat binary per linked module, however there are
  // multiple destructor functions. Make sure the fat binary is unregistered
  // only once.
  if (CGM.getLangOpts().HIP) {
    llvm::BasicBlock *IfBlock =
        llvm::BasicBlock::Create(Context, "if", ModuleDtorFunc);
    llvm::BasicBlock *ExitBlock =
        llvm::BasicBlock::Create(Context, "exit", ModuleDtorFunc);
    llvm::Constant *Zero = llvm::Constant::getNullValue(HandleValue->getType());
    llvm::Value *NEZero = DtorBuilder.CreateICmpNE(HandleValue, Zero);
    DtorBuilder.CreateCondBr(NEZero, IfBlock, ExitBlock);

    DtorBuilder.SetInsertPoint(IfBlock);
    DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
    DtorBuilder.CreateStore(Zero, GpuBinaryAddr);
    DtorBuilder.CreateBr(ExitBlock);

    DtorBuilder.SetInsertPoint(ExitBlock);
  } else {
    DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
  }
  DtorBuilder.CreateRetVoid();
  return ModuleDtorFunc;
}

CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) {
  return new CGNVCUDARuntime(CGM);
}

void CGNVCUDARuntime::internalizeDeviceSideVar(
    const VarDecl *D, llvm::GlobalValue::LinkageTypes &Linkage) {
  // For -fno-gpu-rdc, host-side shadows of external declarations of device-side
  // global variables become internal definitions. These have to be internal in
  // order to prevent name conflicts with global host variables of the same
  // name in different TUs.
  //
  // For -fgpu-rdc, the shadow variables should not be internalized because
  // they may be accessed by a different TU.
  if (CGM.getLangOpts().GPURelocatableDeviceCode)
    return;
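
  // For example, with -fno-gpu-rdc the host-side shadow of
  // `extern __device__ int v;` becomes an internal definition in each TU that
  // references it, so two such TUs do not clash at link time.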

  // __shared__ variables are odd. Shadows do get created, but
  // they are not registered with the CUDA runtime, so they
  // can't really be used to access their device-side
  // counterparts. It's not clear yet whether it's nvcc's bug or
  // a feature, but we've got to do the same for compatibility.
  if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
      D->hasAttr<CUDASharedAttr>() ||
      D->getType()->isCUDADeviceBuiltinSurfaceType() ||
      D->getType()->isCUDADeviceBuiltinTextureType()) {
    Linkage = llvm::GlobalValue::InternalLinkage;
  }
}

void CGNVCUDARuntime::handleVarRegistration(const VarDecl *D,
                                            llvm::GlobalVariable &GV) {
  if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>()) {
    // Shadow variables and their properties must be registered with the CUDA
    // runtime. Skip extern global variables, which will be registered in
    // the TU where they are defined.
    //
    // Don't register a C++17 inline variable. The local symbol can be
    // discarded and referencing a discarded local symbol from outside the
    // comdat (__cuda_register_globals) is disallowed by the ELF spec.
    // TODO: Reject __device__ constexpr and __device__ inline in Sema.
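    // For example, a C++17 `inline __device__ int x;` is therefore not
    // registered here.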
    if (!D->hasExternalStorage() && !D->isInline())
      registerDeviceVar(D, GV, !D->hasDefinition(),
                        D->hasAttr<CUDAConstantAttr>());
  } else if (D->getType()->isCUDADeviceBuiltinSurfaceType() ||
             D->getType()->isCUDADeviceBuiltinTextureType()) {
    // Builtin surfaces and textures and their template arguments are
    // also registered with CUDA runtime.
    const ClassTemplateSpecializationDecl *TD =
        cast<ClassTemplateSpecializationDecl>(
            D->getType()->getAs<RecordType>()->getDecl());
    const TemplateArgumentList &Args = TD->getTemplateArgs();
    if (TD->hasAttr<CUDADeviceBuiltinSurfaceTypeAttr>()) {
      assert(Args.size() == 2 &&
             "Unexpected number of template arguments of CUDA device "
             "builtin surface type.");
      auto SurfType = Args[1].getAsIntegral();
      if (!D->hasExternalStorage())
        registerDeviceSurf(D, GV, !D->hasDefinition(), SurfType.getSExtValue());
    } else {
      assert(Args.size() == 3 &&
             "Unexpected number of template arguments of CUDA device "
             "builtin texture type.");
      auto TexType = Args[1].getAsIntegral();
      auto Normalized = Args[2].getAsIntegral();
      if (!D->hasExternalStorage())
        registerDeviceTex(D, GV, !D->hasDefinition(), TexType.getSExtValue(),
                          Normalized.getZExtValue());
    }
  }
}