1 //===----- CGCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This provides a class for CUDA code generation targeting the NVIDIA CUDA
10 // runtime library.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "CGCUDARuntime.h"
15 #include "CodeGenFunction.h"
16 #include "CodeGenModule.h"
17 #include "clang/AST/Decl.h"
18 #include "clang/Basic/Cuda.h"
19 #include "clang/CodeGen/CodeGenABITypes.h"
20 #include "clang/CodeGen/ConstantInitBuilder.h"
21 #include "llvm/IR/BasicBlock.h"
22 #include "llvm/IR/Constants.h"
23 #include "llvm/IR/DerivedTypes.h"
24 #include "llvm/IR/ReplaceConstant.h"
25 #include "llvm/Support/Format.h"
26 
27 using namespace clang;
28 using namespace CodeGen;
29 
30 namespace {
31 constexpr unsigned CudaFatMagic = 0x466243b1;
32 constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF"
33 
class CGNVCUDARuntime : public CGCUDARuntime {

private:
  // Cached host-side LLVM types used throughout the runtime-call emission.
  llvm::IntegerType *IntTy, *SizeTy;
  llvm::Type *VoidTy;
  llvm::PointerType *CharPtrTy, *VoidPtrTy, *VoidPtrPtrTy;

  /// Convenience reference to LLVM Context
  llvm::LLVMContext &Context;
  /// Convenience reference to the current module
  llvm::Module &TheModule;
  /// Keeps track of kernel launch stubs emitted in this module
  struct KernelInfo {
    llvm::Function *Kernel; // Host-side stub emitted for the kernel.
    const Decl *D;          // Declaration the stub was generated from.
  };
  llvm::SmallVector<KernelInfo, 16> EmittedKernels;
  /// A device-side global (variable, surface, or texture) that must be
  /// registered with the runtime, together with its registration flags.
  struct VarInfo {
    llvm::GlobalVariable *Var;
    const VarDecl *D;
    DeviceVarFlags Flags;
  };
  llvm::SmallVector<VarInfo, 16> DeviceVars;
  /// Keeps track of variable containing handle of GPU binary. Populated by
  /// ModuleCtorFunction() and used to create corresponding cleanup calls in
  /// ModuleDtorFunction()
  llvm::GlobalVariable *GpuBinaryHandle = nullptr;
  /// Whether we generate relocatable device code.
  bool RelocatableDeviceCode;
  /// Mangle context for device.
  std::unique_ptr<MangleContext> DeviceMC;

  /// Returns the runtime function that copies one kernel argument into the
  /// launch stack (cudaSetupArgument / hipSetupArgument).
  llvm::FunctionCallee getSetupArgumentFn() const;
  /// Returns the legacy launch entry point (cudaLaunch / hipLaunchByPtr).
  llvm::FunctionCallee getLaunchFn() const;

  llvm::FunctionType *getRegisterGlobalsFnTy() const;
  llvm::FunctionType *getCallbackFnTy() const;
  llvm::FunctionType *getRegisterLinkedBinaryFnTy() const;
  /// Prepends "hip" (HIP mode) or "cuda" (otherwise) to FuncName.
  std::string addPrefixToName(StringRef FuncName) const;
  /// Prepends "__hip" (HIP mode) or "__cuda" (otherwise) to FuncName.
  std::string addUnderscoredPrefixToName(StringRef FuncName) const;

  /// Creates a function to register all kernel stubs generated in this module.
  llvm::Function *makeRegisterGlobalsFn();

  /// Helper function that generates a constant string and returns a pointer to
  /// the start of the string.  The result of this function can be used anywhere
  /// where the C code specifies const char*.
  llvm::Constant *makeConstantString(const std::string &Str,
                                     const std::string &Name = "",
                                     const std::string &SectionName = "",
                                     unsigned Alignment = 0) {
    llvm::Constant *Zeros[] = {llvm::ConstantInt::get(SizeTy, 0),
                               llvm::ConstantInt::get(SizeTy, 0)};
    auto ConstStr = CGM.GetAddrOfConstantCString(Str, Name.c_str());
    llvm::GlobalVariable *GV =
        cast<llvm::GlobalVariable>(ConstStr.getPointer());
    if (!SectionName.empty()) {
      GV->setSection(SectionName);
      // Mark the address as used, which makes sure that this section isn't
      // merged and we will really have it in the object file.
      GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None);
    }
    if (Alignment)
      GV->setAlignment(llvm::Align(Alignment));

    // GEP with two zero indices yields a pointer to the first character.
    return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(),
                                                ConstStr.getPointer(), Zeros);
  }

  /// Helper function that generates an empty dummy function returning void.
  llvm::Function *makeDummyFunction(llvm::FunctionType *FnTy) {
    assert(FnTy->getReturnType()->isVoidTy() &&
           "Can only generate dummy functions returning void!");
    llvm::Function *DummyFunc = llvm::Function::Create(
        FnTy, llvm::GlobalValue::InternalLinkage, "dummy", &TheModule);

    llvm::BasicBlock *DummyBlock =
        llvm::BasicBlock::Create(Context, "", DummyFunc);
    CGBuilderTy FuncBuilder(CGM, Context);
    FuncBuilder.SetInsertPoint(DummyBlock);
    FuncBuilder.CreateRetVoid();

    return DummyFunc;
  }

  void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args);
  void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args);
  std::string getDeviceSideName(const NamedDecl *ND) override;

  /// Records an ordinary device variable for later registration by
  /// makeRegisterGlobalsFn(). HIP __managed__ variables are flagged here.
  void registerDeviceVar(const VarDecl *VD, llvm::GlobalVariable &Var,
                         bool Extern, bool Constant) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Variable, Extern, Constant,
                           VD->hasAttr<HIPManagedAttr>(),
                           /*Normalized*/ false, 0}});
  }
  /// Records a surface reference for later registration.
  void registerDeviceSurf(const VarDecl *VD, llvm::GlobalVariable &Var,
                          bool Extern, int Type) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Surface, Extern, /*Constant*/ false,
                           /*Managed*/ false,
                           /*Normalized*/ false, Type}});
  }
  /// Records a texture reference for later registration.
  void registerDeviceTex(const VarDecl *VD, llvm::GlobalVariable &Var,
                         bool Extern, int Type, bool Normalized) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Texture, Extern, /*Constant*/ false,
                           /*Managed*/ false, Normalized, Type}});
  }

public:
  CGNVCUDARuntime(CodeGenModule &CGM);

  void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override;
  void handleVarRegistration(const VarDecl *VD,
                             llvm::GlobalVariable &Var) override;

  /// Creates module constructor function
  llvm::Function *makeModuleCtorFunction() override;
  /// Creates module destructor function
  llvm::Function *makeModuleDtorFunction() override;
  void
  internalizeDeviceSideVar(const VarDecl *D,
                           llvm::GlobalValue::LinkageTypes &Linkage) override;
};
162 
163 }
164 
165 std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const {
166   if (CGM.getLangOpts().HIP)
167     return ((Twine("hip") + Twine(FuncName)).str());
168   return ((Twine("cuda") + Twine(FuncName)).str());
169 }
170 std::string
171 CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const {
172   if (CGM.getLangOpts().HIP)
173     return ((Twine("__hip") + Twine(FuncName)).str());
174   return ((Twine("__cuda") + Twine(FuncName)).str());
175 }
176 
CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
    : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()),
      TheModule(CGM.getModule()),
      RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode),
      // NOTE(review): the mangle context is built from the aux target info —
      // presumably the device target during host compilation so that
      // getDeviceSideName() mangles with device-side rules; confirm.
      DeviceMC(CGM.getContext().createMangleContext(
          CGM.getContext().getAuxTargetInfo())) {
  CodeGen::CodeGenTypes &Types = CGM.getTypes();
  ASTContext &Ctx = CGM.getContext();

  // Cache frequently used host-side LLVM types.
  IntTy = CGM.IntTy;
  SizeTy = CGM.SizeTy;
  VoidTy = CGM.VoidTy;

  CharPtrTy = llvm::PointerType::getUnqual(Types.ConvertType(Ctx.CharTy));
  VoidPtrTy = cast<llvm::PointerType>(Types.ConvertType(Ctx.VoidPtrTy));
  VoidPtrPtrTy = VoidPtrTy->getPointerTo();
}
194 
195 llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const {
196   // cudaError_t cudaSetupArgument(void *, size_t, size_t)
197   llvm::Type *Params[] = {VoidPtrTy, SizeTy, SizeTy};
198   return CGM.CreateRuntimeFunction(
199       llvm::FunctionType::get(IntTy, Params, false),
200       addPrefixToName("SetupArgument"));
201 }
202 
203 llvm::FunctionCallee CGNVCUDARuntime::getLaunchFn() const {
204   if (CGM.getLangOpts().HIP) {
205     // hipError_t hipLaunchByPtr(char *);
206     return CGM.CreateRuntimeFunction(
207         llvm::FunctionType::get(IntTy, CharPtrTy, false), "hipLaunchByPtr");
208   } else {
209     // cudaError_t cudaLaunch(char *);
210     return CGM.CreateRuntimeFunction(
211         llvm::FunctionType::get(IntTy, CharPtrTy, false), "cudaLaunch");
212   }
213 }
214 
// Type of __{cuda|hip}_register_globals: void(void **GpuBinaryHandle).
llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy() const {
  return llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false);
}
218 
// Type of the callback passed to the linked-binary registration: void(void *).
llvm::FunctionType *CGNVCUDARuntime::getCallbackFnTy() const {
  return llvm::FunctionType::get(VoidTy, VoidPtrTy, false);
}
222 
223 llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy() const {
224   auto CallbackFnTy = getCallbackFnTy();
225   auto RegisterGlobalsFnTy = getRegisterGlobalsFnTy();
226   llvm::Type *Params[] = {RegisterGlobalsFnTy->getPointerTo(), VoidPtrTy,
227                           VoidPtrTy, CallbackFnTy->getPointerTo()};
228   return llvm::FunctionType::get(VoidTy, Params, false);
229 }
230 
231 std::string CGNVCUDARuntime::getDeviceSideName(const NamedDecl *ND) {
232   GlobalDecl GD;
233   // D could be either a kernel or a variable.
234   if (auto *FD = dyn_cast<FunctionDecl>(ND))
235     GD = GlobalDecl(FD, KernelReferenceKind::Kernel);
236   else
237     GD = GlobalDecl(ND);
238   std::string DeviceSideName;
239   if (DeviceMC->shouldMangleDeclName(ND)) {
240     SmallString<256> Buffer;
241     llvm::raw_svector_ostream Out(Buffer);
242     DeviceMC->mangleName(GD, Out);
243     DeviceSideName = std::string(Out.str());
244   } else
245     DeviceSideName = std::string(ND->getIdentifier()->getName());
246   return DeviceSideName;
247 }
248 
249 void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
250                                      FunctionArgList &Args) {
251   EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl});
252   if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
253                          CudaFeature::CUDA_USES_NEW_LAUNCH) ||
254       (CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI))
255     emitDeviceStubBodyNew(CGF, Args);
256   else
257     emitDeviceStubBodyLegacy(CGF, Args);
258 }
259 
// CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local
// array and kernels are launched using cudaLaunchKernel().
void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
                                            FunctionArgList &Args) {
  // Build the shadow stack entry at the very start of the function.

  // Calculate amount of space we will need for all arguments.  If we have no
  // args, allocate a single pointer so we still have a valid pointer to the
  // argument array that we can pass to runtime, even if it will be unused.
  Address KernelArgs = CGF.CreateTempAlloca(
      VoidPtrTy, CharUnits::fromQuantity(16), "kernel_args",
      llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size())));
  // Store pointers to the arguments in a locally allocated launch_args.
  for (unsigned i = 0; i < Args.size(); ++i) {
    llvm::Value* VarPtr = CGF.GetAddrOfLocalVar(Args[i]).getPointer();
    llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(VarPtr, VoidPtrTy);
    CGF.Builder.CreateDefaultAlignedStore(
        VoidVarPtr, CGF.Builder.CreateConstGEP1_32(KernelArgs.getPointer(), i));
  }

  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");

  // Lookup cudaLaunchKernel/hipLaunchKernel function.
  // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
  //                              void **args, size_t sharedMem,
  //                              cudaStream_t stream);
  // hipError_t hipLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
  //                            void **args, size_t sharedMem,
  //                            hipStream_t stream);
  // The declaration is looked up in the translation unit so that the emitted
  // call uses the exact parameter types (notably dim3) from the SDK headers.
  TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl();
  DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl);
  auto LaunchKernelName = addPrefixToName("LaunchKernel");
  IdentifierInfo &cudaLaunchKernelII =
      CGM.getContext().Idents.get(LaunchKernelName);
  FunctionDecl *cudaLaunchKernelFD = nullptr;
  for (const auto &Result : DC->lookup(&cudaLaunchKernelII)) {
    if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Result))
      cudaLaunchKernelFD = FD;
  }

  // Without the SDK declaration we cannot form the call; report and bail.
  if (cudaLaunchKernelFD == nullptr) {
    CGM.Error(CGF.CurFuncDecl->getLocation(),
              "Can't find declaration for " + LaunchKernelName);
    return;
  }
  // Create temporary dim3 grid_dim, block_dim.
  ParmVarDecl *GridDimParam = cudaLaunchKernelFD->getParamDecl(1);
  QualType Dim3Ty = GridDimParam->getType();
  Address GridDim =
      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "grid_dim");
  Address BlockDim =
      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim");
  Address ShmemSize =
      CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size");
  Address Stream =
      CGF.CreateTempAlloca(VoidPtrTy, CGM.getPointerAlign(), "stream");
  // Fetch the launch configuration into the temporaries via
  // __{cuda|hip}PopCallConfiguration (presumably pushed by the <<<...>>>
  // configuration call emitted elsewhere — confirm against the caller).
  llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy,
                              {/*gridDim=*/GridDim.getType(),
                               /*blockDim=*/BlockDim.getType(),
                               /*ShmemSize=*/ShmemSize.getType(),
                               /*Stream=*/Stream.getType()},
                              /*isVarArg=*/false),
      addUnderscoredPrefixToName("PopCallConfiguration"));

  CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn,
                              {GridDim.getPointer(), BlockDim.getPointer(),
                               ShmemSize.getPointer(), Stream.getPointer()});

  // Emit the call to cudaLaunch
  llvm::Value *Kernel = CGF.Builder.CreatePointerCast(CGF.CurFn, VoidPtrTy);
  CallArgList LaunchKernelArgs;
  LaunchKernelArgs.add(RValue::get(Kernel),
                       cudaLaunchKernelFD->getParamDecl(0)->getType());
  LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty);
  LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty);
  LaunchKernelArgs.add(RValue::get(KernelArgs.getPointer()),
                       cudaLaunchKernelFD->getParamDecl(3)->getType());
  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(ShmemSize)),
                       cudaLaunchKernelFD->getParamDecl(4)->getType());
  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(Stream)),
                       cudaLaunchKernelFD->getParamDecl(5)->getType());

  // Build the call with the ABI arrangement derived from the declaration.
  QualType QT = cudaLaunchKernelFD->getType();
  QualType CQT = QT.getCanonicalType();
  llvm::Type *Ty = CGM.getTypes().ConvertType(CQT);
  llvm::FunctionType *FTy = dyn_cast<llvm::FunctionType>(Ty);

  const CGFunctionInfo &FI =
      CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD);
  llvm::FunctionCallee cudaLaunchKernelFn =
      CGM.CreateRuntimeFunction(FTy, LaunchKernelName);
  CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(),
               LaunchKernelArgs);
  CGF.EmitBranch(EndBlock);

  CGF.EmitBlock(EndBlock);
}
358 
// Legacy (pre-CUDA-9.0) launch: push each argument onto the launch stack with
// cudaSetupArgument, then launch via cudaLaunch/hipLaunchByPtr using the stub
// function's address as the kernel handle.
void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF,
                                               FunctionArgList &Args) {
  // Emit a call to cudaSetupArgument for each arg in Args.
  llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn();
  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
  // Running byte offset of the current argument within the launch stack.
  CharUnits Offset = CharUnits::Zero();
  for (const VarDecl *A : Args) {
    auto TInfo = CGM.getContext().getTypeInfoInChars(A->getType());
    // Place each argument at its natural alignment.
    Offset = Offset.alignTo(TInfo.Align);
    llvm::Value *Args[] = {
        CGF.Builder.CreatePointerCast(CGF.GetAddrOfLocalVar(A).getPointer(),
                                      VoidPtrTy),
        llvm::ConstantInt::get(SizeTy, TInfo.Width.getQuantity()),
        llvm::ConstantInt::get(SizeTy, Offset.getQuantity()),
    };
    llvm::CallBase *CB = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args);
    // Continue to the next argument only while the runtime returns 0;
    // any other return value jumps straight to the end of the stub.
    llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0);
    llvm::Value *CBZero = CGF.Builder.CreateICmpEQ(CB, Zero);
    llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next");
    CGF.Builder.CreateCondBr(CBZero, NextBlock, EndBlock);
    CGF.EmitBlock(NextBlock);
    Offset += TInfo.Width;
  }

  // Emit the call to cudaLaunch
  llvm::FunctionCallee cudaLaunchFn = getLaunchFn();
  llvm::Value *Arg = CGF.Builder.CreatePointerCast(CGF.CurFn, CharPtrTy);
  CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg);
  CGF.EmitBranch(EndBlock);

  CGF.EmitBlock(EndBlock);
}
391 
// Replace the original variable Var with the address loaded from variable
// ManagedVar populated by HIP runtime.
static void replaceManagedVar(llvm::GlobalVariable *Var,
                              llvm::GlobalVariable *ManagedVar) {
  // Each work item is a use chain: element 0 is a direct user of Var, and
  // every later element is a user of the element before it. Chains grow only
  // through ConstantExpr users until they reach an Instruction.
  SmallVector<SmallVector<llvm::User *, 8>, 8> WorkList;
  for (auto &&VarUse : Var->uses()) {
    WorkList.push_back({VarUse.getUser()});
  }
  while (!WorkList.empty()) {
    auto &&WorkItem = WorkList.pop_back_val();
    auto *U = WorkItem.back();
    if (isa<llvm::ConstantExpr>(U)) {
      // Extend the chain through the constant expression: queue one new
      // work item per user of U (push user, copy chain, pop user again).
      for (auto &&UU : U->uses()) {
        WorkItem.push_back(UU.getUser());
        WorkList.push_back(WorkItem);
        WorkItem.pop_back();
      }
      continue;
    }
    if (auto *I = dyn_cast<llvm::Instruction>(U)) {
      // Reached an instruction: load the real address from ManagedVar just
      // before it, then rebuild the chain of constant expressions as
      // instructions operating on that loaded address.
      llvm::Value *OldV = Var;
      llvm::Instruction *NewV =
          new llvm::LoadInst(Var->getType(), ManagedVar, "ld.managed", false,
                             llvm::Align(Var->getAlignment()), I);
      WorkItem.pop_back();
      // Replace constant expressions directly or indirectly using the managed
      // variable with instructions.
      for (auto &&Op : WorkItem) {
        auto *CE = cast<llvm::ConstantExpr>(Op);
        auto *NewInst = llvm::createReplacementInstr(CE, I);
        NewInst->replaceUsesOfWith(OldV, NewV);
        OldV = CE;
        NewV = NewInst;
      }
      // Finally point the instruction itself at the rebuilt value.
      I->replaceUsesOfWith(OldV, NewV);
    } else {
      llvm_unreachable("Invalid use of managed variable");
    }
  }
}
432 
/// Creates a function that sets up state on the host side for CUDA objects that
/// have a presence on both the host and device sides. Specifically, registers
/// the host side of kernel functions and device global variables with the CUDA
/// runtime.
/// \code
/// void __cuda_register_globals(void** GpuBinaryHandle) {
///    __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...);
///    ...
///    __cudaRegisterFunction(GpuBinaryHandle,KernelM,...);
///    __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...);
///    ...
///    __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...);
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
  // No need to register anything
  if (EmittedKernels.empty() && DeviceVars.empty())
    return nullptr;

  llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
      getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_register_globals"), &TheModule);
  llvm::BasicBlock *EntryBB =
      llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc);
  CGBuilderTy Builder(CGM, Context);
  Builder.SetInsertPoint(EntryBB);

  // void __cudaRegisterFunction(void **, const char *, char *, const char *,
  //                             int, uint3*, uint3*, dim3*, dim3*, int*)
  llvm::Type *RegisterFuncParams[] = {
      VoidPtrPtrTy, CharPtrTy, CharPtrTy, CharPtrTy, IntTy,
      VoidPtrTy,    VoidPtrTy, VoidPtrTy, VoidPtrTy, IntTy->getPointerTo()};
  llvm::FunctionCallee RegisterFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy, RegisterFuncParams, false),
      addUnderscoredPrefixToName("RegisterFunction"));

  // Extract GpuBinaryHandle passed as the first argument passed to
  // __cuda_register_globals() and generate __cudaRegisterFunction() call for
  // each emitted kernel.
  llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin();
  for (auto &&I : EmittedKernels) {
    llvm::Constant *KernelName =
        makeConstantString(getDeviceSideName(cast<NamedDecl>(I.D)));
    llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy);
    // The device-side name is passed twice (as both name arguments); the
    // thread-limit and dim arguments are left as -1/null.
    llvm::Value *Args[] = {
        &GpuBinaryHandlePtr,
        Builder.CreateBitCast(I.Kernel, VoidPtrTy),
        KernelName,
        KernelName,
        llvm::ConstantInt::get(IntTy, -1),
        NullPtr,
        NullPtr,
        NullPtr,
        NullPtr,
        llvm::ConstantPointerNull::get(IntTy->getPointerTo())};
    Builder.CreateCall(RegisterFunc, Args);
  }

  llvm::Type *VarSizeTy = IntTy;
  // For HIP or CUDA 9.0+, device variable size is type of `size_t`.
  if (CGM.getLangOpts().HIP ||
      ToCudaVersion(CGM.getTarget().getSDKVersion()) >= CudaVersion::CUDA_90)
    VarSizeTy = SizeTy;

  // void __cudaRegisterVar(void **, char *, char *, const char *,
  //                        int, int, int, int)
  llvm::Type *RegisterVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy,
                                     CharPtrTy,    IntTy,     VarSizeTy,
                                     IntTy,        IntTy};
  llvm::FunctionCallee RegisterVar = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, RegisterVarParams, false),
      addUnderscoredPrefixToName("RegisterVar"));
  // void __hipRegisterManagedVar(void **, char *, char *, const char *,
  //                              size_t, unsigned)
  llvm::Type *RegisterManagedVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy,
                                            CharPtrTy,    VarSizeTy, IntTy};
  llvm::FunctionCallee RegisterManagedVar = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, RegisterManagedVarParams, false),
      addUnderscoredPrefixToName("RegisterManagedVar"));
  // void __cudaRegisterSurface(void **, const struct surfaceReference *,
  //                            const void **, const char *, int, int);
  llvm::FunctionCallee RegisterSurf = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(
          VoidTy, {VoidPtrPtrTy, VoidPtrTy, CharPtrTy, CharPtrTy, IntTy, IntTy},
          false),
      addUnderscoredPrefixToName("RegisterSurface"));
  // void __cudaRegisterTexture(void **, const struct textureReference *,
  //                            const void **, const char *, int, int, int)
  llvm::FunctionCallee RegisterTex = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(
          VoidTy,
          {VoidPtrPtrTy, VoidPtrTy, CharPtrTy, CharPtrTy, IntTy, IntTy, IntTy},
          false),
      addUnderscoredPrefixToName("RegisterTexture"));
  // Emit one registration call per recorded device global, dispatching on its
  // kind (plain/managed variable, surface, or texture).
  for (auto &&Info : DeviceVars) {
    llvm::GlobalVariable *Var = Info.Var;
    llvm::Constant *VarName = makeConstantString(getDeviceSideName(Info.D));
    switch (Info.Flags.getKind()) {
    case DeviceVarFlags::Variable: {
      uint64_t VarSize =
          CGM.getDataLayout().getTypeAllocSize(Var->getValueType());
      if (Info.Flags.isManaged()) {
        // Managed variables are accessed through an extra indirection: create
        // a pointer-typed global <name>.managed and rewrite all uses of Var to
        // load through it (see replaceManagedVar). Presumably the HIP runtime
        // populates the .managed global with the managed address — confirm.
        auto ManagedVar = new llvm::GlobalVariable(
            CGM.getModule(), Var->getType(),
            /*isConstant=*/false, Var->getLinkage(),
            /*Init=*/llvm::ConstantPointerNull::get(Var->getType()),
            Twine(Var->getName() + ".managed"), /*InsertBefore=*/nullptr,
            llvm::GlobalVariable::NotThreadLocal);
        replaceManagedVar(Var, ManagedVar);
        llvm::Value *Args[] = {
            &GpuBinaryHandlePtr,
            Builder.CreateBitCast(ManagedVar, VoidPtrTy),
            Builder.CreateBitCast(Var, VoidPtrTy),
            VarName,
            llvm::ConstantInt::get(VarSizeTy, VarSize),
            llvm::ConstantInt::get(IntTy, Var->getAlignment())};
        Builder.CreateCall(RegisterManagedVar, Args);
      } else {
        llvm::Value *Args[] = {
            &GpuBinaryHandlePtr,
            Builder.CreateBitCast(Var, VoidPtrTy),
            VarName,
            VarName,
            llvm::ConstantInt::get(IntTy, Info.Flags.isExtern()),
            llvm::ConstantInt::get(VarSizeTy, VarSize),
            llvm::ConstantInt::get(IntTy, Info.Flags.isConstant()),
            llvm::ConstantInt::get(IntTy, 0)};
        Builder.CreateCall(RegisterVar, Args);
      }
      break;
    }
    case DeviceVarFlags::Surface:
      Builder.CreateCall(
          RegisterSurf,
          {&GpuBinaryHandlePtr, Builder.CreateBitCast(Var, VoidPtrTy), VarName,
           VarName, llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
      break;
    case DeviceVarFlags::Texture:
      Builder.CreateCall(
          RegisterTex,
          {&GpuBinaryHandlePtr, Builder.CreateBitCast(Var, VoidPtrTy), VarName,
           VarName, llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isNormalized()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
      break;
    }
  }

  Builder.CreateRetVoid();
  return RegisterKernelsFunc;
}
585 
586 /// Creates a global constructor function for the module:
587 ///
588 /// For CUDA:
589 /// \code
590 /// void __cuda_module_ctor(void*) {
591 ///     Handle = __cudaRegisterFatBinary(GpuBinaryBlob);
592 ///     __cuda_register_globals(Handle);
593 /// }
594 /// \endcode
595 ///
596 /// For HIP:
597 /// \code
598 /// void __hip_module_ctor(void*) {
599 ///     if (__hip_gpubin_handle == 0) {
600 ///         __hip_gpubin_handle  = __hipRegisterFatBinary(GpuBinaryBlob);
601 ///         __hip_register_globals(__hip_gpubin_handle);
602 ///     }
603 /// }
604 /// \endcode
605 llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
606   bool IsHIP = CGM.getLangOpts().HIP;
607   bool IsCUDA = CGM.getLangOpts().CUDA;
608   // No need to generate ctors/dtors if there is no GPU binary.
609   StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName;
610   if (CudaGpuBinaryFileName.empty() && !IsHIP)
611     return nullptr;
612   if ((IsHIP || (IsCUDA && !RelocatableDeviceCode)) && EmittedKernels.empty() &&
613       DeviceVars.empty())
614     return nullptr;
615 
616   // void __{cuda|hip}_register_globals(void* handle);
617   llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
618   // We always need a function to pass in as callback. Create a dummy
619   // implementation if we don't need to register anything.
620   if (RelocatableDeviceCode && !RegisterGlobalsFunc)
621     RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy());
622 
623   // void ** __{cuda|hip}RegisterFatBinary(void *);
624   llvm::FunctionCallee RegisterFatbinFunc = CGM.CreateRuntimeFunction(
625       llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false),
626       addUnderscoredPrefixToName("RegisterFatBinary"));
627   // struct { int magic, int version, void * gpu_binary, void * dont_care };
628   llvm::StructType *FatbinWrapperTy =
629       llvm::StructType::get(IntTy, IntTy, VoidPtrTy, VoidPtrTy);
630 
631   // Register GPU binary with the CUDA runtime, store returned handle in a
632   // global variable and save a reference in GpuBinaryHandle to be cleaned up
633   // in destructor on exit. Then associate all known kernels with the GPU binary
634   // handle so CUDA runtime can figure out what to call on the GPU side.
635   std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary = nullptr;
636   if (!CudaGpuBinaryFileName.empty()) {
637     llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> CudaGpuBinaryOrErr =
638         llvm::MemoryBuffer::getFileOrSTDIN(CudaGpuBinaryFileName);
639     if (std::error_code EC = CudaGpuBinaryOrErr.getError()) {
640       CGM.getDiags().Report(diag::err_cannot_open_file)
641           << CudaGpuBinaryFileName << EC.message();
642       return nullptr;
643     }
644     CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get());
645   }
646 
647   llvm::Function *ModuleCtorFunc = llvm::Function::Create(
648       llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
649       llvm::GlobalValue::InternalLinkage,
650       addUnderscoredPrefixToName("_module_ctor"), &TheModule);
651   llvm::BasicBlock *CtorEntryBB =
652       llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc);
653   CGBuilderTy CtorBuilder(CGM, Context);
654 
655   CtorBuilder.SetInsertPoint(CtorEntryBB);
656 
657   const char *FatbinConstantName;
658   const char *FatbinSectionName;
659   const char *ModuleIDSectionName;
660   StringRef ModuleIDPrefix;
661   llvm::Constant *FatBinStr;
662   unsigned FatMagic;
663   if (IsHIP) {
664     FatbinConstantName = ".hip_fatbin";
665     FatbinSectionName = ".hipFatBinSegment";
666 
667     ModuleIDSectionName = "__hip_module_id";
668     ModuleIDPrefix = "__hip_";
669 
670     if (CudaGpuBinary) {
671       // If fatbin is available from early finalization, create a string
672       // literal containing the fat binary loaded from the given file.
673       const unsigned HIPCodeObjectAlign = 4096;
674       FatBinStr =
675           makeConstantString(std::string(CudaGpuBinary->getBuffer()), "",
676                              FatbinConstantName, HIPCodeObjectAlign);
677     } else {
678       // If fatbin is not available, create an external symbol
679       // __hip_fatbin in section .hip_fatbin. The external symbol is supposed
680       // to contain the fat binary but will be populated somewhere else,
681       // e.g. by lld through link script.
682       FatBinStr = new llvm::GlobalVariable(
683         CGM.getModule(), CGM.Int8Ty,
684         /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr,
685         "__hip_fatbin", nullptr,
686         llvm::GlobalVariable::NotThreadLocal);
687       cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName);
688     }
689 
690     FatMagic = HIPFatMagic;
691   } else {
692     if (RelocatableDeviceCode)
693       FatbinConstantName = CGM.getTriple().isMacOSX()
694                                ? "__NV_CUDA,__nv_relfatbin"
695                                : "__nv_relfatbin";
696     else
697       FatbinConstantName =
698           CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
699     // NVIDIA's cuobjdump looks for fatbins in this section.
700     FatbinSectionName =
701         CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";
702 
703     ModuleIDSectionName = CGM.getTriple().isMacOSX()
704                               ? "__NV_CUDA,__nv_module_id"
705                               : "__nv_module_id";
706     ModuleIDPrefix = "__nv_";
707 
708     // For CUDA, create a string literal containing the fat binary loaded from
709     // the given file.
710     FatBinStr = makeConstantString(std::string(CudaGpuBinary->getBuffer()), "",
711                                    FatbinConstantName, 8);
712     FatMagic = CudaFatMagic;
713   }
714 
715   // Create initialized wrapper structure that points to the loaded GPU binary
716   ConstantInitBuilder Builder(CGM);
717   auto Values = Builder.beginStruct(FatbinWrapperTy);
718   // Fatbin wrapper magic.
719   Values.addInt(IntTy, FatMagic);
720   // Fatbin version.
721   Values.addInt(IntTy, 1);
722   // Data.
723   Values.add(FatBinStr);
724   // Unused in fatbin v1.
725   Values.add(llvm::ConstantPointerNull::get(VoidPtrTy));
726   llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal(
727       addUnderscoredPrefixToName("_fatbin_wrapper"), CGM.getPointerAlign(),
728       /*constant*/ true);
729   FatbinWrapper->setSection(FatbinSectionName);
730 
731   // There is only one HIP fat binary per linked module, however there are
732   // multiple constructor functions. Make sure the fat binary is registered
733   // only once. The constructor functions are executed by the dynamic loader
734   // before the program gains control. The dynamic loader cannot execute the
735   // constructor functions concurrently since doing that would not guarantee
736   // thread safety of the loaded program. Therefore we can assume sequential
737   // execution of constructor functions here.
738   if (IsHIP) {
739     auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage :
740         llvm::GlobalValue::LinkOnceAnyLinkage;
741     llvm::BasicBlock *IfBlock =
742         llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc);
743     llvm::BasicBlock *ExitBlock =
744         llvm::BasicBlock::Create(Context, "exit", ModuleCtorFunc);
745     // The name, size, and initialization pattern of this variable is part
746     // of HIP ABI.
747     GpuBinaryHandle = new llvm::GlobalVariable(
748         TheModule, VoidPtrPtrTy, /*isConstant=*/false,
749         Linkage,
750         /*Initializer=*/llvm::ConstantPointerNull::get(VoidPtrPtrTy),
751         "__hip_gpubin_handle");
752     GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
753     // Prevent the weak symbol in different shared libraries being merged.
754     if (Linkage != llvm::GlobalValue::InternalLinkage)
755       GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility);
756     Address GpuBinaryAddr(
757         GpuBinaryHandle,
758         CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
759     {
760       auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
761       llvm::Constant *Zero =
762           llvm::Constant::getNullValue(HandleValue->getType());
763       llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue, Zero);
764       CtorBuilder.CreateCondBr(EQZero, IfBlock, ExitBlock);
765     }
766     {
767       CtorBuilder.SetInsertPoint(IfBlock);
768       // GpuBinaryHandle = __hipRegisterFatBinary(&FatbinWrapper);
769       llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
770           RegisterFatbinFunc,
771           CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
772       CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryAddr);
773       CtorBuilder.CreateBr(ExitBlock);
774     }
775     {
776       CtorBuilder.SetInsertPoint(ExitBlock);
777       // Call __hip_register_globals(GpuBinaryHandle);
778       if (RegisterGlobalsFunc) {
779         auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
780         CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue);
781       }
782     }
783   } else if (!RelocatableDeviceCode) {
784     // Register binary with CUDA runtime. This is substantially different in
785     // default mode vs. separate compilation!
786     // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
787     llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
788         RegisterFatbinFunc,
789         CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
790     GpuBinaryHandle = new llvm::GlobalVariable(
791         TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage,
792         llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle");
793     GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
794     CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
795                                    CGM.getPointerAlign());
796 
797     // Call __cuda_register_globals(GpuBinaryHandle);
798     if (RegisterGlobalsFunc)
799       CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);
800 
801     // Call __cudaRegisterFatBinaryEnd(Handle) if this CUDA version needs it.
802     if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
803                            CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) {
804       // void __cudaRegisterFatBinaryEnd(void **);
805       llvm::FunctionCallee RegisterFatbinEndFunc = CGM.CreateRuntimeFunction(
806           llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
807           "__cudaRegisterFatBinaryEnd");
808       CtorBuilder.CreateCall(RegisterFatbinEndFunc, RegisterFatbinCall);
809     }
810   } else {
811     // Generate a unique module ID.
812     SmallString<64> ModuleID;
813     llvm::raw_svector_ostream OS(ModuleID);
814     OS << ModuleIDPrefix << llvm::format("%" PRIx64, FatbinWrapper->getGUID());
815     llvm::Constant *ModuleIDConstant = makeConstantString(
816         std::string(ModuleID.str()), "", ModuleIDSectionName, 32);
817 
818     // Create an alias for the FatbinWrapper that nvcc will look for.
819     llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage,
820                               Twine("__fatbinwrap") + ModuleID, FatbinWrapper);
821 
822     // void __cudaRegisterLinkedBinary%ModuleID%(void (*)(void *), void *,
823     // void *, void (*)(void **))
824     SmallString<128> RegisterLinkedBinaryName("__cudaRegisterLinkedBinary");
825     RegisterLinkedBinaryName += ModuleID;
826     llvm::FunctionCallee RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction(
827         getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName);
828 
829     assert(RegisterGlobalsFunc && "Expecting at least dummy function!");
830     llvm::Value *Args[] = {RegisterGlobalsFunc,
831                            CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy),
832                            ModuleIDConstant,
833                            makeDummyFunction(getCallbackFnTy())};
834     CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args);
835   }
836 
837   // Create destructor and register it with atexit() the way NVCC does it. Doing
838   // it during regular destructor phase worked in CUDA before 9.2 but results in
839   // double-free in 9.2.
840   if (llvm::Function *CleanupFn = makeModuleDtorFunction()) {
841     // extern "C" int atexit(void (*f)(void));
842     llvm::FunctionType *AtExitTy =
843         llvm::FunctionType::get(IntTy, CleanupFn->getType(), false);
844     llvm::FunctionCallee AtExitFunc =
845         CGM.CreateRuntimeFunction(AtExitTy, "atexit", llvm::AttributeList(),
846                                   /*Local=*/true);
847     CtorBuilder.CreateCall(AtExitFunc, CleanupFn);
848   }
849 
850   CtorBuilder.CreateRetVoid();
851   return ModuleCtorFunc;
852 }
853 
854 /// Creates a global destructor function that unregisters the GPU code blob
855 /// registered by constructor.
856 ///
857 /// For CUDA:
858 /// \code
859 /// void __cuda_module_dtor(void*) {
860 ///     __cudaUnregisterFatBinary(Handle);
861 /// }
862 /// \endcode
863 ///
864 /// For HIP:
865 /// \code
866 /// void __hip_module_dtor(void*) {
867 ///     if (__hip_gpubin_handle) {
868 ///         __hipUnregisterFatBinary(__hip_gpubin_handle);
869 ///         __hip_gpubin_handle = 0;
870 ///     }
871 /// }
872 /// \endcode
873 llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
874   // No need for destructor if we don't have a handle to unregister.
875   if (!GpuBinaryHandle)
876     return nullptr;
877 
878   // void __cudaUnregisterFatBinary(void ** handle);
879   llvm::FunctionCallee UnregisterFatbinFunc = CGM.CreateRuntimeFunction(
880       llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
881       addUnderscoredPrefixToName("UnregisterFatBinary"));
882 
883   llvm::Function *ModuleDtorFunc = llvm::Function::Create(
884       llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
885       llvm::GlobalValue::InternalLinkage,
886       addUnderscoredPrefixToName("_module_dtor"), &TheModule);
887 
888   llvm::BasicBlock *DtorEntryBB =
889       llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc);
890   CGBuilderTy DtorBuilder(CGM, Context);
891   DtorBuilder.SetInsertPoint(DtorEntryBB);
892 
893   Address GpuBinaryAddr(GpuBinaryHandle, CharUnits::fromQuantity(
894                                              GpuBinaryHandle->getAlignment()));
895   auto HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr);
896   // There is only one HIP fat binary per linked module, however there are
897   // multiple destructor functions. Make sure the fat binary is unregistered
898   // only once.
899   if (CGM.getLangOpts().HIP) {
900     llvm::BasicBlock *IfBlock =
901         llvm::BasicBlock::Create(Context, "if", ModuleDtorFunc);
902     llvm::BasicBlock *ExitBlock =
903         llvm::BasicBlock::Create(Context, "exit", ModuleDtorFunc);
904     llvm::Constant *Zero = llvm::Constant::getNullValue(HandleValue->getType());
905     llvm::Value *NEZero = DtorBuilder.CreateICmpNE(HandleValue, Zero);
906     DtorBuilder.CreateCondBr(NEZero, IfBlock, ExitBlock);
907 
908     DtorBuilder.SetInsertPoint(IfBlock);
909     DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
910     DtorBuilder.CreateStore(Zero, GpuBinaryAddr);
911     DtorBuilder.CreateBr(ExitBlock);
912 
913     DtorBuilder.SetInsertPoint(ExitBlock);
914   } else {
915     DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
916   }
917   DtorBuilder.CreateRetVoid();
918   return ModuleDtorFunc;
919 }
920 
921 CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) {
922   return new CGNVCUDARuntime(CGM);
923 }
924 
925 void CGNVCUDARuntime::internalizeDeviceSideVar(
926     const VarDecl *D, llvm::GlobalValue::LinkageTypes &Linkage) {
927   // Host-side shadows of external declarations of device-side
928   // global variables become internal definitions. These have to
929   // be internal in order to prevent name conflicts with global
930   // host variables with the same name in a different TUs.
931   //
932   // __shared__ variables are odd. Shadows do get created, but
933   // they are not registered with the CUDA runtime, so they
934   // can't really be used to access their device-side
935   // counterparts. It's not clear yet whether it's nvcc's bug or
936   // a feature, but we've got to do the same for compatibility.
937   if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
938       D->hasAttr<CUDASharedAttr>() ||
939       D->getType()->isCUDADeviceBuiltinSurfaceType() ||
940       D->getType()->isCUDADeviceBuiltinTextureType()) {
941     Linkage = llvm::GlobalValue::InternalLinkage;
942   }
943 }
944 
945 void CGNVCUDARuntime::handleVarRegistration(const VarDecl *D,
946                                             llvm::GlobalVariable &GV) {
947   if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>()) {
948     // Shadow variables and their properties must be registered with CUDA
949     // runtime. Skip Extern global variables, which will be registered in
950     // the TU where they are defined.
951     //
952     // Don't register a C++17 inline variable. The local symbol can be
953     // discarded and referencing a discarded local symbol from outside the
954     // comdat (__cuda_register_globals) is disallowed by the ELF spec.
955     // TODO: Reject __device__ constexpr and __device__ inline in Sema.
956     if (!D->hasExternalStorage() && !D->isInline())
957       registerDeviceVar(D, GV, !D->hasDefinition(),
958                         D->hasAttr<CUDAConstantAttr>());
959   } else if (D->getType()->isCUDADeviceBuiltinSurfaceType() ||
960              D->getType()->isCUDADeviceBuiltinTextureType()) {
961     // Builtin surfaces and textures and their template arguments are
962     // also registered with CUDA runtime.
963     const ClassTemplateSpecializationDecl *TD =
964         cast<ClassTemplateSpecializationDecl>(
965             D->getType()->getAs<RecordType>()->getDecl());
966     const TemplateArgumentList &Args = TD->getTemplateArgs();
967     if (TD->hasAttr<CUDADeviceBuiltinSurfaceTypeAttr>()) {
968       assert(Args.size() == 2 &&
969              "Unexpected number of template arguments of CUDA device "
970              "builtin surface type.");
971       auto SurfType = Args[1].getAsIntegral();
972       if (!D->hasExternalStorage())
973         registerDeviceSurf(D, GV, !D->hasDefinition(), SurfType.getSExtValue());
974     } else {
975       assert(Args.size() == 3 &&
976              "Unexpected number of template arguments of CUDA device "
977              "builtin texture type.");
978       auto TexType = Args[1].getAsIntegral();
979       auto Normalized = Args[2].getAsIntegral();
980       if (!D->hasExternalStorage())
981         registerDeviceTex(D, GV, !D->hasDefinition(), TexType.getSExtValue(),
982                           Normalized.getZExtValue());
983     }
984   }
985 }
986