//===----- CGCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This provides a class for CUDA code generation targeting the NVIDIA CUDA
// runtime library.
//
//===----------------------------------------------------------------------===//

#include "CGCUDARuntime.h"
#include "CodeGenFunction.h"
#include "CodeGenModule.h"
#include "clang/AST/Decl.h"
#include "clang/Basic/Cuda.h"
#include "clang/CodeGen/CodeGenABITypes.h"
#include "clang/CodeGen/ConstantInitBuilder.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/ReplaceConstant.h"
#include "llvm/Support/Format.h"

using namespace clang;
using namespace CodeGen;

namespace {
constexpr unsigned CudaFatMagic = 0x466243b1;
constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF"

class CGNVCUDARuntime : public CGCUDARuntime {

private:
  llvm::IntegerType *IntTy, *SizeTy;
  llvm::Type *VoidTy;
  llvm::PointerType *CharPtrTy, *VoidPtrTy, *VoidPtrPtrTy;

  /// Convenience reference to LLVM Context
  llvm::LLVMContext &Context;
  /// Convenience reference to the current module
  llvm::Module &TheModule;
  /// Keeps track of kernel launch stubs and handles emitted in this module
  struct KernelInfo {
    llvm::Function *Kernel; // stub function to help launch kernel
    const Decl *D;
  };
  llvm::SmallVector<KernelInfo, 16> EmittedKernels;
  // Map a device stub function to a symbol for identifying the kernel in host
  // code. For CUDA, the symbol identifying the kernel is the same as the
  // device stub function. For HIP, they are different.
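  // E.g. for HIP (a minimal illustration, assuming HIP's "__device_stub__"
  // stub-naming scheme; see getKernelHandle()): for a kernel "foo", the stub
  // is emitted as "__device_stub__foo", while the handle is a separate global
  // named "foo" whose initializer is the stub's address.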
  llvm::DenseMap<llvm::Function *, llvm::GlobalValue *> KernelHandles;
  // Map a kernel handle to the kernel stub.
  llvm::DenseMap<llvm::GlobalValue *, llvm::Function *> KernelStubs;
  struct VarInfo {
    llvm::GlobalVariable *Var;
    const VarDecl *D;
    DeviceVarFlags Flags;
  };
  llvm::SmallVector<VarInfo, 16> DeviceVars;
  /// Keeps track of the variable containing the handle of the GPU binary.
  /// Populated by makeModuleCtorFunction() and used to create corresponding
  /// cleanup calls in makeModuleDtorFunction().
  llvm::GlobalVariable *GpuBinaryHandle = nullptr;
  /// Whether we generate relocatable device code.
  bool RelocatableDeviceCode;
  /// Mangle context for device.
  std::unique_ptr<MangleContext> DeviceMC;

  llvm::FunctionCallee getSetupArgumentFn() const;
  llvm::FunctionCallee getLaunchFn() const;

  llvm::FunctionType *getRegisterGlobalsFnTy() const;
  llvm::FunctionType *getCallbackFnTy() const;
  llvm::FunctionType *getRegisterLinkedBinaryFnTy() const;
  std::string addPrefixToName(StringRef FuncName) const;
  std::string addUnderscoredPrefixToName(StringRef FuncName) const;

  /// Creates a function to register all kernel stubs generated in this module.
  llvm::Function *makeRegisterGlobalsFn();

  /// Helper function that generates a constant string and returns a pointer to
  /// the start of the string.  The result of this function can be used
  /// anywhere the C code specifies a const char*.
  llvm::Constant *makeConstantString(const std::string &Str,
                                     const std::string &Name = "",
                                     const std::string &SectionName = "",
                                     unsigned Alignment = 0) {
    llvm::Constant *Zeros[] = {llvm::ConstantInt::get(SizeTy, 0),
                               llvm::ConstantInt::get(SizeTy, 0)};
    auto ConstStr = CGM.GetAddrOfConstantCString(Str, Name.c_str());
    llvm::GlobalVariable *GV =
        cast<llvm::GlobalVariable>(ConstStr.getPointer());
    if (!SectionName.empty()) {
      GV->setSection(SectionName);
      // Mark the address as used, which makes sure that this section isn't
      // merged and that it actually ends up in the object file.
      GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None);
    }
    if (Alignment)
      GV->setAlignment(llvm::Align(Alignment));

    return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(),
                                                ConstStr.getPointer(), Zeros);
  }

  /// Helper function that generates an empty dummy function returning void.
  llvm::Function *makeDummyFunction(llvm::FunctionType *FnTy) {
    assert(FnTy->getReturnType()->isVoidTy() &&
           "Can only generate dummy functions returning void!");
    llvm::Function *DummyFunc = llvm::Function::Create(
        FnTy, llvm::GlobalValue::InternalLinkage, "dummy", &TheModule);

    llvm::BasicBlock *DummyBlock =
        llvm::BasicBlock::Create(Context, "", DummyFunc);
    CGBuilderTy FuncBuilder(CGM, Context);
    FuncBuilder.SetInsertPoint(DummyBlock);
    FuncBuilder.CreateRetVoid();

    return DummyFunc;
  }

  void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args);
  void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args);
  std::string getDeviceSideName(const NamedDecl *ND) override;

  void registerDeviceVar(const VarDecl *VD, llvm::GlobalVariable &Var,
                         bool Extern, bool Constant) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Variable, Extern, Constant,
                           VD->hasAttr<HIPManagedAttr>(),
                           /*Normalized*/ false, 0}});
  }
  void registerDeviceSurf(const VarDecl *VD, llvm::GlobalVariable &Var,
                          bool Extern, int Type) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Surface, Extern, /*Constant*/ false,
                           /*Managed*/ false,
                           /*Normalized*/ false, Type}});
  }
  void registerDeviceTex(const VarDecl *VD, llvm::GlobalVariable &Var,
                         bool Extern, int Type, bool Normalized) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Texture, Extern, /*Constant*/ false,
                           /*Managed*/ false, Normalized, Type}});
  }

  /// Creates module constructor function
  llvm::Function *makeModuleCtorFunction();
  /// Creates module destructor function
  llvm::Function *makeModuleDtorFunction();
  /// Transform managed variables for device compilation.
  void transformManagedVars();

public:
  CGNVCUDARuntime(CodeGenModule &CGM);

  llvm::GlobalValue *getKernelHandle(llvm::Function *F, GlobalDecl GD) override;
  llvm::Function *getKernelStub(llvm::GlobalValue *Handle) override {
    auto Loc = KernelStubs.find(Handle);
    assert(Loc != KernelStubs.end());
    return Loc->second;
  }
  void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override;
  void handleVarRegistration(const VarDecl *VD,
                             llvm::GlobalVariable &Var) override;
  void
  internalizeDeviceSideVar(const VarDecl *D,
                           llvm::GlobalValue::LinkageTypes &Linkage) override;

  llvm::Function *finalizeModule() override;
};

}

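// Helpers producing runtime entry-point names for the current language.
// A minimal illustration (names taken from uses elsewhere in this file):
//   addPrefixToName("LaunchKernel")           -> "cudaLaunchKernel" or
//                                                "hipLaunchKernel"
//   addUnderscoredPrefixToName("RegisterVar") -> "__cudaRegisterVar" or
//                                                "__hipRegisterVar"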
std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const {
  if (CGM.getLangOpts().HIP)
    return ((Twine("hip") + Twine(FuncName)).str());
  return ((Twine("cuda") + Twine(FuncName)).str());
}
std::string
CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const {
  if (CGM.getLangOpts().HIP)
    return ((Twine("__hip") + Twine(FuncName)).str());
  return ((Twine("__cuda") + Twine(FuncName)).str());
}

CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
    : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()),
      TheModule(CGM.getModule()),
      RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode),
      DeviceMC(CGM.getContext().createMangleContext(
          CGM.getContext().getAuxTargetInfo())) {
  CodeGen::CodeGenTypes &Types = CGM.getTypes();
  ASTContext &Ctx = CGM.getContext();

  IntTy = CGM.IntTy;
  SizeTy = CGM.SizeTy;
  VoidTy = CGM.VoidTy;

  CharPtrTy = llvm::PointerType::getUnqual(Types.ConvertType(Ctx.CharTy));
  VoidPtrTy = cast<llvm::PointerType>(Types.ConvertType(Ctx.VoidPtrTy));
  VoidPtrPtrTy = VoidPtrTy->getPointerTo();
  if (CGM.getContext().getAuxTargetInfo()) {
    // If the host and device have different C++ ABIs, mark it as the device
    // mangle context so that the mangling needs to retrieve the additional
    // device lambda mangling number instead of the regular host one.
    DeviceMC->setDeviceMangleContext(
        CGM.getContext().getTargetInfo().getCXXABI().isMicrosoft() &&
        CGM.getContext().getAuxTargetInfo()->getCXXABI().isItaniumFamily());
  }
}

llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const {
  // cudaError_t cudaSetupArgument(void *, size_t, size_t)
  llvm::Type *Params[] = {VoidPtrTy, SizeTy, SizeTy};
  return CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy, Params, false),
      addPrefixToName("SetupArgument"));
}

llvm::FunctionCallee CGNVCUDARuntime::getLaunchFn() const {
  if (CGM.getLangOpts().HIP) {
    // hipError_t hipLaunchByPtr(char *);
    return CGM.CreateRuntimeFunction(
        llvm::FunctionType::get(IntTy, CharPtrTy, false), "hipLaunchByPtr");
  } else {
    // cudaError_t cudaLaunch(char *);
    return CGM.CreateRuntimeFunction(
        llvm::FunctionType::get(IntTy, CharPtrTy, false), "cudaLaunch");
  }
}

llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy() const {
  return llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false);
}

llvm::FunctionType *CGNVCUDARuntime::getCallbackFnTy() const {
  return llvm::FunctionType::get(VoidTy, VoidPtrTy, false);
}

llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy() const {
  auto CallbackFnTy = getCallbackFnTy();
  auto RegisterGlobalsFnTy = getRegisterGlobalsFnTy();
  llvm::Type *Params[] = {RegisterGlobalsFnTy->getPointerTo(), VoidPtrTy,
                          VoidPtrTy, CallbackFnTy->getPointerTo()};
  return llvm::FunctionType::get(VoidTy, Params, false);
}

std::string CGNVCUDARuntime::getDeviceSideName(const NamedDecl *ND) {
  GlobalDecl GD;
  // ND could be either a kernel or a variable.
  if (auto *FD = dyn_cast<FunctionDecl>(ND))
    GD = GlobalDecl(FD, KernelReferenceKind::Kernel);
  else
    GD = GlobalDecl(ND);
  std::string DeviceSideName;
  if (DeviceMC->shouldMangleDeclName(ND)) {
    SmallString<256> Buffer;
    llvm::raw_svector_ostream Out(Buffer);
    DeviceMC->mangleName(GD, Out);
    DeviceSideName = std::string(Out.str());
  } else
    DeviceSideName = std::string(ND->getIdentifier()->getName());

  // Make a unique name for the device-side static file-scope variable for HIP.
  if (CGM.getContext().shouldExternalizeStaticVar(ND) &&
      CGM.getLangOpts().GPURelocatableDeviceCode &&
      !CGM.getLangOpts().CUID.empty()) {
    SmallString<256> Buffer;
    llvm::raw_svector_ostream Out(Buffer);
    Out << DeviceSideName;
    CGM.printPostfixForExternalizedStaticVar(Out);
    DeviceSideName = std::string(Out.str());
  }
  return DeviceSideName;
}

void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
                                     FunctionArgList &Args) {
  EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl});
  if (auto *GV = dyn_cast<llvm::GlobalVariable>(KernelHandles[CGF.CurFn])) {
    GV->setLinkage(CGF.CurFn->getLinkage());
    GV->setInitializer(CGF.CurFn);
  }
  if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
                         CudaFeature::CUDA_USES_NEW_LAUNCH) ||
      (CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI))
    emitDeviceStubBodyNew(CGF, Args);
  else
    emitDeviceStubBodyLegacy(CGF, Args);
}

// CUDA 9.0+ uses a new way to launch kernels. Parameters are packed in a local
// array and kernels are launched using cudaLaunchKernel().
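//
// Roughly, the emitted stub body is equivalent to the following sketch
// (hypothetical kernel "void k(int a)"; names and casts are illustrative):
//
//   void __device_stub__k(int a) {
//     void *kernel_args[] = {&a};
//     dim3 grid_dim, block_dim; size_t shmem_size; void *stream;
//     __cudaPopCallConfiguration(&grid_dim, &block_dim, &shmem_size, &stream);
//     cudaLaunchKernel((void *)k_handle, grid_dim, block_dim, kernel_args,
//                      shmem_size, stream);
//   }
//
// where k_handle is the kernel handle (for CUDA, the stub function itself;
// for HIP, the global variable created in getKernelHandle()).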
void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
                                            FunctionArgList &Args) {
  // Calculate amount of space we will need for all arguments.  If we have no
  // args, allocate a single pointer so we still have a valid pointer to the
  // argument array that we can pass to the runtime, even if it will be unused.
  Address KernelArgs = CGF.CreateTempAlloca(
      VoidPtrTy, CharUnits::fromQuantity(16), "kernel_args",
      llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size())));
  // Store pointers to the arguments in the locally allocated kernel_args.
  for (unsigned i = 0; i < Args.size(); ++i) {
    llvm::Value *VarPtr = CGF.GetAddrOfLocalVar(Args[i]).getPointer();
    llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(VarPtr, VoidPtrTy);
    CGF.Builder.CreateDefaultAlignedStore(
        VoidVarPtr, CGF.Builder.CreateConstGEP1_32(KernelArgs.getPointer(), i));
  }

  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");

  // Look up the cudaLaunchKernel/hipLaunchKernel declaration.
  // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
  //                              void **args, size_t sharedMem,
  //                              cudaStream_t stream);
  // hipError_t hipLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
  //                            void **args, size_t sharedMem,
  //                            hipStream_t stream);
  TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl();
  DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl);
  auto LaunchKernelName = addPrefixToName("LaunchKernel");
  IdentifierInfo &cudaLaunchKernelII =
      CGM.getContext().Idents.get(LaunchKernelName);
  FunctionDecl *cudaLaunchKernelFD = nullptr;
  for (const auto &Result : DC->lookup(&cudaLaunchKernelII)) {
    if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Result))
      cudaLaunchKernelFD = FD;
  }

  if (cudaLaunchKernelFD == nullptr) {
    CGM.Error(CGF.CurFuncDecl->getLocation(),
              "Can't find declaration for " + LaunchKernelName);
    return;
  }
  // Create temporary dim3 grid_dim, block_dim.
  ParmVarDecl *GridDimParam = cudaLaunchKernelFD->getParamDecl(1);
  QualType Dim3Ty = GridDimParam->getType();
  Address GridDim =
      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "grid_dim");
  Address BlockDim =
      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim");
  Address ShmemSize =
      CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size");
  Address Stream =
      CGF.CreateTempAlloca(VoidPtrTy, CGM.getPointerAlign(), "stream");
  llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy,
                              {/*gridDim=*/GridDim.getType(),
                               /*blockDim=*/BlockDim.getType(),
                               /*ShmemSize=*/ShmemSize.getType(),
                               /*Stream=*/Stream.getType()},
                              /*isVarArg=*/false),
      addUnderscoredPrefixToName("PopCallConfiguration"));

  CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn,
                              {GridDim.getPointer(), BlockDim.getPointer(),
                               ShmemSize.getPointer(), Stream.getPointer()});

  // Emit the call to cudaLaunchKernel.
  llvm::Value *Kernel =
      CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn], VoidPtrTy);
  CallArgList LaunchKernelArgs;
  LaunchKernelArgs.add(RValue::get(Kernel),
                       cudaLaunchKernelFD->getParamDecl(0)->getType());
  LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty);
  LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty);
  LaunchKernelArgs.add(RValue::get(KernelArgs.getPointer()),
                       cudaLaunchKernelFD->getParamDecl(3)->getType());
  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(ShmemSize)),
                       cudaLaunchKernelFD->getParamDecl(4)->getType());
  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(Stream)),
                       cudaLaunchKernelFD->getParamDecl(5)->getType());

  QualType QT = cudaLaunchKernelFD->getType();
  QualType CQT = QT.getCanonicalType();
  llvm::Type *Ty = CGM.getTypes().ConvertType(CQT);
  llvm::FunctionType *FTy = dyn_cast<llvm::FunctionType>(Ty);

  const CGFunctionInfo &FI =
      CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD);
  llvm::FunctionCallee cudaLaunchKernelFn =
      CGM.CreateRuntimeFunction(FTy, LaunchKernelName);
  CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(),
               LaunchKernelArgs);
  CGF.EmitBranch(EndBlock);

  CGF.EmitBlock(EndBlock);
}

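// Pre-CUDA-9.0 launch: each argument is copied into the launch configuration
// with cudaSetupArgument() at its naturally aligned offset, and the kernel is
// then launched by handle. A rough sketch (same hypothetical kernel
// "void k(int a)" as above; 0 stands for cudaSuccess):
//
//   void __device_stub__k(int a) {
//     if (cudaSetupArgument(&a, sizeof(a), /*offset=*/0) == 0)
//       cudaLaunch((char *)k_handle);
//   }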
void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF,
                                               FunctionArgList &Args) {
  // Emit a call to cudaSetupArgument for each arg in Args.
  llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn();
  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
  CharUnits Offset = CharUnits::Zero();
  for (const VarDecl *A : Args) {
    auto TInfo = CGM.getContext().getTypeInfoInChars(A->getType());
    Offset = Offset.alignTo(TInfo.Align);
    llvm::Value *Args[] = {
        CGF.Builder.CreatePointerCast(CGF.GetAddrOfLocalVar(A).getPointer(),
                                      VoidPtrTy),
        llvm::ConstantInt::get(SizeTy, TInfo.Width.getQuantity()),
        llvm::ConstantInt::get(SizeTy, Offset.getQuantity()),
    };
    llvm::CallBase *CB = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args);
    llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0);
    llvm::Value *CBZero = CGF.Builder.CreateICmpEQ(CB, Zero);
    llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next");
    CGF.Builder.CreateCondBr(CBZero, NextBlock, EndBlock);
    CGF.EmitBlock(NextBlock);
    Offset += TInfo.Width;
  }

  // Emit the call to cudaLaunch.
  llvm::FunctionCallee cudaLaunchFn = getLaunchFn();
  llvm::Value *Arg =
      CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn], CharPtrTy);
  CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg);
  CGF.EmitBranch(EndBlock);

  CGF.EmitBlock(EndBlock);
}

// Replace the original variable Var with the address loaded from the variable
// ManagedVar populated by the HIP runtime.
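// Conceptually (an illustrative sketch for a managed variable "x" accessed as
// "x = 1"), each use of Var is rewritten to go through the runtime-populated
// pointer held in ManagedVar:
//
//   before:  x = 1;          // direct use of Var
//   after:   *x_ptr = 1;     // x_ptr is a load ("ld.managed") of ManagedVar
//
// Constant expressions that use Var directly or indirectly are first expanded
// into instructions so that the load can be inserted in front of each user.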
static void replaceManagedVar(llvm::GlobalVariable *Var,
                              llvm::GlobalVariable *ManagedVar) {
  SmallVector<SmallVector<llvm::User *, 8>, 8> WorkList;
  for (auto &&VarUse : Var->uses()) {
    WorkList.push_back({VarUse.getUser()});
  }
  while (!WorkList.empty()) {
    auto &&WorkItem = WorkList.pop_back_val();
    auto *U = WorkItem.back();
    if (isa<llvm::ConstantExpr>(U)) {
      for (auto &&UU : U->uses()) {
        WorkItem.push_back(UU.getUser());
        WorkList.push_back(WorkItem);
        WorkItem.pop_back();
      }
      continue;
    }
    if (auto *I = dyn_cast<llvm::Instruction>(U)) {
      llvm::Value *OldV = Var;
      llvm::Instruction *NewV =
          new llvm::LoadInst(Var->getType(), ManagedVar, "ld.managed", false,
                             llvm::Align(Var->getAlignment()), I);
      WorkItem.pop_back();
      // Replace constant expressions directly or indirectly using the managed
      // variable with instructions.
      for (auto &&Op : WorkItem) {
        auto *CE = cast<llvm::ConstantExpr>(Op);
        auto *NewInst = llvm::createReplacementInstr(CE, I);
        NewInst->replaceUsesOfWith(OldV, NewV);
        OldV = CE;
        NewV = NewInst;
      }
      I->replaceUsesOfWith(OldV, NewV);
    } else {
      llvm_unreachable("Invalid use of managed variable");
    }
  }
}

/// Creates a function that sets up state on the host side for CUDA objects that
/// have a presence on both the host and device sides. Specifically, registers
/// the host side of kernel functions and device global variables with the CUDA
/// runtime.
/// \code
/// void __cuda_register_globals(void** GpuBinaryHandle) {
///    __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...);
///    ...
///    __cudaRegisterFunction(GpuBinaryHandle,KernelM,...);
///    __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...);
///    ...
///    __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...);
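///    __cudaRegisterSurface(GpuBinaryHandle, Surf0, ...);
///    ...
///    __cudaRegisterTexture(GpuBinaryHandle, Tex0, ...);
///    ...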
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
  // No need to register anything
  if (EmittedKernels.empty() && DeviceVars.empty())
    return nullptr;

  llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
      getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_register_globals"), &TheModule);
  llvm::BasicBlock *EntryBB =
      llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc);
  CGBuilderTy Builder(CGM, Context);
  Builder.SetInsertPoint(EntryBB);

  // void __cudaRegisterFunction(void **, const char *, char *, const char *,
  //                             int, uint3*, uint3*, dim3*, dim3*, int*)
  llvm::Type *RegisterFuncParams[] = {
      VoidPtrPtrTy, CharPtrTy, CharPtrTy, CharPtrTy, IntTy,
      VoidPtrTy,    VoidPtrTy, VoidPtrTy, VoidPtrTy, IntTy->getPointerTo()};
  llvm::FunctionCallee RegisterFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy, RegisterFuncParams, false),
      addUnderscoredPrefixToName("RegisterFunction"));

  // Extract the GpuBinaryHandle passed as the first argument to
  // __cuda_register_globals() and generate a __cudaRegisterFunction() call
  // for each emitted kernel.
  llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin();
  for (auto &&I : EmittedKernels) {
    llvm::Constant *KernelName =
        makeConstantString(getDeviceSideName(cast<NamedDecl>(I.D)));
    llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy);
    llvm::Value *Args[] = {
        &GpuBinaryHandlePtr,
        Builder.CreateBitCast(KernelHandles[I.Kernel], VoidPtrTy),
        KernelName,
        KernelName,
        llvm::ConstantInt::get(IntTy, -1),
        NullPtr,
        NullPtr,
        NullPtr,
        NullPtr,
        llvm::ConstantPointerNull::get(IntTy->getPointerTo())};
    Builder.CreateCall(RegisterFunc, Args);
  }

  llvm::Type *VarSizeTy = IntTy;
  // For HIP or CUDA 9.0+, the device variable size is of type `size_t`.
  if (CGM.getLangOpts().HIP ||
      ToCudaVersion(CGM.getTarget().getSDKVersion()) >= CudaVersion::CUDA_90)
    VarSizeTy = SizeTy;

  // void __cudaRegisterVar(void **, char *, char *, const char *,
  //                        int, int, int, int)
  llvm::Type *RegisterVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy,
                                     CharPtrTy,    IntTy,     VarSizeTy,
                                     IntTy,        IntTy};
  llvm::FunctionCallee RegisterVar = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, RegisterVarParams, false),
      addUnderscoredPrefixToName("RegisterVar"));
  // void __hipRegisterManagedVar(void **, char *, char *, const char *,
  //                              size_t, unsigned)
  llvm::Type *RegisterManagedVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy,
                                            CharPtrTy,    VarSizeTy, IntTy};
  llvm::FunctionCallee RegisterManagedVar = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, RegisterManagedVarParams, false),
      addUnderscoredPrefixToName("RegisterManagedVar"));
  // void __cudaRegisterSurface(void **, const struct surfaceReference *,
  //                            const void **, const char *, int, int);
  llvm::FunctionCallee RegisterSurf = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(
          VoidTy, {VoidPtrPtrTy, VoidPtrTy, CharPtrTy, CharPtrTy, IntTy, IntTy},
          false),
      addUnderscoredPrefixToName("RegisterSurface"));
  // void __cudaRegisterTexture(void **, const struct textureReference *,
  //                            const void **, const char *, int, int, int)
  llvm::FunctionCallee RegisterTex = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(
          VoidTy,
          {VoidPtrPtrTy, VoidPtrTy, CharPtrTy, CharPtrTy, IntTy, IntTy, IntTy},
          false),
      addUnderscoredPrefixToName("RegisterTexture"));
  for (auto &&Info : DeviceVars) {
    llvm::GlobalVariable *Var = Info.Var;
    assert((!Var->isDeclaration() || Info.Flags.isManaged()) &&
           "External variables should not show up here, except HIP managed "
           "variables");
    llvm::Constant *VarName = makeConstantString(getDeviceSideName(Info.D));
    switch (Info.Flags.getKind()) {
    case DeviceVarFlags::Variable: {
      uint64_t VarSize =
          CGM.getDataLayout().getTypeAllocSize(Var->getValueType());
      if (Info.Flags.isManaged()) {
        auto ManagedVar = new llvm::GlobalVariable(
            CGM.getModule(), Var->getType(),
            /*isConstant=*/false, Var->getLinkage(),
            /*Init=*/Var->isDeclaration()
                ? nullptr
                : llvm::ConstantPointerNull::get(Var->getType()),
            /*Name=*/"", /*InsertBefore=*/nullptr,
            llvm::GlobalVariable::NotThreadLocal);
        ManagedVar->setDSOLocal(Var->isDSOLocal());
        ManagedVar->setVisibility(Var->getVisibility());
        ManagedVar->setExternallyInitialized(true);
        ManagedVar->takeName(Var);
        Var->setName(Twine(ManagedVar->getName() + ".managed"));
        replaceManagedVar(Var, ManagedVar);
        llvm::Value *Args[] = {
            &GpuBinaryHandlePtr,
            Builder.CreateBitCast(ManagedVar, VoidPtrTy),
            Builder.CreateBitCast(Var, VoidPtrTy),
            VarName,
            llvm::ConstantInt::get(VarSizeTy, VarSize),
            llvm::ConstantInt::get(IntTy, Var->getAlignment())};
        if (!Var->isDeclaration())
          Builder.CreateCall(RegisterManagedVar, Args);
      } else {
        llvm::Value *Args[] = {
            &GpuBinaryHandlePtr,
            Builder.CreateBitCast(Var, VoidPtrTy),
            VarName,
            VarName,
            llvm::ConstantInt::get(IntTy, Info.Flags.isExtern()),
            llvm::ConstantInt::get(VarSizeTy, VarSize),
            llvm::ConstantInt::get(IntTy, Info.Flags.isConstant()),
            llvm::ConstantInt::get(IntTy, 0)};
        Builder.CreateCall(RegisterVar, Args);
      }
      break;
    }
    case DeviceVarFlags::Surface:
      Builder.CreateCall(
          RegisterSurf,
          {&GpuBinaryHandlePtr, Builder.CreateBitCast(Var, VoidPtrTy), VarName,
           VarName, llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
      break;
    case DeviceVarFlags::Texture:
      Builder.CreateCall(
          RegisterTex,
          {&GpuBinaryHandlePtr, Builder.CreateBitCast(Var, VoidPtrTy), VarName,
           VarName, llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isNormalized()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
      break;
    }
  }

  Builder.CreateRetVoid();
  return RegisterKernelsFunc;
}

/// Creates a global constructor function for the module:
///
/// For CUDA:
/// \code
/// void __cuda_module_ctor(void*) {
///     Handle = __cudaRegisterFatBinary(GpuBinaryBlob);
///     __cuda_register_globals(Handle);
/// }
/// \endcode
///
/// For HIP:
/// \code
/// void __hip_module_ctor(void*) {
///     if (__hip_gpubin_handle == 0) {
///         __hip_gpubin_handle = __hipRegisterFatBinary(GpuBinaryBlob);
///         __hip_register_globals(__hip_gpubin_handle);
///     }
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
  bool IsHIP = CGM.getLangOpts().HIP;
  bool IsCUDA = CGM.getLangOpts().CUDA;
  // No need to generate ctors/dtors if there is no GPU binary.
  StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName;
  if (CudaGpuBinaryFileName.empty() && !IsHIP)
    return nullptr;
  if ((IsHIP || (IsCUDA && !RelocatableDeviceCode)) && EmittedKernels.empty() &&
      DeviceVars.empty())
    return nullptr;

  // void __{cuda|hip}_register_globals(void* handle);
  llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
  // We always need a function to pass in as callback. Create a dummy
  // implementation if we don't need to register anything.
  if (RelocatableDeviceCode && !RegisterGlobalsFunc)
    RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy());

  // void ** __{cuda|hip}RegisterFatBinary(void *);
  llvm::FunctionCallee RegisterFatbinFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false),
      addUnderscoredPrefixToName("RegisterFatBinary"));
  // struct { int magic; int version; void *gpu_binary; void *dont_care; };
  llvm::StructType *FatbinWrapperTy =
      llvm::StructType::get(IntTy, IntTy, VoidPtrTy, VoidPtrTy);
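
  // For reference, on CUDA (non-macOS) the initialized wrapper emitted below
  // ends up roughly as the following sketch ("__cuda_fatbin_data" is an
  // illustrative name for the FatBinStr symbol):
  //
  //   struct Wrapper { int magic; int version; void *data; void *unused; };
  //   static const Wrapper __cuda_fatbin_wrapper
  //       __attribute__((section(".nvFatBinSegment"))) = {
  //           0x466243b1, /*version=*/1, &__cuda_fatbin_data, nullptr};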

  // Register the GPU binary with the CUDA runtime, store the returned handle
  // in a global variable and save a reference in GpuBinaryHandle to be cleaned
  // up in the destructor on exit. Then associate all known kernels with the
  // GPU binary handle so the CUDA runtime can figure out what to call on the
  // GPU side.
  std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary = nullptr;
  if (!CudaGpuBinaryFileName.empty()) {
    llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> CudaGpuBinaryOrErr =
        llvm::MemoryBuffer::getFileOrSTDIN(CudaGpuBinaryFileName);
    if (std::error_code EC = CudaGpuBinaryOrErr.getError()) {
      CGM.getDiags().Report(diag::err_cannot_open_file)
          << CudaGpuBinaryFileName << EC.message();
      return nullptr;
    }
    CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get());
  }

  llvm::Function *ModuleCtorFunc = llvm::Function::Create(
      llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
      llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_module_ctor"), &TheModule);
  llvm::BasicBlock *CtorEntryBB =
      llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc);
  CGBuilderTy CtorBuilder(CGM, Context);

  CtorBuilder.SetInsertPoint(CtorEntryBB);

  const char *FatbinConstantName;
  const char *FatbinSectionName;
  const char *ModuleIDSectionName;
  StringRef ModuleIDPrefix;
  llvm::Constant *FatBinStr;
  unsigned FatMagic;
  if (IsHIP) {
    FatbinConstantName = ".hip_fatbin";
    FatbinSectionName = ".hipFatBinSegment";

    ModuleIDSectionName = "__hip_module_id";
    ModuleIDPrefix = "__hip_";

    if (CudaGpuBinary) {
      // If the fatbin is available from early finalization, create a string
      // literal containing the fat binary loaded from the given file.
      const unsigned HIPCodeObjectAlign = 4096;
      FatBinStr =
          makeConstantString(std::string(CudaGpuBinary->getBuffer()), "",
                             FatbinConstantName, HIPCodeObjectAlign);
    } else {
      // If the fatbin is not available, create an external symbol
      // __hip_fatbin in section .hip_fatbin. The external symbol is supposed
      // to contain the fat binary but will be populated somewhere else,
      // e.g. by lld through a linker script.
      FatBinStr = new llvm::GlobalVariable(
        CGM.getModule(), CGM.Int8Ty,
        /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr,
        "__hip_fatbin", nullptr,
        llvm::GlobalVariable::NotThreadLocal);
      cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName);
    }

    FatMagic = HIPFatMagic;
  } else {
    if (RelocatableDeviceCode)
      FatbinConstantName = CGM.getTriple().isMacOSX()
                               ? "__NV_CUDA,__nv_relfatbin"
                               : "__nv_relfatbin";
    else
      FatbinConstantName =
          CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
    // NVIDIA's cuobjdump looks for fatbins in this section.
    FatbinSectionName =
        CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";

    ModuleIDSectionName = CGM.getTriple().isMacOSX()
                              ? "__NV_CUDA,__nv_module_id"
                              : "__nv_module_id";
    ModuleIDPrefix = "__nv_";

    // For CUDA, create a string literal containing the fat binary loaded from
    // the given file.
    FatBinStr = makeConstantString(std::string(CudaGpuBinary->getBuffer()), "",
                                   FatbinConstantName, 8);
    FatMagic = CudaFatMagic;
  }

  // Create the initialized wrapper structure that points to the loaded GPU
  // binary.
  ConstantInitBuilder Builder(CGM);
  auto Values = Builder.beginStruct(FatbinWrapperTy);
  // Fatbin wrapper magic.
  Values.addInt(IntTy, FatMagic);
  // Fatbin version.
  Values.addInt(IntTy, 1);
  // Data.
  Values.add(FatBinStr);
  // Unused in fatbin v1.
  Values.add(llvm::ConstantPointerNull::get(VoidPtrTy));
  llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal(
      addUnderscoredPrefixToName("_fatbin_wrapper"), CGM.getPointerAlign(),
      /*constant*/ true);
  FatbinWrapper->setSection(FatbinSectionName);

  // There is only one HIP fat binary per linked module, however there are
  // multiple constructor functions. Make sure the fat binary is registered
  // only once. The constructor functions are executed by the dynamic loader
  // before the program gains control. The dynamic loader cannot execute the
  // constructor functions concurrently since doing that would not guarantee
  // thread safety of the loaded program. Therefore we can assume sequential
  // execution of constructor functions here.
  if (IsHIP) {
    auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage
                                 : llvm::GlobalValue::LinkOnceAnyLinkage;
    llvm::BasicBlock *IfBlock =
        llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc);
    llvm::BasicBlock *ExitBlock =
        llvm::BasicBlock::Create(Context, "exit", ModuleCtorFunc);
    // The name, size, and initialization pattern of this variable are part
    // of the HIP ABI.
    GpuBinaryHandle = new llvm::GlobalVariable(
        TheModule, VoidPtrPtrTy, /*isConstant=*/false,
        Linkage,
        /*Initializer=*/llvm::ConstantPointerNull::get(VoidPtrPtrTy),
        "__hip_gpubin_handle");
    GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
    // Prevent the weak symbol in different shared libraries from being merged.
    if (Linkage != llvm::GlobalValue::InternalLinkage)
      GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility);
    Address GpuBinaryAddr(
        GpuBinaryHandle,
        CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
    {
      auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
      llvm::Constant *Zero =
          llvm::Constant::getNullValue(HandleValue->getType());
      llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue, Zero);
      CtorBuilder.CreateCondBr(EQZero, IfBlock, ExitBlock);
    }
    {
      CtorBuilder.SetInsertPoint(IfBlock);
      // GpuBinaryHandle = __hipRegisterFatBinary(&FatbinWrapper);
      llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
          RegisterFatbinFunc,
          CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
      CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryAddr);
      CtorBuilder.CreateBr(ExitBlock);
    }
    {
      CtorBuilder.SetInsertPoint(ExitBlock);
      // Call __hip_register_globals(GpuBinaryHandle);
      if (RegisterGlobalsFunc) {
        auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
        CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue);
      }
    }
  } else if (!RelocatableDeviceCode) {
    // Register the binary with the CUDA runtime. This is substantially
    // different in default mode vs. separate compilation!
    // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
    llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
        RegisterFatbinFunc,
        CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
    GpuBinaryHandle = new llvm::GlobalVariable(
        TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage,
        llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle");
    GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
    CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
                                   CGM.getPointerAlign());

    // Call __cuda_register_globals(GpuBinaryHandle);
    if (RegisterGlobalsFunc)
      CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);

    // Call __cudaRegisterFatBinaryEnd(Handle) if this CUDA version needs it.
    if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
                           CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) {
      // void __cudaRegisterFatBinaryEnd(void **);
      llvm::FunctionCallee RegisterFatbinEndFunc = CGM.CreateRuntimeFunction(
          llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
          "__cudaRegisterFatBinaryEnd");
      CtorBuilder.CreateCall(RegisterFatbinEndFunc, RegisterFatbinCall);
    }
  } else {
    // Generate a unique module ID.
    SmallString<64> ModuleID;
    llvm::raw_svector_ostream OS(ModuleID);
    OS << ModuleIDPrefix << llvm::format("%" PRIx64, FatbinWrapper->getGUID());
    llvm::Constant *ModuleIDConstant = makeConstantString(
        std::string(ModuleID.str()), "", ModuleIDSectionName, 32);

    // Create an alias for the FatbinWrapper that nvcc will look for.
    llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage,
                              Twine("__fatbinwrap") + ModuleID, FatbinWrapper);

    // void __cudaRegisterLinkedBinary%ModuleID%(void (*)(void *), void *,
    // void *, void (*)(void **))
    SmallString<128> RegisterLinkedBinaryName("__cudaRegisterLinkedBinary");
    RegisterLinkedBinaryName += ModuleID;
    llvm::FunctionCallee RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction(
        getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName);

    assert(RegisterGlobalsFunc && "Expecting at least dummy function!");
    llvm::Value *Args[] = {RegisterGlobalsFunc,
                           CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy),
                           ModuleIDConstant,
                           makeDummyFunction(getCallbackFnTy())};
    CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args);
  }

  // Create a destructor and register it with atexit() the way NVCC does it.
  // Doing it during the regular destructor phase worked in CUDA before 9.2,
  // but results in a double-free in 9.2.
  if (llvm::Function *CleanupFn = makeModuleDtorFunction()) {
    // extern "C" int atexit(void (*f)(void));
    llvm::FunctionType *AtExitTy =
        llvm::FunctionType::get(IntTy, CleanupFn->getType(), false);
    llvm::FunctionCallee AtExitFunc =
        CGM.CreateRuntimeFunction(AtExitTy, "atexit", llvm::AttributeList(),
                                  /*Local=*/true);
    CtorBuilder.CreateCall(AtExitFunc, CleanupFn);
  }

  CtorBuilder.CreateRetVoid();
  return ModuleCtorFunc;
}

/// Creates a global destructor function that unregisters the GPU code blob
/// registered by the constructor.
///
/// For CUDA:
/// \code
/// void __cuda_module_dtor(void*) {
///     __cudaUnregisterFatBinary(Handle);
/// }
/// \endcode
///
/// For HIP:
/// \code
/// void __hip_module_dtor(void*) {
///     if (__hip_gpubin_handle) {
///         __hipUnregisterFatBinary(__hip_gpubin_handle);
///         __hip_gpubin_handle = 0;
///     }
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
  // No need for a destructor if we don't have a handle to unregister.
  if (!GpuBinaryHandle)
    return nullptr;

  // void __cudaUnregisterFatBinary(void ** handle);
  llvm::FunctionCallee UnregisterFatbinFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
      addUnderscoredPrefixToName("UnregisterFatBinary"));

  llvm::Function *ModuleDtorFunc = llvm::Function::Create(
      llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
      llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_module_dtor"), &TheModule);

  llvm::BasicBlock *DtorEntryBB =
      llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc);
  CGBuilderTy DtorBuilder(CGM, Context);
  DtorBuilder.SetInsertPoint(DtorEntryBB);

  Address GpuBinaryAddr(GpuBinaryHandle, CharUnits::fromQuantity(
                                             GpuBinaryHandle->getAlignment()));
  auto HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr);
  // There is only one HIP fat binary per linked module, however there are
  // multiple destructor functions. Make sure the fat binary is unregistered
  // only once.
  if (CGM.getLangOpts().HIP) {
    llvm::BasicBlock *IfBlock =
        llvm::BasicBlock::Create(Context, "if", ModuleDtorFunc);
    llvm::BasicBlock *ExitBlock =
        llvm::BasicBlock::Create(Context, "exit", ModuleDtorFunc);
    llvm::Constant *Zero = llvm::Constant::getNullValue(HandleValue->getType());
    llvm::Value *NEZero = DtorBuilder.CreateICmpNE(HandleValue, Zero);
    DtorBuilder.CreateCondBr(NEZero, IfBlock, ExitBlock);

    DtorBuilder.SetInsertPoint(IfBlock);
    DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
    DtorBuilder.CreateStore(Zero, GpuBinaryAddr);
    DtorBuilder.CreateBr(ExitBlock);

    DtorBuilder.SetInsertPoint(ExitBlock);
  } else {
    DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
  }
  DtorBuilder.CreateRetVoid();
  return ModuleDtorFunc;
}

CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) {
  return new CGNVCUDARuntime(CGM);
}

void CGNVCUDARuntime::internalizeDeviceSideVar(
    const VarDecl *D, llvm::GlobalValue::LinkageTypes &Linkage) {
  // For -fno-gpu-rdc, host-side shadows of external declarations of device-side
  // global variables become internal definitions. These have to be internal in
  // order to prevent name conflicts with global host variables with the same
  // name in different TUs.
  //
  // For -fgpu-rdc, the shadow variables should not be internalized because
  // they may be accessed by a different TU.
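  //
  // E.g. (illustrative): under -fno-gpu-rdc the host-side shadow of
  // "__device__ int v;" is emitted with internal linkage, so two TUs that
  // each emit such a shadow do not clash at link time.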
  if (CGM.getLangOpts().GPURelocatableDeviceCode)
    return;

  // __shared__ variables are odd. Shadows do get created, but
  // they are not registered with the CUDA runtime, so they
  // can't really be used to access their device-side
  // counterparts. It's not clear yet whether it's nvcc's bug or
  // a feature, but we've got to do the same for compatibility.
  if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
      D->hasAttr<CUDASharedAttr>() ||
      D->getType()->isCUDADeviceBuiltinSurfaceType() ||
      D->getType()->isCUDADeviceBuiltinTextureType()) {
    Linkage = llvm::GlobalValue::InternalLinkage;
  }
}

void CGNVCUDARuntime::handleVarRegistration(const VarDecl *D,
                                            llvm::GlobalVariable &GV) {
  if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>()) {
    // Shadow variables and their properties must be registered with the CUDA
    // runtime. Skip extern global variables, which will be registered in
    // the TU where they are defined.
    //
    // Don't register a C++17 inline variable. The local symbol can be
    // discarded and referencing a discarded local symbol from outside the
    // comdat (__cuda_register_globals) is disallowed by the ELF spec.
    // TODO: Reject __device__ constexpr and __device__ inline in Sema.
    // HIP managed variables always need to be recorded in device and host
    // compilations for transformation.
    if ((!D->hasExternalStorage() && !D->isInline()) ||
        D->hasAttr<HIPManagedAttr>()) {
      registerDeviceVar(D, GV, !D->hasDefinition(),
                        D->hasAttr<CUDAConstantAttr>());
    }
  } else if (D->getType()->isCUDADeviceBuiltinSurfaceType() ||
             D->getType()->isCUDADeviceBuiltinTextureType()) {
    // Builtin surfaces and textures and their template arguments are
    // also registered with the CUDA runtime.
    const ClassTemplateSpecializationDecl *TD =
        cast<ClassTemplateSpecializationDecl>(
            D->getType()->getAs<RecordType>()->getDecl());
    const TemplateArgumentList &Args = TD->getTemplateArgs();
    if (TD->hasAttr<CUDADeviceBuiltinSurfaceTypeAttr>()) {
      assert(Args.size() == 2 &&
             "Unexpected number of template arguments of CUDA device "
             "builtin surface type.");
      auto SurfType = Args[1].getAsIntegral();
      if (!D->hasExternalStorage())
        registerDeviceSurf(D, GV, !D->hasDefinition(), SurfType.getSExtValue());
    } else {
      assert(Args.size() == 3 &&
             "Unexpected number of template arguments of CUDA device "
             "builtin texture type.");
      auto TexType = Args[1].getAsIntegral();
      auto Normalized = Args[2].getAsIntegral();
      if (!D->hasExternalStorage())
        registerDeviceTex(D, GV, !D->hasDefinition(), TexType.getSExtValue(),
                          Normalized.getZExtValue());
    }
  }
}

// Transform managed variables into pointers to managed variables in device
// code. Each use of the original managed variable is replaced by a load from
// the transformed managed variable. The transformed managed variable contains
// the address of managed memory which will be allocated by the runtime.
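//
// E.g. (conceptual sketch for "__managed__ int x;"): after the transform the
// global named "x" holds a pointer that the runtime fills in with the managed
// allocation's address, the original definition is renamed "x.managed", and
// device-side accesses to x load that pointer first (see replaceManagedVar()).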
void CGNVCUDARuntime::transformManagedVars() {
  for (auto &&Info : DeviceVars) {
    llvm::GlobalVariable *Var = Info.Var;
    if (Info.Flags.getKind() == DeviceVarFlags::Variable &&
        Info.Flags.isManaged()) {
      auto ManagedVar = new llvm::GlobalVariable(
          CGM.getModule(), Var->getType(),
          /*isConstant=*/false, Var->getLinkage(),
          /*Init=*/Var->isDeclaration()
              ? nullptr
              : llvm::ConstantPointerNull::get(Var->getType()),
          /*Name=*/"", /*InsertBefore=*/nullptr,
          llvm::GlobalVariable::NotThreadLocal,
          CGM.getContext().getTargetAddressSpace(LangAS::cuda_device));
      ManagedVar->setDSOLocal(Var->isDSOLocal());
      ManagedVar->setVisibility(Var->getVisibility());
      ManagedVar->setExternallyInitialized(true);
      replaceManagedVar(Var, ManagedVar);
      ManagedVar->takeName(Var);
      Var->setName(Twine(ManagedVar->getName()) + ".managed");
      // Keep managed variables even if they are not used in device code since
      // they need to be allocated by the runtime.
      if (!Var->isDeclaration()) {
        assert(!ManagedVar->isDeclaration());
        CGM.addCompilerUsedGlobal(Var);
        CGM.addCompilerUsedGlobal(ManagedVar);
      }
    }
  }
}

// Returns the module constructor to be added.
llvm::Function *CGNVCUDARuntime::finalizeModule() {
  if (CGM.getLangOpts().CUDAIsDevice) {
    transformManagedVars();
    return nullptr;
  }
  return makeModuleCtorFunction();
}

llvm::GlobalValue *CGNVCUDARuntime::getKernelHandle(llvm::Function *F,
                                                    GlobalDecl GD) {
  auto Loc = KernelHandles.find(F);
  if (Loc != KernelHandles.end())
    return Loc->second;

  if (!CGM.getLangOpts().HIP) {
    KernelHandles[F] = F;
    KernelStubs[F] = F;
    return F;
  }

  auto *Var = new llvm::GlobalVariable(
      TheModule, F->getType(), /*isConstant=*/true, F->getLinkage(),
      /*Initializer=*/nullptr,
      CGM.getMangledName(
          GD.getWithKernelReferenceKind(KernelReferenceKind::Kernel)));
  Var->setAlignment(CGM.getPointerAlign().getAsAlign());
  Var->setDSOLocal(F->isDSOLocal());
  Var->setVisibility(F->getVisibility());
  KernelHandles[F] = Var;
  KernelStubs[Var] = F;
  return Var;
}