1 //===----- CGCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This provides a class for CUDA code generation targeting the NVIDIA CUDA 10 // runtime library. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "CGCUDARuntime.h" 15 #include "CodeGenFunction.h" 16 #include "CodeGenModule.h" 17 #include "clang/AST/Decl.h" 18 #include "clang/Basic/Cuda.h" 19 #include "clang/CodeGen/CodeGenABITypes.h" 20 #include "clang/CodeGen/ConstantInitBuilder.h" 21 #include "llvm/IR/BasicBlock.h" 22 #include "llvm/IR/Constants.h" 23 #include "llvm/IR/DerivedTypes.h" 24 #include "llvm/IR/ReplaceConstant.h" 25 #include "llvm/Support/Format.h" 26 27 using namespace clang; 28 using namespace CodeGen; 29 30 namespace { 31 constexpr unsigned CudaFatMagic = 0x466243b1; 32 constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF" 33 34 class CGNVCUDARuntime : public CGCUDARuntime { 35 36 private: 37 llvm::IntegerType *IntTy, *SizeTy; 38 llvm::Type *VoidTy; 39 llvm::PointerType *CharPtrTy, *VoidPtrTy, *VoidPtrPtrTy; 40 41 /// Convenience reference to LLVM Context 42 llvm::LLVMContext &Context; 43 /// Convenience reference to the current module 44 llvm::Module &TheModule; 45 /// Keeps track of kernel launch stubs emitted in this module 46 struct KernelInfo { 47 llvm::Function *Kernel; 48 const Decl *D; 49 }; 50 llvm::SmallVector<KernelInfo, 16> EmittedKernels; 51 struct VarInfo { 52 llvm::GlobalVariable *Var; 53 const VarDecl *D; 54 DeviceVarFlags Flags; 55 }; 56 llvm::SmallVector<VarInfo, 16> DeviceVars; 57 /// Keeps track of variable containing handle of GPU binary. Populated by 58 /// ModuleCtorFunction() and used to create corresponding cleanup calls in 59 /// ModuleDtorFunction() 60 llvm::GlobalVariable *GpuBinaryHandle = nullptr; 61 /// Whether we generate relocatable device code. 62 bool RelocatableDeviceCode; 63 /// Mangle context for device. 64 std::unique_ptr<MangleContext> DeviceMC; 65 66 llvm::FunctionCallee getSetupArgumentFn() const; 67 llvm::FunctionCallee getLaunchFn() const; 68 69 llvm::FunctionType *getRegisterGlobalsFnTy() const; 70 llvm::FunctionType *getCallbackFnTy() const; 71 llvm::FunctionType *getRegisterLinkedBinaryFnTy() const; 72 std::string addPrefixToName(StringRef FuncName) const; 73 std::string addUnderscoredPrefixToName(StringRef FuncName) const; 74 75 /// Creates a function to register all kernel stubs generated in this module. 76 llvm::Function *makeRegisterGlobalsFn(); 77 78 /// Helper function that generates a constant string and returns a pointer to 79 /// the start of the string. The result of this function can be used anywhere 80 /// where the C code specifies const char*. 81 llvm::Constant *makeConstantString(const std::string &Str, 82 const std::string &Name = "", 83 const std::string &SectionName = "", 84 unsigned Alignment = 0) { 85 llvm::Constant *Zeros[] = {llvm::ConstantInt::get(SizeTy, 0), 86 llvm::ConstantInt::get(SizeTy, 0)}; 87 auto ConstStr = CGM.GetAddrOfConstantCString(Str, Name.c_str()); 88 llvm::GlobalVariable *GV = 89 cast<llvm::GlobalVariable>(ConstStr.getPointer()); 90 if (!SectionName.empty()) { 91 GV->setSection(SectionName); 92 // Mark the address as used which make sure that this section isn't 93 // merged and we will really have it in the object file. 94 GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None); 95 } 96 if (Alignment) 97 GV->setAlignment(llvm::Align(Alignment)); 98 99 return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(), 100 ConstStr.getPointer(), Zeros); 101 } 102 103 /// Helper function that generates an empty dummy function returning void. 104 llvm::Function *makeDummyFunction(llvm::FunctionType *FnTy) { 105 assert(FnTy->getReturnType()->isVoidTy() && 106 "Can only generate dummy functions returning void!"); 107 llvm::Function *DummyFunc = llvm::Function::Create( 108 FnTy, llvm::GlobalValue::InternalLinkage, "dummy", &TheModule); 109 110 llvm::BasicBlock *DummyBlock = 111 llvm::BasicBlock::Create(Context, "", DummyFunc); 112 CGBuilderTy FuncBuilder(CGM, Context); 113 FuncBuilder.SetInsertPoint(DummyBlock); 114 FuncBuilder.CreateRetVoid(); 115 116 return DummyFunc; 117 } 118 119 void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args); 120 void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args); 121 std::string getDeviceSideName(const NamedDecl *ND) override; 122 123 void registerDeviceVar(const VarDecl *VD, llvm::GlobalVariable &Var, 124 bool Extern, bool Constant) { 125 DeviceVars.push_back({&Var, 126 VD, 127 {DeviceVarFlags::Variable, Extern, Constant, 128 VD->hasAttr<HIPManagedAttr>(), 129 /*Normalized*/ false, 0}}); 130 } 131 void registerDeviceSurf(const VarDecl *VD, llvm::GlobalVariable &Var, 132 bool Extern, int Type) { 133 DeviceVars.push_back({&Var, 134 VD, 135 {DeviceVarFlags::Surface, Extern, /*Constant*/ false, 136 /*Managed*/ false, 137 /*Normalized*/ false, Type}}); 138 } 139 void registerDeviceTex(const VarDecl *VD, llvm::GlobalVariable &Var, 140 bool Extern, int Type, bool Normalized) { 141 DeviceVars.push_back({&Var, 142 VD, 143 {DeviceVarFlags::Texture, Extern, /*Constant*/ false, 144 /*Managed*/ false, Normalized, Type}}); 145 } 146 147 /// Creates module constructor function 148 llvm::Function *makeModuleCtorFunction(); 149 /// Creates module destructor function 150 llvm::Function *makeModuleDtorFunction(); 151 /// Transform managed variables for device compilation. 152 void transformManagedVars(); 153 154 public: 155 CGNVCUDARuntime(CodeGenModule &CGM); 156 157 void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override; 158 void handleVarRegistration(const VarDecl *VD, 159 llvm::GlobalVariable &Var) override; 160 void 161 internalizeDeviceSideVar(const VarDecl *D, 162 llvm::GlobalValue::LinkageTypes &Linkage) override; 163 164 llvm::Function *finalizeModule() override; 165 }; 166 167 } 168 169 std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const { 170 if (CGM.getLangOpts().HIP) 171 return ((Twine("hip") + Twine(FuncName)).str()); 172 return ((Twine("cuda") + Twine(FuncName)).str()); 173 } 174 std::string 175 CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const { 176 if (CGM.getLangOpts().HIP) 177 return ((Twine("__hip") + Twine(FuncName)).str()); 178 return ((Twine("__cuda") + Twine(FuncName)).str()); 179 } 180 181 CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM) 182 : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()), 183 TheModule(CGM.getModule()), 184 RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode), 185 DeviceMC(CGM.getContext().createMangleContext( 186 CGM.getContext().getAuxTargetInfo())) { 187 CodeGen::CodeGenTypes &Types = CGM.getTypes(); 188 ASTContext &Ctx = CGM.getContext(); 189 190 IntTy = CGM.IntTy; 191 SizeTy = CGM.SizeTy; 192 VoidTy = CGM.VoidTy; 193 194 CharPtrTy = llvm::PointerType::getUnqual(Types.ConvertType(Ctx.CharTy)); 195 VoidPtrTy = cast<llvm::PointerType>(Types.ConvertType(Ctx.VoidPtrTy)); 196 VoidPtrPtrTy = VoidPtrTy->getPointerTo(); 197 if (CGM.getContext().getAuxTargetInfo()) { 198 // If the host and device have different C++ ABIs, mark it as the device 199 // mangle context so that the mangling needs to retrieve the additonal 200 // device lambda mangling number instead of the regular host one. 201 DeviceMC->setDeviceMangleContext( 202 CGM.getContext().getTargetInfo().getCXXABI().isMicrosoft() && 203 CGM.getContext().getAuxTargetInfo()->getCXXABI().isItaniumFamily()); 204 } 205 } 206 207 llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const { 208 // cudaError_t cudaSetupArgument(void *, size_t, size_t) 209 llvm::Type *Params[] = {VoidPtrTy, SizeTy, SizeTy}; 210 return CGM.CreateRuntimeFunction( 211 llvm::FunctionType::get(IntTy, Params, false), 212 addPrefixToName("SetupArgument")); 213 } 214 215 llvm::FunctionCallee CGNVCUDARuntime::getLaunchFn() const { 216 if (CGM.getLangOpts().HIP) { 217 // hipError_t hipLaunchByPtr(char *); 218 return CGM.CreateRuntimeFunction( 219 llvm::FunctionType::get(IntTy, CharPtrTy, false), "hipLaunchByPtr"); 220 } else { 221 // cudaError_t cudaLaunch(char *); 222 return CGM.CreateRuntimeFunction( 223 llvm::FunctionType::get(IntTy, CharPtrTy, false), "cudaLaunch"); 224 } 225 } 226 227 llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy() const { 228 return llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false); 229 } 230 231 llvm::FunctionType *CGNVCUDARuntime::getCallbackFnTy() const { 232 return llvm::FunctionType::get(VoidTy, VoidPtrTy, false); 233 } 234 235 llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy() const { 236 auto CallbackFnTy = getCallbackFnTy(); 237 auto RegisterGlobalsFnTy = getRegisterGlobalsFnTy(); 238 llvm::Type *Params[] = {RegisterGlobalsFnTy->getPointerTo(), VoidPtrTy, 239 VoidPtrTy, CallbackFnTy->getPointerTo()}; 240 return llvm::FunctionType::get(VoidTy, Params, false); 241 } 242 243 std::string CGNVCUDARuntime::getDeviceSideName(const NamedDecl *ND) { 244 GlobalDecl GD; 245 // D could be either a kernel or a variable. 246 if (auto *FD = dyn_cast<FunctionDecl>(ND)) 247 GD = GlobalDecl(FD, KernelReferenceKind::Kernel); 248 else 249 GD = GlobalDecl(ND); 250 std::string DeviceSideName; 251 if (DeviceMC->shouldMangleDeclName(ND)) { 252 SmallString<256> Buffer; 253 llvm::raw_svector_ostream Out(Buffer); 254 DeviceMC->mangleName(GD, Out); 255 DeviceSideName = std::string(Out.str()); 256 } else 257 DeviceSideName = std::string(ND->getIdentifier()->getName()); 258 return DeviceSideName; 259 } 260 261 void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF, 262 FunctionArgList &Args) { 263 EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl}); 264 if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(), 265 CudaFeature::CUDA_USES_NEW_LAUNCH) || 266 (CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI)) 267 emitDeviceStubBodyNew(CGF, Args); 268 else 269 emitDeviceStubBodyLegacy(CGF, Args); 270 } 271 272 // CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local 273 // array and kernels are launched using cudaLaunchKernel(). 274 void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF, 275 FunctionArgList &Args) { 276 // Build the shadow stack entry at the very start of the function. 277 278 // Calculate amount of space we will need for all arguments. If we have no 279 // args, allocate a single pointer so we still have a valid pointer to the 280 // argument array that we can pass to runtime, even if it will be unused. 281 Address KernelArgs = CGF.CreateTempAlloca( 282 VoidPtrTy, CharUnits::fromQuantity(16), "kernel_args", 283 llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size()))); 284 // Store pointers to the arguments in a locally allocated launch_args. 285 for (unsigned i = 0; i < Args.size(); ++i) { 286 llvm::Value* VarPtr = CGF.GetAddrOfLocalVar(Args[i]).getPointer(); 287 llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(VarPtr, VoidPtrTy); 288 CGF.Builder.CreateDefaultAlignedStore( 289 VoidVarPtr, CGF.Builder.CreateConstGEP1_32(KernelArgs.getPointer(), i)); 290 } 291 292 llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end"); 293 294 // Lookup cudaLaunchKernel/hipLaunchKernel function. 295 // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, 296 // void **args, size_t sharedMem, 297 // cudaStream_t stream); 298 // hipError_t hipLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, 299 // void **args, size_t sharedMem, 300 // hipStream_t stream); 301 TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl(); 302 DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl); 303 auto LaunchKernelName = addPrefixToName("LaunchKernel"); 304 IdentifierInfo &cudaLaunchKernelII = 305 CGM.getContext().Idents.get(LaunchKernelName); 306 FunctionDecl *cudaLaunchKernelFD = nullptr; 307 for (const auto &Result : DC->lookup(&cudaLaunchKernelII)) { 308 if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Result)) 309 cudaLaunchKernelFD = FD; 310 } 311 312 if (cudaLaunchKernelFD == nullptr) { 313 CGM.Error(CGF.CurFuncDecl->getLocation(), 314 "Can't find declaration for " + LaunchKernelName); 315 return; 316 } 317 // Create temporary dim3 grid_dim, block_dim. 318 ParmVarDecl *GridDimParam = cudaLaunchKernelFD->getParamDecl(1); 319 QualType Dim3Ty = GridDimParam->getType(); 320 Address GridDim = 321 CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "grid_dim"); 322 Address BlockDim = 323 CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim"); 324 Address ShmemSize = 325 CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size"); 326 Address Stream = 327 CGF.CreateTempAlloca(VoidPtrTy, CGM.getPointerAlign(), "stream"); 328 llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction( 329 llvm::FunctionType::get(IntTy, 330 {/*gridDim=*/GridDim.getType(), 331 /*blockDim=*/BlockDim.getType(), 332 /*ShmemSize=*/ShmemSize.getType(), 333 /*Stream=*/Stream.getType()}, 334 /*isVarArg=*/false), 335 addUnderscoredPrefixToName("PopCallConfiguration")); 336 337 CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn, 338 {GridDim.getPointer(), BlockDim.getPointer(), 339 ShmemSize.getPointer(), Stream.getPointer()}); 340 341 // Emit the call to cudaLaunch 342 llvm::Value *Kernel = CGF.Builder.CreatePointerCast(CGF.CurFn, VoidPtrTy); 343 CallArgList LaunchKernelArgs; 344 LaunchKernelArgs.add(RValue::get(Kernel), 345 cudaLaunchKernelFD->getParamDecl(0)->getType()); 346 LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty); 347 LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty); 348 LaunchKernelArgs.add(RValue::get(KernelArgs.getPointer()), 349 cudaLaunchKernelFD->getParamDecl(3)->getType()); 350 LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(ShmemSize)), 351 cudaLaunchKernelFD->getParamDecl(4)->getType()); 352 LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(Stream)), 353 cudaLaunchKernelFD->getParamDecl(5)->getType()); 354 355 QualType QT = cudaLaunchKernelFD->getType(); 356 QualType CQT = QT.getCanonicalType(); 357 llvm::Type *Ty = CGM.getTypes().ConvertType(CQT); 358 llvm::FunctionType *FTy = dyn_cast<llvm::FunctionType>(Ty); 359 360 const CGFunctionInfo &FI = 361 CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD); 362 llvm::FunctionCallee cudaLaunchKernelFn = 363 CGM.CreateRuntimeFunction(FTy, LaunchKernelName); 364 CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(), 365 LaunchKernelArgs); 366 CGF.EmitBranch(EndBlock); 367 368 CGF.EmitBlock(EndBlock); 369 } 370 371 void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF, 372 FunctionArgList &Args) { 373 // Emit a call to cudaSetupArgument for each arg in Args. 374 llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn(); 375 llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end"); 376 CharUnits Offset = CharUnits::Zero(); 377 for (const VarDecl *A : Args) { 378 auto TInfo = CGM.getContext().getTypeInfoInChars(A->getType()); 379 Offset = Offset.alignTo(TInfo.Align); 380 llvm::Value *Args[] = { 381 CGF.Builder.CreatePointerCast(CGF.GetAddrOfLocalVar(A).getPointer(), 382 VoidPtrTy), 383 llvm::ConstantInt::get(SizeTy, TInfo.Width.getQuantity()), 384 llvm::ConstantInt::get(SizeTy, Offset.getQuantity()), 385 }; 386 llvm::CallBase *CB = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args); 387 llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0); 388 llvm::Value *CBZero = CGF.Builder.CreateICmpEQ(CB, Zero); 389 llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next"); 390 CGF.Builder.CreateCondBr(CBZero, NextBlock, EndBlock); 391 CGF.EmitBlock(NextBlock); 392 Offset += TInfo.Width; 393 } 394 395 // Emit the call to cudaLaunch 396 llvm::FunctionCallee cudaLaunchFn = getLaunchFn(); 397 llvm::Value *Arg = CGF.Builder.CreatePointerCast(CGF.CurFn, CharPtrTy); 398 CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg); 399 CGF.EmitBranch(EndBlock); 400 401 CGF.EmitBlock(EndBlock); 402 } 403 404 // Replace the original variable Var with the address loaded from variable 405 // ManagedVar populated by HIP runtime. 406 static void replaceManagedVar(llvm::GlobalVariable *Var, 407 llvm::GlobalVariable *ManagedVar) { 408 SmallVector<SmallVector<llvm::User *, 8>, 8> WorkList; 409 for (auto &&VarUse : Var->uses()) { 410 WorkList.push_back({VarUse.getUser()}); 411 } 412 while (!WorkList.empty()) { 413 auto &&WorkItem = WorkList.pop_back_val(); 414 auto *U = WorkItem.back(); 415 if (isa<llvm::ConstantExpr>(U)) { 416 for (auto &&UU : U->uses()) { 417 WorkItem.push_back(UU.getUser()); 418 WorkList.push_back(WorkItem); 419 WorkItem.pop_back(); 420 } 421 continue; 422 } 423 if (auto *I = dyn_cast<llvm::Instruction>(U)) { 424 llvm::Value *OldV = Var; 425 llvm::Instruction *NewV = 426 new llvm::LoadInst(Var->getType(), ManagedVar, "ld.managed", false, 427 llvm::Align(Var->getAlignment()), I); 428 WorkItem.pop_back(); 429 // Replace constant expressions directly or indirectly using the managed 430 // variable with instructions. 431 for (auto &&Op : WorkItem) { 432 auto *CE = cast<llvm::ConstantExpr>(Op); 433 auto *NewInst = llvm::createReplacementInstr(CE, I); 434 NewInst->replaceUsesOfWith(OldV, NewV); 435 OldV = CE; 436 NewV = NewInst; 437 } 438 I->replaceUsesOfWith(OldV, NewV); 439 } else { 440 llvm_unreachable("Invalid use of managed variable"); 441 } 442 } 443 } 444 445 /// Creates a function that sets up state on the host side for CUDA objects that 446 /// have a presence on both the host and device sides. Specifically, registers 447 /// the host side of kernel functions and device global variables with the CUDA 448 /// runtime. 449 /// \code 450 /// void __cuda_register_globals(void** GpuBinaryHandle) { 451 /// __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...); 452 /// ... 453 /// __cudaRegisterFunction(GpuBinaryHandle,KernelM,...); 454 /// __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...); 455 /// ... 456 /// __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...); 457 /// } 458 /// \endcode 459 llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() { 460 // No need to register anything 461 if (EmittedKernels.empty() && DeviceVars.empty()) 462 return nullptr; 463 464 llvm::Function *RegisterKernelsFunc = llvm::Function::Create( 465 getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage, 466 addUnderscoredPrefixToName("_register_globals"), &TheModule); 467 llvm::BasicBlock *EntryBB = 468 llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc); 469 CGBuilderTy Builder(CGM, Context); 470 Builder.SetInsertPoint(EntryBB); 471 472 // void __cudaRegisterFunction(void **, const char *, char *, const char *, 473 // int, uint3*, uint3*, dim3*, dim3*, int*) 474 llvm::Type *RegisterFuncParams[] = { 475 VoidPtrPtrTy, CharPtrTy, CharPtrTy, CharPtrTy, IntTy, 476 VoidPtrTy, VoidPtrTy, VoidPtrTy, VoidPtrTy, IntTy->getPointerTo()}; 477 llvm::FunctionCallee RegisterFunc = CGM.CreateRuntimeFunction( 478 llvm::FunctionType::get(IntTy, RegisterFuncParams, false), 479 addUnderscoredPrefixToName("RegisterFunction")); 480 481 // Extract GpuBinaryHandle passed as the first argument passed to 482 // __cuda_register_globals() and generate __cudaRegisterFunction() call for 483 // each emitted kernel. 484 llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin(); 485 for (auto &&I : EmittedKernels) { 486 llvm::Constant *KernelName = 487 makeConstantString(getDeviceSideName(cast<NamedDecl>(I.D))); 488 llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy); 489 llvm::Value *Args[] = { 490 &GpuBinaryHandlePtr, 491 Builder.CreateBitCast(I.Kernel, VoidPtrTy), 492 KernelName, 493 KernelName, 494 llvm::ConstantInt::get(IntTy, -1), 495 NullPtr, 496 NullPtr, 497 NullPtr, 498 NullPtr, 499 llvm::ConstantPointerNull::get(IntTy->getPointerTo())}; 500 Builder.CreateCall(RegisterFunc, Args); 501 } 502 503 llvm::Type *VarSizeTy = IntTy; 504 // For HIP or CUDA 9.0+, device variable size is type of `size_t`. 505 if (CGM.getLangOpts().HIP || 506 ToCudaVersion(CGM.getTarget().getSDKVersion()) >= CudaVersion::CUDA_90) 507 VarSizeTy = SizeTy; 508 509 // void __cudaRegisterVar(void **, char *, char *, const char *, 510 // int, int, int, int) 511 llvm::Type *RegisterVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy, 512 CharPtrTy, IntTy, VarSizeTy, 513 IntTy, IntTy}; 514 llvm::FunctionCallee RegisterVar = CGM.CreateRuntimeFunction( 515 llvm::FunctionType::get(VoidTy, RegisterVarParams, false), 516 addUnderscoredPrefixToName("RegisterVar")); 517 // void __hipRegisterManagedVar(void **, char *, char *, const char *, 518 // size_t, unsigned) 519 llvm::Type *RegisterManagedVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy, 520 CharPtrTy, VarSizeTy, IntTy}; 521 llvm::FunctionCallee RegisterManagedVar = CGM.CreateRuntimeFunction( 522 llvm::FunctionType::get(VoidTy, RegisterManagedVarParams, false), 523 addUnderscoredPrefixToName("RegisterManagedVar")); 524 // void __cudaRegisterSurface(void **, const struct surfaceReference *, 525 // const void **, const char *, int, int); 526 llvm::FunctionCallee RegisterSurf = CGM.CreateRuntimeFunction( 527 llvm::FunctionType::get( 528 VoidTy, {VoidPtrPtrTy, VoidPtrTy, CharPtrTy, CharPtrTy, IntTy, IntTy}, 529 false), 530 addUnderscoredPrefixToName("RegisterSurface")); 531 // void __cudaRegisterTexture(void **, const struct textureReference *, 532 // const void **, const char *, int, int, int) 533 llvm::FunctionCallee RegisterTex = CGM.CreateRuntimeFunction( 534 llvm::FunctionType::get( 535 VoidTy, 536 {VoidPtrPtrTy, VoidPtrTy, CharPtrTy, CharPtrTy, IntTy, IntTy, IntTy}, 537 false), 538 addUnderscoredPrefixToName("RegisterTexture")); 539 for (auto &&Info : DeviceVars) { 540 llvm::GlobalVariable *Var = Info.Var; 541 assert((!Var->isDeclaration() || Info.Flags.isManaged()) && 542 "External variables should not show up here, except HIP managed " 543 "variables"); 544 llvm::Constant *VarName = makeConstantString(getDeviceSideName(Info.D)); 545 switch (Info.Flags.getKind()) { 546 case DeviceVarFlags::Variable: { 547 uint64_t VarSize = 548 CGM.getDataLayout().getTypeAllocSize(Var->getValueType()); 549 if (Info.Flags.isManaged()) { 550 auto ManagedVar = new llvm::GlobalVariable( 551 CGM.getModule(), Var->getType(), 552 /*isConstant=*/false, Var->getLinkage(), 553 /*Init=*/Var->isDeclaration() 554 ? nullptr 555 : llvm::ConstantPointerNull::get(Var->getType()), 556 /*Name=*/"", /*InsertBefore=*/nullptr, 557 llvm::GlobalVariable::NotThreadLocal); 558 ManagedVar->setDSOLocal(Var->isDSOLocal()); 559 ManagedVar->setVisibility(Var->getVisibility()); 560 ManagedVar->setExternallyInitialized(true); 561 ManagedVar->takeName(Var); 562 Var->setName(Twine(ManagedVar->getName() + ".managed")); 563 replaceManagedVar(Var, ManagedVar); 564 llvm::Value *Args[] = { 565 &GpuBinaryHandlePtr, 566 Builder.CreateBitCast(ManagedVar, VoidPtrTy), 567 Builder.CreateBitCast(Var, VoidPtrTy), 568 VarName, 569 llvm::ConstantInt::get(VarSizeTy, VarSize), 570 llvm::ConstantInt::get(IntTy, Var->getAlignment())}; 571 if (!Var->isDeclaration()) 572 Builder.CreateCall(RegisterManagedVar, Args); 573 } else { 574 llvm::Value *Args[] = { 575 &GpuBinaryHandlePtr, 576 Builder.CreateBitCast(Var, VoidPtrTy), 577 VarName, 578 VarName, 579 llvm::ConstantInt::get(IntTy, Info.Flags.isExtern()), 580 llvm::ConstantInt::get(VarSizeTy, VarSize), 581 llvm::ConstantInt::get(IntTy, Info.Flags.isConstant()), 582 llvm::ConstantInt::get(IntTy, 0)}; 583 Builder.CreateCall(RegisterVar, Args); 584 } 585 break; 586 } 587 case DeviceVarFlags::Surface: 588 Builder.CreateCall( 589 RegisterSurf, 590 {&GpuBinaryHandlePtr, Builder.CreateBitCast(Var, VoidPtrTy), VarName, 591 VarName, llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()), 592 llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())}); 593 break; 594 case DeviceVarFlags::Texture: 595 Builder.CreateCall( 596 RegisterTex, 597 {&GpuBinaryHandlePtr, Builder.CreateBitCast(Var, VoidPtrTy), VarName, 598 VarName, llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()), 599 llvm::ConstantInt::get(IntTy, Info.Flags.isNormalized()), 600 llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())}); 601 break; 602 } 603 } 604 605 Builder.CreateRetVoid(); 606 return RegisterKernelsFunc; 607 } 608 609 /// Creates a global constructor function for the module: 610 /// 611 /// For CUDA: 612 /// \code 613 /// void __cuda_module_ctor(void*) { 614 /// Handle = __cudaRegisterFatBinary(GpuBinaryBlob); 615 /// __cuda_register_globals(Handle); 616 /// } 617 /// \endcode 618 /// 619 /// For HIP: 620 /// \code 621 /// void __hip_module_ctor(void*) { 622 /// if (__hip_gpubin_handle == 0) { 623 /// __hip_gpubin_handle = __hipRegisterFatBinary(GpuBinaryBlob); 624 /// __hip_register_globals(__hip_gpubin_handle); 625 /// } 626 /// } 627 /// \endcode 628 llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { 629 bool IsHIP = CGM.getLangOpts().HIP; 630 bool IsCUDA = CGM.getLangOpts().CUDA; 631 // No need to generate ctors/dtors if there is no GPU binary. 632 StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName; 633 if (CudaGpuBinaryFileName.empty() && !IsHIP) 634 return nullptr; 635 if ((IsHIP || (IsCUDA && !RelocatableDeviceCode)) && EmittedKernels.empty() && 636 DeviceVars.empty()) 637 return nullptr; 638 639 // void __{cuda|hip}_register_globals(void* handle); 640 llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn(); 641 // We always need a function to pass in as callback. Create a dummy 642 // implementation if we don't need to register anything. 643 if (RelocatableDeviceCode && !RegisterGlobalsFunc) 644 RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy()); 645 646 // void ** __{cuda|hip}RegisterFatBinary(void *); 647 llvm::FunctionCallee RegisterFatbinFunc = CGM.CreateRuntimeFunction( 648 llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false), 649 addUnderscoredPrefixToName("RegisterFatBinary")); 650 // struct { int magic, int version, void * gpu_binary, void * dont_care }; 651 llvm::StructType *FatbinWrapperTy = 652 llvm::StructType::get(IntTy, IntTy, VoidPtrTy, VoidPtrTy); 653 654 // Register GPU binary with the CUDA runtime, store returned handle in a 655 // global variable and save a reference in GpuBinaryHandle to be cleaned up 656 // in destructor on exit. Then associate all known kernels with the GPU binary 657 // handle so CUDA runtime can figure out what to call on the GPU side. 658 std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary = nullptr; 659 if (!CudaGpuBinaryFileName.empty()) { 660 llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> CudaGpuBinaryOrErr = 661 llvm::MemoryBuffer::getFileOrSTDIN(CudaGpuBinaryFileName); 662 if (std::error_code EC = CudaGpuBinaryOrErr.getError()) { 663 CGM.getDiags().Report(diag::err_cannot_open_file) 664 << CudaGpuBinaryFileName << EC.message(); 665 return nullptr; 666 } 667 CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get()); 668 } 669 670 llvm::Function *ModuleCtorFunc = llvm::Function::Create( 671 llvm::FunctionType::get(VoidTy, VoidPtrTy, false), 672 llvm::GlobalValue::InternalLinkage, 673 addUnderscoredPrefixToName("_module_ctor"), &TheModule); 674 llvm::BasicBlock *CtorEntryBB = 675 llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc); 676 CGBuilderTy CtorBuilder(CGM, Context); 677 678 CtorBuilder.SetInsertPoint(CtorEntryBB); 679 680 const char *FatbinConstantName; 681 const char *FatbinSectionName; 682 const char *ModuleIDSectionName; 683 StringRef ModuleIDPrefix; 684 llvm::Constant *FatBinStr; 685 unsigned FatMagic; 686 if (IsHIP) { 687 FatbinConstantName = ".hip_fatbin"; 688 FatbinSectionName = ".hipFatBinSegment"; 689 690 ModuleIDSectionName = "__hip_module_id"; 691 ModuleIDPrefix = "__hip_"; 692 693 if (CudaGpuBinary) { 694 // If fatbin is available from early finalization, create a string 695 // literal containing the fat binary loaded from the given file. 696 const unsigned HIPCodeObjectAlign = 4096; 697 FatBinStr = 698 makeConstantString(std::string(CudaGpuBinary->getBuffer()), "", 699 FatbinConstantName, HIPCodeObjectAlign); 700 } else { 701 // If fatbin is not available, create an external symbol 702 // __hip_fatbin in section .hip_fatbin. The external symbol is supposed 703 // to contain the fat binary but will be populated somewhere else, 704 // e.g. by lld through link script. 705 FatBinStr = new llvm::GlobalVariable( 706 CGM.getModule(), CGM.Int8Ty, 707 /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr, 708 "__hip_fatbin", nullptr, 709 llvm::GlobalVariable::NotThreadLocal); 710 cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName); 711 } 712 713 FatMagic = HIPFatMagic; 714 } else { 715 if (RelocatableDeviceCode) 716 FatbinConstantName = CGM.getTriple().isMacOSX() 717 ? "__NV_CUDA,__nv_relfatbin" 718 : "__nv_relfatbin"; 719 else 720 FatbinConstantName = 721 CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin"; 722 // NVIDIA's cuobjdump looks for fatbins in this section. 723 FatbinSectionName = 724 CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment"; 725 726 ModuleIDSectionName = CGM.getTriple().isMacOSX() 727 ? "__NV_CUDA,__nv_module_id" 728 : "__nv_module_id"; 729 ModuleIDPrefix = "__nv_"; 730 731 // For CUDA, create a string literal containing the fat binary loaded from 732 // the given file. 733 FatBinStr = makeConstantString(std::string(CudaGpuBinary->getBuffer()), "", 734 FatbinConstantName, 8); 735 FatMagic = CudaFatMagic; 736 } 737 738 // Create initialized wrapper structure that points to the loaded GPU binary 739 ConstantInitBuilder Builder(CGM); 740 auto Values = Builder.beginStruct(FatbinWrapperTy); 741 // Fatbin wrapper magic. 742 Values.addInt(IntTy, FatMagic); 743 // Fatbin version. 744 Values.addInt(IntTy, 1); 745 // Data. 746 Values.add(FatBinStr); 747 // Unused in fatbin v1. 748 Values.add(llvm::ConstantPointerNull::get(VoidPtrTy)); 749 llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal( 750 addUnderscoredPrefixToName("_fatbin_wrapper"), CGM.getPointerAlign(), 751 /*constant*/ true); 752 FatbinWrapper->setSection(FatbinSectionName); 753 754 // There is only one HIP fat binary per linked module, however there are 755 // multiple constructor functions. Make sure the fat binary is registered 756 // only once. The constructor functions are executed by the dynamic loader 757 // before the program gains control. The dynamic loader cannot execute the 758 // constructor functions concurrently since doing that would not guarantee 759 // thread safety of the loaded program. Therefore we can assume sequential 760 // execution of constructor functions here. 761 if (IsHIP) { 762 auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage : 763 llvm::GlobalValue::LinkOnceAnyLinkage; 764 llvm::BasicBlock *IfBlock = 765 llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc); 766 llvm::BasicBlock *ExitBlock = 767 llvm::BasicBlock::Create(Context, "exit", ModuleCtorFunc); 768 // The name, size, and initialization pattern of this variable is part 769 // of HIP ABI. 770 GpuBinaryHandle = new llvm::GlobalVariable( 771 TheModule, VoidPtrPtrTy, /*isConstant=*/false, 772 Linkage, 773 /*Initializer=*/llvm::ConstantPointerNull::get(VoidPtrPtrTy), 774 "__hip_gpubin_handle"); 775 GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign()); 776 // Prevent the weak symbol in different shared libraries being merged. 777 if (Linkage != llvm::GlobalValue::InternalLinkage) 778 GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility); 779 Address GpuBinaryAddr( 780 GpuBinaryHandle, 781 CharUnits::fromQuantity(GpuBinaryHandle->getAlignment())); 782 { 783 auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr); 784 llvm::Constant *Zero = 785 llvm::Constant::getNullValue(HandleValue->getType()); 786 llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue, Zero); 787 CtorBuilder.CreateCondBr(EQZero, IfBlock, ExitBlock); 788 } 789 { 790 CtorBuilder.SetInsertPoint(IfBlock); 791 // GpuBinaryHandle = __hipRegisterFatBinary(&FatbinWrapper); 792 llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall( 793 RegisterFatbinFunc, 794 CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy)); 795 CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryAddr); 796 CtorBuilder.CreateBr(ExitBlock); 797 } 798 { 799 CtorBuilder.SetInsertPoint(ExitBlock); 800 // Call __hip_register_globals(GpuBinaryHandle); 801 if (RegisterGlobalsFunc) { 802 auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr); 803 CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue); 804 } 805 } 806 } else if (!RelocatableDeviceCode) { 807 // Register binary with CUDA runtime. This is substantially different in 808 // default mode vs. separate compilation! 809 // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper); 810 llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall( 811 RegisterFatbinFunc, 812 CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy)); 813 GpuBinaryHandle = new llvm::GlobalVariable( 814 TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage, 815 llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle"); 816 GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign()); 817 CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle, 818 CGM.getPointerAlign()); 819 820 // Call __cuda_register_globals(GpuBinaryHandle); 821 if (RegisterGlobalsFunc) 822 CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall); 823 824 // Call __cudaRegisterFatBinaryEnd(Handle) if this CUDA version needs it. 825 if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(), 826 CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) { 827 // void __cudaRegisterFatBinaryEnd(void **); 828 llvm::FunctionCallee RegisterFatbinEndFunc = CGM.CreateRuntimeFunction( 829 llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false), 830 "__cudaRegisterFatBinaryEnd"); 831 CtorBuilder.CreateCall(RegisterFatbinEndFunc, RegisterFatbinCall); 832 } 833 } else { 834 // Generate a unique module ID. 835 SmallString<64> ModuleID; 836 llvm::raw_svector_ostream OS(ModuleID); 837 OS << ModuleIDPrefix << llvm::format("%" PRIx64, FatbinWrapper->getGUID()); 838 llvm::Constant *ModuleIDConstant = makeConstantString( 839 std::string(ModuleID.str()), "", ModuleIDSectionName, 32); 840 841 // Create an alias for the FatbinWrapper that nvcc will look for. 842 llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage, 843 Twine("__fatbinwrap") + ModuleID, FatbinWrapper); 844 845 // void __cudaRegisterLinkedBinary%ModuleID%(void (*)(void *), void *, 846 // void *, void (*)(void **)) 847 SmallString<128> RegisterLinkedBinaryName("__cudaRegisterLinkedBinary"); 848 RegisterLinkedBinaryName += ModuleID; 849 llvm::FunctionCallee RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction( 850 getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName); 851 852 assert(RegisterGlobalsFunc && "Expecting at least dummy function!"); 853 llvm::Value *Args[] = {RegisterGlobalsFunc, 854 CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy), 855 ModuleIDConstant, 856 makeDummyFunction(getCallbackFnTy())}; 857 CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args); 858 } 859 860 // Create destructor and register it with atexit() the way NVCC does it. Doing 861 // it during regular destructor phase worked in CUDA before 9.2 but results in 862 // double-free in 9.2. 863 if (llvm::Function *CleanupFn = makeModuleDtorFunction()) { 864 // extern "C" int atexit(void (*f)(void)); 865 llvm::FunctionType *AtExitTy = 866 llvm::FunctionType::get(IntTy, CleanupFn->getType(), false); 867 llvm::FunctionCallee AtExitFunc = 868 CGM.CreateRuntimeFunction(AtExitTy, "atexit", llvm::AttributeList(), 869 /*Local=*/true); 870 CtorBuilder.CreateCall(AtExitFunc, CleanupFn); 871 } 872 873 CtorBuilder.CreateRetVoid(); 874 return ModuleCtorFunc; 875 } 876 877 /// Creates a global destructor function that unregisters the GPU code blob 878 /// registered by constructor. 879 /// 880 /// For CUDA: 881 /// \code 882 /// void __cuda_module_dtor(void*) { 883 /// __cudaUnregisterFatBinary(Handle); 884 /// } 885 /// \endcode 886 /// 887 /// For HIP: 888 /// \code 889 /// void __hip_module_dtor(void*) { 890 /// if (__hip_gpubin_handle) { 891 /// __hipUnregisterFatBinary(__hip_gpubin_handle); 892 /// __hip_gpubin_handle = 0; 893 /// } 894 /// } 895 /// \endcode 896 llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() { 897 // No need for destructor if we don't have a handle to unregister. 898 if (!GpuBinaryHandle) 899 return nullptr; 900 901 // void __cudaUnregisterFatBinary(void ** handle); 902 llvm::FunctionCallee UnregisterFatbinFunc = CGM.CreateRuntimeFunction( 903 llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false), 904 addUnderscoredPrefixToName("UnregisterFatBinary")); 905 906 llvm::Function *ModuleDtorFunc = llvm::Function::Create( 907 llvm::FunctionType::get(VoidTy, VoidPtrTy, false), 908 llvm::GlobalValue::InternalLinkage, 909 addUnderscoredPrefixToName("_module_dtor"), &TheModule); 910 911 llvm::BasicBlock *DtorEntryBB = 912 llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc); 913 CGBuilderTy DtorBuilder(CGM, Context); 914 DtorBuilder.SetInsertPoint(DtorEntryBB); 915 916 Address GpuBinaryAddr(GpuBinaryHandle, CharUnits::fromQuantity( 917 GpuBinaryHandle->getAlignment())); 918 auto HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr); 919 // There is only one HIP fat binary per linked module, however there are 920 // multiple destructor functions. Make sure the fat binary is unregistered 921 // only once. 922 if (CGM.getLangOpts().HIP) { 923 llvm::BasicBlock *IfBlock = 924 llvm::BasicBlock::Create(Context, "if", ModuleDtorFunc); 925 llvm::BasicBlock *ExitBlock = 926 llvm::BasicBlock::Create(Context, "exit", ModuleDtorFunc); 927 llvm::Constant *Zero = llvm::Constant::getNullValue(HandleValue->getType()); 928 llvm::Value *NEZero = DtorBuilder.CreateICmpNE(HandleValue, Zero); 929 DtorBuilder.CreateCondBr(NEZero, IfBlock, ExitBlock); 930 931 DtorBuilder.SetInsertPoint(IfBlock); 932 DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue); 933 DtorBuilder.CreateStore(Zero, GpuBinaryAddr); 934 DtorBuilder.CreateBr(ExitBlock); 935 936 DtorBuilder.SetInsertPoint(ExitBlock); 937 } else { 938 DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue); 939 } 940 DtorBuilder.CreateRetVoid(); 941 return ModuleDtorFunc; 942 } 943 944 CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) { 945 return new CGNVCUDARuntime(CGM); 946 } 947 948 void CGNVCUDARuntime::internalizeDeviceSideVar( 949 const VarDecl *D, llvm::GlobalValue::LinkageTypes &Linkage) { 950 // For -fno-gpu-rdc, host-side shadows of external declarations of device-side 951 // global variables become internal definitions. These have to be internal in 952 // order to prevent name conflicts with global host variables with the same 953 // name in a different TUs. 954 // 955 // For -fgpu-rdc, the shadow variables should not be internalized because 956 // they may be accessed by different TU. 957 if (CGM.getLangOpts().GPURelocatableDeviceCode) 958 return; 959 960 // __shared__ variables are odd. Shadows do get created, but 961 // they are not registered with the CUDA runtime, so they 962 // can't really be used to access their device-side 963 // counterparts. It's not clear yet whether it's nvcc's bug or 964 // a feature, but we've got to do the same for compatibility. 965 if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() || 966 D->hasAttr<CUDASharedAttr>() || 967 D->getType()->isCUDADeviceBuiltinSurfaceType() || 968 D->getType()->isCUDADeviceBuiltinTextureType()) { 969 Linkage = llvm::GlobalValue::InternalLinkage; 970 } 971 } 972 973 void CGNVCUDARuntime::handleVarRegistration(const VarDecl *D, 974 llvm::GlobalVariable &GV) { 975 if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>()) { 976 // Shadow variables and their properties must be registered with CUDA 977 // runtime. Skip Extern global variables, which will be registered in 978 // the TU where they are defined. 979 // 980 // Don't register a C++17 inline variable. The local symbol can be 981 // discarded and referencing a discarded local symbol from outside the 982 // comdat (__cuda_register_globals) is disallowed by the ELF spec. 983 // TODO: Reject __device__ constexpr and __device__ inline in Sema. 984 // HIP managed variables need to be always recorded in device and host 985 // compilations for transformation. 986 if ((!D->hasExternalStorage() && !D->isInline()) || 987 D->hasAttr<HIPManagedAttr>()) { 988 registerDeviceVar(D, GV, !D->hasDefinition(), 989 D->hasAttr<CUDAConstantAttr>()); 990 } 991 } else if (D->getType()->isCUDADeviceBuiltinSurfaceType() || 992 D->getType()->isCUDADeviceBuiltinTextureType()) { 993 // Builtin surfaces and textures and their template arguments are 994 // also registered with CUDA runtime. 995 const ClassTemplateSpecializationDecl *TD = 996 cast<ClassTemplateSpecializationDecl>( 997 D->getType()->getAs<RecordType>()->getDecl()); 998 const TemplateArgumentList &Args = TD->getTemplateArgs(); 999 if (TD->hasAttr<CUDADeviceBuiltinSurfaceTypeAttr>()) { 1000 assert(Args.size() == 2 && 1001 "Unexpected number of template arguments of CUDA device " 1002 "builtin surface type."); 1003 auto SurfType = Args[1].getAsIntegral(); 1004 if (!D->hasExternalStorage()) 1005 registerDeviceSurf(D, GV, !D->hasDefinition(), SurfType.getSExtValue()); 1006 } else { 1007 assert(Args.size() == 3 && 1008 "Unexpected number of template arguments of CUDA device " 1009 "builtin texture type."); 1010 auto TexType = Args[1].getAsIntegral(); 1011 auto Normalized = Args[2].getAsIntegral(); 1012 if (!D->hasExternalStorage()) 1013 registerDeviceTex(D, GV, !D->hasDefinition(), TexType.getSExtValue(), 1014 Normalized.getZExtValue()); 1015 } 1016 } 1017 } 1018 1019 // Transform managed variables to pointers to managed variables in device code. 1020 // Each use of the original managed variable is replaced by a load from the 1021 // transformed managed variable. The transformed managed variable contains 1022 // the address of managed memory which will be allocated by the runtime. 1023 void CGNVCUDARuntime::transformManagedVars() { 1024 for (auto &&Info : DeviceVars) { 1025 llvm::GlobalVariable *Var = Info.Var; 1026 if (Info.Flags.getKind() == DeviceVarFlags::Variable && 1027 Info.Flags.isManaged()) { 1028 auto ManagedVar = new llvm::GlobalVariable( 1029 CGM.getModule(), Var->getType(), 1030 /*isConstant=*/false, Var->getLinkage(), 1031 /*Init=*/Var->isDeclaration() 1032 ? nullptr 1033 : llvm::ConstantPointerNull::get(Var->getType()), 1034 /*Name=*/"", /*InsertBefore=*/nullptr, 1035 llvm::GlobalVariable::NotThreadLocal, 1036 CGM.getContext().getTargetAddressSpace(LangAS::cuda_device)); 1037 ManagedVar->setDSOLocal(Var->isDSOLocal()); 1038 ManagedVar->setVisibility(Var->getVisibility()); 1039 ManagedVar->setExternallyInitialized(true); 1040 replaceManagedVar(Var, ManagedVar); 1041 ManagedVar->takeName(Var); 1042 Var->setName(Twine(ManagedVar->getName()) + ".managed"); 1043 // Keep managed variables even if they are not used in device code since 1044 // they need to be allocated by the runtime. 1045 if (!Var->isDeclaration()) { 1046 assert(!ManagedVar->isDeclaration()); 1047 CGM.addCompilerUsedGlobal(Var); 1048 CGM.addCompilerUsedGlobal(ManagedVar); 1049 } 1050 } 1051 } 1052 } 1053 1054 // Returns module constructor to be added. 1055 llvm::Function *CGNVCUDARuntime::finalizeModule() { 1056 if (CGM.getLangOpts().CUDAIsDevice) { 1057 transformManagedVars(); 1058 return nullptr; 1059 } 1060 return makeModuleCtorFunction(); 1061 } 1062