//===----- CGCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This provides a class for CUDA code generation targeting the NVIDIA CUDA
// runtime library.
//
//===----------------------------------------------------------------------===//

#include "CGCUDARuntime.h"
#include "CodeGenFunction.h"
#include "CodeGenModule.h"
#include "clang/AST/Decl.h"
#include "clang/Basic/Cuda.h"
#include "clang/CodeGen/CodeGenABITypes.h"
#include "clang/CodeGen/ConstantInitBuilder.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/ReplaceConstant.h"
#include "llvm/Support/Format.h"

using namespace clang;
using namespace CodeGen;

namespace {
constexpr unsigned CudaFatMagic = 0x466243b1;
constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF"

class CGNVCUDARuntime : public CGCUDARuntime {

private:
  llvm::IntegerType *IntTy, *SizeTy;
  llvm::Type *VoidTy;
  llvm::PointerType *CharPtrTy, *VoidPtrTy, *VoidPtrPtrTy;

  /// Convenience reference to the LLVM context.
  llvm::LLVMContext &Context;
  /// Convenience reference to the current module.
  llvm::Module &TheModule;
  /// Keeps track of kernel launch stubs emitted in this module.
  struct KernelInfo {
    llvm::Function *Kernel;
    const Decl *D;
  };
  llvm::SmallVector<KernelInfo, 16> EmittedKernels;
  struct VarInfo {
    llvm::GlobalVariable *Var;
    const VarDecl *D;
    DeviceVarFlags Flags;
  };
  llvm::SmallVector<VarInfo, 16> DeviceVars;
  /// Keeps track of the variable containing the handle of the GPU binary.
  /// Populated by makeModuleCtorFunction() and used to create corresponding
  /// cleanup calls in makeModuleDtorFunction().
  llvm::GlobalVariable *GpuBinaryHandle = nullptr;
  /// Whether we generate relocatable device code.
  bool RelocatableDeviceCode;
  /// Mangle context for device.
  std::unique_ptr<MangleContext> DeviceMC;

  llvm::FunctionCallee getSetupArgumentFn() const;
  llvm::FunctionCallee getLaunchFn() const;

  llvm::FunctionType *getRegisterGlobalsFnTy() const;
  llvm::FunctionType *getCallbackFnTy() const;
  llvm::FunctionType *getRegisterLinkedBinaryFnTy() const;
  std::string addPrefixToName(StringRef FuncName) const;
  std::string addUnderscoredPrefixToName(StringRef FuncName) const;

  /// Creates a function to register all kernel stubs generated in this module.
  llvm::Function *makeRegisterGlobalsFn();

  /// Helper function that generates a constant string and returns a pointer to
  /// the start of the string. The result of this function can be used anywhere
  /// where the C code specifies const char*.
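  /// For example (an illustrative sketch, not the exact IR CodeGen produces):
  /// \code
  ///   // makeConstantString("kernel1") emits roughly
  ///   //   @.str = private unnamed_addr constant [8 x i8] c"kernel1\00"
  ///   // and returns a getelementptr to its first character, i.e. a value
  ///   // usable wherever C expects a const char*.
  /// \endcode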
  llvm::Constant *makeConstantString(const std::string &Str,
                                     const std::string &Name = "",
                                     const std::string &SectionName = "",
                                     unsigned Alignment = 0) {
    llvm::Constant *Zeros[] = {llvm::ConstantInt::get(SizeTy, 0),
                               llvm::ConstantInt::get(SizeTy, 0)};
    auto ConstStr = CGM.GetAddrOfConstantCString(Str, Name.c_str());
    llvm::GlobalVariable *GV =
        cast<llvm::GlobalVariable>(ConstStr.getPointer());
    if (!SectionName.empty()) {
      GV->setSection(SectionName);
      // Mark the address as used, which makes sure that this section isn't
      // merged and we will really have it in the object file.
      GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None);
    }
    if (Alignment)
      GV->setAlignment(llvm::Align(Alignment));

    return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(),
                                                ConstStr.getPointer(), Zeros);
  }

  /// Helper function that generates an empty dummy function returning void.
  llvm::Function *makeDummyFunction(llvm::FunctionType *FnTy) {
    assert(FnTy->getReturnType()->isVoidTy() &&
           "Can only generate dummy functions returning void!");
    llvm::Function *DummyFunc = llvm::Function::Create(
        FnTy, llvm::GlobalValue::InternalLinkage, "dummy", &TheModule);

    llvm::BasicBlock *DummyBlock =
        llvm::BasicBlock::Create(Context, "", DummyFunc);
    CGBuilderTy FuncBuilder(CGM, Context);
    FuncBuilder.SetInsertPoint(DummyBlock);
    FuncBuilder.CreateRetVoid();

    return DummyFunc;
  }

  void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args);
  void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args);
  std::string getDeviceSideName(const NamedDecl *ND) override;

  void registerDeviceVar(const VarDecl *VD, llvm::GlobalVariable &Var,
                         bool Extern, bool Constant) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Variable, Extern, Constant,
                           VD->hasAttr<HIPManagedAttr>(),
                           /*Normalized*/ false, 0}});
  }
  void registerDeviceSurf(const VarDecl *VD, llvm::GlobalVariable &Var,
                          bool Extern, int Type) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Surface, Extern, /*Constant*/ false,
                           /*Managed*/ false,
                           /*Normalized*/ false, Type}});
  }
  void registerDeviceTex(const VarDecl *VD, llvm::GlobalVariable &Var,
                         bool Extern, int Type, bool Normalized) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Texture, Extern, /*Constant*/ false,
                           /*Managed*/ false, Normalized, Type}});
  }

public:
  CGNVCUDARuntime(CodeGenModule &CGM);

  void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override;
  void handleVarRegistration(const VarDecl *VD,
                             llvm::GlobalVariable &Var) override;

  /// Creates module constructor function.
  llvm::Function *makeModuleCtorFunction() override;
  /// Creates module destructor function.
  llvm::Function *makeModuleDtorFunction() override;
  void
  internalizeDeviceSideVar(const VarDecl *D,
                           llvm::GlobalValue::LinkageTypes &Linkage) override;
};

} // end anonymous namespace

std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const {
  if (CGM.getLangOpts().HIP)
    return ((Twine("hip") + Twine(FuncName)).str());
  return ((Twine("cuda") + Twine(FuncName)).str());
}
std::string
CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const {
  if (CGM.getLangOpts().HIP)
    return ((Twine("__hip") + Twine(FuncName)).str());
  return ((Twine("__cuda") + Twine(FuncName)).str());
}
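
// For example, addPrefixToName("LaunchKernel") yields "cudaLaunchKernel" when
// compiling CUDA and "hipLaunchKernel" when compiling HIP, while
// addUnderscoredPrefixToName("RegisterFatBinary") yields
// "__cudaRegisterFatBinary" or "__hipRegisterFatBinary", respectively.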

CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
    : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()),
      TheModule(CGM.getModule()),
      RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode),
      DeviceMC(CGM.getContext().createMangleContext(
          CGM.getContext().getAuxTargetInfo())) {
  CodeGen::CodeGenTypes &Types = CGM.getTypes();
  ASTContext &Ctx = CGM.getContext();

  IntTy = CGM.IntTy;
  SizeTy = CGM.SizeTy;
  VoidTy = CGM.VoidTy;

  CharPtrTy = llvm::PointerType::getUnqual(Types.ConvertType(Ctx.CharTy));
  VoidPtrTy = cast<llvm::PointerType>(Types.ConvertType(Ctx.VoidPtrTy));
  VoidPtrPtrTy = VoidPtrTy->getPointerTo();
  if (CGM.getContext().getAuxTargetInfo()) {
    // If the host and device have different C++ ABIs, mark it as the device
    // mangle context so that the mangling retrieves the additional device
    // lambda mangling number instead of the regular host one.
    DeviceMC->setDeviceMangleContext(
        CGM.getContext().getTargetInfo().getCXXABI().isMicrosoft() &&
        CGM.getContext().getAuxTargetInfo()->getCXXABI().isItaniumFamily());
  }
}

llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const {
  // cudaError_t cudaSetupArgument(void *, size_t, size_t)
  llvm::Type *Params[] = {VoidPtrTy, SizeTy, SizeTy};
  return CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy, Params, false),
      addPrefixToName("SetupArgument"));
}

llvm::FunctionCallee CGNVCUDARuntime::getLaunchFn() const {
  if (CGM.getLangOpts().HIP) {
    // hipError_t hipLaunchByPtr(char *);
    return CGM.CreateRuntimeFunction(
        llvm::FunctionType::get(IntTy, CharPtrTy, false), "hipLaunchByPtr");
  } else {
    // cudaError_t cudaLaunch(char *);
    return CGM.CreateRuntimeFunction(
        llvm::FunctionType::get(IntTy, CharPtrTy, false), "cudaLaunch");
  }
}

llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy() const {
  return llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false);
}

llvm::FunctionType *CGNVCUDARuntime::getCallbackFnTy() const {
  return llvm::FunctionType::get(VoidTy, VoidPtrTy, false);
}

llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy() const {
  auto CallbackFnTy = getCallbackFnTy();
  auto RegisterGlobalsFnTy = getRegisterGlobalsFnTy();
  llvm::Type *Params[] = {RegisterGlobalsFnTy->getPointerTo(), VoidPtrTy,
                          VoidPtrTy, CallbackFnTy->getPointerTo()};
  return llvm::FunctionType::get(VoidTy, Params, false);
}

std::string CGNVCUDARuntime::getDeviceSideName(const NamedDecl *ND) {
  GlobalDecl GD;
  // ND could be either a kernel or a variable.
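  // For a kernel, mangle the device-side entity rather than its host stub;
  // e.g. under Itanium mangling, __global__ void foo(int) gets the
  // device-side name "_Z3fooi" (an illustrative sketch; the exact name
  // depends on the device ABI in use).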
  if (auto *FD = dyn_cast<FunctionDecl>(ND))
    GD = GlobalDecl(FD, KernelReferenceKind::Kernel);
  else
    GD = GlobalDecl(ND);
  std::string DeviceSideName;
  if (DeviceMC->shouldMangleDeclName(ND)) {
    SmallString<256> Buffer;
    llvm::raw_svector_ostream Out(Buffer);
    DeviceMC->mangleName(GD, Out);
    DeviceSideName = std::string(Out.str());
  } else
    DeviceSideName = std::string(ND->getIdentifier()->getName());
  return DeviceSideName;
}

void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
                                     FunctionArgList &Args) {
  EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl});
  if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
                         CudaFeature::CUDA_USES_NEW_LAUNCH) ||
      (CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI))
    emitDeviceStubBodyNew(CGF, Args);
  else
    emitDeviceStubBodyLegacy(CGF, Args);
}

// CUDA 9.0+ uses a new way to launch kernels: parameters are packed in a
// local array and the kernel is launched using cudaLaunchKernel().
void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
                                            FunctionArgList &Args) {
  // Calculate the amount of space we will need for all arguments. If we have
  // no args, allocate a single pointer so we still have a valid pointer to
  // the argument array that we can pass to the runtime, even if it will be
  // unused.
  Address KernelArgs = CGF.CreateTempAlloca(
      VoidPtrTy, CharUnits::fromQuantity(16), "kernel_args",
      llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size())));
  // Store pointers to the arguments in the locally allocated kernel_args.
  for (unsigned i = 0; i < Args.size(); ++i) {
    llvm::Value *VarPtr = CGF.GetAddrOfLocalVar(Args[i]).getPointer();
    llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(VarPtr, VoidPtrTy);
    CGF.Builder.CreateDefaultAlignedStore(
        VoidVarPtr, CGF.Builder.CreateConstGEP1_32(KernelArgs.getPointer(), i));
  }

  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");

  // Lookup cudaLaunchKernel/hipLaunchKernel function.
  // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
  //                              void **args, size_t sharedMem,
  //                              cudaStream_t stream);
  // hipError_t hipLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
  //                            void **args, size_t sharedMem,
  //                            hipStream_t stream);
  TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl();
  DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl);
  auto LaunchKernelName = addPrefixToName("LaunchKernel");
  IdentifierInfo &cudaLaunchKernelII =
      CGM.getContext().Idents.get(LaunchKernelName);
  FunctionDecl *cudaLaunchKernelFD = nullptr;
  for (const auto &Result : DC->lookup(&cudaLaunchKernelII)) {
    if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Result))
      cudaLaunchKernelFD = FD;
  }

  if (cudaLaunchKernelFD == nullptr) {
    CGM.Error(CGF.CurFuncDecl->getLocation(),
              "Can't find declaration for " + LaunchKernelName);
    return;
  }
  // Create temporary dim3 grid_dim, block_dim.
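  // These temporaries, together with shmem_size and stream below, receive
  // the launch configuration pushed by the host-side <<<...>>> call. The
  // emitted stub then corresponds roughly to this sketch (illustrative
  // names, not emitted symbols):
  //   void stub(T0 arg0, ..., TN argN) {
  //     void *kernel_args[] = {&arg0, ..., &argN};
  //     dim3 grid_dim, block_dim; size_t shmem_size; void *stream;
  //     __cudaPopCallConfiguration(&grid_dim, &block_dim, &shmem_size,
  //                                &stream);
  //     cudaLaunchKernel((const void *)stub, grid_dim, block_dim,
  //                      kernel_args, shmem_size, stream);
  //   }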
  ParmVarDecl *GridDimParam = cudaLaunchKernelFD->getParamDecl(1);
  QualType Dim3Ty = GridDimParam->getType();
  Address GridDim =
      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "grid_dim");
  Address BlockDim =
      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim");
  Address ShmemSize =
      CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size");
  Address Stream =
      CGF.CreateTempAlloca(VoidPtrTy, CGM.getPointerAlign(), "stream");
  llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy,
                              {/*gridDim=*/GridDim.getType(),
                               /*blockDim=*/BlockDim.getType(),
                               /*ShmemSize=*/ShmemSize.getType(),
                               /*Stream=*/Stream.getType()},
                              /*isVarArg=*/false),
      addUnderscoredPrefixToName("PopCallConfiguration"));

  CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn,
                              {GridDim.getPointer(), BlockDim.getPointer(),
                               ShmemSize.getPointer(), Stream.getPointer()});

  // Emit the call to cudaLaunchKernel.
  llvm::Value *Kernel = CGF.Builder.CreatePointerCast(CGF.CurFn, VoidPtrTy);
  CallArgList LaunchKernelArgs;
  LaunchKernelArgs.add(RValue::get(Kernel),
                       cudaLaunchKernelFD->getParamDecl(0)->getType());
  LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty);
  LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty);
  LaunchKernelArgs.add(RValue::get(KernelArgs.getPointer()),
                       cudaLaunchKernelFD->getParamDecl(3)->getType());
  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(ShmemSize)),
                       cudaLaunchKernelFD->getParamDecl(4)->getType());
  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(Stream)),
                       cudaLaunchKernelFD->getParamDecl(5)->getType());

  QualType QT = cudaLaunchKernelFD->getType();
  QualType CQT = QT.getCanonicalType();
  llvm::Type *Ty = CGM.getTypes().ConvertType(CQT);
  llvm::FunctionType *FTy = dyn_cast<llvm::FunctionType>(Ty);

  const CGFunctionInfo &FI =
      CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD);
  llvm::FunctionCallee cudaLaunchKernelFn =
      CGM.CreateRuntimeFunction(FTy, LaunchKernelName);
  CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(),
               LaunchKernelArgs);
  CGF.EmitBranch(EndBlock);

  CGF.EmitBlock(EndBlock);
}

void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF,
                                               FunctionArgList &Args) {
  // Emit a call to cudaSetupArgument for each arg in Args.
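  // The generated control flow corresponds roughly to this sketch
  // (illustrative names):
  //   void stub(T0 arg0, ..., TN argN) {
  //     if (cudaSetupArgument(&arg0, sizeof(arg0), offset0) == 0)
  //       if (/* ...one nested check per remaining argument... */)
  //         cudaLaunch((char *)stub);
  //   }
  // so a failing cudaSetupArgument call skips the launch entirely.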
  llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn();
  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
  CharUnits Offset = CharUnits::Zero();
  for (const VarDecl *A : Args) {
    auto TInfo = CGM.getContext().getTypeInfoInChars(A->getType());
    Offset = Offset.alignTo(TInfo.Align);
    llvm::Value *Args[] = {
        CGF.Builder.CreatePointerCast(CGF.GetAddrOfLocalVar(A).getPointer(),
                                      VoidPtrTy),
        llvm::ConstantInt::get(SizeTy, TInfo.Width.getQuantity()),
        llvm::ConstantInt::get(SizeTy, Offset.getQuantity()),
    };
    llvm::CallBase *CB = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args);
    llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0);
    llvm::Value *CBZero = CGF.Builder.CreateICmpEQ(CB, Zero);
    llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next");
    CGF.Builder.CreateCondBr(CBZero, NextBlock, EndBlock);
    CGF.EmitBlock(NextBlock);
    Offset += TInfo.Width;
  }

  // Emit the call to cudaLaunch
  llvm::FunctionCallee cudaLaunchFn = getLaunchFn();
  llvm::Value *Arg = CGF.Builder.CreatePointerCast(CGF.CurFn, CharPtrTy);
  CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg);
  CGF.EmitBranch(EndBlock);

  CGF.EmitBlock(EndBlock);
}

// Replace the original variable Var with the address loaded from variable
// ManagedVar populated by the HIP runtime.
static void replaceManagedVar(llvm::GlobalVariable *Var,
                              llvm::GlobalVariable *ManagedVar) {
  SmallVector<SmallVector<llvm::User *, 8>, 8> WorkList;
  for (auto &&VarUse : Var->uses()) {
    WorkList.push_back({VarUse.getUser()});
  }
  while (!WorkList.empty()) {
    auto &&WorkItem = WorkList.pop_back_val();
    auto *U = WorkItem.back();
    if (isa<llvm::ConstantExpr>(U)) {
      for (auto &&UU : U->uses()) {
        WorkItem.push_back(UU.getUser());
        WorkList.push_back(WorkItem);
        WorkItem.pop_back();
      }
      continue;
    }
    if (auto *I = dyn_cast<llvm::Instruction>(U)) {
      llvm::Value *OldV = Var;
      llvm::Instruction *NewV =
          new llvm::LoadInst(Var->getType(), ManagedVar, "ld.managed", false,
                             llvm::Align(Var->getAlignment()), I);
      WorkItem.pop_back();
      // Replace constant expressions directly or indirectly using the managed
      // variable with instructions.
      for (auto &&Op : WorkItem) {
        auto *CE = cast<llvm::ConstantExpr>(Op);
        auto *NewInst = llvm::createReplacementInstr(CE, I);
        NewInst->replaceUsesOfWith(OldV, NewV);
        OldV = CE;
        NewV = NewInst;
      }
      I->replaceUsesOfWith(OldV, NewV);
    } else {
      llvm_unreachable("Invalid use of managed variable");
    }
  }
}

/// Creates a function that sets up state on the host side for CUDA objects that
/// have a presence on both the host and device sides. Specifically, registers
/// the host side of kernel functions and device global variables with the CUDA
/// runtime.
/// \code
/// void __cuda_register_globals(void** GpuBinaryHandle) {
///   __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...);
///   ...
///   __cudaRegisterFunction(GpuBinaryHandle,KernelM,...);
///   __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...);
///   ...
///   __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...);
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
  // No need to register anything.
  if (EmittedKernels.empty() && DeviceVars.empty())
    return nullptr;

  llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
      getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_register_globals"), &TheModule);
  llvm::BasicBlock *EntryBB =
      llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc);
  CGBuilderTy Builder(CGM, Context);
  Builder.SetInsertPoint(EntryBB);

  // void __cudaRegisterFunction(void **, const char *, char *, const char *,
  //                             int, uint3*, uint3*, dim3*, dim3*, int*)
  llvm::Type *RegisterFuncParams[] = {
      VoidPtrPtrTy, CharPtrTy, CharPtrTy, CharPtrTy, IntTy,
      VoidPtrTy,    VoidPtrTy, VoidPtrTy, VoidPtrTy, IntTy->getPointerTo()};
  llvm::FunctionCallee RegisterFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy, RegisterFuncParams, false),
      addUnderscoredPrefixToName("RegisterFunction"));

  // Extract GpuBinaryHandle, passed as the first argument to
  // __cuda_register_globals(), and generate a __cudaRegisterFunction() call
  // for each emitted kernel.
  llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin();
  for (auto &&I : EmittedKernels) {
    llvm::Constant *KernelName =
        makeConstantString(getDeviceSideName(cast<NamedDecl>(I.D)));
    llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy);
    llvm::Value *Args[] = {
        &GpuBinaryHandlePtr,
        Builder.CreateBitCast(I.Kernel, VoidPtrTy),
        KernelName,
        KernelName,
        llvm::ConstantInt::get(IntTy, -1),
        NullPtr,
        NullPtr,
        NullPtr,
        NullPtr,
        llvm::ConstantPointerNull::get(IntTy->getPointerTo())};
    Builder.CreateCall(RegisterFunc, Args);
  }

  llvm::Type *VarSizeTy = IntTy;
  // For HIP or CUDA 9.0+, the device variable size parameter has type
  // `size_t`.
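  // (In earlier CUDA versions it was an `int`, which is why VarSizeTy
  // defaults to IntTy above.)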
  if (CGM.getLangOpts().HIP ||
      ToCudaVersion(CGM.getTarget().getSDKVersion()) >= CudaVersion::CUDA_90)
    VarSizeTy = SizeTy;

  // void __cudaRegisterVar(void **, char *, char *, const char *,
  //                        int, int, int, int)
  llvm::Type *RegisterVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy,
                                     CharPtrTy,    IntTy,     VarSizeTy,
                                     IntTy,        IntTy};
  llvm::FunctionCallee RegisterVar = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, RegisterVarParams, false),
      addUnderscoredPrefixToName("RegisterVar"));
  // void __hipRegisterManagedVar(void **, char *, char *, const char *,
  //                              size_t, unsigned)
  llvm::Type *RegisterManagedVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy,
                                            CharPtrTy,    VarSizeTy, IntTy};
  llvm::FunctionCallee RegisterManagedVar = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, RegisterManagedVarParams, false),
      addUnderscoredPrefixToName("RegisterManagedVar"));
  // void __cudaRegisterSurface(void **, const struct surfaceReference *,
  //                            const void **, const char *, int, int);
  llvm::FunctionCallee RegisterSurf = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(
          VoidTy, {VoidPtrPtrTy, VoidPtrTy, CharPtrTy, CharPtrTy, IntTy, IntTy},
          false),
      addUnderscoredPrefixToName("RegisterSurface"));
  // void __cudaRegisterTexture(void **, const struct textureReference *,
  //                            const void **, const char *, int, int, int)
  llvm::FunctionCallee RegisterTex = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(
          VoidTy,
          {VoidPtrPtrTy, VoidPtrTy, CharPtrTy, CharPtrTy, IntTy, IntTy, IntTy},
          false),
      addUnderscoredPrefixToName("RegisterTexture"));
  for (auto &&Info : DeviceVars) {
    llvm::GlobalVariable *Var = Info.Var;
    llvm::Constant *VarName = makeConstantString(getDeviceSideName(Info.D));
    switch (Info.Flags.getKind()) {
    case DeviceVarFlags::Variable: {
      uint64_t VarSize =
          CGM.getDataLayout().getTypeAllocSize(Var->getValueType());
      if (Info.Flags.isManaged()) {
        auto ManagedVar = new llvm::GlobalVariable(
            CGM.getModule(), Var->getType(),
            /*isConstant=*/false, Var->getLinkage(),
            /*Init=*/llvm::ConstantPointerNull::get(Var->getType()),
            Twine(Var->getName() + ".managed"), /*InsertBefore=*/nullptr,
            llvm::GlobalVariable::NotThreadLocal);
        ManagedVar->setDSOLocal(Var->isDSOLocal());
        ManagedVar->setVisibility(Var->getVisibility());
        replaceManagedVar(Var, ManagedVar);
        llvm::Value *Args[] = {
            &GpuBinaryHandlePtr,
            Builder.CreateBitCast(ManagedVar, VoidPtrTy),
            Builder.CreateBitCast(Var, VoidPtrTy),
            VarName,
            llvm::ConstantInt::get(VarSizeTy, VarSize),
            llvm::ConstantInt::get(IntTy, Var->getAlignment())};
        Builder.CreateCall(RegisterManagedVar, Args);
      } else {
        llvm::Value *Args[] = {
            &GpuBinaryHandlePtr,
            Builder.CreateBitCast(Var, VoidPtrTy),
            VarName,
            VarName,
            llvm::ConstantInt::get(IntTy, Info.Flags.isExtern()),
            llvm::ConstantInt::get(VarSizeTy, VarSize),
            llvm::ConstantInt::get(IntTy, Info.Flags.isConstant()),
            llvm::ConstantInt::get(IntTy, 0)};
        Builder.CreateCall(RegisterVar, Args);
      }
      break;
    }
    case DeviceVarFlags::Surface:
      Builder.CreateCall(
          RegisterSurf,
          {&GpuBinaryHandlePtr, Builder.CreateBitCast(Var, VoidPtrTy), VarName,
           VarName, llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
      break;
    case DeviceVarFlags::Texture:
      Builder.CreateCall(
          RegisterTex,
          {&GpuBinaryHandlePtr, Builder.CreateBitCast(Var, VoidPtrTy), VarName,
           VarName, llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isNormalized()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
      break;
    }
  }

  Builder.CreateRetVoid();
  return RegisterKernelsFunc;
}

/// Creates a global constructor function for the module:
///
/// For CUDA:
/// \code
/// void __cuda_module_ctor(void*) {
///   Handle = __cudaRegisterFatBinary(GpuBinaryBlob);
///   __cuda_register_globals(Handle);
/// }
/// \endcode
///
/// For HIP:
/// \code
/// void __hip_module_ctor(void*) {
///   if (__hip_gpubin_handle == 0) {
///     __hip_gpubin_handle = __hipRegisterFatBinary(GpuBinaryBlob);
///     __hip_register_globals(__hip_gpubin_handle);
///   }
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
  bool IsHIP = CGM.getLangOpts().HIP;
  bool IsCUDA = CGM.getLangOpts().CUDA;
  // No need to generate ctors/dtors if there is no GPU binary.
  StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName;
  if (CudaGpuBinaryFileName.empty() && !IsHIP)
    return nullptr;
  if ((IsHIP || (IsCUDA && !RelocatableDeviceCode)) && EmittedKernels.empty() &&
      DeviceVars.empty())
    return nullptr;

  // void __{cuda|hip}_register_globals(void* handle);
  llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
  // We always need a function to pass in as callback. Create a dummy
  // implementation if we don't need to register anything.
  if (RelocatableDeviceCode && !RegisterGlobalsFunc)
    RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy());

  // void ** __{cuda|hip}RegisterFatBinary(void *);
  llvm::FunctionCallee RegisterFatbinFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false),
      addUnderscoredPrefixToName("RegisterFatBinary"));
  // struct { int magic; int version; void *gpu_binary; void *dont_care; };
  llvm::StructType *FatbinWrapperTy =
      llvm::StructType::get(IntTy, IntTy, VoidPtrTy, VoidPtrTy);

  // Register the GPU binary with the CUDA runtime, store the returned handle
  // in a global variable, and save a reference in GpuBinaryHandle so the
  // corresponding cleanup can be emitted in the destructor on exit. Then
  // associate all known kernels with the GPU binary handle so the CUDA
  // runtime can figure out what to call on the GPU side.
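  // In C-like pseudo-code the sequence emitted below is roughly (the wrapper
  // layout matches FatbinWrapperTy above; names are illustrative):
  //   static struct { int magic; int version; void *binary; void *unused; }
  //       Wrapper = {FatMagic, 1, GpuBinaryBlob, NULL};
  //   void **Handle = __cudaRegisterFatBinary(&Wrapper);
  //   __cuda_register_globals(Handle);
  //   atexit(__cuda_module_dtor);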
  std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary = nullptr;
  if (!CudaGpuBinaryFileName.empty()) {
    llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> CudaGpuBinaryOrErr =
        llvm::MemoryBuffer::getFileOrSTDIN(CudaGpuBinaryFileName);
    if (std::error_code EC = CudaGpuBinaryOrErr.getError()) {
      CGM.getDiags().Report(diag::err_cannot_open_file)
          << CudaGpuBinaryFileName << EC.message();
      return nullptr;
    }
    CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get());
  }

  llvm::Function *ModuleCtorFunc = llvm::Function::Create(
      llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
      llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_module_ctor"), &TheModule);
  llvm::BasicBlock *CtorEntryBB =
      llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc);
  CGBuilderTy CtorBuilder(CGM, Context);

  CtorBuilder.SetInsertPoint(CtorEntryBB);

  const char *FatbinConstantName;
  const char *FatbinSectionName;
  const char *ModuleIDSectionName;
  StringRef ModuleIDPrefix;
  llvm::Constant *FatBinStr;
  unsigned FatMagic;
  if (IsHIP) {
    FatbinConstantName = ".hip_fatbin";
    FatbinSectionName = ".hipFatBinSegment";

    ModuleIDSectionName = "__hip_module_id";
    ModuleIDPrefix = "__hip_";

    if (CudaGpuBinary) {
      // If the fatbin is available from early finalization, create a string
      // literal containing the fat binary loaded from the given file.
      const unsigned HIPCodeObjectAlign = 4096;
      FatBinStr =
          makeConstantString(std::string(CudaGpuBinary->getBuffer()), "",
                             FatbinConstantName, HIPCodeObjectAlign);
    } else {
      // If the fatbin is not available, create an external symbol
      // __hip_fatbin in section .hip_fatbin. The external symbol is supposed
      // to contain the fat binary but will be populated somewhere else,
      // e.g. by lld through a linker script.
      FatBinStr = new llvm::GlobalVariable(
          CGM.getModule(), CGM.Int8Ty,
          /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr,
          "__hip_fatbin", nullptr,
          llvm::GlobalVariable::NotThreadLocal);
      cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName);
    }

    FatMagic = HIPFatMagic;
  } else {
    if (RelocatableDeviceCode)
      FatbinConstantName = CGM.getTriple().isMacOSX()
                               ? "__NV_CUDA,__nv_relfatbin"
                               : "__nv_relfatbin";
    else
      FatbinConstantName =
          CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
    // NVIDIA's cuobjdump looks for fatbins in this section.
    FatbinSectionName =
        CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";

    ModuleIDSectionName = CGM.getTriple().isMacOSX()
                              ? "__NV_CUDA,__nv_module_id"
                              : "__nv_module_id";
    ModuleIDPrefix = "__nv_";

    // For CUDA, create a string literal containing the fat binary loaded from
    // the given file.
    FatBinStr = makeConstantString(std::string(CudaGpuBinary->getBuffer()), "",
                                   FatbinConstantName, 8);
    FatMagic = CudaFatMagic;
  }

  // Create an initialized wrapper structure that points to the loaded GPU
  // binary.
  ConstantInitBuilder Builder(CGM);
  auto Values = Builder.beginStruct(FatbinWrapperTy);
  // Fatbin wrapper magic.
  Values.addInt(IntTy, FatMagic);
  // Fatbin version.
  Values.addInt(IntTy, 1);
  // Data.
  Values.add(FatBinStr);
  // Unused in fatbin v1.
  Values.add(llvm::ConstantPointerNull::get(VoidPtrTy));
  llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal(
      addUnderscoredPrefixToName("_fatbin_wrapper"), CGM.getPointerAlign(),
      /*constant*/ true);
  FatbinWrapper->setSection(FatbinSectionName);

  // There is only one HIP fat binary per linked module, however there are
  // multiple constructor functions. Make sure the fat binary is registered
  // only once. The constructor functions are executed by the dynamic loader
  // before the program gains control. The dynamic loader cannot execute the
  // constructor functions concurrently since doing that would not guarantee
  // thread safety of the loaded program. Therefore we can assume sequential
  // execution of constructor functions here.
  if (IsHIP) {
    auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage
                                 : llvm::GlobalValue::LinkOnceAnyLinkage;
    llvm::BasicBlock *IfBlock =
        llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc);
    llvm::BasicBlock *ExitBlock =
        llvm::BasicBlock::Create(Context, "exit", ModuleCtorFunc);
    // The name, size, and initialization pattern of this variable are part
    // of the HIP ABI.
    GpuBinaryHandle = new llvm::GlobalVariable(
        TheModule, VoidPtrPtrTy, /*isConstant=*/false,
        Linkage,
        /*Initializer=*/llvm::ConstantPointerNull::get(VoidPtrPtrTy),
        "__hip_gpubin_handle");
    GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
    // Prevent the weak symbols in different shared libraries from being
    // merged.
    if (Linkage != llvm::GlobalValue::InternalLinkage)
      GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility);
    Address GpuBinaryAddr(
        GpuBinaryHandle,
        CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
    {
      auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
      llvm::Constant *Zero =
          llvm::Constant::getNullValue(HandleValue->getType());
      llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue, Zero);
      CtorBuilder.CreateCondBr(EQZero, IfBlock, ExitBlock);
    }
    {
      CtorBuilder.SetInsertPoint(IfBlock);
      // GpuBinaryHandle = __hipRegisterFatBinary(&FatbinWrapper);
      llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
          RegisterFatbinFunc,
          CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
      CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryAddr);
      CtorBuilder.CreateBr(ExitBlock);
    }
    {
      CtorBuilder.SetInsertPoint(ExitBlock);
      // Call __hip_register_globals(GpuBinaryHandle);
      if (RegisterGlobalsFunc) {
        auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
        CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue);
      }
    }
  } else if (!RelocatableDeviceCode) {
    // Register the binary with the CUDA runtime. This is substantially
    // different in default mode vs. separate compilation!
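    // Here, unlike the HIP path above, registration is unconditional and the
    // handle variable is internal to this TU rather than a shared link-once
    // symbol.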
    // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
    llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
        RegisterFatbinFunc,
        CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
    GpuBinaryHandle = new llvm::GlobalVariable(
        TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage,
        llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle");
    GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
    CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
                                   CGM.getPointerAlign());

    // Call __cuda_register_globals(GpuBinaryHandle);
    if (RegisterGlobalsFunc)
      CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);

    // Call __cudaRegisterFatBinaryEnd(Handle) if this CUDA version needs it.
    if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
                           CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) {
      // void __cudaRegisterFatBinaryEnd(void **);
      llvm::FunctionCallee RegisterFatbinEndFunc = CGM.CreateRuntimeFunction(
          llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
          "__cudaRegisterFatBinaryEnd");
      CtorBuilder.CreateCall(RegisterFatbinEndFunc, RegisterFatbinCall);
    }
  } else {
    // Generate a unique module ID.
    SmallString<64> ModuleID;
    llvm::raw_svector_ostream OS(ModuleID);
    OS << ModuleIDPrefix << llvm::format("%" PRIx64, FatbinWrapper->getGUID());
    llvm::Constant *ModuleIDConstant = makeConstantString(
        std::string(ModuleID.str()), "", ModuleIDSectionName, 32);

    // Create an alias for the FatbinWrapper that nvcc will look for.
    llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage,
                              Twine("__fatbinwrap") + ModuleID, FatbinWrapper);

    // void __cudaRegisterLinkedBinary%ModuleID%(void (*)(void *), void *,
    //                                           void *, void (*)(void **))
    SmallString<128> RegisterLinkedBinaryName("__cudaRegisterLinkedBinary");
    RegisterLinkedBinaryName += ModuleID;
    llvm::FunctionCallee RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction(
        getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName);

    assert(RegisterGlobalsFunc && "Expecting at least dummy function!");
    llvm::Value *Args[] = {RegisterGlobalsFunc,
                           CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy),
                           ModuleIDConstant,
                           makeDummyFunction(getCallbackFnTy())};
    CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args);
  }

  // Create the destructor and register it with atexit() the way NVCC does.
  // Running it during the regular destructor phase worked in CUDA before 9.2
  // but results in a double free in 9.2.
  if (llvm::Function *CleanupFn = makeModuleDtorFunction()) {
    // extern "C" int atexit(void (*f)(void));
    llvm::FunctionType *AtExitTy =
        llvm::FunctionType::get(IntTy, CleanupFn->getType(), false);
    llvm::FunctionCallee AtExitFunc =
        CGM.CreateRuntimeFunction(AtExitTy, "atexit", llvm::AttributeList(),
                                  /*Local=*/true);
    CtorBuilder.CreateCall(AtExitFunc, CleanupFn);
  }

  CtorBuilder.CreateRetVoid();
  return ModuleCtorFunc;
}

/// Creates a global destructor function that unregisters the GPU code blob
/// registered by the constructor.
866 /// 867 /// For CUDA: 868 /// \code 869 /// void __cuda_module_dtor(void*) { 870 /// __cudaUnregisterFatBinary(Handle); 871 /// } 872 /// \endcode 873 /// 874 /// For HIP: 875 /// \code 876 /// void __hip_module_dtor(void*) { 877 /// if (__hip_gpubin_handle) { 878 /// __hipUnregisterFatBinary(__hip_gpubin_handle); 879 /// __hip_gpubin_handle = 0; 880 /// } 881 /// } 882 /// \endcode 883 llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() { 884 // No need for destructor if we don't have a handle to unregister. 885 if (!GpuBinaryHandle) 886 return nullptr; 887 888 // void __cudaUnregisterFatBinary(void ** handle); 889 llvm::FunctionCallee UnregisterFatbinFunc = CGM.CreateRuntimeFunction( 890 llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false), 891 addUnderscoredPrefixToName("UnregisterFatBinary")); 892 893 llvm::Function *ModuleDtorFunc = llvm::Function::Create( 894 llvm::FunctionType::get(VoidTy, VoidPtrTy, false), 895 llvm::GlobalValue::InternalLinkage, 896 addUnderscoredPrefixToName("_module_dtor"), &TheModule); 897 898 llvm::BasicBlock *DtorEntryBB = 899 llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc); 900 CGBuilderTy DtorBuilder(CGM, Context); 901 DtorBuilder.SetInsertPoint(DtorEntryBB); 902 903 Address GpuBinaryAddr(GpuBinaryHandle, CharUnits::fromQuantity( 904 GpuBinaryHandle->getAlignment())); 905 auto HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr); 906 // There is only one HIP fat binary per linked module, however there are 907 // multiple destructor functions. Make sure the fat binary is unregistered 908 // only once. 909 if (CGM.getLangOpts().HIP) { 910 llvm::BasicBlock *IfBlock = 911 llvm::BasicBlock::Create(Context, "if", ModuleDtorFunc); 912 llvm::BasicBlock *ExitBlock = 913 llvm::BasicBlock::Create(Context, "exit", ModuleDtorFunc); 914 llvm::Constant *Zero = llvm::Constant::getNullValue(HandleValue->getType()); 915 llvm::Value *NEZero = DtorBuilder.CreateICmpNE(HandleValue, Zero); 916 DtorBuilder.CreateCondBr(NEZero, IfBlock, ExitBlock); 917 918 DtorBuilder.SetInsertPoint(IfBlock); 919 DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue); 920 DtorBuilder.CreateStore(Zero, GpuBinaryAddr); 921 DtorBuilder.CreateBr(ExitBlock); 922 923 DtorBuilder.SetInsertPoint(ExitBlock); 924 } else { 925 DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue); 926 } 927 DtorBuilder.CreateRetVoid(); 928 return ModuleDtorFunc; 929 } 930 931 CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) { 932 return new CGNVCUDARuntime(CGM); 933 } 934 935 void CGNVCUDARuntime::internalizeDeviceSideVar( 936 const VarDecl *D, llvm::GlobalValue::LinkageTypes &Linkage) { 937 // For -fno-gpu-rdc, host-side shadows of external declarations of device-side 938 // global variables become internal definitions. These have to be internal in 939 // order to prevent name conflicts with global host variables with the same 940 // name in a different TUs. 941 // 942 // For -fgpu-rdc, the shadow variables should not be internalized because 943 // they may be accessed by different TU. 944 if (CGM.getLangOpts().GPURelocatableDeviceCode) 945 return; 946 947 // __shared__ variables are odd. Shadows do get created, but 948 // they are not registered with the CUDA runtime, so they 949 // can't really be used to access their device-side 950 // counterparts. It's not clear yet whether it's nvcc's bug or 951 // a feature, but we've got to do the same for compatibility. 
  if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
      D->hasAttr<CUDASharedAttr>() ||
      D->getType()->isCUDADeviceBuiltinSurfaceType() ||
      D->getType()->isCUDADeviceBuiltinTextureType()) {
    Linkage = llvm::GlobalValue::InternalLinkage;
  }
}

void CGNVCUDARuntime::handleVarRegistration(const VarDecl *D,
                                            llvm::GlobalVariable &GV) {
  if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>()) {
    // Shadow variables and their properties must be registered with the CUDA
    // runtime. Skip extern global variables, which will be registered in
    // the TU where they are defined.
    //
    // Don't register a C++17 inline variable. The local symbol can be
    // discarded and referencing a discarded local symbol from outside the
    // comdat (__cuda_register_globals) is disallowed by the ELF spec.
    // TODO: Reject __device__ constexpr and __device__ inline in Sema.
    if (!D->hasExternalStorage() && !D->isInline())
      registerDeviceVar(D, GV, !D->hasDefinition(),
                        D->hasAttr<CUDAConstantAttr>());
  } else if (D->getType()->isCUDADeviceBuiltinSurfaceType() ||
             D->getType()->isCUDADeviceBuiltinTextureType()) {
    // Builtin surfaces and textures and their template arguments are
    // also registered with the CUDA runtime.
    const ClassTemplateSpecializationDecl *TD =
        cast<ClassTemplateSpecializationDecl>(
            D->getType()->getAs<RecordType>()->getDecl());
    const TemplateArgumentList &Args = TD->getTemplateArgs();
    if (TD->hasAttr<CUDADeviceBuiltinSurfaceTypeAttr>()) {
      assert(Args.size() == 2 &&
             "Unexpected number of template arguments of CUDA device "
             "builtin surface type.");
      auto SurfType = Args[1].getAsIntegral();
      if (!D->hasExternalStorage())
        registerDeviceSurf(D, GV, !D->hasDefinition(), SurfType.getSExtValue());
    } else {
      assert(Args.size() == 3 &&
             "Unexpected number of template arguments of CUDA device "
             "builtin texture type.");
      auto TexType = Args[1].getAsIntegral();
      auto Normalized = Args[2].getAsIntegral();
      if (!D->hasExternalStorage())
        registerDeviceTex(D, GV, !D->hasDefinition(), TexType.getSExtValue(),
                          Normalized.getZExtValue());
    }
  }
}