//===----- CGCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This provides a class for CUDA code generation targeting the NVIDIA CUDA
// runtime library.
//
//===----------------------------------------------------------------------===//

#include "CGCUDARuntime.h"
#include "CodeGenFunction.h"
#include "CodeGenModule.h"
#include "clang/AST/Decl.h"
#include "clang/Basic/Cuda.h"
#include "clang/CodeGen/CodeGenABITypes.h"
#include "clang/CodeGen/ConstantInitBuilder.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/ReplaceConstant.h"
#include "llvm/Support/Format.h"

using namespace clang;
using namespace CodeGen;

namespace {
/// Magic number stored in the first field of a CUDA fatbin wrapper struct so
/// the CUDA runtime recognizes it.
constexpr unsigned CudaFatMagic = 0x466243b1;
/// Magic number for the HIP fatbin wrapper struct.
constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF"

/// Host-side code generation against the NVIDIA CUDA (or AMD HIP) runtime:
/// emits kernel launch stubs plus the module constructor/destructor that
/// register the embedded GPU binary and its kernels/variables.
class CGNVCUDARuntime : public CGCUDARuntime {

private:
  /// Frequently used LLVM types, cached from CodeGenModule in the ctor.
  llvm::IntegerType *IntTy, *SizeTy;
  llvm::Type *VoidTy;
  llvm::PointerType *CharPtrTy, *VoidPtrTy, *VoidPtrPtrTy;

  /// Convenience reference to LLVM Context
  llvm::LLVMContext &Context;
  /// Convenience reference to the current module
  llvm::Module &TheModule;
  /// Keeps track of kernel launch stubs emitted in this module
  struct KernelInfo {
    llvm::Function *Kernel; // Host-side stub function emitted for the kernel.
    const Decl *D;          // The kernel's declaration.
  };
  llvm::SmallVector<KernelInfo, 16> EmittedKernels;
  /// Device-side global variables (plus surfaces/textures) that must be
  /// registered with the runtime by __cuda_register_globals().
  struct VarInfo {
    llvm::GlobalVariable *Var;
    const VarDecl *D;
    DeviceVarFlags Flags; // Kind (variable/surface/texture) and attributes.
  };
  llvm::SmallVector<VarInfo, 16> DeviceVars;
  /// Keeps track of variable containing handle of GPU binary. Populated by
  /// ModuleCtorFunction() and used to create corresponding cleanup calls in
  /// ModuleDtorFunction()
  llvm::GlobalVariable *GpuBinaryHandle = nullptr;
  /// Whether we generate relocatable device code.
  bool RelocatableDeviceCode;
  /// Mangle context for device.
  std::unique_ptr<MangleContext> DeviceMC;

  llvm::FunctionCallee getSetupArgumentFn() const;
  llvm::FunctionCallee getLaunchFn() const;

  llvm::FunctionType *getRegisterGlobalsFnTy() const;
  llvm::FunctionType *getCallbackFnTy() const;
  llvm::FunctionType *getRegisterLinkedBinaryFnTy() const;
  /// Prefix FuncName with "hip" (HIP mode) or "cuda" (CUDA mode).
  std::string addPrefixToName(StringRef FuncName) const;
  /// Prefix FuncName with "__hip" (HIP mode) or "__cuda" (CUDA mode).
  std::string addUnderscoredPrefixToName(StringRef FuncName) const;

  /// Creates a function to register all kernel stubs generated in this module.
  llvm::Function *makeRegisterGlobalsFn();

  /// Helper function that generates a constant string and returns a pointer to
  /// the start of the string. The result of this function can be used anywhere
  /// where the C code specifies const char*.
  llvm::Constant *makeConstantString(const std::string &Str,
                                     const std::string &Name = "",
                                     const std::string &SectionName = "",
                                     unsigned Alignment = 0) {
    llvm::Constant *Zeros[] = {llvm::ConstantInt::get(SizeTy, 0),
                               llvm::ConstantInt::get(SizeTy, 0)};
    auto ConstStr = CGM.GetAddrOfConstantCString(Str, Name.c_str());
    llvm::GlobalVariable *GV =
        cast<llvm::GlobalVariable>(ConstStr.getPointer());
    if (!SectionName.empty()) {
      GV->setSection(SectionName);
      // Mark the address as used which make sure that this section isn't
      // merged and we will really have it in the object file.
      GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None);
    }
    if (Alignment)
      GV->setAlignment(llvm::Align(Alignment));

    // GEP past the array type to the first character, yielding a const char*.
    return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(),
                                                ConstStr.getPointer(), Zeros);
  }

  /// Helper function that generates an empty dummy function returning void.
  llvm::Function *makeDummyFunction(llvm::FunctionType *FnTy) {
    assert(FnTy->getReturnType()->isVoidTy() &&
           "Can only generate dummy functions returning void!");
    llvm::Function *DummyFunc = llvm::Function::Create(
        FnTy, llvm::GlobalValue::InternalLinkage, "dummy", &TheModule);

    llvm::BasicBlock *DummyBlock =
        llvm::BasicBlock::Create(Context, "", DummyFunc);
    CGBuilderTy FuncBuilder(CGM, Context);
    FuncBuilder.SetInsertPoint(DummyBlock);
    FuncBuilder.CreateRetVoid();

    return DummyFunc;
  }

  void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args);
  void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args);
  std::string getDeviceSideName(const NamedDecl *ND) override;

  /// Record a device variable (__device__/__constant__/managed) for later
  /// registration by __cuda_register_globals().
  void registerDeviceVar(const VarDecl *VD, llvm::GlobalVariable &Var,
                         bool Extern, bool Constant) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Variable, Extern, Constant,
                           VD->hasAttr<HIPManagedAttr>(),
                           /*Normalized*/ false, 0}});
  }
  /// Record a surface reference for later registration.
  void registerDeviceSurf(const VarDecl *VD, llvm::GlobalVariable &Var,
                          bool Extern, int Type) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Surface, Extern, /*Constant*/ false,
                           /*Managed*/ false,
                           /*Normalized*/ false, Type}});
  }
  /// Record a texture reference for later registration.
  void registerDeviceTex(const VarDecl *VD, llvm::GlobalVariable &Var,
                         bool Extern, int Type, bool Normalized) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Texture, Extern, /*Constant*/ false,
                           /*Managed*/ false, Normalized, Type}});
  }

  /// Creates module constructor function
  llvm::Function *makeModuleCtorFunction();
  /// Creates module destructor function
  llvm::Function *makeModuleDtorFunction();
  /// Transform managed variables for device compilation.
  void transformManagedVars();

public:
  CGNVCUDARuntime(CodeGenModule &CGM);

  void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override;
  void handleVarRegistration(const VarDecl *VD,
                             llvm::GlobalVariable &Var) override;
  void
  internalizeDeviceSideVar(const VarDecl *D,
                           llvm::GlobalValue::LinkageTypes &Linkage) override;

  llvm::Function *finalizeModule() override;
};

} // end anonymous namespace

std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const {
  if (CGM.getLangOpts().HIP)
    return ((Twine("hip") + Twine(FuncName)).str());
  return ((Twine("cuda") + Twine(FuncName)).str());
}
std::string
CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const {
  if (CGM.getLangOpts().HIP)
    return ((Twine("__hip") + Twine(FuncName)).str());
  return ((Twine("__cuda") + Twine(FuncName)).str());
}

CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
    : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()),
      TheModule(CGM.getModule()),
      RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode),
      DeviceMC(CGM.getContext().createMangleContext(
          CGM.getContext().getAuxTargetInfo())) {
  CodeGen::CodeGenTypes &Types = CGM.getTypes();
  ASTContext &Ctx = CGM.getContext();

  // Cache commonly used LLVM types.
  IntTy = CGM.IntTy;
  SizeTy = CGM.SizeTy;
  VoidTy = CGM.VoidTy;

  CharPtrTy = llvm::PointerType::getUnqual(Types.ConvertType(Ctx.CharTy));
  VoidPtrTy = cast<llvm::PointerType>(Types.ConvertType(Ctx.VoidPtrTy));
  VoidPtrPtrTy = VoidPtrTy->getPointerTo();
  if (CGM.getContext().getAuxTargetInfo()) {
    // If the host and device have different C++ ABIs, mark it as the device
    // mangle context so that the mangling needs to retrieve the additional
    // device lambda mangling number instead of the regular host one.
    DeviceMC->setDeviceMangleContext(
        CGM.getContext().getTargetInfo().getCXXABI().isMicrosoft() &&
        CGM.getContext().getAuxTargetInfo()->getCXXABI().isItaniumFamily());
  }
}

llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const {
  // cudaError_t cudaSetupArgument(void *, size_t, size_t)
  llvm::Type *Params[] = {VoidPtrTy, SizeTy, SizeTy};
  return CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy, Params, false),
      addPrefixToName("SetupArgument"));
}

llvm::FunctionCallee CGNVCUDARuntime::getLaunchFn() const {
  if (CGM.getLangOpts().HIP) {
    // hipError_t hipLaunchByPtr(char *);
    return CGM.CreateRuntimeFunction(
        llvm::FunctionType::get(IntTy, CharPtrTy, false), "hipLaunchByPtr");
  } else {
    // cudaError_t cudaLaunch(char *);
    return CGM.CreateRuntimeFunction(
        llvm::FunctionType::get(IntTy, CharPtrTy, false), "cudaLaunch");
  }
}

// Type of __cuda_register_globals(): void (void **GpuBinaryHandle).
llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy() const {
  return llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false);
}

// Type of the callback passed to __cudaRegisterLinkedBinary: void (void *).
llvm::FunctionType *CGNVCUDARuntime::getCallbackFnTy() const {
  return llvm::FunctionType::get(VoidTy, VoidPtrTy, false);
}

llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy() const {
  auto CallbackFnTy = getCallbackFnTy();
  auto RegisterGlobalsFnTy = getRegisterGlobalsFnTy();
  llvm::Type *Params[] = {RegisterGlobalsFnTy->getPointerTo(), VoidPtrTy,
                          VoidPtrTy, CallbackFnTy->getPointerTo()};
  return llvm::FunctionType::get(VoidTy, Params, false);
}

std::string CGNVCUDARuntime::getDeviceSideName(const NamedDecl *ND) {
  GlobalDecl GD;
  // D could be either a kernel or a variable.
  if (auto *FD = dyn_cast<FunctionDecl>(ND))
    GD = GlobalDecl(FD, KernelReferenceKind::Kernel);
  else
    GD = GlobalDecl(ND);
  std::string DeviceSideName;
  if (DeviceMC->shouldMangleDeclName(ND)) {
    // Mangle with the device-side mangle context so the name matches what the
    // device compilation emitted.
    SmallString<256> Buffer;
    llvm::raw_svector_ostream Out(Buffer);
    DeviceMC->mangleName(GD, Out);
    DeviceSideName = std::string(Out.str());
  } else
    DeviceSideName = std::string(ND->getIdentifier()->getName());

  // Make unique name for device side static file-scope variable for HIP.
  if (CGM.getContext().shouldExternalizeStaticVar(ND) &&
      CGM.getLangOpts().GPURelocatableDeviceCode &&
      !CGM.getLangOpts().CUID.empty()) {
    SmallString<256> Buffer;
    llvm::raw_svector_ostream Out(Buffer);
    Out << DeviceSideName;
    CGM.printPostfixForExternalizedStaticVar(Out);
    DeviceSideName = std::string(Out.str());
  }
  return DeviceSideName;
}

// Emit the body of the host-side launch stub for a kernel, choosing the
// new-style (cudaLaunchKernel) or legacy (cudaSetupArgument/cudaLaunch) flow.
void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
                                     FunctionArgList &Args) {
  EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl});
  if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
                         CudaFeature::CUDA_USES_NEW_LAUNCH) ||
      (CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI))
    emitDeviceStubBodyNew(CGF, Args);
  else
    emitDeviceStubBodyLegacy(CGF, Args);
}

// CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local
// array and kernels are launched using cudaLaunchKernel().
void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
                                            FunctionArgList &Args) {
  // Build the shadow stack entry at the very start of the function.

  // Calculate amount of space we will need for all arguments. If we have no
  // args, allocate a single pointer so we still have a valid pointer to the
  // argument array that we can pass to runtime, even if it will be unused.
  Address KernelArgs = CGF.CreateTempAlloca(
      VoidPtrTy, CharUnits::fromQuantity(16), "kernel_args",
      llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size())));
  // Store pointers to the arguments in a locally allocated launch_args.
  for (unsigned i = 0; i < Args.size(); ++i) {
    llvm::Value* VarPtr = CGF.GetAddrOfLocalVar(Args[i]).getPointer();
    llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(VarPtr, VoidPtrTy);
    CGF.Builder.CreateDefaultAlignedStore(
        VoidVarPtr, CGF.Builder.CreateConstGEP1_32(KernelArgs.getPointer(), i));
  }

  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");

  // Lookup cudaLaunchKernel/hipLaunchKernel function.
  // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
  //                              void **args, size_t sharedMem,
  //                              cudaStream_t stream);
  // hipError_t hipLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
  //                            void **args, size_t sharedMem,
  //                            hipStream_t stream);
  // The declaration is expected to come from the CUDA/HIP headers; we look it
  // up in the translation unit rather than hard-coding the signature.
  TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl();
  DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl);
  auto LaunchKernelName = addPrefixToName("LaunchKernel");
  IdentifierInfo &cudaLaunchKernelII =
      CGM.getContext().Idents.get(LaunchKernelName);
  FunctionDecl *cudaLaunchKernelFD = nullptr;
  for (const auto &Result : DC->lookup(&cudaLaunchKernelII)) {
    if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Result))
      cudaLaunchKernelFD = FD;
  }

  if (cudaLaunchKernelFD == nullptr) {
    CGM.Error(CGF.CurFuncDecl->getLocation(),
              "Can't find declaration for " + LaunchKernelName);
    return;
  }
  // Create temporary dim3 grid_dim, block_dim.
  ParmVarDecl *GridDimParam = cudaLaunchKernelFD->getParamDecl(1);
  QualType Dim3Ty = GridDimParam->getType();
  Address GridDim =
      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "grid_dim");
  Address BlockDim =
      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim");
  Address ShmemSize =
      CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size");
  Address Stream =
      CGF.CreateTempAlloca(VoidPtrTy, CGM.getPointerAlign(), "stream");
  // int __cudaPopCallConfiguration(dim3 *, dim3 *, size_t *, void **):
  // retrieves the launch configuration pushed by the <<<...>>> caller.
  llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy,
                              {/*gridDim=*/GridDim.getType(),
                               /*blockDim=*/BlockDim.getType(),
                               /*ShmemSize=*/ShmemSize.getType(),
                               /*Stream=*/Stream.getType()},
                              /*isVarArg=*/false),
      addUnderscoredPrefixToName("PopCallConfiguration"));

  CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn,
                              {GridDim.getPointer(), BlockDim.getPointer(),
                               ShmemSize.getPointer(), Stream.getPointer()});

  // Emit the call to cudaLaunch
  llvm::Value *Kernel = CGF.Builder.CreatePointerCast(CGF.CurFn, VoidPtrTy);
  CallArgList LaunchKernelArgs;
  LaunchKernelArgs.add(RValue::get(Kernel),
                       cudaLaunchKernelFD->getParamDecl(0)->getType());
  LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty);
  LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty);
  LaunchKernelArgs.add(RValue::get(KernelArgs.getPointer()),
                       cudaLaunchKernelFD->getParamDecl(3)->getType());
  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(ShmemSize)),
                       cudaLaunchKernelFD->getParamDecl(4)->getType());
  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(Stream)),
                       cudaLaunchKernelFD->getParamDecl(5)->getType());

  QualType QT = cudaLaunchKernelFD->getType();
  QualType CQT = QT.getCanonicalType();
  llvm::Type *Ty = CGM.getTypes().ConvertType(CQT);
  llvm::FunctionType *FTy = dyn_cast<llvm::FunctionType>(Ty);

  const CGFunctionInfo &FI =
      CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD);
  llvm::FunctionCallee cudaLaunchKernelFn =
      CGM.CreateRuntimeFunction(FTy, LaunchKernelName);
  CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(),
               LaunchKernelArgs);
  CGF.EmitBranch(EndBlock);

  CGF.EmitBlock(EndBlock);
}

// Legacy (pre-CUDA-9.0) launch: one cudaSetupArgument() call per kernel
// argument, then cudaLaunch(). Any failing cudaSetupArgument short-circuits
// to the end block, skipping the remaining setup and the launch.
void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF,
                                               FunctionArgList &Args) {
  // Emit a call to cudaSetupArgument for each arg in Args.
  llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn();
  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
  CharUnits Offset = CharUnits::Zero();
  for (const VarDecl *A : Args) {
    auto TInfo = CGM.getContext().getTypeInfoInChars(A->getType());
    // Arguments are laid out at their natural alignment in the shadow stack.
    Offset = Offset.alignTo(TInfo.Align);
    llvm::Value *Args[] = {
        CGF.Builder.CreatePointerCast(CGF.GetAddrOfLocalVar(A).getPointer(),
                                      VoidPtrTy),
        llvm::ConstantInt::get(SizeTy, TInfo.Width.getQuantity()),
        llvm::ConstantInt::get(SizeTy, Offset.getQuantity()),
    };
    llvm::CallBase *CB = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args);
    // Continue to the next argument only if setup returned cudaSuccess (0).
    llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0);
    llvm::Value *CBZero = CGF.Builder.CreateICmpEQ(CB, Zero);
    llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next");
    CGF.Builder.CreateCondBr(CBZero, NextBlock, EndBlock);
    CGF.EmitBlock(NextBlock);
    Offset += TInfo.Width;
  }

  // Emit the call to cudaLaunch
  llvm::FunctionCallee cudaLaunchFn = getLaunchFn();
  llvm::Value *Arg = CGF.Builder.CreatePointerCast(CGF.CurFn, CharPtrTy);
  CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg);
  CGF.EmitBranch(EndBlock);

  CGF.EmitBlock(EndBlock);
}

// Replace the original variable Var with the address loaded from variable
// ManagedVar populated by HIP runtime.
static void replaceManagedVar(llvm::GlobalVariable *Var,
                              llvm::GlobalVariable *ManagedVar) {
  // Each worklist entry is a chain of users: the last element is the user
  // currently being examined, and any earlier elements are the constant
  // expressions through which Var is (indirectly) referenced by it.
  SmallVector<SmallVector<llvm::User *, 8>, 8> WorkList;
  for (auto &&VarUse : Var->uses()) {
    WorkList.push_back({VarUse.getUser()});
  }
  while (!WorkList.empty()) {
    auto &&WorkItem = WorkList.pop_back_val();
    auto *U = WorkItem.back();
    if (isa<llvm::ConstantExpr>(U)) {
      // Descend through the constant expression: queue one work item per user
      // of U, each extending the current chain by one link.
      for (auto &&UU : U->uses()) {
        WorkItem.push_back(UU.getUser());
        WorkList.push_back(WorkItem);
        WorkItem.pop_back();
      }
      continue;
    }
    if (auto *I = dyn_cast<llvm::Instruction>(U)) {
      // Load the real address from ManagedVar immediately before the using
      // instruction, then rebuild the chain of constant expressions as
      // instructions operating on that loaded address.
      llvm::Value *OldV = Var;
      llvm::Instruction *NewV =
          new llvm::LoadInst(Var->getType(), ManagedVar, "ld.managed", false,
                             llvm::Align(Var->getAlignment()), I);
      WorkItem.pop_back();
      // Replace constant expressions directly or indirectly using the managed
      // variable with instructions.
      for (auto &&Op : WorkItem) {
        auto *CE = cast<llvm::ConstantExpr>(Op);
        auto *NewInst = llvm::createReplacementInstr(CE, I);
        NewInst->replaceUsesOfWith(OldV, NewV);
        OldV = CE;
        NewV = NewInst;
      }
      I->replaceUsesOfWith(OldV, NewV);
    } else {
      llvm_unreachable("Invalid use of managed variable");
    }
  }
}

/// Creates a function that sets up state on the host side for CUDA objects that
/// have a presence on both the host and device sides. Specifically, registers
/// the host side of kernel functions and device global variables with the CUDA
/// runtime.
/// \code
/// void __cuda_register_globals(void** GpuBinaryHandle) {
///    __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...);
///    ...
///    __cudaRegisterFunction(GpuBinaryHandle,KernelM,...);
///    __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...);
///    ...
///    __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...);
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
  // No need to register anything
  if (EmittedKernels.empty() && DeviceVars.empty())
    return nullptr;

  llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
      getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_register_globals"), &TheModule);
  llvm::BasicBlock *EntryBB =
      llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc);
  CGBuilderTy Builder(CGM, Context);
  Builder.SetInsertPoint(EntryBB);

  // void __cudaRegisterFunction(void **, const char *, char *, const char *,
  //                             int, uint3*, uint3*, dim3*, dim3*, int*)
  llvm::Type *RegisterFuncParams[] = {
      VoidPtrPtrTy, CharPtrTy, CharPtrTy, CharPtrTy, IntTy,
      VoidPtrTy,    VoidPtrTy, VoidPtrTy, VoidPtrTy, IntTy->getPointerTo()};
  llvm::FunctionCallee RegisterFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy, RegisterFuncParams, false),
      addUnderscoredPrefixToName("RegisterFunction"));

  // Extract GpuBinaryHandle passed as the first argument passed to
  // __cuda_register_globals() and generate __cudaRegisterFunction() call for
  // each emitted kernel.
  llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin();
  for (auto &&I : EmittedKernels) {
    llvm::Constant *KernelName =
        makeConstantString(getDeviceSideName(cast<NamedDecl>(I.D)));
    llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy);
    llvm::Value *Args[] = {
        &GpuBinaryHandlePtr,
        Builder.CreateBitCast(I.Kernel, VoidPtrTy),
        KernelName,
        KernelName,
        llvm::ConstantInt::get(IntTy, -1),
        NullPtr,
        NullPtr,
        NullPtr,
        NullPtr,
        llvm::ConstantPointerNull::get(IntTy->getPointerTo())};
    Builder.CreateCall(RegisterFunc, Args);
  }

  llvm::Type *VarSizeTy = IntTy;
  // For HIP or CUDA 9.0+, device variable size is type of `size_t`.
  if (CGM.getLangOpts().HIP ||
      ToCudaVersion(CGM.getTarget().getSDKVersion()) >= CudaVersion::CUDA_90)
    VarSizeTy = SizeTy;

  // void __cudaRegisterVar(void **, char *, char *, const char *,
  //                        int, int, int, int)
  llvm::Type *RegisterVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy,
                                     CharPtrTy,    IntTy,     VarSizeTy,
                                     IntTy,        IntTy};
  llvm::FunctionCallee RegisterVar = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, RegisterVarParams, false),
      addUnderscoredPrefixToName("RegisterVar"));
  // void __hipRegisterManagedVar(void **, char *, char *, const char *,
  //                              size_t, unsigned)
  llvm::Type *RegisterManagedVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy,
                                            CharPtrTy,    VarSizeTy, IntTy};
  llvm::FunctionCallee RegisterManagedVar = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, RegisterManagedVarParams, false),
      addUnderscoredPrefixToName("RegisterManagedVar"));
  // void __cudaRegisterSurface(void **, const struct surfaceReference *,
  //                            const void **, const char *, int, int);
  llvm::FunctionCallee RegisterSurf = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(
          VoidTy, {VoidPtrPtrTy, VoidPtrTy, CharPtrTy, CharPtrTy, IntTy, IntTy},
          false),
      addUnderscoredPrefixToName("RegisterSurface"));
  // void __cudaRegisterTexture(void **, const struct textureReference *,
  //                            const void **, const char *, int, int, int)
  llvm::FunctionCallee RegisterTex = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(
          VoidTy,
          {VoidPtrPtrTy, VoidPtrTy, CharPtrTy, CharPtrTy, IntTy, IntTy, IntTy},
          false),
      addUnderscoredPrefixToName("RegisterTexture"));
  for (auto &&Info : DeviceVars) {
    llvm::GlobalVariable *Var = Info.Var;
    assert((!Var->isDeclaration() || Info.Flags.isManaged()) &&
           "External variables should not show up here, except HIP managed "
           "variables");
    llvm::Constant *VarName = makeConstantString(getDeviceSideName(Info.D));
    switch (Info.Flags.getKind()) {
    case DeviceVarFlags::Variable: {
      uint64_t VarSize =
          CGM.getDataLayout().getTypeAllocSize(Var->getValueType());
      if (Info.Flags.isManaged()) {
        // For a managed variable, create a shadow pointer variable that the
        // runtime populates with the device address, take over the original
        // name, and rewrite all uses of Var to load through the shadow.
        auto ManagedVar = new llvm::GlobalVariable(
            CGM.getModule(), Var->getType(),
            /*isConstant=*/false, Var->getLinkage(),
            /*Init=*/Var->isDeclaration()
                ? nullptr
                : llvm::ConstantPointerNull::get(Var->getType()),
            /*Name=*/"", /*InsertBefore=*/nullptr,
            llvm::GlobalVariable::NotThreadLocal);
        ManagedVar->setDSOLocal(Var->isDSOLocal());
        ManagedVar->setVisibility(Var->getVisibility());
        ManagedVar->setExternallyInitialized(true);
        ManagedVar->takeName(Var);
        Var->setName(Twine(ManagedVar->getName() + ".managed"));
        replaceManagedVar(Var, ManagedVar);
        llvm::Value *Args[] = {
            &GpuBinaryHandlePtr,
            Builder.CreateBitCast(ManagedVar, VoidPtrTy),
            Builder.CreateBitCast(Var, VoidPtrTy),
            VarName,
            llvm::ConstantInt::get(VarSizeTy, VarSize),
            llvm::ConstantInt::get(IntTy, Var->getAlignment())};
        // Declarations are registered elsewhere (in the TU that defines them).
        if (!Var->isDeclaration())
          Builder.CreateCall(RegisterManagedVar, Args);
      } else {
        llvm::Value *Args[] = {
            &GpuBinaryHandlePtr,
            Builder.CreateBitCast(Var, VoidPtrTy),
            VarName,
            VarName,
            llvm::ConstantInt::get(IntTy, Info.Flags.isExtern()),
            llvm::ConstantInt::get(VarSizeTy, VarSize),
            llvm::ConstantInt::get(IntTy, Info.Flags.isConstant()),
            llvm::ConstantInt::get(IntTy, 0)};
        Builder.CreateCall(RegisterVar, Args);
      }
      break;
    }
    case DeviceVarFlags::Surface:
      Builder.CreateCall(
          RegisterSurf,
          {&GpuBinaryHandlePtr, Builder.CreateBitCast(Var, VoidPtrTy), VarName,
           VarName, llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
      break;
    case DeviceVarFlags::Texture:
      Builder.CreateCall(
          RegisterTex,
          {&GpuBinaryHandlePtr, Builder.CreateBitCast(Var, VoidPtrTy), VarName,
           VarName, llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isNormalized()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
      break;
    }
  }

  Builder.CreateRetVoid();
  return RegisterKernelsFunc;
}

/// Creates a global constructor function for the module:
///
/// For
CUDA: 623 /// \code 624 /// void __cuda_module_ctor(void*) { 625 /// Handle = __cudaRegisterFatBinary(GpuBinaryBlob); 626 /// __cuda_register_globals(Handle); 627 /// } 628 /// \endcode 629 /// 630 /// For HIP: 631 /// \code 632 /// void __hip_module_ctor(void*) { 633 /// if (__hip_gpubin_handle == 0) { 634 /// __hip_gpubin_handle = __hipRegisterFatBinary(GpuBinaryBlob); 635 /// __hip_register_globals(__hip_gpubin_handle); 636 /// } 637 /// } 638 /// \endcode 639 llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { 640 bool IsHIP = CGM.getLangOpts().HIP; 641 bool IsCUDA = CGM.getLangOpts().CUDA; 642 // No need to generate ctors/dtors if there is no GPU binary. 643 StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName; 644 if (CudaGpuBinaryFileName.empty() && !IsHIP) 645 return nullptr; 646 if ((IsHIP || (IsCUDA && !RelocatableDeviceCode)) && EmittedKernels.empty() && 647 DeviceVars.empty()) 648 return nullptr; 649 650 // void __{cuda|hip}_register_globals(void* handle); 651 llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn(); 652 // We always need a function to pass in as callback. Create a dummy 653 // implementation if we don't need to register anything. 654 if (RelocatableDeviceCode && !RegisterGlobalsFunc) 655 RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy()); 656 657 // void ** __{cuda|hip}RegisterFatBinary(void *); 658 llvm::FunctionCallee RegisterFatbinFunc = CGM.CreateRuntimeFunction( 659 llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false), 660 addUnderscoredPrefixToName("RegisterFatBinary")); 661 // struct { int magic, int version, void * gpu_binary, void * dont_care }; 662 llvm::StructType *FatbinWrapperTy = 663 llvm::StructType::get(IntTy, IntTy, VoidPtrTy, VoidPtrTy); 664 665 // Register GPU binary with the CUDA runtime, store returned handle in a 666 // global variable and save a reference in GpuBinaryHandle to be cleaned up 667 // in destructor on exit. 
Then associate all known kernels with the GPU binary 668 // handle so CUDA runtime can figure out what to call on the GPU side. 669 std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary = nullptr; 670 if (!CudaGpuBinaryFileName.empty()) { 671 llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> CudaGpuBinaryOrErr = 672 llvm::MemoryBuffer::getFileOrSTDIN(CudaGpuBinaryFileName); 673 if (std::error_code EC = CudaGpuBinaryOrErr.getError()) { 674 CGM.getDiags().Report(diag::err_cannot_open_file) 675 << CudaGpuBinaryFileName << EC.message(); 676 return nullptr; 677 } 678 CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get()); 679 } 680 681 llvm::Function *ModuleCtorFunc = llvm::Function::Create( 682 llvm::FunctionType::get(VoidTy, VoidPtrTy, false), 683 llvm::GlobalValue::InternalLinkage, 684 addUnderscoredPrefixToName("_module_ctor"), &TheModule); 685 llvm::BasicBlock *CtorEntryBB = 686 llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc); 687 CGBuilderTy CtorBuilder(CGM, Context); 688 689 CtorBuilder.SetInsertPoint(CtorEntryBB); 690 691 const char *FatbinConstantName; 692 const char *FatbinSectionName; 693 const char *ModuleIDSectionName; 694 StringRef ModuleIDPrefix; 695 llvm::Constant *FatBinStr; 696 unsigned FatMagic; 697 if (IsHIP) { 698 FatbinConstantName = ".hip_fatbin"; 699 FatbinSectionName = ".hipFatBinSegment"; 700 701 ModuleIDSectionName = "__hip_module_id"; 702 ModuleIDPrefix = "__hip_"; 703 704 if (CudaGpuBinary) { 705 // If fatbin is available from early finalization, create a string 706 // literal containing the fat binary loaded from the given file. 707 const unsigned HIPCodeObjectAlign = 4096; 708 FatBinStr = 709 makeConstantString(std::string(CudaGpuBinary->getBuffer()), "", 710 FatbinConstantName, HIPCodeObjectAlign); 711 } else { 712 // If fatbin is not available, create an external symbol 713 // __hip_fatbin in section .hip_fatbin. The external symbol is supposed 714 // to contain the fat binary but will be populated somewhere else, 715 // e.g. 
by lld through link script. 716 FatBinStr = new llvm::GlobalVariable( 717 CGM.getModule(), CGM.Int8Ty, 718 /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr, 719 "__hip_fatbin", nullptr, 720 llvm::GlobalVariable::NotThreadLocal); 721 cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName); 722 } 723 724 FatMagic = HIPFatMagic; 725 } else { 726 if (RelocatableDeviceCode) 727 FatbinConstantName = CGM.getTriple().isMacOSX() 728 ? "__NV_CUDA,__nv_relfatbin" 729 : "__nv_relfatbin"; 730 else 731 FatbinConstantName = 732 CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin"; 733 // NVIDIA's cuobjdump looks for fatbins in this section. 734 FatbinSectionName = 735 CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment"; 736 737 ModuleIDSectionName = CGM.getTriple().isMacOSX() 738 ? "__NV_CUDA,__nv_module_id" 739 : "__nv_module_id"; 740 ModuleIDPrefix = "__nv_"; 741 742 // For CUDA, create a string literal containing the fat binary loaded from 743 // the given file. 744 FatBinStr = makeConstantString(std::string(CudaGpuBinary->getBuffer()), "", 745 FatbinConstantName, 8); 746 FatMagic = CudaFatMagic; 747 } 748 749 // Create initialized wrapper structure that points to the loaded GPU binary 750 ConstantInitBuilder Builder(CGM); 751 auto Values = Builder.beginStruct(FatbinWrapperTy); 752 // Fatbin wrapper magic. 753 Values.addInt(IntTy, FatMagic); 754 // Fatbin version. 755 Values.addInt(IntTy, 1); 756 // Data. 757 Values.add(FatBinStr); 758 // Unused in fatbin v1. 759 Values.add(llvm::ConstantPointerNull::get(VoidPtrTy)); 760 llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal( 761 addUnderscoredPrefixToName("_fatbin_wrapper"), CGM.getPointerAlign(), 762 /*constant*/ true); 763 FatbinWrapper->setSection(FatbinSectionName); 764 765 // There is only one HIP fat binary per linked module, however there are 766 // multiple constructor functions. Make sure the fat binary is registered 767 // only once. 
The constructor functions are executed by the dynamic loader 768 // before the program gains control. The dynamic loader cannot execute the 769 // constructor functions concurrently since doing that would not guarantee 770 // thread safety of the loaded program. Therefore we can assume sequential 771 // execution of constructor functions here. 772 if (IsHIP) { 773 auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage : 774 llvm::GlobalValue::LinkOnceAnyLinkage; 775 llvm::BasicBlock *IfBlock = 776 llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc); 777 llvm::BasicBlock *ExitBlock = 778 llvm::BasicBlock::Create(Context, "exit", ModuleCtorFunc); 779 // The name, size, and initialization pattern of this variable is part 780 // of HIP ABI. 781 GpuBinaryHandle = new llvm::GlobalVariable( 782 TheModule, VoidPtrPtrTy, /*isConstant=*/false, 783 Linkage, 784 /*Initializer=*/llvm::ConstantPointerNull::get(VoidPtrPtrTy), 785 "__hip_gpubin_handle"); 786 GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign()); 787 // Prevent the weak symbol in different shared libraries being merged. 
788 if (Linkage != llvm::GlobalValue::InternalLinkage) 789 GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility); 790 Address GpuBinaryAddr( 791 GpuBinaryHandle, 792 CharUnits::fromQuantity(GpuBinaryHandle->getAlignment())); 793 { 794 auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr); 795 llvm::Constant *Zero = 796 llvm::Constant::getNullValue(HandleValue->getType()); 797 llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue, Zero); 798 CtorBuilder.CreateCondBr(EQZero, IfBlock, ExitBlock); 799 } 800 { 801 CtorBuilder.SetInsertPoint(IfBlock); 802 // GpuBinaryHandle = __hipRegisterFatBinary(&FatbinWrapper); 803 llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall( 804 RegisterFatbinFunc, 805 CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy)); 806 CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryAddr); 807 CtorBuilder.CreateBr(ExitBlock); 808 } 809 { 810 CtorBuilder.SetInsertPoint(ExitBlock); 811 // Call __hip_register_globals(GpuBinaryHandle); 812 if (RegisterGlobalsFunc) { 813 auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr); 814 CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue); 815 } 816 } 817 } else if (!RelocatableDeviceCode) { 818 // Register binary with CUDA runtime. This is substantially different in 819 // default mode vs. separate compilation! 
820 // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper); 821 llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall( 822 RegisterFatbinFunc, 823 CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy)); 824 GpuBinaryHandle = new llvm::GlobalVariable( 825 TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage, 826 llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle"); 827 GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign()); 828 CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle, 829 CGM.getPointerAlign()); 830 831 // Call __cuda_register_globals(GpuBinaryHandle); 832 if (RegisterGlobalsFunc) 833 CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall); 834 835 // Call __cudaRegisterFatBinaryEnd(Handle) if this CUDA version needs it. 836 if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(), 837 CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) { 838 // void __cudaRegisterFatBinaryEnd(void **); 839 llvm::FunctionCallee RegisterFatbinEndFunc = CGM.CreateRuntimeFunction( 840 llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false), 841 "__cudaRegisterFatBinaryEnd"); 842 CtorBuilder.CreateCall(RegisterFatbinEndFunc, RegisterFatbinCall); 843 } 844 } else { 845 // Generate a unique module ID. 846 SmallString<64> ModuleID; 847 llvm::raw_svector_ostream OS(ModuleID); 848 OS << ModuleIDPrefix << llvm::format("%" PRIx64, FatbinWrapper->getGUID()); 849 llvm::Constant *ModuleIDConstant = makeConstantString( 850 std::string(ModuleID.str()), "", ModuleIDSectionName, 32); 851 852 // Create an alias for the FatbinWrapper that nvcc will look for. 
853 llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage, 854 Twine("__fatbinwrap") + ModuleID, FatbinWrapper); 855 856 // void __cudaRegisterLinkedBinary%ModuleID%(void (*)(void *), void *, 857 // void *, void (*)(void **)) 858 SmallString<128> RegisterLinkedBinaryName("__cudaRegisterLinkedBinary"); 859 RegisterLinkedBinaryName += ModuleID; 860 llvm::FunctionCallee RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction( 861 getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName); 862 863 assert(RegisterGlobalsFunc && "Expecting at least dummy function!"); 864 llvm::Value *Args[] = {RegisterGlobalsFunc, 865 CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy), 866 ModuleIDConstant, 867 makeDummyFunction(getCallbackFnTy())}; 868 CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args); 869 } 870 871 // Create destructor and register it with atexit() the way NVCC does it. Doing 872 // it during regular destructor phase worked in CUDA before 9.2 but results in 873 // double-free in 9.2. 874 if (llvm::Function *CleanupFn = makeModuleDtorFunction()) { 875 // extern "C" int atexit(void (*f)(void)); 876 llvm::FunctionType *AtExitTy = 877 llvm::FunctionType::get(IntTy, CleanupFn->getType(), false); 878 llvm::FunctionCallee AtExitFunc = 879 CGM.CreateRuntimeFunction(AtExitTy, "atexit", llvm::AttributeList(), 880 /*Local=*/true); 881 CtorBuilder.CreateCall(AtExitFunc, CleanupFn); 882 } 883 884 CtorBuilder.CreateRetVoid(); 885 return ModuleCtorFunc; 886 } 887 888 /// Creates a global destructor function that unregisters the GPU code blob 889 /// registered by constructor. 
///
/// For CUDA:
/// \code
/// void __cuda_module_dtor(void*) {
///     __cudaUnregisterFatBinary(Handle);
/// }
/// \endcode
///
/// For HIP:
/// \code
/// void __hip_module_dtor(void*) {
///     if (__hip_gpubin_handle) {
///         __hipUnregisterFatBinary(__hip_gpubin_handle);
///         __hip_gpubin_handle = 0;
///     }
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
  // No need for destructor if we don't have a handle to unregister.
  // GpuBinaryHandle is only set by makeModuleCtorFunction when a fat binary
  // was actually registered.
  if (!GpuBinaryHandle)
    return nullptr;

  // void __cudaUnregisterFatBinary(void ** handle);
  // (or __hipUnregisterFatBinary — addUnderscoredPrefixToName picks the
  // runtime-appropriate prefix.)
  llvm::FunctionCallee UnregisterFatbinFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
      addUnderscoredPrefixToName("UnregisterFatBinary"));

  // void __{cuda|hip}_module_dtor(void*) — internal linkage; it is invoked
  // via atexit() registered by the module constructor, not by a global dtor.
  llvm::Function *ModuleDtorFunc = llvm::Function::Create(
      llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
      llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_module_dtor"), &TheModule);

  llvm::BasicBlock *DtorEntryBB =
      llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc);
  CGBuilderTy DtorBuilder(CGM, Context);
  DtorBuilder.SetInsertPoint(DtorEntryBB);

  // Reuse the alignment that was set on the handle variable when the
  // constructor created it.
  Address GpuBinaryAddr(GpuBinaryHandle, CharUnits::fromQuantity(
                                             GpuBinaryHandle->getAlignment()));
  auto HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr);
  // There is only one HIP fat binary per linked module, however there are
  // multiple destructor functions. Make sure the fat binary is unregistered
  // only once.
  if (CGM.getLangOpts().HIP) {
    // Guard the unregister call: only the first destructor to run sees a
    // non-null handle; it unregisters and nulls the handle out so later
    // destructors (from other TUs) skip the call.
    llvm::BasicBlock *IfBlock =
        llvm::BasicBlock::Create(Context, "if", ModuleDtorFunc);
    llvm::BasicBlock *ExitBlock =
        llvm::BasicBlock::Create(Context, "exit", ModuleDtorFunc);
    llvm::Constant *Zero = llvm::Constant::getNullValue(HandleValue->getType());
    llvm::Value *NEZero = DtorBuilder.CreateICmpNE(HandleValue, Zero);
    DtorBuilder.CreateCondBr(NEZero, IfBlock, ExitBlock);

    DtorBuilder.SetInsertPoint(IfBlock);
    DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
    DtorBuilder.CreateStore(Zero, GpuBinaryAddr);
    DtorBuilder.CreateBr(ExitBlock);

    DtorBuilder.SetInsertPoint(ExitBlock);
  } else {
    // CUDA: a single handle per TU, unregister unconditionally.
    DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
  }
  DtorBuilder.CreateRetVoid();
  return ModuleDtorFunc;
}

/// Factory used by CodeGenModule to create the NVIDIA CUDA/HIP runtime
/// code-generation interface.
CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) {
  return new CGNVCUDARuntime(CGM);
}

/// Forces internal linkage on host-side shadows of device-side variables
/// when not compiling relocatable device code (-fno-gpu-rdc).
void CGNVCUDARuntime::internalizeDeviceSideVar(
    const VarDecl *D, llvm::GlobalValue::LinkageTypes &Linkage) {
  // For -fno-gpu-rdc, host-side shadows of external declarations of device-side
  // global variables become internal definitions. These have to be internal in
  // order to prevent name conflicts with global host variables with the same
  // name in a different TUs.
  //
  // For -fgpu-rdc, the shadow variables should not be internalized because
  // they may be accessed by different TU.
  if (CGM.getLangOpts().GPURelocatableDeviceCode)
    return;

  // __shared__ variables are odd. Shadows do get created, but
  // they are not registered with the CUDA runtime, so they
  // can't really be used to access their device-side
  // counterparts. It's not clear yet whether it's nvcc's bug or
  // a feature, but we've got to do the same for compatibility.
  if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
      D->hasAttr<CUDASharedAttr>() ||
      D->getType()->isCUDADeviceBuiltinSurfaceType() ||
      D->getType()->isCUDADeviceBuiltinTextureType()) {
    Linkage = llvm::GlobalValue::InternalLinkage;
  }
}

/// Records a host-side shadow variable so it can later be registered with the
/// CUDA/HIP runtime by __cuda_register_globals / __hip_register_globals.
/// Dispatches on the variable's kind: plain __device__/__constant__ variable,
/// builtin surface type, or builtin texture type.
void CGNVCUDARuntime::handleVarRegistration(const VarDecl *D,
                                            llvm::GlobalVariable &GV) {
  if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>()) {
    // Shadow variables and their properties must be registered with CUDA
    // runtime. Skip Extern global variables, which will be registered in
    // the TU where they are defined.
    //
    // Don't register a C++17 inline variable. The local symbol can be
    // discarded and referencing a discarded local symbol from outside the
    // comdat (__cuda_register_globals) is disallowed by the ELF spec.
    // TODO: Reject __device__ constexpr and __device__ inline in Sema.
    // HIP managed variables need to be always recorded in device and host
    // compilations for transformation.
    if ((!D->hasExternalStorage() && !D->isInline()) ||
        D->hasAttr<HIPManagedAttr>()) {
      registerDeviceVar(D, GV, !D->hasDefinition(),
                        D->hasAttr<CUDAConstantAttr>());
    }
  } else if (D->getType()->isCUDADeviceBuiltinSurfaceType() ||
             D->getType()->isCUDADeviceBuiltinTextureType()) {
    // Builtin surfaces and textures and their template arguments are
    // also registered with CUDA runtime.
    const ClassTemplateSpecializationDecl *TD =
        cast<ClassTemplateSpecializationDecl>(
            D->getType()->getAs<RecordType>()->getDecl());
    const TemplateArgumentList &Args = TD->getTemplateArgs();
    if (TD->hasAttr<CUDADeviceBuiltinSurfaceTypeAttr>()) {
      assert(Args.size() == 2 &&
             "Unexpected number of template arguments of CUDA device "
             "builtin surface type.");
      // Template argument 1 holds the surface type as an integral value.
      auto SurfType = Args[1].getAsIntegral();
      if (!D->hasExternalStorage())
        registerDeviceSurf(D, GV, !D->hasDefinition(), SurfType.getSExtValue());
    } else {
      assert(Args.size() == 3 &&
             "Unexpected number of template arguments of CUDA device "
             "builtin texture type.");
      // Template arguments 1 and 2 hold the texture type and the
      // "normalized" flag as integral values.
      auto TexType = Args[1].getAsIntegral();
      auto Normalized = Args[2].getAsIntegral();
      if (!D->hasExternalStorage())
        registerDeviceTex(D, GV, !D->hasDefinition(), TexType.getSExtValue(),
                          Normalized.getZExtValue());
    }
  }
}

// Transform managed variables to pointers to managed variables in device code.
// Each use of the original managed variable is replaced by a load from the
// transformed managed variable. The transformed managed variable contains
// the address of managed memory which will be allocated by the runtime.
void CGNVCUDARuntime::transformManagedVars() {
  for (auto &&Info : DeviceVars) {
    llvm::GlobalVariable *Var = Info.Var;
    if (Info.Flags.getKind() == DeviceVarFlags::Variable &&
        Info.Flags.isManaged()) {
      // Create the pointer-to-managed-memory global in the CUDA device
      // address space. A declaration gets no initializer; a definition is
      // null-initialized and filled in by the runtime (hence
      // setExternallyInitialized below).
      auto ManagedVar = new llvm::GlobalVariable(
          CGM.getModule(), Var->getType(),
          /*isConstant=*/false, Var->getLinkage(),
          /*Init=*/Var->isDeclaration()
              ? nullptr
              : llvm::ConstantPointerNull::get(Var->getType()),
          /*Name=*/"", /*InsertBefore=*/nullptr,
          llvm::GlobalVariable::NotThreadLocal,
          CGM.getContext().getTargetAddressSpace(LangAS::cuda_device));
      ManagedVar->setDSOLocal(Var->isDSOLocal());
      ManagedVar->setVisibility(Var->getVisibility());
      ManagedVar->setExternallyInitialized(true);
      // Rewrite all uses of the original variable into loads through the
      // new pointer variable, then swap names: the new variable takes the
      // original name, the original is renamed "<name>.managed".
      replaceManagedVar(Var, ManagedVar);
      ManagedVar->takeName(Var);
      Var->setName(Twine(ManagedVar->getName()) + ".managed");
      // Keep managed variables even if they are not used in device code since
      // they need to be allocated by the runtime.
      if (!Var->isDeclaration()) {
        assert(!ManagedVar->isDeclaration());
        CGM.addCompilerUsedGlobal(Var);
        CGM.addCompilerUsedGlobal(ManagedVar);
      }
    }
  }
}

// Returns module constructor to be added (host compilation only). On the
// device side there is no constructor to emit; managed variables are
// rewritten instead and nullptr is returned.
llvm::Function *CGNVCUDARuntime::finalizeModule() {
  if (CGM.getLangOpts().CUDAIsDevice) {
    transformManagedVars();
    return nullptr;
  }
  return makeModuleCtorFunction();
}