//===----- CGCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This provides a class for CUDA code generation targeting the NVIDIA CUDA
// runtime library.
//
//===----------------------------------------------------------------------===//

#include "CGCUDARuntime.h"
#include "CodeGenFunction.h"
#include "CodeGenModule.h"
#include "clang/AST/Decl.h"
#include "clang/CodeGen/ConstantInitBuilder.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/Support/Format.h"

using namespace clang;
using namespace CodeGen;

namespace {
// Magic numbers stored in the first field of the fatbin wrapper struct so the
// runtime can recognize it (see makeModuleCtorFunction below).
constexpr unsigned CudaFatMagic = 0x466243b1;
constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF"

/// Host-side code generation for the CUDA (and HIP) runtime: emits kernel
/// launch stubs and the module ctor/dtor pair that registers/unregisters the
/// embedded GPU binary and its kernels/variables with the runtime.
class CGNVCUDARuntime : public CGCUDARuntime {

private:
  // Cached LLVM types used throughout the emitted registration code.
  llvm::IntegerType *IntTy, *SizeTy;
  llvm::Type *VoidTy;
  llvm::PointerType *CharPtrTy, *VoidPtrTy, *VoidPtrPtrTy;

  /// Convenience reference to LLVM Context
  llvm::LLVMContext &Context;
  /// Convenience reference to the current module
  llvm::Module &TheModule;
  /// Keeps track of kernel launch stubs emitted in this module
  llvm::SmallVector<llvm::Function *, 16> EmittedKernels;
  /// Device-side global variables emitted in this module, paired with their
  /// DeviceVarFlags, to be registered by __cuda_register_globals().
  llvm::SmallVector<std::pair<llvm::GlobalVariable *, unsigned>, 16> DeviceVars;
  /// Keeps track of variable containing handle of GPU binary. Populated by
  /// ModuleCtorFunction() and used to create corresponding cleanup calls in
  /// ModuleDtorFunction()
  llvm::GlobalVariable *GpuBinaryHandle = nullptr;
  /// Whether we generate relocatable device code.
  bool RelocatableDeviceCode;

  llvm::Constant *getSetupArgumentFn() const;
  llvm::Constant *getLaunchFn() const;

  llvm::FunctionType *getRegisterGlobalsFnTy() const;
  llvm::FunctionType *getCallbackFnTy() const;
  llvm::FunctionType *getRegisterLinkedBinaryFnTy() const;
  std::string addPrefixToName(StringRef FuncName) const;
  std::string addUnderscoredPrefixToName(StringRef FuncName) const;

  /// Creates a function to register all kernel stubs generated in this module.
  llvm::Function *makeRegisterGlobalsFn();

  /// Helper function that generates a constant string and returns a pointer to
  /// the start of the string. The result of this function can be used anywhere
  /// where the C code specifies const char*.
  llvm::Constant *makeConstantString(const std::string &Str,
                                     const std::string &Name = "",
                                     const std::string &SectionName = "",
                                     unsigned Alignment = 0) {
    // Two zero indices: one to step through the pointer, one to index to the
    // first character of the [N x i8] array.
    llvm::Constant *Zeros[] = {llvm::ConstantInt::get(SizeTy, 0),
                               llvm::ConstantInt::get(SizeTy, 0)};
    auto ConstStr = CGM.GetAddrOfConstantCString(Str, Name.c_str());
    llvm::GlobalVariable *GV =
        cast<llvm::GlobalVariable>(ConstStr.getPointer());
    if (!SectionName.empty()) {
      GV->setSection(SectionName);
      // Mark the address as used which make sure that this section isn't
      // merged and we will really have it in the object file.
      GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None);
    }
    if (Alignment)
      GV->setAlignment(Alignment);

    return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(),
                                                ConstStr.getPointer(), Zeros);
  }

  /// Helper function that generates an empty dummy function returning void.
  /// Used as a placeholder callback where the relocatable-code registration
  /// ABI requires a function pointer but there is nothing to do.
  llvm::Function *makeDummyFunction(llvm::FunctionType *FnTy) {
    assert(FnTy->getReturnType()->isVoidTy() &&
           "Can only generate dummy functions returning void!");
    llvm::Function *DummyFunc = llvm::Function::Create(
        FnTy, llvm::GlobalValue::InternalLinkage, "dummy", &TheModule);

    llvm::BasicBlock *DummyBlock =
        llvm::BasicBlock::Create(Context, "", DummyFunc);
    CGBuilderTy FuncBuilder(CGM, Context);
    FuncBuilder.SetInsertPoint(DummyBlock);
    FuncBuilder.CreateRetVoid();

    return DummyFunc;
  }

  void emitDeviceStubBody(CodeGenFunction &CGF, FunctionArgList &Args);

public:
  CGNVCUDARuntime(CodeGenModule &CGM);

  void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override;
  void registerDeviceVar(llvm::GlobalVariable &Var, unsigned Flags) override {
    DeviceVars.push_back(std::make_pair(&Var, Flags));
  }

  /// Creates module constructor function
  llvm::Function *makeModuleCtorFunction() override;
  /// Creates module destructor function
  llvm::Function *makeModuleDtorFunction() override;
};

}

// Returns the runtime-prefixed name, e.g. "cudaLaunch" or "hipLaunch",
// depending on whether we are compiling CUDA or HIP.
std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const {
  if (CGM.getLangOpts().HIP)
    return ((Twine("hip") + Twine(FuncName)).str());
  return ((Twine("cuda") + Twine(FuncName)).str());
}
// Same as above but with the double-underscore internal-ABI prefix, e.g.
// "__cudaRegisterFatBinary" / "__hipRegisterFatBinary".
std::string
CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const {
  if (CGM.getLangOpts().HIP)
    return ((Twine("__hip") + Twine(FuncName)).str());
  return ((Twine("__cuda") + Twine(FuncName)).str());
}

CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
    : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()),
      TheModule(CGM.getModule()),
      RelocatableDeviceCode(CGM.getLangOpts().CUDARelocatableDeviceCode) {
  CodeGen::CodeGenTypes &Types = CGM.getTypes();
  ASTContext &Ctx = CGM.getContext();

  IntTy = CGM.IntTy;
  SizeTy = CGM.SizeTy;
  VoidTy = CGM.VoidTy;

  CharPtrTy = llvm::PointerType::getUnqual(Types.ConvertType(Ctx.CharTy));
  VoidPtrTy = cast<llvm::PointerType>(Types.ConvertType(Ctx.VoidPtrTy));
  VoidPtrPtrTy = VoidPtrTy->getPointerTo();
}

llvm::Constant *CGNVCUDARuntime::getSetupArgumentFn() const {
  // cudaError_t cudaSetupArgument(void *, size_t, size_t)
  llvm::Type *Params[] = {VoidPtrTy, SizeTy, SizeTy};
  return CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy, Params, false),
      addPrefixToName("SetupArgument"));
}

llvm::Constant *CGNVCUDARuntime::getLaunchFn() const {
  if (CGM.getLangOpts().HIP) {
    // hipError_t hipLaunchByPtr(char *);
    return CGM.CreateRuntimeFunction(
        llvm::FunctionType::get(IntTy, CharPtrTy, false), "hipLaunchByPtr");
  } else {
    // cudaError_t cudaLaunch(char *);
    return CGM.CreateRuntimeFunction(
        llvm::FunctionType::get(IntTy, CharPtrTy, false), "cudaLaunch");
  }
}

// Type of __{cuda|hip}_register_globals: void(void **GpuBinaryHandle).
llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy() const {
  return llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false);
}

// Type of the callbacks passed to __cudaRegisterLinkedBinary: void(void *).
llvm::FunctionType *CGNVCUDARuntime::getCallbackFnTy() const {
  return llvm::FunctionType::get(VoidTy, VoidPtrTy, false);
}

llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy() const {
  auto CallbackFnTy = getCallbackFnTy();
  auto RegisterGlobalsFnTy = getRegisterGlobalsFnTy();
  llvm::Type *Params[] = {RegisterGlobalsFnTy->getPointerTo(), VoidPtrTy,
                          VoidPtrTy, CallbackFnTy->getPointerTo()};
  return llvm::FunctionType::get(VoidTy, Params, false);
}

void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
                                     FunctionArgList &Args) {
  // Remember the stub so makeRegisterGlobalsFn() can register it later; its
  // address doubles as the kernel's host-side handle.
  EmittedKernels.push_back(CGF.CurFn);
  emitDeviceStubBody(CGF, Args);
}

// Emits the body of a kernel launch stub: push every kernel argument via
// {cuda|hip}SetupArgument (bailing out to setup.end on failure), then launch
// via the stub function's own address as the kernel handle.
void CGNVCUDARuntime::emitDeviceStubBody(CodeGenFunction &CGF,
                                         FunctionArgList &Args) {
  // Emit a call to cudaSetupArgument for each arg in Args.
  llvm::Constant *cudaSetupArgFn = getSetupArgumentFn();
  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
  // Running byte offset of the current argument in the launch parameter
  // buffer; aligned per-argument as required by its type.
  CharUnits Offset = CharUnits::Zero();
  for (const VarDecl *A : Args) {
    CharUnits TyWidth, TyAlign;
    std::tie(TyWidth, TyAlign) =
        CGM.getContext().getTypeInfoInChars(A->getType());
    Offset = Offset.alignTo(TyAlign);
    llvm::Value *Args[] = {
        CGF.Builder.CreatePointerCast(CGF.GetAddrOfLocalVar(A).getPointer(),
                                      VoidPtrTy),
        llvm::ConstantInt::get(SizeTy, TyWidth.getQuantity()),
        llvm::ConstantInt::get(SizeTy, Offset.getQuantity()),
    };
    llvm::CallSite CS = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args);
    // A non-zero return means setup failed; skip straight to the end instead
    // of attempting to push further arguments or launch.
    llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0);
    llvm::Value *CSZero = CGF.Builder.CreateICmpEQ(CS.getInstruction(), Zero);
    llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next");
    CGF.Builder.CreateCondBr(CSZero, NextBlock, EndBlock);
    CGF.EmitBlock(NextBlock);
    Offset += TyWidth;
  }

  // Emit the call to cudaLaunch
  llvm::Constant *cudaLaunchFn = getLaunchFn();
  llvm::Value *Arg = CGF.Builder.CreatePointerCast(CGF.CurFn, CharPtrTy);
  CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg);
  CGF.EmitBranch(EndBlock);

  CGF.EmitBlock(EndBlock);
}

/// Creates a function that sets up state on the host side for CUDA objects that
/// have a presence on both the host and device sides. Specifically, registers
/// the host side of kernel functions and device global variables with the CUDA
/// runtime.
/// \code
/// void __cuda_register_globals(void** GpuBinaryHandle) {
///    __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...);
///    ...
///    __cudaRegisterFunction(GpuBinaryHandle,KernelM,...);
///    __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...);
///    ...
///    __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...);
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
  // No need to register anything
  if (EmittedKernels.empty() && DeviceVars.empty())
    return nullptr;

  llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
      getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_register_globals"), &TheModule);
  llvm::BasicBlock *EntryBB =
      llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc);
  CGBuilderTy Builder(CGM, Context);
  Builder.SetInsertPoint(EntryBB);

  // void __cudaRegisterFunction(void **, const char *, char *, const char *,
  //                             int, uint3*, uint3*, dim3*, dim3*, int*)
  llvm::Type *RegisterFuncParams[] = {
      VoidPtrPtrTy, CharPtrTy, CharPtrTy, CharPtrTy, IntTy,
      VoidPtrTy,    VoidPtrTy, VoidPtrTy, VoidPtrTy, IntTy->getPointerTo()};
  llvm::Constant *RegisterFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy, RegisterFuncParams, false),
      addUnderscoredPrefixToName("RegisterFunction"));

  // Extract GpuBinaryHandle passed as the first argument passed to
  // __cuda_register_globals() and generate __cudaRegisterFunction() call for
  // each emitted kernel.
  llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin();
  for (llvm::Function *Kernel : EmittedKernels) {
    llvm::Constant *KernelName = makeConstantString(Kernel->getName());
    llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy);
    // The stub's address serves as both the host function pointer and the
    // device-side name lookup key; thread/block limit fields are unused here.
    llvm::Value *Args[] = {
        &GpuBinaryHandlePtr, Builder.CreateBitCast(Kernel, VoidPtrTy),
        KernelName, KernelName, llvm::ConstantInt::get(IntTy, -1), NullPtr,
        NullPtr, NullPtr, NullPtr,
        llvm::ConstantPointerNull::get(IntTy->getPointerTo())};
    Builder.CreateCall(RegisterFunc, Args);
  }

  // void __cudaRegisterVar(void **, char *, char *, const char *,
  //                        int, int, int, int)
  llvm::Type *RegisterVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy,
                                     CharPtrTy,    IntTy,     IntTy,
                                     IntTy,        IntTy};
  llvm::Constant *RegisterVar = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy, RegisterVarParams, false),
      addUnderscoredPrefixToName("RegisterVar"));
  for (auto &Pair : DeviceVars) {
    llvm::GlobalVariable *Var = Pair.first;
    unsigned Flags = Pair.second;
    llvm::Constant *VarName = makeConstantString(Var->getName());
    uint64_t VarSize =
        CGM.getDataLayout().getTypeAllocSize(Var->getValueType());
    llvm::Value *Args[] = {
        &GpuBinaryHandlePtr,
        Builder.CreateBitCast(Var, VoidPtrTy),
        VarName,
        VarName,
        llvm::ConstantInt::get(IntTy, (Flags & ExternDeviceVar) ? 1 : 0),
        llvm::ConstantInt::get(IntTy, VarSize),
        llvm::ConstantInt::get(IntTy, (Flags & ConstantDeviceVar) ? 1 : 0),
        llvm::ConstantInt::get(IntTy, 0)};
    Builder.CreateCall(RegisterVar, Args);
  }

  Builder.CreateRetVoid();
  return RegisterKernelsFunc;
}

/// Creates a global constructor function for the module:
///
/// For CUDA:
/// \code
/// void __cuda_module_ctor(void*) {
///     Handle = __cudaRegisterFatBinary(GpuBinaryBlob);
///     __cuda_register_globals(Handle);
/// }
/// \endcode
///
/// For HIP:
/// \code
/// void __hip_module_ctor(void*) {
///     if (__hip_gpubin_handle == 0) {
///         __hip_gpubin_handle = __hipRegisterFatBinary(GpuBinaryBlob);
///         __hip_register_globals(__hip_gpubin_handle);
///     }
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
  bool IsHIP = CGM.getLangOpts().HIP;
  // No need to generate ctors/dtors if there is no GPU binary.
  StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName;
  if (CudaGpuBinaryFileName.empty() && !IsHIP)
    return nullptr;

  // void __{cuda|hip}_register_globals(void* handle);
  llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
  // We always need a function to pass in as callback. Create a dummy
  // implementation if we don't need to register anything.
  if (RelocatableDeviceCode && !RegisterGlobalsFunc)
    RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy());

  // void ** __{cuda|hip}RegisterFatBinary(void *);
  llvm::Constant *RegisterFatbinFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false),
      addUnderscoredPrefixToName("RegisterFatBinary"));
  // struct { int magic, int version, void * gpu_binary, void * dont_care };
  llvm::StructType *FatbinWrapperTy =
      llvm::StructType::get(IntTy, IntTy, VoidPtrTy, VoidPtrTy);

  // Register GPU binary with the CUDA runtime, store returned handle in a
  // global variable and save a reference in GpuBinaryHandle to be cleaned up
  // in destructor on exit. Then associate all known kernels with the GPU binary
  // handle so CUDA runtime can figure out what to call on the GPU side.
  std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary;
  if (!IsHIP) {
    llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> CudaGpuBinaryOrErr =
        llvm::MemoryBuffer::getFileOrSTDIN(CudaGpuBinaryFileName);
    if (std::error_code EC = CudaGpuBinaryOrErr.getError()) {
      CGM.getDiags().Report(diag::err_cannot_open_file)
          << CudaGpuBinaryFileName << EC.message();
      return nullptr;
    }
    CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get());
  }

  llvm::Function *ModuleCtorFunc = llvm::Function::Create(
      llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
      llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_module_ctor"), &TheModule);
  llvm::BasicBlock *CtorEntryBB =
      llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc);
  CGBuilderTy CtorBuilder(CGM, Context);

  CtorBuilder.SetInsertPoint(CtorEntryBB);

  // Section/symbol naming differs per target runtime (and, for CUDA, per
  // object format — Mach-O sections need the segment prefix).
  const char *FatbinConstantName;
  const char *FatbinSectionName;
  const char *ModuleIDSectionName;
  StringRef ModuleIDPrefix;
  llvm::Constant *FatBinStr;
  unsigned FatMagic;
  if (IsHIP) {
    FatbinConstantName = ".hip_fatbin";
    FatbinSectionName = ".hipFatBinSegment";

    ModuleIDSectionName = "__hip_module_id";
    ModuleIDPrefix = "__hip_";

    // For HIP, create an external symbol __hip_fatbin in section .hip_fatbin.
    // The external symbol is supposed to contain the fat binary but will be
    // populated somewhere else, e.g. by lld through link script.
    FatBinStr = new llvm::GlobalVariable(
        CGM.getModule(), CGM.Int8Ty,
        /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr,
        "__hip_fatbin", nullptr,
        llvm::GlobalVariable::NotThreadLocal);
    cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName);

    FatMagic = HIPFatMagic;
  } else {
    if (RelocatableDeviceCode)
      FatbinConstantName = CGM.getTriple().isMacOSX()
                               ? "__NV_CUDA,__nv_relfatbin"
                               : "__nv_relfatbin";
    else
      FatbinConstantName =
          CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
    // NVIDIA's cuobjdump looks for fatbins in this section.
    FatbinSectionName =
        CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";

    ModuleIDSectionName = CGM.getTriple().isMacOSX()
                              ? "__NV_CUDA,__nv_module_id"
                              : "__nv_module_id";
    ModuleIDPrefix = "__nv_";

    // For CUDA, create a string literal containing the fat binary loaded from
    // the given file.
    FatBinStr = makeConstantString(CudaGpuBinary->getBuffer(), "",
                                   FatbinConstantName, 8);
    FatMagic = CudaFatMagic;
  }

  // Create initialized wrapper structure that points to the loaded GPU binary
  ConstantInitBuilder Builder(CGM);
  auto Values = Builder.beginStruct(FatbinWrapperTy);
  // Fatbin wrapper magic.
  Values.addInt(IntTy, FatMagic);
  // Fatbin version.
  Values.addInt(IntTy, 1);
  // Data.
  Values.add(FatBinStr);
  // Unused in fatbin v1.
  Values.add(llvm::ConstantPointerNull::get(VoidPtrTy));
  llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal(
      addUnderscoredPrefixToName("_fatbin_wrapper"), CGM.getPointerAlign(),
      /*constant*/ true);
  FatbinWrapper->setSection(FatbinSectionName);

  // There is only one HIP fat binary per linked module, however there are
  // multiple constructor functions. Make sure the fat binary is registered
  // only once. The constructor functions are executed by the dynamic loader
  // before the program gains control. The dynamic loader cannot execute the
  // constructor functions concurrently since doing that would not guarantee
  // thread safety of the loaded program. Therefore we can assume sequential
  // execution of constructor functions here.
  if (IsHIP) {
    llvm::BasicBlock *IfBlock =
        llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc);
    llvm::BasicBlock *ExitBlock =
        llvm::BasicBlock::Create(Context, "exit", ModuleCtorFunc);
    // The name, size, and initialization pattern of this variable is part
    // of HIP ABI.
    GpuBinaryHandle = new llvm::GlobalVariable(
        TheModule, VoidPtrPtrTy, /*isConstant=*/false,
        llvm::GlobalValue::LinkOnceAnyLinkage,
        /*Initializer=*/llvm::ConstantPointerNull::get(VoidPtrPtrTy),
        "__hip_gpubin_handle");
    GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getQuantity());
    // Prevent the weak symbol in different shared libraries being merged.
    GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility);
    Address GpuBinaryAddr(
        GpuBinaryHandle,
        CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
    {
      // Only register if the handle is still null (first ctor to run wins).
      auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
      llvm::Constant *Zero =
          llvm::Constant::getNullValue(HandleValue->getType());
      llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue, Zero);
      CtorBuilder.CreateCondBr(EQZero, IfBlock, ExitBlock);
    }
    {
      CtorBuilder.SetInsertPoint(IfBlock);
      // GpuBinaryHandle = __hipRegisterFatBinary(&FatbinWrapper);
      llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
          RegisterFatbinFunc,
          CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
      CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryAddr);
      CtorBuilder.CreateBr(ExitBlock);
    }
    {
      CtorBuilder.SetInsertPoint(ExitBlock);
      // Call __hip_register_globals(GpuBinaryHandle);
      if (RegisterGlobalsFunc) {
        auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
        CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue);
      }
    }
  } else if (!RelocatableDeviceCode) {
    // Register binary with CUDA runtime. This is substantially different in
    // default mode vs. separate compilation!
    // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
    llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
        RegisterFatbinFunc,
        CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
    GpuBinaryHandle = new llvm::GlobalVariable(
        TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage,
        llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle");
    GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getQuantity());
    CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
                                   CGM.getPointerAlign());

    // Call __cuda_register_globals(GpuBinaryHandle);
    if (RegisterGlobalsFunc)
      CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);
  } else {
    // Generate a unique module ID.
    SmallString<64> ModuleID;
    llvm::raw_svector_ostream OS(ModuleID);
    OS << ModuleIDPrefix << llvm::format("%x", FatbinWrapper->getGUID());
    llvm::Constant *ModuleIDConstant =
        makeConstantString(ModuleID.str(), "", ModuleIDSectionName, 32);

    // Create an alias for the FatbinWrapper that nvcc will look for.
    llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage,
                              Twine("__fatbinwrap") + ModuleID, FatbinWrapper);

    // void __cudaRegisterLinkedBinary%ModuleID%(void (*)(void *), void *,
    // void *, void (*)(void **))
    SmallString<128> RegisterLinkedBinaryName("__cudaRegisterLinkedBinary");
    RegisterLinkedBinaryName += ModuleID;
    llvm::Constant *RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction(
        getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName);

    assert(RegisterGlobalsFunc && "Expecting at least dummy function!");
    llvm::Value *Args[] = {RegisterGlobalsFunc,
                           CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy),
                           ModuleIDConstant,
                           makeDummyFunction(getCallbackFnTy())};
    CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args);
  }

  // Create destructor and register it with atexit() the way NVCC does it. Doing
  // it during regular destructor phase worked in CUDA before 9.2 but results in
  // double-free in 9.2.
  if (llvm::Function *CleanupFn = makeModuleDtorFunction()) {
    // extern "C" int atexit(void (*f)(void));
    llvm::FunctionType *AtExitTy =
        llvm::FunctionType::get(IntTy, CleanupFn->getType(), false);
    llvm::Constant *AtExitFunc =
        CGM.CreateRuntimeFunction(AtExitTy, "atexit", llvm::AttributeList(),
                                  /*Local=*/true);
    CtorBuilder.CreateCall(AtExitFunc, CleanupFn);
  }

  CtorBuilder.CreateRetVoid();
  return ModuleCtorFunc;
}

/// Creates a global destructor function that unregisters the GPU code blob
/// registered by constructor.
///
/// For CUDA:
/// \code
/// void __cuda_module_dtor(void*) {
///     __cudaUnregisterFatBinary(Handle);
/// }
/// \endcode
///
/// For HIP:
/// \code
/// void __hip_module_dtor(void*) {
///     if (__hip_gpubin_handle) {
///         __hipUnregisterFatBinary(__hip_gpubin_handle);
///         __hip_gpubin_handle = 0;
///     }
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
  // No need for destructor if we don't have a handle to unregister.
  if (!GpuBinaryHandle)
    return nullptr;

  // void __cudaUnregisterFatBinary(void ** handle);
  llvm::Constant *UnregisterFatbinFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
      addUnderscoredPrefixToName("UnregisterFatBinary"));

  llvm::Function *ModuleDtorFunc = llvm::Function::Create(
      llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
      llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_module_dtor"), &TheModule);

  llvm::BasicBlock *DtorEntryBB =
      llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc);
  CGBuilderTy DtorBuilder(CGM, Context);
  DtorBuilder.SetInsertPoint(DtorEntryBB);

  Address GpuBinaryAddr(GpuBinaryHandle, CharUnits::fromQuantity(
                                             GpuBinaryHandle->getAlignment()));
  auto HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr);
  // There is only one HIP fat binary per linked module, however there are
  // multiple destructor functions. Make sure the fat binary is unregistered
  // only once.
  if (CGM.getLangOpts().HIP) {
    llvm::BasicBlock *IfBlock =
        llvm::BasicBlock::Create(Context, "if", ModuleDtorFunc);
    llvm::BasicBlock *ExitBlock =
        llvm::BasicBlock::Create(Context, "exit", ModuleDtorFunc);
    llvm::Constant *Zero = llvm::Constant::getNullValue(HandleValue->getType());
    llvm::Value *NEZero = DtorBuilder.CreateICmpNE(HandleValue, Zero);
    DtorBuilder.CreateCondBr(NEZero, IfBlock, ExitBlock);

    DtorBuilder.SetInsertPoint(IfBlock);
    DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
    // Reset the handle to null so later dtors (and re-registration) see it
    // as unregistered.
    DtorBuilder.CreateStore(Zero, GpuBinaryAddr);
    DtorBuilder.CreateBr(ExitBlock);

    DtorBuilder.SetInsertPoint(ExitBlock);
  } else {
    DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
  }
  DtorBuilder.CreateRetVoid();
  return ModuleDtorFunc;
}

// Factory entry point used by CodeGenModule to instantiate this runtime.
CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) {
  return new CGNVCUDARuntime(CGM);
}