1 //===----- CGCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This provides a class for CUDA code generation targeting the NVIDIA CUDA 11 // runtime library. 12 // 13 //===----------------------------------------------------------------------===// 14 15 #include "CGCUDARuntime.h" 16 #include "CodeGenFunction.h" 17 #include "CodeGenModule.h" 18 #include "clang/AST/Decl.h" 19 #include "clang/CodeGen/ConstantInitBuilder.h" 20 #include "llvm/IR/BasicBlock.h" 21 #include "llvm/IR/CallSite.h" 22 #include "llvm/IR/Constants.h" 23 #include "llvm/IR/DerivedTypes.h" 24 #include "llvm/Support/Format.h" 25 26 using namespace clang; 27 using namespace CodeGen; 28 29 namespace { 30 constexpr unsigned CudaFatMagic = 0x466243b1; 31 constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF" 32 33 class CGNVCUDARuntime : public CGCUDARuntime { 34 35 private: 36 llvm::IntegerType *IntTy, *SizeTy; 37 llvm::Type *VoidTy; 38 llvm::PointerType *CharPtrTy, *VoidPtrTy, *VoidPtrPtrTy; 39 40 /// Convenience reference to LLVM Context 41 llvm::LLVMContext &Context; 42 /// Convenience reference to the current module 43 llvm::Module &TheModule; 44 /// Keeps track of kernel launch stubs emitted in this module 45 llvm::SmallVector<llvm::Function *, 16> EmittedKernels; 46 llvm::SmallVector<std::pair<llvm::GlobalVariable *, unsigned>, 16> DeviceVars; 47 /// Keeps track of variable containing handle of GPU binary. Populated by 48 /// ModuleCtorFunction() and used to create corresponding cleanup calls in 49 /// ModuleDtorFunction() 50 llvm::GlobalVariable *GpuBinaryHandle = nullptr; 51 /// Whether we generate relocatable device code. 52 bool RelocatableDeviceCode; 53 54 llvm::Constant *getSetupArgumentFn() const; 55 llvm::Constant *getLaunchFn() const; 56 57 llvm::FunctionType *getRegisterGlobalsFnTy() const; 58 llvm::FunctionType *getCallbackFnTy() const; 59 llvm::FunctionType *getRegisterLinkedBinaryFnTy() const; 60 std::string addPrefixToName(StringRef FuncName) const; 61 std::string addUnderscoredPrefixToName(StringRef FuncName) const; 62 63 /// Creates a function to register all kernel stubs generated in this module. 64 llvm::Function *makeRegisterGlobalsFn(); 65 66 /// Helper function that generates a constant string and returns a pointer to 67 /// the start of the string. The result of this function can be used anywhere 68 /// where the C code specifies const char*. 69 llvm::Constant *makeConstantString(const std::string &Str, 70 const std::string &Name = "", 71 const std::string &SectionName = "", 72 unsigned Alignment = 0) { 73 llvm::Constant *Zeros[] = {llvm::ConstantInt::get(SizeTy, 0), 74 llvm::ConstantInt::get(SizeTy, 0)}; 75 auto ConstStr = CGM.GetAddrOfConstantCString(Str, Name.c_str()); 76 llvm::GlobalVariable *GV = 77 cast<llvm::GlobalVariable>(ConstStr.getPointer()); 78 if (!SectionName.empty()) { 79 GV->setSection(SectionName); 80 // Mark the address as used which make sure that this section isn't 81 // merged and we will really have it in the object file. 82 GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None); 83 } 84 if (Alignment) 85 GV->setAlignment(Alignment); 86 87 return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(), 88 ConstStr.getPointer(), Zeros); 89 } 90 91 /// Helper function that generates an empty dummy function returning void. 92 llvm::Function *makeDummyFunction(llvm::FunctionType *FnTy) { 93 assert(FnTy->getReturnType()->isVoidTy() && 94 "Can only generate dummy functions returning void!"); 95 llvm::Function *DummyFunc = llvm::Function::Create( 96 FnTy, llvm::GlobalValue::InternalLinkage, "dummy", &TheModule); 97 98 llvm::BasicBlock *DummyBlock = 99 llvm::BasicBlock::Create(Context, "", DummyFunc); 100 CGBuilderTy FuncBuilder(CGM, Context); 101 FuncBuilder.SetInsertPoint(DummyBlock); 102 FuncBuilder.CreateRetVoid(); 103 104 return DummyFunc; 105 } 106 107 void emitDeviceStubBody(CodeGenFunction &CGF, FunctionArgList &Args); 108 109 public: 110 CGNVCUDARuntime(CodeGenModule &CGM); 111 112 void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override; 113 void registerDeviceVar(llvm::GlobalVariable &Var, unsigned Flags) override { 114 DeviceVars.push_back(std::make_pair(&Var, Flags)); 115 } 116 117 /// Creates module constructor function 118 llvm::Function *makeModuleCtorFunction() override; 119 /// Creates module destructor function 120 llvm::Function *makeModuleDtorFunction() override; 121 }; 122 123 } 124 125 std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const { 126 if (CGM.getLangOpts().HIP) 127 return ((Twine("hip") + Twine(FuncName)).str()); 128 return ((Twine("cuda") + Twine(FuncName)).str()); 129 } 130 std::string 131 CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const { 132 if (CGM.getLangOpts().HIP) 133 return ((Twine("__hip") + Twine(FuncName)).str()); 134 return ((Twine("__cuda") + Twine(FuncName)).str()); 135 } 136 137 CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM) 138 : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()), 139 TheModule(CGM.getModule()), 140 RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode) { 141 CodeGen::CodeGenTypes &Types = CGM.getTypes(); 142 ASTContext &Ctx = CGM.getContext(); 143 144 IntTy = CGM.IntTy; 145 SizeTy = CGM.SizeTy; 146 VoidTy = CGM.VoidTy; 147 148 CharPtrTy = llvm::PointerType::getUnqual(Types.ConvertType(Ctx.CharTy)); 149 VoidPtrTy = cast<llvm::PointerType>(Types.ConvertType(Ctx.VoidPtrTy)); 150 VoidPtrPtrTy = VoidPtrTy->getPointerTo(); 151 } 152 153 llvm::Constant *CGNVCUDARuntime::getSetupArgumentFn() const { 154 // cudaError_t cudaSetupArgument(void *, size_t, size_t) 155 llvm::Type *Params[] = {VoidPtrTy, SizeTy, SizeTy}; 156 return CGM.CreateRuntimeFunction( 157 llvm::FunctionType::get(IntTy, Params, false), 158 addPrefixToName("SetupArgument")); 159 } 160 161 llvm::Constant *CGNVCUDARuntime::getLaunchFn() const { 162 if (CGM.getLangOpts().HIP) { 163 // hipError_t hipLaunchByPtr(char *); 164 return CGM.CreateRuntimeFunction( 165 llvm::FunctionType::get(IntTy, CharPtrTy, false), "hipLaunchByPtr"); 166 } else { 167 // cudaError_t cudaLaunch(char *); 168 return CGM.CreateRuntimeFunction( 169 llvm::FunctionType::get(IntTy, CharPtrTy, false), "cudaLaunch"); 170 } 171 } 172 173 llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy() const { 174 return llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false); 175 } 176 177 llvm::FunctionType *CGNVCUDARuntime::getCallbackFnTy() const { 178 return llvm::FunctionType::get(VoidTy, VoidPtrTy, false); 179 } 180 181 llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy() const { 182 auto CallbackFnTy = getCallbackFnTy(); 183 auto RegisterGlobalsFnTy = getRegisterGlobalsFnTy(); 184 llvm::Type *Params[] = {RegisterGlobalsFnTy->getPointerTo(), VoidPtrTy, 185 VoidPtrTy, CallbackFnTy->getPointerTo()}; 186 return llvm::FunctionType::get(VoidTy, Params, false); 187 } 188 189 void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF, 190 FunctionArgList &Args) { 191 EmittedKernels.push_back(CGF.CurFn); 192 emitDeviceStubBody(CGF, Args); 193 } 194 195 void CGNVCUDARuntime::emitDeviceStubBody(CodeGenFunction &CGF, 196 FunctionArgList &Args) { 197 // Emit a call to cudaSetupArgument for each arg in Args. 198 llvm::Constant *cudaSetupArgFn = getSetupArgumentFn(); 199 llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end"); 200 CharUnits Offset = CharUnits::Zero(); 201 for (const VarDecl *A : Args) { 202 CharUnits TyWidth, TyAlign; 203 std::tie(TyWidth, TyAlign) = 204 CGM.getContext().getTypeInfoInChars(A->getType()); 205 Offset = Offset.alignTo(TyAlign); 206 llvm::Value *Args[] = { 207 CGF.Builder.CreatePointerCast(CGF.GetAddrOfLocalVar(A).getPointer(), 208 VoidPtrTy), 209 llvm::ConstantInt::get(SizeTy, TyWidth.getQuantity()), 210 llvm::ConstantInt::get(SizeTy, Offset.getQuantity()), 211 }; 212 llvm::CallSite CS = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args); 213 llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0); 214 llvm::Value *CSZero = CGF.Builder.CreateICmpEQ(CS.getInstruction(), Zero); 215 llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next"); 216 CGF.Builder.CreateCondBr(CSZero, NextBlock, EndBlock); 217 CGF.EmitBlock(NextBlock); 218 Offset += TyWidth; 219 } 220 221 // Emit the call to cudaLaunch 222 llvm::Constant *cudaLaunchFn = getLaunchFn(); 223 llvm::Value *Arg = CGF.Builder.CreatePointerCast(CGF.CurFn, CharPtrTy); 224 CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg); 225 CGF.EmitBranch(EndBlock); 226 227 CGF.EmitBlock(EndBlock); 228 } 229 230 /// Creates a function that sets up state on the host side for CUDA objects that 231 /// have a presence on both the host and device sides. Specifically, registers 232 /// the host side of kernel functions and device global variables with the CUDA 233 /// runtime. 234 /// \code 235 /// void __cuda_register_globals(void** GpuBinaryHandle) { 236 /// __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...); 237 /// ... 238 /// __cudaRegisterFunction(GpuBinaryHandle,KernelM,...); 239 /// __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...); 240 /// ... 241 /// __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...); 242 /// } 243 /// \endcode 244 llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() { 245 // No need to register anything 246 if (EmittedKernels.empty() && DeviceVars.empty()) 247 return nullptr; 248 249 llvm::Function *RegisterKernelsFunc = llvm::Function::Create( 250 getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage, 251 addUnderscoredPrefixToName("_register_globals"), &TheModule); 252 llvm::BasicBlock *EntryBB = 253 llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc); 254 CGBuilderTy Builder(CGM, Context); 255 Builder.SetInsertPoint(EntryBB); 256 257 // void __cudaRegisterFunction(void **, const char *, char *, const char *, 258 // int, uint3*, uint3*, dim3*, dim3*, int*) 259 llvm::Type *RegisterFuncParams[] = { 260 VoidPtrPtrTy, CharPtrTy, CharPtrTy, CharPtrTy, IntTy, 261 VoidPtrTy, VoidPtrTy, VoidPtrTy, VoidPtrTy, IntTy->getPointerTo()}; 262 llvm::Constant *RegisterFunc = CGM.CreateRuntimeFunction( 263 llvm::FunctionType::get(IntTy, RegisterFuncParams, false), 264 addUnderscoredPrefixToName("RegisterFunction")); 265 266 // Extract GpuBinaryHandle passed as the first argument passed to 267 // __cuda_register_globals() and generate __cudaRegisterFunction() call for 268 // each emitted kernel. 269 llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin(); 270 for (llvm::Function *Kernel : EmittedKernels) { 271 llvm::Constant *KernelName = makeConstantString(Kernel->getName()); 272 llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy); 273 llvm::Value *Args[] = { 274 &GpuBinaryHandlePtr, Builder.CreateBitCast(Kernel, VoidPtrTy), 275 KernelName, KernelName, llvm::ConstantInt::get(IntTy, -1), NullPtr, 276 NullPtr, NullPtr, NullPtr, 277 llvm::ConstantPointerNull::get(IntTy->getPointerTo())}; 278 Builder.CreateCall(RegisterFunc, Args); 279 } 280 281 // void __cudaRegisterVar(void **, char *, char *, const char *, 282 // int, int, int, int) 283 llvm::Type *RegisterVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy, 284 CharPtrTy, IntTy, IntTy, 285 IntTy, IntTy}; 286 llvm::Constant *RegisterVar = CGM.CreateRuntimeFunction( 287 llvm::FunctionType::get(IntTy, RegisterVarParams, false), 288 addUnderscoredPrefixToName("RegisterVar")); 289 for (auto &Pair : DeviceVars) { 290 llvm::GlobalVariable *Var = Pair.first; 291 unsigned Flags = Pair.second; 292 llvm::Constant *VarName = makeConstantString(Var->getName()); 293 uint64_t VarSize = 294 CGM.getDataLayout().getTypeAllocSize(Var->getValueType()); 295 llvm::Value *Args[] = { 296 &GpuBinaryHandlePtr, 297 Builder.CreateBitCast(Var, VoidPtrTy), 298 VarName, 299 VarName, 300 llvm::ConstantInt::get(IntTy, (Flags & ExternDeviceVar) ? 1 : 0), 301 llvm::ConstantInt::get(IntTy, VarSize), 302 llvm::ConstantInt::get(IntTy, (Flags & ConstantDeviceVar) ? 1 : 0), 303 llvm::ConstantInt::get(IntTy, 0)}; 304 Builder.CreateCall(RegisterVar, Args); 305 } 306 307 Builder.CreateRetVoid(); 308 return RegisterKernelsFunc; 309 } 310 311 /// Creates a global constructor function for the module: 312 /// 313 /// For CUDA: 314 /// \code 315 /// void __cuda_module_ctor(void*) { 316 /// Handle = __cudaRegisterFatBinary(GpuBinaryBlob); 317 /// __cuda_register_globals(Handle); 318 /// } 319 /// \endcode 320 /// 321 /// For HIP: 322 /// \code 323 /// void __hip_module_ctor(void*) { 324 /// if (__hip_gpubin_handle == 0) { 325 /// __hip_gpubin_handle = __hipRegisterFatBinary(GpuBinaryBlob); 326 /// __hip_register_globals(__hip_gpubin_handle); 327 /// } 328 /// } 329 /// \endcode 330 llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { 331 bool IsHIP = CGM.getLangOpts().HIP; 332 // No need to generate ctors/dtors if there is no GPU binary. 333 StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName; 334 if (CudaGpuBinaryFileName.empty() && !IsHIP) 335 return nullptr; 336 337 // void __{cuda|hip}_register_globals(void* handle); 338 llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn(); 339 // We always need a function to pass in as callback. Create a dummy 340 // implementation if we don't need to register anything. 341 if (RelocatableDeviceCode && !RegisterGlobalsFunc) 342 RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy()); 343 344 // void ** __{cuda|hip}RegisterFatBinary(void *); 345 llvm::Constant *RegisterFatbinFunc = CGM.CreateRuntimeFunction( 346 llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false), 347 addUnderscoredPrefixToName("RegisterFatBinary")); 348 // struct { int magic, int version, void * gpu_binary, void * dont_care }; 349 llvm::StructType *FatbinWrapperTy = 350 llvm::StructType::get(IntTy, IntTy, VoidPtrTy, VoidPtrTy); 351 352 // Register GPU binary with the CUDA runtime, store returned handle in a 353 // global variable and save a reference in GpuBinaryHandle to be cleaned up 354 // in destructor on exit. Then associate all known kernels with the GPU binary 355 // handle so CUDA runtime can figure out what to call on the GPU side. 356 std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary = nullptr; 357 if (!CudaGpuBinaryFileName.empty()) { 358 llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> CudaGpuBinaryOrErr = 359 llvm::MemoryBuffer::getFileOrSTDIN(CudaGpuBinaryFileName); 360 if (std::error_code EC = CudaGpuBinaryOrErr.getError()) { 361 CGM.getDiags().Report(diag::err_cannot_open_file) 362 << CudaGpuBinaryFileName << EC.message(); 363 return nullptr; 364 } 365 CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get()); 366 } 367 368 llvm::Function *ModuleCtorFunc = llvm::Function::Create( 369 llvm::FunctionType::get(VoidTy, VoidPtrTy, false), 370 llvm::GlobalValue::InternalLinkage, 371 addUnderscoredPrefixToName("_module_ctor"), &TheModule); 372 llvm::BasicBlock *CtorEntryBB = 373 llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc); 374 CGBuilderTy CtorBuilder(CGM, Context); 375 376 CtorBuilder.SetInsertPoint(CtorEntryBB); 377 378 const char *FatbinConstantName; 379 const char *FatbinSectionName; 380 const char *ModuleIDSectionName; 381 StringRef ModuleIDPrefix; 382 llvm::Constant *FatBinStr; 383 unsigned FatMagic; 384 if (IsHIP) { 385 FatbinConstantName = ".hip_fatbin"; 386 FatbinSectionName = ".hipFatBinSegment"; 387 388 ModuleIDSectionName = "__hip_module_id"; 389 ModuleIDPrefix = "__hip_"; 390 391 if (CudaGpuBinary) { 392 // If fatbin is available from early finalization, create a string 393 // literal containing the fat binary loaded from the given file. 394 FatBinStr = makeConstantString(CudaGpuBinary->getBuffer(), "", 395 FatbinConstantName, 8); 396 } else { 397 // If fatbin is not available, create an external symbol 398 // __hip_fatbin in section .hip_fatbin. The external symbol is supposed 399 // to contain the fat binary but will be populated somewhere else, 400 // e.g. by lld through link script. 401 FatBinStr = new llvm::GlobalVariable( 402 CGM.getModule(), CGM.Int8Ty, 403 /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr, 404 "__hip_fatbin", nullptr, 405 llvm::GlobalVariable::NotThreadLocal); 406 cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName); 407 } 408 409 FatMagic = HIPFatMagic; 410 } else { 411 if (RelocatableDeviceCode) 412 FatbinConstantName = CGM.getTriple().isMacOSX() 413 ? "__NV_CUDA,__nv_relfatbin" 414 : "__nv_relfatbin"; 415 else 416 FatbinConstantName = 417 CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin"; 418 // NVIDIA's cuobjdump looks for fatbins in this section. 419 FatbinSectionName = 420 CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment"; 421 422 ModuleIDSectionName = CGM.getTriple().isMacOSX() 423 ? "__NV_CUDA,__nv_module_id" 424 : "__nv_module_id"; 425 ModuleIDPrefix = "__nv_"; 426 427 // For CUDA, create a string literal containing the fat binary loaded from 428 // the given file. 429 FatBinStr = makeConstantString(CudaGpuBinary->getBuffer(), "", 430 FatbinConstantName, 8); 431 FatMagic = CudaFatMagic; 432 } 433 434 // Create initialized wrapper structure that points to the loaded GPU binary 435 ConstantInitBuilder Builder(CGM); 436 auto Values = Builder.beginStruct(FatbinWrapperTy); 437 // Fatbin wrapper magic. 438 Values.addInt(IntTy, FatMagic); 439 // Fatbin version. 440 Values.addInt(IntTy, 1); 441 // Data. 442 Values.add(FatBinStr); 443 // Unused in fatbin v1. 444 Values.add(llvm::ConstantPointerNull::get(VoidPtrTy)); 445 llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal( 446 addUnderscoredPrefixToName("_fatbin_wrapper"), CGM.getPointerAlign(), 447 /*constant*/ true); 448 FatbinWrapper->setSection(FatbinSectionName); 449 450 // There is only one HIP fat binary per linked module, however there are 451 // multiple constructor functions. Make sure the fat binary is registered 452 // only once. The constructor functions are executed by the dynamic loader 453 // before the program gains control. The dynamic loader cannot execute the 454 // constructor functions concurrently since doing that would not guarantee 455 // thread safety of the loaded program. Therefore we can assume sequential 456 // execution of constructor functions here. 457 if (IsHIP) { 458 auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage : 459 llvm::GlobalValue::LinkOnceAnyLinkage; 460 llvm::BasicBlock *IfBlock = 461 llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc); 462 llvm::BasicBlock *ExitBlock = 463 llvm::BasicBlock::Create(Context, "exit", ModuleCtorFunc); 464 // The name, size, and initialization pattern of this variable is part 465 // of HIP ABI. 466 GpuBinaryHandle = new llvm::GlobalVariable( 467 TheModule, VoidPtrPtrTy, /*isConstant=*/false, 468 Linkage, 469 /*Initializer=*/llvm::ConstantPointerNull::get(VoidPtrPtrTy), 470 "__hip_gpubin_handle"); 471 GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getQuantity()); 472 // Prevent the weak symbol in different shared libraries being merged. 473 if (Linkage != llvm::GlobalValue::InternalLinkage) 474 GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility); 475 Address GpuBinaryAddr( 476 GpuBinaryHandle, 477 CharUnits::fromQuantity(GpuBinaryHandle->getAlignment())); 478 { 479 auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr); 480 llvm::Constant *Zero = 481 llvm::Constant::getNullValue(HandleValue->getType()); 482 llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue, Zero); 483 CtorBuilder.CreateCondBr(EQZero, IfBlock, ExitBlock); 484 } 485 { 486 CtorBuilder.SetInsertPoint(IfBlock); 487 // GpuBinaryHandle = __hipRegisterFatBinary(&FatbinWrapper); 488 llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall( 489 RegisterFatbinFunc, 490 CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy)); 491 CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryAddr); 492 CtorBuilder.CreateBr(ExitBlock); 493 } 494 { 495 CtorBuilder.SetInsertPoint(ExitBlock); 496 // Call __hip_register_globals(GpuBinaryHandle); 497 if (RegisterGlobalsFunc) { 498 auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr); 499 CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue); 500 } 501 } 502 } else if (!RelocatableDeviceCode) { 503 // Register binary with CUDA runtime. This is substantially different in 504 // default mode vs. separate compilation! 505 // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper); 506 llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall( 507 RegisterFatbinFunc, 508 CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy)); 509 GpuBinaryHandle = new llvm::GlobalVariable( 510 TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage, 511 llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle"); 512 GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getQuantity()); 513 CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle, 514 CGM.getPointerAlign()); 515 516 // Call __cuda_register_globals(GpuBinaryHandle); 517 if (RegisterGlobalsFunc) 518 CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall); 519 } else { 520 // Generate a unique module ID. 521 SmallString<64> ModuleID; 522 llvm::raw_svector_ostream OS(ModuleID); 523 OS << ModuleIDPrefix << llvm::format("%" PRIx64, FatbinWrapper->getGUID()); 524 llvm::Constant *ModuleIDConstant = 525 makeConstantString(ModuleID.str(), "", ModuleIDSectionName, 32); 526 527 // Create an alias for the FatbinWrapper that nvcc will look for. 528 llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage, 529 Twine("__fatbinwrap") + ModuleID, FatbinWrapper); 530 531 // void __cudaRegisterLinkedBinary%ModuleID%(void (*)(void *), void *, 532 // void *, void (*)(void **)) 533 SmallString<128> RegisterLinkedBinaryName("__cudaRegisterLinkedBinary"); 534 RegisterLinkedBinaryName += ModuleID; 535 llvm::Constant *RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction( 536 getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName); 537 538 assert(RegisterGlobalsFunc && "Expecting at least dummy function!"); 539 llvm::Value *Args[] = {RegisterGlobalsFunc, 540 CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy), 541 ModuleIDConstant, 542 makeDummyFunction(getCallbackFnTy())}; 543 CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args); 544 } 545 546 // Create destructor and register it with atexit() the way NVCC does it. Doing 547 // it during regular destructor phase worked in CUDA before 9.2 but results in 548 // double-free in 9.2. 549 if (llvm::Function *CleanupFn = makeModuleDtorFunction()) { 550 // extern "C" int atexit(void (*f)(void)); 551 llvm::FunctionType *AtExitTy = 552 llvm::FunctionType::get(IntTy, CleanupFn->getType(), false); 553 llvm::Constant *AtExitFunc = 554 CGM.CreateRuntimeFunction(AtExitTy, "atexit", llvm::AttributeList(), 555 /*Local=*/true); 556 CtorBuilder.CreateCall(AtExitFunc, CleanupFn); 557 } 558 559 CtorBuilder.CreateRetVoid(); 560 return ModuleCtorFunc; 561 } 562 563 /// Creates a global destructor function that unregisters the GPU code blob 564 /// registered by constructor. 565 /// 566 /// For CUDA: 567 /// \code 568 /// void __cuda_module_dtor(void*) { 569 /// __cudaUnregisterFatBinary(Handle); 570 /// } 571 /// \endcode 572 /// 573 /// For HIP: 574 /// \code 575 /// void __hip_module_dtor(void*) { 576 /// if (__hip_gpubin_handle) { 577 /// __hipUnregisterFatBinary(__hip_gpubin_handle); 578 /// __hip_gpubin_handle = 0; 579 /// } 580 /// } 581 /// \endcode 582 llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() { 583 // No need for destructor if we don't have a handle to unregister. 584 if (!GpuBinaryHandle) 585 return nullptr; 586 587 // void __cudaUnregisterFatBinary(void ** handle); 588 llvm::Constant *UnregisterFatbinFunc = CGM.CreateRuntimeFunction( 589 llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false), 590 addUnderscoredPrefixToName("UnregisterFatBinary")); 591 592 llvm::Function *ModuleDtorFunc = llvm::Function::Create( 593 llvm::FunctionType::get(VoidTy, VoidPtrTy, false), 594 llvm::GlobalValue::InternalLinkage, 595 addUnderscoredPrefixToName("_module_dtor"), &TheModule); 596 597 llvm::BasicBlock *DtorEntryBB = 598 llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc); 599 CGBuilderTy DtorBuilder(CGM, Context); 600 DtorBuilder.SetInsertPoint(DtorEntryBB); 601 602 Address GpuBinaryAddr(GpuBinaryHandle, CharUnits::fromQuantity( 603 GpuBinaryHandle->getAlignment())); 604 auto HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr); 605 // There is only one HIP fat binary per linked module, however there are 606 // multiple destructor functions. Make sure the fat binary is unregistered 607 // only once. 608 if (CGM.getLangOpts().HIP) { 609 llvm::BasicBlock *IfBlock = 610 llvm::BasicBlock::Create(Context, "if", ModuleDtorFunc); 611 llvm::BasicBlock *ExitBlock = 612 llvm::BasicBlock::Create(Context, "exit", ModuleDtorFunc); 613 llvm::Constant *Zero = llvm::Constant::getNullValue(HandleValue->getType()); 614 llvm::Value *NEZero = DtorBuilder.CreateICmpNE(HandleValue, Zero); 615 DtorBuilder.CreateCondBr(NEZero, IfBlock, ExitBlock); 616 617 DtorBuilder.SetInsertPoint(IfBlock); 618 DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue); 619 DtorBuilder.CreateStore(Zero, GpuBinaryAddr); 620 DtorBuilder.CreateBr(ExitBlock); 621 622 DtorBuilder.SetInsertPoint(ExitBlock); 623 } else { 624 DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue); 625 } 626 DtorBuilder.CreateRetVoid(); 627 return ModuleDtorFunc; 628 } 629 630 CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) { 631 return new CGNVCUDARuntime(CGM); 632 } 633