1 //===----- CGCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This provides a class for CUDA code generation targeting the NVIDIA CUDA 11 // runtime library. 12 // 13 //===----------------------------------------------------------------------===// 14 15 #include "CGCUDARuntime.h" 16 #include "CodeGenFunction.h" 17 #include "CodeGenModule.h" 18 #include "clang/AST/Decl.h" 19 #include "clang/CodeGen/ConstantInitBuilder.h" 20 #include "llvm/IR/BasicBlock.h" 21 #include "llvm/IR/CallSite.h" 22 #include "llvm/IR/Constants.h" 23 #include "llvm/IR/DerivedTypes.h" 24 #include "llvm/Support/Format.h" 25 26 using namespace clang; 27 using namespace CodeGen; 28 29 namespace { 30 constexpr unsigned CudaFatMagic = 0x466243b1; 31 constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF" 32 33 class CGNVCUDARuntime : public CGCUDARuntime { 34 35 private: 36 llvm::IntegerType *IntTy, *SizeTy; 37 llvm::Type *VoidTy; 38 llvm::PointerType *CharPtrTy, *VoidPtrTy, *VoidPtrPtrTy; 39 40 /// Convenience reference to LLVM Context 41 llvm::LLVMContext &Context; 42 /// Convenience reference to the current module 43 llvm::Module &TheModule; 44 /// Keeps track of kernel launch stubs emitted in this module 45 llvm::SmallVector<llvm::Function *, 16> EmittedKernels; 46 llvm::SmallVector<std::pair<llvm::GlobalVariable *, unsigned>, 16> DeviceVars; 47 /// Keeps track of variable containing handle of GPU binary. Populated by 48 /// ModuleCtorFunction() and used to create corresponding cleanup calls in 49 /// ModuleDtorFunction() 50 llvm::GlobalVariable *GpuBinaryHandle = nullptr; 51 /// Whether we generate relocatable device code. 52 bool RelocatableDeviceCode; 53 54 llvm::Constant *getSetupArgumentFn() const; 55 llvm::Constant *getLaunchFn() const; 56 57 llvm::FunctionType *getRegisterGlobalsFnTy() const; 58 llvm::FunctionType *getCallbackFnTy() const; 59 llvm::FunctionType *getRegisterLinkedBinaryFnTy() const; 60 std::string addPrefixToName(StringRef FuncName) const; 61 std::string addUnderscoredPrefixToName(StringRef FuncName) const; 62 63 /// Creates a function to register all kernel stubs generated in this module. 64 llvm::Function *makeRegisterGlobalsFn(); 65 66 /// Helper function that generates a constant string and returns a pointer to 67 /// the start of the string. The result of this function can be used anywhere 68 /// where the C code specifies const char*. 69 llvm::Constant *makeConstantString(const std::string &Str, 70 const std::string &Name = "", 71 const std::string &SectionName = "", 72 unsigned Alignment = 0) { 73 llvm::Constant *Zeros[] = {llvm::ConstantInt::get(SizeTy, 0), 74 llvm::ConstantInt::get(SizeTy, 0)}; 75 auto ConstStr = CGM.GetAddrOfConstantCString(Str, Name.c_str()); 76 llvm::GlobalVariable *GV = 77 cast<llvm::GlobalVariable>(ConstStr.getPointer()); 78 if (!SectionName.empty()) { 79 GV->setSection(SectionName); 80 // Mark the address as used which make sure that this section isn't 81 // merged and we will really have it in the object file. 82 GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None); 83 } 84 if (Alignment) 85 GV->setAlignment(Alignment); 86 87 return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(), 88 ConstStr.getPointer(), Zeros); 89 } 90 91 /// Helper function that generates an empty dummy function returning void. 92 llvm::Function *makeDummyFunction(llvm::FunctionType *FnTy) { 93 assert(FnTy->getReturnType()->isVoidTy() && 94 "Can only generate dummy functions returning void!"); 95 llvm::Function *DummyFunc = llvm::Function::Create( 96 FnTy, llvm::GlobalValue::InternalLinkage, "dummy", &TheModule); 97 98 llvm::BasicBlock *DummyBlock = 99 llvm::BasicBlock::Create(Context, "", DummyFunc); 100 CGBuilderTy FuncBuilder(CGM, Context); 101 FuncBuilder.SetInsertPoint(DummyBlock); 102 FuncBuilder.CreateRetVoid(); 103 104 return DummyFunc; 105 } 106 107 void emitDeviceStubBody(CodeGenFunction &CGF, FunctionArgList &Args); 108 109 public: 110 CGNVCUDARuntime(CodeGenModule &CGM); 111 112 void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override; 113 void registerDeviceVar(llvm::GlobalVariable &Var, unsigned Flags) override { 114 DeviceVars.push_back(std::make_pair(&Var, Flags)); 115 } 116 117 /// Creates module constructor function 118 llvm::Function *makeModuleCtorFunction() override; 119 /// Creates module destructor function 120 llvm::Function *makeModuleDtorFunction() override; 121 }; 122 123 } 124 125 std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const { 126 if (CGM.getLangOpts().HIP) 127 return ((Twine("hip") + Twine(FuncName)).str()); 128 return ((Twine("cuda") + Twine(FuncName)).str()); 129 } 130 std::string 131 CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const { 132 if (CGM.getLangOpts().HIP) 133 return ((Twine("__hip") + Twine(FuncName)).str()); 134 return ((Twine("__cuda") + Twine(FuncName)).str()); 135 } 136 137 CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM) 138 : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()), 139 TheModule(CGM.getModule()), 140 RelocatableDeviceCode(CGM.getLangOpts().CUDARelocatableDeviceCode) { 141 CodeGen::CodeGenTypes &Types = CGM.getTypes(); 142 ASTContext &Ctx = CGM.getContext(); 143 144 IntTy = CGM.IntTy; 145 SizeTy = CGM.SizeTy; 146 VoidTy = CGM.VoidTy; 147 148 CharPtrTy = llvm::PointerType::getUnqual(Types.ConvertType(Ctx.CharTy)); 149 VoidPtrTy = cast<llvm::PointerType>(Types.ConvertType(Ctx.VoidPtrTy)); 150 VoidPtrPtrTy = VoidPtrTy->getPointerTo(); 151 } 152 153 llvm::Constant *CGNVCUDARuntime::getSetupArgumentFn() const { 154 // cudaError_t cudaSetupArgument(void *, size_t, size_t) 155 llvm::Type *Params[] = {VoidPtrTy, SizeTy, SizeTy}; 156 return CGM.CreateRuntimeFunction( 157 llvm::FunctionType::get(IntTy, Params, false), 158 addPrefixToName("SetupArgument")); 159 } 160 161 llvm::Constant *CGNVCUDARuntime::getLaunchFn() const { 162 if (CGM.getLangOpts().HIP) { 163 // hipError_t hipLaunchByPtr(char *); 164 return CGM.CreateRuntimeFunction( 165 llvm::FunctionType::get(IntTy, CharPtrTy, false), "hipLaunchByPtr"); 166 } else { 167 // cudaError_t cudaLaunch(char *); 168 return CGM.CreateRuntimeFunction( 169 llvm::FunctionType::get(IntTy, CharPtrTy, false), "cudaLaunch"); 170 } 171 } 172 173 llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy() const { 174 return llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false); 175 } 176 177 llvm::FunctionType *CGNVCUDARuntime::getCallbackFnTy() const { 178 return llvm::FunctionType::get(VoidTy, VoidPtrTy, false); 179 } 180 181 llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy() const { 182 auto CallbackFnTy = getCallbackFnTy(); 183 auto RegisterGlobalsFnTy = getRegisterGlobalsFnTy(); 184 llvm::Type *Params[] = {RegisterGlobalsFnTy->getPointerTo(), VoidPtrTy, 185 VoidPtrTy, CallbackFnTy->getPointerTo()}; 186 return llvm::FunctionType::get(VoidTy, Params, false); 187 } 188 189 void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF, 190 FunctionArgList &Args) { 191 EmittedKernels.push_back(CGF.CurFn); 192 emitDeviceStubBody(CGF, Args); 193 } 194 195 void CGNVCUDARuntime::emitDeviceStubBody(CodeGenFunction &CGF, 196 FunctionArgList &Args) { 197 // Emit a call to cudaSetupArgument for each arg in Args. 198 llvm::Constant *cudaSetupArgFn = getSetupArgumentFn(); 199 llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end"); 200 CharUnits Offset = CharUnits::Zero(); 201 for (const VarDecl *A : Args) { 202 CharUnits TyWidth, TyAlign; 203 std::tie(TyWidth, TyAlign) = 204 CGM.getContext().getTypeInfoInChars(A->getType()); 205 Offset = Offset.alignTo(TyAlign); 206 llvm::Value *Args[] = { 207 CGF.Builder.CreatePointerCast(CGF.GetAddrOfLocalVar(A).getPointer(), 208 VoidPtrTy), 209 llvm::ConstantInt::get(SizeTy, TyWidth.getQuantity()), 210 llvm::ConstantInt::get(SizeTy, Offset.getQuantity()), 211 }; 212 llvm::CallSite CS = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args); 213 llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0); 214 llvm::Value *CSZero = CGF.Builder.CreateICmpEQ(CS.getInstruction(), Zero); 215 llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next"); 216 CGF.Builder.CreateCondBr(CSZero, NextBlock, EndBlock); 217 CGF.EmitBlock(NextBlock); 218 Offset += TyWidth; 219 } 220 221 // Emit the call to cudaLaunch 222 llvm::Constant *cudaLaunchFn = getLaunchFn(); 223 llvm::Value *Arg = CGF.Builder.CreatePointerCast(CGF.CurFn, CharPtrTy); 224 CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg); 225 CGF.EmitBranch(EndBlock); 226 227 CGF.EmitBlock(EndBlock); 228 } 229 230 /// Creates a function that sets up state on the host side for CUDA objects that 231 /// have a presence on both the host and device sides. Specifically, registers 232 /// the host side of kernel functions and device global variables with the CUDA 233 /// runtime. 234 /// \code 235 /// void __cuda_register_globals(void** GpuBinaryHandle) { 236 /// __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...); 237 /// ... 238 /// __cudaRegisterFunction(GpuBinaryHandle,KernelM,...); 239 /// __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...); 240 /// ... 241 /// __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...); 242 /// } 243 /// \endcode 244 llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() { 245 // No need to register anything 246 if (EmittedKernels.empty() && DeviceVars.empty()) 247 return nullptr; 248 249 llvm::Function *RegisterKernelsFunc = llvm::Function::Create( 250 getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage, 251 addUnderscoredPrefixToName("_register_globals"), &TheModule); 252 llvm::BasicBlock *EntryBB = 253 llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc); 254 CGBuilderTy Builder(CGM, Context); 255 Builder.SetInsertPoint(EntryBB); 256 257 // void __cudaRegisterFunction(void **, const char *, char *, const char *, 258 // int, uint3*, uint3*, dim3*, dim3*, int*) 259 llvm::Type *RegisterFuncParams[] = { 260 VoidPtrPtrTy, CharPtrTy, CharPtrTy, CharPtrTy, IntTy, 261 VoidPtrTy, VoidPtrTy, VoidPtrTy, VoidPtrTy, IntTy->getPointerTo()}; 262 llvm::Constant *RegisterFunc = CGM.CreateRuntimeFunction( 263 llvm::FunctionType::get(IntTy, RegisterFuncParams, false), 264 addUnderscoredPrefixToName("RegisterFunction")); 265 266 // Extract GpuBinaryHandle passed as the first argument passed to 267 // __cuda_register_globals() and generate __cudaRegisterFunction() call for 268 // each emitted kernel. 269 llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin(); 270 for (llvm::Function *Kernel : EmittedKernels) { 271 llvm::Constant *KernelName = makeConstantString(Kernel->getName()); 272 llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy); 273 llvm::Value *Args[] = { 274 &GpuBinaryHandlePtr, Builder.CreateBitCast(Kernel, VoidPtrTy), 275 KernelName, KernelName, llvm::ConstantInt::get(IntTy, -1), NullPtr, 276 NullPtr, NullPtr, NullPtr, 277 llvm::ConstantPointerNull::get(IntTy->getPointerTo())}; 278 Builder.CreateCall(RegisterFunc, Args); 279 } 280 281 // void __cudaRegisterVar(void **, char *, char *, const char *, 282 // int, int, int, int) 283 llvm::Type *RegisterVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy, 284 CharPtrTy, IntTy, IntTy, 285 IntTy, IntTy}; 286 llvm::Constant *RegisterVar = CGM.CreateRuntimeFunction( 287 llvm::FunctionType::get(IntTy, RegisterVarParams, false), 288 addUnderscoredPrefixToName("RegisterVar")); 289 for (auto &Pair : DeviceVars) { 290 llvm::GlobalVariable *Var = Pair.first; 291 unsigned Flags = Pair.second; 292 llvm::Constant *VarName = makeConstantString(Var->getName()); 293 uint64_t VarSize = 294 CGM.getDataLayout().getTypeAllocSize(Var->getValueType()); 295 llvm::Value *Args[] = { 296 &GpuBinaryHandlePtr, 297 Builder.CreateBitCast(Var, VoidPtrTy), 298 VarName, 299 VarName, 300 llvm::ConstantInt::get(IntTy, (Flags & ExternDeviceVar) ? 1 : 0), 301 llvm::ConstantInt::get(IntTy, VarSize), 302 llvm::ConstantInt::get(IntTy, (Flags & ConstantDeviceVar) ? 1 : 0), 303 llvm::ConstantInt::get(IntTy, 0)}; 304 Builder.CreateCall(RegisterVar, Args); 305 } 306 307 Builder.CreateRetVoid(); 308 return RegisterKernelsFunc; 309 } 310 311 /// Creates a global constructor function for the module: 312 /// \code 313 /// void __cuda_module_ctor(void*) { 314 /// Handle = __cudaRegisterFatBinary(GpuBinaryBlob); 315 /// __cuda_register_globals(Handle); 316 /// } 317 /// \endcode 318 llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { 319 bool IsHIP = CGM.getLangOpts().HIP; 320 // No need to generate ctors/dtors if there is no GPU binary. 321 StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName; 322 if (CudaGpuBinaryFileName.empty() && !IsHIP) 323 return nullptr; 324 325 // void __{cuda|hip}_register_globals(void* handle); 326 llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn(); 327 // We always need a function to pass in as callback. Create a dummy 328 // implementation if we don't need to register anything. 329 if (RelocatableDeviceCode && !RegisterGlobalsFunc) 330 RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy()); 331 332 // void ** __{cuda|hip}RegisterFatBinary(void *); 333 llvm::Constant *RegisterFatbinFunc = CGM.CreateRuntimeFunction( 334 llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false), 335 addUnderscoredPrefixToName("RegisterFatBinary")); 336 // struct { int magic, int version, void * gpu_binary, void * dont_care }; 337 llvm::StructType *FatbinWrapperTy = 338 llvm::StructType::get(IntTy, IntTy, VoidPtrTy, VoidPtrTy); 339 340 // Register GPU binary with the CUDA runtime, store returned handle in a 341 // global variable and save a reference in GpuBinaryHandle to be cleaned up 342 // in destructor on exit. Then associate all known kernels with the GPU binary 343 // handle so CUDA runtime can figure out what to call on the GPU side. 344 std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary; 345 if (!IsHIP) { 346 llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> CudaGpuBinaryOrErr = 347 llvm::MemoryBuffer::getFileOrSTDIN(CudaGpuBinaryFileName); 348 if (std::error_code EC = CudaGpuBinaryOrErr.getError()) { 349 CGM.getDiags().Report(diag::err_cannot_open_file) 350 << CudaGpuBinaryFileName << EC.message(); 351 return nullptr; 352 } 353 CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get()); 354 } 355 356 llvm::Function *ModuleCtorFunc = llvm::Function::Create( 357 llvm::FunctionType::get(VoidTy, VoidPtrTy, false), 358 llvm::GlobalValue::InternalLinkage, 359 addUnderscoredPrefixToName("_module_ctor"), &TheModule); 360 llvm::BasicBlock *CtorEntryBB = 361 llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc); 362 CGBuilderTy CtorBuilder(CGM, Context); 363 364 CtorBuilder.SetInsertPoint(CtorEntryBB); 365 366 const char *FatbinConstantName; 367 const char *FatbinSectionName; 368 const char *ModuleIDSectionName; 369 StringRef ModuleIDPrefix; 370 llvm::Constant *FatBinStr; 371 unsigned FatMagic; 372 if (IsHIP) { 373 FatbinConstantName = ".hip_fatbin"; 374 FatbinSectionName = ".hipFatBinSegment"; 375 376 ModuleIDSectionName = "__hip_module_id"; 377 ModuleIDPrefix = "__hip_"; 378 379 // For HIP, create an external symbol __hip_fatbin in section .hip_fatbin. 380 // The external symbol is supposed to contain the fat binary but will be 381 // populated somewhere else, e.g. by lld through link script. 382 FatBinStr = new llvm::GlobalVariable( 383 CGM.getModule(), CGM.Int8Ty, 384 /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr, 385 "__hip_fatbin", nullptr, 386 llvm::GlobalVariable::NotThreadLocal); 387 cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName); 388 389 FatMagic = HIPFatMagic; 390 } else { 391 if (RelocatableDeviceCode) 392 // TODO: Figure out how this is called on mac OS! 393 FatbinConstantName = "__nv_relfatbin"; 394 else 395 FatbinConstantName = 396 CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin"; 397 // NVIDIA's cuobjdump looks for fatbins in this section. 398 FatbinSectionName = 399 CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment"; 400 401 // TODO: Figure out how this is called on mac OS! 402 ModuleIDSectionName = "__nv_module_id"; 403 ModuleIDPrefix = "__nv_"; 404 405 // For CUDA, create a string literal containing the fat binary loaded from 406 // the given file. 407 FatBinStr = makeConstantString(CudaGpuBinary->getBuffer(), "", 408 FatbinConstantName, 8); 409 FatMagic = CudaFatMagic; 410 } 411 412 // Create initialized wrapper structure that points to the loaded GPU binary 413 ConstantInitBuilder Builder(CGM); 414 auto Values = Builder.beginStruct(FatbinWrapperTy); 415 // Fatbin wrapper magic. 416 Values.addInt(IntTy, FatMagic); 417 // Fatbin version. 418 Values.addInt(IntTy, 1); 419 // Data. 420 Values.add(FatBinStr); 421 // Unused in fatbin v1. 422 Values.add(llvm::ConstantPointerNull::get(VoidPtrTy)); 423 llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal( 424 addUnderscoredPrefixToName("_fatbin_wrapper"), CGM.getPointerAlign(), 425 /*constant*/ true); 426 FatbinWrapper->setSection(FatbinSectionName); 427 428 // Register binary with CUDA/HIP runtime. This is substantially different in 429 // default mode vs. separate compilation! 430 if (!RelocatableDeviceCode) { 431 // GpuBinaryHandle = __{cuda|hip}RegisterFatBinary(&FatbinWrapper); 432 llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall( 433 RegisterFatbinFunc, 434 CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy)); 435 GpuBinaryHandle = new llvm::GlobalVariable( 436 TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage, 437 llvm::ConstantPointerNull::get(VoidPtrPtrTy), 438 addUnderscoredPrefixToName("_gpubin_handle")); 439 440 CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle, 441 CGM.getPointerAlign()); 442 443 // Call __{cuda|hip}_register_globals(GpuBinaryHandle); 444 if (RegisterGlobalsFunc) 445 CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall); 446 } else { 447 // Generate a unique module ID. 448 SmallString<64> ModuleID; 449 llvm::raw_svector_ostream OS(ModuleID); 450 OS << ModuleIDPrefix << llvm::format("%x", FatbinWrapper->getGUID()); 451 llvm::Constant *ModuleIDConstant = 452 makeConstantString(ModuleID.str(), "", ModuleIDSectionName, 32); 453 454 // Create an alias for the FatbinWrapper that nvcc or hip backend will 455 // look for. 456 llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage, 457 Twine("__fatbinwrap") + ModuleID, FatbinWrapper); 458 459 // void __{cuda|hip}RegisterLinkedBinary%ModuleID%(void (*)(void *), void *, 460 // void *, void (*)(void **)) 461 SmallString<128> RegisterLinkedBinaryName( 462 addUnderscoredPrefixToName("RegisterLinkedBinary")); 463 RegisterLinkedBinaryName += ModuleID; 464 llvm::Constant *RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction( 465 getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName); 466 467 assert(RegisterGlobalsFunc && "Expecting at least dummy function!"); 468 llvm::Value *Args[] = {RegisterGlobalsFunc, 469 CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy), 470 ModuleIDConstant, 471 makeDummyFunction(getCallbackFnTy())}; 472 CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args); 473 } 474 475 // Create destructor and register it with atexit() the way NVCC does it. Doing 476 // it during regular destructor phase worked in CUDA before 9.2 but results in 477 // double-free in 9.2. 478 if (llvm::Function *CleanupFn = makeModuleDtorFunction()) { 479 // extern "C" int atexit(void (*f)(void)); 480 llvm::FunctionType *AtExitTy = 481 llvm::FunctionType::get(IntTy, CleanupFn->getType(), false); 482 llvm::Constant *AtExitFunc = 483 CGM.CreateRuntimeFunction(AtExitTy, "atexit", llvm::AttributeList(), 484 /*Local=*/true); 485 CtorBuilder.CreateCall(AtExitFunc, CleanupFn); 486 } 487 488 CtorBuilder.CreateRetVoid(); 489 return ModuleCtorFunc; 490 } 491 492 /// Creates a global destructor function that unregisters the GPU code blob 493 /// registered by constructor. 494 /// \code 495 /// void __cuda_module_dtor(void*) { 496 /// __cudaUnregisterFatBinary(Handle); 497 /// } 498 /// \endcode 499 llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() { 500 // No need for destructor if we don't have a handle to unregister. 501 if (!GpuBinaryHandle) 502 return nullptr; 503 504 // void __cudaUnregisterFatBinary(void ** handle); 505 llvm::Constant *UnregisterFatbinFunc = CGM.CreateRuntimeFunction( 506 llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false), 507 addUnderscoredPrefixToName("UnregisterFatBinary")); 508 509 llvm::Function *ModuleDtorFunc = llvm::Function::Create( 510 llvm::FunctionType::get(VoidTy, VoidPtrTy, false), 511 llvm::GlobalValue::InternalLinkage, 512 addUnderscoredPrefixToName("_module_dtor"), &TheModule); 513 514 llvm::BasicBlock *DtorEntryBB = 515 llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc); 516 CGBuilderTy DtorBuilder(CGM, Context); 517 DtorBuilder.SetInsertPoint(DtorEntryBB); 518 519 auto HandleValue = 520 DtorBuilder.CreateAlignedLoad(GpuBinaryHandle, CGM.getPointerAlign()); 521 DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue); 522 523 DtorBuilder.CreateRetVoid(); 524 return ModuleDtorFunc; 525 } 526 527 CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) { 528 return new CGNVCUDARuntime(CGM); 529 } 530