1 //===----- CGCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This provides a class for CUDA code generation targeting the NVIDIA CUDA 10 // runtime library. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "CGCUDARuntime.h" 15 #include "CodeGenFunction.h" 16 #include "CodeGenModule.h" 17 #include "clang/AST/Decl.h" 18 #include "clang/Basic/Cuda.h" 19 #include "clang/CodeGen/CodeGenABITypes.h" 20 #include "clang/CodeGen/ConstantInitBuilder.h" 21 #include "llvm/IR/BasicBlock.h" 22 #include "llvm/IR/Constants.h" 23 #include "llvm/IR/DerivedTypes.h" 24 #include "llvm/Support/Format.h" 25 26 using namespace clang; 27 using namespace CodeGen; 28 29 namespace { 30 constexpr unsigned CudaFatMagic = 0x466243b1; 31 constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF" 32 33 class CGNVCUDARuntime : public CGCUDARuntime { 34 35 private: 36 llvm::IntegerType *IntTy, *SizeTy; 37 llvm::Type *VoidTy; 38 llvm::PointerType *CharPtrTy, *VoidPtrTy, *VoidPtrPtrTy; 39 40 /// Convenience reference to LLVM Context 41 llvm::LLVMContext &Context; 42 /// Convenience reference to the current module 43 llvm::Module &TheModule; 44 /// Keeps track of kernel launch stubs emitted in this module 45 struct KernelInfo { 46 llvm::Function *Kernel; 47 const Decl *D; 48 }; 49 llvm::SmallVector<KernelInfo, 16> EmittedKernels; 50 struct VarInfo { 51 llvm::GlobalVariable *Var; 52 const VarDecl *D; 53 unsigned Flag; 54 }; 55 llvm::SmallVector<VarInfo, 16> DeviceVars; 56 /// Keeps track of variable containing handle of GPU binary. Populated by 57 /// ModuleCtorFunction() and used to create corresponding cleanup calls in 58 /// ModuleDtorFunction() 59 llvm::GlobalVariable *GpuBinaryHandle = nullptr; 60 /// Whether we generate relocatable device code. 61 bool RelocatableDeviceCode; 62 /// Mangle context for device. 63 std::unique_ptr<MangleContext> DeviceMC; 64 65 llvm::FunctionCallee getSetupArgumentFn() const; 66 llvm::FunctionCallee getLaunchFn() const; 67 68 llvm::FunctionType *getRegisterGlobalsFnTy() const; 69 llvm::FunctionType *getCallbackFnTy() const; 70 llvm::FunctionType *getRegisterLinkedBinaryFnTy() const; 71 std::string addPrefixToName(StringRef FuncName) const; 72 std::string addUnderscoredPrefixToName(StringRef FuncName) const; 73 74 /// Creates a function to register all kernel stubs generated in this module. 75 llvm::Function *makeRegisterGlobalsFn(); 76 77 /// Helper function that generates a constant string and returns a pointer to 78 /// the start of the string. The result of this function can be used anywhere 79 /// where the C code specifies const char*. 80 llvm::Constant *makeConstantString(const std::string &Str, 81 const std::string &Name = "", 82 const std::string &SectionName = "", 83 unsigned Alignment = 0) { 84 llvm::Constant *Zeros[] = {llvm::ConstantInt::get(SizeTy, 0), 85 llvm::ConstantInt::get(SizeTy, 0)}; 86 auto ConstStr = CGM.GetAddrOfConstantCString(Str, Name.c_str()); 87 llvm::GlobalVariable *GV = 88 cast<llvm::GlobalVariable>(ConstStr.getPointer()); 89 if (!SectionName.empty()) { 90 GV->setSection(SectionName); 91 // Mark the address as used which make sure that this section isn't 92 // merged and we will really have it in the object file. 93 GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None); 94 } 95 if (Alignment) 96 GV->setAlignment(Alignment); 97 98 return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(), 99 ConstStr.getPointer(), Zeros); 100 } 101 102 /// Helper function that generates an empty dummy function returning void. 103 llvm::Function *makeDummyFunction(llvm::FunctionType *FnTy) { 104 assert(FnTy->getReturnType()->isVoidTy() && 105 "Can only generate dummy functions returning void!"); 106 llvm::Function *DummyFunc = llvm::Function::Create( 107 FnTy, llvm::GlobalValue::InternalLinkage, "dummy", &TheModule); 108 109 llvm::BasicBlock *DummyBlock = 110 llvm::BasicBlock::Create(Context, "", DummyFunc); 111 CGBuilderTy FuncBuilder(CGM, Context); 112 FuncBuilder.SetInsertPoint(DummyBlock); 113 FuncBuilder.CreateRetVoid(); 114 115 return DummyFunc; 116 } 117 118 void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args); 119 void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args); 120 std::string getDeviceSideName(const Decl *ND); 121 122 public: 123 CGNVCUDARuntime(CodeGenModule &CGM); 124 125 void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override; 126 void registerDeviceVar(const VarDecl *VD, llvm::GlobalVariable &Var, 127 unsigned Flags) override { 128 DeviceVars.push_back({&Var, VD, Flags}); 129 } 130 131 /// Creates module constructor function 132 llvm::Function *makeModuleCtorFunction() override; 133 /// Creates module destructor function 134 llvm::Function *makeModuleDtorFunction() override; 135 }; 136 137 } 138 139 std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const { 140 if (CGM.getLangOpts().HIP) 141 return ((Twine("hip") + Twine(FuncName)).str()); 142 return ((Twine("cuda") + Twine(FuncName)).str()); 143 } 144 std::string 145 CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const { 146 if (CGM.getLangOpts().HIP) 147 return ((Twine("__hip") + Twine(FuncName)).str()); 148 return ((Twine("__cuda") + Twine(FuncName)).str()); 149 } 150 151 CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM) 152 : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()), 153 TheModule(CGM.getModule()), 154 RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode), 155 DeviceMC(CGM.getContext().createMangleContext( 156 CGM.getContext().getAuxTargetInfo())) { 157 CodeGen::CodeGenTypes &Types = CGM.getTypes(); 158 ASTContext &Ctx = CGM.getContext(); 159 160 IntTy = CGM.IntTy; 161 SizeTy = CGM.SizeTy; 162 VoidTy = CGM.VoidTy; 163 164 CharPtrTy = llvm::PointerType::getUnqual(Types.ConvertType(Ctx.CharTy)); 165 VoidPtrTy = cast<llvm::PointerType>(Types.ConvertType(Ctx.VoidPtrTy)); 166 VoidPtrPtrTy = VoidPtrTy->getPointerTo(); 167 } 168 169 llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const { 170 // cudaError_t cudaSetupArgument(void *, size_t, size_t) 171 llvm::Type *Params[] = {VoidPtrTy, SizeTy, SizeTy}; 172 return CGM.CreateRuntimeFunction( 173 llvm::FunctionType::get(IntTy, Params, false), 174 addPrefixToName("SetupArgument")); 175 } 176 177 llvm::FunctionCallee CGNVCUDARuntime::getLaunchFn() const { 178 if (CGM.getLangOpts().HIP) { 179 // hipError_t hipLaunchByPtr(char *); 180 return CGM.CreateRuntimeFunction( 181 llvm::FunctionType::get(IntTy, CharPtrTy, false), "hipLaunchByPtr"); 182 } else { 183 // cudaError_t cudaLaunch(char *); 184 return CGM.CreateRuntimeFunction( 185 llvm::FunctionType::get(IntTy, CharPtrTy, false), "cudaLaunch"); 186 } 187 } 188 189 llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy() const { 190 return llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false); 191 } 192 193 llvm::FunctionType *CGNVCUDARuntime::getCallbackFnTy() const { 194 return llvm::FunctionType::get(VoidTy, VoidPtrTy, false); 195 } 196 197 llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy() const { 198 auto CallbackFnTy = getCallbackFnTy(); 199 auto RegisterGlobalsFnTy = getRegisterGlobalsFnTy(); 200 llvm::Type *Params[] = {RegisterGlobalsFnTy->getPointerTo(), VoidPtrTy, 201 VoidPtrTy, CallbackFnTy->getPointerTo()}; 202 return llvm::FunctionType::get(VoidTy, Params, false); 203 } 204 205 std::string CGNVCUDARuntime::getDeviceSideName(const Decl *D) { 206 auto *ND = cast<const NamedDecl>(D); 207 std::string DeviceSideName; 208 if (DeviceMC->shouldMangleDeclName(ND)) { 209 SmallString<256> Buffer; 210 llvm::raw_svector_ostream Out(Buffer); 211 DeviceMC->mangleName(ND, Out); 212 DeviceSideName = Out.str(); 213 } else 214 DeviceSideName = ND->getIdentifier()->getName(); 215 return DeviceSideName; 216 } 217 218 void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF, 219 FunctionArgList &Args) { 220 assert(getDeviceSideName(CGF.CurFuncDecl) == CGF.CurFn->getName() || 221 getDeviceSideName(CGF.CurFuncDecl) + ".stub" == CGF.CurFn->getName() || 222 CGF.CGM.getContext().getTargetInfo().getCXXABI() != 223 CGF.CGM.getContext().getAuxTargetInfo()->getCXXABI()); 224 225 EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl}); 226 if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(), 227 CudaFeature::CUDA_USES_NEW_LAUNCH)) 228 emitDeviceStubBodyNew(CGF, Args); 229 else 230 emitDeviceStubBodyLegacy(CGF, Args); 231 } 232 233 // CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local 234 // array and kernels are launched using cudaLaunchKernel(). 235 void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF, 236 FunctionArgList &Args) { 237 // Build the shadow stack entry at the very start of the function. 238 239 // Calculate amount of space we will need for all arguments. If we have no 240 // args, allocate a single pointer so we still have a valid pointer to the 241 // argument array that we can pass to runtime, even if it will be unused. 242 Address KernelArgs = CGF.CreateTempAlloca( 243 VoidPtrTy, CharUnits::fromQuantity(16), "kernel_args", 244 llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size()))); 245 // Store pointers to the arguments in a locally allocated launch_args. 246 for (unsigned i = 0; i < Args.size(); ++i) { 247 llvm::Value* VarPtr = CGF.GetAddrOfLocalVar(Args[i]).getPointer(); 248 llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(VarPtr, VoidPtrTy); 249 CGF.Builder.CreateDefaultAlignedStore( 250 VoidVarPtr, CGF.Builder.CreateConstGEP1_32(KernelArgs.getPointer(), i)); 251 } 252 253 llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end"); 254 255 // Lookup cudaLaunchKernel function. 256 // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, 257 // void **args, size_t sharedMem, 258 // cudaStream_t stream); 259 TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl(); 260 DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl); 261 IdentifierInfo &cudaLaunchKernelII = 262 CGM.getContext().Idents.get("cudaLaunchKernel"); 263 FunctionDecl *cudaLaunchKernelFD = nullptr; 264 for (const auto &Result : DC->lookup(&cudaLaunchKernelII)) { 265 if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Result)) 266 cudaLaunchKernelFD = FD; 267 } 268 269 if (cudaLaunchKernelFD == nullptr) { 270 CGM.Error(CGF.CurFuncDecl->getLocation(), 271 "Can't find declaration for cudaLaunchKernel()"); 272 return; 273 } 274 // Create temporary dim3 grid_dim, block_dim. 275 ParmVarDecl *GridDimParam = cudaLaunchKernelFD->getParamDecl(1); 276 QualType Dim3Ty = GridDimParam->getType(); 277 Address GridDim = 278 CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "grid_dim"); 279 Address BlockDim = 280 CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim"); 281 Address ShmemSize = 282 CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size"); 283 Address Stream = 284 CGF.CreateTempAlloca(VoidPtrTy, CGM.getPointerAlign(), "stream"); 285 llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction( 286 llvm::FunctionType::get(IntTy, 287 {/*gridDim=*/GridDim.getType(), 288 /*blockDim=*/BlockDim.getType(), 289 /*ShmemSize=*/ShmemSize.getType(), 290 /*Stream=*/Stream.getType()}, 291 /*isVarArg=*/false), 292 "__cudaPopCallConfiguration"); 293 294 CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn, 295 {GridDim.getPointer(), BlockDim.getPointer(), 296 ShmemSize.getPointer(), Stream.getPointer()}); 297 298 // Emit the call to cudaLaunch 299 llvm::Value *Kernel = CGF.Builder.CreatePointerCast(CGF.CurFn, VoidPtrTy); 300 CallArgList LaunchKernelArgs; 301 LaunchKernelArgs.add(RValue::get(Kernel), 302 cudaLaunchKernelFD->getParamDecl(0)->getType()); 303 LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty); 304 LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty); 305 LaunchKernelArgs.add(RValue::get(KernelArgs.getPointer()), 306 cudaLaunchKernelFD->getParamDecl(3)->getType()); 307 LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(ShmemSize)), 308 cudaLaunchKernelFD->getParamDecl(4)->getType()); 309 LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(Stream)), 310 cudaLaunchKernelFD->getParamDecl(5)->getType()); 311 312 QualType QT = cudaLaunchKernelFD->getType(); 313 QualType CQT = QT.getCanonicalType(); 314 llvm::Type *Ty = CGM.getTypes().ConvertType(CQT); 315 llvm::FunctionType *FTy = dyn_cast<llvm::FunctionType>(Ty); 316 317 const CGFunctionInfo &FI = 318 CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD); 319 llvm::FunctionCallee cudaLaunchKernelFn = 320 CGM.CreateRuntimeFunction(FTy, "cudaLaunchKernel"); 321 CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(), 322 LaunchKernelArgs); 323 CGF.EmitBranch(EndBlock); 324 325 CGF.EmitBlock(EndBlock); 326 } 327 328 void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF, 329 FunctionArgList &Args) { 330 // Emit a call to cudaSetupArgument for each arg in Args. 331 llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn(); 332 llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end"); 333 CharUnits Offset = CharUnits::Zero(); 334 for (const VarDecl *A : Args) { 335 CharUnits TyWidth, TyAlign; 336 std::tie(TyWidth, TyAlign) = 337 CGM.getContext().getTypeInfoInChars(A->getType()); 338 Offset = Offset.alignTo(TyAlign); 339 llvm::Value *Args[] = { 340 CGF.Builder.CreatePointerCast(CGF.GetAddrOfLocalVar(A).getPointer(), 341 VoidPtrTy), 342 llvm::ConstantInt::get(SizeTy, TyWidth.getQuantity()), 343 llvm::ConstantInt::get(SizeTy, Offset.getQuantity()), 344 }; 345 llvm::CallBase *CB = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args); 346 llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0); 347 llvm::Value *CBZero = CGF.Builder.CreateICmpEQ(CB, Zero); 348 llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next"); 349 CGF.Builder.CreateCondBr(CBZero, NextBlock, EndBlock); 350 CGF.EmitBlock(NextBlock); 351 Offset += TyWidth; 352 } 353 354 // Emit the call to cudaLaunch 355 llvm::FunctionCallee cudaLaunchFn = getLaunchFn(); 356 llvm::Value *Arg = CGF.Builder.CreatePointerCast(CGF.CurFn, CharPtrTy); 357 CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg); 358 CGF.EmitBranch(EndBlock); 359 360 CGF.EmitBlock(EndBlock); 361 } 362 363 /// Creates a function that sets up state on the host side for CUDA objects that 364 /// have a presence on both the host and device sides. Specifically, registers 365 /// the host side of kernel functions and device global variables with the CUDA 366 /// runtime. 367 /// \code 368 /// void __cuda_register_globals(void** GpuBinaryHandle) { 369 /// __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...); 370 /// ... 371 /// __cudaRegisterFunction(GpuBinaryHandle,KernelM,...); 372 /// __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...); 373 /// ... 374 /// __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...); 375 /// } 376 /// \endcode 377 llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() { 378 // No need to register anything 379 if (EmittedKernels.empty() && DeviceVars.empty()) 380 return nullptr; 381 382 llvm::Function *RegisterKernelsFunc = llvm::Function::Create( 383 getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage, 384 addUnderscoredPrefixToName("_register_globals"), &TheModule); 385 llvm::BasicBlock *EntryBB = 386 llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc); 387 CGBuilderTy Builder(CGM, Context); 388 Builder.SetInsertPoint(EntryBB); 389 390 // void __cudaRegisterFunction(void **, const char *, char *, const char *, 391 // int, uint3*, uint3*, dim3*, dim3*, int*) 392 llvm::Type *RegisterFuncParams[] = { 393 VoidPtrPtrTy, CharPtrTy, CharPtrTy, CharPtrTy, IntTy, 394 VoidPtrTy, VoidPtrTy, VoidPtrTy, VoidPtrTy, IntTy->getPointerTo()}; 395 llvm::FunctionCallee RegisterFunc = CGM.CreateRuntimeFunction( 396 llvm::FunctionType::get(IntTy, RegisterFuncParams, false), 397 addUnderscoredPrefixToName("RegisterFunction")); 398 399 // Extract GpuBinaryHandle passed as the first argument passed to 400 // __cuda_register_globals() and generate __cudaRegisterFunction() call for 401 // each emitted kernel. 402 llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin(); 403 for (auto &&I : EmittedKernels) { 404 llvm::Constant *KernelName = makeConstantString(getDeviceSideName(I.D)); 405 llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy); 406 llvm::Value *Args[] = { 407 &GpuBinaryHandlePtr, 408 Builder.CreateBitCast(I.Kernel, VoidPtrTy), 409 KernelName, 410 KernelName, 411 llvm::ConstantInt::get(IntTy, -1), 412 NullPtr, 413 NullPtr, 414 NullPtr, 415 NullPtr, 416 llvm::ConstantPointerNull::get(IntTy->getPointerTo())}; 417 Builder.CreateCall(RegisterFunc, Args); 418 } 419 420 // void __cudaRegisterVar(void **, char *, char *, const char *, 421 // int, int, int, int) 422 llvm::Type *RegisterVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy, 423 CharPtrTy, IntTy, IntTy, 424 IntTy, IntTy}; 425 llvm::FunctionCallee RegisterVar = CGM.CreateRuntimeFunction( 426 llvm::FunctionType::get(IntTy, RegisterVarParams, false), 427 addUnderscoredPrefixToName("RegisterVar")); 428 for (auto &&Info : DeviceVars) { 429 llvm::GlobalVariable *Var = Info.Var; 430 unsigned Flags = Info.Flag; 431 llvm::Constant *VarName = makeConstantString(getDeviceSideName(Info.D)); 432 uint64_t VarSize = 433 CGM.getDataLayout().getTypeAllocSize(Var->getValueType()); 434 llvm::Value *Args[] = { 435 &GpuBinaryHandlePtr, 436 Builder.CreateBitCast(Var, VoidPtrTy), 437 VarName, 438 VarName, 439 llvm::ConstantInt::get(IntTy, (Flags & ExternDeviceVar) ? 1 : 0), 440 llvm::ConstantInt::get(IntTy, VarSize), 441 llvm::ConstantInt::get(IntTy, (Flags & ConstantDeviceVar) ? 1 : 0), 442 llvm::ConstantInt::get(IntTy, 0)}; 443 Builder.CreateCall(RegisterVar, Args); 444 } 445 446 Builder.CreateRetVoid(); 447 return RegisterKernelsFunc; 448 } 449 450 /// Creates a global constructor function for the module: 451 /// 452 /// For CUDA: 453 /// \code 454 /// void __cuda_module_ctor(void*) { 455 /// Handle = __cudaRegisterFatBinary(GpuBinaryBlob); 456 /// __cuda_register_globals(Handle); 457 /// } 458 /// \endcode 459 /// 460 /// For HIP: 461 /// \code 462 /// void __hip_module_ctor(void*) { 463 /// if (__hip_gpubin_handle == 0) { 464 /// __hip_gpubin_handle = __hipRegisterFatBinary(GpuBinaryBlob); 465 /// __hip_register_globals(__hip_gpubin_handle); 466 /// } 467 /// } 468 /// \endcode 469 llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { 470 bool IsHIP = CGM.getLangOpts().HIP; 471 bool IsCUDA = CGM.getLangOpts().CUDA; 472 // No need to generate ctors/dtors if there is no GPU binary. 473 StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName; 474 if (CudaGpuBinaryFileName.empty() && !IsHIP) 475 return nullptr; 476 if ((IsHIP || (IsCUDA && !RelocatableDeviceCode)) && EmittedKernels.empty() && 477 DeviceVars.empty()) 478 return nullptr; 479 480 // void __{cuda|hip}_register_globals(void* handle); 481 llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn(); 482 // We always need a function to pass in as callback. Create a dummy 483 // implementation if we don't need to register anything. 484 if (RelocatableDeviceCode && !RegisterGlobalsFunc) 485 RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy()); 486 487 // void ** __{cuda|hip}RegisterFatBinary(void *); 488 llvm::FunctionCallee RegisterFatbinFunc = CGM.CreateRuntimeFunction( 489 llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false), 490 addUnderscoredPrefixToName("RegisterFatBinary")); 491 // struct { int magic, int version, void * gpu_binary, void * dont_care }; 492 llvm::StructType *FatbinWrapperTy = 493 llvm::StructType::get(IntTy, IntTy, VoidPtrTy, VoidPtrTy); 494 495 // Register GPU binary with the CUDA runtime, store returned handle in a 496 // global variable and save a reference in GpuBinaryHandle to be cleaned up 497 // in destructor on exit. Then associate all known kernels with the GPU binary 498 // handle so CUDA runtime can figure out what to call on the GPU side. 499 std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary = nullptr; 500 if (!CudaGpuBinaryFileName.empty()) { 501 llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> CudaGpuBinaryOrErr = 502 llvm::MemoryBuffer::getFileOrSTDIN(CudaGpuBinaryFileName); 503 if (std::error_code EC = CudaGpuBinaryOrErr.getError()) { 504 CGM.getDiags().Report(diag::err_cannot_open_file) 505 << CudaGpuBinaryFileName << EC.message(); 506 return nullptr; 507 } 508 CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get()); 509 } 510 511 llvm::Function *ModuleCtorFunc = llvm::Function::Create( 512 llvm::FunctionType::get(VoidTy, VoidPtrTy, false), 513 llvm::GlobalValue::InternalLinkage, 514 addUnderscoredPrefixToName("_module_ctor"), &TheModule); 515 llvm::BasicBlock *CtorEntryBB = 516 llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc); 517 CGBuilderTy CtorBuilder(CGM, Context); 518 519 CtorBuilder.SetInsertPoint(CtorEntryBB); 520 521 const char *FatbinConstantName; 522 const char *FatbinSectionName; 523 const char *ModuleIDSectionName; 524 StringRef ModuleIDPrefix; 525 llvm::Constant *FatBinStr; 526 unsigned FatMagic; 527 if (IsHIP) { 528 FatbinConstantName = ".hip_fatbin"; 529 FatbinSectionName = ".hipFatBinSegment"; 530 531 ModuleIDSectionName = "__hip_module_id"; 532 ModuleIDPrefix = "__hip_"; 533 534 if (CudaGpuBinary) { 535 // If fatbin is available from early finalization, create a string 536 // literal containing the fat binary loaded from the given file. 537 FatBinStr = makeConstantString(CudaGpuBinary->getBuffer(), "", 538 FatbinConstantName, 8); 539 } else { 540 // If fatbin is not available, create an external symbol 541 // __hip_fatbin in section .hip_fatbin. The external symbol is supposed 542 // to contain the fat binary but will be populated somewhere else, 543 // e.g. by lld through link script. 544 FatBinStr = new llvm::GlobalVariable( 545 CGM.getModule(), CGM.Int8Ty, 546 /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr, 547 "__hip_fatbin", nullptr, 548 llvm::GlobalVariable::NotThreadLocal); 549 cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName); 550 } 551 552 FatMagic = HIPFatMagic; 553 } else { 554 if (RelocatableDeviceCode) 555 FatbinConstantName = CGM.getTriple().isMacOSX() 556 ? "__NV_CUDA,__nv_relfatbin" 557 : "__nv_relfatbin"; 558 else 559 FatbinConstantName = 560 CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin"; 561 // NVIDIA's cuobjdump looks for fatbins in this section. 562 FatbinSectionName = 563 CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment"; 564 565 ModuleIDSectionName = CGM.getTriple().isMacOSX() 566 ? "__NV_CUDA,__nv_module_id" 567 : "__nv_module_id"; 568 ModuleIDPrefix = "__nv_"; 569 570 // For CUDA, create a string literal containing the fat binary loaded from 571 // the given file. 572 FatBinStr = makeConstantString(CudaGpuBinary->getBuffer(), "", 573 FatbinConstantName, 8); 574 FatMagic = CudaFatMagic; 575 } 576 577 // Create initialized wrapper structure that points to the loaded GPU binary 578 ConstantInitBuilder Builder(CGM); 579 auto Values = Builder.beginStruct(FatbinWrapperTy); 580 // Fatbin wrapper magic. 581 Values.addInt(IntTy, FatMagic); 582 // Fatbin version. 583 Values.addInt(IntTy, 1); 584 // Data. 585 Values.add(FatBinStr); 586 // Unused in fatbin v1. 587 Values.add(llvm::ConstantPointerNull::get(VoidPtrTy)); 588 llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal( 589 addUnderscoredPrefixToName("_fatbin_wrapper"), CGM.getPointerAlign(), 590 /*constant*/ true); 591 FatbinWrapper->setSection(FatbinSectionName); 592 593 // There is only one HIP fat binary per linked module, however there are 594 // multiple constructor functions. Make sure the fat binary is registered 595 // only once. The constructor functions are executed by the dynamic loader 596 // before the program gains control. The dynamic loader cannot execute the 597 // constructor functions concurrently since doing that would not guarantee 598 // thread safety of the loaded program. Therefore we can assume sequential 599 // execution of constructor functions here. 600 if (IsHIP) { 601 auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage : 602 llvm::GlobalValue::LinkOnceAnyLinkage; 603 llvm::BasicBlock *IfBlock = 604 llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc); 605 llvm::BasicBlock *ExitBlock = 606 llvm::BasicBlock::Create(Context, "exit", ModuleCtorFunc); 607 // The name, size, and initialization pattern of this variable is part 608 // of HIP ABI. 609 GpuBinaryHandle = new llvm::GlobalVariable( 610 TheModule, VoidPtrPtrTy, /*isConstant=*/false, 611 Linkage, 612 /*Initializer=*/llvm::ConstantPointerNull::get(VoidPtrPtrTy), 613 "__hip_gpubin_handle"); 614 GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getQuantity()); 615 // Prevent the weak symbol in different shared libraries being merged. 616 if (Linkage != llvm::GlobalValue::InternalLinkage) 617 GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility); 618 Address GpuBinaryAddr( 619 GpuBinaryHandle, 620 CharUnits::fromQuantity(GpuBinaryHandle->getAlignment())); 621 { 622 auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr); 623 llvm::Constant *Zero = 624 llvm::Constant::getNullValue(HandleValue->getType()); 625 llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue, Zero); 626 CtorBuilder.CreateCondBr(EQZero, IfBlock, ExitBlock); 627 } 628 { 629 CtorBuilder.SetInsertPoint(IfBlock); 630 // GpuBinaryHandle = __hipRegisterFatBinary(&FatbinWrapper); 631 llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall( 632 RegisterFatbinFunc, 633 CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy)); 634 CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryAddr); 635 CtorBuilder.CreateBr(ExitBlock); 636 } 637 { 638 CtorBuilder.SetInsertPoint(ExitBlock); 639 // Call __hip_register_globals(GpuBinaryHandle); 640 if (RegisterGlobalsFunc) { 641 auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr); 642 CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue); 643 } 644 } 645 } else if (!RelocatableDeviceCode) { 646 // Register binary with CUDA runtime. This is substantially different in 647 // default mode vs. separate compilation! 648 // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper); 649 llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall( 650 RegisterFatbinFunc, 651 CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy)); 652 GpuBinaryHandle = new llvm::GlobalVariable( 653 TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage, 654 llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle"); 655 GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getQuantity()); 656 CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle, 657 CGM.getPointerAlign()); 658 659 // Call __cuda_register_globals(GpuBinaryHandle); 660 if (RegisterGlobalsFunc) 661 CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall); 662 663 // Call __cudaRegisterFatBinaryEnd(Handle) if this CUDA version needs it. 664 if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(), 665 CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) { 666 // void __cudaRegisterFatBinaryEnd(void **); 667 llvm::FunctionCallee RegisterFatbinEndFunc = CGM.CreateRuntimeFunction( 668 llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false), 669 "__cudaRegisterFatBinaryEnd"); 670 CtorBuilder.CreateCall(RegisterFatbinEndFunc, RegisterFatbinCall); 671 } 672 } else { 673 // Generate a unique module ID. 674 SmallString<64> ModuleID; 675 llvm::raw_svector_ostream OS(ModuleID); 676 OS << ModuleIDPrefix << llvm::format("%" PRIx64, FatbinWrapper->getGUID()); 677 llvm::Constant *ModuleIDConstant = 678 makeConstantString(ModuleID.str(), "", ModuleIDSectionName, 32); 679 680 // Create an alias for the FatbinWrapper that nvcc will look for. 681 llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage, 682 Twine("__fatbinwrap") + ModuleID, FatbinWrapper); 683 684 // void __cudaRegisterLinkedBinary%ModuleID%(void (*)(void *), void *, 685 // void *, void (*)(void **)) 686 SmallString<128> RegisterLinkedBinaryName("__cudaRegisterLinkedBinary"); 687 RegisterLinkedBinaryName += ModuleID; 688 llvm::FunctionCallee RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction( 689 getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName); 690 691 assert(RegisterGlobalsFunc && "Expecting at least dummy function!"); 692 llvm::Value *Args[] = {RegisterGlobalsFunc, 693 CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy), 694 ModuleIDConstant, 695 makeDummyFunction(getCallbackFnTy())}; 696 CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args); 697 } 698 699 // Create destructor and register it with atexit() the way NVCC does it. Doing 700 // it during regular destructor phase worked in CUDA before 9.2 but results in 701 // double-free in 9.2. 702 if (llvm::Function *CleanupFn = makeModuleDtorFunction()) { 703 // extern "C" int atexit(void (*f)(void)); 704 llvm::FunctionType *AtExitTy = 705 llvm::FunctionType::get(IntTy, CleanupFn->getType(), false); 706 llvm::FunctionCallee AtExitFunc = 707 CGM.CreateRuntimeFunction(AtExitTy, "atexit", llvm::AttributeList(), 708 /*Local=*/true); 709 CtorBuilder.CreateCall(AtExitFunc, CleanupFn); 710 } 711 712 CtorBuilder.CreateRetVoid(); 713 return ModuleCtorFunc; 714 } 715 716 /// Creates a global destructor function that unregisters the GPU code blob 717 /// registered by constructor. 718 /// 719 /// For CUDA: 720 /// \code 721 /// void __cuda_module_dtor(void*) { 722 /// __cudaUnregisterFatBinary(Handle); 723 /// } 724 /// \endcode 725 /// 726 /// For HIP: 727 /// \code 728 /// void __hip_module_dtor(void*) { 729 /// if (__hip_gpubin_handle) { 730 /// __hipUnregisterFatBinary(__hip_gpubin_handle); 731 /// __hip_gpubin_handle = 0; 732 /// } 733 /// } 734 /// \endcode 735 llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() { 736 // No need for destructor if we don't have a handle to unregister. 737 if (!GpuBinaryHandle) 738 return nullptr; 739 740 // void __cudaUnregisterFatBinary(void ** handle); 741 llvm::FunctionCallee UnregisterFatbinFunc = CGM.CreateRuntimeFunction( 742 llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false), 743 addUnderscoredPrefixToName("UnregisterFatBinary")); 744 745 llvm::Function *ModuleDtorFunc = llvm::Function::Create( 746 llvm::FunctionType::get(VoidTy, VoidPtrTy, false), 747 llvm::GlobalValue::InternalLinkage, 748 addUnderscoredPrefixToName("_module_dtor"), &TheModule); 749 750 llvm::BasicBlock *DtorEntryBB = 751 llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc); 752 CGBuilderTy DtorBuilder(CGM, Context); 753 DtorBuilder.SetInsertPoint(DtorEntryBB); 754 755 Address GpuBinaryAddr(GpuBinaryHandle, CharUnits::fromQuantity( 756 GpuBinaryHandle->getAlignment())); 757 auto HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr); 758 // There is only one HIP fat binary per linked module, however there are 759 // multiple destructor functions. Make sure the fat binary is unregistered 760 // only once. 761 if (CGM.getLangOpts().HIP) { 762 llvm::BasicBlock *IfBlock = 763 llvm::BasicBlock::Create(Context, "if", ModuleDtorFunc); 764 llvm::BasicBlock *ExitBlock = 765 llvm::BasicBlock::Create(Context, "exit", ModuleDtorFunc); 766 llvm::Constant *Zero = llvm::Constant::getNullValue(HandleValue->getType()); 767 llvm::Value *NEZero = DtorBuilder.CreateICmpNE(HandleValue, Zero); 768 DtorBuilder.CreateCondBr(NEZero, IfBlock, ExitBlock); 769 770 DtorBuilder.SetInsertPoint(IfBlock); 771 DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue); 772 DtorBuilder.CreateStore(Zero, GpuBinaryAddr); 773 DtorBuilder.CreateBr(ExitBlock); 774 775 DtorBuilder.SetInsertPoint(ExitBlock); 776 } else { 777 DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue); 778 } 779 DtorBuilder.CreateRetVoid(); 780 return ModuleDtorFunc; 781 } 782 783 CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) { 784 return new CGNVCUDARuntime(CGM); 785 } 786