1 //===----- CGCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This provides a class for CUDA code generation targeting the NVIDIA CUDA 10 // runtime library. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "CGCUDARuntime.h" 15 #include "CodeGenFunction.h" 16 #include "CodeGenModule.h" 17 #include "clang/AST/Decl.h" 18 #include "clang/Basic/Cuda.h" 19 #include "clang/CodeGen/CodeGenABITypes.h" 20 #include "clang/CodeGen/ConstantInitBuilder.h" 21 #include "llvm/IR/BasicBlock.h" 22 #include "llvm/IR/Constants.h" 23 #include "llvm/IR/DerivedTypes.h" 24 #include "llvm/Support/Format.h" 25 26 using namespace clang; 27 using namespace CodeGen; 28 29 namespace { 30 constexpr unsigned CudaFatMagic = 0x466243b1; 31 constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF" 32 33 class CGNVCUDARuntime : public CGCUDARuntime { 34 35 private: 36 llvm::IntegerType *IntTy, *SizeTy; 37 llvm::Type *VoidTy; 38 llvm::PointerType *CharPtrTy, *VoidPtrTy, *VoidPtrPtrTy; 39 40 /// Convenience reference to LLVM Context 41 llvm::LLVMContext &Context; 42 /// Convenience reference to the current module 43 llvm::Module &TheModule; 44 /// Keeps track of kernel launch stubs emitted in this module 45 struct KernelInfo { 46 llvm::Function *Kernel; 47 const Decl *D; 48 }; 49 llvm::SmallVector<KernelInfo, 16> EmittedKernels; 50 struct VarInfo { 51 llvm::GlobalVariable *Var; 52 const VarDecl *D; 53 unsigned Flag; 54 }; 55 llvm::SmallVector<VarInfo, 16> DeviceVars; 56 /// Keeps track of variable containing handle of GPU binary. Populated by 57 /// ModuleCtorFunction() and used to create corresponding cleanup calls in 58 /// ModuleDtorFunction() 59 llvm::GlobalVariable *GpuBinaryHandle = nullptr; 60 /// Whether we generate relocatable device code. 61 bool RelocatableDeviceCode; 62 /// Mangle context for device. 63 std::unique_ptr<MangleContext> DeviceMC; 64 65 llvm::FunctionCallee getSetupArgumentFn() const; 66 llvm::FunctionCallee getLaunchFn() const; 67 68 llvm::FunctionType *getRegisterGlobalsFnTy() const; 69 llvm::FunctionType *getCallbackFnTy() const; 70 llvm::FunctionType *getRegisterLinkedBinaryFnTy() const; 71 std::string addPrefixToName(StringRef FuncName) const; 72 std::string addUnderscoredPrefixToName(StringRef FuncName) const; 73 74 /// Creates a function to register all kernel stubs generated in this module. 75 llvm::Function *makeRegisterGlobalsFn(); 76 77 /// Helper function that generates a constant string and returns a pointer to 78 /// the start of the string. The result of this function can be used anywhere 79 /// where the C code specifies const char*. 80 llvm::Constant *makeConstantString(const std::string &Str, 81 const std::string &Name = "", 82 const std::string &SectionName = "", 83 unsigned Alignment = 0) { 84 llvm::Constant *Zeros[] = {llvm::ConstantInt::get(SizeTy, 0), 85 llvm::ConstantInt::get(SizeTy, 0)}; 86 auto ConstStr = CGM.GetAddrOfConstantCString(Str, Name.c_str()); 87 llvm::GlobalVariable *GV = 88 cast<llvm::GlobalVariable>(ConstStr.getPointer()); 89 if (!SectionName.empty()) { 90 GV->setSection(SectionName); 91 // Mark the address as used which make sure that this section isn't 92 // merged and we will really have it in the object file. 93 GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None); 94 } 95 if (Alignment) 96 GV->setAlignment(llvm::Align(Alignment)); 97 98 return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(), 99 ConstStr.getPointer(), Zeros); 100 } 101 102 /// Helper function that generates an empty dummy function returning void. 103 llvm::Function *makeDummyFunction(llvm::FunctionType *FnTy) { 104 assert(FnTy->getReturnType()->isVoidTy() && 105 "Can only generate dummy functions returning void!"); 106 llvm::Function *DummyFunc = llvm::Function::Create( 107 FnTy, llvm::GlobalValue::InternalLinkage, "dummy", &TheModule); 108 109 llvm::BasicBlock *DummyBlock = 110 llvm::BasicBlock::Create(Context, "", DummyFunc); 111 CGBuilderTy FuncBuilder(CGM, Context); 112 FuncBuilder.SetInsertPoint(DummyBlock); 113 FuncBuilder.CreateRetVoid(); 114 115 return DummyFunc; 116 } 117 118 void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args); 119 void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args); 120 std::string getDeviceSideName(const NamedDecl *ND) override; 121 122 public: 123 CGNVCUDARuntime(CodeGenModule &CGM); 124 125 void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override; 126 void registerDeviceVar(const VarDecl *VD, llvm::GlobalVariable &Var, 127 unsigned Flags) override { 128 DeviceVars.push_back({&Var, VD, Flags}); 129 } 130 131 /// Creates module constructor function 132 llvm::Function *makeModuleCtorFunction() override; 133 /// Creates module destructor function 134 llvm::Function *makeModuleDtorFunction() override; 135 }; 136 137 } 138 139 std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const { 140 if (CGM.getLangOpts().HIP) 141 return ((Twine("hip") + Twine(FuncName)).str()); 142 return ((Twine("cuda") + Twine(FuncName)).str()); 143 } 144 std::string 145 CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const { 146 if (CGM.getLangOpts().HIP) 147 return ((Twine("__hip") + Twine(FuncName)).str()); 148 return ((Twine("__cuda") + Twine(FuncName)).str()); 149 } 150 151 CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM) 152 : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()), 153 TheModule(CGM.getModule()), 154 RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode), 155 DeviceMC(CGM.getContext().createMangleContext( 156 CGM.getContext().getAuxTargetInfo())) { 157 CodeGen::CodeGenTypes &Types = CGM.getTypes(); 158 ASTContext &Ctx = CGM.getContext(); 159 160 IntTy = CGM.IntTy; 161 SizeTy = CGM.SizeTy; 162 VoidTy = CGM.VoidTy; 163 164 CharPtrTy = llvm::PointerType::getUnqual(Types.ConvertType(Ctx.CharTy)); 165 VoidPtrTy = cast<llvm::PointerType>(Types.ConvertType(Ctx.VoidPtrTy)); 166 VoidPtrPtrTy = VoidPtrTy->getPointerTo(); 167 } 168 169 llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const { 170 // cudaError_t cudaSetupArgument(void *, size_t, size_t) 171 llvm::Type *Params[] = {VoidPtrTy, SizeTy, SizeTy}; 172 return CGM.CreateRuntimeFunction( 173 llvm::FunctionType::get(IntTy, Params, false), 174 addPrefixToName("SetupArgument")); 175 } 176 177 llvm::FunctionCallee CGNVCUDARuntime::getLaunchFn() const { 178 if (CGM.getLangOpts().HIP) { 179 // hipError_t hipLaunchByPtr(char *); 180 return CGM.CreateRuntimeFunction( 181 llvm::FunctionType::get(IntTy, CharPtrTy, false), "hipLaunchByPtr"); 182 } else { 183 // cudaError_t cudaLaunch(char *); 184 return CGM.CreateRuntimeFunction( 185 llvm::FunctionType::get(IntTy, CharPtrTy, false), "cudaLaunch"); 186 } 187 } 188 189 llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy() const { 190 return llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false); 191 } 192 193 llvm::FunctionType *CGNVCUDARuntime::getCallbackFnTy() const { 194 return llvm::FunctionType::get(VoidTy, VoidPtrTy, false); 195 } 196 197 llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy() const { 198 auto CallbackFnTy = getCallbackFnTy(); 199 auto RegisterGlobalsFnTy = getRegisterGlobalsFnTy(); 200 llvm::Type *Params[] = {RegisterGlobalsFnTy->getPointerTo(), VoidPtrTy, 201 VoidPtrTy, CallbackFnTy->getPointerTo()}; 202 return llvm::FunctionType::get(VoidTy, Params, false); 203 } 204 205 std::string CGNVCUDARuntime::getDeviceSideName(const NamedDecl *ND) { 206 GlobalDecl GD; 207 // D could be either a kernel or a variable. 208 if (auto *FD = dyn_cast<FunctionDecl>(ND)) 209 GD = GlobalDecl(FD, KernelReferenceKind::Kernel); 210 else 211 GD = GlobalDecl(ND); 212 std::string DeviceSideName; 213 if (DeviceMC->shouldMangleDeclName(ND)) { 214 SmallString<256> Buffer; 215 llvm::raw_svector_ostream Out(Buffer); 216 DeviceMC->mangleName(GD, Out); 217 DeviceSideName = std::string(Out.str()); 218 } else 219 DeviceSideName = std::string(ND->getIdentifier()->getName()); 220 return DeviceSideName; 221 } 222 223 void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF, 224 FunctionArgList &Args) { 225 EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl}); 226 if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(), 227 CudaFeature::CUDA_USES_NEW_LAUNCH) || 228 CGF.getLangOpts().HIPUseNewLaunchAPI) 229 emitDeviceStubBodyNew(CGF, Args); 230 else 231 emitDeviceStubBodyLegacy(CGF, Args); 232 } 233 234 // CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local 235 // array and kernels are launched using cudaLaunchKernel(). 236 void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF, 237 FunctionArgList &Args) { 238 // Build the shadow stack entry at the very start of the function. 239 240 // Calculate amount of space we will need for all arguments. If we have no 241 // args, allocate a single pointer so we still have a valid pointer to the 242 // argument array that we can pass to runtime, even if it will be unused. 243 Address KernelArgs = CGF.CreateTempAlloca( 244 VoidPtrTy, CharUnits::fromQuantity(16), "kernel_args", 245 llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size()))); 246 // Store pointers to the arguments in a locally allocated launch_args. 247 for (unsigned i = 0; i < Args.size(); ++i) { 248 llvm::Value* VarPtr = CGF.GetAddrOfLocalVar(Args[i]).getPointer(); 249 llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(VarPtr, VoidPtrTy); 250 CGF.Builder.CreateDefaultAlignedStore( 251 VoidVarPtr, CGF.Builder.CreateConstGEP1_32(KernelArgs.getPointer(), i)); 252 } 253 254 llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end"); 255 256 // Lookup cudaLaunchKernel/hipLaunchKernel function. 257 // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, 258 // void **args, size_t sharedMem, 259 // cudaStream_t stream); 260 // hipError_t hipLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, 261 // void **args, size_t sharedMem, 262 // hipStream_t stream); 263 TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl(); 264 DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl); 265 auto LaunchKernelName = addPrefixToName("LaunchKernel"); 266 IdentifierInfo &cudaLaunchKernelII = 267 CGM.getContext().Idents.get(LaunchKernelName); 268 FunctionDecl *cudaLaunchKernelFD = nullptr; 269 for (const auto &Result : DC->lookup(&cudaLaunchKernelII)) { 270 if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Result)) 271 cudaLaunchKernelFD = FD; 272 } 273 274 if (cudaLaunchKernelFD == nullptr) { 275 CGM.Error(CGF.CurFuncDecl->getLocation(), 276 "Can't find declaration for " + LaunchKernelName); 277 return; 278 } 279 // Create temporary dim3 grid_dim, block_dim. 280 ParmVarDecl *GridDimParam = cudaLaunchKernelFD->getParamDecl(1); 281 QualType Dim3Ty = GridDimParam->getType(); 282 Address GridDim = 283 CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "grid_dim"); 284 Address BlockDim = 285 CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim"); 286 Address ShmemSize = 287 CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size"); 288 Address Stream = 289 CGF.CreateTempAlloca(VoidPtrTy, CGM.getPointerAlign(), "stream"); 290 llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction( 291 llvm::FunctionType::get(IntTy, 292 {/*gridDim=*/GridDim.getType(), 293 /*blockDim=*/BlockDim.getType(), 294 /*ShmemSize=*/ShmemSize.getType(), 295 /*Stream=*/Stream.getType()}, 296 /*isVarArg=*/false), 297 addUnderscoredPrefixToName("PopCallConfiguration")); 298 299 CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn, 300 {GridDim.getPointer(), BlockDim.getPointer(), 301 ShmemSize.getPointer(), Stream.getPointer()}); 302 303 // Emit the call to cudaLaunch 304 llvm::Value *Kernel = CGF.Builder.CreatePointerCast(CGF.CurFn, VoidPtrTy); 305 CallArgList LaunchKernelArgs; 306 LaunchKernelArgs.add(RValue::get(Kernel), 307 cudaLaunchKernelFD->getParamDecl(0)->getType()); 308 LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty); 309 LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty); 310 LaunchKernelArgs.add(RValue::get(KernelArgs.getPointer()), 311 cudaLaunchKernelFD->getParamDecl(3)->getType()); 312 LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(ShmemSize)), 313 cudaLaunchKernelFD->getParamDecl(4)->getType()); 314 LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(Stream)), 315 cudaLaunchKernelFD->getParamDecl(5)->getType()); 316 317 QualType QT = cudaLaunchKernelFD->getType(); 318 QualType CQT = QT.getCanonicalType(); 319 llvm::Type *Ty = CGM.getTypes().ConvertType(CQT); 320 llvm::FunctionType *FTy = dyn_cast<llvm::FunctionType>(Ty); 321 322 const CGFunctionInfo &FI = 323 CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD); 324 llvm::FunctionCallee cudaLaunchKernelFn = 325 CGM.CreateRuntimeFunction(FTy, LaunchKernelName); 326 CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(), 327 LaunchKernelArgs); 328 CGF.EmitBranch(EndBlock); 329 330 CGF.EmitBlock(EndBlock); 331 } 332 333 void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF, 334 FunctionArgList &Args) { 335 // Emit a call to cudaSetupArgument for each arg in Args. 336 llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn(); 337 llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end"); 338 CharUnits Offset = CharUnits::Zero(); 339 for (const VarDecl *A : Args) { 340 CharUnits TyWidth, TyAlign; 341 std::tie(TyWidth, TyAlign) = 342 CGM.getContext().getTypeInfoInChars(A->getType()); 343 Offset = Offset.alignTo(TyAlign); 344 llvm::Value *Args[] = { 345 CGF.Builder.CreatePointerCast(CGF.GetAddrOfLocalVar(A).getPointer(), 346 VoidPtrTy), 347 llvm::ConstantInt::get(SizeTy, TyWidth.getQuantity()), 348 llvm::ConstantInt::get(SizeTy, Offset.getQuantity()), 349 }; 350 llvm::CallBase *CB = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args); 351 llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0); 352 llvm::Value *CBZero = CGF.Builder.CreateICmpEQ(CB, Zero); 353 llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next"); 354 CGF.Builder.CreateCondBr(CBZero, NextBlock, EndBlock); 355 CGF.EmitBlock(NextBlock); 356 Offset += TyWidth; 357 } 358 359 // Emit the call to cudaLaunch 360 llvm::FunctionCallee cudaLaunchFn = getLaunchFn(); 361 llvm::Value *Arg = CGF.Builder.CreatePointerCast(CGF.CurFn, CharPtrTy); 362 CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg); 363 CGF.EmitBranch(EndBlock); 364 365 CGF.EmitBlock(EndBlock); 366 } 367 368 /// Creates a function that sets up state on the host side for CUDA objects that 369 /// have a presence on both the host and device sides. Specifically, registers 370 /// the host side of kernel functions and device global variables with the CUDA 371 /// runtime. 372 /// \code 373 /// void __cuda_register_globals(void** GpuBinaryHandle) { 374 /// __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...); 375 /// ... 376 /// __cudaRegisterFunction(GpuBinaryHandle,KernelM,...); 377 /// __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...); 378 /// ... 379 /// __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...); 380 /// } 381 /// \endcode 382 llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() { 383 // No need to register anything 384 if (EmittedKernels.empty() && DeviceVars.empty()) 385 return nullptr; 386 387 llvm::Function *RegisterKernelsFunc = llvm::Function::Create( 388 getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage, 389 addUnderscoredPrefixToName("_register_globals"), &TheModule); 390 llvm::BasicBlock *EntryBB = 391 llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc); 392 CGBuilderTy Builder(CGM, Context); 393 Builder.SetInsertPoint(EntryBB); 394 395 // void __cudaRegisterFunction(void **, const char *, char *, const char *, 396 // int, uint3*, uint3*, dim3*, dim3*, int*) 397 llvm::Type *RegisterFuncParams[] = { 398 VoidPtrPtrTy, CharPtrTy, CharPtrTy, CharPtrTy, IntTy, 399 VoidPtrTy, VoidPtrTy, VoidPtrTy, VoidPtrTy, IntTy->getPointerTo()}; 400 llvm::FunctionCallee RegisterFunc = CGM.CreateRuntimeFunction( 401 llvm::FunctionType::get(IntTy, RegisterFuncParams, false), 402 addUnderscoredPrefixToName("RegisterFunction")); 403 404 // Extract GpuBinaryHandle passed as the first argument passed to 405 // __cuda_register_globals() and generate __cudaRegisterFunction() call for 406 // each emitted kernel. 407 llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin(); 408 for (auto &&I : EmittedKernels) { 409 llvm::Constant *KernelName = 410 makeConstantString(getDeviceSideName(cast<NamedDecl>(I.D))); 411 llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy); 412 llvm::Value *Args[] = { 413 &GpuBinaryHandlePtr, 414 Builder.CreateBitCast(I.Kernel, VoidPtrTy), 415 KernelName, 416 KernelName, 417 llvm::ConstantInt::get(IntTy, -1), 418 NullPtr, 419 NullPtr, 420 NullPtr, 421 NullPtr, 422 llvm::ConstantPointerNull::get(IntTy->getPointerTo())}; 423 Builder.CreateCall(RegisterFunc, Args); 424 } 425 426 // void __cudaRegisterVar(void **, char *, char *, const char *, 427 // int, int, int, int) 428 llvm::Type *RegisterVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy, 429 CharPtrTy, IntTy, IntTy, 430 IntTy, IntTy}; 431 llvm::FunctionCallee RegisterVar = CGM.CreateRuntimeFunction( 432 llvm::FunctionType::get(IntTy, RegisterVarParams, false), 433 addUnderscoredPrefixToName("RegisterVar")); 434 for (auto &&Info : DeviceVars) { 435 llvm::GlobalVariable *Var = Info.Var; 436 unsigned Flags = Info.Flag; 437 llvm::Constant *VarName = makeConstantString(getDeviceSideName(Info.D)); 438 uint64_t VarSize = 439 CGM.getDataLayout().getTypeAllocSize(Var->getValueType()); 440 llvm::Value *Args[] = { 441 &GpuBinaryHandlePtr, 442 Builder.CreateBitCast(Var, VoidPtrTy), 443 VarName, 444 VarName, 445 llvm::ConstantInt::get(IntTy, (Flags & ExternDeviceVar) ? 1 : 0), 446 llvm::ConstantInt::get(IntTy, VarSize), 447 llvm::ConstantInt::get(IntTy, (Flags & ConstantDeviceVar) ? 1 : 0), 448 llvm::ConstantInt::get(IntTy, 0)}; 449 Builder.CreateCall(RegisterVar, Args); 450 } 451 452 Builder.CreateRetVoid(); 453 return RegisterKernelsFunc; 454 } 455 456 /// Creates a global constructor function for the module: 457 /// 458 /// For CUDA: 459 /// \code 460 /// void __cuda_module_ctor(void*) { 461 /// Handle = __cudaRegisterFatBinary(GpuBinaryBlob); 462 /// __cuda_register_globals(Handle); 463 /// } 464 /// \endcode 465 /// 466 /// For HIP: 467 /// \code 468 /// void __hip_module_ctor(void*) { 469 /// if (__hip_gpubin_handle == 0) { 470 /// __hip_gpubin_handle = __hipRegisterFatBinary(GpuBinaryBlob); 471 /// __hip_register_globals(__hip_gpubin_handle); 472 /// } 473 /// } 474 /// \endcode 475 llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { 476 bool IsHIP = CGM.getLangOpts().HIP; 477 bool IsCUDA = CGM.getLangOpts().CUDA; 478 // No need to generate ctors/dtors if there is no GPU binary. 479 StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName; 480 if (CudaGpuBinaryFileName.empty() && !IsHIP) 481 return nullptr; 482 if ((IsHIP || (IsCUDA && !RelocatableDeviceCode)) && EmittedKernels.empty() && 483 DeviceVars.empty()) 484 return nullptr; 485 486 // void __{cuda|hip}_register_globals(void* handle); 487 llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn(); 488 // We always need a function to pass in as callback. Create a dummy 489 // implementation if we don't need to register anything. 490 if (RelocatableDeviceCode && !RegisterGlobalsFunc) 491 RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy()); 492 493 // void ** __{cuda|hip}RegisterFatBinary(void *); 494 llvm::FunctionCallee RegisterFatbinFunc = CGM.CreateRuntimeFunction( 495 llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false), 496 addUnderscoredPrefixToName("RegisterFatBinary")); 497 // struct { int magic, int version, void * gpu_binary, void * dont_care }; 498 llvm::StructType *FatbinWrapperTy = 499 llvm::StructType::get(IntTy, IntTy, VoidPtrTy, VoidPtrTy); 500 501 // Register GPU binary with the CUDA runtime, store returned handle in a 502 // global variable and save a reference in GpuBinaryHandle to be cleaned up 503 // in destructor on exit. Then associate all known kernels with the GPU binary 504 // handle so CUDA runtime can figure out what to call on the GPU side. 505 std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary = nullptr; 506 if (!CudaGpuBinaryFileName.empty()) { 507 llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> CudaGpuBinaryOrErr = 508 llvm::MemoryBuffer::getFileOrSTDIN(CudaGpuBinaryFileName); 509 if (std::error_code EC = CudaGpuBinaryOrErr.getError()) { 510 CGM.getDiags().Report(diag::err_cannot_open_file) 511 << CudaGpuBinaryFileName << EC.message(); 512 return nullptr; 513 } 514 CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get()); 515 } 516 517 llvm::Function *ModuleCtorFunc = llvm::Function::Create( 518 llvm::FunctionType::get(VoidTy, VoidPtrTy, false), 519 llvm::GlobalValue::InternalLinkage, 520 addUnderscoredPrefixToName("_module_ctor"), &TheModule); 521 llvm::BasicBlock *CtorEntryBB = 522 llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc); 523 CGBuilderTy CtorBuilder(CGM, Context); 524 525 CtorBuilder.SetInsertPoint(CtorEntryBB); 526 527 const char *FatbinConstantName; 528 const char *FatbinSectionName; 529 const char *ModuleIDSectionName; 530 StringRef ModuleIDPrefix; 531 llvm::Constant *FatBinStr; 532 unsigned FatMagic; 533 if (IsHIP) { 534 FatbinConstantName = ".hip_fatbin"; 535 FatbinSectionName = ".hipFatBinSegment"; 536 537 ModuleIDSectionName = "__hip_module_id"; 538 ModuleIDPrefix = "__hip_"; 539 540 if (CudaGpuBinary) { 541 // If fatbin is available from early finalization, create a string 542 // literal containing the fat binary loaded from the given file. 543 FatBinStr = makeConstantString(std::string(CudaGpuBinary->getBuffer()), 544 "", FatbinConstantName, 8); 545 } else { 546 // If fatbin is not available, create an external symbol 547 // __hip_fatbin in section .hip_fatbin. The external symbol is supposed 548 // to contain the fat binary but will be populated somewhere else, 549 // e.g. by lld through link script. 550 FatBinStr = new llvm::GlobalVariable( 551 CGM.getModule(), CGM.Int8Ty, 552 /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr, 553 "__hip_fatbin", nullptr, 554 llvm::GlobalVariable::NotThreadLocal); 555 cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName); 556 } 557 558 FatMagic = HIPFatMagic; 559 } else { 560 if (RelocatableDeviceCode) 561 FatbinConstantName = CGM.getTriple().isMacOSX() 562 ? "__NV_CUDA,__nv_relfatbin" 563 : "__nv_relfatbin"; 564 else 565 FatbinConstantName = 566 CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin"; 567 // NVIDIA's cuobjdump looks for fatbins in this section. 568 FatbinSectionName = 569 CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment"; 570 571 ModuleIDSectionName = CGM.getTriple().isMacOSX() 572 ? "__NV_CUDA,__nv_module_id" 573 : "__nv_module_id"; 574 ModuleIDPrefix = "__nv_"; 575 576 // For CUDA, create a string literal containing the fat binary loaded from 577 // the given file. 578 FatBinStr = makeConstantString(std::string(CudaGpuBinary->getBuffer()), "", 579 FatbinConstantName, 8); 580 FatMagic = CudaFatMagic; 581 } 582 583 // Create initialized wrapper structure that points to the loaded GPU binary 584 ConstantInitBuilder Builder(CGM); 585 auto Values = Builder.beginStruct(FatbinWrapperTy); 586 // Fatbin wrapper magic. 587 Values.addInt(IntTy, FatMagic); 588 // Fatbin version. 589 Values.addInt(IntTy, 1); 590 // Data. 591 Values.add(FatBinStr); 592 // Unused in fatbin v1. 593 Values.add(llvm::ConstantPointerNull::get(VoidPtrTy)); 594 llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal( 595 addUnderscoredPrefixToName("_fatbin_wrapper"), CGM.getPointerAlign(), 596 /*constant*/ true); 597 FatbinWrapper->setSection(FatbinSectionName); 598 599 // There is only one HIP fat binary per linked module, however there are 600 // multiple constructor functions. Make sure the fat binary is registered 601 // only once. The constructor functions are executed by the dynamic loader 602 // before the program gains control. The dynamic loader cannot execute the 603 // constructor functions concurrently since doing that would not guarantee 604 // thread safety of the loaded program. Therefore we can assume sequential 605 // execution of constructor functions here. 606 if (IsHIP) { 607 auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage : 608 llvm::GlobalValue::LinkOnceAnyLinkage; 609 llvm::BasicBlock *IfBlock = 610 llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc); 611 llvm::BasicBlock *ExitBlock = 612 llvm::BasicBlock::Create(Context, "exit", ModuleCtorFunc); 613 // The name, size, and initialization pattern of this variable is part 614 // of HIP ABI. 615 GpuBinaryHandle = new llvm::GlobalVariable( 616 TheModule, VoidPtrPtrTy, /*isConstant=*/false, 617 Linkage, 618 /*Initializer=*/llvm::ConstantPointerNull::get(VoidPtrPtrTy), 619 "__hip_gpubin_handle"); 620 GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign()); 621 // Prevent the weak symbol in different shared libraries being merged. 622 if (Linkage != llvm::GlobalValue::InternalLinkage) 623 GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility); 624 Address GpuBinaryAddr( 625 GpuBinaryHandle, 626 CharUnits::fromQuantity(GpuBinaryHandle->getAlignment())); 627 { 628 auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr); 629 llvm::Constant *Zero = 630 llvm::Constant::getNullValue(HandleValue->getType()); 631 llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue, Zero); 632 CtorBuilder.CreateCondBr(EQZero, IfBlock, ExitBlock); 633 } 634 { 635 CtorBuilder.SetInsertPoint(IfBlock); 636 // GpuBinaryHandle = __hipRegisterFatBinary(&FatbinWrapper); 637 llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall( 638 RegisterFatbinFunc, 639 CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy)); 640 CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryAddr); 641 CtorBuilder.CreateBr(ExitBlock); 642 } 643 { 644 CtorBuilder.SetInsertPoint(ExitBlock); 645 // Call __hip_register_globals(GpuBinaryHandle); 646 if (RegisterGlobalsFunc) { 647 auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr); 648 CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue); 649 } 650 } 651 } else if (!RelocatableDeviceCode) { 652 // Register binary with CUDA runtime. This is substantially different in 653 // default mode vs. separate compilation! 654 // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper); 655 llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall( 656 RegisterFatbinFunc, 657 CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy)); 658 GpuBinaryHandle = new llvm::GlobalVariable( 659 TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage, 660 llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle"); 661 GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign()); 662 CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle, 663 CGM.getPointerAlign()); 664 665 // Call __cuda_register_globals(GpuBinaryHandle); 666 if (RegisterGlobalsFunc) 667 CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall); 668 669 // Call __cudaRegisterFatBinaryEnd(Handle) if this CUDA version needs it. 670 if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(), 671 CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) { 672 // void __cudaRegisterFatBinaryEnd(void **); 673 llvm::FunctionCallee RegisterFatbinEndFunc = CGM.CreateRuntimeFunction( 674 llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false), 675 "__cudaRegisterFatBinaryEnd"); 676 CtorBuilder.CreateCall(RegisterFatbinEndFunc, RegisterFatbinCall); 677 } 678 } else { 679 // Generate a unique module ID. 680 SmallString<64> ModuleID; 681 llvm::raw_svector_ostream OS(ModuleID); 682 OS << ModuleIDPrefix << llvm::format("%" PRIx64, FatbinWrapper->getGUID()); 683 llvm::Constant *ModuleIDConstant = makeConstantString( 684 std::string(ModuleID.str()), "", ModuleIDSectionName, 32); 685 686 // Create an alias for the FatbinWrapper that nvcc will look for. 687 llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage, 688 Twine("__fatbinwrap") + ModuleID, FatbinWrapper); 689 690 // void __cudaRegisterLinkedBinary%ModuleID%(void (*)(void *), void *, 691 // void *, void (*)(void **)) 692 SmallString<128> RegisterLinkedBinaryName("__cudaRegisterLinkedBinary"); 693 RegisterLinkedBinaryName += ModuleID; 694 llvm::FunctionCallee RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction( 695 getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName); 696 697 assert(RegisterGlobalsFunc && "Expecting at least dummy function!"); 698 llvm::Value *Args[] = {RegisterGlobalsFunc, 699 CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy), 700 ModuleIDConstant, 701 makeDummyFunction(getCallbackFnTy())}; 702 CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args); 703 } 704 705 // Create destructor and register it with atexit() the way NVCC does it. Doing 706 // it during regular destructor phase worked in CUDA before 9.2 but results in 707 // double-free in 9.2. 708 if (llvm::Function *CleanupFn = makeModuleDtorFunction()) { 709 // extern "C" int atexit(void (*f)(void)); 710 llvm::FunctionType *AtExitTy = 711 llvm::FunctionType::get(IntTy, CleanupFn->getType(), false); 712 llvm::FunctionCallee AtExitFunc = 713 CGM.CreateRuntimeFunction(AtExitTy, "atexit", llvm::AttributeList(), 714 /*Local=*/true); 715 CtorBuilder.CreateCall(AtExitFunc, CleanupFn); 716 } 717 718 CtorBuilder.CreateRetVoid(); 719 return ModuleCtorFunc; 720 } 721 722 /// Creates a global destructor function that unregisters the GPU code blob 723 /// registered by constructor. 724 /// 725 /// For CUDA: 726 /// \code 727 /// void __cuda_module_dtor(void*) { 728 /// __cudaUnregisterFatBinary(Handle); 729 /// } 730 /// \endcode 731 /// 732 /// For HIP: 733 /// \code 734 /// void __hip_module_dtor(void*) { 735 /// if (__hip_gpubin_handle) { 736 /// __hipUnregisterFatBinary(__hip_gpubin_handle); 737 /// __hip_gpubin_handle = 0; 738 /// } 739 /// } 740 /// \endcode 741 llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() { 742 // No need for destructor if we don't have a handle to unregister. 743 if (!GpuBinaryHandle) 744 return nullptr; 745 746 // void __cudaUnregisterFatBinary(void ** handle); 747 llvm::FunctionCallee UnregisterFatbinFunc = CGM.CreateRuntimeFunction( 748 llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false), 749 addUnderscoredPrefixToName("UnregisterFatBinary")); 750 751 llvm::Function *ModuleDtorFunc = llvm::Function::Create( 752 llvm::FunctionType::get(VoidTy, VoidPtrTy, false), 753 llvm::GlobalValue::InternalLinkage, 754 addUnderscoredPrefixToName("_module_dtor"), &TheModule); 755 756 llvm::BasicBlock *DtorEntryBB = 757 llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc); 758 CGBuilderTy DtorBuilder(CGM, Context); 759 DtorBuilder.SetInsertPoint(DtorEntryBB); 760 761 Address GpuBinaryAddr(GpuBinaryHandle, CharUnits::fromQuantity( 762 GpuBinaryHandle->getAlignment())); 763 auto HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr); 764 // There is only one HIP fat binary per linked module, however there are 765 // multiple destructor functions. Make sure the fat binary is unregistered 766 // only once. 767 if (CGM.getLangOpts().HIP) { 768 llvm::BasicBlock *IfBlock = 769 llvm::BasicBlock::Create(Context, "if", ModuleDtorFunc); 770 llvm::BasicBlock *ExitBlock = 771 llvm::BasicBlock::Create(Context, "exit", ModuleDtorFunc); 772 llvm::Constant *Zero = llvm::Constant::getNullValue(HandleValue->getType()); 773 llvm::Value *NEZero = DtorBuilder.CreateICmpNE(HandleValue, Zero); 774 DtorBuilder.CreateCondBr(NEZero, IfBlock, ExitBlock); 775 776 DtorBuilder.SetInsertPoint(IfBlock); 777 DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue); 778 DtorBuilder.CreateStore(Zero, GpuBinaryAddr); 779 DtorBuilder.CreateBr(ExitBlock); 780 781 DtorBuilder.SetInsertPoint(ExitBlock); 782 } else { 783 DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue); 784 } 785 DtorBuilder.CreateRetVoid(); 786 return ModuleDtorFunc; 787 } 788 789 CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) { 790 return new CGNVCUDARuntime(CGM); 791 } 792