1 //===----- CGCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This provides a class for CUDA code generation targeting the NVIDIA CUDA 10 // runtime library. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "CGCUDARuntime.h" 15 #include "CodeGenFunction.h" 16 #include "CodeGenModule.h" 17 #include "clang/AST/Decl.h" 18 #include "clang/Basic/Cuda.h" 19 #include "clang/CodeGen/CodeGenABITypes.h" 20 #include "clang/CodeGen/ConstantInitBuilder.h" 21 #include "llvm/IR/BasicBlock.h" 22 #include "llvm/IR/Constants.h" 23 #include "llvm/IR/DerivedTypes.h" 24 #include "llvm/Support/Format.h" 25 26 using namespace clang; 27 using namespace CodeGen; 28 29 namespace { 30 constexpr unsigned CudaFatMagic = 0x466243b1; 31 constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF" 32 33 class CGNVCUDARuntime : public CGCUDARuntime { 34 35 private: 36 llvm::IntegerType *IntTy, *SizeTy; 37 llvm::Type *VoidTy; 38 llvm::PointerType *CharPtrTy, *VoidPtrTy, *VoidPtrPtrTy; 39 40 /// Convenience reference to LLVM Context 41 llvm::LLVMContext &Context; 42 /// Convenience reference to the current module 43 llvm::Module &TheModule; 44 /// Keeps track of kernel launch stubs emitted in this module 45 struct KernelInfo { 46 llvm::Function *Kernel; 47 const Decl *D; 48 }; 49 llvm::SmallVector<KernelInfo, 16> EmittedKernels; 50 struct VarInfo { 51 llvm::GlobalVariable *Var; 52 const VarDecl *D; 53 unsigned Flag; 54 }; 55 llvm::SmallVector<VarInfo, 16> DeviceVars; 56 /// Keeps track of variable containing handle of GPU binary. Populated by 57 /// ModuleCtorFunction() and used to create corresponding cleanup calls in 58 /// ModuleDtorFunction() 59 llvm::GlobalVariable *GpuBinaryHandle = nullptr; 60 /// Whether we generate relocatable device code. 61 bool RelocatableDeviceCode; 62 /// Mangle context for device. 63 std::unique_ptr<MangleContext> DeviceMC; 64 65 llvm::FunctionCallee getSetupArgumentFn() const; 66 llvm::FunctionCallee getLaunchFn() const; 67 68 llvm::FunctionType *getRegisterGlobalsFnTy() const; 69 llvm::FunctionType *getCallbackFnTy() const; 70 llvm::FunctionType *getRegisterLinkedBinaryFnTy() const; 71 std::string addPrefixToName(StringRef FuncName) const; 72 std::string addUnderscoredPrefixToName(StringRef FuncName) const; 73 74 /// Creates a function to register all kernel stubs generated in this module. 75 llvm::Function *makeRegisterGlobalsFn(); 76 77 /// Helper function that generates a constant string and returns a pointer to 78 /// the start of the string. The result of this function can be used anywhere 79 /// where the C code specifies const char*. 80 llvm::Constant *makeConstantString(const std::string &Str, 81 const std::string &Name = "", 82 const std::string &SectionName = "", 83 unsigned Alignment = 0) { 84 llvm::Constant *Zeros[] = {llvm::ConstantInt::get(SizeTy, 0), 85 llvm::ConstantInt::get(SizeTy, 0)}; 86 auto ConstStr = CGM.GetAddrOfConstantCString(Str, Name.c_str()); 87 llvm::GlobalVariable *GV = 88 cast<llvm::GlobalVariable>(ConstStr.getPointer()); 89 if (!SectionName.empty()) { 90 GV->setSection(SectionName); 91 // Mark the address as used which make sure that this section isn't 92 // merged and we will really have it in the object file. 93 GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None); 94 } 95 if (Alignment) 96 GV->setAlignment(Alignment); 97 98 return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(), 99 ConstStr.getPointer(), Zeros); 100 } 101 102 /// Helper function that generates an empty dummy function returning void. 103 llvm::Function *makeDummyFunction(llvm::FunctionType *FnTy) { 104 assert(FnTy->getReturnType()->isVoidTy() && 105 "Can only generate dummy functions returning void!"); 106 llvm::Function *DummyFunc = llvm::Function::Create( 107 FnTy, llvm::GlobalValue::InternalLinkage, "dummy", &TheModule); 108 109 llvm::BasicBlock *DummyBlock = 110 llvm::BasicBlock::Create(Context, "", DummyFunc); 111 CGBuilderTy FuncBuilder(CGM, Context); 112 FuncBuilder.SetInsertPoint(DummyBlock); 113 FuncBuilder.CreateRetVoid(); 114 115 return DummyFunc; 116 } 117 118 void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args); 119 void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args); 120 std::string getDeviceSideName(const Decl *ND); 121 122 public: 123 CGNVCUDARuntime(CodeGenModule &CGM); 124 125 void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override; 126 void registerDeviceVar(const VarDecl *VD, llvm::GlobalVariable &Var, 127 unsigned Flags) override { 128 DeviceVars.push_back({&Var, VD, Flags}); 129 } 130 131 /// Creates module constructor function 132 llvm::Function *makeModuleCtorFunction() override; 133 /// Creates module destructor function 134 llvm::Function *makeModuleDtorFunction() override; 135 }; 136 137 } 138 139 std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const { 140 if (CGM.getLangOpts().HIP) 141 return ((Twine("hip") + Twine(FuncName)).str()); 142 return ((Twine("cuda") + Twine(FuncName)).str()); 143 } 144 std::string 145 CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const { 146 if (CGM.getLangOpts().HIP) 147 return ((Twine("__hip") + Twine(FuncName)).str()); 148 return ((Twine("__cuda") + Twine(FuncName)).str()); 149 } 150 151 CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM) 152 : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()), 153 TheModule(CGM.getModule()), 154 RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode), 155 DeviceMC(CGM.getContext().createMangleContext( 156 CGM.getContext().getAuxTargetInfo())) { 157 CodeGen::CodeGenTypes &Types = CGM.getTypes(); 158 ASTContext &Ctx = CGM.getContext(); 159 160 IntTy = CGM.IntTy; 161 SizeTy = CGM.SizeTy; 162 VoidTy = CGM.VoidTy; 163 164 CharPtrTy = llvm::PointerType::getUnqual(Types.ConvertType(Ctx.CharTy)); 165 VoidPtrTy = cast<llvm::PointerType>(Types.ConvertType(Ctx.VoidPtrTy)); 166 VoidPtrPtrTy = VoidPtrTy->getPointerTo(); 167 } 168 169 llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const { 170 // cudaError_t cudaSetupArgument(void *, size_t, size_t) 171 llvm::Type *Params[] = {VoidPtrTy, SizeTy, SizeTy}; 172 return CGM.CreateRuntimeFunction( 173 llvm::FunctionType::get(IntTy, Params, false), 174 addPrefixToName("SetupArgument")); 175 } 176 177 llvm::FunctionCallee CGNVCUDARuntime::getLaunchFn() const { 178 if (CGM.getLangOpts().HIP) { 179 // hipError_t hipLaunchByPtr(char *); 180 return CGM.CreateRuntimeFunction( 181 llvm::FunctionType::get(IntTy, CharPtrTy, false), "hipLaunchByPtr"); 182 } else { 183 // cudaError_t cudaLaunch(char *); 184 return CGM.CreateRuntimeFunction( 185 llvm::FunctionType::get(IntTy, CharPtrTy, false), "cudaLaunch"); 186 } 187 } 188 189 llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy() const { 190 return llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false); 191 } 192 193 llvm::FunctionType *CGNVCUDARuntime::getCallbackFnTy() const { 194 return llvm::FunctionType::get(VoidTy, VoidPtrTy, false); 195 } 196 197 llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy() const { 198 auto CallbackFnTy = getCallbackFnTy(); 199 auto RegisterGlobalsFnTy = getRegisterGlobalsFnTy(); 200 llvm::Type *Params[] = {RegisterGlobalsFnTy->getPointerTo(), VoidPtrTy, 201 VoidPtrTy, CallbackFnTy->getPointerTo()}; 202 return llvm::FunctionType::get(VoidTy, Params, false); 203 } 204 205 std::string CGNVCUDARuntime::getDeviceSideName(const Decl *D) { 206 auto *ND = cast<const NamedDecl>(D); 207 std::string DeviceSideName; 208 if (DeviceMC->shouldMangleDeclName(ND)) { 209 SmallString<256> Buffer; 210 llvm::raw_svector_ostream Out(Buffer); 211 DeviceMC->mangleName(ND, Out); 212 DeviceSideName = Out.str(); 213 } else 214 DeviceSideName = ND->getIdentifier()->getName(); 215 return DeviceSideName; 216 } 217 218 void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF, 219 FunctionArgList &Args) { 220 assert(getDeviceSideName(CGF.CurFuncDecl) == CGF.CurFn->getName() || 221 getDeviceSideName(CGF.CurFuncDecl) + ".stub" == CGF.CurFn->getName() || 222 CGF.CGM.getContext().getTargetInfo().getCXXABI() != 223 CGF.CGM.getContext().getAuxTargetInfo()->getCXXABI()); 224 225 EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl}); 226 if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(), 227 CudaFeature::CUDA_USES_NEW_LAUNCH)) 228 emitDeviceStubBodyNew(CGF, Args); 229 else 230 emitDeviceStubBodyLegacy(CGF, Args); 231 } 232 233 // CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local 234 // array and kernels are launched using cudaLaunchKernel(). 235 void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF, 236 FunctionArgList &Args) { 237 // Build the shadow stack entry at the very start of the function. 238 239 // Calculate amount of space we will need for all arguments. If we have no 240 // args, allocate a single pointer so we still have a valid pointer to the 241 // argument array that we can pass to runtime, even if it will be unused. 242 Address KernelArgs = CGF.CreateTempAlloca( 243 VoidPtrTy, CharUnits::fromQuantity(16), "kernel_args", 244 llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size()))); 245 // Store pointers to the arguments in a locally allocated launch_args. 246 for (unsigned i = 0; i < Args.size(); ++i) { 247 llvm::Value* VarPtr = CGF.GetAddrOfLocalVar(Args[i]).getPointer(); 248 llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(VarPtr, VoidPtrTy); 249 CGF.Builder.CreateDefaultAlignedStore( 250 VoidVarPtr, CGF.Builder.CreateConstGEP1_32(KernelArgs.getPointer(), i)); 251 } 252 253 llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end"); 254 255 // Lookup cudaLaunchKernel function. 256 // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, 257 // void **args, size_t sharedMem, 258 // cudaStream_t stream); 259 TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl(); 260 DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl); 261 IdentifierInfo &cudaLaunchKernelII = 262 CGM.getContext().Idents.get("cudaLaunchKernel"); 263 FunctionDecl *cudaLaunchKernelFD = nullptr; 264 for (const auto &Result : DC->lookup(&cudaLaunchKernelII)) { 265 if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Result)) 266 cudaLaunchKernelFD = FD; 267 } 268 269 if (cudaLaunchKernelFD == nullptr) { 270 CGM.Error(CGF.CurFuncDecl->getLocation(), 271 "Can't find declaration for cudaLaunchKernel()"); 272 return; 273 } 274 // Create temporary dim3 grid_dim, block_dim. 275 ParmVarDecl *GridDimParam = cudaLaunchKernelFD->getParamDecl(1); 276 QualType Dim3Ty = GridDimParam->getType(); 277 Address GridDim = 278 CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "grid_dim"); 279 Address BlockDim = 280 CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim"); 281 Address ShmemSize = 282 CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size"); 283 Address Stream = 284 CGF.CreateTempAlloca(VoidPtrTy, CGM.getPointerAlign(), "stream"); 285 llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction( 286 llvm::FunctionType::get(IntTy, 287 {/*gridDim=*/GridDim.getType(), 288 /*blockDim=*/BlockDim.getType(), 289 /*ShmemSize=*/ShmemSize.getType(), 290 /*Stream=*/Stream.getType()}, 291 /*isVarArg=*/false), 292 "__cudaPopCallConfiguration"); 293 294 CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn, 295 {GridDim.getPointer(), BlockDim.getPointer(), 296 ShmemSize.getPointer(), Stream.getPointer()}); 297 298 // Emit the call to cudaLaunch 299 llvm::Value *Kernel = CGF.Builder.CreatePointerCast(CGF.CurFn, VoidPtrTy); 300 CallArgList LaunchKernelArgs; 301 LaunchKernelArgs.add(RValue::get(Kernel), 302 cudaLaunchKernelFD->getParamDecl(0)->getType()); 303 LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty); 304 LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty); 305 LaunchKernelArgs.add(RValue::get(KernelArgs.getPointer()), 306 cudaLaunchKernelFD->getParamDecl(3)->getType()); 307 LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(ShmemSize)), 308 cudaLaunchKernelFD->getParamDecl(4)->getType()); 309 LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(Stream)), 310 cudaLaunchKernelFD->getParamDecl(5)->getType()); 311 312 QualType QT = cudaLaunchKernelFD->getType(); 313 QualType CQT = QT.getCanonicalType(); 314 llvm::Type *Ty = CGM.getTypes().ConvertType(CQT); 315 llvm::FunctionType *FTy = dyn_cast<llvm::FunctionType>(Ty); 316 317 const CGFunctionInfo &FI = 318 CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD); 319 llvm::FunctionCallee cudaLaunchKernelFn = 320 CGM.CreateRuntimeFunction(FTy, "cudaLaunchKernel"); 321 CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(), 322 LaunchKernelArgs); 323 CGF.EmitBranch(EndBlock); 324 325 CGF.EmitBlock(EndBlock); 326 } 327 328 void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF, 329 FunctionArgList &Args) { 330 // Emit a call to cudaSetupArgument for each arg in Args. 331 llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn(); 332 llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end"); 333 CharUnits Offset = CharUnits::Zero(); 334 for (const VarDecl *A : Args) { 335 CharUnits TyWidth, TyAlign; 336 std::tie(TyWidth, TyAlign) = 337 CGM.getContext().getTypeInfoInChars(A->getType()); 338 Offset = Offset.alignTo(TyAlign); 339 llvm::Value *Args[] = { 340 CGF.Builder.CreatePointerCast(CGF.GetAddrOfLocalVar(A).getPointer(), 341 VoidPtrTy), 342 llvm::ConstantInt::get(SizeTy, TyWidth.getQuantity()), 343 llvm::ConstantInt::get(SizeTy, Offset.getQuantity()), 344 }; 345 llvm::CallBase *CB = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args); 346 llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0); 347 llvm::Value *CBZero = CGF.Builder.CreateICmpEQ(CB, Zero); 348 llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next"); 349 CGF.Builder.CreateCondBr(CBZero, NextBlock, EndBlock); 350 CGF.EmitBlock(NextBlock); 351 Offset += TyWidth; 352 } 353 354 // Emit the call to cudaLaunch 355 llvm::FunctionCallee cudaLaunchFn = getLaunchFn(); 356 llvm::Value *Arg = CGF.Builder.CreatePointerCast(CGF.CurFn, CharPtrTy); 357 CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg); 358 CGF.EmitBranch(EndBlock); 359 360 CGF.EmitBlock(EndBlock); 361 } 362 363 /// Creates a function that sets up state on the host side for CUDA objects that 364 /// have a presence on both the host and device sides. Specifically, registers 365 /// the host side of kernel functions and device global variables with the CUDA 366 /// runtime. 367 /// \code 368 /// void __cuda_register_globals(void** GpuBinaryHandle) { 369 /// __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...); 370 /// ... 371 /// __cudaRegisterFunction(GpuBinaryHandle,KernelM,...); 372 /// __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...); 373 /// ... 374 /// __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...); 375 /// } 376 /// \endcode 377 llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() { 378 // No need to register anything 379 if (EmittedKernels.empty() && DeviceVars.empty()) 380 return nullptr; 381 382 llvm::Function *RegisterKernelsFunc = llvm::Function::Create( 383 getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage, 384 addUnderscoredPrefixToName("_register_globals"), &TheModule); 385 llvm::BasicBlock *EntryBB = 386 llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc); 387 CGBuilderTy Builder(CGM, Context); 388 Builder.SetInsertPoint(EntryBB); 389 390 // void __cudaRegisterFunction(void **, const char *, char *, const char *, 391 // int, uint3*, uint3*, dim3*, dim3*, int*) 392 llvm::Type *RegisterFuncParams[] = { 393 VoidPtrPtrTy, CharPtrTy, CharPtrTy, CharPtrTy, IntTy, 394 VoidPtrTy, VoidPtrTy, VoidPtrTy, VoidPtrTy, IntTy->getPointerTo()}; 395 llvm::FunctionCallee RegisterFunc = CGM.CreateRuntimeFunction( 396 llvm::FunctionType::get(IntTy, RegisterFuncParams, false), 397 addUnderscoredPrefixToName("RegisterFunction")); 398 399 // Extract GpuBinaryHandle passed as the first argument passed to 400 // __cuda_register_globals() and generate __cudaRegisterFunction() call for 401 // each emitted kernel. 402 llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin(); 403 for (auto &&I : EmittedKernels) { 404 llvm::Constant *KernelName = makeConstantString(getDeviceSideName(I.D)); 405 llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy); 406 llvm::Value *Args[] = { 407 &GpuBinaryHandlePtr, 408 Builder.CreateBitCast(I.Kernel, VoidPtrTy), 409 KernelName, 410 KernelName, 411 llvm::ConstantInt::get(IntTy, -1), 412 NullPtr, 413 NullPtr, 414 NullPtr, 415 NullPtr, 416 llvm::ConstantPointerNull::get(IntTy->getPointerTo())}; 417 Builder.CreateCall(RegisterFunc, Args); 418 } 419 420 // void __cudaRegisterVar(void **, char *, char *, const char *, 421 // int, int, int, int) 422 llvm::Type *RegisterVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy, 423 CharPtrTy, IntTy, IntTy, 424 IntTy, IntTy}; 425 llvm::FunctionCallee RegisterVar = CGM.CreateRuntimeFunction( 426 llvm::FunctionType::get(IntTy, RegisterVarParams, false), 427 addUnderscoredPrefixToName("RegisterVar")); 428 for (auto &&Info : DeviceVars) { 429 llvm::GlobalVariable *Var = Info.Var; 430 unsigned Flags = Info.Flag; 431 llvm::Constant *VarName = makeConstantString(getDeviceSideName(Info.D)); 432 uint64_t VarSize = 433 CGM.getDataLayout().getTypeAllocSize(Var->getValueType()); 434 llvm::Value *Args[] = { 435 &GpuBinaryHandlePtr, 436 Builder.CreateBitCast(Var, VoidPtrTy), 437 VarName, 438 VarName, 439 llvm::ConstantInt::get(IntTy, (Flags & ExternDeviceVar) ? 1 : 0), 440 llvm::ConstantInt::get(IntTy, VarSize), 441 llvm::ConstantInt::get(IntTy, (Flags & ConstantDeviceVar) ? 1 : 0), 442 llvm::ConstantInt::get(IntTy, 0)}; 443 Builder.CreateCall(RegisterVar, Args); 444 } 445 446 Builder.CreateRetVoid(); 447 return RegisterKernelsFunc; 448 } 449 450 /// Creates a global constructor function for the module: 451 /// 452 /// For CUDA: 453 /// \code 454 /// void __cuda_module_ctor(void*) { 455 /// Handle = __cudaRegisterFatBinary(GpuBinaryBlob); 456 /// __cuda_register_globals(Handle); 457 /// } 458 /// \endcode 459 /// 460 /// For HIP: 461 /// \code 462 /// void __hip_module_ctor(void*) { 463 /// if (__hip_gpubin_handle == 0) { 464 /// __hip_gpubin_handle = __hipRegisterFatBinary(GpuBinaryBlob); 465 /// __hip_register_globals(__hip_gpubin_handle); 466 /// } 467 /// } 468 /// \endcode 469 llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { 470 bool IsHIP = CGM.getLangOpts().HIP; 471 // No need to generate ctors/dtors if there is no GPU binary. 472 StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName; 473 if (CudaGpuBinaryFileName.empty() && !IsHIP) 474 return nullptr; 475 476 // void __{cuda|hip}_register_globals(void* handle); 477 llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn(); 478 // We always need a function to pass in as callback. Create a dummy 479 // implementation if we don't need to register anything. 480 if (RelocatableDeviceCode && !RegisterGlobalsFunc) 481 RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy()); 482 483 // void ** __{cuda|hip}RegisterFatBinary(void *); 484 llvm::FunctionCallee RegisterFatbinFunc = CGM.CreateRuntimeFunction( 485 llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false), 486 addUnderscoredPrefixToName("RegisterFatBinary")); 487 // struct { int magic, int version, void * gpu_binary, void * dont_care }; 488 llvm::StructType *FatbinWrapperTy = 489 llvm::StructType::get(IntTy, IntTy, VoidPtrTy, VoidPtrTy); 490 491 // Register GPU binary with the CUDA runtime, store returned handle in a 492 // global variable and save a reference in GpuBinaryHandle to be cleaned up 493 // in destructor on exit. Then associate all known kernels with the GPU binary 494 // handle so CUDA runtime can figure out what to call on the GPU side. 495 std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary = nullptr; 496 if (!CudaGpuBinaryFileName.empty()) { 497 llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> CudaGpuBinaryOrErr = 498 llvm::MemoryBuffer::getFileOrSTDIN(CudaGpuBinaryFileName); 499 if (std::error_code EC = CudaGpuBinaryOrErr.getError()) { 500 CGM.getDiags().Report(diag::err_cannot_open_file) 501 << CudaGpuBinaryFileName << EC.message(); 502 return nullptr; 503 } 504 CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get()); 505 } 506 507 llvm::Function *ModuleCtorFunc = llvm::Function::Create( 508 llvm::FunctionType::get(VoidTy, VoidPtrTy, false), 509 llvm::GlobalValue::InternalLinkage, 510 addUnderscoredPrefixToName("_module_ctor"), &TheModule); 511 llvm::BasicBlock *CtorEntryBB = 512 llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc); 513 CGBuilderTy CtorBuilder(CGM, Context); 514 515 CtorBuilder.SetInsertPoint(CtorEntryBB); 516 517 const char *FatbinConstantName; 518 const char *FatbinSectionName; 519 const char *ModuleIDSectionName; 520 StringRef ModuleIDPrefix; 521 llvm::Constant *FatBinStr; 522 unsigned FatMagic; 523 if (IsHIP) { 524 FatbinConstantName = ".hip_fatbin"; 525 FatbinSectionName = ".hipFatBinSegment"; 526 527 ModuleIDSectionName = "__hip_module_id"; 528 ModuleIDPrefix = "__hip_"; 529 530 if (CudaGpuBinary) { 531 // If fatbin is available from early finalization, create a string 532 // literal containing the fat binary loaded from the given file. 533 FatBinStr = makeConstantString(CudaGpuBinary->getBuffer(), "", 534 FatbinConstantName, 8); 535 } else { 536 // If fatbin is not available, create an external symbol 537 // __hip_fatbin in section .hip_fatbin. The external symbol is supposed 538 // to contain the fat binary but will be populated somewhere else, 539 // e.g. by lld through link script. 540 FatBinStr = new llvm::GlobalVariable( 541 CGM.getModule(), CGM.Int8Ty, 542 /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr, 543 "__hip_fatbin", nullptr, 544 llvm::GlobalVariable::NotThreadLocal); 545 cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName); 546 } 547 548 FatMagic = HIPFatMagic; 549 } else { 550 if (RelocatableDeviceCode) 551 FatbinConstantName = CGM.getTriple().isMacOSX() 552 ? "__NV_CUDA,__nv_relfatbin" 553 : "__nv_relfatbin"; 554 else 555 FatbinConstantName = 556 CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin"; 557 // NVIDIA's cuobjdump looks for fatbins in this section. 558 FatbinSectionName = 559 CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment"; 560 561 ModuleIDSectionName = CGM.getTriple().isMacOSX() 562 ? "__NV_CUDA,__nv_module_id" 563 : "__nv_module_id"; 564 ModuleIDPrefix = "__nv_"; 565 566 // For CUDA, create a string literal containing the fat binary loaded from 567 // the given file. 568 FatBinStr = makeConstantString(CudaGpuBinary->getBuffer(), "", 569 FatbinConstantName, 8); 570 FatMagic = CudaFatMagic; 571 } 572 573 // Create initialized wrapper structure that points to the loaded GPU binary 574 ConstantInitBuilder Builder(CGM); 575 auto Values = Builder.beginStruct(FatbinWrapperTy); 576 // Fatbin wrapper magic. 577 Values.addInt(IntTy, FatMagic); 578 // Fatbin version. 579 Values.addInt(IntTy, 1); 580 // Data. 581 Values.add(FatBinStr); 582 // Unused in fatbin v1. 583 Values.add(llvm::ConstantPointerNull::get(VoidPtrTy)); 584 llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal( 585 addUnderscoredPrefixToName("_fatbin_wrapper"), CGM.getPointerAlign(), 586 /*constant*/ true); 587 FatbinWrapper->setSection(FatbinSectionName); 588 589 // There is only one HIP fat binary per linked module, however there are 590 // multiple constructor functions. Make sure the fat binary is registered 591 // only once. The constructor functions are executed by the dynamic loader 592 // before the program gains control. The dynamic loader cannot execute the 593 // constructor functions concurrently since doing that would not guarantee 594 // thread safety of the loaded program. Therefore we can assume sequential 595 // execution of constructor functions here. 596 if (IsHIP) { 597 auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage : 598 llvm::GlobalValue::LinkOnceAnyLinkage; 599 llvm::BasicBlock *IfBlock = 600 llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc); 601 llvm::BasicBlock *ExitBlock = 602 llvm::BasicBlock::Create(Context, "exit", ModuleCtorFunc); 603 // The name, size, and initialization pattern of this variable is part 604 // of HIP ABI. 605 GpuBinaryHandle = new llvm::GlobalVariable( 606 TheModule, VoidPtrPtrTy, /*isConstant=*/false, 607 Linkage, 608 /*Initializer=*/llvm::ConstantPointerNull::get(VoidPtrPtrTy), 609 "__hip_gpubin_handle"); 610 GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getQuantity()); 611 // Prevent the weak symbol in different shared libraries being merged. 612 if (Linkage != llvm::GlobalValue::InternalLinkage) 613 GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility); 614 Address GpuBinaryAddr( 615 GpuBinaryHandle, 616 CharUnits::fromQuantity(GpuBinaryHandle->getAlignment())); 617 { 618 auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr); 619 llvm::Constant *Zero = 620 llvm::Constant::getNullValue(HandleValue->getType()); 621 llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue, Zero); 622 CtorBuilder.CreateCondBr(EQZero, IfBlock, ExitBlock); 623 } 624 { 625 CtorBuilder.SetInsertPoint(IfBlock); 626 // GpuBinaryHandle = __hipRegisterFatBinary(&FatbinWrapper); 627 llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall( 628 RegisterFatbinFunc, 629 CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy)); 630 CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryAddr); 631 CtorBuilder.CreateBr(ExitBlock); 632 } 633 { 634 CtorBuilder.SetInsertPoint(ExitBlock); 635 // Call __hip_register_globals(GpuBinaryHandle); 636 if (RegisterGlobalsFunc) { 637 auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr); 638 CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue); 639 } 640 } 641 } else if (!RelocatableDeviceCode) { 642 // Register binary with CUDA runtime. This is substantially different in 643 // default mode vs. separate compilation! 644 // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper); 645 llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall( 646 RegisterFatbinFunc, 647 CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy)); 648 GpuBinaryHandle = new llvm::GlobalVariable( 649 TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage, 650 llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle"); 651 GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getQuantity()); 652 CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle, 653 CGM.getPointerAlign()); 654 655 // Call __cuda_register_globals(GpuBinaryHandle); 656 if (RegisterGlobalsFunc) 657 CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall); 658 659 // Call __cudaRegisterFatBinaryEnd(Handle) if this CUDA version needs it. 660 if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(), 661 CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) { 662 // void __cudaRegisterFatBinaryEnd(void **); 663 llvm::FunctionCallee RegisterFatbinEndFunc = CGM.CreateRuntimeFunction( 664 llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false), 665 "__cudaRegisterFatBinaryEnd"); 666 CtorBuilder.CreateCall(RegisterFatbinEndFunc, RegisterFatbinCall); 667 } 668 } else { 669 // Generate a unique module ID. 670 SmallString<64> ModuleID; 671 llvm::raw_svector_ostream OS(ModuleID); 672 OS << ModuleIDPrefix << llvm::format("%" PRIx64, FatbinWrapper->getGUID()); 673 llvm::Constant *ModuleIDConstant = 674 makeConstantString(ModuleID.str(), "", ModuleIDSectionName, 32); 675 676 // Create an alias for the FatbinWrapper that nvcc will look for. 677 llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage, 678 Twine("__fatbinwrap") + ModuleID, FatbinWrapper); 679 680 // void __cudaRegisterLinkedBinary%ModuleID%(void (*)(void *), void *, 681 // void *, void (*)(void **)) 682 SmallString<128> RegisterLinkedBinaryName("__cudaRegisterLinkedBinary"); 683 RegisterLinkedBinaryName += ModuleID; 684 llvm::FunctionCallee RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction( 685 getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName); 686 687 assert(RegisterGlobalsFunc && "Expecting at least dummy function!"); 688 llvm::Value *Args[] = {RegisterGlobalsFunc, 689 CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy), 690 ModuleIDConstant, 691 makeDummyFunction(getCallbackFnTy())}; 692 CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args); 693 } 694 695 // Create destructor and register it with atexit() the way NVCC does it. Doing 696 // it during regular destructor phase worked in CUDA before 9.2 but results in 697 // double-free in 9.2. 698 if (llvm::Function *CleanupFn = makeModuleDtorFunction()) { 699 // extern "C" int atexit(void (*f)(void)); 700 llvm::FunctionType *AtExitTy = 701 llvm::FunctionType::get(IntTy, CleanupFn->getType(), false); 702 llvm::FunctionCallee AtExitFunc = 703 CGM.CreateRuntimeFunction(AtExitTy, "atexit", llvm::AttributeList(), 704 /*Local=*/true); 705 CtorBuilder.CreateCall(AtExitFunc, CleanupFn); 706 } 707 708 CtorBuilder.CreateRetVoid(); 709 return ModuleCtorFunc; 710 } 711 712 /// Creates a global destructor function that unregisters the GPU code blob 713 /// registered by constructor. 714 /// 715 /// For CUDA: 716 /// \code 717 /// void __cuda_module_dtor(void*) { 718 /// __cudaUnregisterFatBinary(Handle); 719 /// } 720 /// \endcode 721 /// 722 /// For HIP: 723 /// \code 724 /// void __hip_module_dtor(void*) { 725 /// if (__hip_gpubin_handle) { 726 /// __hipUnregisterFatBinary(__hip_gpubin_handle); 727 /// __hip_gpubin_handle = 0; 728 /// } 729 /// } 730 /// \endcode 731 llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() { 732 // No need for destructor if we don't have a handle to unregister. 733 if (!GpuBinaryHandle) 734 return nullptr; 735 736 // void __cudaUnregisterFatBinary(void ** handle); 737 llvm::FunctionCallee UnregisterFatbinFunc = CGM.CreateRuntimeFunction( 738 llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false), 739 addUnderscoredPrefixToName("UnregisterFatBinary")); 740 741 llvm::Function *ModuleDtorFunc = llvm::Function::Create( 742 llvm::FunctionType::get(VoidTy, VoidPtrTy, false), 743 llvm::GlobalValue::InternalLinkage, 744 addUnderscoredPrefixToName("_module_dtor"), &TheModule); 745 746 llvm::BasicBlock *DtorEntryBB = 747 llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc); 748 CGBuilderTy DtorBuilder(CGM, Context); 749 DtorBuilder.SetInsertPoint(DtorEntryBB); 750 751 Address GpuBinaryAddr(GpuBinaryHandle, CharUnits::fromQuantity( 752 GpuBinaryHandle->getAlignment())); 753 auto HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr); 754 // There is only one HIP fat binary per linked module, however there are 755 // multiple destructor functions. Make sure the fat binary is unregistered 756 // only once. 757 if (CGM.getLangOpts().HIP) { 758 llvm::BasicBlock *IfBlock = 759 llvm::BasicBlock::Create(Context, "if", ModuleDtorFunc); 760 llvm::BasicBlock *ExitBlock = 761 llvm::BasicBlock::Create(Context, "exit", ModuleDtorFunc); 762 llvm::Constant *Zero = llvm::Constant::getNullValue(HandleValue->getType()); 763 llvm::Value *NEZero = DtorBuilder.CreateICmpNE(HandleValue, Zero); 764 DtorBuilder.CreateCondBr(NEZero, IfBlock, ExitBlock); 765 766 DtorBuilder.SetInsertPoint(IfBlock); 767 DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue); 768 DtorBuilder.CreateStore(Zero, GpuBinaryAddr); 769 DtorBuilder.CreateBr(ExitBlock); 770 771 DtorBuilder.SetInsertPoint(ExitBlock); 772 } else { 773 DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue); 774 } 775 DtorBuilder.CreateRetVoid(); 776 return ModuleDtorFunc; 777 } 778 779 CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) { 780 return new CGNVCUDARuntime(CGM); 781 } 782