//===----- CGCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This provides a class for CUDA code generation targeting the NVIDIA CUDA
// runtime library.
//
//===----------------------------------------------------------------------===//

#include "CGCUDARuntime.h"
#include "CodeGenFunction.h"
#include "CodeGenModule.h"
#include "clang/AST/Decl.h"
#include "clang/Basic/Cuda.h"
#include "clang/CodeGen/CodeGenABITypes.h"
#include "clang/CodeGen/ConstantInitBuilder.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/Support/Format.h"

using namespace clang;
using namespace CodeGen;

namespace {
// Magic numbers written into the first field of the fatbin wrapper struct so
// the respective runtime can recognize the blob it is handed.
constexpr unsigned CudaFatMagic = 0x466243b1;
constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF"

/// Host-side CUDA/HIP code generation: emits kernel launch stubs and the
/// module constructor/destructor pair that registers/unregisters the embedded
/// GPU binary and its globals with the runtime.
class CGNVCUDARuntime : public CGCUDARuntime {

private:
  // Frequently used LLVM types, cached from CodeGenModule in the constructor.
  llvm::IntegerType *IntTy, *SizeTy;
  llvm::Type *VoidTy;
  llvm::PointerType *CharPtrTy, *VoidPtrTy, *VoidPtrPtrTy;

  /// Convenience reference to LLVM Context
  llvm::LLVMContext &Context;
  /// Convenience reference to the current module
  llvm::Module &TheModule;
  /// Keeps track of kernel launch stubs emitted in this module
  struct KernelInfo {
    llvm::Function *Kernel; // Host-side launch stub emitted for the kernel.
    const Decl *D;          // Kernel declaration; used for device-side naming.
  };
  llvm::SmallVector<KernelInfo, 16> EmittedKernels;
  /// Device-side global variables that need to be registered with the runtime.
  struct VarInfo {
    llvm::GlobalVariable *Var;
    const VarDecl *D;
    unsigned Flag; // CGCUDARuntime flags (e.g. ExternDeviceVar,
                   // ConstantDeviceVar) — see uses in makeRegisterGlobalsFn().
  };
  llvm::SmallVector<VarInfo, 16> DeviceVars;
  /// Keeps track of variable containing handle of GPU binary. Populated by
  /// ModuleCtorFunction() and used to create corresponding cleanup calls in
  /// ModuleDtorFunction()
  llvm::GlobalVariable *GpuBinaryHandle = nullptr;
  /// Whether we generate relocatable device code.
  bool RelocatableDeviceCode;
  /// Mangle context for device.
  std::unique_ptr<MangleContext> DeviceMC;

  llvm::FunctionCallee getSetupArgumentFn() const;
  llvm::FunctionCallee getLaunchFn() const;

  llvm::FunctionType *getRegisterGlobalsFnTy() const;
  llvm::FunctionType *getCallbackFnTy() const;
  llvm::FunctionType *getRegisterLinkedBinaryFnTy() const;
  std::string addPrefixToName(StringRef FuncName) const;
  std::string addUnderscoredPrefixToName(StringRef FuncName) const;

  /// Creates a function to register all kernel stubs generated in this module.
  llvm::Function *makeRegisterGlobalsFn();

  /// Helper function that generates a constant string and returns a pointer to
  /// the start of the string. The result of this function can be used anywhere
  /// where the C code specifies const char*.
  ///
  /// \param Str         contents of the string (may contain embedded NULs,
  ///                    e.g. a fat binary blob).
  /// \param Name        optional name for the global holding the string.
  /// \param SectionName if non-empty, the global is placed in this section.
  /// \param Alignment   if non-zero, alignment to force on the global.
  llvm::Constant *makeConstantString(const std::string &Str,
                                     const std::string &Name = "",
                                     const std::string &SectionName = "",
                                     unsigned Alignment = 0) {
    // Indices for the GEP below: step through the pointer, then to element 0.
    llvm::Constant *Zeros[] = {llvm::ConstantInt::get(SizeTy, 0),
                               llvm::ConstantInt::get(SizeTy, 0)};
    auto ConstStr = CGM.GetAddrOfConstantCString(Str, Name.c_str());
    llvm::GlobalVariable *GV =
        cast<llvm::GlobalVariable>(ConstStr.getPointer());
    if (!SectionName.empty()) {
      GV->setSection(SectionName);
      // Mark the address as used which make sure that this section isn't
      // merged and we will really have it in the object file.
      GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None);
    }
    if (Alignment)
      GV->setAlignment(Alignment);

    // Decay the array global to a pointer to its first character.
    return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(),
                                                ConstStr.getPointer(), Zeros);
  }

  /// Helper function that generates an empty dummy function returning void.
  llvm::Function *makeDummyFunction(llvm::FunctionType *FnTy) {
    assert(FnTy->getReturnType()->isVoidTy() &&
           "Can only generate dummy functions returning void!");
    llvm::Function *DummyFunc = llvm::Function::Create(
        FnTy, llvm::GlobalValue::InternalLinkage, "dummy", &TheModule);

    // Single basic block containing just `ret void`.
    llvm::BasicBlock *DummyBlock =
        llvm::BasicBlock::Create(Context, "", DummyFunc);
    CGBuilderTy FuncBuilder(CGM, Context);
    FuncBuilder.SetInsertPoint(DummyBlock);
    FuncBuilder.CreateRetVoid();

    return DummyFunc;
  }

  /// Emits a stub body using the pre-CUDA-9.0 launch ABI
  /// (cudaSetupArgument + cudaLaunch).
  void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args);
  /// Emits a stub body using the CUDA 9.0+ launch ABI (cudaLaunchKernel).
  void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args);
  /// Returns the name the given declaration has on the device side.
  std::string getDeviceSideName(const Decl *ND);

public:
  CGNVCUDARuntime(CodeGenModule &CGM);

  void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override;
  void registerDeviceVar(const VarDecl *VD, llvm::GlobalVariable &Var,
                         unsigned Flags) override {
    DeviceVars.push_back({&Var, VD, Flags});
  }

  /// Creates module constructor function
  llvm::Function *makeModuleCtorFunction() override;
  /// Creates module destructor function
  llvm::Function *makeModuleDtorFunction() override;
};

} // end anonymous namespace

std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const {
  if (CGM.getLangOpts().HIP)
    return ((Twine("hip") + Twine(FuncName)).str());
  return ((Twine("cuda") + Twine(FuncName)).str());
}
std::string
CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const {
  if (CGM.getLangOpts().HIP)
    return ((Twine("__hip") + Twine(FuncName)).str());
  return ((Twine("__cuda") + Twine(FuncName)).str());
}

CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
    : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()),
      TheModule(CGM.getModule()),
      RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode),
      // Device-side names are mangled against the aux target, which may use a
      // different ABI than the host.
      DeviceMC(CGM.getContext().createMangleContext(
          CGM.getContext().getAuxTargetInfo())) {
  CodeGen::CodeGenTypes &Types = CGM.getTypes();
  ASTContext &Ctx = CGM.getContext();

  IntTy = CGM.IntTy;
  SizeTy = CGM.SizeTy;
  VoidTy = CGM.VoidTy;

  CharPtrTy = llvm::PointerType::getUnqual(Types.ConvertType(Ctx.CharTy));
  VoidPtrTy = cast<llvm::PointerType>(Types.ConvertType(Ctx.VoidPtrTy));
  VoidPtrPtrTy = VoidPtrTy->getPointerTo();
}

llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const {
  // cudaError_t cudaSetupArgument(void *, size_t, size_t)
  llvm::Type *Params[] = {VoidPtrTy, SizeTy, SizeTy};
  return CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy, Params, false),
      addPrefixToName("SetupArgument"));
}

llvm::FunctionCallee CGNVCUDARuntime::getLaunchFn() const {
  if (CGM.getLangOpts().HIP) {
    // hipError_t hipLaunchByPtr(char *);
    return CGM.CreateRuntimeFunction(
        llvm::FunctionType::get(IntTy, CharPtrTy, false), "hipLaunchByPtr");
  } else {
    // cudaError_t cudaLaunch(char *);
    return CGM.CreateRuntimeFunction(
        llvm::FunctionType::get(IntTy, CharPtrTy, false), "cudaLaunch");
  }
}

llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy() const {
  // void __{cuda|hip}_register_globals(void **fatbinHandle);
  return llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false);
}

llvm::FunctionType *CGNVCUDARuntime::getCallbackFnTy() const {
  // void callback(void *data);
  return llvm::FunctionType::get(VoidTy, VoidPtrTy, false);
}

llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy() const {
  // void __cudaRegisterLinkedBinary(void (*)(void **), void *, void *,
  //                                 void (*)(void *));
  auto CallbackFnTy = getCallbackFnTy();
  auto RegisterGlobalsFnTy = getRegisterGlobalsFnTy();
  llvm::Type *Params[] = {RegisterGlobalsFnTy->getPointerTo(), VoidPtrTy,
                          VoidPtrTy, CallbackFnTy->getPointerTo()};
  return llvm::FunctionType::get(VoidTy, Params, false);
}

std::string CGNVCUDARuntime::getDeviceSideName(const Decl *D) {
  auto *ND = cast<const NamedDecl>(D);
  std::string DeviceSideName;
  // Use the device mangle context so the stub name matches the name the
  // device-side compilation produced; unmangled names are used verbatim.
  if (DeviceMC->shouldMangleDeclName(ND)) {
    SmallString<256> Buffer;
    llvm::raw_svector_ostream Out(Buffer);
    DeviceMC->mangleName(ND, Out);
    DeviceSideName = Out.str();
  } else
    DeviceSideName = ND->getIdentifier()->getName();
  return DeviceSideName;
}

void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
                                     FunctionArgList &Args) {
  // The host stub should carry the device-side name of the kernel, unless host
  // and device use different C++ ABIs and therefore mangle differently.
  assert(getDeviceSideName(CGF.CurFuncDecl) == CGF.CurFn->getName() ||
         CGF.CGM.getContext().getTargetInfo().getCXXABI() !=
             CGF.CGM.getContext().getAuxTargetInfo()->getCXXABI());

  EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl});
  // Pick the launch ABI based on the SDK version we are compiling against.
  if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
                         CudaFeature::CUDA_USES_NEW_LAUNCH))
    emitDeviceStubBodyNew(CGF, Args);
  else
    emitDeviceStubBodyLegacy(CGF, Args);
}

// CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local
// array and kernels are launched using cudaLaunchKernel().
void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
                                            FunctionArgList &Args) {
  // Build the shadow stack entry at the very start of the function.

  // Calculate amount of space we will need for all arguments. If we have no
  // args, allocate a single pointer so we still have a valid pointer to the
  // argument array that we can pass to runtime, even if it will be unused.
  Address KernelArgs = CGF.CreateTempAlloca(
      VoidPtrTy, CharUnits::fromQuantity(16), "kernel_args",
      llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size())));
  // Store pointers to the arguments in a locally allocated launch_args.
  for (unsigned i = 0; i < Args.size(); ++i) {
    llvm::Value* VarPtr = CGF.GetAddrOfLocalVar(Args[i]).getPointer();
    llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(VarPtr, VoidPtrTy);
    CGF.Builder.CreateDefaultAlignedStore(
        VoidVarPtr, CGF.Builder.CreateConstGEP1_32(KernelArgs.getPointer(), i));
  }

  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");

  // Lookup cudaLaunchKernel function.
  // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
  //                              void **args, size_t sharedMem,
  //                              cudaStream_t stream);
  TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl();
  DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl);
  IdentifierInfo &cudaLaunchKernelII =
      CGM.getContext().Idents.get("cudaLaunchKernel");
  FunctionDecl *cudaLaunchKernelFD = nullptr;
  for (const auto &Result : DC->lookup(&cudaLaunchKernelII)) {
    if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Result))
      cudaLaunchKernelFD = FD;
  }

  if (cudaLaunchKernelFD == nullptr) {
    // The declaration normally comes from the CUDA headers; without it we
    // cannot form the call, so diagnose and bail out.
    CGM.Error(CGF.CurFuncDecl->getLocation(),
              "Can't find declaration for cudaLaunchKernel()");
    return;
  }
  // Create temporary dim3 grid_dim, block_dim.
  ParmVarDecl *GridDimParam = cudaLaunchKernelFD->getParamDecl(1);
  QualType Dim3Ty = GridDimParam->getType();
  Address GridDim =
      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "grid_dim");
  Address BlockDim =
      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim");
  Address ShmemSize =
      CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size");
  Address Stream =
      CGF.CreateTempAlloca(VoidPtrTy, CGM.getPointerAlign(), "stream");
  // __cudaPopCallConfiguration fills the temporaries above with the launch
  // configuration; they are then forwarded to cudaLaunchKernel below.
  llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy,
                              {/*gridDim=*/GridDim.getType(),
                               /*blockDim=*/BlockDim.getType(),
                               /*ShmemSize=*/ShmemSize.getType(),
                               /*Stream=*/Stream.getType()},
                              /*isVarArg=*/false),
      "__cudaPopCallConfiguration");

  CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn,
                              {GridDim.getPointer(), BlockDim.getPointer(),
                               ShmemSize.getPointer(), Stream.getPointer()});

  // Emit the call to cudaLaunch
  llvm::Value *Kernel = CGF.Builder.CreatePointerCast(CGF.CurFn, VoidPtrTy);
  CallArgList LaunchKernelArgs;
  LaunchKernelArgs.add(RValue::get(Kernel),
                       cudaLaunchKernelFD->getParamDecl(0)->getType());
  LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty);
  LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty);
  LaunchKernelArgs.add(RValue::get(KernelArgs.getPointer()),
                       cudaLaunchKernelFD->getParamDecl(3)->getType());
  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(ShmemSize)),
                       cudaLaunchKernelFD->getParamDecl(4)->getType());
  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(Stream)),
                       cudaLaunchKernelFD->getParamDecl(5)->getType());

  // Derive the LLVM function type from the found declaration so the call
  // matches whatever the headers declared.
  QualType QT = cudaLaunchKernelFD->getType();
  QualType CQT = QT.getCanonicalType();
  llvm::Type *Ty = CGM.getTypes().ConvertType(CQT);
  llvm::FunctionType *FTy = dyn_cast<llvm::FunctionType>(Ty);

  const CGFunctionInfo &FI =
      CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD);
  llvm::FunctionCallee cudaLaunchKernelFn =
      CGM.CreateRuntimeFunction(FTy, "cudaLaunchKernel");
  CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(),
               LaunchKernelArgs);
  CGF.EmitBranch(EndBlock);

  CGF.EmitBlock(EndBlock);
}

void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF,
                                               FunctionArgList &Args) {
  // Emit a call to cudaSetupArgument for each arg in Args.
  llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn();
  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
  CharUnits Offset = CharUnits::Zero();
  for (const VarDecl *A : Args) {
    // Each argument is copied into the launch buffer at its naturally aligned
    // offset, mirroring the device-side parameter layout.
    CharUnits TyWidth, TyAlign;
    std::tie(TyWidth, TyAlign) =
        CGM.getContext().getTypeInfoInChars(A->getType());
    Offset = Offset.alignTo(TyAlign);
    llvm::Value *Args[] = {
        CGF.Builder.CreatePointerCast(CGF.GetAddrOfLocalVar(A).getPointer(),
                                      VoidPtrTy),
        llvm::ConstantInt::get(SizeTy, TyWidth.getQuantity()),
        llvm::ConstantInt::get(SizeTy, Offset.getQuantity()),
    };
    llvm::CallBase *CB = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args);
    // A non-zero return aborts the remaining setup calls and the launch.
    llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0);
    llvm::Value *CBZero = CGF.Builder.CreateICmpEQ(CB, Zero);
    llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next");
    CGF.Builder.CreateCondBr(CBZero, NextBlock, EndBlock);
    CGF.EmitBlock(NextBlock);
    Offset += TyWidth;
  }

  // Emit the call to cudaLaunch
  llvm::FunctionCallee cudaLaunchFn = getLaunchFn();
  llvm::Value *Arg = CGF.Builder.CreatePointerCast(CGF.CurFn, CharPtrTy);
  CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg);
  CGF.EmitBranch(EndBlock);

  CGF.EmitBlock(EndBlock);
}

/// Creates a function that sets up state on the host side for CUDA objects that
/// have a presence on both the host and device sides.
/// Specifically, registers
/// the host side of kernel functions and device global variables with the CUDA
/// runtime.
/// \code
/// void __cuda_register_globals(void** GpuBinaryHandle) {
///   __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...);
///   ...
///   __cudaRegisterFunction(GpuBinaryHandle,KernelM,...);
///   __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...);
///   ...
///   __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...);
/// }
/// \endcode
///
/// Returns nullptr when there is nothing to register.
llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
  // No need to register anything
  if (EmittedKernels.empty() && DeviceVars.empty())
    return nullptr;

  llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
      getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_register_globals"), &TheModule);
  llvm::BasicBlock *EntryBB =
      llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc);
  CGBuilderTy Builder(CGM, Context);
  Builder.SetInsertPoint(EntryBB);

  // void __cudaRegisterFunction(void **, const char *, char *, const char *,
  //                             int, uint3*, uint3*, dim3*, dim3*, int*)
  llvm::Type *RegisterFuncParams[] = {
      VoidPtrPtrTy, CharPtrTy, CharPtrTy, CharPtrTy, IntTy,
      VoidPtrTy,    VoidPtrTy, VoidPtrTy, VoidPtrTy, IntTy->getPointerTo()};
  llvm::FunctionCallee RegisterFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy, RegisterFuncParams, false),
      addUnderscoredPrefixToName("RegisterFunction"));

  // Extract GpuBinaryHandle passed as the first argument passed to
  // __cuda_register_globals() and generate __cudaRegisterFunction() call for
  // each emitted kernel.
  llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin();
  for (auto &&I : EmittedKernels) {
    llvm::Constant *KernelName = makeConstantString(getDeviceSideName(I.D));
    llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy);
    llvm::Value *Args[] = {
        &GpuBinaryHandlePtr,
        Builder.CreateBitCast(I.Kernel, VoidPtrTy),
        KernelName,
        KernelName,
        // Thread limit: -1 means no limit; the uint3/dim3 pointers and the
        // wSize pointer are unused and passed as null.
        llvm::ConstantInt::get(IntTy, -1),
        NullPtr,
        NullPtr,
        NullPtr,
        NullPtr,
        llvm::ConstantPointerNull::get(IntTy->getPointerTo())};
    Builder.CreateCall(RegisterFunc, Args);
  }

  // void __cudaRegisterVar(void **, char *, char *, const char *,
  //                        int, int, int, int)
  llvm::Type *RegisterVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy,
                                     CharPtrTy,    IntTy,     IntTy,
                                     IntTy,        IntTy};
  llvm::FunctionCallee RegisterVar = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy, RegisterVarParams, false),
      addUnderscoredPrefixToName("RegisterVar"));
  for (auto &&Info : DeviceVars) {
    llvm::GlobalVariable *Var = Info.Var;
    unsigned Flags = Info.Flag;
    llvm::Constant *VarName = makeConstantString(getDeviceSideName(Info.D));
    uint64_t VarSize =
        CGM.getDataLayout().getTypeAllocSize(Var->getValueType());
    llvm::Value *Args[] = {
        &GpuBinaryHandlePtr,
        Builder.CreateBitCast(Var, VoidPtrTy),
        VarName,
        VarName,
        // ext, size, constant, global flags as expected by __cudaRegisterVar.
        llvm::ConstantInt::get(IntTy, (Flags & ExternDeviceVar) ? 1 : 0),
        llvm::ConstantInt::get(IntTy, VarSize),
        llvm::ConstantInt::get(IntTy, (Flags & ConstantDeviceVar) ? 1 : 0),
        llvm::ConstantInt::get(IntTy, 0)};
    Builder.CreateCall(RegisterVar, Args);
  }

  Builder.CreateRetVoid();
  return RegisterKernelsFunc;
}

/// Creates a global constructor function for the module:
///
/// For CUDA:
/// \code
/// void __cuda_module_ctor(void*) {
///     Handle = __cudaRegisterFatBinary(GpuBinaryBlob);
///     __cuda_register_globals(Handle);
/// }
/// \endcode
///
/// For HIP:
/// \code
/// void __hip_module_ctor(void*) {
///     if (__hip_gpubin_handle == 0) {
///         __hip_gpubin_handle = __hipRegisterFatBinary(GpuBinaryBlob);
///         __hip_register_globals(__hip_gpubin_handle);
///     }
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
  bool IsHIP = CGM.getLangOpts().HIP;
  // No need to generate ctors/dtors if there is no GPU binary.
  StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName;
  if (CudaGpuBinaryFileName.empty() && !IsHIP)
    return nullptr;

  // void __{cuda|hip}_register_globals(void* handle);
  llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
  // We always need a function to pass in as callback. Create a dummy
  // implementation if we don't need to register anything.
  if (RelocatableDeviceCode && !RegisterGlobalsFunc)
    RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy());

  // void ** __{cuda|hip}RegisterFatBinary(void *);
  llvm::FunctionCallee RegisterFatbinFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false),
      addUnderscoredPrefixToName("RegisterFatBinary"));
  // struct { int magic, int version, void * gpu_binary, void * dont_care };
  llvm::StructType *FatbinWrapperTy =
      llvm::StructType::get(IntTy, IntTy, VoidPtrTy, VoidPtrTy);

  // Register GPU binary with the CUDA runtime, store returned handle in a
  // global variable and save a reference in GpuBinaryHandle to be cleaned up
  // in destructor on exit. Then associate all known kernels with the GPU binary
  // handle so CUDA runtime can figure out what to call on the GPU side.
  std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary = nullptr;
  if (!CudaGpuBinaryFileName.empty()) {
    llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> CudaGpuBinaryOrErr =
        llvm::MemoryBuffer::getFileOrSTDIN(CudaGpuBinaryFileName);
    if (std::error_code EC = CudaGpuBinaryOrErr.getError()) {
      CGM.getDiags().Report(diag::err_cannot_open_file)
          << CudaGpuBinaryFileName << EC.message();
      return nullptr;
    }
    CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get());
  }

  llvm::Function *ModuleCtorFunc = llvm::Function::Create(
      llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
      llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_module_ctor"), &TheModule);
  llvm::BasicBlock *CtorEntryBB =
      llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc);
  CGBuilderTy CtorBuilder(CGM, Context);

  CtorBuilder.SetInsertPoint(CtorEntryBB);

  // Section/name choices below differ between HIP and CUDA, and for CUDA
  // additionally between Mach-O (macOS) and ELF targets.
  const char *FatbinConstantName;
  const char *FatbinSectionName;
  const char *ModuleIDSectionName;
  StringRef ModuleIDPrefix;
  llvm::Constant *FatBinStr;
  unsigned FatMagic;
  if (IsHIP) {
    FatbinConstantName = ".hip_fatbin";
    FatbinSectionName = ".hipFatBinSegment";

    ModuleIDSectionName = "__hip_module_id";
    ModuleIDPrefix = "__hip_";

    if (CudaGpuBinary) {
      // If fatbin is available from early finalization, create a string
      // literal containing the fat binary loaded from the given file.
      FatBinStr = makeConstantString(CudaGpuBinary->getBuffer(), "",
                                     FatbinConstantName, 8);
    } else {
      // If fatbin is not available, create an external symbol
      // __hip_fatbin in section .hip_fatbin. The external symbol is supposed
      // to contain the fat binary but will be populated somewhere else,
      // e.g. by lld through link script.
      FatBinStr = new llvm::GlobalVariable(
        CGM.getModule(), CGM.Int8Ty,
        /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr,
        "__hip_fatbin", nullptr,
        llvm::GlobalVariable::NotThreadLocal);
      cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName);
    }

    FatMagic = HIPFatMagic;
  } else {
    if (RelocatableDeviceCode)
      FatbinConstantName = CGM.getTriple().isMacOSX()
                               ? "__NV_CUDA,__nv_relfatbin"
                               : "__nv_relfatbin";
    else
      FatbinConstantName =
          CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
    // NVIDIA's cuobjdump looks for fatbins in this section.
    FatbinSectionName =
        CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";

    ModuleIDSectionName = CGM.getTriple().isMacOSX()
                              ? "__NV_CUDA,__nv_module_id"
                              : "__nv_module_id";
    ModuleIDPrefix = "__nv_";

    // For CUDA, create a string literal containing the fat binary loaded from
    // the given file.
    FatBinStr = makeConstantString(CudaGpuBinary->getBuffer(), "",
                                   FatbinConstantName, 8);
    FatMagic = CudaFatMagic;
  }

  // Create initialized wrapper structure that points to the loaded GPU binary
  ConstantInitBuilder Builder(CGM);
  auto Values = Builder.beginStruct(FatbinWrapperTy);
  // Fatbin wrapper magic.
  Values.addInt(IntTy, FatMagic);
  // Fatbin version.
  Values.addInt(IntTy, 1);
  // Data.
  Values.add(FatBinStr);
  // Unused in fatbin v1.
  Values.add(llvm::ConstantPointerNull::get(VoidPtrTy));
  llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal(
      addUnderscoredPrefixToName("_fatbin_wrapper"), CGM.getPointerAlign(),
      /*constant*/ true);
  FatbinWrapper->setSection(FatbinSectionName);

  // There is only one HIP fat binary per linked module, however there are
  // multiple constructor functions. Make sure the fat binary is registered
  // only once. The constructor functions are executed by the dynamic loader
  // before the program gains control. The dynamic loader cannot execute the
  // constructor functions concurrently since doing that would not guarantee
  // thread safety of the loaded program. Therefore we can assume sequential
  // execution of constructor functions here.
  if (IsHIP) {
    // With an embedded (internal) binary the handle is private to this module;
    // otherwise it is shared across modules and deduplicated by the linker.
    auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage :
        llvm::GlobalValue::LinkOnceAnyLinkage;
    llvm::BasicBlock *IfBlock =
        llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc);
    llvm::BasicBlock *ExitBlock =
        llvm::BasicBlock::Create(Context, "exit", ModuleCtorFunc);
    // The name, size, and initialization pattern of this variable is part
    // of HIP ABI.
    GpuBinaryHandle = new llvm::GlobalVariable(
        TheModule, VoidPtrPtrTy, /*isConstant=*/false,
        Linkage,
        /*Initializer=*/llvm::ConstantPointerNull::get(VoidPtrPtrTy),
        "__hip_gpubin_handle");
    GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getQuantity());
    // Prevent the weak symbol in different shared libraries being merged.
    if (Linkage != llvm::GlobalValue::InternalLinkage)
      GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility);
    Address GpuBinaryAddr(
        GpuBinaryHandle,
        CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
    {
      // Only register if the handle is still null (first ctor to run wins).
      auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
      llvm::Constant *Zero =
          llvm::Constant::getNullValue(HandleValue->getType());
      llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue, Zero);
      CtorBuilder.CreateCondBr(EQZero, IfBlock, ExitBlock);
    }
    {
      CtorBuilder.SetInsertPoint(IfBlock);
      // GpuBinaryHandle = __hipRegisterFatBinary(&FatbinWrapper);
      llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
          RegisterFatbinFunc,
          CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
      CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryAddr);
      CtorBuilder.CreateBr(ExitBlock);
    }
    {
      CtorBuilder.SetInsertPoint(ExitBlock);
      // Call __hip_register_globals(GpuBinaryHandle);
      if (RegisterGlobalsFunc) {
        auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
        CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue);
      }
    }
  } else if (!RelocatableDeviceCode) {
    // Register binary with CUDA runtime. This is substantially different in
    // default mode vs. separate compilation!
    // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
    llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
        RegisterFatbinFunc,
        CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
    GpuBinaryHandle = new llvm::GlobalVariable(
        TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage,
        llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle");
    GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getQuantity());
    CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
                                   CGM.getPointerAlign());

    // Call __cuda_register_globals(GpuBinaryHandle);
    if (RegisterGlobalsFunc)
      CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);

    // Call __cudaRegisterFatBinaryEnd(Handle) if this CUDA version needs it.
    if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
                           CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) {
      // void __cudaRegisterFatBinaryEnd(void **);
      llvm::FunctionCallee RegisterFatbinEndFunc = CGM.CreateRuntimeFunction(
          llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
          "__cudaRegisterFatBinaryEnd");
      CtorBuilder.CreateCall(RegisterFatbinEndFunc, RegisterFatbinCall);
    }
  } else {
    // Relocatable device code (separate compilation): registration is deferred
    // to __cudaRegisterLinkedBinary<ID>, resolved at link time.
    // Generate a unique module ID.
    SmallString<64> ModuleID;
    llvm::raw_svector_ostream OS(ModuleID);
    OS << ModuleIDPrefix << llvm::format("%" PRIx64, FatbinWrapper->getGUID());
    llvm::Constant *ModuleIDConstant =
        makeConstantString(ModuleID.str(), "", ModuleIDSectionName, 32);

    // Create an alias for the FatbinWrapper that nvcc will look for.
    llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage,
                              Twine("__fatbinwrap") + ModuleID, FatbinWrapper);

    // void __cudaRegisterLinkedBinary%ModuleID%(void (*)(void *), void *,
    // void *, void (*)(void **))
    SmallString<128> RegisterLinkedBinaryName("__cudaRegisterLinkedBinary");
    RegisterLinkedBinaryName += ModuleID;
    llvm::FunctionCallee RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction(
        getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName);

    assert(RegisterGlobalsFunc && "Expecting at least dummy function!");
    llvm::Value *Args[] = {RegisterGlobalsFunc,
                           CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy),
                           ModuleIDConstant,
                           makeDummyFunction(getCallbackFnTy())};
    CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args);
  }

  // Create destructor and register it with atexit() the way NVCC does it. Doing
  // it during regular destructor phase worked in CUDA before 9.2 but results in
  // double-free in 9.2.
  if (llvm::Function *CleanupFn = makeModuleDtorFunction()) {
    // extern "C" int atexit(void (*f)(void));
    llvm::FunctionType *AtExitTy =
        llvm::FunctionType::get(IntTy, CleanupFn->getType(), false);
    llvm::FunctionCallee AtExitFunc =
        CGM.CreateRuntimeFunction(AtExitTy, "atexit", llvm::AttributeList(),
                                  /*Local=*/true);
    CtorBuilder.CreateCall(AtExitFunc, CleanupFn);
  }

  CtorBuilder.CreateRetVoid();
  return ModuleCtorFunc;
}

/// Creates a global destructor function that unregisters the GPU code blob
/// registered by constructor.
///
/// For CUDA:
/// \code
/// void __cuda_module_dtor(void*) {
///     __cudaUnregisterFatBinary(Handle);
/// }
/// \endcode
///
/// For HIP:
/// \code
/// void __hip_module_dtor(void*) {
///     if (__hip_gpubin_handle) {
///         __hipUnregisterFatBinary(__hip_gpubin_handle);
///         __hip_gpubin_handle = 0;
///     }
/// }
/// \endcode
///
/// Returns nullptr if the constructor registered no binary (GpuBinaryHandle
/// was never created).
llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
  // No need for destructor if we don't have a handle to unregister.
  if (!GpuBinaryHandle)
    return nullptr;

  // void __cudaUnregisterFatBinary(void ** handle);
  llvm::FunctionCallee UnregisterFatbinFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
      addUnderscoredPrefixToName("UnregisterFatBinary"));

  llvm::Function *ModuleDtorFunc = llvm::Function::Create(
      llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
      llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_module_dtor"), &TheModule);

  llvm::BasicBlock *DtorEntryBB =
      llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc);
  CGBuilderTy DtorBuilder(CGM, Context);
  DtorBuilder.SetInsertPoint(DtorEntryBB);

  Address GpuBinaryAddr(GpuBinaryHandle, CharUnits::fromQuantity(
                                             GpuBinaryHandle->getAlignment()));
  auto HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr);
  // There is only one HIP fat binary per linked module, however there are
  // multiple destructor functions. Make sure the fat binary is unregistered
  // only once.
  if (CGM.getLangOpts().HIP) {
    llvm::BasicBlock *IfBlock =
        llvm::BasicBlock::Create(Context, "if", ModuleDtorFunc);
    llvm::BasicBlock *ExitBlock =
        llvm::BasicBlock::Create(Context, "exit", ModuleDtorFunc);
    // Unregister only if the handle is still non-null.
    llvm::Constant *Zero = llvm::Constant::getNullValue(HandleValue->getType());
    llvm::Value *NEZero = DtorBuilder.CreateICmpNE(HandleValue, Zero);
    DtorBuilder.CreateCondBr(NEZero, IfBlock, ExitBlock);

    DtorBuilder.SetInsertPoint(IfBlock);
    DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
    // Zero out the handle so destructors of other modules sharing it skip
    // the unregister call.
    DtorBuilder.CreateStore(Zero, GpuBinaryAddr);
    DtorBuilder.CreateBr(ExitBlock);

    DtorBuilder.SetInsertPoint(ExitBlock);
  } else {
    DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
  }
  DtorBuilder.CreateRetVoid();
  return ModuleDtorFunc;
}

// Factory used by CodeGenModule to construct the NVIDIA CUDA/HIP runtime
// code generator.
CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) {
  return new CGNVCUDARuntime(CGM);
}