1 //===----- CGCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This provides a class for CUDA code generation targeting the NVIDIA CUDA 11 // runtime library. 12 // 13 //===----------------------------------------------------------------------===// 14 15 #include "CGCUDARuntime.h" 16 #include "CodeGenFunction.h" 17 #include "CodeGenModule.h" 18 #include "clang/AST/Decl.h" 19 #include "clang/CodeGen/ConstantInitBuilder.h" 20 #include "llvm/IR/BasicBlock.h" 21 #include "llvm/IR/CallSite.h" 22 #include "llvm/IR/Constants.h" 23 #include "llvm/IR/DerivedTypes.h" 24 #include "llvm/Support/Format.h" 25 26 using namespace clang; 27 using namespace CodeGen; 28 29 namespace { 30 31 class CGNVCUDARuntime : public CGCUDARuntime { 32 33 private: 34 llvm::IntegerType *IntTy, *SizeTy; 35 llvm::Type *VoidTy; 36 llvm::PointerType *CharPtrTy, *VoidPtrTy, *VoidPtrPtrTy; 37 38 /// Convenience reference to LLVM Context 39 llvm::LLVMContext &Context; 40 /// Convenience reference to the current module 41 llvm::Module &TheModule; 42 /// Keeps track of kernel launch stubs emitted in this module 43 llvm::SmallVector<llvm::Function *, 16> EmittedKernels; 44 llvm::SmallVector<std::pair<llvm::GlobalVariable *, unsigned>, 16> DeviceVars; 45 /// Keeps track of variable containing handle of GPU binary. Populated by 46 /// ModuleCtorFunction() and used to create corresponding cleanup calls in 47 /// ModuleDtorFunction() 48 llvm::GlobalVariable *GpuBinaryHandle = nullptr; 49 /// Whether we generate relocatable device code. 50 bool RelocatableDeviceCode; 51 52 llvm::Constant *getSetupArgumentFn() const; 53 llvm::Constant *getLaunchFn() const; 54 55 llvm::FunctionType *getRegisterGlobalsFnTy() const; 56 llvm::FunctionType *getCallbackFnTy() const; 57 llvm::FunctionType *getRegisterLinkedBinaryFnTy() const; 58 std::string addPrefixToName(StringRef FuncName) const; 59 std::string addUnderscoredPrefixToName(StringRef FuncName) const; 60 61 /// Creates a function to register all kernel stubs generated in this module. 62 llvm::Function *makeRegisterGlobalsFn(); 63 64 /// Helper function that generates a constant string and returns a pointer to 65 /// the start of the string. The result of this function can be used anywhere 66 /// where the C code specifies const char*. 67 llvm::Constant *makeConstantString(const std::string &Str, 68 const std::string &Name = "", 69 const std::string &SectionName = "", 70 unsigned Alignment = 0) { 71 llvm::Constant *Zeros[] = {llvm::ConstantInt::get(SizeTy, 0), 72 llvm::ConstantInt::get(SizeTy, 0)}; 73 auto ConstStr = CGM.GetAddrOfConstantCString(Str, Name.c_str()); 74 llvm::GlobalVariable *GV = 75 cast<llvm::GlobalVariable>(ConstStr.getPointer()); 76 if (!SectionName.empty()) 77 GV->setSection(SectionName); 78 if (Alignment) 79 GV->setAlignment(Alignment); 80 81 return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(), 82 ConstStr.getPointer(), Zeros); 83 } 84 85 /// Helper function that generates an empty dummy function returning void. 86 llvm::Function *makeDummyFunction(llvm::FunctionType *FnTy) { 87 assert(FnTy->getReturnType()->isVoidTy() && 88 "Can only generate dummy functions returning void!"); 89 llvm::Function *DummyFunc = llvm::Function::Create( 90 FnTy, llvm::GlobalValue::InternalLinkage, "dummy", &TheModule); 91 92 llvm::BasicBlock *DummyBlock = 93 llvm::BasicBlock::Create(Context, "", DummyFunc); 94 CGBuilderTy FuncBuilder(CGM, Context); 95 FuncBuilder.SetInsertPoint(DummyBlock); 96 FuncBuilder.CreateRetVoid(); 97 98 return DummyFunc; 99 } 100 101 void emitDeviceStubBody(CodeGenFunction &CGF, FunctionArgList &Args); 102 103 public: 104 CGNVCUDARuntime(CodeGenModule &CGM); 105 106 void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override; 107 void registerDeviceVar(llvm::GlobalVariable &Var, unsigned Flags) override { 108 DeviceVars.push_back(std::make_pair(&Var, Flags)); 109 } 110 111 /// Creates module constructor function 112 llvm::Function *makeModuleCtorFunction() override; 113 /// Creates module destructor function 114 llvm::Function *makeModuleDtorFunction() override; 115 }; 116 117 } 118 119 std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const { 120 if (CGM.getLangOpts().HIP) 121 return ((Twine("hip") + Twine(FuncName)).str()); 122 return ((Twine("cuda") + Twine(FuncName)).str()); 123 } 124 std::string 125 CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const { 126 if (CGM.getLangOpts().HIP) 127 return ((Twine("__hip") + Twine(FuncName)).str()); 128 return ((Twine("__cuda") + Twine(FuncName)).str()); 129 } 130 131 CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM) 132 : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()), 133 TheModule(CGM.getModule()), 134 RelocatableDeviceCode(CGM.getLangOpts().CUDARelocatableDeviceCode) { 135 CodeGen::CodeGenTypes &Types = CGM.getTypes(); 136 ASTContext &Ctx = CGM.getContext(); 137 138 IntTy = CGM.IntTy; 139 SizeTy = CGM.SizeTy; 140 VoidTy = CGM.VoidTy; 141 142 CharPtrTy = llvm::PointerType::getUnqual(Types.ConvertType(Ctx.CharTy)); 143 VoidPtrTy = cast<llvm::PointerType>(Types.ConvertType(Ctx.VoidPtrTy)); 144 VoidPtrPtrTy = VoidPtrTy->getPointerTo(); 145 } 146 147 llvm::Constant *CGNVCUDARuntime::getSetupArgumentFn() const { 148 // cudaError_t cudaSetupArgument(void *, size_t, size_t) 149 llvm::Type *Params[] = {VoidPtrTy, SizeTy, SizeTy}; 150 return CGM.CreateRuntimeFunction( 151 llvm::FunctionType::get(IntTy, Params, false), 152 addPrefixToName("SetupArgument")); 153 } 154 155 llvm::Constant *CGNVCUDARuntime::getLaunchFn() const { 156 if (CGM.getLangOpts().HIP) { 157 // hipError_t hipLaunchByPtr(char *); 158 return CGM.CreateRuntimeFunction( 159 llvm::FunctionType::get(IntTy, CharPtrTy, false), "hipLaunchByPtr"); 160 } else { 161 // cudaError_t cudaLaunch(char *); 162 return CGM.CreateRuntimeFunction( 163 llvm::FunctionType::get(IntTy, CharPtrTy, false), "cudaLaunch"); 164 } 165 } 166 167 llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy() const { 168 return llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false); 169 } 170 171 llvm::FunctionType *CGNVCUDARuntime::getCallbackFnTy() const { 172 return llvm::FunctionType::get(VoidTy, VoidPtrTy, false); 173 } 174 175 llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy() const { 176 auto CallbackFnTy = getCallbackFnTy(); 177 auto RegisterGlobalsFnTy = getRegisterGlobalsFnTy(); 178 llvm::Type *Params[] = {RegisterGlobalsFnTy->getPointerTo(), VoidPtrTy, 179 VoidPtrTy, CallbackFnTy->getPointerTo()}; 180 return llvm::FunctionType::get(VoidTy, Params, false); 181 } 182 183 void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF, 184 FunctionArgList &Args) { 185 EmittedKernels.push_back(CGF.CurFn); 186 emitDeviceStubBody(CGF, Args); 187 } 188 189 void CGNVCUDARuntime::emitDeviceStubBody(CodeGenFunction &CGF, 190 FunctionArgList &Args) { 191 // Emit a call to cudaSetupArgument for each arg in Args. 192 llvm::Constant *cudaSetupArgFn = getSetupArgumentFn(); 193 llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end"); 194 CharUnits Offset = CharUnits::Zero(); 195 for (const VarDecl *A : Args) { 196 CharUnits TyWidth, TyAlign; 197 std::tie(TyWidth, TyAlign) = 198 CGM.getContext().getTypeInfoInChars(A->getType()); 199 Offset = Offset.alignTo(TyAlign); 200 llvm::Value *Args[] = { 201 CGF.Builder.CreatePointerCast(CGF.GetAddrOfLocalVar(A).getPointer(), 202 VoidPtrTy), 203 llvm::ConstantInt::get(SizeTy, TyWidth.getQuantity()), 204 llvm::ConstantInt::get(SizeTy, Offset.getQuantity()), 205 }; 206 llvm::CallSite CS = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args); 207 llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0); 208 llvm::Value *CSZero = CGF.Builder.CreateICmpEQ(CS.getInstruction(), Zero); 209 llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next"); 210 CGF.Builder.CreateCondBr(CSZero, NextBlock, EndBlock); 211 CGF.EmitBlock(NextBlock); 212 Offset += TyWidth; 213 } 214 215 // Emit the call to cudaLaunch 216 llvm::Constant *cudaLaunchFn = getLaunchFn(); 217 llvm::Value *Arg = CGF.Builder.CreatePointerCast(CGF.CurFn, CharPtrTy); 218 CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg); 219 CGF.EmitBranch(EndBlock); 220 221 CGF.EmitBlock(EndBlock); 222 } 223 224 /// Creates a function that sets up state on the host side for CUDA objects that 225 /// have a presence on both the host and device sides. Specifically, registers 226 /// the host side of kernel functions and device global variables with the CUDA 227 /// runtime. 228 /// \code 229 /// void __cuda_register_globals(void** GpuBinaryHandle) { 230 /// __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...); 231 /// ... 232 /// __cudaRegisterFunction(GpuBinaryHandle,KernelM,...); 233 /// __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...); 234 /// ... 235 /// __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...); 236 /// } 237 /// \endcode 238 llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() { 239 // No need to register anything 240 if (EmittedKernels.empty() && DeviceVars.empty()) 241 return nullptr; 242 243 llvm::Function *RegisterKernelsFunc = llvm::Function::Create( 244 getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage, 245 addUnderscoredPrefixToName("_register_globals"), &TheModule); 246 llvm::BasicBlock *EntryBB = 247 llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc); 248 CGBuilderTy Builder(CGM, Context); 249 Builder.SetInsertPoint(EntryBB); 250 251 // void __cudaRegisterFunction(void **, const char *, char *, const char *, 252 // int, uint3*, uint3*, dim3*, dim3*, int*) 253 llvm::Type *RegisterFuncParams[] = { 254 VoidPtrPtrTy, CharPtrTy, CharPtrTy, CharPtrTy, IntTy, 255 VoidPtrTy, VoidPtrTy, VoidPtrTy, VoidPtrTy, IntTy->getPointerTo()}; 256 llvm::Constant *RegisterFunc = CGM.CreateRuntimeFunction( 257 llvm::FunctionType::get(IntTy, RegisterFuncParams, false), 258 addUnderscoredPrefixToName("RegisterFunction")); 259 260 // Extract GpuBinaryHandle passed as the first argument passed to 261 // __cuda_register_globals() and generate __cudaRegisterFunction() call for 262 // each emitted kernel. 263 llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin(); 264 for (llvm::Function *Kernel : EmittedKernels) { 265 llvm::Constant *KernelName = makeConstantString(Kernel->getName()); 266 llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy); 267 llvm::Value *Args[] = { 268 &GpuBinaryHandlePtr, Builder.CreateBitCast(Kernel, VoidPtrTy), 269 KernelName, KernelName, llvm::ConstantInt::get(IntTy, -1), NullPtr, 270 NullPtr, NullPtr, NullPtr, 271 llvm::ConstantPointerNull::get(IntTy->getPointerTo())}; 272 Builder.CreateCall(RegisterFunc, Args); 273 } 274 275 // void __cudaRegisterVar(void **, char *, char *, const char *, 276 // int, int, int, int) 277 llvm::Type *RegisterVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy, 278 CharPtrTy, IntTy, IntTy, 279 IntTy, IntTy}; 280 llvm::Constant *RegisterVar = CGM.CreateRuntimeFunction( 281 llvm::FunctionType::get(IntTy, RegisterVarParams, false), 282 addUnderscoredPrefixToName("RegisterVar")); 283 for (auto &Pair : DeviceVars) { 284 llvm::GlobalVariable *Var = Pair.first; 285 unsigned Flags = Pair.second; 286 llvm::Constant *VarName = makeConstantString(Var->getName()); 287 uint64_t VarSize = 288 CGM.getDataLayout().getTypeAllocSize(Var->getValueType()); 289 llvm::Value *Args[] = { 290 &GpuBinaryHandlePtr, 291 Builder.CreateBitCast(Var, VoidPtrTy), 292 VarName, 293 VarName, 294 llvm::ConstantInt::get(IntTy, (Flags & ExternDeviceVar) ? 1 : 0), 295 llvm::ConstantInt::get(IntTy, VarSize), 296 llvm::ConstantInt::get(IntTy, (Flags & ConstantDeviceVar) ? 1 : 0), 297 llvm::ConstantInt::get(IntTy, 0)}; 298 Builder.CreateCall(RegisterVar, Args); 299 } 300 301 Builder.CreateRetVoid(); 302 return RegisterKernelsFunc; 303 } 304 305 /// Creates a global constructor function for the module: 306 /// \code 307 /// void __cuda_module_ctor(void*) { 308 /// Handle = __cudaRegisterFatBinary(GpuBinaryBlob); 309 /// __cuda_register_globals(Handle); 310 /// } 311 /// \endcode 312 llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { 313 // No need to generate ctors/dtors if there is no GPU binary. 314 std::string GpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName; 315 if (GpuBinaryFileName.empty()) 316 return nullptr; 317 318 // void __cuda_register_globals(void* handle); 319 llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn(); 320 // We always need a function to pass in as callback. Create a dummy 321 // implementation if we don't need to register anything. 322 if (RelocatableDeviceCode && !RegisterGlobalsFunc) 323 RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy()); 324 325 // void ** __cudaRegisterFatBinary(void *); 326 llvm::Constant *RegisterFatbinFunc = CGM.CreateRuntimeFunction( 327 llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false), 328 addUnderscoredPrefixToName("RegisterFatBinary")); 329 // struct { int magic, int version, void * gpu_binary, void * dont_care }; 330 llvm::StructType *FatbinWrapperTy = 331 llvm::StructType::get(IntTy, IntTy, VoidPtrTy, VoidPtrTy); 332 333 // Register GPU binary with the CUDA runtime, store returned handle in a 334 // global variable and save a reference in GpuBinaryHandle to be cleaned up 335 // in destructor on exit. Then associate all known kernels with the GPU binary 336 // handle so CUDA runtime can figure out what to call on the GPU side. 337 llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> GpuBinaryOrErr = 338 llvm::MemoryBuffer::getFileOrSTDIN(GpuBinaryFileName); 339 if (std::error_code EC = GpuBinaryOrErr.getError()) { 340 CGM.getDiags().Report(diag::err_cannot_open_file) 341 << GpuBinaryFileName << EC.message(); 342 return nullptr; 343 } 344 345 llvm::Function *ModuleCtorFunc = llvm::Function::Create( 346 llvm::FunctionType::get(VoidTy, VoidPtrTy, false), 347 llvm::GlobalValue::InternalLinkage, 348 addUnderscoredPrefixToName("_module_ctor"), &TheModule); 349 llvm::BasicBlock *CtorEntryBB = 350 llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc); 351 CGBuilderTy CtorBuilder(CGM, Context); 352 353 CtorBuilder.SetInsertPoint(CtorEntryBB); 354 355 const char *FatbinConstantName; 356 if (RelocatableDeviceCode) 357 // TODO: Figure out how this is called on mac OS! 358 FatbinConstantName = "__nv_relfatbin"; 359 else 360 FatbinConstantName = 361 CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin"; 362 // NVIDIA's cuobjdump looks for fatbins in this section. 363 const char *FatbinSectionName = 364 CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment"; 365 // TODO: Figure out how this is called on mac OS! 366 const char *NVModuleIDSectionName = "__nv_module_id"; 367 368 // Create initialized wrapper structure that points to the loaded GPU binary 369 ConstantInitBuilder Builder(CGM); 370 auto Values = Builder.beginStruct(FatbinWrapperTy); 371 // Fatbin wrapper magic. 372 Values.addInt(IntTy, 0x466243b1); 373 // Fatbin version. 374 Values.addInt(IntTy, 1); 375 // Data. 376 Values.add(makeConstantString(GpuBinaryOrErr.get()->getBuffer(), "", 377 FatbinConstantName, 8)); 378 // Unused in fatbin v1. 379 Values.add(llvm::ConstantPointerNull::get(VoidPtrTy)); 380 llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal( 381 addUnderscoredPrefixToName("_fatbin_wrapper"), CGM.getPointerAlign(), 382 /*constant*/ true); 383 FatbinWrapper->setSection(FatbinSectionName); 384 385 // Register binary with CUDA runtime. This is substantially different in 386 // default mode vs. separate compilation! 387 if (!RelocatableDeviceCode) { 388 // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper); 389 llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall( 390 RegisterFatbinFunc, 391 CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy)); 392 GpuBinaryHandle = new llvm::GlobalVariable( 393 TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage, 394 llvm::ConstantPointerNull::get(VoidPtrPtrTy), 395 addUnderscoredPrefixToName("_gpubin_handle")); 396 397 CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle, 398 CGM.getPointerAlign()); 399 400 // Call __cuda_register_globals(GpuBinaryHandle); 401 if (RegisterGlobalsFunc) 402 CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall); 403 } else { 404 // Generate a unique module ID. 405 SmallString<64> NVModuleID; 406 llvm::raw_svector_ostream OS(NVModuleID); 407 OS << "__nv_" << llvm::format("%x", FatbinWrapper->getGUID()); 408 llvm::Constant *NVModuleIDConstant = 409 makeConstantString(NVModuleID.str(), "", NVModuleIDSectionName, 32); 410 411 // Create an alias for the FatbinWrapper that nvcc will look for. 412 llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage, 413 Twine("__fatbinwrap") + NVModuleID, 414 FatbinWrapper); 415 416 // void __cudaRegisterLinkedBinary%NVModuleID%(void (*)(void *), void *, 417 // void *, void (*)(void **)) 418 SmallString<128> RegisterLinkedBinaryName( 419 addUnderscoredPrefixToName("RegisterLinkedBinary")); 420 RegisterLinkedBinaryName += NVModuleID; 421 llvm::Constant *RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction( 422 getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName); 423 424 assert(RegisterGlobalsFunc && "Expecting at least dummy function!"); 425 llvm::Value *Args[] = {RegisterGlobalsFunc, 426 CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy), 427 NVModuleIDConstant, 428 makeDummyFunction(getCallbackFnTy())}; 429 CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args); 430 } 431 432 CtorBuilder.CreateRetVoid(); 433 return ModuleCtorFunc; 434 } 435 436 /// Creates a global destructor function that unregisters the GPU code blob 437 /// registered by constructor. 438 /// \code 439 /// void __cuda_module_dtor(void*) { 440 /// __cudaUnregisterFatBinary(Handle); 441 /// } 442 /// \endcode 443 llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() { 444 // No need for destructor if we don't have a handle to unregister. 445 if (!GpuBinaryHandle) 446 return nullptr; 447 448 // void __cudaUnregisterFatBinary(void ** handle); 449 llvm::Constant *UnregisterFatbinFunc = CGM.CreateRuntimeFunction( 450 llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false), 451 addUnderscoredPrefixToName("UnregisterFatBinary")); 452 453 llvm::Function *ModuleDtorFunc = llvm::Function::Create( 454 llvm::FunctionType::get(VoidTy, VoidPtrTy, false), 455 llvm::GlobalValue::InternalLinkage, 456 addUnderscoredPrefixToName("_module_dtor"), &TheModule); 457 458 llvm::BasicBlock *DtorEntryBB = 459 llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc); 460 CGBuilderTy DtorBuilder(CGM, Context); 461 DtorBuilder.SetInsertPoint(DtorEntryBB); 462 463 auto HandleValue = 464 DtorBuilder.CreateAlignedLoad(GpuBinaryHandle, CGM.getPointerAlign()); 465 DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue); 466 467 DtorBuilder.CreateRetVoid(); 468 return ModuleDtorFunc; 469 } 470 471 CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) { 472 return new CGNVCUDARuntime(CGM); 473 } 474