1 //===----- CGCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This provides a class for CUDA code generation targeting the NVIDIA CUDA 11 // runtime library. 12 // 13 //===----------------------------------------------------------------------===// 14 15 #include "CGCUDARuntime.h" 16 #include "CodeGenFunction.h" 17 #include "CodeGenModule.h" 18 #include "clang/AST/Decl.h" 19 #include "llvm/IR/BasicBlock.h" 20 #include "llvm/IR/CallSite.h" 21 #include "llvm/IR/Constants.h" 22 #include "llvm/IR/DerivedTypes.h" 23 24 using namespace clang; 25 using namespace CodeGen; 26 27 namespace { 28 29 class CGNVCUDARuntime : public CGCUDARuntime { 30 31 private: 32 llvm::Type *IntTy, *SizeTy, *VoidTy; 33 llvm::PointerType *CharPtrTy, *VoidPtrTy, *VoidPtrPtrTy; 34 35 /// Convenience reference to LLVM Context 36 llvm::LLVMContext &Context; 37 /// Convenience reference to the current module 38 llvm::Module &TheModule; 39 /// Keeps track of kernel launch stubs emitted in this module 40 llvm::SmallVector<llvm::Function *, 16> EmittedKernels; 41 llvm::SmallVector<std::pair<llvm::GlobalVariable *, unsigned>, 16> DeviceVars; 42 /// Keeps track of variables containing handles of GPU binaries. Populated by 43 /// ModuleCtorFunction() and used to create corresponding cleanup calls in 44 /// ModuleDtorFunction() 45 llvm::SmallVector<llvm::GlobalVariable *, 16> GpuBinaryHandles; 46 47 llvm::Constant *getSetupArgumentFn() const; 48 llvm::Constant *getLaunchFn() const; 49 50 /// Creates a function to register all kernel stubs generated in this module. 51 llvm::Function *makeRegisterGlobalsFn(); 52 53 /// Helper function that generates a constant string and returns a pointer to 54 /// the start of the string. The result of this function can be used anywhere 55 /// where the C code specifies const char*. 56 llvm::Constant *makeConstantString(const std::string &Str, 57 const std::string &Name = "", 58 const std::string &SectionName = "", 59 unsigned Alignment = 0) { 60 llvm::Constant *Zeros[] = {llvm::ConstantInt::get(SizeTy, 0), 61 llvm::ConstantInt::get(SizeTy, 0)}; 62 auto ConstStr = CGM.GetAddrOfConstantCString(Str, Name.c_str()); 63 llvm::GlobalVariable *GV = 64 cast<llvm::GlobalVariable>(ConstStr.getPointer()); 65 if (!SectionName.empty()) 66 GV->setSection(SectionName); 67 if (Alignment) 68 GV->setAlignment(Alignment); 69 70 return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(), 71 ConstStr.getPointer(), Zeros); 72 } 73 74 void emitDeviceStubBody(CodeGenFunction &CGF, FunctionArgList &Args); 75 76 public: 77 CGNVCUDARuntime(CodeGenModule &CGM); 78 79 void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override; 80 void registerDeviceVar(llvm::GlobalVariable &Var, unsigned Flags) override { 81 DeviceVars.push_back(std::make_pair(&Var, Flags)); 82 } 83 84 /// Creates module constructor function 85 llvm::Function *makeModuleCtorFunction() override; 86 /// Creates module destructor function 87 llvm::Function *makeModuleDtorFunction() override; 88 }; 89 90 } 91 92 CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM) 93 : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()), 94 TheModule(CGM.getModule()) { 95 CodeGen::CodeGenTypes &Types = CGM.getTypes(); 96 ASTContext &Ctx = CGM.getContext(); 97 98 IntTy = Types.ConvertType(Ctx.IntTy); 99 SizeTy = Types.ConvertType(Ctx.getSizeType()); 100 VoidTy = llvm::Type::getVoidTy(Context); 101 102 CharPtrTy = llvm::PointerType::getUnqual(Types.ConvertType(Ctx.CharTy)); 103 VoidPtrTy = cast<llvm::PointerType>(Types.ConvertType(Ctx.VoidPtrTy)); 104 VoidPtrPtrTy = VoidPtrTy->getPointerTo(); 105 } 106 107 llvm::Constant *CGNVCUDARuntime::getSetupArgumentFn() const { 108 // cudaError_t cudaSetupArgument(void *, size_t, size_t) 109 llvm::Type *Params[] = {VoidPtrTy, SizeTy, SizeTy}; 110 return CGM.CreateRuntimeFunction(llvm::FunctionType::get(IntTy, 111 Params, false), 112 "cudaSetupArgument"); 113 } 114 115 llvm::Constant *CGNVCUDARuntime::getLaunchFn() const { 116 // cudaError_t cudaLaunch(char *) 117 return CGM.CreateRuntimeFunction( 118 llvm::FunctionType::get(IntTy, CharPtrTy, false), "cudaLaunch"); 119 } 120 121 void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF, 122 FunctionArgList &Args) { 123 EmittedKernels.push_back(CGF.CurFn); 124 emitDeviceStubBody(CGF, Args); 125 } 126 127 void CGNVCUDARuntime::emitDeviceStubBody(CodeGenFunction &CGF, 128 FunctionArgList &Args) { 129 // Emit a call to cudaSetupArgument for each arg in Args. 130 llvm::Constant *cudaSetupArgFn = getSetupArgumentFn(); 131 llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end"); 132 CharUnits Offset = CharUnits::Zero(); 133 for (const VarDecl *A : Args) { 134 CharUnits TyWidth, TyAlign; 135 std::tie(TyWidth, TyAlign) = 136 CGM.getContext().getTypeInfoInChars(A->getType()); 137 Offset = Offset.alignTo(TyAlign); 138 llvm::Value *Args[] = { 139 CGF.Builder.CreatePointerCast(CGF.GetAddrOfLocalVar(A).getPointer(), 140 VoidPtrTy), 141 llvm::ConstantInt::get(SizeTy, TyWidth.getQuantity()), 142 llvm::ConstantInt::get(SizeTy, Offset.getQuantity()), 143 }; 144 llvm::CallSite CS = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args); 145 llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0); 146 llvm::Value *CSZero = CGF.Builder.CreateICmpEQ(CS.getInstruction(), Zero); 147 llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next"); 148 CGF.Builder.CreateCondBr(CSZero, NextBlock, EndBlock); 149 CGF.EmitBlock(NextBlock); 150 Offset += TyWidth; 151 } 152 153 // Emit the call to cudaLaunch 154 llvm::Constant *cudaLaunchFn = getLaunchFn(); 155 llvm::Value *Arg = CGF.Builder.CreatePointerCast(CGF.CurFn, CharPtrTy); 156 CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg); 157 CGF.EmitBranch(EndBlock); 158 159 CGF.EmitBlock(EndBlock); 160 } 161 162 /// Creates a function that sets up state on the host side for CUDA objects that 163 /// have a presence on both the host and device sides. Specifically, registers 164 /// the host side of kernel functions and device global variables with the CUDA 165 /// runtime. 166 /// \code 167 /// void __cuda_register_globals(void** GpuBinaryHandle) { 168 /// __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...); 169 /// ... 170 /// __cudaRegisterFunction(GpuBinaryHandle,KernelM,...); 171 /// __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...); 172 /// ... 173 /// __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...); 174 /// } 175 /// \endcode 176 llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() { 177 // No need to register anything 178 if (EmittedKernels.empty() && DeviceVars.empty()) 179 return nullptr; 180 181 llvm::Function *RegisterKernelsFunc = llvm::Function::Create( 182 llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false), 183 llvm::GlobalValue::InternalLinkage, "__cuda_register_globals", &TheModule); 184 llvm::BasicBlock *EntryBB = 185 llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc); 186 CGBuilderTy Builder(CGM, Context); 187 Builder.SetInsertPoint(EntryBB); 188 189 // void __cudaRegisterFunction(void **, const char *, char *, const char *, 190 // int, uint3*, uint3*, dim3*, dim3*, int*) 191 llvm::Type *RegisterFuncParams[] = { 192 VoidPtrPtrTy, CharPtrTy, CharPtrTy, CharPtrTy, IntTy, 193 VoidPtrTy, VoidPtrTy, VoidPtrTy, VoidPtrTy, IntTy->getPointerTo()}; 194 llvm::Constant *RegisterFunc = CGM.CreateRuntimeFunction( 195 llvm::FunctionType::get(IntTy, RegisterFuncParams, false), 196 "__cudaRegisterFunction"); 197 198 // Extract GpuBinaryHandle passed as the first argument passed to 199 // __cuda_register_globals() and generate __cudaRegisterFunction() call for 200 // each emitted kernel. 201 llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin(); 202 for (llvm::Function *Kernel : EmittedKernels) { 203 llvm::Constant *KernelName = makeConstantString(Kernel->getName()); 204 llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy); 205 llvm::Value *Args[] = { 206 &GpuBinaryHandlePtr, Builder.CreateBitCast(Kernel, VoidPtrTy), 207 KernelName, KernelName, llvm::ConstantInt::get(IntTy, -1), NullPtr, 208 NullPtr, NullPtr, NullPtr, 209 llvm::ConstantPointerNull::get(IntTy->getPointerTo())}; 210 Builder.CreateCall(RegisterFunc, Args); 211 } 212 213 // void __cudaRegisterVar(void **, char *, char *, const char *, 214 // int, int, int, int) 215 llvm::Type *RegisterVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy, 216 CharPtrTy, IntTy, IntTy, 217 IntTy, IntTy}; 218 llvm::Constant *RegisterVar = CGM.CreateRuntimeFunction( 219 llvm::FunctionType::get(IntTy, RegisterVarParams, false), 220 "__cudaRegisterVar"); 221 for (auto &Pair : DeviceVars) { 222 llvm::GlobalVariable *Var = Pair.first; 223 unsigned Flags = Pair.second; 224 llvm::Constant *VarName = makeConstantString(Var->getName()); 225 uint64_t VarSize = 226 CGM.getDataLayout().getTypeAllocSize(Var->getValueType()); 227 llvm::Value *Args[] = { 228 &GpuBinaryHandlePtr, 229 Builder.CreateBitCast(Var, VoidPtrTy), 230 VarName, 231 VarName, 232 llvm::ConstantInt::get(IntTy, (Flags & ExternDeviceVar) ? 1 : 0), 233 llvm::ConstantInt::get(IntTy, VarSize), 234 llvm::ConstantInt::get(IntTy, (Flags & ConstantDeviceVar) ? 1 : 0), 235 llvm::ConstantInt::get(IntTy, 0)}; 236 Builder.CreateCall(RegisterVar, Args); 237 } 238 239 Builder.CreateRetVoid(); 240 return RegisterKernelsFunc; 241 } 242 243 /// Creates a global constructor function for the module: 244 /// \code 245 /// void __cuda_module_ctor(void*) { 246 /// Handle0 = __cudaRegisterFatBinary(GpuBinaryBlob0); 247 /// __cuda_register_globals(Handle0); 248 /// ... 249 /// HandleN = __cudaRegisterFatBinary(GpuBinaryBlobN); 250 /// __cuda_register_globals(HandleN); 251 /// } 252 /// \endcode 253 llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { 254 // No need to generate ctors/dtors if there are no GPU binaries. 255 if (CGM.getCodeGenOpts().CudaGpuBinaryFileNames.empty()) 256 return nullptr; 257 258 // void __cuda_register_globals(void* handle); 259 llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn(); 260 // void ** __cudaRegisterFatBinary(void *); 261 llvm::Constant *RegisterFatbinFunc = CGM.CreateRuntimeFunction( 262 llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false), 263 "__cudaRegisterFatBinary"); 264 // struct { int magic, int version, void * gpu_binary, void * dont_care }; 265 llvm::StructType *FatbinWrapperTy = 266 llvm::StructType::get(IntTy, IntTy, VoidPtrTy, VoidPtrTy, nullptr); 267 268 llvm::Function *ModuleCtorFunc = llvm::Function::Create( 269 llvm::FunctionType::get(VoidTy, VoidPtrTy, false), 270 llvm::GlobalValue::InternalLinkage, "__cuda_module_ctor", &TheModule); 271 llvm::BasicBlock *CtorEntryBB = 272 llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc); 273 CGBuilderTy CtorBuilder(CGM, Context); 274 275 CtorBuilder.SetInsertPoint(CtorEntryBB); 276 277 // For each GPU binary, register it with the CUDA runtime and store returned 278 // handle in a global variable and save the handle in GpuBinaryHandles vector 279 // to be cleaned up in destructor on exit. Then associate all known kernels 280 // with the GPU binary handle so CUDA runtime can figure out what to call on 281 // the GPU side. 282 for (const std::string &GpuBinaryFileName : 283 CGM.getCodeGenOpts().CudaGpuBinaryFileNames) { 284 llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> GpuBinaryOrErr = 285 llvm::MemoryBuffer::getFileOrSTDIN(GpuBinaryFileName); 286 if (std::error_code EC = GpuBinaryOrErr.getError()) { 287 CGM.getDiags().Report(diag::err_cannot_open_file) << GpuBinaryFileName 288 << EC.message(); 289 continue; 290 } 291 292 // Create initialized wrapper structure that points to the loaded GPU binary 293 llvm::Constant *Values[] = { 294 llvm::ConstantInt::get(IntTy, 0x466243b1), // Fatbin wrapper magic. 295 llvm::ConstantInt::get(IntTy, 1), // Fatbin version. 296 makeConstantString(GpuBinaryOrErr.get()->getBuffer(), // Data. 297 "", ".nv_fatbin", 8), // 298 llvm::ConstantPointerNull::get(VoidPtrTy)}; // Unused in fatbin v1. 299 llvm::GlobalVariable *FatbinWrapper = new llvm::GlobalVariable( 300 TheModule, FatbinWrapperTy, true, llvm::GlobalValue::InternalLinkage, 301 llvm::ConstantStruct::get(FatbinWrapperTy, Values), 302 "__cuda_fatbin_wrapper"); 303 // NVIDIA's cuobjdump looks for fatbins in this section. 304 FatbinWrapper->setSection(".nvFatBinSegment"); 305 306 // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper); 307 llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall( 308 RegisterFatbinFunc, 309 CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy)); 310 llvm::GlobalVariable *GpuBinaryHandle = new llvm::GlobalVariable( 311 TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage, 312 llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle"); 313 CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle, 314 CGM.getPointerAlign()); 315 316 // Call __cuda_register_globals(GpuBinaryHandle); 317 if (RegisterGlobalsFunc) 318 CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall); 319 320 // Save GpuBinaryHandle so we can unregister it in destructor. 321 GpuBinaryHandles.push_back(GpuBinaryHandle); 322 } 323 324 CtorBuilder.CreateRetVoid(); 325 return ModuleCtorFunc; 326 } 327 328 /// Creates a global destructor function that unregisters all GPU code blobs 329 /// registered by constructor. 330 /// \code 331 /// void __cuda_module_dtor(void*) { 332 /// __cudaUnregisterFatBinary(Handle0); 333 /// ... 334 /// __cudaUnregisterFatBinary(HandleN); 335 /// } 336 /// \endcode 337 llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() { 338 // No need for destructor if we don't have handles to unregister. 339 if (GpuBinaryHandles.empty()) 340 return nullptr; 341 342 // void __cudaUnregisterFatBinary(void ** handle); 343 llvm::Constant *UnregisterFatbinFunc = CGM.CreateRuntimeFunction( 344 llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false), 345 "__cudaUnregisterFatBinary"); 346 347 llvm::Function *ModuleDtorFunc = llvm::Function::Create( 348 llvm::FunctionType::get(VoidTy, VoidPtrTy, false), 349 llvm::GlobalValue::InternalLinkage, "__cuda_module_dtor", &TheModule); 350 llvm::BasicBlock *DtorEntryBB = 351 llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc); 352 CGBuilderTy DtorBuilder(CGM, Context); 353 DtorBuilder.SetInsertPoint(DtorEntryBB); 354 355 for (llvm::GlobalVariable *GpuBinaryHandle : GpuBinaryHandles) { 356 auto HandleValue = 357 DtorBuilder.CreateAlignedLoad(GpuBinaryHandle, CGM.getPointerAlign()); 358 DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue); 359 } 360 361 DtorBuilder.CreateRetVoid(); 362 return ModuleDtorFunc; 363 } 364 365 CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) { 366 return new CGNVCUDARuntime(CGM); 367 } 368