1 //===----- CGCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This provides a class for CUDA code generation targeting the NVIDIA CUDA 11 // runtime library. 12 // 13 //===----------------------------------------------------------------------===// 14 15 #include "CGCUDARuntime.h" 16 #include "CodeGenFunction.h" 17 #include "CodeGenModule.h" 18 #include "clang/CodeGen/ConstantInitBuilder.h" 19 #include "clang/AST/Decl.h" 20 #include "llvm/IR/BasicBlock.h" 21 #include "llvm/IR/CallSite.h" 22 #include "llvm/IR/Constants.h" 23 #include "llvm/IR/DerivedTypes.h" 24 25 using namespace clang; 26 using namespace CodeGen; 27 28 namespace { 29 30 class CGNVCUDARuntime : public CGCUDARuntime { 31 32 private: 33 llvm::IntegerType *IntTy, *SizeTy; 34 llvm::Type *VoidTy; 35 llvm::PointerType *CharPtrTy, *VoidPtrTy, *VoidPtrPtrTy; 36 37 /// Convenience reference to LLVM Context 38 llvm::LLVMContext &Context; 39 /// Convenience reference to the current module 40 llvm::Module &TheModule; 41 /// Keeps track of kernel launch stubs emitted in this module 42 llvm::SmallVector<llvm::Function *, 16> EmittedKernels; 43 llvm::SmallVector<std::pair<llvm::GlobalVariable *, unsigned>, 16> DeviceVars; 44 /// Keeps track of variable containing handle of GPU binary. Populated by 45 /// ModuleCtorFunction() and used to create corresponding cleanup calls in 46 /// ModuleDtorFunction() 47 llvm::GlobalVariable *GpuBinaryHandle = nullptr; 48 49 llvm::Constant *getSetupArgumentFn() const; 50 llvm::Constant *getLaunchFn() const; 51 52 /// Creates a function to register all kernel stubs generated in this module. 53 llvm::Function *makeRegisterGlobalsFn(); 54 55 /// Helper function that generates a constant string and returns a pointer to 56 /// the start of the string. The result of this function can be used anywhere 57 /// where the C code specifies const char*. 58 llvm::Constant *makeConstantString(const std::string &Str, 59 const std::string &Name = "", 60 const std::string &SectionName = "", 61 unsigned Alignment = 0) { 62 llvm::Constant *Zeros[] = {llvm::ConstantInt::get(SizeTy, 0), 63 llvm::ConstantInt::get(SizeTy, 0)}; 64 auto ConstStr = CGM.GetAddrOfConstantCString(Str, Name.c_str()); 65 llvm::GlobalVariable *GV = 66 cast<llvm::GlobalVariable>(ConstStr.getPointer()); 67 if (!SectionName.empty()) 68 GV->setSection(SectionName); 69 if (Alignment) 70 GV->setAlignment(Alignment); 71 72 return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(), 73 ConstStr.getPointer(), Zeros); 74 } 75 76 void emitDeviceStubBody(CodeGenFunction &CGF, FunctionArgList &Args); 77 78 public: 79 CGNVCUDARuntime(CodeGenModule &CGM); 80 81 void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override; 82 void registerDeviceVar(llvm::GlobalVariable &Var, unsigned Flags) override { 83 DeviceVars.push_back(std::make_pair(&Var, Flags)); 84 } 85 86 /// Creates module constructor function 87 llvm::Function *makeModuleCtorFunction() override; 88 /// Creates module destructor function 89 llvm::Function *makeModuleDtorFunction() override; 90 }; 91 92 } 93 94 CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM) 95 : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()), 96 TheModule(CGM.getModule()) { 97 CodeGen::CodeGenTypes &Types = CGM.getTypes(); 98 ASTContext &Ctx = CGM.getContext(); 99 100 IntTy = CGM.IntTy; 101 SizeTy = CGM.SizeTy; 102 VoidTy = CGM.VoidTy; 103 104 CharPtrTy = llvm::PointerType::getUnqual(Types.ConvertType(Ctx.CharTy)); 105 VoidPtrTy = cast<llvm::PointerType>(Types.ConvertType(Ctx.VoidPtrTy)); 106 VoidPtrPtrTy = VoidPtrTy->getPointerTo(); 107 } 108 109 llvm::Constant *CGNVCUDARuntime::getSetupArgumentFn() const { 110 // cudaError_t cudaSetupArgument(void *, size_t, size_t) 111 llvm::Type *Params[] = {VoidPtrTy, SizeTy, SizeTy}; 112 return CGM.CreateRuntimeFunction(llvm::FunctionType::get(IntTy, 113 Params, false), 114 "cudaSetupArgument"); 115 } 116 117 llvm::Constant *CGNVCUDARuntime::getLaunchFn() const { 118 // cudaError_t cudaLaunch(char *) 119 return CGM.CreateRuntimeFunction( 120 llvm::FunctionType::get(IntTy, CharPtrTy, false), "cudaLaunch"); 121 } 122 123 void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF, 124 FunctionArgList &Args) { 125 EmittedKernels.push_back(CGF.CurFn); 126 emitDeviceStubBody(CGF, Args); 127 } 128 129 void CGNVCUDARuntime::emitDeviceStubBody(CodeGenFunction &CGF, 130 FunctionArgList &Args) { 131 // Emit a call to cudaSetupArgument for each arg in Args. 132 llvm::Constant *cudaSetupArgFn = getSetupArgumentFn(); 133 llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end"); 134 CharUnits Offset = CharUnits::Zero(); 135 for (const VarDecl *A : Args) { 136 CharUnits TyWidth, TyAlign; 137 std::tie(TyWidth, TyAlign) = 138 CGM.getContext().getTypeInfoInChars(A->getType()); 139 Offset = Offset.alignTo(TyAlign); 140 llvm::Value *Args[] = { 141 CGF.Builder.CreatePointerCast(CGF.GetAddrOfLocalVar(A).getPointer(), 142 VoidPtrTy), 143 llvm::ConstantInt::get(SizeTy, TyWidth.getQuantity()), 144 llvm::ConstantInt::get(SizeTy, Offset.getQuantity()), 145 }; 146 llvm::CallSite CS = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args); 147 llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0); 148 llvm::Value *CSZero = CGF.Builder.CreateICmpEQ(CS.getInstruction(), Zero); 149 llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next"); 150 CGF.Builder.CreateCondBr(CSZero, NextBlock, EndBlock); 151 CGF.EmitBlock(NextBlock); 152 Offset += TyWidth; 153 } 154 155 // Emit the call to cudaLaunch 156 llvm::Constant *cudaLaunchFn = getLaunchFn(); 157 llvm::Value *Arg = CGF.Builder.CreatePointerCast(CGF.CurFn, CharPtrTy); 158 CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg); 159 CGF.EmitBranch(EndBlock); 160 161 CGF.EmitBlock(EndBlock); 162 } 163 164 /// Creates a function that sets up state on the host side for CUDA objects that 165 /// have a presence on both the host and device sides. Specifically, registers 166 /// the host side of kernel functions and device global variables with the CUDA 167 /// runtime. 168 /// \code 169 /// void __cuda_register_globals(void** GpuBinaryHandle) { 170 /// __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...); 171 /// ... 172 /// __cudaRegisterFunction(GpuBinaryHandle,KernelM,...); 173 /// __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...); 174 /// ... 175 /// __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...); 176 /// } 177 /// \endcode 178 llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() { 179 // No need to register anything 180 if (EmittedKernels.empty() && DeviceVars.empty()) 181 return nullptr; 182 183 llvm::Function *RegisterKernelsFunc = llvm::Function::Create( 184 llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false), 185 llvm::GlobalValue::InternalLinkage, "__cuda_register_globals", &TheModule); 186 llvm::BasicBlock *EntryBB = 187 llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc); 188 CGBuilderTy Builder(CGM, Context); 189 Builder.SetInsertPoint(EntryBB); 190 191 // void __cudaRegisterFunction(void **, const char *, char *, const char *, 192 // int, uint3*, uint3*, dim3*, dim3*, int*) 193 llvm::Type *RegisterFuncParams[] = { 194 VoidPtrPtrTy, CharPtrTy, CharPtrTy, CharPtrTy, IntTy, 195 VoidPtrTy, VoidPtrTy, VoidPtrTy, VoidPtrTy, IntTy->getPointerTo()}; 196 llvm::Constant *RegisterFunc = CGM.CreateRuntimeFunction( 197 llvm::FunctionType::get(IntTy, RegisterFuncParams, false), 198 "__cudaRegisterFunction"); 199 200 // Extract GpuBinaryHandle passed as the first argument passed to 201 // __cuda_register_globals() and generate __cudaRegisterFunction() call for 202 // each emitted kernel. 203 llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin(); 204 for (llvm::Function *Kernel : EmittedKernels) { 205 llvm::Constant *KernelName = makeConstantString(Kernel->getName()); 206 llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy); 207 llvm::Value *Args[] = { 208 &GpuBinaryHandlePtr, Builder.CreateBitCast(Kernel, VoidPtrTy), 209 KernelName, KernelName, llvm::ConstantInt::get(IntTy, -1), NullPtr, 210 NullPtr, NullPtr, NullPtr, 211 llvm::ConstantPointerNull::get(IntTy->getPointerTo())}; 212 Builder.CreateCall(RegisterFunc, Args); 213 } 214 215 // void __cudaRegisterVar(void **, char *, char *, const char *, 216 // int, int, int, int) 217 llvm::Type *RegisterVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy, 218 CharPtrTy, IntTy, IntTy, 219 IntTy, IntTy}; 220 llvm::Constant *RegisterVar = CGM.CreateRuntimeFunction( 221 llvm::FunctionType::get(IntTy, RegisterVarParams, false), 222 "__cudaRegisterVar"); 223 for (auto &Pair : DeviceVars) { 224 llvm::GlobalVariable *Var = Pair.first; 225 unsigned Flags = Pair.second; 226 llvm::Constant *VarName = makeConstantString(Var->getName()); 227 uint64_t VarSize = 228 CGM.getDataLayout().getTypeAllocSize(Var->getValueType()); 229 llvm::Value *Args[] = { 230 &GpuBinaryHandlePtr, 231 Builder.CreateBitCast(Var, VoidPtrTy), 232 VarName, 233 VarName, 234 llvm::ConstantInt::get(IntTy, (Flags & ExternDeviceVar) ? 1 : 0), 235 llvm::ConstantInt::get(IntTy, VarSize), 236 llvm::ConstantInt::get(IntTy, (Flags & ConstantDeviceVar) ? 1 : 0), 237 llvm::ConstantInt::get(IntTy, 0)}; 238 Builder.CreateCall(RegisterVar, Args); 239 } 240 241 Builder.CreateRetVoid(); 242 return RegisterKernelsFunc; 243 } 244 245 /// Creates a global constructor function for the module: 246 /// \code 247 /// void __cuda_module_ctor(void*) { 248 /// Handle = __cudaRegisterFatBinary(GpuBinaryBlob); 249 /// __cuda_register_globals(Handle); 250 /// } 251 /// \endcode 252 llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { 253 // No need to generate ctors/dtors if there is no GPU binary. 254 std::string GpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName; 255 if (GpuBinaryFileName.empty()) 256 return nullptr; 257 258 // void __cuda_register_globals(void* handle); 259 llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn(); 260 // void ** __cudaRegisterFatBinary(void *); 261 llvm::Constant *RegisterFatbinFunc = CGM.CreateRuntimeFunction( 262 llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false), 263 "__cudaRegisterFatBinary"); 264 // struct { int magic, int version, void * gpu_binary, void * dont_care }; 265 llvm::StructType *FatbinWrapperTy = 266 llvm::StructType::get(IntTy, IntTy, VoidPtrTy, VoidPtrTy); 267 268 // Register GPU binary with the CUDA runtime, store returned handle in a 269 // global variable and save a reference in GpuBinaryHandle to be cleaned up 270 // in destructor on exit. Then associate all known kernels with the GPU binary 271 // handle so CUDA runtime can figure out what to call on the GPU side. 272 llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> GpuBinaryOrErr = 273 llvm::MemoryBuffer::getFileOrSTDIN(GpuBinaryFileName); 274 if (std::error_code EC = GpuBinaryOrErr.getError()) { 275 CGM.getDiags().Report(diag::err_cannot_open_file) 276 << GpuBinaryFileName << EC.message(); 277 return nullptr; 278 } 279 280 llvm::Function *ModuleCtorFunc = llvm::Function::Create( 281 llvm::FunctionType::get(VoidTy, VoidPtrTy, false), 282 llvm::GlobalValue::InternalLinkage, "__cuda_module_ctor", &TheModule); 283 llvm::BasicBlock *CtorEntryBB = 284 llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc); 285 CGBuilderTy CtorBuilder(CGM, Context); 286 287 CtorBuilder.SetInsertPoint(CtorEntryBB); 288 289 const char *FatbinConstantName = 290 CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin"; 291 // NVIDIA's cuobjdump looks for fatbins in this section. 292 const char *FatbinSectionName = 293 CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment"; 294 295 // Create initialized wrapper structure that points to the loaded GPU binary 296 ConstantInitBuilder Builder(CGM); 297 auto Values = Builder.beginStruct(FatbinWrapperTy); 298 // Fatbin wrapper magic. 299 Values.addInt(IntTy, 0x466243b1); 300 // Fatbin version. 301 Values.addInt(IntTy, 1); 302 // Data. 303 Values.add(makeConstantString(GpuBinaryOrErr.get()->getBuffer(), "", 304 FatbinConstantName, 8)); 305 // Unused in fatbin v1. 306 Values.add(llvm::ConstantPointerNull::get(VoidPtrTy)); 307 llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal( 308 "__cuda_fatbin_wrapper", CGM.getPointerAlign(), 309 /*constant*/ true); 310 FatbinWrapper->setSection(FatbinSectionName); 311 312 // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper); 313 llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall( 314 RegisterFatbinFunc, CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy)); 315 GpuBinaryHandle = new llvm::GlobalVariable( 316 TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage, 317 llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle"); 318 CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle, 319 CGM.getPointerAlign()); 320 321 // Call __cuda_register_globals(GpuBinaryHandle); 322 if (RegisterGlobalsFunc) 323 CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall); 324 325 CtorBuilder.CreateRetVoid(); 326 return ModuleCtorFunc; 327 } 328 329 /// Creates a global destructor function that unregisters the GPU code blob 330 /// registered by constructor. 331 /// \code 332 /// void __cuda_module_dtor(void*) { 333 /// __cudaUnregisterFatBinary(Handle); 334 /// } 335 /// \endcode 336 llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() { 337 // No need for destructor if we don't have a handle to unregister. 338 if (!GpuBinaryHandle) 339 return nullptr; 340 341 // void __cudaUnregisterFatBinary(void ** handle); 342 llvm::Constant *UnregisterFatbinFunc = CGM.CreateRuntimeFunction( 343 llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false), 344 "__cudaUnregisterFatBinary"); 345 346 llvm::Function *ModuleDtorFunc = llvm::Function::Create( 347 llvm::FunctionType::get(VoidTy, VoidPtrTy, false), 348 llvm::GlobalValue::InternalLinkage, "__cuda_module_dtor", &TheModule); 349 llvm::BasicBlock *DtorEntryBB = 350 llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc); 351 CGBuilderTy DtorBuilder(CGM, Context); 352 DtorBuilder.SetInsertPoint(DtorEntryBB); 353 354 auto HandleValue = 355 DtorBuilder.CreateAlignedLoad(GpuBinaryHandle, CGM.getPointerAlign()); 356 DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue); 357 358 DtorBuilder.CreateRetVoid(); 359 return ModuleDtorFunc; 360 } 361 362 CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) { 363 return new CGNVCUDARuntime(CGM); 364 } 365