1 //===----- CGCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This provides a class for CUDA code generation targeting the NVIDIA CUDA 11 // runtime library. 12 // 13 //===----------------------------------------------------------------------===// 14 15 #include "CGCUDARuntime.h" 16 #include "CodeGenFunction.h" 17 #include "CodeGenModule.h" 18 #include "clang/AST/Decl.h" 19 #include "llvm/IR/BasicBlock.h" 20 #include "llvm/IR/CallSite.h" 21 #include "llvm/IR/Constants.h" 22 #include "llvm/IR/DerivedTypes.h" 23 24 using namespace clang; 25 using namespace CodeGen; 26 27 namespace { 28 29 class CGNVCUDARuntime : public CGCUDARuntime { 30 31 private: 32 llvm::Type *IntTy, *SizeTy, *VoidTy; 33 llvm::PointerType *CharPtrTy, *VoidPtrTy, *VoidPtrPtrTy; 34 35 /// Convenience reference to LLVM Context 36 llvm::LLVMContext &Context; 37 /// Convenience reference to the current module 38 llvm::Module &TheModule; 39 /// Keeps track of kernel launch stubs emitted in this module 40 llvm::SmallVector<llvm::Function *, 16> EmittedKernels; 41 llvm::SmallVector<std::pair<llvm::GlobalVariable *, unsigned>, 16> DeviceVars; 42 /// Keeps track of variables containing handles of GPU binaries. Populated by 43 /// ModuleCtorFunction() and used to create corresponding cleanup calls in 44 /// ModuleDtorFunction() 45 llvm::SmallVector<llvm::GlobalVariable *, 16> GpuBinaryHandles; 46 47 llvm::Constant *getSetupArgumentFn() const; 48 llvm::Constant *getLaunchFn() const; 49 50 /// Creates a function to register all kernel stubs generated in this module. 51 llvm::Function *makeRegisterGlobalsFn(); 52 53 /// Helper function that generates a constant string and returns a pointer to 54 /// the start of the string. The result of this function can be used anywhere 55 /// where the C code specifies const char*. 56 llvm::Constant *makeConstantString(const std::string &Str, 57 const std::string &Name = "", 58 unsigned Alignment = 0) { 59 llvm::Constant *Zeros[] = {llvm::ConstantInt::get(SizeTy, 0), 60 llvm::ConstantInt::get(SizeTy, 0)}; 61 auto ConstStr = CGM.GetAddrOfConstantCString(Str, Name.c_str()); 62 return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(), 63 ConstStr.getPointer(), Zeros); 64 } 65 66 void emitDeviceStubBody(CodeGenFunction &CGF, FunctionArgList &Args); 67 68 public: 69 CGNVCUDARuntime(CodeGenModule &CGM); 70 71 void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override; 72 void registerDeviceVar(llvm::GlobalVariable &Var, unsigned Flags) override { 73 DeviceVars.push_back(std::make_pair(&Var, Flags)); 74 } 75 76 /// Creates module constructor function 77 llvm::Function *makeModuleCtorFunction() override; 78 /// Creates module destructor function 79 llvm::Function *makeModuleDtorFunction() override; 80 }; 81 82 } 83 84 CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM) 85 : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()), 86 TheModule(CGM.getModule()) { 87 CodeGen::CodeGenTypes &Types = CGM.getTypes(); 88 ASTContext &Ctx = CGM.getContext(); 89 90 IntTy = Types.ConvertType(Ctx.IntTy); 91 SizeTy = Types.ConvertType(Ctx.getSizeType()); 92 VoidTy = llvm::Type::getVoidTy(Context); 93 94 CharPtrTy = llvm::PointerType::getUnqual(Types.ConvertType(Ctx.CharTy)); 95 VoidPtrTy = cast<llvm::PointerType>(Types.ConvertType(Ctx.VoidPtrTy)); 96 VoidPtrPtrTy = VoidPtrTy->getPointerTo(); 97 } 98 99 llvm::Constant *CGNVCUDARuntime::getSetupArgumentFn() const { 100 // cudaError_t cudaSetupArgument(void *, size_t, size_t) 101 llvm::Type *Params[] = {VoidPtrTy, SizeTy, SizeTy}; 102 return CGM.CreateRuntimeFunction(llvm::FunctionType::get(IntTy, 103 Params, false), 104 "cudaSetupArgument"); 105 } 106 107 llvm::Constant *CGNVCUDARuntime::getLaunchFn() const { 108 // cudaError_t cudaLaunch(char *) 109 return CGM.CreateRuntimeFunction( 110 llvm::FunctionType::get(IntTy, CharPtrTy, false), "cudaLaunch"); 111 } 112 113 void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF, 114 FunctionArgList &Args) { 115 EmittedKernels.push_back(CGF.CurFn); 116 emitDeviceStubBody(CGF, Args); 117 } 118 119 void CGNVCUDARuntime::emitDeviceStubBody(CodeGenFunction &CGF, 120 FunctionArgList &Args) { 121 // Emit a call to cudaSetupArgument for each arg in Args. 122 llvm::Constant *cudaSetupArgFn = getSetupArgumentFn(); 123 llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end"); 124 CharUnits Offset = CharUnits::Zero(); 125 for (const VarDecl *A : Args) { 126 CharUnits TyWidth, TyAlign; 127 std::tie(TyWidth, TyAlign) = 128 CGM.getContext().getTypeInfoInChars(A->getType()); 129 Offset = Offset.alignTo(TyAlign); 130 llvm::Value *Args[] = { 131 CGF.Builder.CreatePointerCast(CGF.GetAddrOfLocalVar(A).getPointer(), 132 VoidPtrTy), 133 llvm::ConstantInt::get(SizeTy, TyWidth.getQuantity()), 134 llvm::ConstantInt::get(SizeTy, Offset.getQuantity()), 135 }; 136 llvm::CallSite CS = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args); 137 llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0); 138 llvm::Value *CSZero = CGF.Builder.CreateICmpEQ(CS.getInstruction(), Zero); 139 llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next"); 140 CGF.Builder.CreateCondBr(CSZero, NextBlock, EndBlock); 141 CGF.EmitBlock(NextBlock); 142 Offset += TyWidth; 143 } 144 145 // Emit the call to cudaLaunch 146 llvm::Constant *cudaLaunchFn = getLaunchFn(); 147 llvm::Value *Arg = CGF.Builder.CreatePointerCast(CGF.CurFn, CharPtrTy); 148 CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg); 149 CGF.EmitBranch(EndBlock); 150 151 CGF.EmitBlock(EndBlock); 152 } 153 154 /// Creates a function that sets up state on the host side for CUDA objects that 155 /// have a presence on both the host and device sides. Specifically, registers 156 /// the host side of kernel functions and device global variables with the CUDA 157 /// runtime. 158 /// \code 159 /// void __cuda_register_globals(void** GpuBinaryHandle) { 160 /// __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...); 161 /// ... 162 /// __cudaRegisterFunction(GpuBinaryHandle,KernelM,...); 163 /// __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...); 164 /// ... 165 /// __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...); 166 /// } 167 /// \endcode 168 llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() { 169 // No need to register anything 170 if (EmittedKernels.empty() && DeviceVars.empty()) 171 return nullptr; 172 173 llvm::Function *RegisterKernelsFunc = llvm::Function::Create( 174 llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false), 175 llvm::GlobalValue::InternalLinkage, "__cuda_register_globals", &TheModule); 176 llvm::BasicBlock *EntryBB = 177 llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc); 178 CGBuilderTy Builder(CGM, Context); 179 Builder.SetInsertPoint(EntryBB); 180 181 // void __cudaRegisterFunction(void **, const char *, char *, const char *, 182 // int, uint3*, uint3*, dim3*, dim3*, int*) 183 llvm::Type *RegisterFuncParams[] = { 184 VoidPtrPtrTy, CharPtrTy, CharPtrTy, CharPtrTy, IntTy, 185 VoidPtrTy, VoidPtrTy, VoidPtrTy, VoidPtrTy, IntTy->getPointerTo()}; 186 llvm::Constant *RegisterFunc = CGM.CreateRuntimeFunction( 187 llvm::FunctionType::get(IntTy, RegisterFuncParams, false), 188 "__cudaRegisterFunction"); 189 190 // Extract GpuBinaryHandle passed as the first argument passed to 191 // __cuda_register_globals() and generate __cudaRegisterFunction() call for 192 // each emitted kernel. 193 llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin(); 194 for (llvm::Function *Kernel : EmittedKernels) { 195 llvm::Constant *KernelName = makeConstantString(Kernel->getName()); 196 llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy); 197 llvm::Value *Args[] = { 198 &GpuBinaryHandlePtr, Builder.CreateBitCast(Kernel, VoidPtrTy), 199 KernelName, KernelName, llvm::ConstantInt::get(IntTy, -1), NullPtr, 200 NullPtr, NullPtr, NullPtr, 201 llvm::ConstantPointerNull::get(IntTy->getPointerTo())}; 202 Builder.CreateCall(RegisterFunc, Args); 203 } 204 205 // void __cudaRegisterVar(void **, char *, char *, const char *, 206 // int, int, int, int) 207 llvm::Type *RegisterVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy, 208 CharPtrTy, IntTy, IntTy, 209 IntTy, IntTy}; 210 llvm::Constant *RegisterVar = CGM.CreateRuntimeFunction( 211 llvm::FunctionType::get(IntTy, RegisterVarParams, false), 212 "__cudaRegisterVar"); 213 for (auto &Pair : DeviceVars) { 214 llvm::GlobalVariable *Var = Pair.first; 215 unsigned Flags = Pair.second; 216 llvm::Constant *VarName = makeConstantString(Var->getName()); 217 uint64_t VarSize = 218 CGM.getDataLayout().getTypeAllocSize(Var->getValueType()); 219 llvm::Value *Args[] = { 220 &GpuBinaryHandlePtr, 221 Builder.CreateBitCast(Var, VoidPtrTy), 222 VarName, 223 VarName, 224 llvm::ConstantInt::get(IntTy, (Flags & ExternDeviceVar) ? 1 : 0), 225 llvm::ConstantInt::get(IntTy, VarSize), 226 llvm::ConstantInt::get(IntTy, (Flags & ConstantDeviceVar) ? 1 : 0), 227 llvm::ConstantInt::get(IntTy, 0)}; 228 Builder.CreateCall(RegisterVar, Args); 229 } 230 231 Builder.CreateRetVoid(); 232 return RegisterKernelsFunc; 233 } 234 235 /// Creates a global constructor function for the module: 236 /// \code 237 /// void __cuda_module_ctor(void*) { 238 /// Handle0 = __cudaRegisterFatBinary(GpuBinaryBlob0); 239 /// __cuda_register_globals(Handle0); 240 /// ... 241 /// HandleN = __cudaRegisterFatBinary(GpuBinaryBlobN); 242 /// __cuda_register_globals(HandleN); 243 /// } 244 /// \endcode 245 llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { 246 // No need to generate ctors/dtors if there are no GPU binaries. 247 if (CGM.getCodeGenOpts().CudaGpuBinaryFileNames.empty()) 248 return nullptr; 249 250 // void __cuda_register_globals(void* handle); 251 llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn(); 252 // void ** __cudaRegisterFatBinary(void *); 253 llvm::Constant *RegisterFatbinFunc = CGM.CreateRuntimeFunction( 254 llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false), 255 "__cudaRegisterFatBinary"); 256 // struct { int magic, int version, void * gpu_binary, void * dont_care }; 257 llvm::StructType *FatbinWrapperTy = 258 llvm::StructType::get(IntTy, IntTy, VoidPtrTy, VoidPtrTy, nullptr); 259 260 llvm::Function *ModuleCtorFunc = llvm::Function::Create( 261 llvm::FunctionType::get(VoidTy, VoidPtrTy, false), 262 llvm::GlobalValue::InternalLinkage, "__cuda_module_ctor", &TheModule); 263 llvm::BasicBlock *CtorEntryBB = 264 llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc); 265 CGBuilderTy CtorBuilder(CGM, Context); 266 267 CtorBuilder.SetInsertPoint(CtorEntryBB); 268 269 // For each GPU binary, register it with the CUDA runtime and store returned 270 // handle in a global variable and save the handle in GpuBinaryHandles vector 271 // to be cleaned up in destructor on exit. Then associate all known kernels 272 // with the GPU binary handle so CUDA runtime can figure out what to call on 273 // the GPU side. 274 for (const std::string &GpuBinaryFileName : 275 CGM.getCodeGenOpts().CudaGpuBinaryFileNames) { 276 llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> GpuBinaryOrErr = 277 llvm::MemoryBuffer::getFileOrSTDIN(GpuBinaryFileName); 278 if (std::error_code EC = GpuBinaryOrErr.getError()) { 279 CGM.getDiags().Report(diag::err_cannot_open_file) << GpuBinaryFileName 280 << EC.message(); 281 continue; 282 } 283 284 // Create initialized wrapper structure that points to the loaded GPU binary 285 llvm::Constant *Values[] = { 286 llvm::ConstantInt::get(IntTy, 0x466243b1), // Fatbin wrapper magic. 287 llvm::ConstantInt::get(IntTy, 1), // Fatbin version. 288 makeConstantString(GpuBinaryOrErr.get()->getBuffer(), "", 16), // Data. 289 llvm::ConstantPointerNull::get(VoidPtrTy)}; // Unused in fatbin v1. 290 llvm::GlobalVariable *FatbinWrapper = new llvm::GlobalVariable( 291 TheModule, FatbinWrapperTy, true, llvm::GlobalValue::InternalLinkage, 292 llvm::ConstantStruct::get(FatbinWrapperTy, Values), 293 "__cuda_fatbin_wrapper"); 294 // NVIDIA's cuobjdump looks for fatbins in this section. 295 FatbinWrapper->setSection(".nvFatBinSegment"); 296 297 // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper); 298 llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall( 299 RegisterFatbinFunc, 300 CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy)); 301 llvm::GlobalVariable *GpuBinaryHandle = new llvm::GlobalVariable( 302 TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage, 303 llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle"); 304 CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle, 305 CGM.getPointerAlign()); 306 307 // Call __cuda_register_globals(GpuBinaryHandle); 308 if (RegisterGlobalsFunc) 309 CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall); 310 311 // Save GpuBinaryHandle so we can unregister it in destructor. 312 GpuBinaryHandles.push_back(GpuBinaryHandle); 313 } 314 315 CtorBuilder.CreateRetVoid(); 316 return ModuleCtorFunc; 317 } 318 319 /// Creates a global destructor function that unregisters all GPU code blobs 320 /// registered by constructor. 321 /// \code 322 /// void __cuda_module_dtor(void*) { 323 /// __cudaUnregisterFatBinary(Handle0); 324 /// ... 325 /// __cudaUnregisterFatBinary(HandleN); 326 /// } 327 /// \endcode 328 llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() { 329 // No need for destructor if we don't have handles to unregister. 330 if (GpuBinaryHandles.empty()) 331 return nullptr; 332 333 // void __cudaUnregisterFatBinary(void ** handle); 334 llvm::Constant *UnregisterFatbinFunc = CGM.CreateRuntimeFunction( 335 llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false), 336 "__cudaUnregisterFatBinary"); 337 338 llvm::Function *ModuleDtorFunc = llvm::Function::Create( 339 llvm::FunctionType::get(VoidTy, VoidPtrTy, false), 340 llvm::GlobalValue::InternalLinkage, "__cuda_module_dtor", &TheModule); 341 llvm::BasicBlock *DtorEntryBB = 342 llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc); 343 CGBuilderTy DtorBuilder(CGM, Context); 344 DtorBuilder.SetInsertPoint(DtorEntryBB); 345 346 for (llvm::GlobalVariable *GpuBinaryHandle : GpuBinaryHandles) { 347 auto HandleValue = 348 DtorBuilder.CreateAlignedLoad(GpuBinaryHandle, CGM.getPointerAlign()); 349 DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue); 350 } 351 352 DtorBuilder.CreateRetVoid(); 353 return ModuleDtorFunc; 354 } 355 356 CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) { 357 return new CGNVCUDARuntime(CGM); 358 } 359