1 //===----- CGCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This provides a class for CUDA code generation targeting the NVIDIA CUDA 11 // runtime library. 12 // 13 //===----------------------------------------------------------------------===// 14 15 #include "CGCUDARuntime.h" 16 #include "CodeGenFunction.h" 17 #include "CodeGenModule.h" 18 #include "clang/AST/Decl.h" 19 #include "llvm/IR/BasicBlock.h" 20 #include "llvm/IR/CallSite.h" 21 #include "llvm/IR/Constants.h" 22 #include "llvm/IR/DerivedTypes.h" 23 24 using namespace clang; 25 using namespace CodeGen; 26 27 namespace { 28 29 class CGNVCUDARuntime : public CGCUDARuntime { 30 31 private: 32 llvm::Type *IntTy, *SizeTy, *VoidTy; 33 llvm::PointerType *CharPtrTy, *VoidPtrTy, *VoidPtrPtrTy; 34 35 /// Convenience reference to LLVM Context 36 llvm::LLVMContext &Context; 37 /// Convenience reference to the current module 38 llvm::Module &TheModule; 39 /// Keeps track of kernel launch stubs emitted in this module 40 llvm::SmallVector<llvm::Function *, 16> EmittedKernels; 41 llvm::SmallVector<std::pair<llvm::GlobalVariable *, unsigned>, 16> DeviceVars; 42 /// Keeps track of variables containing handles of GPU binaries. Populated by 43 /// ModuleCtorFunction() and used to create corresponding cleanup calls in 44 /// ModuleDtorFunction() 45 llvm::SmallVector<llvm::GlobalVariable *, 16> GpuBinaryHandles; 46 47 llvm::Constant *getSetupArgumentFn() const; 48 llvm::Constant *getLaunchFn() const; 49 50 /// Creates a function to register all kernel stubs generated in this module. 51 llvm::Function *makeRegisterGlobalsFn(); 52 53 /// Helper function that generates a constant string and returns a pointer to 54 /// the start of the string. The result of this function can be used anywhere 55 /// where the C code specifies const char*. 56 llvm::Constant *makeConstantString(const std::string &Str, 57 const std::string &Name = "", 58 unsigned Alignment = 0) { 59 llvm::Constant *Zeros[] = {llvm::ConstantInt::get(SizeTy, 0), 60 llvm::ConstantInt::get(SizeTy, 0)}; 61 auto ConstStr = CGM.GetAddrOfConstantCString(Str, Name.c_str()); 62 return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(), 63 ConstStr.getPointer(), Zeros); 64 } 65 66 void emitDeviceStubBody(CodeGenFunction &CGF, FunctionArgList &Args); 67 68 public: 69 CGNVCUDARuntime(CodeGenModule &CGM); 70 71 void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override; 72 void registerDeviceVar(llvm::GlobalVariable &Var, unsigned Flags) override { 73 DeviceVars.push_back(std::make_pair(&Var, Flags)); 74 } 75 76 /// Creates module constructor function 77 llvm::Function *makeModuleCtorFunction() override; 78 /// Creates module destructor function 79 llvm::Function *makeModuleDtorFunction() override; 80 }; 81 82 } 83 84 CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM) 85 : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()), 86 TheModule(CGM.getModule()) { 87 CodeGen::CodeGenTypes &Types = CGM.getTypes(); 88 ASTContext &Ctx = CGM.getContext(); 89 90 IntTy = Types.ConvertType(Ctx.IntTy); 91 SizeTy = Types.ConvertType(Ctx.getSizeType()); 92 VoidTy = llvm::Type::getVoidTy(Context); 93 94 CharPtrTy = llvm::PointerType::getUnqual(Types.ConvertType(Ctx.CharTy)); 95 VoidPtrTy = cast<llvm::PointerType>(Types.ConvertType(Ctx.VoidPtrTy)); 96 VoidPtrPtrTy = VoidPtrTy->getPointerTo(); 97 } 98 99 llvm::Constant *CGNVCUDARuntime::getSetupArgumentFn() const { 100 // cudaError_t cudaSetupArgument(void *, size_t, size_t) 101 std::vector<llvm::Type*> Params; 102 Params.push_back(VoidPtrTy); 103 Params.push_back(SizeTy); 104 Params.push_back(SizeTy); 105 return CGM.CreateRuntimeFunction(llvm::FunctionType::get(IntTy, 106 Params, false), 107 "cudaSetupArgument"); 108 } 109 110 llvm::Constant *CGNVCUDARuntime::getLaunchFn() const { 111 // cudaError_t cudaLaunch(char *) 112 return CGM.CreateRuntimeFunction( 113 llvm::FunctionType::get(IntTy, CharPtrTy, false), "cudaLaunch"); 114 } 115 116 void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF, 117 FunctionArgList &Args) { 118 EmittedKernels.push_back(CGF.CurFn); 119 emitDeviceStubBody(CGF, Args); 120 } 121 122 void CGNVCUDARuntime::emitDeviceStubBody(CodeGenFunction &CGF, 123 FunctionArgList &Args) { 124 // Build the argument value list and the argument stack struct type. 125 SmallVector<llvm::Value *, 16> ArgValues; 126 std::vector<llvm::Type *> ArgTypes; 127 for (FunctionArgList::const_iterator I = Args.begin(), E = Args.end(); 128 I != E; ++I) { 129 llvm::Value *V = CGF.GetAddrOfLocalVar(*I).getPointer(); 130 ArgValues.push_back(V); 131 assert(isa<llvm::PointerType>(V->getType()) && "Arg type not PointerType"); 132 ArgTypes.push_back(cast<llvm::PointerType>(V->getType())->getElementType()); 133 } 134 llvm::StructType *ArgStackTy = llvm::StructType::get(Context, ArgTypes); 135 136 llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end"); 137 138 // Emit the calls to cudaSetupArgument 139 llvm::Constant *cudaSetupArgFn = getSetupArgumentFn(); 140 for (unsigned I = 0, E = Args.size(); I != E; ++I) { 141 llvm::Value *Args[3]; 142 llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next"); 143 Args[0] = CGF.Builder.CreatePointerCast(ArgValues[I], VoidPtrTy); 144 Args[1] = CGF.Builder.CreateIntCast( 145 llvm::ConstantExpr::getSizeOf(ArgTypes[I]), 146 SizeTy, false); 147 Args[2] = CGF.Builder.CreateIntCast( 148 llvm::ConstantExpr::getOffsetOf(ArgStackTy, I), 149 SizeTy, false); 150 llvm::CallSite CS = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args); 151 llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0); 152 llvm::Value *CSZero = CGF.Builder.CreateICmpEQ(CS.getInstruction(), Zero); 153 CGF.Builder.CreateCondBr(CSZero, NextBlock, EndBlock); 154 CGF.EmitBlock(NextBlock); 155 } 156 157 // Emit the call to cudaLaunch 158 llvm::Constant *cudaLaunchFn = getLaunchFn(); 159 llvm::Value *Arg = CGF.Builder.CreatePointerCast(CGF.CurFn, CharPtrTy); 160 CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg); 161 CGF.EmitBranch(EndBlock); 162 163 CGF.EmitBlock(EndBlock); 164 } 165 166 /// Creates a function that sets up state on the host side for CUDA objects that 167 /// have a presence on both the host and device sides. Specifically, registers 168 /// the host side of kernel functions and device global variables with the CUDA 169 /// runtime. 170 /// \code 171 /// void __cuda_register_globals(void** GpuBinaryHandle) { 172 /// __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...); 173 /// ... 174 /// __cudaRegisterFunction(GpuBinaryHandle,KernelM,...); 175 /// __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...); 176 /// ... 177 /// __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...); 178 /// } 179 /// \endcode 180 llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() { 181 // No need to register anything 182 if (EmittedKernels.empty() && DeviceVars.empty()) 183 return nullptr; 184 185 llvm::Function *RegisterKernelsFunc = llvm::Function::Create( 186 llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false), 187 llvm::GlobalValue::InternalLinkage, "__cuda_register_globals", &TheModule); 188 llvm::BasicBlock *EntryBB = 189 llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc); 190 CGBuilderTy Builder(CGM, Context); 191 Builder.SetInsertPoint(EntryBB); 192 193 // void __cudaRegisterFunction(void **, const char *, char *, const char *, 194 // int, uint3*, uint3*, dim3*, dim3*, int*) 195 std::vector<llvm::Type *> RegisterFuncParams = { 196 VoidPtrPtrTy, CharPtrTy, CharPtrTy, CharPtrTy, IntTy, 197 VoidPtrTy, VoidPtrTy, VoidPtrTy, VoidPtrTy, IntTy->getPointerTo()}; 198 llvm::Constant *RegisterFunc = CGM.CreateRuntimeFunction( 199 llvm::FunctionType::get(IntTy, RegisterFuncParams, false), 200 "__cudaRegisterFunction"); 201 202 // Extract GpuBinaryHandle passed as the first argument passed to 203 // __cuda_register_globals() and generate __cudaRegisterFunction() call for 204 // each emitted kernel. 205 llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin(); 206 for (llvm::Function *Kernel : EmittedKernels) { 207 llvm::Constant *KernelName = makeConstantString(Kernel->getName()); 208 llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy); 209 llvm::Value *Args[] = { 210 &GpuBinaryHandlePtr, Builder.CreateBitCast(Kernel, VoidPtrTy), 211 KernelName, KernelName, llvm::ConstantInt::get(IntTy, -1), NullPtr, 212 NullPtr, NullPtr, NullPtr, 213 llvm::ConstantPointerNull::get(IntTy->getPointerTo())}; 214 Builder.CreateCall(RegisterFunc, Args); 215 } 216 217 // void __cudaRegisterVar(void **, char *, char *, const char *, 218 // int, int, int, int) 219 std::vector<llvm::Type *> RegisterVarParams = { 220 VoidPtrPtrTy, CharPtrTy, CharPtrTy, CharPtrTy, 221 IntTy, IntTy, IntTy, IntTy}; 222 llvm::Constant *RegisterVar = CGM.CreateRuntimeFunction( 223 llvm::FunctionType::get(IntTy, RegisterVarParams, false), 224 "__cudaRegisterVar"); 225 for (auto &Pair : DeviceVars) { 226 llvm::GlobalVariable *Var = Pair.first; 227 unsigned Flags = Pair.second; 228 llvm::Constant *VarName = makeConstantString(Var->getName()); 229 uint64_t VarSize = 230 CGM.getDataLayout().getTypeAllocSize(Var->getValueType()); 231 llvm::Value *Args[] = { 232 &GpuBinaryHandlePtr, 233 Builder.CreateBitCast(Var, VoidPtrTy), 234 VarName, 235 VarName, 236 llvm::ConstantInt::get(IntTy, (Flags & ExternDeviceVar) ? 1 : 0), 237 llvm::ConstantInt::get(IntTy, VarSize), 238 llvm::ConstantInt::get(IntTy, (Flags & ConstantDeviceVar) ? 1 : 0), 239 llvm::ConstantInt::get(IntTy, 0)}; 240 Builder.CreateCall(RegisterVar, Args); 241 } 242 243 Builder.CreateRetVoid(); 244 return RegisterKernelsFunc; 245 } 246 247 /// Creates a global constructor function for the module: 248 /// \code 249 /// void __cuda_module_ctor(void*) { 250 /// Handle0 = __cudaRegisterFatBinary(GpuBinaryBlob0); 251 /// __cuda_register_globals(Handle0); 252 /// ... 253 /// HandleN = __cudaRegisterFatBinary(GpuBinaryBlobN); 254 /// __cuda_register_globals(HandleN); 255 /// } 256 /// \endcode 257 llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { 258 // No need to generate ctors/dtors if there are no GPU binaries. 259 if (CGM.getCodeGenOpts().CudaGpuBinaryFileNames.empty()) 260 return nullptr; 261 262 // void __cuda_register_globals(void* handle); 263 llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn(); 264 // void ** __cudaRegisterFatBinary(void *); 265 llvm::Constant *RegisterFatbinFunc = CGM.CreateRuntimeFunction( 266 llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false), 267 "__cudaRegisterFatBinary"); 268 // struct { int magic, int version, void * gpu_binary, void * dont_care }; 269 llvm::StructType *FatbinWrapperTy = 270 llvm::StructType::get(IntTy, IntTy, VoidPtrTy, VoidPtrTy, nullptr); 271 272 llvm::Function *ModuleCtorFunc = llvm::Function::Create( 273 llvm::FunctionType::get(VoidTy, VoidPtrTy, false), 274 llvm::GlobalValue::InternalLinkage, "__cuda_module_ctor", &TheModule); 275 llvm::BasicBlock *CtorEntryBB = 276 llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc); 277 CGBuilderTy CtorBuilder(CGM, Context); 278 279 CtorBuilder.SetInsertPoint(CtorEntryBB); 280 281 // For each GPU binary, register it with the CUDA runtime and store returned 282 // handle in a global variable and save the handle in GpuBinaryHandles vector 283 // to be cleaned up in destructor on exit. Then associate all known kernels 284 // with the GPU binary handle so CUDA runtime can figure out what to call on 285 // the GPU side. 286 for (const std::string &GpuBinaryFileName : 287 CGM.getCodeGenOpts().CudaGpuBinaryFileNames) { 288 llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> GpuBinaryOrErr = 289 llvm::MemoryBuffer::getFileOrSTDIN(GpuBinaryFileName); 290 if (std::error_code EC = GpuBinaryOrErr.getError()) { 291 CGM.getDiags().Report(diag::err_cannot_open_file) << GpuBinaryFileName 292 << EC.message(); 293 continue; 294 } 295 296 // Create initialized wrapper structure that points to the loaded GPU binary 297 llvm::Constant *Values[] = { 298 llvm::ConstantInt::get(IntTy, 0x466243b1), // Fatbin wrapper magic. 299 llvm::ConstantInt::get(IntTy, 1), // Fatbin version. 300 makeConstantString(GpuBinaryOrErr.get()->getBuffer(), "", 16), // Data. 301 llvm::ConstantPointerNull::get(VoidPtrTy)}; // Unused in fatbin v1. 302 llvm::GlobalVariable *FatbinWrapper = new llvm::GlobalVariable( 303 TheModule, FatbinWrapperTy, true, llvm::GlobalValue::InternalLinkage, 304 llvm::ConstantStruct::get(FatbinWrapperTy, Values), 305 "__cuda_fatbin_wrapper"); 306 // NVIDIA's cuobjdump looks for fatbins in this section. 307 FatbinWrapper->setSection(".nvFatBinSegment"); 308 309 // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper); 310 llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall( 311 RegisterFatbinFunc, 312 CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy)); 313 llvm::GlobalVariable *GpuBinaryHandle = new llvm::GlobalVariable( 314 TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage, 315 llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle"); 316 CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle, 317 CGM.getPointerAlign()); 318 319 // Call __cuda_register_globals(GpuBinaryHandle); 320 if (RegisterGlobalsFunc) 321 CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall); 322 323 // Save GpuBinaryHandle so we can unregister it in destructor. 324 GpuBinaryHandles.push_back(GpuBinaryHandle); 325 } 326 327 CtorBuilder.CreateRetVoid(); 328 return ModuleCtorFunc; 329 } 330 331 /// Creates a global destructor function that unregisters all GPU code blobs 332 /// registered by constructor. 333 /// \code 334 /// void __cuda_module_dtor(void*) { 335 /// __cudaUnregisterFatBinary(Handle0); 336 /// ... 337 /// __cudaUnregisterFatBinary(HandleN); 338 /// } 339 /// \endcode 340 llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() { 341 // No need for destructor if we don't have handles to unregister. 342 if (GpuBinaryHandles.empty()) 343 return nullptr; 344 345 // void __cudaUnregisterFatBinary(void ** handle); 346 llvm::Constant *UnregisterFatbinFunc = CGM.CreateRuntimeFunction( 347 llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false), 348 "__cudaUnregisterFatBinary"); 349 350 llvm::Function *ModuleDtorFunc = llvm::Function::Create( 351 llvm::FunctionType::get(VoidTy, VoidPtrTy, false), 352 llvm::GlobalValue::InternalLinkage, "__cuda_module_dtor", &TheModule); 353 llvm::BasicBlock *DtorEntryBB = 354 llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc); 355 CGBuilderTy DtorBuilder(CGM, Context); 356 DtorBuilder.SetInsertPoint(DtorEntryBB); 357 358 for (llvm::GlobalVariable *GpuBinaryHandle : GpuBinaryHandles) { 359 auto HandleValue = 360 DtorBuilder.CreateAlignedLoad(GpuBinaryHandle, CGM.getPointerAlign()); 361 DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue); 362 } 363 364 DtorBuilder.CreateRetVoid(); 365 return ModuleDtorFunc; 366 } 367 368 CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) { 369 return new CGNVCUDARuntime(CGM); 370 } 371