1 //===----- CGCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This provides a class for CUDA code generation targeting the NVIDIA CUDA 11 // runtime library. 12 // 13 //===----------------------------------------------------------------------===// 14 15 #include "CGCUDARuntime.h" 16 #include "CodeGenFunction.h" 17 #include "CodeGenModule.h" 18 #include "clang/CodeGen/ConstantInitBuilder.h" 19 #include "clang/AST/Decl.h" 20 #include "llvm/IR/BasicBlock.h" 21 #include "llvm/IR/CallSite.h" 22 #include "llvm/IR/Constants.h" 23 #include "llvm/IR/DerivedTypes.h" 24 25 using namespace clang; 26 using namespace CodeGen; 27 28 namespace { 29 30 class CGNVCUDARuntime : public CGCUDARuntime { 31 32 private: 33 llvm::IntegerType *IntTy, *SizeTy; 34 llvm::Type *VoidTy; 35 llvm::PointerType *CharPtrTy, *VoidPtrTy, *VoidPtrPtrTy; 36 37 /// Convenience reference to LLVM Context 38 llvm::LLVMContext &Context; 39 /// Convenience reference to the current module 40 llvm::Module &TheModule; 41 /// Keeps track of kernel launch stubs emitted in this module 42 llvm::SmallVector<llvm::Function *, 16> EmittedKernels; 43 llvm::SmallVector<std::pair<llvm::GlobalVariable *, unsigned>, 16> DeviceVars; 44 /// Keeps track of variables containing handles of GPU binaries. Populated by 45 /// ModuleCtorFunction() and used to create corresponding cleanup calls in 46 /// ModuleDtorFunction() 47 llvm::SmallVector<llvm::GlobalVariable *, 16> GpuBinaryHandles; 48 49 llvm::Constant *getSetupArgumentFn() const; 50 llvm::Constant *getLaunchFn() const; 51 52 /// Creates a function to register all kernel stubs generated in this module. 53 llvm::Function *makeRegisterGlobalsFn(); 54 55 /// Helper function that generates a constant string and returns a pointer to 56 /// the start of the string. The result of this function can be used anywhere 57 /// where the C code specifies const char*. 58 llvm::Constant *makeConstantString(const std::string &Str, 59 const std::string &Name = "", 60 const std::string &SectionName = "", 61 unsigned Alignment = 0) { 62 llvm::Constant *Zeros[] = {llvm::ConstantInt::get(SizeTy, 0), 63 llvm::ConstantInt::get(SizeTy, 0)}; 64 auto ConstStr = CGM.GetAddrOfConstantCString(Str, Name.c_str()); 65 llvm::GlobalVariable *GV = 66 cast<llvm::GlobalVariable>(ConstStr.getPointer()); 67 if (!SectionName.empty()) 68 GV->setSection(SectionName); 69 if (Alignment) 70 GV->setAlignment(Alignment); 71 72 return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(), 73 ConstStr.getPointer(), Zeros); 74 } 75 76 void emitDeviceStubBody(CodeGenFunction &CGF, FunctionArgList &Args); 77 78 public: 79 CGNVCUDARuntime(CodeGenModule &CGM); 80 81 void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override; 82 void registerDeviceVar(llvm::GlobalVariable &Var, unsigned Flags) override { 83 DeviceVars.push_back(std::make_pair(&Var, Flags)); 84 } 85 86 /// Creates module constructor function 87 llvm::Function *makeModuleCtorFunction() override; 88 /// Creates module destructor function 89 llvm::Function *makeModuleDtorFunction() override; 90 }; 91 92 } 93 94 CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM) 95 : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()), 96 TheModule(CGM.getModule()) { 97 CodeGen::CodeGenTypes &Types = CGM.getTypes(); 98 ASTContext &Ctx = CGM.getContext(); 99 100 IntTy = CGM.IntTy; 101 SizeTy = CGM.SizeTy; 102 VoidTy = CGM.VoidTy; 103 104 CharPtrTy = llvm::PointerType::getUnqual(Types.ConvertType(Ctx.CharTy)); 105 VoidPtrTy = cast<llvm::PointerType>(Types.ConvertType(Ctx.VoidPtrTy)); 106 VoidPtrPtrTy = VoidPtrTy->getPointerTo(); 107 } 108 109 llvm::Constant *CGNVCUDARuntime::getSetupArgumentFn() const { 110 // cudaError_t cudaSetupArgument(void *, size_t, size_t) 111 llvm::Type *Params[] = {VoidPtrTy, SizeTy, SizeTy}; 112 return CGM.CreateRuntimeFunction(llvm::FunctionType::get(IntTy, 113 Params, false), 114 "cudaSetupArgument"); 115 } 116 117 llvm::Constant *CGNVCUDARuntime::getLaunchFn() const { 118 // cudaError_t cudaLaunch(char *) 119 return CGM.CreateRuntimeFunction( 120 llvm::FunctionType::get(IntTy, CharPtrTy, false), "cudaLaunch"); 121 } 122 123 void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF, 124 FunctionArgList &Args) { 125 EmittedKernels.push_back(CGF.CurFn); 126 emitDeviceStubBody(CGF, Args); 127 } 128 129 void CGNVCUDARuntime::emitDeviceStubBody(CodeGenFunction &CGF, 130 FunctionArgList &Args) { 131 // Emit a call to cudaSetupArgument for each arg in Args. 132 llvm::Constant *cudaSetupArgFn = getSetupArgumentFn(); 133 llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end"); 134 CharUnits Offset = CharUnits::Zero(); 135 for (const VarDecl *A : Args) { 136 CharUnits TyWidth, TyAlign; 137 std::tie(TyWidth, TyAlign) = 138 CGM.getContext().getTypeInfoInChars(A->getType()); 139 Offset = Offset.alignTo(TyAlign); 140 llvm::Value *Args[] = { 141 CGF.Builder.CreatePointerCast(CGF.GetAddrOfLocalVar(A).getPointer(), 142 VoidPtrTy), 143 llvm::ConstantInt::get(SizeTy, TyWidth.getQuantity()), 144 llvm::ConstantInt::get(SizeTy, Offset.getQuantity()), 145 }; 146 llvm::CallSite CS = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args); 147 llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0); 148 llvm::Value *CSZero = CGF.Builder.CreateICmpEQ(CS.getInstruction(), Zero); 149 llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next"); 150 CGF.Builder.CreateCondBr(CSZero, NextBlock, EndBlock); 151 CGF.EmitBlock(NextBlock); 152 Offset += TyWidth; 153 } 154 155 // Emit the call to cudaLaunch 156 llvm::Constant *cudaLaunchFn = getLaunchFn(); 157 llvm::Value *Arg = CGF.Builder.CreatePointerCast(CGF.CurFn, CharPtrTy); 158 CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg); 159 CGF.EmitBranch(EndBlock); 160 161 CGF.EmitBlock(EndBlock); 162 } 163 164 /// Creates a function that sets up state on the host side for CUDA objects that 165 /// have a presence on both the host and device sides. Specifically, registers 166 /// the host side of kernel functions and device global variables with the CUDA 167 /// runtime. 168 /// \code 169 /// void __cuda_register_globals(void** GpuBinaryHandle) { 170 /// __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...); 171 /// ... 172 /// __cudaRegisterFunction(GpuBinaryHandle,KernelM,...); 173 /// __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...); 174 /// ... 175 /// __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...); 176 /// } 177 /// \endcode 178 llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() { 179 // No need to register anything 180 if (EmittedKernels.empty() && DeviceVars.empty()) 181 return nullptr; 182 183 llvm::Function *RegisterKernelsFunc = llvm::Function::Create( 184 llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false), 185 llvm::GlobalValue::InternalLinkage, "__cuda_register_globals", &TheModule); 186 llvm::BasicBlock *EntryBB = 187 llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc); 188 CGBuilderTy Builder(CGM, Context); 189 Builder.SetInsertPoint(EntryBB); 190 191 // void __cudaRegisterFunction(void **, const char *, char *, const char *, 192 // int, uint3*, uint3*, dim3*, dim3*, int*) 193 llvm::Type *RegisterFuncParams[] = { 194 VoidPtrPtrTy, CharPtrTy, CharPtrTy, CharPtrTy, IntTy, 195 VoidPtrTy, VoidPtrTy, VoidPtrTy, VoidPtrTy, IntTy->getPointerTo()}; 196 llvm::Constant *RegisterFunc = CGM.CreateRuntimeFunction( 197 llvm::FunctionType::get(IntTy, RegisterFuncParams, false), 198 "__cudaRegisterFunction"); 199 200 // Extract GpuBinaryHandle passed as the first argument passed to 201 // __cuda_register_globals() and generate __cudaRegisterFunction() call for 202 // each emitted kernel. 203 llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin(); 204 for (llvm::Function *Kernel : EmittedKernels) { 205 llvm::Constant *KernelName = makeConstantString(Kernel->getName()); 206 llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy); 207 llvm::Value *Args[] = { 208 &GpuBinaryHandlePtr, Builder.CreateBitCast(Kernel, VoidPtrTy), 209 KernelName, KernelName, llvm::ConstantInt::get(IntTy, -1), NullPtr, 210 NullPtr, NullPtr, NullPtr, 211 llvm::ConstantPointerNull::get(IntTy->getPointerTo())}; 212 Builder.CreateCall(RegisterFunc, Args); 213 } 214 215 // void __cudaRegisterVar(void **, char *, char *, const char *, 216 // int, int, int, int) 217 llvm::Type *RegisterVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy, 218 CharPtrTy, IntTy, IntTy, 219 IntTy, IntTy}; 220 llvm::Constant *RegisterVar = CGM.CreateRuntimeFunction( 221 llvm::FunctionType::get(IntTy, RegisterVarParams, false), 222 "__cudaRegisterVar"); 223 for (auto &Pair : DeviceVars) { 224 llvm::GlobalVariable *Var = Pair.first; 225 unsigned Flags = Pair.second; 226 llvm::Constant *VarName = makeConstantString(Var->getName()); 227 uint64_t VarSize = 228 CGM.getDataLayout().getTypeAllocSize(Var->getValueType()); 229 llvm::Value *Args[] = { 230 &GpuBinaryHandlePtr, 231 Builder.CreateBitCast(Var, VoidPtrTy), 232 VarName, 233 VarName, 234 llvm::ConstantInt::get(IntTy, (Flags & ExternDeviceVar) ? 1 : 0), 235 llvm::ConstantInt::get(IntTy, VarSize), 236 llvm::ConstantInt::get(IntTy, (Flags & ConstantDeviceVar) ? 1 : 0), 237 llvm::ConstantInt::get(IntTy, 0)}; 238 Builder.CreateCall(RegisterVar, Args); 239 } 240 241 Builder.CreateRetVoid(); 242 return RegisterKernelsFunc; 243 } 244 245 /// Creates a global constructor function for the module: 246 /// \code 247 /// void __cuda_module_ctor(void*) { 248 /// Handle0 = __cudaRegisterFatBinary(GpuBinaryBlob0); 249 /// __cuda_register_globals(Handle0); 250 /// ... 251 /// HandleN = __cudaRegisterFatBinary(GpuBinaryBlobN); 252 /// __cuda_register_globals(HandleN); 253 /// } 254 /// \endcode 255 llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { 256 // No need to generate ctors/dtors if there are no GPU binaries. 257 if (CGM.getCodeGenOpts().CudaGpuBinaryFileNames.empty()) 258 return nullptr; 259 260 // void __cuda_register_globals(void* handle); 261 llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn(); 262 // void ** __cudaRegisterFatBinary(void *); 263 llvm::Constant *RegisterFatbinFunc = CGM.CreateRuntimeFunction( 264 llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false), 265 "__cudaRegisterFatBinary"); 266 // struct { int magic, int version, void * gpu_binary, void * dont_care }; 267 llvm::StructType *FatbinWrapperTy = 268 llvm::StructType::get(IntTy, IntTy, VoidPtrTy, VoidPtrTy, nullptr); 269 270 llvm::Function *ModuleCtorFunc = llvm::Function::Create( 271 llvm::FunctionType::get(VoidTy, VoidPtrTy, false), 272 llvm::GlobalValue::InternalLinkage, "__cuda_module_ctor", &TheModule); 273 llvm::BasicBlock *CtorEntryBB = 274 llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc); 275 CGBuilderTy CtorBuilder(CGM, Context); 276 277 CtorBuilder.SetInsertPoint(CtorEntryBB); 278 279 // For each GPU binary, register it with the CUDA runtime and store returned 280 // handle in a global variable and save the handle in GpuBinaryHandles vector 281 // to be cleaned up in destructor on exit. Then associate all known kernels 282 // with the GPU binary handle so CUDA runtime can figure out what to call on 283 // the GPU side. 284 for (const std::string &GpuBinaryFileName : 285 CGM.getCodeGenOpts().CudaGpuBinaryFileNames) { 286 llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> GpuBinaryOrErr = 287 llvm::MemoryBuffer::getFileOrSTDIN(GpuBinaryFileName); 288 if (std::error_code EC = GpuBinaryOrErr.getError()) { 289 CGM.getDiags().Report(diag::err_cannot_open_file) << GpuBinaryFileName 290 << EC.message(); 291 continue; 292 } 293 294 const char *FatbinConstantName = 295 CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin"; 296 // NVIDIA's cuobjdump looks for fatbins in this section. 297 const char *FatbinSectionName = 298 CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment"; 299 300 // Create initialized wrapper structure that points to the loaded GPU binary 301 ConstantInitBuilder Builder(CGM); 302 auto Values = Builder.beginStruct(FatbinWrapperTy); 303 // Fatbin wrapper magic. 304 Values.addInt(IntTy, 0x466243b1); 305 // Fatbin version. 306 Values.addInt(IntTy, 1); 307 // Data. 308 Values.add(makeConstantString(GpuBinaryOrErr.get()->getBuffer(), 309 "", FatbinConstantName, 8)); 310 // Unused in fatbin v1. 311 Values.add(llvm::ConstantPointerNull::get(VoidPtrTy)); 312 llvm::GlobalVariable *FatbinWrapper = 313 Values.finishAndCreateGlobal("__cuda_fatbin_wrapper", 314 CGM.getPointerAlign(), 315 /*constant*/ true); 316 FatbinWrapper->setSection(FatbinSectionName); 317 318 // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper); 319 llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall( 320 RegisterFatbinFunc, 321 CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy)); 322 llvm::GlobalVariable *GpuBinaryHandle = new llvm::GlobalVariable( 323 TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage, 324 llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle"); 325 CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle, 326 CGM.getPointerAlign()); 327 328 // Call __cuda_register_globals(GpuBinaryHandle); 329 if (RegisterGlobalsFunc) 330 CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall); 331 332 // Save GpuBinaryHandle so we can unregister it in destructor. 333 GpuBinaryHandles.push_back(GpuBinaryHandle); 334 } 335 336 CtorBuilder.CreateRetVoid(); 337 return ModuleCtorFunc; 338 } 339 340 /// Creates a global destructor function that unregisters all GPU code blobs 341 /// registered by constructor. 342 /// \code 343 /// void __cuda_module_dtor(void*) { 344 /// __cudaUnregisterFatBinary(Handle0); 345 /// ... 346 /// __cudaUnregisterFatBinary(HandleN); 347 /// } 348 /// \endcode 349 llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() { 350 // No need for destructor if we don't have handles to unregister. 351 if (GpuBinaryHandles.empty()) 352 return nullptr; 353 354 // void __cudaUnregisterFatBinary(void ** handle); 355 llvm::Constant *UnregisterFatbinFunc = CGM.CreateRuntimeFunction( 356 llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false), 357 "__cudaUnregisterFatBinary"); 358 359 llvm::Function *ModuleDtorFunc = llvm::Function::Create( 360 llvm::FunctionType::get(VoidTy, VoidPtrTy, false), 361 llvm::GlobalValue::InternalLinkage, "__cuda_module_dtor", &TheModule); 362 llvm::BasicBlock *DtorEntryBB = 363 llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc); 364 CGBuilderTy DtorBuilder(CGM, Context); 365 DtorBuilder.SetInsertPoint(DtorEntryBB); 366 367 for (llvm::GlobalVariable *GpuBinaryHandle : GpuBinaryHandles) { 368 auto HandleValue = 369 DtorBuilder.CreateAlignedLoad(GpuBinaryHandle, CGM.getPointerAlign()); 370 DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue); 371 } 372 373 DtorBuilder.CreateRetVoid(); 374 return ModuleDtorFunc; 375 } 376 377 CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) { 378 return new CGNVCUDARuntime(CGM); 379 } 380