1 //===------ PPCGCodeGeneration.cpp - Polly Accelerator Code Generation. ---===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // Take a scop created by ScopInfo and map it to GPU code using the ppcg 11 // GPU mapping strategy. 12 // 13 //===----------------------------------------------------------------------===// 14 15 #include "polly/CodeGen/IslNodeBuilder.h" 16 #include "polly/CodeGen/Utils.h" 17 #include "polly/DependenceInfo.h" 18 #include "polly/LinkAllPasses.h" 19 #include "polly/Options.h" 20 #include "polly/ScopDetection.h" 21 #include "polly/ScopInfo.h" 22 #include "polly/Support/SCEVValidator.h" 23 #include "llvm/ADT/PostOrderIterator.h" 24 #include "llvm/Analysis/AliasAnalysis.h" 25 #include "llvm/Analysis/BasicAliasAnalysis.h" 26 #include "llvm/Analysis/GlobalsModRef.h" 27 #include "llvm/Analysis/PostDominators.h" 28 #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" 29 #include "llvm/Analysis/TargetLibraryInfo.h" 30 #include "llvm/Analysis/TargetTransformInfo.h" 31 #include "llvm/IR/LegacyPassManager.h" 32 #include "llvm/IR/Verifier.h" 33 #include "llvm/Support/TargetRegistry.h" 34 #include "llvm/Support/TargetSelect.h" 35 #include "llvm/Target/TargetMachine.h" 36 #include "llvm/Transforms/IPO/PassManagerBuilder.h" 37 38 #include "isl/union_map.h" 39 40 extern "C" { 41 #include "ppcg/cuda.h" 42 #include "ppcg/gpu.h" 43 #include "ppcg/gpu_print.h" 44 #include "ppcg/ppcg.h" 45 #include "ppcg/schedule.h" 46 } 47 48 #include "llvm/Support/Debug.h" 49 50 using namespace polly; 51 using namespace llvm; 52 53 #define DEBUG_TYPE "polly-codegen-ppcg" 54 55 static cl::opt<bool> DumpSchedule("polly-acc-dump-schedule", 56 cl::desc("Dump the computed GPU Schedule"), 57 cl::Hidden, cl::init(false), cl::ZeroOrMore, 58 cl::cat(PollyCategory)); 59 60 static cl::opt<bool> 61 DumpCode("polly-acc-dump-code", 62 cl::desc("Dump C code describing the GPU mapping"), cl::Hidden, 63 cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory)); 64 65 static cl::opt<bool> DumpKernelIR("polly-acc-dump-kernel-ir", 66 cl::desc("Dump the kernel LLVM-IR"), 67 cl::Hidden, cl::init(false), cl::ZeroOrMore, 68 cl::cat(PollyCategory)); 69 70 static cl::opt<bool> DumpKernelASM("polly-acc-dump-kernel-asm", 71 cl::desc("Dump the kernel assembly code"), 72 cl::Hidden, cl::init(false), cl::ZeroOrMore, 73 cl::cat(PollyCategory)); 74 75 static cl::opt<bool> FastMath("polly-acc-fastmath", 76 cl::desc("Allow unsafe math optimizations"), 77 cl::Hidden, cl::init(false), cl::ZeroOrMore, 78 cl::cat(PollyCategory)); 79 static cl::opt<bool> SharedMemory("polly-acc-use-shared", 80 cl::desc("Use shared memory"), cl::Hidden, 81 cl::init(false), cl::ZeroOrMore, 82 cl::cat(PollyCategory)); 83 static cl::opt<bool> PrivateMemory("polly-acc-use-private", 84 cl::desc("Use private memory"), cl::Hidden, 85 cl::init(false), cl::ZeroOrMore, 86 cl::cat(PollyCategory)); 87 88 static cl::opt<std::string> 89 CudaVersion("polly-acc-cuda-version", 90 cl::desc("The CUDA version to compile for"), cl::Hidden, 91 cl::init("sm_30"), cl::ZeroOrMore, cl::cat(PollyCategory)); 92 93 /// Create the ast expressions for a ScopStmt. 94 /// 95 /// This function is a callback for to generate the ast expressions for each 96 /// of the scheduled ScopStmts. 97 static __isl_give isl_id_to_ast_expr *pollyBuildAstExprForStmt( 98 void *StmtT, isl_ast_build *Build, 99 isl_multi_pw_aff *(*FunctionIndex)(__isl_take isl_multi_pw_aff *MPA, 100 isl_id *Id, void *User), 101 void *UserIndex, 102 isl_ast_expr *(*FunctionExpr)(isl_ast_expr *Expr, isl_id *Id, void *User), 103 void *UserExpr) { 104 105 ScopStmt *Stmt = (ScopStmt *)StmtT; 106 107 isl_ctx *Ctx; 108 109 if (!Stmt || !Build) 110 return NULL; 111 112 Ctx = isl_ast_build_get_ctx(Build); 113 isl_id_to_ast_expr *RefToExpr = isl_id_to_ast_expr_alloc(Ctx, 0); 114 115 for (MemoryAccess *Acc : *Stmt) { 116 isl_map *AddrFunc = Acc->getAddressFunction(); 117 AddrFunc = isl_map_intersect_domain(AddrFunc, Stmt->getDomain()); 118 isl_id *RefId = Acc->getId(); 119 isl_pw_multi_aff *PMA = isl_pw_multi_aff_from_map(AddrFunc); 120 isl_multi_pw_aff *MPA = isl_multi_pw_aff_from_pw_multi_aff(PMA); 121 MPA = isl_multi_pw_aff_coalesce(MPA); 122 MPA = FunctionIndex(MPA, RefId, UserIndex); 123 isl_ast_expr *Access = isl_ast_build_access_from_multi_pw_aff(Build, MPA); 124 Access = FunctionExpr(Access, RefId, UserExpr); 125 RefToExpr = isl_id_to_ast_expr_set(RefToExpr, RefId, Access); 126 } 127 128 return RefToExpr; 129 } 130 131 /// Generate code for a GPU specific isl AST. 132 /// 133 /// The GPUNodeBuilder augments the general existing IslNodeBuilder, which 134 /// generates code for general-prupose AST nodes, with special functionality 135 /// for generating GPU specific user nodes. 136 /// 137 /// @see GPUNodeBuilder::createUser 138 class GPUNodeBuilder : public IslNodeBuilder { 139 public: 140 GPUNodeBuilder(PollyIRBuilder &Builder, ScopAnnotator &Annotator, Pass *P, 141 const DataLayout &DL, LoopInfo &LI, ScalarEvolution &SE, 142 DominatorTree &DT, Scop &S, gpu_prog *Prog) 143 : IslNodeBuilder(Builder, Annotator, P, DL, LI, SE, DT, S), Prog(Prog) { 144 getExprBuilder().setIDToSAI(&IDToSAI); 145 } 146 147 /// Create after-run-time-check initialization code. 148 void initializeAfterRTH(); 149 150 /// Finalize the generated scop. 151 virtual void finalize(); 152 153 private: 154 /// A vector of array base pointers for which a new ScopArrayInfo was created. 155 /// 156 /// This vector is used to delete the ScopArrayInfo when it is not needed any 157 /// more. 158 std::vector<Value *> LocalArrays; 159 160 /// A map from ScopArrays to their corresponding device allocations. 161 std::map<ScopArrayInfo *, Value *> DeviceAllocations; 162 163 /// The current GPU context. 164 Value *GPUContext; 165 166 /// The set of isl_ids allocated in the kernel 167 std::vector<isl_id *> KernelIds; 168 169 /// A module containing GPU code. 170 /// 171 /// This pointer is only set in case we are currently generating GPU code. 172 std::unique_ptr<Module> GPUModule; 173 174 /// The GPU program we generate code for. 175 gpu_prog *Prog; 176 177 /// Class to free isl_ids. 178 class IslIdDeleter { 179 public: 180 void operator()(__isl_take isl_id *Id) { isl_id_free(Id); }; 181 }; 182 183 /// A set containing all isl_ids allocated in a GPU kernel. 184 /// 185 /// By releasing this set all isl_ids will be freed. 186 std::set<std::unique_ptr<isl_id, IslIdDeleter>> KernelIDs; 187 188 IslExprBuilder::IDToScopArrayInfoTy IDToSAI; 189 190 /// Create code for user-defined AST nodes. 191 /// 192 /// These AST nodes can be of type: 193 /// 194 /// - ScopStmt: A computational statement (TODO) 195 /// - Kernel: A GPU kernel call (TODO) 196 /// - Data-Transfer: A GPU <-> CPU data-transfer 197 /// - In-kernel synchronization 198 /// - In-kernel memory copy statement 199 /// 200 /// @param UserStmt The ast node to generate code for. 201 virtual void createUser(__isl_take isl_ast_node *UserStmt); 202 203 enum DataDirection { HOST_TO_DEVICE, DEVICE_TO_HOST }; 204 205 /// Create code for a data transfer statement 206 /// 207 /// @param TransferStmt The data transfer statement. 208 /// @param Direction The direction in which to transfer data. 209 void createDataTransfer(__isl_take isl_ast_node *TransferStmt, 210 enum DataDirection Direction); 211 212 /// Find llvm::Values referenced in GPU kernel. 213 /// 214 /// @param Kernel The kernel to scan for llvm::Values 215 /// 216 /// @returns A set of values referenced by the kernel. 217 SetVector<Value *> getReferencesInKernel(ppcg_kernel *Kernel); 218 219 /// Compute the sizes of the execution grid for a given kernel. 220 /// 221 /// @param Kernel The kernel to compute grid sizes for. 222 /// 223 /// @returns A tuple with grid sizes for X and Y dimension 224 std::tuple<Value *, Value *> getGridSizes(ppcg_kernel *Kernel); 225 226 /// Compute the sizes of the thread blocks for a given kernel. 227 /// 228 /// @param Kernel The kernel to compute thread block sizes for. 229 /// 230 /// @returns A tuple with thread block sizes for X, Y, and Z dimensions. 231 std::tuple<Value *, Value *, Value *> getBlockSizes(ppcg_kernel *Kernel); 232 233 /// Create kernel launch parameters. 234 /// 235 /// @param Kernel The kernel to create parameters for. 236 /// @param F The kernel function that has been created. 237 /// @param SubtreeValues The set of llvm::Values referenced by this kernel. 238 /// 239 /// @returns A stack allocated array with pointers to the parameter 240 /// values that are passed to the kernel. 241 Value *createLaunchParameters(ppcg_kernel *Kernel, Function *F, 242 SetVector<Value *> SubtreeValues); 243 244 /// Create declarations for kernel variable. 245 /// 246 /// This includes shared memory declarations. 247 /// 248 /// @param Kernel The kernel definition to create variables for. 249 /// @param FN The function into which to generate the variables. 250 void createKernelVariables(ppcg_kernel *Kernel, Function *FN); 251 252 /// Add CUDA annotations to module. 253 /// 254 /// Add a set of CUDA annotations that declares the maximal block dimensions 255 /// that will be used to execute the CUDA kernel. This allows the NVIDIA 256 /// PTX compiler to bound the number of allocated registers to ensure the 257 /// resulting kernel is known to run with up to as many block dimensions 258 /// as specified here. 259 /// 260 /// @param M The module to add the annotations to. 261 /// @param BlockDimX The size of block dimension X. 262 /// @param BlockDimY The size of block dimension Y. 263 /// @param BlockDimZ The size of block dimension Z. 264 void addCUDAAnnotations(Module *M, Value *BlockDimX, Value *BlockDimY, 265 Value *BlockDimZ); 266 267 /// Create GPU kernel. 268 /// 269 /// Code generate the kernel described by @p KernelStmt. 270 /// 271 /// @param KernelStmt The ast node to generate kernel code for. 272 void createKernel(__isl_take isl_ast_node *KernelStmt); 273 274 /// Generate code that computes the size of an array. 275 /// 276 /// @param Array The array for which to compute a size. 277 Value *getArraySize(gpu_array_info *Array); 278 279 /// Prepare the kernel arguments for kernel code generation 280 /// 281 /// @param Kernel The kernel to generate code for. 282 /// @param FN The function created for the kernel. 283 void prepareKernelArguments(ppcg_kernel *Kernel, Function *FN); 284 285 /// Create kernel function. 286 /// 287 /// Create a kernel function located in a newly created module that can serve 288 /// as target for device code generation. Set the Builder to point to the 289 /// start block of this newly created function. 290 /// 291 /// @param Kernel The kernel to generate code for. 292 /// @param SubtreeValues The set of llvm::Values referenced by this kernel. 293 void createKernelFunction(ppcg_kernel *Kernel, 294 SetVector<Value *> &SubtreeValues); 295 296 /// Create the declaration of a kernel function. 297 /// 298 /// The kernel function takes as arguments: 299 /// 300 /// - One i8 pointer for each external array reference used in the kernel. 301 /// - Host iterators 302 /// - Parameters 303 /// - Other LLVM Value references (TODO) 304 /// 305 /// @param Kernel The kernel to generate the function declaration for. 306 /// @param SubtreeValues The set of llvm::Values referenced by this kernel. 307 /// 308 /// @returns The newly declared function. 309 Function *createKernelFunctionDecl(ppcg_kernel *Kernel, 310 SetVector<Value *> &SubtreeValues); 311 312 /// Insert intrinsic functions to obtain thread and block ids. 313 /// 314 /// @param The kernel to generate the intrinsic functions for. 315 void insertKernelIntrinsics(ppcg_kernel *Kernel); 316 317 /// Create a global-to-shared or shared-to-global copy statement. 318 /// 319 /// @param CopyStmt The copy statement to generate code for 320 void createKernelCopy(ppcg_kernel_stmt *CopyStmt); 321 322 /// Create code for a ScopStmt called in @p Expr. 323 /// 324 /// @param Expr The expression containing the call. 325 /// @param KernelStmt The kernel statement referenced in the call. 326 void createScopStmt(isl_ast_expr *Expr, ppcg_kernel_stmt *KernelStmt); 327 328 /// Create an in-kernel synchronization call. 329 void createKernelSync(); 330 331 /// Create a PTX assembly string for the current GPU kernel. 332 /// 333 /// @returns A string containing the corresponding PTX assembly code. 334 std::string createKernelASM(); 335 336 /// Remove references from the dominator tree to the kernel function @p F. 337 /// 338 /// @param F The function to remove references to. 339 void clearDominators(Function *F); 340 341 /// Remove references from scalar evolution to the kernel function @p F. 342 /// 343 /// @param F The function to remove references to. 344 void clearScalarEvolution(Function *F); 345 346 /// Remove references from loop info to the kernel function @p F. 347 /// 348 /// @param F The function to remove references to. 349 void clearLoops(Function *F); 350 351 /// Finalize the generation of the kernel function. 352 /// 353 /// Free the LLVM-IR module corresponding to the kernel and -- if requested -- 354 /// dump its IR to stderr. 355 /// 356 /// @returns The Assembly string of the kernel. 357 std::string finalizeKernelFunction(); 358 359 /// Create code that allocates memory to store arrays on device. 360 void allocateDeviceArrays(); 361 362 /// Free all allocated device arrays. 363 void freeDeviceArrays(); 364 365 /// Create a call to initialize the GPU context. 366 /// 367 /// @returns A pointer to the newly initialized context. 368 Value *createCallInitContext(); 369 370 /// Create a call to get the device pointer for a kernel allocation. 371 /// 372 /// @param Allocation The Polly GPU allocation 373 /// 374 /// @returns The device parameter corresponding to this allocation. 375 Value *createCallGetDevicePtr(Value *Allocation); 376 377 /// Create a call to free the GPU context. 378 /// 379 /// @param Context A pointer to an initialized GPU context. 380 void createCallFreeContext(Value *Context); 381 382 /// Create a call to allocate memory on the device. 383 /// 384 /// @param Size The size of memory to allocate 385 /// 386 /// @returns A pointer that identifies this allocation. 387 Value *createCallAllocateMemoryForDevice(Value *Size); 388 389 /// Create a call to free a device array. 390 /// 391 /// @param Array The device array to free. 392 void createCallFreeDeviceMemory(Value *Array); 393 394 /// Create a call to copy data from host to device. 395 /// 396 /// @param HostPtr A pointer to the host data that should be copied. 397 /// @param DevicePtr A device pointer specifying the location to copy to. 398 void createCallCopyFromHostToDevice(Value *HostPtr, Value *DevicePtr, 399 Value *Size); 400 401 /// Create a call to copy data from device to host. 402 /// 403 /// @param DevicePtr A pointer to the device data that should be copied. 404 /// @param HostPtr A host pointer specifying the location to copy to. 405 void createCallCopyFromDeviceToHost(Value *DevicePtr, Value *HostPtr, 406 Value *Size); 407 408 /// Create a call to get a kernel from an assembly string. 409 /// 410 /// @param Buffer The string describing the kernel. 411 /// @param Entry The name of the kernel function to call. 412 /// 413 /// @returns A pointer to a kernel object 414 Value *createCallGetKernel(Value *Buffer, Value *Entry); 415 416 /// Create a call to free a GPU kernel. 417 /// 418 /// @param GPUKernel THe kernel to free. 419 void createCallFreeKernel(Value *GPUKernel); 420 421 /// Create a call to launch a GPU kernel. 422 /// 423 /// @param GPUKernel The kernel to launch. 424 /// @param GridDimX The size of the first grid dimension. 425 /// @param GridDimY The size of the second grid dimension. 426 /// @param GridBlockX The size of the first block dimension. 427 /// @param GridBlockY The size of the second block dimension. 428 /// @param GridBlockZ The size of the third block dimension. 429 /// @param Paramters A pointer to an array that contains itself pointers to 430 /// the parameter values passed for each kernel argument. 431 void createCallLaunchKernel(Value *GPUKernel, Value *GridDimX, 432 Value *GridDimY, Value *BlockDimX, 433 Value *BlockDimY, Value *BlockDimZ, 434 Value *Parameters); 435 }; 436 437 void GPUNodeBuilder::initializeAfterRTH() { 438 GPUContext = createCallInitContext(); 439 allocateDeviceArrays(); 440 } 441 442 void GPUNodeBuilder::finalize() { 443 freeDeviceArrays(); 444 createCallFreeContext(GPUContext); 445 IslNodeBuilder::finalize(); 446 } 447 448 void GPUNodeBuilder::allocateDeviceArrays() { 449 isl_ast_build *Build = isl_ast_build_from_context(S.getContext()); 450 451 for (int i = 0; i < Prog->n_array; ++i) { 452 gpu_array_info *Array = &Prog->array[i]; 453 auto *ScopArray = (ScopArrayInfo *)Array->user; 454 std::string DevArrayName("p_dev_array_"); 455 DevArrayName.append(Array->name); 456 457 Value *ArraySize = getArraySize(Array); 458 Value *DevArray = createCallAllocateMemoryForDevice(ArraySize); 459 DevArray->setName(DevArrayName); 460 DeviceAllocations[ScopArray] = DevArray; 461 } 462 463 isl_ast_build_free(Build); 464 } 465 466 void GPUNodeBuilder::addCUDAAnnotations(Module *M, Value *BlockDimX, 467 Value *BlockDimY, Value *BlockDimZ) { 468 auto AnnotationNode = M->getOrInsertNamedMetadata("nvvm.annotations"); 469 470 for (auto &F : *M) { 471 if (F.getCallingConv() != CallingConv::PTX_Kernel) 472 continue; 473 474 Value *V[] = {BlockDimX, BlockDimY, BlockDimZ}; 475 476 Metadata *Elements[] = { 477 ValueAsMetadata::get(&F), MDString::get(M->getContext(), "maxntidx"), 478 ValueAsMetadata::get(V[0]), MDString::get(M->getContext(), "maxntidy"), 479 ValueAsMetadata::get(V[1]), MDString::get(M->getContext(), "maxntidz"), 480 ValueAsMetadata::get(V[2]), 481 }; 482 MDNode *Node = MDNode::get(M->getContext(), Elements); 483 AnnotationNode->addOperand(Node); 484 } 485 } 486 487 void GPUNodeBuilder::freeDeviceArrays() { 488 for (auto &Array : DeviceAllocations) 489 createCallFreeDeviceMemory(Array.second); 490 } 491 492 Value *GPUNodeBuilder::createCallGetKernel(Value *Buffer, Value *Entry) { 493 const char *Name = "polly_getKernel"; 494 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 495 Function *F = M->getFunction(Name); 496 497 // If F is not available, declare it. 498 if (!F) { 499 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 500 std::vector<Type *> Args; 501 Args.push_back(Builder.getInt8PtrTy()); 502 Args.push_back(Builder.getInt8PtrTy()); 503 FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false); 504 F = Function::Create(Ty, Linkage, Name, M); 505 } 506 507 return Builder.CreateCall(F, {Buffer, Entry}); 508 } 509 510 Value *GPUNodeBuilder::createCallGetDevicePtr(Value *Allocation) { 511 const char *Name = "polly_getDevicePtr"; 512 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 513 Function *F = M->getFunction(Name); 514 515 // If F is not available, declare it. 516 if (!F) { 517 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 518 std::vector<Type *> Args; 519 Args.push_back(Builder.getInt8PtrTy()); 520 FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false); 521 F = Function::Create(Ty, Linkage, Name, M); 522 } 523 524 return Builder.CreateCall(F, {Allocation}); 525 } 526 527 void GPUNodeBuilder::createCallLaunchKernel(Value *GPUKernel, Value *GridDimX, 528 Value *GridDimY, Value *BlockDimX, 529 Value *BlockDimY, Value *BlockDimZ, 530 Value *Parameters) { 531 const char *Name = "polly_launchKernel"; 532 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 533 Function *F = M->getFunction(Name); 534 535 // If F is not available, declare it. 536 if (!F) { 537 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 538 std::vector<Type *> Args; 539 Args.push_back(Builder.getInt8PtrTy()); 540 Args.push_back(Builder.getInt32Ty()); 541 Args.push_back(Builder.getInt32Ty()); 542 Args.push_back(Builder.getInt32Ty()); 543 Args.push_back(Builder.getInt32Ty()); 544 Args.push_back(Builder.getInt32Ty()); 545 Args.push_back(Builder.getInt8PtrTy()); 546 FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); 547 F = Function::Create(Ty, Linkage, Name, M); 548 } 549 550 Builder.CreateCall(F, {GPUKernel, GridDimX, GridDimY, BlockDimX, BlockDimY, 551 BlockDimZ, Parameters}); 552 } 553 554 void GPUNodeBuilder::createCallFreeKernel(Value *GPUKernel) { 555 const char *Name = "polly_freeKernel"; 556 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 557 Function *F = M->getFunction(Name); 558 559 // If F is not available, declare it. 560 if (!F) { 561 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 562 std::vector<Type *> Args; 563 Args.push_back(Builder.getInt8PtrTy()); 564 FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); 565 F = Function::Create(Ty, Linkage, Name, M); 566 } 567 568 Builder.CreateCall(F, {GPUKernel}); 569 } 570 571 void GPUNodeBuilder::createCallFreeDeviceMemory(Value *Array) { 572 const char *Name = "polly_freeDeviceMemory"; 573 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 574 Function *F = M->getFunction(Name); 575 576 // If F is not available, declare it. 577 if (!F) { 578 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 579 std::vector<Type *> Args; 580 Args.push_back(Builder.getInt8PtrTy()); 581 FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); 582 F = Function::Create(Ty, Linkage, Name, M); 583 } 584 585 Builder.CreateCall(F, {Array}); 586 } 587 588 Value *GPUNodeBuilder::createCallAllocateMemoryForDevice(Value *Size) { 589 const char *Name = "polly_allocateMemoryForDevice"; 590 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 591 Function *F = M->getFunction(Name); 592 593 // If F is not available, declare it. 594 if (!F) { 595 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 596 std::vector<Type *> Args; 597 Args.push_back(Builder.getInt64Ty()); 598 FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false); 599 F = Function::Create(Ty, Linkage, Name, M); 600 } 601 602 return Builder.CreateCall(F, {Size}); 603 } 604 605 void GPUNodeBuilder::createCallCopyFromHostToDevice(Value *HostData, 606 Value *DeviceData, 607 Value *Size) { 608 const char *Name = "polly_copyFromHostToDevice"; 609 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 610 Function *F = M->getFunction(Name); 611 612 // If F is not available, declare it. 613 if (!F) { 614 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 615 std::vector<Type *> Args; 616 Args.push_back(Builder.getInt8PtrTy()); 617 Args.push_back(Builder.getInt8PtrTy()); 618 Args.push_back(Builder.getInt64Ty()); 619 FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); 620 F = Function::Create(Ty, Linkage, Name, M); 621 } 622 623 Builder.CreateCall(F, {HostData, DeviceData, Size}); 624 } 625 626 void GPUNodeBuilder::createCallCopyFromDeviceToHost(Value *DeviceData, 627 Value *HostData, 628 Value *Size) { 629 const char *Name = "polly_copyFromDeviceToHost"; 630 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 631 Function *F = M->getFunction(Name); 632 633 // If F is not available, declare it. 634 if (!F) { 635 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 636 std::vector<Type *> Args; 637 Args.push_back(Builder.getInt8PtrTy()); 638 Args.push_back(Builder.getInt8PtrTy()); 639 Args.push_back(Builder.getInt64Ty()); 640 FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); 641 F = Function::Create(Ty, Linkage, Name, M); 642 } 643 644 Builder.CreateCall(F, {DeviceData, HostData, Size}); 645 } 646 647 Value *GPUNodeBuilder::createCallInitContext() { 648 const char *Name = "polly_initContext"; 649 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 650 Function *F = M->getFunction(Name); 651 652 // If F is not available, declare it. 653 if (!F) { 654 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 655 std::vector<Type *> Args; 656 FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false); 657 F = Function::Create(Ty, Linkage, Name, M); 658 } 659 660 return Builder.CreateCall(F, {}); 661 } 662 663 void GPUNodeBuilder::createCallFreeContext(Value *Context) { 664 const char *Name = "polly_freeContext"; 665 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 666 Function *F = M->getFunction(Name); 667 668 // If F is not available, declare it. 669 if (!F) { 670 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 671 std::vector<Type *> Args; 672 Args.push_back(Builder.getInt8PtrTy()); 673 FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); 674 F = Function::Create(Ty, Linkage, Name, M); 675 } 676 677 Builder.CreateCall(F, {Context}); 678 } 679 680 /// Check if one string is a prefix of another. 681 /// 682 /// @param String The string in which to look for the prefix. 683 /// @param Prefix The prefix to look for. 684 static bool isPrefix(std::string String, std::string Prefix) { 685 return String.find(Prefix) == 0; 686 } 687 688 Value *GPUNodeBuilder::getArraySize(gpu_array_info *Array) { 689 isl_ast_build *Build = isl_ast_build_from_context(S.getContext()); 690 Value *ArraySize = ConstantInt::get(Builder.getInt64Ty(), Array->size); 691 692 if (!gpu_array_is_scalar(Array)) { 693 auto OffsetDimZero = isl_pw_aff_copy(Array->bound[0]); 694 isl_ast_expr *Res = isl_ast_build_expr_from_pw_aff(Build, OffsetDimZero); 695 696 for (unsigned int i = 1; i < Array->n_index; i++) { 697 isl_pw_aff *Bound_I = isl_pw_aff_copy(Array->bound[i]); 698 isl_ast_expr *Expr = isl_ast_build_expr_from_pw_aff(Build, Bound_I); 699 Res = isl_ast_expr_mul(Res, Expr); 700 } 701 702 Value *NumElements = ExprBuilder.create(Res); 703 ArraySize = Builder.CreateMul(ArraySize, NumElements); 704 } 705 isl_ast_build_free(Build); 706 return ArraySize; 707 } 708 709 void GPUNodeBuilder::createDataTransfer(__isl_take isl_ast_node *TransferStmt, 710 enum DataDirection Direction) { 711 isl_ast_expr *Expr = isl_ast_node_user_get_expr(TransferStmt); 712 isl_ast_expr *Arg = isl_ast_expr_get_op_arg(Expr, 0); 713 isl_id *Id = isl_ast_expr_get_id(Arg); 714 auto Array = (gpu_array_info *)isl_id_get_user(Id); 715 auto ScopArray = (ScopArrayInfo *)(Array->user); 716 717 Value *Size = getArraySize(Array); 718 Value *HostPtr = ScopArray->getBasePtr(); 719 720 Value *DevPtr = DeviceAllocations[ScopArray]; 721 722 if (gpu_array_is_scalar(Array)) { 723 HostPtr = Builder.CreateAlloca(ScopArray->getElementType()); 724 Builder.CreateStore(ScopArray->getBasePtr(), HostPtr); 725 } 726 727 HostPtr = Builder.CreatePointerCast(HostPtr, Builder.getInt8PtrTy()); 728 729 if (Direction == HOST_TO_DEVICE) 730 createCallCopyFromHostToDevice(HostPtr, DevPtr, Size); 731 else 732 createCallCopyFromDeviceToHost(DevPtr, HostPtr, Size); 733 734 isl_id_free(Id); 735 isl_ast_expr_free(Arg); 736 isl_ast_expr_free(Expr); 737 isl_ast_node_free(TransferStmt); 738 } 739 740 void GPUNodeBuilder::createUser(__isl_take isl_ast_node *UserStmt) { 741 isl_ast_expr *Expr = isl_ast_node_user_get_expr(UserStmt); 742 isl_ast_expr *StmtExpr = isl_ast_expr_get_op_arg(Expr, 0); 743 isl_id *Id = isl_ast_expr_get_id(StmtExpr); 744 isl_id_free(Id); 745 isl_ast_expr_free(StmtExpr); 746 747 const char *Str = isl_id_get_name(Id); 748 if (!strcmp(Str, "kernel")) { 749 createKernel(UserStmt); 750 isl_ast_expr_free(Expr); 751 return; 752 } 753 754 if (isPrefix(Str, "to_device")) { 755 createDataTransfer(UserStmt, HOST_TO_DEVICE); 756 isl_ast_expr_free(Expr); 757 return; 758 } 759 760 if (isPrefix(Str, "from_device")) { 761 createDataTransfer(UserStmt, DEVICE_TO_HOST); 762 isl_ast_expr_free(Expr); 763 return; 764 } 765 766 isl_id *Anno = isl_ast_node_get_annotation(UserStmt); 767 struct ppcg_kernel_stmt *KernelStmt = 768 (struct ppcg_kernel_stmt *)isl_id_get_user(Anno); 769 isl_id_free(Anno); 770 771 switch (KernelStmt->type) { 772 case ppcg_kernel_domain: 773 createScopStmt(Expr, KernelStmt); 774 isl_ast_node_free(UserStmt); 775 return; 776 case ppcg_kernel_copy: 777 createKernelCopy(KernelStmt); 778 isl_ast_expr_free(Expr); 779 isl_ast_node_free(UserStmt); 780 return; 781 case ppcg_kernel_sync: 782 createKernelSync(); 783 isl_ast_expr_free(Expr); 784 isl_ast_node_free(UserStmt); 785 return; 786 } 787 788 isl_ast_expr_free(Expr); 789 isl_ast_node_free(UserStmt); 790 return; 791 } 792 void GPUNodeBuilder::createKernelCopy(ppcg_kernel_stmt *KernelStmt) { 793 isl_ast_expr *LocalIndex = isl_ast_expr_copy(KernelStmt->u.c.local_index); 794 LocalIndex = isl_ast_expr_address_of(LocalIndex); 795 Value *LocalAddr = ExprBuilder.create(LocalIndex); 796 isl_ast_expr *Index = isl_ast_expr_copy(KernelStmt->u.c.index); 797 Index = isl_ast_expr_address_of(Index); 798 Value *GlobalAddr = ExprBuilder.create(Index); 799 800 if (KernelStmt->u.c.read) { 801 LoadInst *Load = Builder.CreateLoad(GlobalAddr, "shared.read"); 802 Builder.CreateStore(Load, LocalAddr); 803 } else { 804 LoadInst *Load = Builder.CreateLoad(LocalAddr, "shared.write"); 805 Builder.CreateStore(Load, GlobalAddr); 806 } 807 } 808 809 void GPUNodeBuilder::createScopStmt(isl_ast_expr *Expr, 810 ppcg_kernel_stmt *KernelStmt) { 811 auto Stmt = (ScopStmt *)KernelStmt->u.d.stmt->stmt; 812 isl_id_to_ast_expr *Indexes = KernelStmt->u.d.ref2expr; 813 814 LoopToScevMapT LTS; 815 LTS.insert(OutsideLoopIterations.begin(), OutsideLoopIterations.end()); 816 817 createSubstitutions(Expr, Stmt, LTS); 818 819 if (Stmt->isBlockStmt()) 820 BlockGen.copyStmt(*Stmt, LTS, Indexes); 821 else 822 assert(0 && "Region statement not supported\n"); 823 } 824 825 void GPUNodeBuilder::createKernelSync() { 826 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 827 auto *Sync = Intrinsic::getDeclaration(M, Intrinsic::nvvm_barrier0); 828 Builder.CreateCall(Sync, {}); 829 } 830 831 /// Collect llvm::Values referenced from @p Node 832 /// 833 /// This function only applies to isl_ast_nodes that are user_nodes referring 834 /// to a ScopStmt. All other node types are ignore. 835 /// 836 /// @param Node The node to collect references for. 837 /// @param User A user pointer used as storage for the data that is collected. 838 /// 839 /// @returns isl_bool_true if data could be collected successfully. 840 isl_bool collectReferencesInGPUStmt(__isl_keep isl_ast_node *Node, void *User) { 841 if (isl_ast_node_get_type(Node) != isl_ast_node_user) 842 return isl_bool_true; 843 844 isl_ast_expr *Expr = isl_ast_node_user_get_expr(Node); 845 isl_ast_expr *StmtExpr = isl_ast_expr_get_op_arg(Expr, 0); 846 isl_id *Id = isl_ast_expr_get_id(StmtExpr); 847 const char *Str = isl_id_get_name(Id); 848 isl_id_free(Id); 849 isl_ast_expr_free(StmtExpr); 850 isl_ast_expr_free(Expr); 851 852 if (!isPrefix(Str, "Stmt")) 853 return isl_bool_true; 854 855 Id = isl_ast_node_get_annotation(Node); 856 auto *KernelStmt = (ppcg_kernel_stmt *)isl_id_get_user(Id); 857 auto Stmt = (ScopStmt *)KernelStmt->u.d.stmt->stmt; 858 isl_id_free(Id); 859 860 addReferencesFromStmt(Stmt, User, false /* CreateScalarRefs */); 861 862 return isl_bool_true; 863 } 864 865 SetVector<Value *> GPUNodeBuilder::getReferencesInKernel(ppcg_kernel *Kernel) { 866 SetVector<Value *> SubtreeValues; 867 SetVector<const SCEV *> SCEVs; 868 SetVector<const Loop *> Loops; 869 SubtreeReferences References = { 870 LI, SE, S, ValueMap, SubtreeValues, SCEVs, getBlockGenerator()}; 871 872 for (const auto &I : IDToValue) 873 SubtreeValues.insert(I.second); 874 875 isl_ast_node_foreach_descendant_top_down( 876 Kernel->tree, collectReferencesInGPUStmt, &References); 877 878 for (const SCEV *Expr : SCEVs) 879 findValues(Expr, SE, SubtreeValues); 880 881 for (auto &SAI : S.arrays()) 882 SubtreeValues.remove(SAI->getBasePtr()); 883 884 isl_space *Space = S.getParamSpace(); 885 for (long i = 0; i < isl_space_dim(Space, isl_dim_param); i++) { 886 isl_id *Id = isl_space_get_dim_id(Space, isl_dim_param, i); 887 assert(IDToValue.count(Id)); 888 Value *Val = IDToValue[Id]; 889 SubtreeValues.remove(Val); 890 isl_id_free(Id); 891 } 892 isl_space_free(Space); 893 894 for (long i = 0; i < isl_space_dim(Kernel->space, isl_dim_set); i++) { 895 isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i); 896 assert(IDToValue.count(Id)); 897 Value *Val = IDToValue[Id]; 898 SubtreeValues.remove(Val); 899 isl_id_free(Id); 900 } 901 902 return SubtreeValues; 903 } 904 905 void GPUNodeBuilder::clearDominators(Function *F) { 906 DomTreeNode *N = DT.getNode(&F->getEntryBlock()); 907 std::vector<BasicBlock *> Nodes; 908 for (po_iterator<DomTreeNode *> I = po_begin(N), E = po_end(N); I != E; ++I) 909 Nodes.push_back(I->getBlock()); 910 911 for (BasicBlock *BB : Nodes) 912 DT.eraseNode(BB); 913 } 914 915 void GPUNodeBuilder::clearScalarEvolution(Function *F) { 916 for (BasicBlock &BB : *F) { 917 Loop *L = LI.getLoopFor(&BB); 918 if (L) 919 SE.forgetLoop(L); 920 } 921 } 922 923 void GPUNodeBuilder::clearLoops(Function *F) { 924 for (BasicBlock &BB : *F) { 925 Loop *L = LI.getLoopFor(&BB); 926 if (L) 927 SE.forgetLoop(L); 928 LI.removeBlock(&BB); 929 } 930 } 931 932 std::tuple<Value *, Value *> GPUNodeBuilder::getGridSizes(ppcg_kernel *Kernel) { 933 std::vector<Value *> Sizes; 934 isl_ast_build *Context = isl_ast_build_from_context(S.getContext()); 935 936 for (long i = 0; i < Kernel->n_grid; i++) { 937 isl_pw_aff *Size = isl_multi_pw_aff_get_pw_aff(Kernel->grid_size, i); 938 isl_ast_expr *GridSize = isl_ast_build_expr_from_pw_aff(Context, Size); 939 Value *Res = ExprBuilder.create(GridSize); 940 Res = Builder.CreateTrunc(Res, Builder.getInt32Ty()); 941 Sizes.push_back(Res); 942 } 943 isl_ast_build_free(Context); 944 945 for (long i = Kernel->n_grid; i < 3; i++) 946 Sizes.push_back(ConstantInt::get(Builder.getInt32Ty(), 1)); 947 948 return std::make_tuple(Sizes[0], Sizes[1]); 949 } 950 951 std::tuple<Value *, Value *, Value *> 952 GPUNodeBuilder::getBlockSizes(ppcg_kernel *Kernel) { 953 std::vector<Value *> Sizes; 954 955 for (long i = 0; i < Kernel->n_block; i++) { 956 Value *Res = ConstantInt::get(Builder.getInt32Ty(), Kernel->block_dim[i]); 957 Sizes.push_back(Res); 958 } 959 960 for (long i = Kernel->n_block; i < 3; i++) 961 Sizes.push_back(ConstantInt::get(Builder.getInt32Ty(), 1)); 962 963 return std::make_tuple(Sizes[0], Sizes[1], Sizes[2]); 964 } 965 966 Value * 967 GPUNodeBuilder::createLaunchParameters(ppcg_kernel *Kernel, Function *F, 968 SetVector<Value *> SubtreeValues) { 969 Type *ArrayTy = ArrayType::get(Builder.getInt8PtrTy(), 970 std::distance(F->arg_begin(), F->arg_end())); 971 972 BasicBlock *EntryBlock = 973 &Builder.GetInsertBlock()->getParent()->getEntryBlock(); 974 std::string Launch = "polly_launch_" + std::to_string(Kernel->id); 975 Instruction *Parameters = 976 new AllocaInst(ArrayTy, Launch + "_params", EntryBlock->getTerminator()); 977 978 int Index = 0; 979 for (long i = 0; i < Prog->n_array; i++) { 980 if (!ppcg_kernel_requires_array_argument(Kernel, i)) 981 continue; 982 983 isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set); 984 const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(Id); 985 986 Value *DevArray = DeviceAllocations[(ScopArrayInfo *)SAI]; 987 DevArray = createCallGetDevicePtr(DevArray); 988 Instruction *Param = new AllocaInst( 989 Builder.getInt8PtrTy(), Launch + "_param_" + std::to_string(Index), 990 EntryBlock->getTerminator()); 991 Builder.CreateStore(DevArray, Param); 992 Value *Slot = Builder.CreateGEP( 993 Parameters, {Builder.getInt64(0), Builder.getInt64(Index)}); 994 Value *ParamTyped = 995 Builder.CreatePointerCast(Param, Builder.getInt8PtrTy()); 996 Builder.CreateStore(ParamTyped, Slot); 997 Index++; 998 } 999 1000 int NumHostIters = isl_space_dim(Kernel->space, isl_dim_set); 1001 1002 for (long i = 0; i < NumHostIters; i++) { 1003 isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i); 1004 Value *Val = IDToValue[Id]; 1005 isl_id_free(Id); 1006 Instruction *Param = new AllocaInst( 1007 Val->getType(), Launch + "_param_" + std::to_string(Index), 1008 EntryBlock->getTerminator()); 1009 Builder.CreateStore(Val, Param); 1010 Value *Slot = Builder.CreateGEP( 1011 Parameters, {Builder.getInt64(0), Builder.getInt64(Index)}); 1012 Value *ParamTyped = 1013 Builder.CreatePointerCast(Param, Builder.getInt8PtrTy()); 1014 Builder.CreateStore(ParamTyped, Slot); 1015 Index++; 1016 } 1017 1018 int NumVars = isl_space_dim(Kernel->space, isl_dim_param); 1019 1020 for (long i = 0; i < NumVars; i++) { 1021 isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_param, i); 1022 Value *Val = IDToValue[Id]; 1023 isl_id_free(Id); 1024 Instruction *Param = new AllocaInst( 1025 Val->getType(), Launch + "_param_" + std::to_string(Index), 1026 EntryBlock->getTerminator()); 1027 Builder.CreateStore(Val, Param); 1028 Value *Slot = Builder.CreateGEP( 1029 Parameters, {Builder.getInt64(0), Builder.getInt64(Index)}); 1030 Value *ParamTyped = 1031 Builder.CreatePointerCast(Param, Builder.getInt8PtrTy()); 1032 Builder.CreateStore(ParamTyped, Slot); 1033 Index++; 1034 } 1035 1036 for (auto Val : SubtreeValues) { 1037 Instruction *Param = new AllocaInst( 1038 Val->getType(), Launch + "_param_" + std::to_string(Index), 1039 EntryBlock->getTerminator()); 1040 Builder.CreateStore(Val, Param); 1041 Value *Slot = Builder.CreateGEP( 1042 Parameters, {Builder.getInt64(0), Builder.getInt64(Index)}); 1043 Value *ParamTyped = 1044 Builder.CreatePointerCast(Param, Builder.getInt8PtrTy()); 1045 Builder.CreateStore(ParamTyped, Slot); 1046 Index++; 1047 } 1048 1049 auto Location = EntryBlock->getTerminator(); 1050 return new BitCastInst(Parameters, Builder.getInt8PtrTy(), 1051 Launch + "_params_i8ptr", Location); 1052 } 1053 1054 void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) { 1055 isl_id *Id = isl_ast_node_get_annotation(KernelStmt); 1056 ppcg_kernel *Kernel = (ppcg_kernel *)isl_id_get_user(Id); 1057 isl_id_free(Id); 1058 isl_ast_node_free(KernelStmt); 1059 1060 Value *BlockDimX, *BlockDimY, *BlockDimZ; 1061 std::tie(BlockDimX, BlockDimY, BlockDimZ) = getBlockSizes(Kernel); 1062 1063 SetVector<Value *> SubtreeValues = getReferencesInKernel(Kernel); 1064 1065 assert(Kernel->tree && "Device AST of kernel node is empty"); 1066 1067 Instruction &HostInsertPoint = *Builder.GetInsertPoint(); 1068 IslExprBuilder::IDToValueTy HostIDs = IDToValue; 1069 ValueMapT HostValueMap = ValueMap; 1070 1071 SetVector<const Loop *> Loops; 1072 1073 // Create for all loops we depend on values that contain the current loop 1074 // iteration. These values are necessary to generate code for SCEVs that 1075 // depend on such loops. As a result we need to pass them to the subfunction. 1076 for (const Loop *L : Loops) { 1077 const SCEV *OuterLIV = SE.getAddRecExpr(SE.getUnknown(Builder.getInt64(0)), 1078 SE.getUnknown(Builder.getInt64(1)), 1079 L, SCEV::FlagAnyWrap); 1080 Value *V = generateSCEV(OuterLIV); 1081 OutsideLoopIterations[L] = SE.getUnknown(V); 1082 SubtreeValues.insert(V); 1083 } 1084 1085 createKernelFunction(Kernel, SubtreeValues); 1086 1087 create(isl_ast_node_copy(Kernel->tree)); 1088 1089 Function *F = Builder.GetInsertBlock()->getParent(); 1090 addCUDAAnnotations(F->getParent(), BlockDimX, BlockDimY, BlockDimZ); 1091 clearDominators(F); 1092 clearScalarEvolution(F); 1093 clearLoops(F); 1094 1095 Builder.SetInsertPoint(&HostInsertPoint); 1096 IDToValue = HostIDs; 1097 1098 ValueMap = HostValueMap; 1099 ScalarMap.clear(); 1100 PHIOpMap.clear(); 1101 EscapeMap.clear(); 1102 IDToSAI.clear(); 1103 Annotator.resetAlternativeAliasBases(); 1104 for (auto &BasePtr : LocalArrays) 1105 S.invalidateScopArrayInfo(BasePtr, ScopArrayInfo::MK_Array); 1106 LocalArrays.clear(); 1107 1108 Value *Parameters = createLaunchParameters(Kernel, F, SubtreeValues); 1109 1110 std::string ASMString = finalizeKernelFunction(); 1111 std::string Name = "kernel_" + std::to_string(Kernel->id); 1112 Value *KernelString = Builder.CreateGlobalStringPtr(ASMString, Name); 1113 Value *NameString = Builder.CreateGlobalStringPtr(Name, Name + "_name"); 1114 Value *GPUKernel = createCallGetKernel(KernelString, NameString); 1115 1116 Value *GridDimX, *GridDimY; 1117 std::tie(GridDimX, GridDimY) = getGridSizes(Kernel); 1118 1119 createCallLaunchKernel(GPUKernel, GridDimX, GridDimY, BlockDimX, BlockDimY, 1120 BlockDimZ, Parameters); 1121 createCallFreeKernel(GPUKernel); 1122 1123 for (auto Id : KernelIds) 1124 isl_id_free(Id); 1125 1126 KernelIds.clear(); 1127 } 1128 1129 /// Compute the DataLayout string for the NVPTX backend. 1130 /// 1131 /// @param is64Bit Are we looking for a 64 bit architecture? 1132 static std::string computeNVPTXDataLayout(bool is64Bit) { 1133 std::string Ret = "e"; 1134 1135 if (!is64Bit) 1136 Ret += "-p:32:32"; 1137 1138 Ret += "-i64:64-v16:16-v32:32-n16:32:64"; 1139 1140 return Ret; 1141 } 1142 1143 Function * 1144 GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel, 1145 SetVector<Value *> &SubtreeValues) { 1146 std::vector<Type *> Args; 1147 std::string Identifier = "kernel_" + std::to_string(Kernel->id); 1148 1149 for (long i = 0; i < Prog->n_array; i++) { 1150 if (!ppcg_kernel_requires_array_argument(Kernel, i)) 1151 continue; 1152 1153 Args.push_back(Builder.getInt8PtrTy()); 1154 } 1155 1156 int NumHostIters = isl_space_dim(Kernel->space, isl_dim_set); 1157 1158 for (long i = 0; i < NumHostIters; i++) 1159 Args.push_back(Builder.getInt64Ty()); 1160 1161 int NumVars = isl_space_dim(Kernel->space, isl_dim_param); 1162 1163 for (long i = 0; i < NumVars; i++) 1164 Args.push_back(Builder.getInt64Ty()); 1165 1166 for (auto *V : SubtreeValues) 1167 Args.push_back(V->getType()); 1168 1169 auto *FT = FunctionType::get(Builder.getVoidTy(), Args, false); 1170 auto *FN = Function::Create(FT, Function::ExternalLinkage, Identifier, 1171 GPUModule.get()); 1172 FN->setCallingConv(CallingConv::PTX_Kernel); 1173 1174 auto Arg = FN->arg_begin(); 1175 for (long i = 0; i < Kernel->n_array; i++) { 1176 if (!ppcg_kernel_requires_array_argument(Kernel, i)) 1177 continue; 1178 1179 Arg->setName(Kernel->array[i].array->name); 1180 1181 isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set); 1182 const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl_id_copy(Id)); 1183 Type *EleTy = SAI->getElementType(); 1184 Value *Val = &*Arg; 1185 SmallVector<const SCEV *, 4> Sizes; 1186 isl_ast_build *Build = 1187 isl_ast_build_from_context(isl_set_copy(Prog->context)); 1188 for (long j = 1; j < Kernel->array[i].array->n_index; j++) { 1189 isl_ast_expr *DimSize = isl_ast_build_expr_from_pw_aff( 1190 Build, isl_pw_aff_copy(Kernel->array[i].array->bound[j])); 1191 auto V = ExprBuilder.create(DimSize); 1192 Sizes.push_back(SE.getSCEV(V)); 1193 } 1194 const ScopArrayInfo *SAIRep = 1195 S.getOrCreateScopArrayInfo(Val, EleTy, Sizes, ScopArrayInfo::MK_Array); 1196 LocalArrays.push_back(Val); 1197 1198 isl_ast_build_free(Build); 1199 KernelIds.push_back(Id); 1200 IDToSAI[Id] = SAIRep; 1201 Arg++; 1202 } 1203 1204 for (long i = 0; i < NumHostIters; i++) { 1205 isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i); 1206 Arg->setName(isl_id_get_name(Id)); 1207 IDToValue[Id] = &*Arg; 1208 KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id)); 1209 Arg++; 1210 } 1211 1212 for (long i = 0; i < NumVars; i++) { 1213 isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_param, i); 1214 Arg->setName(isl_id_get_name(Id)); 1215 IDToValue[Id] = &*Arg; 1216 KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id)); 1217 Arg++; 1218 } 1219 1220 for (auto *V : SubtreeValues) { 1221 Arg->setName(V->getName()); 1222 ValueMap[V] = &*Arg; 1223 Arg++; 1224 } 1225 1226 return FN; 1227 } 1228 1229 void GPUNodeBuilder::insertKernelIntrinsics(ppcg_kernel *Kernel) { 1230 Intrinsic::ID IntrinsicsBID[] = {Intrinsic::nvvm_read_ptx_sreg_ctaid_x, 1231 Intrinsic::nvvm_read_ptx_sreg_ctaid_y}; 1232 1233 Intrinsic::ID IntrinsicsTID[] = {Intrinsic::nvvm_read_ptx_sreg_tid_x, 1234 Intrinsic::nvvm_read_ptx_sreg_tid_y, 1235 Intrinsic::nvvm_read_ptx_sreg_tid_z}; 1236 1237 auto addId = [this](__isl_take isl_id *Id, Intrinsic::ID Intr) mutable { 1238 std::string Name = isl_id_get_name(Id); 1239 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 1240 Function *IntrinsicFn = Intrinsic::getDeclaration(M, Intr); 1241 Value *Val = Builder.CreateCall(IntrinsicFn, {}); 1242 Val = Builder.CreateIntCast(Val, Builder.getInt64Ty(), false, Name); 1243 IDToValue[Id] = Val; 1244 KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id)); 1245 }; 1246 1247 for (int i = 0; i < Kernel->n_grid; ++i) { 1248 isl_id *Id = isl_id_list_get_id(Kernel->block_ids, i); 1249 addId(Id, IntrinsicsBID[i]); 1250 } 1251 1252 for (int i = 0; i < Kernel->n_block; ++i) { 1253 isl_id *Id = isl_id_list_get_id(Kernel->thread_ids, i); 1254 addId(Id, IntrinsicsTID[i]); 1255 } 1256 } 1257 1258 void GPUNodeBuilder::prepareKernelArguments(ppcg_kernel *Kernel, Function *FN) { 1259 auto Arg = FN->arg_begin(); 1260 for (long i = 0; i < Kernel->n_array; i++) { 1261 if (!ppcg_kernel_requires_array_argument(Kernel, i)) 1262 continue; 1263 1264 isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set); 1265 const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl_id_copy(Id)); 1266 isl_id_free(Id); 1267 1268 if (SAI->getNumberOfDimensions() > 0) { 1269 Arg++; 1270 continue; 1271 } 1272 1273 Value *Alloca = BlockGen.getOrCreateScalarAlloca(SAI->getBasePtr()); 1274 Value *ArgPtr = &*Arg; 1275 Type *TypePtr = SAI->getElementType()->getPointerTo(); 1276 Value *TypedArgPtr = Builder.CreatePointerCast(ArgPtr, TypePtr); 1277 Value *Val = Builder.CreateLoad(TypedArgPtr); 1278 Builder.CreateStore(Val, Alloca); 1279 1280 Arg++; 1281 } 1282 } 1283 1284 void GPUNodeBuilder::createKernelVariables(ppcg_kernel *Kernel, Function *FN) { 1285 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 1286 1287 for (int i = 0; i < Kernel->n_var; ++i) { 1288 struct ppcg_kernel_var &Var = Kernel->var[i]; 1289 isl_id *Id = isl_space_get_tuple_id(Var.array->space, isl_dim_set); 1290 Type *EleTy = ScopArrayInfo::getFromId(Id)->getElementType(); 1291 1292 Type *ArrayTy = EleTy; 1293 SmallVector<const SCEV *, 4> Sizes; 1294 1295 for (unsigned int j = 1; j < Var.array->n_index; ++j) { 1296 isl_val *Val = isl_vec_get_element_val(Var.size, j); 1297 long Bound = isl_val_get_num_si(Val); 1298 isl_val_free(Val); 1299 Sizes.push_back(S.getSE()->getConstant(Builder.getInt64Ty(), Bound)); 1300 } 1301 1302 for (int j = Var.array->n_index - 1; j >= 0; --j) { 1303 isl_val *Val = isl_vec_get_element_val(Var.size, j); 1304 long Bound = isl_val_get_num_si(Val); 1305 isl_val_free(Val); 1306 ArrayTy = ArrayType::get(ArrayTy, Bound); 1307 } 1308 1309 const ScopArrayInfo *SAI; 1310 Value *Allocation; 1311 if (Var.type == ppcg_access_shared) { 1312 auto GlobalVar = new GlobalVariable( 1313 *M, ArrayTy, false, GlobalValue::InternalLinkage, 0, Var.name, 1314 nullptr, GlobalValue::ThreadLocalMode::NotThreadLocal, 3); 1315 GlobalVar->setAlignment(EleTy->getPrimitiveSizeInBits() / 8); 1316 GlobalVar->setInitializer(Constant::getNullValue(ArrayTy)); 1317 1318 Allocation = GlobalVar; 1319 } else if (Var.type == ppcg_access_private) { 1320 Allocation = Builder.CreateAlloca(ArrayTy, 0, "private_array"); 1321 } else { 1322 llvm_unreachable("unknown variable type"); 1323 } 1324 SAI = S.getOrCreateScopArrayInfo(Allocation, EleTy, Sizes, 1325 ScopArrayInfo::MK_Array); 1326 Id = isl_id_alloc(S.getIslCtx(), Var.name, nullptr); 1327 IDToValue[Id] = Allocation; 1328 LocalArrays.push_back(Allocation); 1329 KernelIds.push_back(Id); 1330 IDToSAI[Id] = SAI; 1331 } 1332 } 1333 1334 void GPUNodeBuilder::createKernelFunction(ppcg_kernel *Kernel, 1335 SetVector<Value *> &SubtreeValues) { 1336 1337 std::string Identifier = "kernel_" + std::to_string(Kernel->id); 1338 GPUModule.reset(new Module(Identifier, Builder.getContext())); 1339 GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-cuda")); 1340 GPUModule->setDataLayout(computeNVPTXDataLayout(true /* is64Bit */)); 1341 1342 Function *FN = createKernelFunctionDecl(Kernel, SubtreeValues); 1343 1344 BasicBlock *PrevBlock = Builder.GetInsertBlock(); 1345 auto EntryBlock = BasicBlock::Create(Builder.getContext(), "entry", FN); 1346 1347 DominatorTree &DT = P->getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 1348 DT.addNewBlock(EntryBlock, PrevBlock); 1349 1350 Builder.SetInsertPoint(EntryBlock); 1351 Builder.CreateRetVoid(); 1352 Builder.SetInsertPoint(EntryBlock, EntryBlock->begin()); 1353 1354 ScopDetection::markFunctionAsInvalid(FN); 1355 1356 prepareKernelArguments(Kernel, FN); 1357 createKernelVariables(Kernel, FN); 1358 insertKernelIntrinsics(Kernel); 1359 } 1360 1361 std::string GPUNodeBuilder::createKernelASM() { 1362 llvm::Triple GPUTriple(Triple::normalize("nvptx64-nvidia-cuda")); 1363 std::string ErrMsg; 1364 auto GPUTarget = TargetRegistry::lookupTarget(GPUTriple.getTriple(), ErrMsg); 1365 1366 if (!GPUTarget) { 1367 errs() << ErrMsg << "\n"; 1368 return ""; 1369 } 1370 1371 TargetOptions Options; 1372 Options.UnsafeFPMath = FastMath; 1373 std::unique_ptr<TargetMachine> TargetM( 1374 GPUTarget->createTargetMachine(GPUTriple.getTriple(), CudaVersion, "", 1375 Options, Optional<Reloc::Model>())); 1376 1377 SmallString<0> ASMString; 1378 raw_svector_ostream ASMStream(ASMString); 1379 llvm::legacy::PassManager PM; 1380 1381 PM.add(createTargetTransformInfoWrapperPass(TargetM->getTargetIRAnalysis())); 1382 1383 if (TargetM->addPassesToEmitFile( 1384 PM, ASMStream, TargetMachine::CGFT_AssemblyFile, true /* verify */)) { 1385 errs() << "The target does not support generation of this file type!\n"; 1386 return ""; 1387 } 1388 1389 PM.run(*GPUModule); 1390 1391 return ASMStream.str(); 1392 } 1393 1394 std::string GPUNodeBuilder::finalizeKernelFunction() { 1395 // Verify module. 1396 llvm::legacy::PassManager Passes; 1397 Passes.add(createVerifierPass()); 1398 Passes.run(*GPUModule); 1399 1400 if (DumpKernelIR) 1401 outs() << *GPUModule << "\n"; 1402 1403 // Optimize module. 1404 llvm::legacy::PassManager OptPasses; 1405 PassManagerBuilder PassBuilder; 1406 PassBuilder.OptLevel = 3; 1407 PassBuilder.SizeLevel = 0; 1408 PassBuilder.populateModulePassManager(OptPasses); 1409 OptPasses.run(*GPUModule); 1410 1411 std::string Assembly = createKernelASM(); 1412 1413 if (DumpKernelASM) 1414 outs() << Assembly << "\n"; 1415 1416 GPUModule.release(); 1417 KernelIDs.clear(); 1418 1419 return Assembly; 1420 } 1421 1422 namespace { 1423 class PPCGCodeGeneration : public ScopPass { 1424 public: 1425 static char ID; 1426 1427 /// The scop that is currently processed. 1428 Scop *S; 1429 1430 LoopInfo *LI; 1431 DominatorTree *DT; 1432 ScalarEvolution *SE; 1433 const DataLayout *DL; 1434 RegionInfo *RI; 1435 1436 PPCGCodeGeneration() : ScopPass(ID) {} 1437 1438 /// Construct compilation options for PPCG. 1439 /// 1440 /// @returns The compilation options. 1441 ppcg_options *createPPCGOptions() { 1442 auto DebugOptions = 1443 (ppcg_debug_options *)malloc(sizeof(ppcg_debug_options)); 1444 auto Options = (ppcg_options *)malloc(sizeof(ppcg_options)); 1445 1446 DebugOptions->dump_schedule_constraints = false; 1447 DebugOptions->dump_schedule = false; 1448 DebugOptions->dump_final_schedule = false; 1449 DebugOptions->dump_sizes = false; 1450 DebugOptions->verbose = false; 1451 1452 Options->debug = DebugOptions; 1453 1454 Options->reschedule = true; 1455 Options->scale_tile_loops = false; 1456 Options->wrap = false; 1457 1458 Options->non_negative_parameters = false; 1459 Options->ctx = nullptr; 1460 Options->sizes = nullptr; 1461 1462 Options->tile_size = 32; 1463 1464 Options->use_private_memory = PrivateMemory; 1465 Options->use_shared_memory = SharedMemory; 1466 Options->max_shared_memory = 48 * 1024; 1467 1468 Options->target = PPCG_TARGET_CUDA; 1469 Options->openmp = false; 1470 Options->linearize_device_arrays = true; 1471 Options->live_range_reordering = false; 1472 1473 Options->opencl_compiler_options = nullptr; 1474 Options->opencl_use_gpu = false; 1475 Options->opencl_n_include_file = 0; 1476 Options->opencl_include_files = nullptr; 1477 Options->opencl_print_kernel_types = false; 1478 Options->opencl_embed_kernel_code = false; 1479 1480 Options->save_schedule_file = nullptr; 1481 Options->load_schedule_file = nullptr; 1482 1483 return Options; 1484 } 1485 1486 /// Get a tagged access relation containing all accesses of type @p AccessTy. 1487 /// 1488 /// Instead of a normal access of the form: 1489 /// 1490 /// Stmt[i,j,k] -> Array[f_0(i,j,k), f_1(i,j,k)] 1491 /// 1492 /// a tagged access has the form 1493 /// 1494 /// [Stmt[i,j,k] -> id[]] -> Array[f_0(i,j,k), f_1(i,j,k)] 1495 /// 1496 /// where 'id' is an additional space that references the memory access that 1497 /// triggered the access. 1498 /// 1499 /// @param AccessTy The type of the memory accesses to collect. 1500 /// 1501 /// @return The relation describing all tagged memory accesses. 1502 isl_union_map *getTaggedAccesses(enum MemoryAccess::AccessType AccessTy) { 1503 isl_union_map *Accesses = isl_union_map_empty(S->getParamSpace()); 1504 1505 for (auto &Stmt : *S) 1506 for (auto &Acc : Stmt) 1507 if (Acc->getType() == AccessTy) { 1508 isl_map *Relation = Acc->getAccessRelation(); 1509 Relation = isl_map_intersect_domain(Relation, Stmt.getDomain()); 1510 1511 isl_space *Space = isl_map_get_space(Relation); 1512 Space = isl_space_range(Space); 1513 Space = isl_space_from_range(Space); 1514 Space = isl_space_set_tuple_id(Space, isl_dim_in, Acc->getId()); 1515 isl_map *Universe = isl_map_universe(Space); 1516 Relation = isl_map_domain_product(Relation, Universe); 1517 Accesses = isl_union_map_add_map(Accesses, Relation); 1518 } 1519 1520 return Accesses; 1521 } 1522 1523 /// Get the set of all read accesses, tagged with the access id. 1524 /// 1525 /// @see getTaggedAccesses 1526 isl_union_map *getTaggedReads() { 1527 return getTaggedAccesses(MemoryAccess::READ); 1528 } 1529 1530 /// Get the set of all may (and must) accesses, tagged with the access id. 1531 /// 1532 /// @see getTaggedAccesses 1533 isl_union_map *getTaggedMayWrites() { 1534 return isl_union_map_union(getTaggedAccesses(MemoryAccess::MAY_WRITE), 1535 getTaggedAccesses(MemoryAccess::MUST_WRITE)); 1536 } 1537 1538 /// Get the set of all must accesses, tagged with the access id. 1539 /// 1540 /// @see getTaggedAccesses 1541 isl_union_map *getTaggedMustWrites() { 1542 return getTaggedAccesses(MemoryAccess::MUST_WRITE); 1543 } 1544 1545 /// Collect parameter and array names as isl_ids. 1546 /// 1547 /// To reason about the different parameters and arrays used, ppcg requires 1548 /// a list of all isl_ids in use. As PPCG traditionally performs 1549 /// source-to-source compilation each of these isl_ids is mapped to the 1550 /// expression that represents it. As we do not have a corresponding 1551 /// expression in Polly, we just map each id to a 'zero' expression to match 1552 /// the data format that ppcg expects. 1553 /// 1554 /// @returns Retun a map from collected ids to 'zero' ast expressions. 1555 __isl_give isl_id_to_ast_expr *getNames() { 1556 auto *Names = isl_id_to_ast_expr_alloc( 1557 S->getIslCtx(), 1558 S->getNumParams() + std::distance(S->array_begin(), S->array_end())); 1559 auto *Zero = isl_ast_expr_from_val(isl_val_zero(S->getIslCtx())); 1560 auto *Space = S->getParamSpace(); 1561 1562 for (int I = 0, E = S->getNumParams(); I < E; ++I) { 1563 isl_id *Id = isl_space_get_dim_id(Space, isl_dim_param, I); 1564 Names = isl_id_to_ast_expr_set(Names, Id, isl_ast_expr_copy(Zero)); 1565 } 1566 1567 for (auto &Array : S->arrays()) { 1568 auto Id = Array->getBasePtrId(); 1569 Names = isl_id_to_ast_expr_set(Names, Id, isl_ast_expr_copy(Zero)); 1570 } 1571 1572 isl_space_free(Space); 1573 isl_ast_expr_free(Zero); 1574 1575 return Names; 1576 } 1577 1578 /// Create a new PPCG scop from the current scop. 1579 /// 1580 /// The PPCG scop is initialized with data from the current polly::Scop. From 1581 /// this initial data, the data-dependences in the PPCG scop are initialized. 1582 /// We do not use Polly's dependence analysis for now, to ensure we match 1583 /// the PPCG default behaviour more closely. 1584 /// 1585 /// @returns A new ppcg scop. 1586 ppcg_scop *createPPCGScop() { 1587 auto PPCGScop = (ppcg_scop *)malloc(sizeof(ppcg_scop)); 1588 1589 PPCGScop->options = createPPCGOptions(); 1590 1591 PPCGScop->start = 0; 1592 PPCGScop->end = 0; 1593 1594 PPCGScop->context = S->getContext(); 1595 PPCGScop->domain = S->getDomains(); 1596 PPCGScop->call = nullptr; 1597 PPCGScop->tagged_reads = getTaggedReads(); 1598 PPCGScop->reads = S->getReads(); 1599 PPCGScop->live_in = nullptr; 1600 PPCGScop->tagged_may_writes = getTaggedMayWrites(); 1601 PPCGScop->may_writes = S->getWrites(); 1602 PPCGScop->tagged_must_writes = getTaggedMustWrites(); 1603 PPCGScop->must_writes = S->getMustWrites(); 1604 PPCGScop->live_out = nullptr; 1605 PPCGScop->tagged_must_kills = isl_union_map_empty(S->getParamSpace()); 1606 PPCGScop->tagger = nullptr; 1607 1608 PPCGScop->independence = nullptr; 1609 PPCGScop->dep_flow = nullptr; 1610 PPCGScop->tagged_dep_flow = nullptr; 1611 PPCGScop->dep_false = nullptr; 1612 PPCGScop->dep_forced = nullptr; 1613 PPCGScop->dep_order = nullptr; 1614 PPCGScop->tagged_dep_order = nullptr; 1615 1616 PPCGScop->schedule = S->getScheduleTree(); 1617 PPCGScop->names = getNames(); 1618 1619 PPCGScop->pet = nullptr; 1620 1621 compute_tagger(PPCGScop); 1622 compute_dependences(PPCGScop); 1623 1624 return PPCGScop; 1625 } 1626 1627 /// Collect the array acesses in a statement. 1628 /// 1629 /// @param Stmt The statement for which to collect the accesses. 1630 /// 1631 /// @returns A list of array accesses. 1632 gpu_stmt_access *getStmtAccesses(ScopStmt &Stmt) { 1633 gpu_stmt_access *Accesses = nullptr; 1634 1635 for (MemoryAccess *Acc : Stmt) { 1636 auto Access = isl_alloc_type(S->getIslCtx(), struct gpu_stmt_access); 1637 Access->read = Acc->isRead(); 1638 Access->write = Acc->isWrite(); 1639 Access->access = Acc->getAccessRelation(); 1640 isl_space *Space = isl_map_get_space(Access->access); 1641 Space = isl_space_range(Space); 1642 Space = isl_space_from_range(Space); 1643 Space = isl_space_set_tuple_id(Space, isl_dim_in, Acc->getId()); 1644 isl_map *Universe = isl_map_universe(Space); 1645 Access->tagged_access = 1646 isl_map_domain_product(Acc->getAccessRelation(), Universe); 1647 Access->exact_write = !Acc->isMayWrite(); 1648 Access->ref_id = Acc->getId(); 1649 Access->next = Accesses; 1650 Access->n_index = Acc->getScopArrayInfo()->getNumberOfDimensions(); 1651 Accesses = Access; 1652 } 1653 1654 return Accesses; 1655 } 1656 1657 /// Collect the list of GPU statements. 1658 /// 1659 /// Each statement has an id, a pointer to the underlying data structure, 1660 /// as well as a list with all memory accesses. 1661 /// 1662 /// TODO: Initialize the list of memory accesses. 1663 /// 1664 /// @returns A linked-list of statements. 1665 gpu_stmt *getStatements() { 1666 gpu_stmt *Stmts = isl_calloc_array(S->getIslCtx(), struct gpu_stmt, 1667 std::distance(S->begin(), S->end())); 1668 1669 int i = 0; 1670 for (auto &Stmt : *S) { 1671 gpu_stmt *GPUStmt = &Stmts[i]; 1672 1673 GPUStmt->id = Stmt.getDomainId(); 1674 1675 // We use the pet stmt pointer to keep track of the Polly statements. 1676 GPUStmt->stmt = (pet_stmt *)&Stmt; 1677 GPUStmt->accesses = getStmtAccesses(Stmt); 1678 i++; 1679 } 1680 1681 return Stmts; 1682 } 1683 1684 /// Derive the extent of an array. 1685 /// 1686 /// The extent of an array is defined by the set of memory locations for 1687 /// which a memory access in the iteration domain exists. 1688 /// 1689 /// @param Array The array to derive the extent for. 1690 /// 1691 /// @returns An isl_set describing the extent of the array. 1692 __isl_give isl_set *getExtent(ScopArrayInfo *Array) { 1693 isl_union_map *Accesses = S->getAccesses(); 1694 Accesses = isl_union_map_intersect_domain(Accesses, S->getDomains()); 1695 isl_union_set *AccessUSet = isl_union_map_range(Accesses); 1696 isl_set *AccessSet = 1697 isl_union_set_extract_set(AccessUSet, Array->getSpace()); 1698 isl_union_set_free(AccessUSet); 1699 1700 return AccessSet; 1701 } 1702 1703 /// Derive the bounds of an array. 1704 /// 1705 /// For the first dimension we derive the bound of the array from the extent 1706 /// of this dimension. For inner dimensions we obtain their size directly from 1707 /// ScopArrayInfo. 1708 /// 1709 /// @param PPCGArray The array to compute bounds for. 1710 /// @param Array The polly array from which to take the information. 1711 void setArrayBounds(gpu_array_info &PPCGArray, ScopArrayInfo *Array) { 1712 if (PPCGArray.n_index > 0) { 1713 isl_set *Dom = isl_set_copy(PPCGArray.extent); 1714 Dom = isl_set_project_out(Dom, isl_dim_set, 1, PPCGArray.n_index - 1); 1715 isl_pw_aff *Bound = isl_set_dim_max(isl_set_copy(Dom), 0); 1716 isl_set_free(Dom); 1717 Dom = isl_pw_aff_domain(isl_pw_aff_copy(Bound)); 1718 isl_local_space *LS = isl_local_space_from_space(isl_set_get_space(Dom)); 1719 isl_aff *One = isl_aff_zero_on_domain(LS); 1720 One = isl_aff_add_constant_si(One, 1); 1721 Bound = isl_pw_aff_add(Bound, isl_pw_aff_alloc(Dom, One)); 1722 Bound = isl_pw_aff_gist(Bound, S->getContext()); 1723 PPCGArray.bound[0] = Bound; 1724 } 1725 1726 for (unsigned i = 1; i < PPCGArray.n_index; ++i) { 1727 isl_pw_aff *Bound = Array->getDimensionSizePw(i); 1728 auto LS = isl_pw_aff_get_domain_space(Bound); 1729 auto Aff = isl_multi_aff_zero(LS); 1730 Bound = isl_pw_aff_pullback_multi_aff(Bound, Aff); 1731 PPCGArray.bound[i] = Bound; 1732 } 1733 } 1734 1735 /// Create the arrays for @p PPCGProg. 1736 /// 1737 /// @param PPCGProg The program to compute the arrays for. 1738 void createArrays(gpu_prog *PPCGProg) { 1739 int i = 0; 1740 for (auto &Array : S->arrays()) { 1741 std::string TypeName; 1742 raw_string_ostream OS(TypeName); 1743 1744 OS << *Array->getElementType(); 1745 TypeName = OS.str(); 1746 1747 gpu_array_info &PPCGArray = PPCGProg->array[i]; 1748 1749 PPCGArray.space = Array->getSpace(); 1750 PPCGArray.type = strdup(TypeName.c_str()); 1751 PPCGArray.size = Array->getElementType()->getPrimitiveSizeInBits() / 8; 1752 PPCGArray.name = strdup(Array->getName().c_str()); 1753 PPCGArray.extent = nullptr; 1754 PPCGArray.n_index = Array->getNumberOfDimensions(); 1755 PPCGArray.bound = 1756 isl_alloc_array(S->getIslCtx(), isl_pw_aff *, PPCGArray.n_index); 1757 PPCGArray.extent = getExtent(Array); 1758 PPCGArray.n_ref = 0; 1759 PPCGArray.refs = nullptr; 1760 PPCGArray.accessed = true; 1761 PPCGArray.read_only_scalar = false; 1762 PPCGArray.has_compound_element = false; 1763 PPCGArray.local = false; 1764 PPCGArray.declare_local = false; 1765 PPCGArray.global = false; 1766 PPCGArray.linearize = false; 1767 PPCGArray.dep_order = nullptr; 1768 PPCGArray.user = Array; 1769 1770 setArrayBounds(PPCGArray, Array); 1771 i++; 1772 1773 collect_references(PPCGProg, &PPCGArray); 1774 } 1775 } 1776 1777 /// Create an identity map between the arrays in the scop. 1778 /// 1779 /// @returns An identity map between the arrays in the scop. 1780 isl_union_map *getArrayIdentity() { 1781 isl_union_map *Maps = isl_union_map_empty(S->getParamSpace()); 1782 1783 for (auto &Array : S->arrays()) { 1784 isl_space *Space = Array->getSpace(); 1785 Space = isl_space_map_from_set(Space); 1786 isl_map *Identity = isl_map_identity(Space); 1787 Maps = isl_union_map_add_map(Maps, Identity); 1788 } 1789 1790 return Maps; 1791 } 1792 1793 /// Create a default-initialized PPCG GPU program. 1794 /// 1795 /// @returns A new gpu grogram description. 1796 gpu_prog *createPPCGProg(ppcg_scop *PPCGScop) { 1797 1798 if (!PPCGScop) 1799 return nullptr; 1800 1801 auto PPCGProg = isl_calloc_type(S->getIslCtx(), struct gpu_prog); 1802 1803 PPCGProg->ctx = S->getIslCtx(); 1804 PPCGProg->scop = PPCGScop; 1805 PPCGProg->context = isl_set_copy(PPCGScop->context); 1806 PPCGProg->read = isl_union_map_copy(PPCGScop->reads); 1807 PPCGProg->may_write = isl_union_map_copy(PPCGScop->may_writes); 1808 PPCGProg->must_write = isl_union_map_copy(PPCGScop->must_writes); 1809 PPCGProg->tagged_must_kill = 1810 isl_union_map_copy(PPCGScop->tagged_must_kills); 1811 PPCGProg->to_inner = getArrayIdentity(); 1812 PPCGProg->to_outer = getArrayIdentity(); 1813 PPCGProg->may_persist = compute_may_persist(PPCGProg); 1814 PPCGProg->any_to_outer = nullptr; 1815 PPCGProg->array_order = nullptr; 1816 PPCGProg->n_stmts = std::distance(S->begin(), S->end()); 1817 PPCGProg->stmts = getStatements(); 1818 PPCGProg->n_array = std::distance(S->array_begin(), S->array_end()); 1819 PPCGProg->array = isl_calloc_array(S->getIslCtx(), struct gpu_array_info, 1820 PPCGProg->n_array); 1821 1822 createArrays(PPCGProg); 1823 1824 return PPCGProg; 1825 } 1826 1827 struct PrintGPUUserData { 1828 struct cuda_info *CudaInfo; 1829 struct gpu_prog *PPCGProg; 1830 std::vector<ppcg_kernel *> Kernels; 1831 }; 1832 1833 /// Print a user statement node in the host code. 1834 /// 1835 /// We use ppcg's printing facilities to print the actual statement and 1836 /// additionally build up a list of all kernels that are encountered in the 1837 /// host ast. 1838 /// 1839 /// @param P The printer to print to 1840 /// @param Options The printing options to use 1841 /// @param Node The node to print 1842 /// @param User A user pointer to carry additional data. This pointer is 1843 /// expected to be of type PrintGPUUserData. 1844 /// 1845 /// @returns A printer to which the output has been printed. 1846 static __isl_give isl_printer * 1847 printHostUser(__isl_take isl_printer *P, 1848 __isl_take isl_ast_print_options *Options, 1849 __isl_take isl_ast_node *Node, void *User) { 1850 auto Data = (struct PrintGPUUserData *)User; 1851 auto Id = isl_ast_node_get_annotation(Node); 1852 1853 if (Id) { 1854 bool IsUser = !strcmp(isl_id_get_name(Id), "user"); 1855 1856 // If this is a user statement, format it ourselves as ppcg would 1857 // otherwise try to call pet functionality that is not available in 1858 // Polly. 1859 if (IsUser) { 1860 P = isl_printer_start_line(P); 1861 P = isl_printer_print_ast_node(P, Node); 1862 P = isl_printer_end_line(P); 1863 isl_id_free(Id); 1864 isl_ast_print_options_free(Options); 1865 return P; 1866 } 1867 1868 auto Kernel = (struct ppcg_kernel *)isl_id_get_user(Id); 1869 isl_id_free(Id); 1870 Data->Kernels.push_back(Kernel); 1871 } 1872 1873 return print_host_user(P, Options, Node, User); 1874 } 1875 1876 /// Print C code corresponding to the control flow in @p Kernel. 1877 /// 1878 /// @param Kernel The kernel to print 1879 void printKernel(ppcg_kernel *Kernel) { 1880 auto *P = isl_printer_to_str(S->getIslCtx()); 1881 P = isl_printer_set_output_format(P, ISL_FORMAT_C); 1882 auto *Options = isl_ast_print_options_alloc(S->getIslCtx()); 1883 P = isl_ast_node_print(Kernel->tree, P, Options); 1884 char *String = isl_printer_get_str(P); 1885 printf("%s\n", String); 1886 free(String); 1887 isl_printer_free(P); 1888 } 1889 1890 /// Print C code corresponding to the GPU code described by @p Tree. 1891 /// 1892 /// @param Tree An AST describing GPU code 1893 /// @param PPCGProg The PPCG program from which @Tree has been constructed. 1894 void printGPUTree(isl_ast_node *Tree, gpu_prog *PPCGProg) { 1895 auto *P = isl_printer_to_str(S->getIslCtx()); 1896 P = isl_printer_set_output_format(P, ISL_FORMAT_C); 1897 1898 PrintGPUUserData Data; 1899 Data.PPCGProg = PPCGProg; 1900 1901 auto *Options = isl_ast_print_options_alloc(S->getIslCtx()); 1902 Options = 1903 isl_ast_print_options_set_print_user(Options, printHostUser, &Data); 1904 P = isl_ast_node_print(Tree, P, Options); 1905 char *String = isl_printer_get_str(P); 1906 printf("# host\n"); 1907 printf("%s\n", String); 1908 free(String); 1909 isl_printer_free(P); 1910 1911 for (auto Kernel : Data.Kernels) { 1912 printf("# kernel%d\n", Kernel->id); 1913 printKernel(Kernel); 1914 } 1915 } 1916 1917 // Generate a GPU program using PPCG. 1918 // 1919 // GPU mapping consists of multiple steps: 1920 // 1921 // 1) Compute new schedule for the program. 1922 // 2) Map schedule to GPU (TODO) 1923 // 3) Generate code for new schedule (TODO) 1924 // 1925 // We do not use here the Polly ScheduleOptimizer, as the schedule optimizer 1926 // is mostly CPU specific. Instead, we use PPCG's GPU code generation 1927 // strategy directly from this pass. 1928 gpu_gen *generateGPU(ppcg_scop *PPCGScop, gpu_prog *PPCGProg) { 1929 1930 auto PPCGGen = isl_calloc_type(S->getIslCtx(), struct gpu_gen); 1931 1932 PPCGGen->ctx = S->getIslCtx(); 1933 PPCGGen->options = PPCGScop->options; 1934 PPCGGen->print = nullptr; 1935 PPCGGen->print_user = nullptr; 1936 PPCGGen->build_ast_expr = &pollyBuildAstExprForStmt; 1937 PPCGGen->prog = PPCGProg; 1938 PPCGGen->tree = nullptr; 1939 PPCGGen->types.n = 0; 1940 PPCGGen->types.name = nullptr; 1941 PPCGGen->sizes = nullptr; 1942 PPCGGen->used_sizes = nullptr; 1943 PPCGGen->kernel_id = 0; 1944 1945 // Set scheduling strategy to same strategy PPCG is using. 1946 isl_options_set_schedule_outer_coincidence(PPCGGen->ctx, true); 1947 isl_options_set_schedule_maximize_band_depth(PPCGGen->ctx, true); 1948 isl_options_set_schedule_whole_component(PPCGGen->ctx, false); 1949 1950 isl_schedule *Schedule = get_schedule(PPCGGen); 1951 1952 int has_permutable = has_any_permutable_node(Schedule); 1953 1954 if (!has_permutable || has_permutable < 0) { 1955 Schedule = isl_schedule_free(Schedule); 1956 } else { 1957 Schedule = map_to_device(PPCGGen, Schedule); 1958 PPCGGen->tree = generate_code(PPCGGen, isl_schedule_copy(Schedule)); 1959 } 1960 1961 if (DumpSchedule) { 1962 isl_printer *P = isl_printer_to_str(S->getIslCtx()); 1963 P = isl_printer_set_yaml_style(P, ISL_YAML_STYLE_BLOCK); 1964 P = isl_printer_print_str(P, "Schedule\n"); 1965 P = isl_printer_print_str(P, "========\n"); 1966 if (Schedule) 1967 P = isl_printer_print_schedule(P, Schedule); 1968 else 1969 P = isl_printer_print_str(P, "No schedule found\n"); 1970 1971 printf("%s\n", isl_printer_get_str(P)); 1972 isl_printer_free(P); 1973 } 1974 1975 if (DumpCode) { 1976 printf("Code\n"); 1977 printf("====\n"); 1978 if (PPCGGen->tree) 1979 printGPUTree(PPCGGen->tree, PPCGProg); 1980 else 1981 printf("No code generated\n"); 1982 } 1983 1984 isl_schedule_free(Schedule); 1985 1986 return PPCGGen; 1987 } 1988 1989 /// Free gpu_gen structure. 1990 /// 1991 /// @param PPCGGen The ppcg_gen object to free. 1992 void freePPCGGen(gpu_gen *PPCGGen) { 1993 isl_ast_node_free(PPCGGen->tree); 1994 isl_union_map_free(PPCGGen->sizes); 1995 isl_union_map_free(PPCGGen->used_sizes); 1996 free(PPCGGen); 1997 } 1998 1999 /// Free the options in the ppcg scop structure. 2000 /// 2001 /// ppcg is not freeing these options for us. To avoid leaks we do this 2002 /// ourselves. 2003 /// 2004 /// @param PPCGScop The scop referencing the options to free. 2005 void freeOptions(ppcg_scop *PPCGScop) { 2006 free(PPCGScop->options->debug); 2007 PPCGScop->options->debug = nullptr; 2008 free(PPCGScop->options); 2009 PPCGScop->options = nullptr; 2010 } 2011 2012 /// Generate code for a given GPU AST described by @p Root. 2013 /// 2014 /// @param Root An isl_ast_node pointing to the root of the GPU AST. 2015 /// @param Prog The GPU Program to generate code for. 2016 void generateCode(__isl_take isl_ast_node *Root, gpu_prog *Prog) { 2017 ScopAnnotator Annotator; 2018 Annotator.buildAliasScopes(*S); 2019 2020 Region *R = &S->getRegion(); 2021 2022 simplifyRegion(R, DT, LI, RI); 2023 2024 BasicBlock *EnteringBB = R->getEnteringBlock(); 2025 2026 PollyIRBuilder Builder = createPollyIRBuilder(EnteringBB, Annotator); 2027 2028 GPUNodeBuilder NodeBuilder(Builder, Annotator, this, *DL, *LI, *SE, *DT, *S, 2029 Prog); 2030 2031 // Only build the run-time condition and parameters _after_ having 2032 // introduced the conditional branch. This is important as the conditional 2033 // branch will guard the original scop from new induction variables that 2034 // the SCEVExpander may introduce while code generating the parameters and 2035 // which may introduce scalar dependences that prevent us from correctly 2036 // code generating this scop. 2037 BasicBlock *StartBlock = 2038 executeScopConditionally(*S, this, Builder.getTrue()); 2039 2040 // TODO: Handle LICM 2041 // TODO: Verify run-time checks 2042 auto SplitBlock = StartBlock->getSinglePredecessor(); 2043 Builder.SetInsertPoint(SplitBlock->getTerminator()); 2044 NodeBuilder.addParameters(S->getContext()); 2045 Builder.SetInsertPoint(&*StartBlock->begin()); 2046 2047 NodeBuilder.initializeAfterRTH(); 2048 NodeBuilder.create(Root); 2049 NodeBuilder.finalize(); 2050 } 2051 2052 bool runOnScop(Scop &CurrentScop) override { 2053 S = &CurrentScop; 2054 LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 2055 DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 2056 SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 2057 DL = &S->getRegion().getEntry()->getParent()->getParent()->getDataLayout(); 2058 RI = &getAnalysis<RegionInfoPass>().getRegionInfo(); 2059 2060 // We currently do not support scops with invariant loads. 2061 if (S->hasInvariantAccesses()) 2062 return false; 2063 2064 auto PPCGScop = createPPCGScop(); 2065 auto PPCGProg = createPPCGProg(PPCGScop); 2066 auto PPCGGen = generateGPU(PPCGScop, PPCGProg); 2067 2068 if (PPCGGen->tree) 2069 generateCode(isl_ast_node_copy(PPCGGen->tree), PPCGProg); 2070 2071 freeOptions(PPCGScop); 2072 freePPCGGen(PPCGGen); 2073 gpu_prog_free(PPCGProg); 2074 ppcg_scop_free(PPCGScop); 2075 2076 return true; 2077 } 2078 2079 void printScop(raw_ostream &, Scop &) const override {} 2080 2081 void getAnalysisUsage(AnalysisUsage &AU) const override { 2082 AU.addRequired<DominatorTreeWrapperPass>(); 2083 AU.addRequired<RegionInfoPass>(); 2084 AU.addRequired<ScalarEvolutionWrapperPass>(); 2085 AU.addRequired<ScopDetection>(); 2086 AU.addRequired<ScopInfoRegionPass>(); 2087 AU.addRequired<LoopInfoWrapperPass>(); 2088 2089 AU.addPreserved<AAResultsWrapperPass>(); 2090 AU.addPreserved<BasicAAWrapperPass>(); 2091 AU.addPreserved<LoopInfoWrapperPass>(); 2092 AU.addPreserved<DominatorTreeWrapperPass>(); 2093 AU.addPreserved<GlobalsAAWrapperPass>(); 2094 AU.addPreserved<PostDominatorTreeWrapperPass>(); 2095 AU.addPreserved<ScopDetection>(); 2096 AU.addPreserved<ScalarEvolutionWrapperPass>(); 2097 AU.addPreserved<SCEVAAWrapperPass>(); 2098 2099 // FIXME: We do not yet add regions for the newly generated code to the 2100 // region tree. 2101 AU.addPreserved<RegionInfoPass>(); 2102 AU.addPreserved<ScopInfoRegionPass>(); 2103 } 2104 }; 2105 } 2106 2107 char PPCGCodeGeneration::ID = 1; 2108 2109 Pass *polly::createPPCGCodeGenerationPass() { return new PPCGCodeGeneration(); } 2110 2111 INITIALIZE_PASS_BEGIN(PPCGCodeGeneration, "polly-codegen-ppcg", 2112 "Polly - Apply PPCG translation to SCOP", false, false) 2113 INITIALIZE_PASS_DEPENDENCY(DependenceInfo); 2114 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass); 2115 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass); 2116 INITIALIZE_PASS_DEPENDENCY(RegionInfoPass); 2117 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass); 2118 INITIALIZE_PASS_DEPENDENCY(ScopDetection); 2119 INITIALIZE_PASS_END(PPCGCodeGeneration, "polly-codegen-ppcg", 2120 "Polly - Apply PPCG translation to SCOP", false, false) 2121