//===------ PPCGCodeGeneration.cpp - Polly Accelerator Code Generation. ---===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// Take a scop created by ScopInfo and map it to GPU code using the ppcg
// GPU mapping strategy.
//
//===----------------------------------------------------------------------===//

#include "polly/CodeGen/IslAst.h"
#include "polly/CodeGen/IslNodeBuilder.h"
#include "polly/CodeGen/Utils.h"
#include "polly/DependenceInfo.h"
#include "polly/LinkAllPasses.h"
#include "polly/Options.h"
#include "polly/ScopDetection.h"
#include "polly/ScopInfo.h"
#include "polly/Support/SCEVValidator.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

#include "isl/union_map.h"

extern "C" {
#include "ppcg/cuda.h"
#include "ppcg/gpu.h"
#include "ppcg/gpu_print.h"
#include "ppcg/ppcg.h"
#include "ppcg/schedule.h"
}

#include "llvm/Support/Debug.h"

using namespace polly;
using namespace llvm;

#define DEBUG_TYPE "polly-codegen-ppcg"

static cl::opt<bool> DumpSchedule("polly-acc-dump-schedule",
                                  cl::desc("Dump the computed GPU Schedule"),
                                  cl::Hidden, cl::init(false), cl::ZeroOrMore,
                                  cl::cat(PollyCategory));

static cl::opt<bool>
    DumpCode("polly-acc-dump-code",
             cl::desc("Dump C code describing the GPU mapping"), cl::Hidden,
             cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));

static cl::opt<bool> DumpKernelIR("polly-acc-dump-kernel-ir",
                                  cl::desc("Dump the kernel LLVM-IR"),
                                  cl::Hidden, cl::init(false), cl::ZeroOrMore,
                                  cl::cat(PollyCategory));

static cl::opt<bool> DumpKernelASM("polly-acc-dump-kernel-asm",
                                   cl::desc("Dump the kernel assembly code"),
                                   cl::Hidden, cl::init(false), cl::ZeroOrMore,
                                   cl::cat(PollyCategory));

static cl::opt<bool> FastMath("polly-acc-fastmath",
                              cl::desc("Allow unsafe math optimizations"),
                              cl::Hidden, cl::init(false), cl::ZeroOrMore,
                              cl::cat(PollyCategory));
static cl::opt<bool> SharedMemory("polly-acc-use-shared",
                                  cl::desc("Use shared memory"), cl::Hidden,
                                  cl::init(false), cl::ZeroOrMore,
                                  cl::cat(PollyCategory));
static cl::opt<bool> PrivateMemory("polly-acc-use-private",
                                   cl::desc("Use private memory"), cl::Hidden,
                                   cl::init(false), cl::ZeroOrMore,
                                   cl::cat(PollyCategory));

static cl::opt<std::string>
    CudaVersion("polly-acc-cuda-version",
                cl::desc("The CUDA version to compile for"), cl::Hidden,
                cl::init("sm_30"), cl::ZeroOrMore, cl::cat(PollyCategory));

/// Create the ast expressions for a ScopStmt.
///
/// This function is a callback to generate the ast expressions for each
/// of the scheduled ScopStmts.
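///
/// As a rough illustration (isl notation only, statement and array names
/// hypothetical): for a statement Stmt[i] with a read access
/// Ref0: Stmt[i] -> MemRef_A[i + 1], the returned map would associate the
/// access id Ref0 with an ast expression equivalent to MemRef_A[i + 1],
/// after the @p FunctionIndex and @p FunctionExpr callbacks have been
/// applied.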
static __isl_give isl_id_to_ast_expr *pollyBuildAstExprForStmt(
    void *StmtT, isl_ast_build *Build,
    isl_multi_pw_aff *(*FunctionIndex)(__isl_take isl_multi_pw_aff *MPA,
                                       isl_id *Id, void *User),
    void *UserIndex,
    isl_ast_expr *(*FunctionExpr)(isl_ast_expr *Expr, isl_id *Id, void *User),
    void *UserExpr) {

  ScopStmt *Stmt = (ScopStmt *)StmtT;

  isl_ctx *Ctx;

  if (!Stmt || !Build)
    return NULL;

  Ctx = isl_ast_build_get_ctx(Build);
  isl_id_to_ast_expr *RefToExpr = isl_id_to_ast_expr_alloc(Ctx, 0);

  for (MemoryAccess *Acc : *Stmt) {
    isl_map *AddrFunc = Acc->getAddressFunction();
    AddrFunc = isl_map_intersect_domain(AddrFunc, Stmt->getDomain());
    isl_id *RefId = Acc->getId();
    isl_pw_multi_aff *PMA = isl_pw_multi_aff_from_map(AddrFunc);
    isl_multi_pw_aff *MPA = isl_multi_pw_aff_from_pw_multi_aff(PMA);
    MPA = isl_multi_pw_aff_coalesce(MPA);
    MPA = FunctionIndex(MPA, RefId, UserIndex);
    isl_ast_expr *Access = isl_ast_build_access_from_multi_pw_aff(Build, MPA);
    Access = FunctionExpr(Access, RefId, UserExpr);
    RefToExpr = isl_id_to_ast_expr_set(RefToExpr, RefId, Access);
  }

  return RefToExpr;
}

/// Generate code for a GPU specific isl AST.
///
/// The GPUNodeBuilder augments the general existing IslNodeBuilder, which
/// generates code for general-purpose AST nodes, with special functionality
/// for generating GPU specific user nodes.
///
/// @see GPUNodeBuilder::createUser
class GPUNodeBuilder : public IslNodeBuilder {
public:
  GPUNodeBuilder(PollyIRBuilder &Builder, ScopAnnotator &Annotator, Pass *P,
                 const DataLayout &DL, LoopInfo &LI, ScalarEvolution &SE,
                 DominatorTree &DT, Scop &S, gpu_prog *Prog)
      : IslNodeBuilder(Builder, Annotator, P, DL, LI, SE, DT, S), Prog(Prog) {
    getExprBuilder().setIDToSAI(&IDToSAI);
  }

  /// Create after-run-time-check initialization code.
  void initializeAfterRTH();

  /// Finalize the generated scop.
  virtual void finalize();

private:
  /// A vector of array base pointers for which a new ScopArrayInfo was
  /// created.
  ///
  /// This vector is used to delete the ScopArrayInfo when it is not needed
  /// any more.
  std::vector<Value *> LocalArrays;

  /// A map from ScopArrays to their corresponding device allocations.
  std::map<ScopArrayInfo *, Value *> DeviceAllocations;

  /// The current GPU context.
  Value *GPUContext;

  /// The set of isl_ids allocated in the kernel.
  std::vector<isl_id *> KernelIds;

  /// A module containing GPU code.
  ///
  /// This pointer is only set in case we are currently generating GPU code.
  std::unique_ptr<Module> GPUModule;

  /// The GPU program we generate code for.
  gpu_prog *Prog;

  /// Class to free isl_ids.
  class IslIdDeleter {
  public:
    void operator()(__isl_take isl_id *Id) { isl_id_free(Id); };
  };

  /// A set containing all isl_ids allocated in a GPU kernel.
  ///
  /// By releasing this set all isl_ids will be freed.
  std::set<std::unique_ptr<isl_id, IslIdDeleter>> KernelIDs;

  IslExprBuilder::IDToScopArrayInfoTy IDToSAI;

  /// Create code for user-defined AST nodes.
  ///
  /// These AST nodes can be of type:
  ///
  ///   - ScopStmt:      A computational statement (TODO)
  ///   - Kernel:        A GPU kernel call (TODO)
  ///   - Data-Transfer: A GPU <-> CPU data-transfer
  ///   - In-kernel synchronization
  ///   - In-kernel memory copy statement
  ///
  /// @param UserStmt The ast node to generate code for.
  virtual void createUser(__isl_take isl_ast_node *UserStmt);

  enum DataDirection { HOST_TO_DEVICE, DEVICE_TO_HOST };

  /// Create code for a data transfer statement.
  ///
  /// @param TransferStmt The data transfer statement.
  /// @param Direction    The direction in which to transfer data.
  void createDataTransfer(__isl_take isl_ast_node *TransferStmt,
                          enum DataDirection Direction);

  /// Find llvm::Values referenced in a GPU kernel.
  ///
  /// @param Kernel The kernel to scan for llvm::Values.
  ///
  /// @returns A set of values referenced by the kernel.
  SetVector<Value *> getReferencesInKernel(ppcg_kernel *Kernel);

  /// Compute the sizes of the execution grid for a given kernel.
  ///
  /// @param Kernel The kernel to compute grid sizes for.
  ///
  /// @returns A tuple with grid sizes for the X and Y dimensions.
  std::tuple<Value *, Value *> getGridSizes(ppcg_kernel *Kernel);

  /// Compute the sizes of the thread blocks for a given kernel.
  ///
  /// @param Kernel The kernel to compute thread block sizes for.
  ///
  /// @returns A tuple with thread block sizes for the X, Y, and Z dimensions.
  std::tuple<Value *, Value *, Value *> getBlockSizes(ppcg_kernel *Kernel);

  /// Create kernel launch parameters.
  ///
  /// @param Kernel        The kernel to create parameters for.
  /// @param F             The kernel function that has been created.
  /// @param SubtreeValues The set of llvm::Values referenced by this kernel.
  ///
  /// @returns A stack allocated array with pointers to the parameter
  ///          values that are passed to the kernel.
  Value *createLaunchParameters(ppcg_kernel *Kernel, Function *F,
                                SetVector<Value *> SubtreeValues);

  /// Create declarations for kernel variables.
  ///
  /// This includes shared memory declarations.
  ///
  /// @param Kernel The kernel definition to create variables for.
  /// @param FN     The function into which to generate the variables.
  void createKernelVariables(ppcg_kernel *Kernel, Function *FN);

  /// Add CUDA annotations to module.
  ///
  /// Add a set of CUDA annotations that declares the maximal block dimensions
  /// that will be used to execute the CUDA kernel. This allows the NVIDIA
  /// PTX compiler to bound the number of allocated registers to ensure the
  /// resulting kernel is known to run with up to as many block dimensions
  /// as specified here.
  ///
  /// @param M         The module to add the annotations to.
  /// @param BlockDimX The size of block dimension X.
  /// @param BlockDimY The size of block dimension Y.
  /// @param BlockDimZ The size of block dimension Z.
  void addCUDAAnnotations(Module *M, Value *BlockDimX, Value *BlockDimY,
                          Value *BlockDimZ);

  /// Create GPU kernel.
  ///
  /// Code generate the kernel described by @p KernelStmt.
  ///
  /// @param KernelStmt The ast node to generate kernel code for.
  void createKernel(__isl_take isl_ast_node *KernelStmt);

  /// Generate code that computes the size of an array.
  ///
  /// @param Array The array for which to compute a size.
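  ///
  /// As a sketch of the semantics (array shape hypothetical): for an array
  /// of floats with bounds [n][64], the generated code computes roughly
  /// 4 * n * 64, i.e. the element size in bytes multiplied by the ast
  /// expressions for all dimension bounds.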
  Value *getArraySize(gpu_array_info *Array);

  /// Prepare the kernel arguments for kernel code generation.
  ///
  /// @param Kernel The kernel to generate code for.
  /// @param FN     The function created for the kernel.
  void prepareKernelArguments(ppcg_kernel *Kernel, Function *FN);

  /// Create kernel function.
  ///
  /// Create a kernel function located in a newly created module that can
  /// serve as target for device code generation. Set the Builder to point to
  /// the start block of this newly created function.
  ///
  /// @param Kernel        The kernel to generate code for.
  /// @param SubtreeValues The set of llvm::Values referenced by this kernel.
  void createKernelFunction(ppcg_kernel *Kernel,
                            SetVector<Value *> &SubtreeValues);

  /// Create the declaration of a kernel function.
  ///
  /// The kernel function takes as arguments:
  ///
  ///   - One i8 pointer for each external array reference used in the kernel.
  ///   - Host iterators
  ///   - Parameters
  ///   - Other LLVM Value references (TODO)
  ///
  /// @param Kernel        The kernel to generate the function declaration for.
  /// @param SubtreeValues The set of llvm::Values referenced by this kernel.
  ///
  /// @returns The newly declared function.
  Function *createKernelFunctionDecl(ppcg_kernel *Kernel,
                                     SetVector<Value *> &SubtreeValues);

  /// Insert intrinsic functions to obtain thread and block ids.
  ///
  /// @param Kernel The kernel to generate the intrinsic functions for.
  void insertKernelIntrinsics(ppcg_kernel *Kernel);

  /// Create a global-to-shared or shared-to-global copy statement.
  ///
  /// @param CopyStmt The copy statement to generate code for.
  void createKernelCopy(ppcg_kernel_stmt *CopyStmt);

  /// Create code for a ScopStmt called in @p Expr.
  ///
  /// @param Expr       The expression containing the call.
  /// @param KernelStmt The kernel statement referenced in the call.
  void createScopStmt(isl_ast_expr *Expr, ppcg_kernel_stmt *KernelStmt);

  /// Create an in-kernel synchronization call.
  void createKernelSync();

  /// Create a PTX assembly string for the current GPU kernel.
  ///
  /// @returns A string containing the corresponding PTX assembly code.
  std::string createKernelASM();

  /// Remove references from the dominator tree to the kernel function @p F.
  ///
  /// @param F The function to remove references to.
  void clearDominators(Function *F);

  /// Remove references from scalar evolution to the kernel function @p F.
  ///
  /// @param F The function to remove references to.
  void clearScalarEvolution(Function *F);

  /// Remove references from loop info to the kernel function @p F.
  ///
  /// @param F The function to remove references to.
  void clearLoops(Function *F);

  /// Finalize the generation of the kernel function.
  ///
  /// Free the LLVM-IR module corresponding to the kernel and -- if requested
  /// -- dump its IR to stderr.
  ///
  /// @returns The assembly string of the kernel.
  std::string finalizeKernelFunction();

  /// Create code that allocates memory to store arrays on device.
  void allocateDeviceArrays();

  /// Free all allocated device arrays.
  void freeDeviceArrays();

  /// Create a call to initialize the GPU context.
  ///
  /// @returns A pointer to the newly initialized context.
  Value *createCallInitContext();

  /// Create a call to get the device pointer for a kernel allocation.
  ///
  /// @param Allocation The Polly GPU allocation.
  ///
  /// @returns The device pointer corresponding to this allocation.
  Value *createCallGetDevicePtr(Value *Allocation);

  /// Create a call to free the GPU context.
  ///
  /// @param Context A pointer to an initialized GPU context.
  void createCallFreeContext(Value *Context);

  /// Create a call to allocate memory on the device.
  ///
  /// @param Size The size of memory to allocate.
  ///
  /// @returns A pointer that identifies this allocation.
  Value *createCallAllocateMemoryForDevice(Value *Size);

  /// Create a call to free a device array.
  ///
  /// @param Array The device array to free.
  void createCallFreeDeviceMemory(Value *Array);

  /// Create a call to copy data from host to device.
  ///
  /// @param HostPtr   A pointer to the host data that should be copied.
  /// @param DevicePtr A device pointer specifying the location to copy to.
  /// @param Size      The number of bytes to copy.
  void createCallCopyFromHostToDevice(Value *HostPtr, Value *DevicePtr,
                                      Value *Size);

  /// Create a call to copy data from device to host.
  ///
  /// @param DevicePtr A pointer to the device data that should be copied.
  /// @param HostPtr   A host pointer specifying the location to copy to.
  /// @param Size      The number of bytes to copy.
  void createCallCopyFromDeviceToHost(Value *DevicePtr, Value *HostPtr,
                                      Value *Size);

  /// Create a call to get a kernel from an assembly string.
  ///
  /// @param Buffer The string describing the kernel.
  /// @param Entry  The name of the kernel function to call.
  ///
  /// @returns A pointer to a kernel object.
  Value *createCallGetKernel(Value *Buffer, Value *Entry);

  /// Create a call to free a GPU kernel.
  ///
  /// @param GPUKernel The kernel to free.
  void createCallFreeKernel(Value *GPUKernel);

  /// Create a call to launch a GPU kernel.
  ///
  /// @param GPUKernel  The kernel to launch.
  /// @param GridDimX   The size of the first grid dimension.
  /// @param GridDimY   The size of the second grid dimension.
  /// @param BlockDimX  The size of the first block dimension.
  /// @param BlockDimY  The size of the second block dimension.
  /// @param BlockDimZ  The size of the third block dimension.
  /// @param Parameters A pointer to an array that contains itself pointers to
  ///                   the parameter values passed for each kernel argument.
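  ///
  /// The generated call targets a runtime support function that is declared
  /// on demand (see the definition below); in C-like terms its type
  /// corresponds to:
  ///
  ///   void polly_launchKernel(i8 *Kernel, i32 GridDimX, i32 GridDimY,
  ///                           i32 BlockDimX, i32 BlockDimY, i32 BlockDimZ,
  ///                           i8 *Parameters);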
  void createCallLaunchKernel(Value *GPUKernel, Value *GridDimX,
                              Value *GridDimY, Value *BlockDimX,
                              Value *BlockDimY, Value *BlockDimZ,
                              Value *Parameters);
};

void GPUNodeBuilder::initializeAfterRTH() {
  BasicBlock *NewBB = SplitBlock(Builder.GetInsertBlock(),
                                 &*Builder.GetInsertPoint(), &DT, &LI);
  NewBB->setName("polly.acc.initialize");
  Builder.SetInsertPoint(&NewBB->front());

  GPUContext = createCallInitContext();
  allocateDeviceArrays();
}

void GPUNodeBuilder::finalize() {
  freeDeviceArrays();
  createCallFreeContext(GPUContext);
  IslNodeBuilder::finalize();
}

void GPUNodeBuilder::allocateDeviceArrays() {
  isl_ast_build *Build = isl_ast_build_from_context(S.getContext());

  for (int i = 0; i < Prog->n_array; ++i) {
    gpu_array_info *Array = &Prog->array[i];
    auto *ScopArray = (ScopArrayInfo *)Array->user;
    std::string DevArrayName("p_dev_array_");
    DevArrayName.append(Array->name);

    Value *ArraySize = getArraySize(Array);
    Value *DevArray = createCallAllocateMemoryForDevice(ArraySize);
    DevArray->setName(DevArrayName);
    DeviceAllocations[ScopArray] = DevArray;
  }

  isl_ast_build_free(Build);
}

void GPUNodeBuilder::addCUDAAnnotations(Module *M, Value *BlockDimX,
                                        Value *BlockDimY, Value *BlockDimZ) {
  auto AnnotationNode = M->getOrInsertNamedMetadata("nvvm.annotations");

  for (auto &F : *M) {
    if (F.getCallingConv() != CallingConv::PTX_Kernel)
      continue;

    Value *V[] = {BlockDimX, BlockDimY, BlockDimZ};

    Metadata *Elements[] = {
        ValueAsMetadata::get(&F),   MDString::get(M->getContext(), "maxntidx"),
        ValueAsMetadata::get(V[0]), MDString::get(M->getContext(), "maxntidy"),
        ValueAsMetadata::get(V[1]), MDString::get(M->getContext(), "maxntidz"),
        ValueAsMetadata::get(V[2]),
    };
    MDNode *Node = MDNode::get(M->getContext(), Elements);
    AnnotationNode->addOperand(Node);
  }
}
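
// As an illustration, for a 32x16x1 thread block the loop above attaches
// metadata roughly of the following shape to each PTX kernel (kernel name
// and signature hypothetical):
//
//   !nvvm.annotations = !{!0}
//   !0 = !{void (i8*)* @kernel_0, !"maxntidx", i32 32,
//          !"maxntidy", i32 16, !"maxntidz", i32 1}
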
void GPUNodeBuilder::freeDeviceArrays() {
  for (auto &Array : DeviceAllocations)
    createCallFreeDeviceMemory(Array.second);
}

Value *GPUNodeBuilder::createCallGetKernel(Value *Buffer, Value *Entry) {
  const char *Name = "polly_getKernel";
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Function *F = M->getFunction(Name);

  // If F is not available, declare it.
  if (!F) {
    GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
    std::vector<Type *> Args;
    Args.push_back(Builder.getInt8PtrTy());
    Args.push_back(Builder.getInt8PtrTy());
    FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false);
    F = Function::Create(Ty, Linkage, Name, M);
  }

  return Builder.CreateCall(F, {Buffer, Entry});
}

Value *GPUNodeBuilder::createCallGetDevicePtr(Value *Allocation) {
  const char *Name = "polly_getDevicePtr";
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Function *F = M->getFunction(Name);

  // If F is not available, declare it.
  if (!F) {
    GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
    std::vector<Type *> Args;
    Args.push_back(Builder.getInt8PtrTy());
    FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false);
    F = Function::Create(Ty, Linkage, Name, M);
  }

  return Builder.CreateCall(F, {Allocation});
}

void GPUNodeBuilder::createCallLaunchKernel(Value *GPUKernel, Value *GridDimX,
                                            Value *GridDimY, Value *BlockDimX,
                                            Value *BlockDimY, Value *BlockDimZ,
                                            Value *Parameters) {
  const char *Name = "polly_launchKernel";
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Function *F = M->getFunction(Name);

  // If F is not available, declare it.
  if (!F) {
    GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
    std::vector<Type *> Args;
    Args.push_back(Builder.getInt8PtrTy());
    Args.push_back(Builder.getInt32Ty());
    Args.push_back(Builder.getInt32Ty());
    Args.push_back(Builder.getInt32Ty());
    Args.push_back(Builder.getInt32Ty());
    Args.push_back(Builder.getInt32Ty());
    Args.push_back(Builder.getInt8PtrTy());
    FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
    F = Function::Create(Ty, Linkage, Name, M);
  }

  Builder.CreateCall(F, {GPUKernel, GridDimX, GridDimY, BlockDimX, BlockDimY,
                         BlockDimZ, Parameters});
}

void GPUNodeBuilder::createCallFreeKernel(Value *GPUKernel) {
  const char *Name = "polly_freeKernel";
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Function *F = M->getFunction(Name);

  // If F is not available, declare it.
  if (!F) {
    GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
    std::vector<Type *> Args;
    Args.push_back(Builder.getInt8PtrTy());
    FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
    F = Function::Create(Ty, Linkage, Name, M);
  }

  Builder.CreateCall(F, {GPUKernel});
}

void GPUNodeBuilder::createCallFreeDeviceMemory(Value *Array) {
  const char *Name = "polly_freeDeviceMemory";
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Function *F = M->getFunction(Name);

  // If F is not available, declare it.
  if (!F) {
    GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
    std::vector<Type *> Args;
    Args.push_back(Builder.getInt8PtrTy());
    FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
    F = Function::Create(Ty, Linkage, Name, M);
  }

  Builder.CreateCall(F, {Array});
}
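
// Taken together, these createCall* helpers make the generated host code
// follow a sequence roughly like the following (illustrative pseudo-IR;
// value names made up):
//
//   %context = call i8* @polly_initContext()
//   %dev_A = call i8* @polly_allocateMemoryForDevice(i64 %size)
//   call void @polly_copyFromHostToDevice(i8* %host_A, i8* %dev_A, i64 %size)
//   ...kernel lookup and launch...
//   call void @polly_copyFromDeviceToHost(i8* %dev_A, i8* %host_A, i64 %size)
//   call void @polly_freeDeviceMemory(i8* %dev_A)
//   call void @polly_freeContext(i8* %context)
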
Value *GPUNodeBuilder::createCallAllocateMemoryForDevice(Value *Size) {
  const char *Name = "polly_allocateMemoryForDevice";
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Function *F = M->getFunction(Name);

  // If F is not available, declare it.
  if (!F) {
    GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
    std::vector<Type *> Args;
    Args.push_back(Builder.getInt64Ty());
    FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false);
    F = Function::Create(Ty, Linkage, Name, M);
  }

  return Builder.CreateCall(F, {Size});
}

void GPUNodeBuilder::createCallCopyFromHostToDevice(Value *HostData,
                                                    Value *DeviceData,
                                                    Value *Size) {
  const char *Name = "polly_copyFromHostToDevice";
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Function *F = M->getFunction(Name);

  // If F is not available, declare it.
  if (!F) {
    GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
    std::vector<Type *> Args;
    Args.push_back(Builder.getInt8PtrTy());
    Args.push_back(Builder.getInt8PtrTy());
    Args.push_back(Builder.getInt64Ty());
    FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
    F = Function::Create(Ty, Linkage, Name, M);
  }

  Builder.CreateCall(F, {HostData, DeviceData, Size});
}

void GPUNodeBuilder::createCallCopyFromDeviceToHost(Value *DeviceData,
                                                    Value *HostData,
                                                    Value *Size) {
  const char *Name = "polly_copyFromDeviceToHost";
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Function *F = M->getFunction(Name);

  // If F is not available, declare it.
  if (!F) {
    GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
    std::vector<Type *> Args;
    Args.push_back(Builder.getInt8PtrTy());
    Args.push_back(Builder.getInt8PtrTy());
    Args.push_back(Builder.getInt64Ty());
    FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
    F = Function::Create(Ty, Linkage, Name, M);
  }

  Builder.CreateCall(F, {DeviceData, HostData, Size});
}

Value *GPUNodeBuilder::createCallInitContext() {
  const char *Name = "polly_initContext";
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Function *F = M->getFunction(Name);

  // If F is not available, declare it.
  if (!F) {
    GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
    std::vector<Type *> Args;
    FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false);
    F = Function::Create(Ty, Linkage, Name, M);
  }

  return Builder.CreateCall(F, {});
}

void GPUNodeBuilder::createCallFreeContext(Value *Context) {
  const char *Name = "polly_freeContext";
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Function *F = M->getFunction(Name);

  // If F is not available, declare it.
  if (!F) {
    GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
    std::vector<Type *> Args;
    Args.push_back(Builder.getInt8PtrTy());
    FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
    F = Function::Create(Ty, Linkage, Name, M);
  }

  Builder.CreateCall(F, {Context});
}

/// Check if one string is a prefix of another.
///
/// @param String The string in which to look for the prefix.
/// @param Prefix The prefix to look for.
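///
/// For example, isPrefix("to_device_MemRef_A", "to_device") is true; this is
/// how the data-transfer user nodes in createUser below are recognized (the
/// array suffix here is hypothetical).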
static bool isPrefix(std::string String, std::string Prefix) {
  return String.find(Prefix) == 0;
}

Value *GPUNodeBuilder::getArraySize(gpu_array_info *Array) {
  isl_ast_build *Build = isl_ast_build_from_context(S.getContext());
  Value *ArraySize = ConstantInt::get(Builder.getInt64Ty(), Array->size);

  if (!gpu_array_is_scalar(Array)) {
    auto OffsetDimZero = isl_pw_aff_copy(Array->bound[0]);
    isl_ast_expr *Res = isl_ast_build_expr_from_pw_aff(Build, OffsetDimZero);

    for (unsigned int i = 1; i < Array->n_index; i++) {
      isl_pw_aff *Bound_I = isl_pw_aff_copy(Array->bound[i]);
      isl_ast_expr *Expr = isl_ast_build_expr_from_pw_aff(Build, Bound_I);
      Res = isl_ast_expr_mul(Res, Expr);
    }

    Value *NumElements = ExprBuilder.create(Res);
    ArraySize = Builder.CreateMul(ArraySize, NumElements);
  }
  isl_ast_build_free(Build);
  return ArraySize;
}

void GPUNodeBuilder::createDataTransfer(__isl_take isl_ast_node *TransferStmt,
                                        enum DataDirection Direction) {
  isl_ast_expr *Expr = isl_ast_node_user_get_expr(TransferStmt);
  isl_ast_expr *Arg = isl_ast_expr_get_op_arg(Expr, 0);
  isl_id *Id = isl_ast_expr_get_id(Arg);
  auto Array = (gpu_array_info *)isl_id_get_user(Id);
  auto ScopArray = (ScopArrayInfo *)(Array->user);

  Value *Size = getArraySize(Array);
  Value *DevPtr = DeviceAllocations[ScopArray];

  Value *HostPtr;

  if (gpu_array_is_scalar(Array))
    HostPtr = BlockGen.getOrCreateAlloca(ScopArray);
  else
    HostPtr = ScopArray->getBasePtr();

  HostPtr = Builder.CreatePointerCast(HostPtr, Builder.getInt8PtrTy());

  if (Direction == HOST_TO_DEVICE)
    createCallCopyFromHostToDevice(HostPtr, DevPtr, Size);
  else
    createCallCopyFromDeviceToHost(DevPtr, HostPtr, Size);

  isl_id_free(Id);
  isl_ast_expr_free(Arg);
  isl_ast_expr_free(Expr);
  isl_ast_node_free(TransferStmt);
}

void GPUNodeBuilder::createUser(__isl_take isl_ast_node *UserStmt) {
  isl_ast_expr *Expr = isl_ast_node_user_get_expr(UserStmt);
  isl_ast_expr *StmtExpr = isl_ast_expr_get_op_arg(Expr, 0);
  isl_id *Id = isl_ast_expr_get_id(StmtExpr);
  isl_ast_expr_free(StmtExpr);

  // The name pointer remains valid while Expr keeps a reference to the id,
  // so we can drop our own reference right after reading the name.
  const char *Str = isl_id_get_name(Id);
  isl_id_free(Id);

  if (!strcmp(Str, "kernel")) {
    createKernel(UserStmt);
    isl_ast_expr_free(Expr);
    return;
  }

  if (isPrefix(Str, "to_device")) {
    createDataTransfer(UserStmt, HOST_TO_DEVICE);
    isl_ast_expr_free(Expr);
    return;
  }

  if (isPrefix(Str, "from_device")) {
    createDataTransfer(UserStmt, DEVICE_TO_HOST);
    isl_ast_expr_free(Expr);
    return;
  }

  isl_id *Anno = isl_ast_node_get_annotation(UserStmt);
  struct ppcg_kernel_stmt *KernelStmt =
      (struct ppcg_kernel_stmt *)isl_id_get_user(Anno);
  isl_id_free(Anno);

  switch (KernelStmt->type) {
  case ppcg_kernel_domain:
    createScopStmt(Expr, KernelStmt);
    isl_ast_node_free(UserStmt);
    return;
  case ppcg_kernel_copy:
    createKernelCopy(KernelStmt);
    isl_ast_expr_free(Expr);
    isl_ast_node_free(UserStmt);
    return;
  case ppcg_kernel_sync:
    createKernelSync();
    isl_ast_expr_free(Expr);
    isl_ast_node_free(UserStmt);
    return;
  }

  isl_ast_expr_free(Expr);
  isl_ast_node_free(UserStmt);
  return;
}
void GPUNodeBuilder::createKernelCopy(ppcg_kernel_stmt *KernelStmt) {
  isl_ast_expr *LocalIndex = isl_ast_expr_copy(KernelStmt->u.c.local_index);
  LocalIndex = isl_ast_expr_address_of(LocalIndex);
  Value *LocalAddr = ExprBuilder.create(LocalIndex);
  isl_ast_expr *Index = isl_ast_expr_copy(KernelStmt->u.c.index);
  Index = isl_ast_expr_address_of(Index);
  Value *GlobalAddr = ExprBuilder.create(Index);

  if (KernelStmt->u.c.read) {
    LoadInst *Load = Builder.CreateLoad(GlobalAddr, "shared.read");
    Builder.CreateStore(Load, LocalAddr);
  } else {
    LoadInst *Load = Builder.CreateLoad(LocalAddr, "shared.write");
    Builder.CreateStore(Load, GlobalAddr);
  }
}

void GPUNodeBuilder::createScopStmt(isl_ast_expr *Expr,
                                    ppcg_kernel_stmt *KernelStmt) {
  auto Stmt = (ScopStmt *)KernelStmt->u.d.stmt->stmt;
  isl_id_to_ast_expr *Indexes = KernelStmt->u.d.ref2expr;

  LoopToScevMapT LTS;
  LTS.insert(OutsideLoopIterations.begin(), OutsideLoopIterations.end());

  createSubstitutions(Expr, Stmt, LTS);

  if (Stmt->isBlockStmt())
    BlockGen.copyStmt(*Stmt, LTS, Indexes);
  else
    assert(0 && "Region statement not supported\n");
}

void GPUNodeBuilder::createKernelSync() {
  // Synchronize all threads in the thread block; llvm.nvvm.barrier0
  // corresponds to CUDA's __syncthreads().
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  auto *Sync = Intrinsic::getDeclaration(M, Intrinsic::nvvm_barrier0);
  Builder.CreateCall(Sync, {});
}

/// Collect llvm::Values referenced from @p Node.
///
/// This function only applies to isl_ast_nodes that are user_nodes referring
/// to a ScopStmt. All other node types are ignored.
///
/// @param Node The node to collect references for.
/// @param User A user pointer used as storage for the data that is collected.
///
/// @returns isl_bool_true if data could be collected successfully.
isl_bool collectReferencesInGPUStmt(__isl_keep isl_ast_node *Node, void *User) {
  if (isl_ast_node_get_type(Node) != isl_ast_node_user)
    return isl_bool_true;

  isl_ast_expr *Expr = isl_ast_node_user_get_expr(Node);
  isl_ast_expr *StmtExpr = isl_ast_expr_get_op_arg(Expr, 0);
  isl_id *Id = isl_ast_expr_get_id(StmtExpr);
  const char *Str = isl_id_get_name(Id);
  isl_id_free(Id);
  isl_ast_expr_free(StmtExpr);
  isl_ast_expr_free(Expr);

  if (!isPrefix(Str, "Stmt"))
    return isl_bool_true;

  Id = isl_ast_node_get_annotation(Node);
  auto *KernelStmt = (ppcg_kernel_stmt *)isl_id_get_user(Id);
  auto Stmt = (ScopStmt *)KernelStmt->u.d.stmt->stmt;
  isl_id_free(Id);

  addReferencesFromStmt(Stmt, User, false /* CreateScalarRefs */);

  return isl_bool_true;
}

SetVector<Value *> GPUNodeBuilder::getReferencesInKernel(ppcg_kernel *Kernel) {
  SetVector<Value *> SubtreeValues;
  SetVector<const SCEV *> SCEVs;
  SetVector<const Loop *> Loops;
  SubtreeReferences References = {
      LI, SE, S, ValueMap, SubtreeValues, SCEVs, getBlockGenerator()};

  for (const auto &I : IDToValue)
    SubtreeValues.insert(I.second);

  isl_ast_node_foreach_descendant_top_down(
      Kernel->tree, collectReferencesInGPUStmt, &References);

  for (const SCEV *Expr : SCEVs)
    findValues(Expr, SE, SubtreeValues);

  for (auto &SAI : S.arrays())
    SubtreeValues.remove(SAI->getBasePtr());

  isl_space *Space = S.getParamSpace();
  for (long i = 0; i < isl_space_dim(Space, isl_dim_param); i++) {
    isl_id *Id = isl_space_get_dim_id(Space, isl_dim_param, i);
    assert(IDToValue.count(Id));
    Value *Val = IDToValue[Id];
    SubtreeValues.remove(Val);
    isl_id_free(Id);
  }
  isl_space_free(Space);
  for (long i = 0; i < isl_space_dim(Kernel->space, isl_dim_set); i++) {
    isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i);
    assert(IDToValue.count(Id));
    Value *Val = IDToValue[Id];
    SubtreeValues.remove(Val);
    isl_id_free(Id);
  }

  return SubtreeValues;
}

void GPUNodeBuilder::clearDominators(Function *F) {
  DomTreeNode *N = DT.getNode(&F->getEntryBlock());
  std::vector<BasicBlock *> Nodes;
  for (po_iterator<DomTreeNode *> I = po_begin(N), E = po_end(N); I != E; ++I)
    Nodes.push_back(I->getBlock());

  for (BasicBlock *BB : Nodes)
    DT.eraseNode(BB);
}

void GPUNodeBuilder::clearScalarEvolution(Function *F) {
  for (BasicBlock &BB : *F) {
    Loop *L = LI.getLoopFor(&BB);
    if (L)
      SE.forgetLoop(L);
  }
}

void GPUNodeBuilder::clearLoops(Function *F) {
  for (BasicBlock &BB : *F) {
    Loop *L = LI.getLoopFor(&BB);
    if (L)
      SE.forgetLoop(L);
    LI.removeBlock(&BB);
  }
}

std::tuple<Value *, Value *> GPUNodeBuilder::getGridSizes(ppcg_kernel *Kernel) {
  std::vector<Value *> Sizes;
  isl_ast_build *Context = isl_ast_build_from_context(S.getContext());

  for (long i = 0; i < Kernel->n_grid; i++) {
    isl_pw_aff *Size = isl_multi_pw_aff_get_pw_aff(Kernel->grid_size, i);
    isl_ast_expr *GridSize = isl_ast_build_expr_from_pw_aff(Context, Size);
    Value *Res = ExprBuilder.create(GridSize);
    Res = Builder.CreateTrunc(Res, Builder.getInt32Ty());
    Sizes.push_back(Res);
  }
  isl_ast_build_free(Context);

  for (long i = Kernel->n_grid; i < 3; i++)
    Sizes.push_back(ConstantInt::get(Builder.getInt32Ty(), 1));

  return std::make_tuple(Sizes[0], Sizes[1]);
}

std::tuple<Value *, Value *, Value *>
GPUNodeBuilder::getBlockSizes(ppcg_kernel *Kernel) {
  std::vector<Value *> Sizes;

  for (long i = 0; i < Kernel->n_block; i++) {
    Value *Res = ConstantInt::get(Builder.getInt32Ty(), Kernel->block_dim[i]);
    Sizes.push_back(Res);
  }

  for (long i = Kernel->n_block; i < 3; i++)
    Sizes.push_back(ConstantInt::get(Builder.getInt32Ty(), 1));

  return std::make_tuple(Sizes[0], Sizes[1], Sizes[2]);
}
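
// The launch parameters built below form an array of i8* slots, one per
// kernel argument; each slot points to a stack allocation holding the actual
// value. For two array arguments and one scalar parameter the result is
// roughly (illustrative pseudo-IR):
//
//   %polly_launch_0_params = alloca [3 x i8*]
//   ; slot 0 and 1: pointers to the device pointers of the two arrays
//   ; slot 2: pointer to the scalar parameter value
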
Value *
GPUNodeBuilder::createLaunchParameters(ppcg_kernel *Kernel, Function *F,
                                       SetVector<Value *> SubtreeValues) {
  Type *ArrayTy = ArrayType::get(Builder.getInt8PtrTy(),
                                 std::distance(F->arg_begin(), F->arg_end()));

  BasicBlock *EntryBlock =
      &Builder.GetInsertBlock()->getParent()->getEntryBlock();
  std::string Launch = "polly_launch_" + std::to_string(Kernel->id);
  Instruction *Parameters =
      new AllocaInst(ArrayTy, Launch + "_params", EntryBlock->getTerminator());

  int Index = 0;
  for (long i = 0; i < Prog->n_array; i++) {
    if (!ppcg_kernel_requires_array_argument(Kernel, i))
      continue;

    isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set);
    const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(Id);

    Value *DevArray = DeviceAllocations[(ScopArrayInfo *)SAI];
    DevArray = createCallGetDevicePtr(DevArray);
    Instruction *Param = new AllocaInst(
        Builder.getInt8PtrTy(), Launch + "_param_" + std::to_string(Index),
        EntryBlock->getTerminator());
    Builder.CreateStore(DevArray, Param);
    Value *Slot = Builder.CreateGEP(
        Parameters, {Builder.getInt64(0), Builder.getInt64(Index)});
    Value *ParamTyped =
        Builder.CreatePointerCast(Param, Builder.getInt8PtrTy());
    Builder.CreateStore(ParamTyped, Slot);
    Index++;
  }

  int NumHostIters = isl_space_dim(Kernel->space, isl_dim_set);

  for (long i = 0; i < NumHostIters; i++) {
    isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i);
    Value *Val = IDToValue[Id];
    isl_id_free(Id);
    Instruction *Param = new AllocaInst(
        Val->getType(), Launch + "_param_" + std::to_string(Index),
        EntryBlock->getTerminator());
    Builder.CreateStore(Val, Param);
    Value *Slot = Builder.CreateGEP(
        Parameters, {Builder.getInt64(0), Builder.getInt64(Index)});
    Value *ParamTyped =
        Builder.CreatePointerCast(Param, Builder.getInt8PtrTy());
    Builder.CreateStore(ParamTyped, Slot);
    Index++;
  }

  int NumVars = isl_space_dim(Kernel->space, isl_dim_param);

  for (long i = 0; i < NumVars; i++) {
    isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_param, i);
    Value *Val = IDToValue[Id];
    isl_id_free(Id);
    Instruction *Param = new AllocaInst(
        Val->getType(), Launch + "_param_" + std::to_string(Index),
        EntryBlock->getTerminator());
    Builder.CreateStore(Val, Param);
    Value *Slot = Builder.CreateGEP(
        Parameters, {Builder.getInt64(0), Builder.getInt64(Index)});
    Value *ParamTyped =
        Builder.CreatePointerCast(Param, Builder.getInt8PtrTy());
    Builder.CreateStore(ParamTyped, Slot);
    Index++;
  }

  for (auto Val : SubtreeValues) {
    Instruction *Param = new AllocaInst(
        Val->getType(), Launch + "_param_" + std::to_string(Index),
        EntryBlock->getTerminator());
    Builder.CreateStore(Val, Param);
    Value *Slot = Builder.CreateGEP(
        Parameters, {Builder.getInt64(0), Builder.getInt64(Index)});
    Value *ParamTyped =
        Builder.CreatePointerCast(Param, Builder.getInt8PtrTy());
    Builder.CreateStore(ParamTyped, Slot);
    Index++;
  }

  auto Location = EntryBlock->getTerminator();
  return new BitCastInst(Parameters, Builder.getInt8PtrTy(),
                         Launch + "_params_i8ptr", Location);
}

void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) {
  isl_id *Id = isl_ast_node_get_annotation(KernelStmt);
  ppcg_kernel *Kernel = (ppcg_kernel *)isl_id_get_user(Id);
  isl_id_free(Id);
  isl_ast_node_free(KernelStmt);

  Value *BlockDimX, *BlockDimY, *BlockDimZ;
  std::tie(BlockDimX, BlockDimY, BlockDimZ) = getBlockSizes(Kernel);

  SetVector<Value *> SubtreeValues = getReferencesInKernel(Kernel);

  assert(Kernel->tree && "Device AST of kernel node is empty");

  Instruction &HostInsertPoint = *Builder.GetInsertPoint();
  IslExprBuilder::IDToValueTy HostIDs = IDToValue;
  ValueMapT HostValueMap = ValueMap;
  BlockGenerator::ScalarAllocaMapTy HostScalarMap = ScalarMap;
  BlockGenerator::ScalarAllocaMapTy HostPHIOpMap = PHIOpMap;
  ScalarMap.clear();
  PHIOpMap.clear();

  SetVector<const Loop *> Loops;

  // For all loops we depend on, create values that contain the current loop
  // iteration. These values are necessary to generate code for SCEVs that
  // depend on such loops. As a result we need to pass them to the
  // subfunction.
  for (const Loop *L : Loops) {
    const SCEV *OuterLIV = SE.getAddRecExpr(SE.getUnknown(Builder.getInt64(0)),
                                            SE.getUnknown(Builder.getInt64(1)),
                                            L, SCEV::FlagAnyWrap);
    Value *V = generateSCEV(OuterLIV);
    OutsideLoopIterations[L] = SE.getUnknown(V);
    SubtreeValues.insert(V);
  }

  createKernelFunction(Kernel, SubtreeValues);

  create(isl_ast_node_copy(Kernel->tree));

  Function *F = Builder.GetInsertBlock()->getParent();
  addCUDAAnnotations(F->getParent(), BlockDimX, BlockDimY, BlockDimZ);
  clearDominators(F);
  clearScalarEvolution(F);
  clearLoops(F);

  Builder.SetInsertPoint(&HostInsertPoint);
  IDToValue = HostIDs;

  ValueMap = std::move(HostValueMap);
  ScalarMap = std::move(HostScalarMap);
  PHIOpMap = std::move(HostPHIOpMap);
  EscapeMap.clear();
  IDToSAI.clear();
  Annotator.resetAlternativeAliasBases();
  for (auto &BasePtr : LocalArrays)
    S.invalidateScopArrayInfo(BasePtr, ScopArrayInfo::MK_Array);
  LocalArrays.clear();

  Value *Parameters = createLaunchParameters(Kernel, F, SubtreeValues);

  std::string ASMString = finalizeKernelFunction();
  std::string Name = "kernel_" + std::to_string(Kernel->id);
  Value *KernelString = Builder.CreateGlobalStringPtr(ASMString, Name);
  Value *NameString = Builder.CreateGlobalStringPtr(Name, Name + "_name");
  Value *GPUKernel = createCallGetKernel(KernelString, NameString);

  Value *GridDimX, *GridDimY;
  std::tie(GridDimX, GridDimY) = getGridSizes(Kernel);

  createCallLaunchKernel(GPUKernel, GridDimX, GridDimY, BlockDimX, BlockDimY,
                         BlockDimZ, Parameters);
  createCallFreeKernel(GPUKernel);

  for (auto Id : KernelIds)
    isl_id_free(Id);

  KernelIds.clear();
}

/// Compute the DataLayout string for the NVPTX backend.
///
/// @param is64Bit Are we looking for a 64 bit architecture?
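///
/// For a 64-bit target this currently yields
/// "e-i64:64-v16:16-v32:32-n16:32:64"; for a 32-bit target "-p:32:32" is
/// inserted after the leading "e".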
static std::string computeNVPTXDataLayout(bool is64Bit) {
  std::string Ret = "e";

  if (!is64Bit)
    Ret += "-p:32:32";

  Ret += "-i64:64-v16:16-v32:32-n16:32:64";

  return Ret;
}

Function *
GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel,
                                         SetVector<Value *> &SubtreeValues) {
  std::vector<Type *> Args;
  std::string Identifier = "kernel_" + std::to_string(Kernel->id);

  for (long i = 0; i < Prog->n_array; i++) {
    if (!ppcg_kernel_requires_array_argument(Kernel, i))
      continue;

    Args.push_back(Builder.getInt8PtrTy());
  }

  int NumHostIters = isl_space_dim(Kernel->space, isl_dim_set);

  for (long i = 0; i < NumHostIters; i++)
    Args.push_back(Builder.getInt64Ty());

  int NumVars = isl_space_dim(Kernel->space, isl_dim_param);

  for (long i = 0; i < NumVars; i++) {
    isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_param, i);
    Value *Val = IDToValue[Id];
    isl_id_free(Id);
    Args.push_back(Val->getType());
  }

  for (auto *V : SubtreeValues)
    Args.push_back(V->getType());

  auto *FT = FunctionType::get(Builder.getVoidTy(), Args, false);
  auto *FN = Function::Create(FT, Function::ExternalLinkage, Identifier,
                              GPUModule.get());
  FN->setCallingConv(CallingConv::PTX_Kernel);

  auto Arg = FN->arg_begin();
  for (long i = 0; i < Kernel->n_array; i++) {
    if (!ppcg_kernel_requires_array_argument(Kernel, i))
      continue;

    Arg->setName(Kernel->array[i].array->name);

    isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set);
    const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl_id_copy(Id));
    Type *EleTy = SAI->getElementType();
    Value *Val = &*Arg;
    SmallVector<const SCEV *, 4> Sizes;
    isl_ast_build *Build =
        isl_ast_build_from_context(isl_set_copy(Prog->context));
    for (long j = 1; j < Kernel->array[i].array->n_index; j++) {
      isl_ast_expr *DimSize = isl_ast_build_expr_from_pw_aff(
          Build, isl_pw_aff_copy(Kernel->array[i].array->bound[j]));
      auto V = ExprBuilder.create(DimSize);
      Sizes.push_back(SE.getSCEV(V));
    }
    const ScopArrayInfo *SAIRep =
        S.getOrCreateScopArrayInfo(Val, EleTy, Sizes, ScopArrayInfo::MK_Array);
    LocalArrays.push_back(Val);

    isl_ast_build_free(Build);
    KernelIds.push_back(Id);
    IDToSAI[Id] = SAIRep;
    Arg++;
  }

  for (long i = 0; i < NumHostIters; i++) {
    isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i);
    Arg->setName(isl_id_get_name(Id));
    IDToValue[Id] = &*Arg;
    KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id));
    Arg++;
  }

  for (long i = 0; i < NumVars; i++) {
    isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_param, i);
    Arg->setName(isl_id_get_name(Id));
    Value *Val = IDToValue[Id];
    ValueMap[Val] = &*Arg;
    IDToValue[Id] = &*Arg;
    KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id));
    Arg++;
  }

  for (auto *V : SubtreeValues) {
    Arg->setName(V->getName());
    ValueMap[V] = &*Arg;
    Arg++;
  }

  return FN;
}
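
// For a kernel that references one array and one host iterator, the
// declaration created above corresponds roughly to the following IR
// (names hypothetical):
//
//   define ptx_kernel void @kernel_0(i8* %MemRef_A, i64 %c0)
//
// followed by one argument per referenced parameter and subtree value.
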
void GPUNodeBuilder::insertKernelIntrinsics(ppcg_kernel *Kernel) {
  Intrinsic::ID IntrinsicsBID[] = {Intrinsic::nvvm_read_ptx_sreg_ctaid_x,
                                   Intrinsic::nvvm_read_ptx_sreg_ctaid_y};

  Intrinsic::ID IntrinsicsTID[] = {Intrinsic::nvvm_read_ptx_sreg_tid_x,
                                   Intrinsic::nvvm_read_ptx_sreg_tid_y,
                                   Intrinsic::nvvm_read_ptx_sreg_tid_z};

  auto addId = [this](__isl_take isl_id *Id, Intrinsic::ID Intr) mutable {
    std::string Name = isl_id_get_name(Id);
    Module *M = Builder.GetInsertBlock()->getParent()->getParent();
    Function *IntrinsicFn = Intrinsic::getDeclaration(M, Intr);
    Value *Val = Builder.CreateCall(IntrinsicFn, {});
    Val = Builder.CreateIntCast(Val, Builder.getInt64Ty(), false, Name);
    IDToValue[Id] = Val;
    KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id));
  };

  for (int i = 0; i < Kernel->n_grid; ++i) {
    isl_id *Id = isl_id_list_get_id(Kernel->block_ids, i);
    addId(Id, IntrinsicsBID[i]);
  }

  for (int i = 0; i < Kernel->n_block; ++i) {
    isl_id *Id = isl_id_list_get_id(Kernel->thread_ids, i);
    addId(Id, IntrinsicsTID[i]);
  }
}

void GPUNodeBuilder::prepareKernelArguments(ppcg_kernel *Kernel, Function *FN) {
  auto Arg = FN->arg_begin();
  for (long i = 0; i < Kernel->n_array; i++) {
    if (!ppcg_kernel_requires_array_argument(Kernel, i))
      continue;

    isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set);
    const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl_id_copy(Id));
    isl_id_free(Id);

    if (SAI->getNumberOfDimensions() > 0) {
      Arg++;
      continue;
    }

    Value *Alloca = BlockGen.getOrCreateAlloca(SAI);
    Value *ArgPtr = &*Arg;
    Type *TypePtr = SAI->getElementType()->getPointerTo();
    Value *TypedArgPtr = Builder.CreatePointerCast(ArgPtr, TypePtr);
    Value *Val = Builder.CreateLoad(TypedArgPtr);
    Builder.CreateStore(Val, Alloca);

    Arg++;
  }
}
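
// A shared-memory variable created by createKernelVariables below lowers to
// a zero-initialized global in CUDA address space 3, e.g. for a 32x32 float
// tile (variable name hypothetical):
//
//   @shared_MemRef_A = internal addrspace(3)
//                      global [32 x [32 x float]] zeroinitializer, align 4
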
void GPUNodeBuilder::createKernelVariables(ppcg_kernel *Kernel, Function *FN) {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();

  for (int i = 0; i < Kernel->n_var; ++i) {
    struct ppcg_kernel_var &Var = Kernel->var[i];
    isl_id *Id = isl_space_get_tuple_id(Var.array->space, isl_dim_set);
    Type *EleTy = ScopArrayInfo::getFromId(Id)->getElementType();

    Type *ArrayTy = EleTy;
    SmallVector<const SCEV *, 4> Sizes;

    for (unsigned int j = 1; j < Var.array->n_index; ++j) {
      isl_val *Val = isl_vec_get_element_val(Var.size, j);
      long Bound = isl_val_get_num_si(Val);
      isl_val_free(Val);
      Sizes.push_back(S.getSE()->getConstant(Builder.getInt64Ty(), Bound));
    }

    for (int j = Var.array->n_index - 1; j >= 0; --j) {
      isl_val *Val = isl_vec_get_element_val(Var.size, j);
      long Bound = isl_val_get_num_si(Val);
      isl_val_free(Val);
      ArrayTy = ArrayType::get(ArrayTy, Bound);
    }

    const ScopArrayInfo *SAI;
    Value *Allocation;
    if (Var.type == ppcg_access_shared) {
      auto GlobalVar = new GlobalVariable(
          *M, ArrayTy, false, GlobalValue::InternalLinkage, 0, Var.name,
          nullptr, GlobalValue::ThreadLocalMode::NotThreadLocal, 3);
      GlobalVar->setAlignment(EleTy->getPrimitiveSizeInBits() / 8);
      GlobalVar->setInitializer(Constant::getNullValue(ArrayTy));

      Allocation = GlobalVar;
    } else if (Var.type == ppcg_access_private) {
      Allocation = Builder.CreateAlloca(ArrayTy, 0, "private_array");
    } else {
      llvm_unreachable("unknown variable type");
    }
    SAI = S.getOrCreateScopArrayInfo(Allocation, EleTy, Sizes,
                                     ScopArrayInfo::MK_Array);
    Id = isl_id_alloc(S.getIslCtx(), Var.name, nullptr);
    IDToValue[Id] = Allocation;
    LocalArrays.push_back(Allocation);
    KernelIds.push_back(Id);
    IDToSAI[Id] = SAI;
  }
}

void GPUNodeBuilder::createKernelFunction(ppcg_kernel *Kernel,
                                          SetVector<Value *> &SubtreeValues) {

  std::string Identifier = "kernel_" + std::to_string(Kernel->id);
  GPUModule.reset(new Module(Identifier, Builder.getContext()));
  GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-cuda"));
  GPUModule->setDataLayout(computeNVPTXDataLayout(true /* is64Bit */));

  Function *FN = createKernelFunctionDecl(Kernel, SubtreeValues);

  BasicBlock *PrevBlock = Builder.GetInsertBlock();
  auto EntryBlock = BasicBlock::Create(Builder.getContext(), "entry", FN);

  DominatorTree &DT = P->getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  DT.addNewBlock(EntryBlock, PrevBlock);

  Builder.SetInsertPoint(EntryBlock);
  Builder.CreateRetVoid();
  Builder.SetInsertPoint(EntryBlock, EntryBlock->begin());

  ScopDetection::markFunctionAsInvalid(FN);

  prepareKernelArguments(Kernel, FN);
  createKernelVariables(Kernel, FN);
  insertKernelIntrinsics(Kernel);
}

std::string GPUNodeBuilder::createKernelASM() {
  llvm::Triple GPUTriple(Triple::normalize("nvptx64-nvidia-cuda"));
  std::string ErrMsg;
  auto GPUTarget = TargetRegistry::lookupTarget(GPUTriple.getTriple(), ErrMsg);

  if (!GPUTarget) {
    errs() << ErrMsg << "\n";
    return "";
  }

  TargetOptions Options;
  Options.UnsafeFPMath = FastMath;
  std::unique_ptr<TargetMachine> TargetM(
      GPUTarget->createTargetMachine(GPUTriple.getTriple(), CudaVersion, "",
                                     Options, Optional<Reloc::Model>()));

  SmallString<0> ASMString;
  raw_svector_ostream ASMStream(ASMString);
  llvm::legacy::PassManager PM;

  PM.add(createTargetTransformInfoWrapperPass(TargetM->getTargetIRAnalysis()));

  if (TargetM->addPassesToEmitFile(
          PM, ASMStream, TargetMachine::CGFT_AssemblyFile, true /* verify */)) {
    errs() << "The target does not support generation of this file type!\n";
    return "";
  }

  PM.run(*GPUModule);

  return ASMStream.str();
}

std::string GPUNodeBuilder::finalizeKernelFunction() {
  // Verify module.
  llvm::legacy::PassManager Passes;
  Passes.add(createVerifierPass());
  Passes.run(*GPUModule);

  if (DumpKernelIR)
    outs() << *GPUModule << "\n";

  // Optimize module.
  llvm::legacy::PassManager OptPasses;
  PassManagerBuilder PassBuilder;
  PassBuilder.OptLevel = 3;
  PassBuilder.SizeLevel = 0;
  PassBuilder.populateModulePassManager(OptPasses);
  OptPasses.run(*GPUModule);

  std::string Assembly = createKernelASM();

  if (DumpKernelASM)
    outs() << Assembly << "\n";

  // Free the kernel module; its PTX string has been generated.
  GPUModule.reset();
  KernelIDs.clear();

  return Assembly;
}

namespace {
class PPCGCodeGeneration : public ScopPass {
public:
  static char ID;

  /// The scop that is currently processed.
  Scop *S;

  LoopInfo *LI;
  DominatorTree *DT;
  ScalarEvolution *SE;
  const DataLayout *DL;
  RegionInfo *RI;

  PPCGCodeGeneration() : ScopPass(ID) {}

  /// Construct compilation options for PPCG.
  ///
  /// @returns The compilation options.
  ppcg_options *createPPCGOptions() {
    auto DebugOptions =
        (ppcg_debug_options *)malloc(sizeof(ppcg_debug_options));
    auto Options = (ppcg_options *)malloc(sizeof(ppcg_options));

    DebugOptions->dump_schedule_constraints = false;
    DebugOptions->dump_schedule = false;
    DebugOptions->dump_final_schedule = false;
    DebugOptions->dump_sizes = false;
    DebugOptions->verbose = false;

    Options->debug = DebugOptions;

    Options->reschedule = true;
    Options->scale_tile_loops = false;
    Options->wrap = false;

    Options->non_negative_parameters = false;
    Options->ctx = nullptr;
    Options->sizes = nullptr;

    Options->tile_size = 32;

    Options->use_private_memory = PrivateMemory;
    Options->use_shared_memory = SharedMemory;
    Options->max_shared_memory = 48 * 1024;

    Options->target = PPCG_TARGET_CUDA;
    Options->openmp = false;
    Options->linearize_device_arrays = true;
    Options->live_range_reordering = false;

    Options->opencl_compiler_options = nullptr;
    Options->opencl_use_gpu = false;
    Options->opencl_n_include_file = 0;
    Options->opencl_include_files = nullptr;
    Options->opencl_print_kernel_types = false;
    Options->opencl_embed_kernel_code = false;

    Options->save_schedule_file = nullptr;
    Options->load_schedule_file = nullptr;

    return Options;
  }

  /// Get a tagged access relation containing all accesses of type @p AccessTy.
  ///
  /// Instead of a normal access of the form:
  ///
  ///   Stmt[i,j,k] -> Array[f_0(i,j,k), f_1(i,j,k)]
  ///
  /// a tagged access has the form
  ///
  ///   [Stmt[i,j,k] -> id[]] -> Array[f_0(i,j,k), f_1(i,j,k)]
  ///
  /// where 'id' is an additional space that references the memory access that
  /// triggered the access.
  ///
  /// @param AccessTy The type of the memory accesses to collect.
  ///
  /// @return The relation describing all tagged memory accesses.
  isl_union_map *getTaggedAccesses(enum MemoryAccess::AccessType AccessTy) {
    isl_union_map *Accesses = isl_union_map_empty(S->getParamSpace());

    for (auto &Stmt : *S)
      for (auto &Acc : Stmt)
        if (Acc->getType() == AccessTy) {
          isl_map *Relation = Acc->getAccessRelation();
          Relation = isl_map_intersect_domain(Relation, Stmt.getDomain());

          isl_space *Space = isl_map_get_space(Relation);
          Space = isl_space_range(Space);
          Space = isl_space_from_range(Space);
          Space = isl_space_set_tuple_id(Space, isl_dim_in, Acc->getId());
          isl_map *Universe = isl_map_universe(Space);
          Relation = isl_map_domain_product(Relation, Universe);
          Accesses = isl_union_map_add_map(Accesses, Relation);
        }

    return Accesses;
  }

  /// Get the set of all read accesses, tagged with the access id.
  ///
  /// @see getTaggedAccesses
  isl_union_map *getTaggedReads() {
    return getTaggedAccesses(MemoryAccess::READ);
  }

  /// Get the set of all may (and must) write accesses, tagged with the
  /// access id.
  ///
  /// @see getTaggedAccesses
  isl_union_map *getTaggedMayWrites() {
    return isl_union_map_union(getTaggedAccesses(MemoryAccess::MAY_WRITE),
                               getTaggedAccesses(MemoryAccess::MUST_WRITE));
  }
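
  // As a concrete instance of the tagged form described above, a read of
  // A[i][j] in Stmt[i][j] through an access with id Ref0 is represented as
  // (illustrative isl notation, names hypothetical):
  //
  //   { [Stmt[i, j] -> Ref0[]] -> MemRef_A[i, j] }
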
  /// Get the set of all must write accesses, tagged with the access id.
  ///
  /// @see getTaggedAccesses
  isl_union_map *getTaggedMustWrites() {
    return getTaggedAccesses(MemoryAccess::MUST_WRITE);
  }

  /// Collect parameter and array names as isl_ids.
  ///
  /// To reason about the different parameters and arrays used, ppcg requires
  /// a list of all isl_ids in use. As PPCG traditionally performs
  /// source-to-source compilation each of these isl_ids is mapped to the
  /// expression that represents it. As we do not have a corresponding
  /// expression in Polly, we just map each id to a 'zero' expression to match
  /// the data format that ppcg expects.
  ///
  /// @returns Return a map from collected ids to 'zero' ast expressions.
  __isl_give isl_id_to_ast_expr *getNames() {
    auto *Names = isl_id_to_ast_expr_alloc(
        S->getIslCtx(),
        S->getNumParams() + std::distance(S->array_begin(), S->array_end()));
    auto *Zero = isl_ast_expr_from_val(isl_val_zero(S->getIslCtx()));
    auto *Space = S->getParamSpace();

    for (int I = 0, E = S->getNumParams(); I < E; ++I) {
      isl_id *Id = isl_space_get_dim_id(Space, isl_dim_param, I);
      Names = isl_id_to_ast_expr_set(Names, Id, isl_ast_expr_copy(Zero));
    }

    for (auto &Array : S->arrays()) {
      auto Id = Array->getBasePtrId();
      Names = isl_id_to_ast_expr_set(Names, Id, isl_ast_expr_copy(Zero));
    }

    isl_space_free(Space);
    isl_ast_expr_free(Zero);

    return Names;
  }

  /// Create a new PPCG scop from the current scop.
  ///
  /// The PPCG scop is initialized with data from the current polly::Scop.
  /// From this initial data, the data-dependences in the PPCG scop are
  /// initialized. We do not use Polly's dependence analysis for now, to
  /// ensure we match the PPCG default behaviour more closely.
  ///
  /// @returns A new ppcg scop.
  ppcg_scop *createPPCGScop() {
    auto PPCGScop = (ppcg_scop *)malloc(sizeof(ppcg_scop));

    PPCGScop->options = createPPCGOptions();

    PPCGScop->start = 0;
    PPCGScop->end = 0;

    PPCGScop->context = S->getContext();
    PPCGScop->domain = S->getDomains();
    PPCGScop->call = nullptr;
    PPCGScop->tagged_reads = getTaggedReads();
    PPCGScop->reads = S->getReads();
    PPCGScop->live_in = nullptr;
    PPCGScop->tagged_may_writes = getTaggedMayWrites();
    PPCGScop->may_writes = S->getWrites();
    PPCGScop->tagged_must_writes = getTaggedMustWrites();
    PPCGScop->must_writes = S->getMustWrites();
    PPCGScop->live_out = nullptr;
    PPCGScop->tagged_must_kills = isl_union_map_empty(S->getParamSpace());
    PPCGScop->tagger = nullptr;

    PPCGScop->independence = nullptr;
    PPCGScop->dep_flow = nullptr;
    PPCGScop->tagged_dep_flow = nullptr;
    PPCGScop->dep_false = nullptr;
    PPCGScop->dep_forced = nullptr;
    PPCGScop->dep_order = nullptr;
    PPCGScop->tagged_dep_order = nullptr;

    PPCGScop->schedule = S->getScheduleTree();
    PPCGScop->names = getNames();

    PPCGScop->pet = nullptr;

    compute_tagger(PPCGScop);
    compute_dependences(PPCGScop);

    return PPCGScop;
  }

  /// Collect the array accesses in a statement.
  ///
  /// @param Stmt The statement for which to collect the accesses.
  ///
  /// @returns A list of array accesses.
  /// Create a new PPCG scop from the current scop.
  ///
  /// The PPCG scop is initialized with data from the current polly::Scop.
  /// From this initial data, the data-dependences in the PPCG scop are
  /// computed. We do not use Polly's dependence analysis for now, to ensure
  /// we match the PPCG default behaviour more closely.
  ///
  /// @returns A new ppcg scop.
  ppcg_scop *createPPCGScop() {
    auto PPCGScop = (ppcg_scop *)malloc(sizeof(ppcg_scop));

    PPCGScop->options = createPPCGOptions();

    PPCGScop->start = 0;
    PPCGScop->end = 0;

    PPCGScop->context = S->getContext();
    PPCGScop->domain = S->getDomains();
    PPCGScop->call = nullptr;
    PPCGScop->tagged_reads = getTaggedReads();
    PPCGScop->reads = S->getReads();
    PPCGScop->live_in = nullptr;
    PPCGScop->tagged_may_writes = getTaggedMayWrites();
    PPCGScop->may_writes = S->getWrites();
    PPCGScop->tagged_must_writes = getTaggedMustWrites();
    PPCGScop->must_writes = S->getMustWrites();
    PPCGScop->live_out = nullptr;
    PPCGScop->tagged_must_kills = isl_union_map_empty(S->getParamSpace());
    PPCGScop->tagger = nullptr;

    PPCGScop->independence = nullptr;
    PPCGScop->dep_flow = nullptr;
    PPCGScop->tagged_dep_flow = nullptr;
    PPCGScop->dep_false = nullptr;
    PPCGScop->dep_forced = nullptr;
    PPCGScop->dep_order = nullptr;
    PPCGScop->tagged_dep_order = nullptr;

    PPCGScop->schedule = S->getScheduleTree();
    PPCGScop->names = getNames();

    PPCGScop->pet = nullptr;

    compute_tagger(PPCGScop);
    compute_dependences(PPCGScop);

    return PPCGScop;
  }

  /// Collect the array accesses in a statement.
  ///
  /// @param Stmt The statement for which to collect the accesses.
  ///
  /// @returns A linked list of the statement's array accesses.
  gpu_stmt_access *getStmtAccesses(ScopStmt &Stmt) {
    gpu_stmt_access *Accesses = nullptr;

    for (MemoryAccess *Acc : Stmt) {
      auto Access = isl_alloc_type(S->getIslCtx(), struct gpu_stmt_access);
      Access->read = Acc->isRead();
      Access->write = Acc->isWrite();
      Access->access = Acc->getAccessRelation();
      isl_space *Space = isl_map_get_space(Access->access);
      Space = isl_space_range(Space);
      Space = isl_space_from_range(Space);
      Space = isl_space_set_tuple_id(Space, isl_dim_in, Acc->getId());
      isl_map *Universe = isl_map_universe(Space);
      Access->tagged_access =
          isl_map_domain_product(Acc->getAccessRelation(), Universe);
      Access->exact_write = !Acc->isMayWrite();
      Access->ref_id = Acc->getId();
      Access->next = Accesses;
      Access->n_index = Acc->getScopArrayInfo()->getNumberOfDimensions();
      Accesses = Access;
    }

    return Accesses;
  }

  /// Collect the list of GPU statements.
  ///
  /// Each statement has an id, a pointer to the underlying data structure,
  /// as well as a list of all its memory accesses.
  ///
  /// @returns An array of GPU statements.
  gpu_stmt *getStatements() {
    gpu_stmt *Stmts = isl_calloc_array(S->getIslCtx(), struct gpu_stmt,
                                       std::distance(S->begin(), S->end()));

    int i = 0;
    for (auto &Stmt : *S) {
      gpu_stmt *GPUStmt = &Stmts[i];

      GPUStmt->id = Stmt.getDomainId();

      // We use the pet stmt pointer to keep track of the Polly statements.
      GPUStmt->stmt = (pet_stmt *)&Stmt;
      GPUStmt->accesses = getStmtAccesses(Stmt);
      i++;
    }

    return Stmts;
  }
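  // The result is a plain array of gpu_stmt, each carrying a singly-linked
  // list of its accesses, e.g. (schematic, names hypothetical):
  //
  //   Stmts[0] { id = Stmt0[], stmt = (pet_stmt *)&Stmt0,
  //              accesses = Acc2 -> Acc1 -> Acc0 -> null }
  //
  // Note that getStmtAccesses prepends to the list, so the accesses appear
  // in the reverse of the iteration order over the statement's accesses.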
  /// Derive the extent of an array.
  ///
  /// The extent of an array is the set of elements that lie within the
  /// accessed array. For the inner dimensions, the extent constrains the
  /// subscripts to be non-negative and smaller than the size of the
  /// corresponding array dimension. For the first (outermost) dimension, the
  /// extent is bounded by the minimal and maximal subscript values that are
  /// actually accessed.
  ///
  /// @param Array The array to derive the extent for.
  ///
  /// @returns An isl_set describing the extent of the array.
  __isl_give isl_set *getExtent(ScopArrayInfo *Array) {
    unsigned NumDims = Array->getNumberOfDimensions();
    isl_union_map *Accesses = S->getAccesses();
    Accesses = isl_union_map_intersect_domain(Accesses, S->getDomains());
    Accesses = isl_union_map_detect_equalities(Accesses);
    isl_union_set *AccessUSet = isl_union_map_range(Accesses);
    AccessUSet = isl_union_set_coalesce(AccessUSet);
    AccessUSet = isl_union_set_detect_equalities(AccessUSet);
    AccessUSet = isl_union_set_coalesce(AccessUSet);

    if (isl_union_set_is_empty(AccessUSet)) {
      isl_union_set_free(AccessUSet);
      return isl_set_empty(Array->getSpace());
    }

    if (NumDims == 0) {
      isl_union_set_free(AccessUSet);
      return isl_set_universe(Array->getSpace());
    }

    isl_set *AccessSet =
        isl_union_set_extract_set(AccessUSet, Array->getSpace());

    isl_union_set_free(AccessUSet);
    isl_local_space *LS = isl_local_space_from_space(Array->getSpace());

    isl_pw_aff *Val =
        isl_pw_aff_from_aff(isl_aff_var_on_domain(LS, isl_dim_set, 0));

    isl_pw_aff *OuterMin = isl_set_dim_min(isl_set_copy(AccessSet), 0);
    isl_pw_aff *OuterMax = isl_set_dim_max(AccessSet, 0);
    OuterMin = isl_pw_aff_add_dims(OuterMin, isl_dim_in,
                                   isl_pw_aff_dim(Val, isl_dim_in));
    OuterMax = isl_pw_aff_add_dims(OuterMax, isl_dim_in,
                                   isl_pw_aff_dim(Val, isl_dim_in));
    OuterMin =
        isl_pw_aff_set_tuple_id(OuterMin, isl_dim_in, Array->getBasePtrId());
    OuterMax =
        isl_pw_aff_set_tuple_id(OuterMax, isl_dim_in, Array->getBasePtrId());

    isl_set *Extent = isl_set_universe(Array->getSpace());

    Extent = isl_set_intersect(
        Extent, isl_pw_aff_le_set(OuterMin, isl_pw_aff_copy(Val)));
    Extent = isl_set_intersect(Extent, isl_pw_aff_ge_set(OuterMax, Val));

    for (unsigned i = 1; i < NumDims; ++i)
      Extent = isl_set_lower_bound_si(Extent, isl_dim_set, i, 0);

    for (unsigned i = 1; i < NumDims; ++i) {
      isl_pw_aff *PwAff =
          const_cast<isl_pw_aff *>(Array->getDimensionSizePw(i));
      isl_pw_aff *Val = isl_pw_aff_from_aff(isl_aff_var_on_domain(
          isl_local_space_from_space(Array->getSpace()), isl_dim_set, i));
      PwAff = isl_pw_aff_add_dims(PwAff, isl_dim_in,
                                  isl_pw_aff_dim(Val, isl_dim_in));
      PwAff = isl_pw_aff_set_tuple_id(PwAff, isl_dim_in,
                                      isl_pw_aff_get_tuple_id(Val, isl_dim_in));
      auto *Set = isl_pw_aff_gt_set(PwAff, Val);
      Extent = isl_set_intersect(Set, Extent);
    }

    return Extent;
  }
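  // Worked example (assumed shapes, for illustration only): for an array
  // 'MemRef_A' with inner dimension size 1024, whose outermost subscript is
  // accessed in the range [0, n - 1], the derived extent would be
  //
  //   [n] -> { MemRef_A[i0, i1] : 0 <= i0 < n and 0 <= i1 < 1024 }
  //
  // The outermost dimension is bounded by the minimal and maximal subscripts
  // actually accessed, the inner one by the declared dimension size.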
  /// Derive the bounds of an array.
  ///
  /// For the first dimension, we derive the bound from the extent of that
  /// dimension. For the inner dimensions, we obtain the sizes directly from
  /// ScopArrayInfo.
  ///
  /// @param PPCGArray The array to compute bounds for.
  /// @param Array The Polly array from which to take the information.
  void setArrayBounds(gpu_array_info &PPCGArray, ScopArrayInfo *Array) {
    if (PPCGArray.n_index > 0) {
      isl_set *Dom = isl_set_copy(PPCGArray.extent);
      Dom = isl_set_project_out(Dom, isl_dim_set, 1, PPCGArray.n_index - 1);
      isl_pw_aff *Bound = isl_set_dim_max(isl_set_copy(Dom), 0);
      isl_set_free(Dom);
      Dom = isl_pw_aff_domain(isl_pw_aff_copy(Bound));
      isl_local_space *LS = isl_local_space_from_space(isl_set_get_space(Dom));
      isl_aff *One = isl_aff_zero_on_domain(LS);
      One = isl_aff_add_constant_si(One, 1);
      Bound = isl_pw_aff_add(Bound, isl_pw_aff_alloc(Dom, One));
      Bound = isl_pw_aff_gist(Bound, S->getContext());
      PPCGArray.bound[0] = Bound;
    }

    for (unsigned i = 1; i < PPCGArray.n_index; ++i) {
      isl_pw_aff *Bound = Array->getDimensionSizePw(i);
      auto LS = isl_pw_aff_get_domain_space(Bound);
      auto Aff = isl_multi_aff_zero(LS);
      Bound = isl_pw_aff_pullback_multi_aff(Bound, Aff);
      PPCGArray.bound[i] = Bound;
    }
  }
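  // Continuing the extent example above (again an illustrative sketch, not
  // output of an actual run): for the extent
  //
  //   [n] -> { MemRef_A[i0, i1] : 0 <= i0 < n and 0 <= i1 < 1024 }
  //
  // the derived bounds would be bound[0] = n, i.e. the maximal outermost
  // subscript plus one, and bound[1] = 1024, the size from ScopArrayInfo.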
  /// Create the arrays for @p PPCGProg.
  ///
  /// @param PPCGProg The program to compute the arrays for.
  void createArrays(gpu_prog *PPCGProg) {
    int i = 0;
    for (auto &Array : S->arrays()) {
      std::string TypeName;
      raw_string_ostream OS(TypeName);

      OS << *Array->getElementType();
      TypeName = OS.str();

      gpu_array_info &PPCGArray = PPCGProg->array[i];

      PPCGArray.space = Array->getSpace();
      PPCGArray.type = strdup(TypeName.c_str());
      PPCGArray.size = Array->getElementType()->getPrimitiveSizeInBits() / 8;
      PPCGArray.name = strdup(Array->getName().c_str());
      PPCGArray.extent = nullptr;
      PPCGArray.n_index = Array->getNumberOfDimensions();
      PPCGArray.bound =
          isl_alloc_array(S->getIslCtx(), isl_pw_aff *, PPCGArray.n_index);
      PPCGArray.extent = getExtent(Array);
      PPCGArray.n_ref = 0;
      PPCGArray.refs = nullptr;
      PPCGArray.accessed = true;
      PPCGArray.read_only_scalar = false;
      PPCGArray.has_compound_element = false;
      PPCGArray.local = false;
      PPCGArray.declare_local = false;
      PPCGArray.global = false;
      PPCGArray.linearize = false;
      PPCGArray.dep_order = nullptr;
      PPCGArray.user = Array;

      setArrayBounds(PPCGArray, Array);
      i++;

      collect_references(PPCGProg, &PPCGArray);
    }
  }

  /// Create an identity map between the arrays in the scop.
  ///
  /// @returns An identity map between the arrays in the scop.
  isl_union_map *getArrayIdentity() {
    isl_union_map *Maps = isl_union_map_empty(S->getParamSpace());

    for (auto &Array : S->arrays()) {
      isl_space *Space = Array->getSpace();
      Space = isl_space_map_from_set(Space);
      isl_map *Identity = isl_map_identity(Space);
      Maps = isl_union_map_add_map(Maps, Identity);
    }

    return Maps;
  }
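  // For two arrays MemRef_A (2D) and MemRef_B (1D), the identity map would
  // look like (hypothetical names, for illustration):
  //
  //   { MemRef_A[i0, i1] -> MemRef_A[i0, i1]; MemRef_B[i0] -> MemRef_B[i0] }
  //
  // createPPCGProg below installs this map as ppcg's to_inner/to_outer
  // mappings, which relate accessed elements to their enclosing arrays when
  // compound (struct) elements occur; as Polly does not model such nesting,
  // the identity map suffices.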
  /// Create a default-initialized PPCG GPU program.
  ///
  /// @returns A new GPU program description.
  gpu_prog *createPPCGProg(ppcg_scop *PPCGScop) {

    if (!PPCGScop)
      return nullptr;

    auto PPCGProg = isl_calloc_type(S->getIslCtx(), struct gpu_prog);

    PPCGProg->ctx = S->getIslCtx();
    PPCGProg->scop = PPCGScop;
    PPCGProg->context = isl_set_copy(PPCGScop->context);
    PPCGProg->read = isl_union_map_copy(PPCGScop->reads);
    PPCGProg->may_write = isl_union_map_copy(PPCGScop->may_writes);
    PPCGProg->must_write = isl_union_map_copy(PPCGScop->must_writes);
    PPCGProg->tagged_must_kill =
        isl_union_map_copy(PPCGScop->tagged_must_kills);
    PPCGProg->to_inner = getArrayIdentity();
    PPCGProg->to_outer = getArrayIdentity();
    PPCGProg->any_to_outer = nullptr;
    PPCGProg->array_order = nullptr;
    PPCGProg->n_stmts = std::distance(S->begin(), S->end());
    PPCGProg->stmts = getStatements();
    PPCGProg->n_array = std::distance(S->array_begin(), S->array_end());
    PPCGProg->array = isl_calloc_array(S->getIslCtx(), struct gpu_array_info,
                                       PPCGProg->n_array);

    createArrays(PPCGProg);

    PPCGProg->may_persist = compute_may_persist(PPCGProg);

    return PPCGProg;
  }

  struct PrintGPUUserData {
    struct cuda_info *CudaInfo;
    struct gpu_prog *PPCGProg;
    std::vector<ppcg_kernel *> Kernels;
  };

  /// Print a user statement node in the host code.
  ///
  /// We use ppcg's printing facilities to print the actual statement and
  /// additionally build up a list of all kernels that are encountered in the
  /// host ast.
  ///
  /// @param P The printer to print to.
  /// @param Options The printing options to use.
  /// @param Node The node to print.
  /// @param User A user pointer to carry additional data. This pointer is
  ///             expected to be of type PrintGPUUserData.
  ///
  /// @returns A printer to which the output has been printed.
  static __isl_give isl_printer *
  printHostUser(__isl_take isl_printer *P,
                __isl_take isl_ast_print_options *Options,
                __isl_take isl_ast_node *Node, void *User) {
    auto Data = (struct PrintGPUUserData *)User;
    auto Id = isl_ast_node_get_annotation(Node);

    if (Id) {
      bool IsUser = !strcmp(isl_id_get_name(Id), "user");

      // If this is a user statement, format it ourselves, as ppcg would
      // otherwise try to call pet functionality that is not available in
      // Polly.
      if (IsUser) {
        P = isl_printer_start_line(P);
        P = isl_printer_print_ast_node(P, Node);
        P = isl_printer_end_line(P);
        isl_id_free(Id);
        isl_ast_print_options_free(Options);
        return P;
      }

      auto Kernel = (struct ppcg_kernel *)isl_id_get_user(Id);
      isl_id_free(Id);
      Data->Kernels.push_back(Kernel);
    }

    return print_host_user(P, Options, Node, User);
  }

  /// Print C code corresponding to the control flow in @p Kernel.
  ///
  /// @param Kernel The kernel to print.
  void printKernel(ppcg_kernel *Kernel) {
    auto *P = isl_printer_to_str(S->getIslCtx());
    P = isl_printer_set_output_format(P, ISL_FORMAT_C);
    auto *Options = isl_ast_print_options_alloc(S->getIslCtx());
    P = isl_ast_node_print(Kernel->tree, P, Options);
    char *String = isl_printer_get_str(P);
    printf("%s\n", String);
    free(String);
    isl_printer_free(P);
  }

  /// Print C code corresponding to the GPU code described by @p Tree.
  ///
  /// @param Tree An AST describing GPU code.
  /// @param PPCGProg The PPCG program from which @p Tree has been constructed.
  void printGPUTree(isl_ast_node *Tree, gpu_prog *PPCGProg) {
    auto *P = isl_printer_to_str(S->getIslCtx());
    P = isl_printer_set_output_format(P, ISL_FORMAT_C);

    PrintGPUUserData Data;
    Data.PPCGProg = PPCGProg;

    auto *Options = isl_ast_print_options_alloc(S->getIslCtx());
    Options =
        isl_ast_print_options_set_print_user(Options, printHostUser, &Data);
    P = isl_ast_node_print(Tree, P, Options);
    char *String = isl_printer_get_str(P);
    printf("# host\n");
    printf("%s\n", String);
    free(String);
    isl_printer_free(P);

    for (auto Kernel : Data.Kernels) {
      printf("# kernel%d\n", Kernel->id);
      printKernel(Kernel);
    }
  }

  // Generate a GPU program using PPCG.
  //
  // GPU mapping consists of multiple steps:
  //
  //  1) Compute a new schedule for the program.
  //  2) Map the schedule to the GPU.
  //  3) Generate code for the new schedule.
  //
  // We do not use the Polly ScheduleOptimizer here, as it is mostly
  // CPU-specific. Instead, we use PPCG's GPU code generation strategy
  // directly from this pass.
  gpu_gen *generateGPU(ppcg_scop *PPCGScop, gpu_prog *PPCGProg) {

    auto PPCGGen = isl_calloc_type(S->getIslCtx(), struct gpu_gen);

    PPCGGen->ctx = S->getIslCtx();
    PPCGGen->options = PPCGScop->options;
    PPCGGen->print = nullptr;
    PPCGGen->print_user = nullptr;
    PPCGGen->build_ast_expr = &pollyBuildAstExprForStmt;
    PPCGGen->prog = PPCGProg;
    PPCGGen->tree = nullptr;
    PPCGGen->types.n = 0;
    PPCGGen->types.name = nullptr;
    PPCGGen->sizes = nullptr;
    PPCGGen->used_sizes = nullptr;
    PPCGGen->kernel_id = 0;

    // Set the isl scheduling options to match the strategy PPCG uses by
    // default.
    isl_options_set_schedule_outer_coincidence(PPCGGen->ctx, true);
    isl_options_set_schedule_maximize_band_depth(PPCGGen->ctx, true);
    isl_options_set_schedule_whole_component(PPCGGen->ctx, false);

    isl_schedule *Schedule = get_schedule(PPCGGen);

    int HasPermutable = has_any_permutable_node(Schedule);

    if (HasPermutable < 0 || !HasPermutable) {
      Schedule = isl_schedule_free(Schedule);
    } else {
      Schedule = map_to_device(PPCGGen, Schedule);
      PPCGGen->tree = generate_code(PPCGGen, isl_schedule_copy(Schedule));
    }

    if (DumpSchedule) {
      isl_printer *P = isl_printer_to_str(S->getIslCtx());
      P = isl_printer_set_yaml_style(P, ISL_YAML_STYLE_BLOCK);
      P = isl_printer_print_str(P, "Schedule\n");
      P = isl_printer_print_str(P, "========\n");
      if (Schedule)
        P = isl_printer_print_schedule(P, Schedule);
      else
        P = isl_printer_print_str(P, "No schedule found\n");

      char *Str = isl_printer_get_str(P);
      printf("%s\n", Str);
      free(Str);
      isl_printer_free(P);
    }

    if (DumpCode) {
      printf("Code\n");
      printf("====\n");
      if (PPCGGen->tree)
        printGPUTree(PPCGGen->tree, PPCGProg);
      else
        printf("No code generated\n");
    }

    isl_schedule_free(Schedule);

    return PPCGGen;
  }
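  // With -polly-acc-dump-schedule, the schedule is printed as an isl
  // schedule tree in block YAML style, roughly of the shape (illustrative
  // sketch only, not real output):
  //
  //   domain: "[n] -> { Stmt[i] : 0 <= i < n }"
  //   child:
  //     schedule: "[n] -> [{ Stmt[i] -> [(i)] }]"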
  /// Free a gpu_gen structure.
  ///
  /// @param PPCGGen The gpu_gen object to free.
  void freePPCGGen(gpu_gen *PPCGGen) {
    isl_ast_node_free(PPCGGen->tree);
    isl_union_map_free(PPCGGen->sizes);
    isl_union_map_free(PPCGGen->used_sizes);
    free(PPCGGen);
  }

  /// Free the options in the ppcg scop structure.
  ///
  /// ppcg does not free these options for us, so to avoid memory leaks we
  /// free them ourselves.
  ///
  /// @param PPCGScop The scop referencing the options to free.
  void freeOptions(ppcg_scop *PPCGScop) {
    free(PPCGScop->options->debug);
    PPCGScop->options->debug = nullptr;
    free(PPCGScop->options);
    PPCGScop->options = nullptr;
  }

  /// Generate code for a given GPU AST described by @p Root.
  ///
  /// @param Root An isl_ast_node pointing to the root of the GPU AST.
  /// @param Prog The GPU program to generate code for.
  void generateCode(__isl_take isl_ast_node *Root, gpu_prog *Prog) {
    ScopAnnotator Annotator;
    Annotator.buildAliasScopes(*S);

    Region *R = &S->getRegion();

    simplifyRegion(R, DT, LI, RI);

    BasicBlock *EnteringBB = R->getEnteringBlock();

    PollyIRBuilder Builder = createPollyIRBuilder(EnteringBB, Annotator);

    GPUNodeBuilder NodeBuilder(Builder, Annotator, this, *DL, *LI, *SE, *DT,
                               *S, Prog);

    // Only build the run-time condition and parameters _after_ having
    // introduced the conditional branch. This is important, as the
    // conditional branch guards the original scop from new induction
    // variables that the SCEVExpander may introduce while code generating
    // the parameters, which could introduce scalar dependences that prevent
    // us from generating code for this scop correctly.
    BasicBlock *StartBlock =
        executeScopConditionally(*S, this, Builder.getTrue());

    // TODO: Handle LICM.
    auto SplitBlock = StartBlock->getSinglePredecessor();
    Builder.SetInsertPoint(SplitBlock->getTerminator());
    NodeBuilder.addParameters(S->getContext());

    isl_ast_build *Build = isl_ast_build_alloc(S->getIslCtx());
    isl_ast_expr *Condition = IslAst::buildRunCondition(S, Build);
    isl_ast_build_free(Build);

    Value *RTC = NodeBuilder.createRTC(Condition);
    Builder.GetInsertBlock()->getTerminator()->setOperand(0, RTC);

    Builder.SetInsertPoint(&*StartBlock->begin());

    NodeBuilder.initializeAfterRTH();
    NodeBuilder.create(Root);
    NodeBuilder.finalize();
  }
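  // Control-flow sketch after executeScopConditionally, with the run-time
  // check wired in (block names are hypothetical):
  //
  //   SplitBlock:
  //     br i1 %RTC, label %StartBlock /* new GPU code */, label %OriginalScop
  //
  // %RTC replaces the 'true' placeholder that was passed to
  // executeScopConditionally, via the setOperand(0, RTC) call above.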
  bool runOnScop(Scop &CurrentScop) override {
    S = &CurrentScop;
    LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
    DL = &S->getRegion().getEntry()->getParent()->getParent()->getDataLayout();
    RI = &getAnalysis<RegionInfoPass>().getRegionInfo();

    // We currently do not support scops with invariant loads.
    if (S->hasInvariantAccesses())
      return false;

    auto PPCGScop = createPPCGScop();
    auto PPCGProg = createPPCGProg(PPCGScop);
    auto PPCGGen = generateGPU(PPCGScop, PPCGProg);

    if (PPCGGen->tree)
      generateCode(isl_ast_node_copy(PPCGGen->tree), PPCGProg);

    freeOptions(PPCGScop);
    freePPCGGen(PPCGGen);
    gpu_prog_free(PPCGProg);
    ppcg_scop_free(PPCGScop);

    return true;
  }

  void printScop(raw_ostream &, Scop &) const override {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<RegionInfoPass>();
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addRequired<ScopDetection>();
    AU.addRequired<ScopInfoRegionPass>();
    AU.addRequired<LoopInfoWrapperPass>();

    AU.addPreserved<AAResultsWrapperPass>();
    AU.addPreserved<BasicAAWrapperPass>();
    AU.addPreserved<LoopInfoWrapperPass>();
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addPreserved<GlobalsAAWrapperPass>();
    AU.addPreserved<PostDominatorTreeWrapperPass>();
    AU.addPreserved<ScopDetection>();
    AU.addPreserved<ScalarEvolutionWrapperPass>();
    AU.addPreserved<SCEVAAWrapperPass>();

    // FIXME: We do not yet add regions for the newly generated code to the
    // region tree.
    AU.addPreserved<RegionInfoPass>();
    AU.addPreserved<ScopInfoRegionPass>();
  }
};
} // namespace

char PPCGCodeGeneration::ID = 1;

Pass *polly::createPPCGCodeGenerationPass() { return new PPCGCodeGeneration(); }

INITIALIZE_PASS_BEGIN(PPCGCodeGeneration, "polly-codegen-ppcg",
                      "Polly - Apply PPCG translation to SCOP", false, false)
INITIALIZE_PASS_DEPENDENCY(DependenceInfo);
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass);
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass);
INITIALIZE_PASS_DEPENDENCY(RegionInfoPass);
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass);
INITIALIZE_PASS_DEPENDENCY(ScopDetection);
INITIALIZE_PASS_END(PPCGCodeGeneration, "polly-codegen-ppcg",
                    "Polly - Apply PPCG translation to SCOP", false, false)
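// Usage sketch (illustrative; assumes an 'opt' binary with Polly loaded and
// a module 'input.ll' containing a detectable scop; additional Polly flags
// may be required depending on the configuration):
//
//   opt -polly-codegen-ppcg -polly-acc-dump-schedule -polly-acc-dump-code \
//       input.ll -disable-output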