//===------ PPCGCodeGeneration.cpp - Polly Accelerator Code Generation. ---===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// Take a scop created by ScopInfo and map it to GPU code using the ppcg
// GPU mapping strategy.
//
//===----------------------------------------------------------------------===//

#include "polly/CodeGen/IslNodeBuilder.h"
#include "polly/CodeGen/Utils.h"
#include "polly/DependenceInfo.h"
#include "polly/LinkAllPasses.h"
#include "polly/Options.h"
#include "polly/ScopDetection.h"
#include "polly/ScopInfo.h"
#include "polly/Support/SCEVValidator.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"

#include "isl/union_map.h"

extern "C" {
#include "ppcg/cuda.h"
#include "ppcg/gpu.h"
#include "ppcg/gpu_print.h"
#include "ppcg/ppcg.h"
#include "ppcg/schedule.h"
}

#include "llvm/Support/Debug.h"

using namespace polly;
using namespace llvm;

#define DEBUG_TYPE "polly-codegen-ppcg"

static cl::opt<bool> DumpSchedule("polly-acc-dump-schedule",
                                  cl::desc("Dump the computed GPU Schedule"),
                                  cl::Hidden, cl::init(false), cl::ZeroOrMore,
                                  cl::cat(PollyCategory));

static cl::opt<bool>
    DumpCode("polly-acc-dump-code",
             cl::desc("Dump C code describing the GPU mapping"), cl::Hidden,
             cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));

static cl::opt<bool> DumpKernelIR("polly-acc-dump-kernel-ir",
                                  cl::desc("Dump the kernel LLVM-IR"),
                                  cl::Hidden, cl::init(false), cl::ZeroOrMore,
                                  cl::cat(PollyCategory));

static cl::opt<bool> DumpKernelASM("polly-acc-dump-kernel-asm",
                                   cl::desc("Dump the kernel assembly code"),
                                   cl::Hidden, cl::init(false), cl::ZeroOrMore,
                                   cl::cat(PollyCategory));

static cl::opt<bool> FastMath("polly-acc-fastmath",
                              cl::desc("Allow unsafe math optimizations"),
                              cl::Hidden, cl::init(false), cl::ZeroOrMore,
                              cl::cat(PollyCategory));

static cl::opt<bool> SharedMemory("polly-acc-use-shared",
                                  cl::desc("Use shared memory"), cl::Hidden,
                                  cl::init(false), cl::ZeroOrMore,
                                  cl::cat(PollyCategory));

static cl::opt<std::string>
    CudaVersion("polly-acc-cuda-version",
                cl::desc("The CUDA version to compile for"), cl::Hidden,
                cl::init("sm_30"), cl::ZeroOrMore, cl::cat(PollyCategory));
/// Create the AST expressions for a ScopStmt.
///
/// This function is a callback used to generate the AST expressions for each
/// of the scheduled ScopStmts.
static __isl_give isl_id_to_ast_expr *pollyBuildAstExprForStmt(
    void *StmtT, isl_ast_build *Build,
    isl_multi_pw_aff *(*FunctionIndex)(__isl_take isl_multi_pw_aff *MPA,
                                       isl_id *Id, void *User),
    void *UserIndex,
    isl_ast_expr *(*FunctionExpr)(isl_ast_expr *Expr, isl_id *Id, void *User),
    void *UserExpr) {

  ScopStmt *Stmt = (ScopStmt *)StmtT;

  isl_ctx *Ctx;

  if (!Stmt || !Build)
    return NULL;

  Ctx = isl_ast_build_get_ctx(Build);
  isl_id_to_ast_expr *RefToExpr = isl_id_to_ast_expr_alloc(Ctx, 0);

  for (MemoryAccess *Acc : *Stmt) {
    isl_map *AddrFunc = Acc->getAddressFunction();
    AddrFunc = isl_map_intersect_domain(AddrFunc, Stmt->getDomain());
    isl_id *RefId = Acc->getId();
    isl_pw_multi_aff *PMA = isl_pw_multi_aff_from_map(AddrFunc);
    isl_multi_pw_aff *MPA = isl_multi_pw_aff_from_pw_multi_aff(PMA);
    MPA = isl_multi_pw_aff_coalesce(MPA);
    MPA = FunctionIndex(MPA, RefId, UserIndex);
    isl_ast_expr *Access = isl_ast_build_access_from_multi_pw_aff(Build, MPA);
    Access = FunctionExpr(Access, RefId, UserExpr);
    RefToExpr = isl_id_to_ast_expr_set(RefToExpr, RefId, Access);
  }

  return RefToExpr;
}
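// As an illustration (names are hypothetical), for a statement Stmt[i] that
// reads A[i + 1], the table returned above maps the access' reference id to
// the AST expression built for its address, roughly:
//
//   Ref_A_read -> A[i + 1]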
/// Generate code for a GPU specific isl AST.
///
/// The GPUNodeBuilder augments the existing IslNodeBuilder, which generates
/// code for general-purpose AST nodes, with special functionality for
/// generating GPU specific user nodes.
///
/// @see GPUNodeBuilder::createUser
class GPUNodeBuilder : public IslNodeBuilder {
public:
  GPUNodeBuilder(PollyIRBuilder &Builder, ScopAnnotator &Annotator, Pass *P,
                 const DataLayout &DL, LoopInfo &LI, ScalarEvolution &SE,
                 DominatorTree &DT, Scop &S, gpu_prog *Prog)
      : IslNodeBuilder(Builder, Annotator, P, DL, LI, SE, DT, S), Prog(Prog) {
    getExprBuilder().setIDToSAI(&IDToSAI);
  }

  /// Create after-run-time-check initialization code.
  void initializeAfterRTH();

  /// Finalize the generated scop.
  virtual void finalize();

private:
  /// A vector of array base pointers for which a new ScopArrayInfo was
  /// created.
  ///
  /// This vector is used to delete the ScopArrayInfo when it is not needed
  /// any more.
  std::vector<Value *> LocalArrays;

  /// A map from ScopArrays to their corresponding device allocations.
  std::map<ScopArrayInfo *, Value *> DeviceAllocations;

  /// The current GPU context.
  Value *GPUContext;

  /// The set of isl_ids allocated in the kernel.
  std::vector<isl_id *> KernelIds;

  /// A module containing GPU code.
  ///
  /// This pointer is only set in case we are currently generating GPU code.
  std::unique_ptr<Module> GPUModule;

  /// The GPU program we generate code for.
  gpu_prog *Prog;

  /// Class to free isl_ids.
  class IslIdDeleter {
  public:
    void operator()(__isl_take isl_id *Id) { isl_id_free(Id); };
  };

  /// A set containing all isl_ids allocated in a GPU kernel.
  ///
  /// By releasing this set all isl_ids will be freed.
  std::set<std::unique_ptr<isl_id, IslIdDeleter>> KernelIDs;

  IslExprBuilder::IDToScopArrayInfoTy IDToSAI;

  /// Create code for user-defined AST nodes.
  ///
  /// These AST nodes can be of type:
  ///
  /// - ScopStmt: A computational statement (TODO)
  /// - Kernel: A GPU kernel call (TODO)
  /// - Data-Transfer: A GPU <-> CPU data-transfer
  /// - In-kernel synchronization
  /// - In-kernel memory copy statement
  ///
  /// @param UserStmt The ast node to generate code for.
  virtual void createUser(__isl_take isl_ast_node *UserStmt);

  enum DataDirection { HOST_TO_DEVICE, DEVICE_TO_HOST };

  /// Create code for a data transfer statement.
  ///
  /// @param TransferStmt The data transfer statement.
  /// @param Direction    The direction in which to transfer data.
  void createDataTransfer(__isl_take isl_ast_node *TransferStmt,
                          enum DataDirection Direction);

  /// Find llvm::Values referenced in a GPU kernel.
  ///
  /// @param Kernel The kernel to scan for llvm::Values.
  ///
  /// @returns A set of values referenced by the kernel.
  SetVector<Value *> getReferencesInKernel(ppcg_kernel *Kernel);

  /// Compute the sizes of the execution grid for a given kernel.
  ///
  /// @param Kernel The kernel to compute grid sizes for.
  ///
  /// @returns A tuple with grid sizes for the X and Y dimensions.
  std::tuple<Value *, Value *> getGridSizes(ppcg_kernel *Kernel);

  /// Compute the sizes of the thread blocks for a given kernel.
  ///
  /// @param Kernel The kernel to compute thread block sizes for.
  ///
  /// @returns A tuple with thread block sizes for the X, Y, and Z dimensions.
  std::tuple<Value *, Value *, Value *> getBlockSizes(ppcg_kernel *Kernel);

  /// Create kernel launch parameters.
  ///
  /// @param Kernel        The kernel to create parameters for.
  /// @param F             The kernel function that has been created.
  /// @param SubtreeValues The set of llvm::Values referenced by this kernel.
  ///
  /// @returns A stack allocated array with pointers to the parameter
  ///          values that are passed to the kernel.
  Value *createLaunchParameters(ppcg_kernel *Kernel, Function *F,
                                SetVector<Value *> SubtreeValues);

  /// Create declarations for kernel variables.
  ///
  /// This includes shared memory declarations.
  ///
  /// @param Kernel The kernel definition to create variables for.
  /// @param FN     The function into which to generate the variables.
  void createKernelVariables(ppcg_kernel *Kernel, Function *FN);

  /// Create a GPU kernel.
  ///
  /// Code generate the kernel described by @p KernelStmt.
  ///
  /// @param KernelStmt The ast node to generate kernel code for.
  void createKernel(__isl_take isl_ast_node *KernelStmt);

  /// Generate code that computes the size of an array.
  ///
  /// @param Array The array for which to compute a size.
  Value *getArraySize(gpu_array_info *Array);

  /// Prepare the kernel arguments for kernel code generation.
  ///
  /// @param Kernel The kernel to generate code for.
  /// @param FN     The function created for the kernel.
  void prepareKernelArguments(ppcg_kernel *Kernel, Function *FN);

  /// Create a kernel function.
  ///
  /// Create a kernel function located in a newly created module that can
  /// serve as target for device code generation. Set the Builder to point to
  /// the start block of this newly created function.
  ///
  /// @param Kernel        The kernel to generate code for.
  /// @param SubtreeValues The set of llvm::Values referenced by this kernel.
  void createKernelFunction(ppcg_kernel *Kernel,
                            SetVector<Value *> &SubtreeValues);

  /// Create the declaration of a kernel function.
  ///
  /// The kernel function takes as arguments:
  ///
  /// - One i8 pointer for each external array reference used in the kernel.
  /// - Host iterators
  /// - Parameters
  /// - Other LLVM Value references (TODO)
  ///
  /// @param Kernel        The kernel to generate the function declaration for.
  /// @param SubtreeValues The set of llvm::Values referenced by this kernel.
  ///
  /// @returns The newly declared function.
  Function *createKernelFunctionDecl(ppcg_kernel *Kernel,
                                     SetVector<Value *> &SubtreeValues);

  /// Insert intrinsic functions to obtain thread and block ids.
  ///
  /// @param Kernel The kernel to generate the intrinsic functions for.
  void insertKernelIntrinsics(ppcg_kernel *Kernel);

  /// Create a global-to-shared or shared-to-global copy statement.
  ///
  /// @param CopyStmt The copy statement to generate code for.
  void createKernelCopy(ppcg_kernel_stmt *CopyStmt);

  /// Create code for a ScopStmt called in @p Expr.
  ///
  /// @param Expr       The expression containing the call.
  /// @param KernelStmt The kernel statement referenced in the call.
  void createScopStmt(isl_ast_expr *Expr, ppcg_kernel_stmt *KernelStmt);

  /// Create an in-kernel synchronization call.
  void createKernelSync();

  /// Create a PTX assembly string for the current GPU kernel.
  ///
  /// @returns A string containing the corresponding PTX assembly code.
  std::string createKernelASM();

  /// Remove references from the dominator tree to the kernel function @p F.
  ///
  /// @param F The function to remove references to.
  void clearDominators(Function *F);

  /// Remove references from scalar evolution to the kernel function @p F.
  ///
  /// @param F The function to remove references to.
  void clearScalarEvolution(Function *F);

  /// Remove references from loop info to the kernel function @p F.
  ///
  /// @param F The function to remove references to.
  void clearLoops(Function *F);

  /// Finalize the generation of the kernel function.
  ///
  /// Free the LLVM-IR module corresponding to the kernel and -- if requested
  /// -- dump its IR.
  ///
  /// @returns The assembly string of the kernel.
  std::string finalizeKernelFunction();

  /// Create code that allocates memory to store arrays on the device.
  void allocateDeviceArrays();

  /// Free all allocated device arrays.
  void freeDeviceArrays();

  /// Create a call to initialize the GPU context.
  ///
  /// @returns A pointer to the newly initialized context.
  Value *createCallInitContext();

  /// Create a call to get the device pointer for a kernel allocation.
  ///
  /// @param Allocation The Polly GPU allocation.
  ///
  /// @returns The device pointer corresponding to this allocation.
  Value *createCallGetDevicePtr(Value *Allocation);

  /// Create a call to free the GPU context.
  ///
  /// @param Context A pointer to an initialized GPU context.
  void createCallFreeContext(Value *Context);

  /// Create a call to allocate memory on the device.
  ///
  /// @param Size The size of memory to allocate.
  ///
  /// @returns A pointer that identifies this allocation.
  Value *createCallAllocateMemoryForDevice(Value *Size);

  /// Create a call to free a device array.
  ///
  /// @param Array The device array to free.
  void createCallFreeDeviceMemory(Value *Array);

  /// Create a call to copy data from host to device.
  ///
  /// @param HostPtr   A pointer to the host data that should be copied.
  /// @param DevicePtr A device pointer specifying the location to copy to.
  void createCallCopyFromHostToDevice(Value *HostPtr, Value *DevicePtr,
                                      Value *Size);

  /// Create a call to copy data from device to host.
  ///
  /// @param DevicePtr A pointer to the device data that should be copied.
  /// @param HostPtr   A host pointer specifying the location to copy to.
  void createCallCopyFromDeviceToHost(Value *DevicePtr, Value *HostPtr,
                                      Value *Size);

  /// Create a call to get a kernel from an assembly string.
  ///
  /// @param Buffer The string describing the kernel.
  /// @param Entry  The name of the kernel function to call.
  ///
  /// @returns A pointer to a kernel object.
  Value *createCallGetKernel(Value *Buffer, Value *Entry);

  /// Create a call to free a GPU kernel.
  ///
  /// @param GPUKernel The kernel to free.
  void createCallFreeKernel(Value *GPUKernel);

  /// Create a call to launch a GPU kernel.
  ///
  /// @param GPUKernel  The kernel to launch.
  /// @param GridDimX   The size of the first grid dimension.
  /// @param GridDimY   The size of the second grid dimension.
  /// @param BlockDimX  The size of the first block dimension.
  /// @param BlockDimY  The size of the second block dimension.
  /// @param BlockDimZ  The size of the third block dimension.
  /// @param Parameters A pointer to an array that itself contains pointers to
  ///                   the parameter values passed for each kernel argument.
  void createCallLaunchKernel(Value *GPUKernel, Value *GridDimX,
                              Value *GridDimY, Value *BlockDimX,
                              Value *BlockDimY, Value *BlockDimZ,
                              Value *Parameters);
};
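// A minimal sketch of how this builder is typically driven (the surrounding
// setup is hypothetical and elided; 'create' is inherited from
// IslNodeBuilder):
//
//   GPUNodeBuilder NodeBuilder(Builder, Annotator, P, DL, LI, SE, DT, S, Prog);
//   NodeBuilder.initializeAfterRTH(); // init GPU context, allocate arrays
//   NodeBuilder.create(Root);         // walk the isl AST, emit host + kernels
//   NodeBuilder.finalize();           // free device arrays and the context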
void GPUNodeBuilder::initializeAfterRTH() {
  GPUContext = createCallInitContext();
  allocateDeviceArrays();
}

void GPUNodeBuilder::finalize() {
  freeDeviceArrays();
  createCallFreeContext(GPUContext);
  IslNodeBuilder::finalize();
}

void GPUNodeBuilder::allocateDeviceArrays() {
  isl_ast_build *Build = isl_ast_build_from_context(S.getContext());

  for (int i = 0; i < Prog->n_array; ++i) {
    gpu_array_info *Array = &Prog->array[i];
    auto *ScopArray = (ScopArrayInfo *)Array->user;
    std::string DevArrayName("p_dev_array_");
    DevArrayName.append(Array->name);

    Value *ArraySize = getArraySize(Array);
    Value *DevArray = createCallAllocateMemoryForDevice(ArraySize);
    DevArray->setName(DevArrayName);
    DeviceAllocations[ScopArray] = DevArray;
  }

  isl_ast_build_free(Build);
}

void GPUNodeBuilder::freeDeviceArrays() {
  for (auto &Array : DeviceAllocations)
    createCallFreeDeviceMemory(Array.second);
}

Value *GPUNodeBuilder::createCallGetKernel(Value *Buffer, Value *Entry) {
  const char *Name = "polly_getKernel";
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Function *F = M->getFunction(Name);

  // If F is not available, declare it.
  if (!F) {
    GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
    std::vector<Type *> Args;
    Args.push_back(Builder.getInt8PtrTy());
    Args.push_back(Builder.getInt8PtrTy());
    FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false);
    F = Function::Create(Ty, Linkage, Name, M);
  }

  return Builder.CreateCall(F, {Buffer, Entry});
}

Value *GPUNodeBuilder::createCallGetDevicePtr(Value *Allocation) {
  const char *Name = "polly_getDevicePtr";
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Function *F = M->getFunction(Name);

  // If F is not available, declare it.
  if (!F) {
    GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
    std::vector<Type *> Args;
    Args.push_back(Builder.getInt8PtrTy());
    FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false);
    F = Function::Create(Ty, Linkage, Name, M);
  }

  return Builder.CreateCall(F, {Allocation});
}

void GPUNodeBuilder::createCallLaunchKernel(Value *GPUKernel, Value *GridDimX,
                                            Value *GridDimY, Value *BlockDimX,
                                            Value *BlockDimY, Value *BlockDimZ,
                                            Value *Parameters) {
  const char *Name = "polly_launchKernel";
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Function *F = M->getFunction(Name);

  // If F is not available, declare it.
  if (!F) {
    GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
    std::vector<Type *> Args;
    Args.push_back(Builder.getInt8PtrTy());
    Args.push_back(Builder.getInt32Ty());
    Args.push_back(Builder.getInt32Ty());
    Args.push_back(Builder.getInt32Ty());
    Args.push_back(Builder.getInt32Ty());
    Args.push_back(Builder.getInt32Ty());
    Args.push_back(Builder.getInt8PtrTy());
    FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
    F = Function::Create(Ty, Linkage, Name, M);
  }

  Builder.CreateCall(F, {GPUKernel, GridDimX, GridDimY, BlockDimX, BlockDimY,
                         BlockDimZ, Parameters});
}

void GPUNodeBuilder::createCallFreeKernel(Value *GPUKernel) {
  const char *Name = "polly_freeKernel";
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Function *F = M->getFunction(Name);

  // If F is not available, declare it.
  if (!F) {
    GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
    std::vector<Type *> Args;
    Args.push_back(Builder.getInt8PtrTy());
    FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
    F = Function::Create(Ty, Linkage, Name, M);
  }

  Builder.CreateCall(F, {GPUKernel});
}

void GPUNodeBuilder::createCallFreeDeviceMemory(Value *Array) {
  const char *Name = "polly_freeDeviceMemory";
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Function *F = M->getFunction(Name);

  // If F is not available, declare it.
  if (!F) {
    GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
    std::vector<Type *> Args;
    Args.push_back(Builder.getInt8PtrTy());
    FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
    F = Function::Create(Ty, Linkage, Name, M);
  }

  Builder.CreateCall(F, {Array});
}

Value *GPUNodeBuilder::createCallAllocateMemoryForDevice(Value *Size) {
  const char *Name = "polly_allocateMemoryForDevice";
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Function *F = M->getFunction(Name);

  // If F is not available, declare it.
  if (!F) {
    GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
    std::vector<Type *> Args;
    Args.push_back(Builder.getInt64Ty());
    FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false);
    F = Function::Create(Ty, Linkage, Name, M);
  }

  return Builder.CreateCall(F, {Size});
}

void GPUNodeBuilder::createCallCopyFromHostToDevice(Value *HostData,
                                                    Value *DeviceData,
                                                    Value *Size) {
  const char *Name = "polly_copyFromHostToDevice";
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Function *F = M->getFunction(Name);

  // If F is not available, declare it.
  if (!F) {
    GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
    std::vector<Type *> Args;
    Args.push_back(Builder.getInt8PtrTy());
    Args.push_back(Builder.getInt8PtrTy());
    Args.push_back(Builder.getInt64Ty());
    FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
    F = Function::Create(Ty, Linkage, Name, M);
  }

  Builder.CreateCall(F, {HostData, DeviceData, Size});
}

void GPUNodeBuilder::createCallCopyFromDeviceToHost(Value *DeviceData,
                                                    Value *HostData,
                                                    Value *Size) {
  const char *Name = "polly_copyFromDeviceToHost";
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Function *F = M->getFunction(Name);

  // If F is not available, declare it.
  if (!F) {
    GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
    std::vector<Type *> Args;
    Args.push_back(Builder.getInt8PtrTy());
    Args.push_back(Builder.getInt8PtrTy());
    Args.push_back(Builder.getInt64Ty());
    FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
    F = Function::Create(Ty, Linkage, Name, M);
  }

  Builder.CreateCall(F, {DeviceData, HostData, Size});
}

Value *GPUNodeBuilder::createCallInitContext() {
  const char *Name = "polly_initContext";
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Function *F = M->getFunction(Name);

  // If F is not available, declare it.
  if (!F) {
    GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
    std::vector<Type *> Args;
    FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false);
    F = Function::Create(Ty, Linkage, Name, M);
  }

  return Builder.CreateCall(F, {});
}

void GPUNodeBuilder::createCallFreeContext(Value *Context) {
  const char *Name = "polly_freeContext";
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Function *F = M->getFunction(Name);

  // If F is not available, declare it.
  if (!F) {
    GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
    std::vector<Type *> Args;
    Args.push_back(Builder.getInt8PtrTy());
    FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
    F = Function::Create(Ty, Linkage, Name, M);
  }

  Builder.CreateCall(F, {Context});
}

/// Check if one string is a prefix of another.
///
/// @param String The string in which to look for the prefix.
/// @param Prefix The prefix to look for.
static bool isPrefix(std::string String, std::string Prefix) {
  return String.find(Prefix) == 0;
}
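// Taken together, the polly_* emitters above make the generated host code
// follow a pattern along these lines (a sketch; the actual call sequence is
// driven by the isl AST, see initializeAfterRTH, createKernel and finalize):
//
//   Context = polly_initContext();
//   DevPtr  = polly_allocateMemoryForDevice(Size);
//   polly_copyFromHostToDevice(HostPtr, DevPtr, Size);
//   Kernel  = polly_getKernel(KernelASM, KernelName);
//   polly_launchKernel(Kernel, GridX, GridY, BlockX, BlockY, BlockZ, Params);
//   polly_freeKernel(Kernel);
//   polly_copyFromDeviceToHost(DevPtr, HostPtr, Size);
//   polly_freeDeviceMemory(DevPtr);
//   polly_freeContext(Context);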
Value *GPUNodeBuilder::getArraySize(gpu_array_info *Array) {
  isl_ast_build *Build = isl_ast_build_from_context(S.getContext());
  Value *ArraySize = ConstantInt::get(Builder.getInt64Ty(), Array->size);

  if (!gpu_array_is_scalar(Array)) {
    auto OffsetDimZero = isl_pw_aff_copy(Array->bound[0]);
    isl_ast_expr *Res = isl_ast_build_expr_from_pw_aff(Build, OffsetDimZero);

    for (unsigned int i = 1; i < Array->n_index; i++) {
      isl_pw_aff *Bound_I = isl_pw_aff_copy(Array->bound[i]);
      isl_ast_expr *Expr = isl_ast_build_expr_from_pw_aff(Build, Bound_I);
      Res = isl_ast_expr_mul(Res, Expr);
    }

    Value *NumElements = ExprBuilder.create(Res);
    ArraySize = Builder.CreateMul(ArraySize, NumElements);
  }
  isl_ast_build_free(Build);
  return ArraySize;
}
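// In other words, for a non-scalar array the emitted size computation is
// (sketch): ArraySize = sizeof(ElementType) * bound[0] * ... * bound[n-1],
// with each bound[i] evaluated at run time from its isl_pw_aff.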
void GPUNodeBuilder::createDataTransfer(__isl_take isl_ast_node *TransferStmt,
                                        enum DataDirection Direction) {
  isl_ast_expr *Expr = isl_ast_node_user_get_expr(TransferStmt);
  isl_ast_expr *Arg = isl_ast_expr_get_op_arg(Expr, 0);
  isl_id *Id = isl_ast_expr_get_id(Arg);
  auto Array = (gpu_array_info *)isl_id_get_user(Id);
  auto ScopArray = (ScopArrayInfo *)(Array->user);

  Value *Size = getArraySize(Array);
  Value *HostPtr = ScopArray->getBasePtr();

  Value *DevPtr = DeviceAllocations[ScopArray];

  if (gpu_array_is_scalar(Array)) {
    HostPtr = Builder.CreateAlloca(ScopArray->getElementType());
    Builder.CreateStore(ScopArray->getBasePtr(), HostPtr);
  }

  HostPtr = Builder.CreatePointerCast(HostPtr, Builder.getInt8PtrTy());

  if (Direction == HOST_TO_DEVICE)
    createCallCopyFromHostToDevice(HostPtr, DevPtr, Size);
  else
    createCallCopyFromDeviceToHost(DevPtr, HostPtr, Size);

  isl_id_free(Id);
  isl_ast_expr_free(Arg);
  isl_ast_expr_free(Expr);
  isl_ast_node_free(TransferStmt);
}

void GPUNodeBuilder::createUser(__isl_take isl_ast_node *UserStmt) {
  isl_ast_expr *Expr = isl_ast_node_user_get_expr(UserStmt);
  isl_ast_expr *StmtExpr = isl_ast_expr_get_op_arg(Expr, 0);
  isl_id *Id = isl_ast_expr_get_id(StmtExpr);
  const char *Str = isl_id_get_name(Id);
  isl_id_free(Id);
  isl_ast_expr_free(StmtExpr);

  if (!strcmp(Str, "kernel")) {
    createKernel(UserStmt);
    isl_ast_expr_free(Expr);
    return;
  }

  if (isPrefix(Str, "to_device")) {
    createDataTransfer(UserStmt, HOST_TO_DEVICE);
    isl_ast_expr_free(Expr);
    return;
  }

  if (isPrefix(Str, "from_device")) {
    createDataTransfer(UserStmt, DEVICE_TO_HOST);
    isl_ast_expr_free(Expr);
    return;
  }

  isl_id *Anno = isl_ast_node_get_annotation(UserStmt);
  struct ppcg_kernel_stmt *KernelStmt =
      (struct ppcg_kernel_stmt *)isl_id_get_user(Anno);
  isl_id_free(Anno);

  switch (KernelStmt->type) {
  case ppcg_kernel_domain:
    createScopStmt(Expr, KernelStmt);
    isl_ast_node_free(UserStmt);
    return;
  case ppcg_kernel_copy:
    createKernelCopy(KernelStmt);
    isl_ast_expr_free(Expr);
    isl_ast_node_free(UserStmt);
    return;
  case ppcg_kernel_sync:
    createKernelSync();
    isl_ast_expr_free(Expr);
    isl_ast_node_free(UserStmt);
    return;
  }

  isl_ast_expr_free(Expr);
  isl_ast_node_free(UserStmt);
}

void GPUNodeBuilder::createKernelCopy(ppcg_kernel_stmt *KernelStmt) {
  isl_ast_expr *LocalIndex = isl_ast_expr_copy(KernelStmt->u.c.local_index);
  LocalIndex = isl_ast_expr_address_of(LocalIndex);
  Value *LocalAddr = ExprBuilder.create(LocalIndex);
  isl_ast_expr *Index = isl_ast_expr_copy(KernelStmt->u.c.index);
  Index = isl_ast_expr_address_of(Index);
  Value *GlobalAddr = ExprBuilder.create(Index);

  if (KernelStmt->u.c.read) {
    LoadInst *Load = Builder.CreateLoad(GlobalAddr, "shared.read");
    Builder.CreateStore(Load, LocalAddr);
  } else {
    LoadInst *Load = Builder.CreateLoad(LocalAddr, "shared.write");
    Builder.CreateStore(Load, GlobalAddr);
  }
}

void GPUNodeBuilder::createScopStmt(isl_ast_expr *Expr,
                                    ppcg_kernel_stmt *KernelStmt) {
  auto Stmt = (ScopStmt *)KernelStmt->u.d.stmt->stmt;
  isl_id_to_ast_expr *Indexes = KernelStmt->u.d.ref2expr;

  LoopToScevMapT LTS;
  LTS.insert(OutsideLoopIterations.begin(), OutsideLoopIterations.end());

  createSubstitutions(Expr, Stmt, LTS);

  if (Stmt->isBlockStmt())
    BlockGen.copyStmt(*Stmt, LTS, Indexes);
  else
    assert(0 && "Region statement not supported\n");
}

void GPUNodeBuilder::createKernelSync() {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  auto *Sync = Intrinsic::getDeclaration(M, Intrinsic::nvvm_barrier0);
  Builder.CreateCall(Sync, {});
}

/// Collect llvm::Values referenced from @p Node.
///
/// This function only applies to isl_ast_nodes that are user_nodes referring
/// to a ScopStmt. All other node types are ignored.
///
/// @param Node The node to collect references for.
/// @param User A user pointer used as storage for the data that is collected.
///
/// @returns isl_bool_true if data could be collected successfully.
isl_bool collectReferencesInGPUStmt(__isl_keep isl_ast_node *Node, void *User) {
  if (isl_ast_node_get_type(Node) != isl_ast_node_user)
    return isl_bool_true;

  isl_ast_expr *Expr = isl_ast_node_user_get_expr(Node);
  isl_ast_expr *StmtExpr = isl_ast_expr_get_op_arg(Expr, 0);
  isl_id *Id = isl_ast_expr_get_id(StmtExpr);
  const char *Str = isl_id_get_name(Id);
  isl_id_free(Id);
  isl_ast_expr_free(StmtExpr);
  isl_ast_expr_free(Expr);

  if (!isPrefix(Str, "Stmt"))
    return isl_bool_true;

  Id = isl_ast_node_get_annotation(Node);
  auto *KernelStmt = (ppcg_kernel_stmt *)isl_id_get_user(Id);
  auto Stmt = (ScopStmt *)KernelStmt->u.d.stmt->stmt;
  isl_id_free(Id);

  addReferencesFromStmt(Stmt, User, false /* CreateScalarRefs */);

  return isl_bool_true;
}

SetVector<Value *> GPUNodeBuilder::getReferencesInKernel(ppcg_kernel *Kernel) {
  SetVector<Value *> SubtreeValues;
  SetVector<const SCEV *> SCEVs;
  SetVector<const Loop *> Loops;
  SubtreeReferences References = {
      LI, SE, S, ValueMap, SubtreeValues, SCEVs, getBlockGenerator()};

  for (const auto &I : IDToValue)
    SubtreeValues.insert(I.second);

  isl_ast_node_foreach_descendant_top_down(
      Kernel->tree, collectReferencesInGPUStmt, &References);

  for (const SCEV *Expr : SCEVs)
    findValues(Expr, SE, SubtreeValues);

  for (auto &SAI : S.arrays())
    SubtreeValues.remove(SAI->getBasePtr());

  isl_space *Space = S.getParamSpace();
  for (long i = 0; i < isl_space_dim(Space, isl_dim_param); i++) {
    isl_id *Id = isl_space_get_dim_id(Space, isl_dim_param, i);
    assert(IDToValue.count(Id));
    Value *Val = IDToValue[Id];
    SubtreeValues.remove(Val);
    isl_id_free(Id);
  }
  isl_space_free(Space);

  for (long i = 0; i < isl_space_dim(Kernel->space, isl_dim_set); i++) {
    isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i);
    assert(IDToValue.count(Id));
    Value *Val = IDToValue[Id];
    SubtreeValues.remove(Val);
    isl_id_free(Id);
  }

  return SubtreeValues;
}

void GPUNodeBuilder::clearDominators(Function *F) {
  DomTreeNode *N = DT.getNode(&F->getEntryBlock());
  std::vector<BasicBlock *> Nodes;
  for (po_iterator<DomTreeNode *> I = po_begin(N), E = po_end(N); I != E; ++I)
    Nodes.push_back(I->getBlock());

  for (BasicBlock *BB : Nodes)
    DT.eraseNode(BB);
}

void GPUNodeBuilder::clearScalarEvolution(Function *F) {
  for (BasicBlock &BB : *F) {
    Loop *L = LI.getLoopFor(&BB);
    if (L)
      SE.forgetLoop(L);
  }
}

void GPUNodeBuilder::clearLoops(Function *F) {
  for (BasicBlock &BB : *F) {
    Loop *L = LI.getLoopFor(&BB);
    if (L)
      SE.forgetLoop(L);
    LI.removeBlock(&BB);
  }
}

std::tuple<Value *, Value *> GPUNodeBuilder::getGridSizes(ppcg_kernel *Kernel) {
  std::vector<Value *> Sizes;
  isl_ast_build *Context = isl_ast_build_from_context(S.getContext());

  for (long i = 0; i < Kernel->n_grid; i++) {
    isl_pw_aff *Size = isl_multi_pw_aff_get_pw_aff(Kernel->grid_size, i);
    isl_ast_expr *GridSize = isl_ast_build_expr_from_pw_aff(Context, Size);
    Value *Res = ExprBuilder.create(GridSize);
    Res = Builder.CreateTrunc(Res, Builder.getInt32Ty());
    Sizes.push_back(Res);
  }
  isl_ast_build_free(Context);

  for (long i = Kernel->n_grid; i < 3; i++)
    Sizes.push_back(ConstantInt::get(Builder.getInt32Ty(), 1));

  return std::make_tuple(Sizes[0], Sizes[1]);
}
std::tuple<Value *, Value *, Value *>
GPUNodeBuilder::getBlockSizes(ppcg_kernel *Kernel) {
  std::vector<Value *> Sizes;

  for (long i = 0; i < Kernel->n_block; i++) {
    Value *Res = ConstantInt::get(Builder.getInt32Ty(), Kernel->block_dim[i]);
    Sizes.push_back(Res);
  }

  for (long i = Kernel->n_block; i < 3; i++)
    Sizes.push_back(ConstantInt::get(Builder.getInt32Ty(), 1));

  return std::make_tuple(Sizes[0], Sizes[1], Sizes[2]);
}

Value *
GPUNodeBuilder::createLaunchParameters(ppcg_kernel *Kernel, Function *F,
                                       SetVector<Value *> SubtreeValues) {
  Type *ArrayTy = ArrayType::get(Builder.getInt8PtrTy(),
                                 std::distance(F->arg_begin(), F->arg_end()));

  BasicBlock *EntryBlock =
      &Builder.GetInsertBlock()->getParent()->getEntryBlock();
  std::string Launch = "polly_launch_" + std::to_string(Kernel->id);
  Instruction *Parameters =
      new AllocaInst(ArrayTy, Launch + "_params", EntryBlock->getTerminator());

  int Index = 0;
  for (long i = 0; i < Prog->n_array; i++) {
    if (!ppcg_kernel_requires_array_argument(Kernel, i))
      continue;

    isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set);
    const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(Id);

    Value *DevArray = DeviceAllocations[(ScopArrayInfo *)SAI];
    DevArray = createCallGetDevicePtr(DevArray);
    Instruction *Param = new AllocaInst(
        Builder.getInt8PtrTy(), Launch + "_param_" + std::to_string(Index),
        EntryBlock->getTerminator());
    Builder.CreateStore(DevArray, Param);
    Value *Slot = Builder.CreateGEP(
        Parameters, {Builder.getInt64(0), Builder.getInt64(Index)});
    Value *ParamTyped =
        Builder.CreatePointerCast(Param, Builder.getInt8PtrTy());
    Builder.CreateStore(ParamTyped, Slot);
    Index++;
  }

  int NumHostIters = isl_space_dim(Kernel->space, isl_dim_set);

  for (long i = 0; i < NumHostIters; i++) {
    isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i);
    Value *Val = IDToValue[Id];
    isl_id_free(Id);
    Instruction *Param = new AllocaInst(
        Val->getType(), Launch + "_param_" + std::to_string(Index),
        EntryBlock->getTerminator());
    Builder.CreateStore(Val, Param);
    Value *Slot = Builder.CreateGEP(
        Parameters, {Builder.getInt64(0), Builder.getInt64(Index)});
    Value *ParamTyped =
        Builder.CreatePointerCast(Param, Builder.getInt8PtrTy());
    Builder.CreateStore(ParamTyped, Slot);
    Index++;
  }

  int NumVars = isl_space_dim(Kernel->space, isl_dim_param);

  for (long i = 0; i < NumVars; i++) {
    isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_param, i);
    Value *Val = IDToValue[Id];
    isl_id_free(Id);
    Instruction *Param = new AllocaInst(
        Val->getType(), Launch + "_param_" + std::to_string(Index),
        EntryBlock->getTerminator());
    Builder.CreateStore(Val, Param);
    Value *Slot = Builder.CreateGEP(
        Parameters, {Builder.getInt64(0), Builder.getInt64(Index)});
    Value *ParamTyped =
        Builder.CreatePointerCast(Param, Builder.getInt8PtrTy());
    Builder.CreateStore(ParamTyped, Slot);
    Index++;
  }

  for (auto Val : SubtreeValues) {
    Instruction *Param = new AllocaInst(
        Val->getType(), Launch + "_param_" + std::to_string(Index),
        EntryBlock->getTerminator());
    Builder.CreateStore(Val, Param);
    Value *Slot = Builder.CreateGEP(
        Parameters, {Builder.getInt64(0), Builder.getInt64(Index)});
    Value *ParamTyped =
        Builder.CreatePointerCast(Param, Builder.getInt8PtrTy());
    Builder.CreateStore(ParamTyped, Slot);
    Index++;
  }

  auto Location = EntryBlock->getTerminator();
  return new BitCastInst(Parameters, Builder.getInt8PtrTy(),
                         Launch + "_params_i8ptr", Location);
}
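// The block built above is an array of i8* slots, one per kernel argument,
// each pointing at a dedicated stack slot that holds the argument value
// (a sketch for kernel id 0, with hypothetical value names):
//
//   i8* polly_launch_0_params[N];
//   polly_launch_0_params[0] = (i8 *)&DevArrayPtr;  // device array pointers
//   polly_launch_0_params[1] = (i8 *)&HostIterVal;  // host loop iterators
//   polly_launch_0_params[2] = (i8 *)&ParamVal;     // parameters, other values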
void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) {
  isl_id *Id = isl_ast_node_get_annotation(KernelStmt);
  ppcg_kernel *Kernel = (ppcg_kernel *)isl_id_get_user(Id);
  isl_id_free(Id);
  isl_ast_node_free(KernelStmt);

  SetVector<Value *> SubtreeValues = getReferencesInKernel(Kernel);

  assert(Kernel->tree && "Device AST of kernel node is empty");

  Instruction &HostInsertPoint = *Builder.GetInsertPoint();
  IslExprBuilder::IDToValueTy HostIDs = IDToValue;
  ValueMapT HostValueMap = ValueMap;

  SetVector<const Loop *> Loops;

  // For each loop we depend on, create a value that contains the current
  // loop iteration. These values are necessary to generate code for SCEVs
  // that depend on such loops. As a result we need to pass them to the
  // subfunction.
  for (const Loop *L : Loops) {
    const SCEV *OuterLIV = SE.getAddRecExpr(SE.getUnknown(Builder.getInt64(0)),
                                            SE.getUnknown(Builder.getInt64(1)),
                                            L, SCEV::FlagAnyWrap);
    Value *V = generateSCEV(OuterLIV);
    OutsideLoopIterations[L] = SE.getUnknown(V);
    SubtreeValues.insert(V);
  }

  createKernelFunction(Kernel, SubtreeValues);

  create(isl_ast_node_copy(Kernel->tree));

  Function *F = Builder.GetInsertBlock()->getParent();
  clearDominators(F);
  clearScalarEvolution(F);
  clearLoops(F);

  Builder.SetInsertPoint(&HostInsertPoint);
  IDToValue = HostIDs;

  ValueMap = HostValueMap;
  ScalarMap.clear();
  PHIOpMap.clear();
  EscapeMap.clear();
  IDToSAI.clear();
  Annotator.resetAlternativeAliasBases();
  for (auto &BasePtr : LocalArrays)
    S.invalidateScopArrayInfo(BasePtr, ScopArrayInfo::MK_Array);
  LocalArrays.clear();

  Value *Parameters = createLaunchParameters(Kernel, F, SubtreeValues);

  std::string ASMString = finalizeKernelFunction();
  std::string Name = "kernel_" + std::to_string(Kernel->id);
  Value *KernelString = Builder.CreateGlobalStringPtr(ASMString, Name);
  Value *NameString = Builder.CreateGlobalStringPtr(Name, Name + "_name");
  Value *GPUKernel = createCallGetKernel(KernelString, NameString);

  Value *GridDimX, *GridDimY;
  std::tie(GridDimX, GridDimY) = getGridSizes(Kernel);

  Value *BlockDimX, *BlockDimY, *BlockDimZ;
  std::tie(BlockDimX, BlockDimY, BlockDimZ) = getBlockSizes(Kernel);

  createCallLaunchKernel(GPUKernel, GridDimX, GridDimY, BlockDimX, BlockDimY,
                         BlockDimZ, Parameters);
  createCallFreeKernel(GPUKernel);

  for (auto Id : KernelIds)
    isl_id_free(Id);

  KernelIds.clear();
}

/// Compute the DataLayout string for the NVPTX backend.
///
/// @param is64Bit Are we looking for a 64 bit architecture?
static std::string computeNVPTXDataLayout(bool is64Bit) {
  std::string Ret = "e";

  if (!is64Bit)
    Ret += "-p:32:32";

  Ret += "-i64:64-v16:16-v32:32-n16:32:64";

  return Ret;
}
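// For the 64-bit case this yields "e-i64:64-v16:16-v32:32-n16:32:64"; in the
// 32-bit case "-p:32:32" is additionally inserted after the leading "e".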
Function *
GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel,
                                         SetVector<Value *> &SubtreeValues) {
  std::vector<Type *> Args;
  std::string Identifier = "kernel_" + std::to_string(Kernel->id);

  for (long i = 0; i < Prog->n_array; i++) {
    if (!ppcg_kernel_requires_array_argument(Kernel, i))
      continue;

    Args.push_back(Builder.getInt8PtrTy());
  }

  int NumHostIters = isl_space_dim(Kernel->space, isl_dim_set);

  for (long i = 0; i < NumHostIters; i++)
    Args.push_back(Builder.getInt64Ty());

  int NumVars = isl_space_dim(Kernel->space, isl_dim_param);

  for (long i = 0; i < NumVars; i++)
    Args.push_back(Builder.getInt64Ty());

  for (auto *V : SubtreeValues)
    Args.push_back(V->getType());

  auto *FT = FunctionType::get(Builder.getVoidTy(), Args, false);
  auto *FN = Function::Create(FT, Function::ExternalLinkage, Identifier,
                              GPUModule.get());
  FN->setCallingConv(CallingConv::PTX_Kernel);

  auto Arg = FN->arg_begin();
  for (long i = 0; i < Kernel->n_array; i++) {
    if (!ppcg_kernel_requires_array_argument(Kernel, i))
      continue;

    Arg->setName(Kernel->array[i].array->name);

    isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set);
    const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl_id_copy(Id));
    Type *EleTy = SAI->getElementType();
    Value *Val = &*Arg;
    SmallVector<const SCEV *, 4> Sizes;
    isl_ast_build *Build =
        isl_ast_build_from_context(isl_set_copy(Prog->context));
    for (long j = 1; j < Kernel->array[i].array->n_index; j++) {
      isl_ast_expr *DimSize = isl_ast_build_expr_from_pw_aff(
          Build, isl_pw_aff_copy(Kernel->array[i].array->bound[j]));
      auto V = ExprBuilder.create(DimSize);
      Sizes.push_back(SE.getSCEV(V));
    }
    const ScopArrayInfo *SAIRep =
        S.getOrCreateScopArrayInfo(Val, EleTy, Sizes, ScopArrayInfo::MK_Array);
    LocalArrays.push_back(Val);

    isl_ast_build_free(Build);
    KernelIds.push_back(Id);
    IDToSAI[Id] = SAIRep;
    Arg++;
  }

  for (long i = 0; i < NumHostIters; i++) {
    isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i);
    Arg->setName(isl_id_get_name(Id));
    IDToValue[Id] = &*Arg;
    KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id));
    Arg++;
  }

  for (long i = 0; i < NumVars; i++) {
    isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_param, i);
    Arg->setName(isl_id_get_name(Id));
    IDToValue[Id] = &*Arg;
    KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id));
    Arg++;
  }

  for (auto *V : SubtreeValues) {
    Arg->setName(V->getName());
    ValueMap[V] = &*Arg;
    Arg++;
  }

  return FN;
}

void GPUNodeBuilder::insertKernelIntrinsics(ppcg_kernel *Kernel) {
  Intrinsic::ID IntrinsicsBID[] = {Intrinsic::nvvm_read_ptx_sreg_ctaid_x,
                                   Intrinsic::nvvm_read_ptx_sreg_ctaid_y};

  Intrinsic::ID IntrinsicsTID[] = {Intrinsic::nvvm_read_ptx_sreg_tid_x,
                                   Intrinsic::nvvm_read_ptx_sreg_tid_y,
                                   Intrinsic::nvvm_read_ptx_sreg_tid_z};

  auto addId = [this](__isl_take isl_id *Id, Intrinsic::ID Intr) mutable {
    std::string Name = isl_id_get_name(Id);
    Module *M = Builder.GetInsertBlock()->getParent()->getParent();
    Function *IntrinsicFn = Intrinsic::getDeclaration(M, Intr);
    Value *Val = Builder.CreateCall(IntrinsicFn, {});
    Val = Builder.CreateIntCast(Val, Builder.getInt64Ty(), false, Name);
    IDToValue[Id] = Val;
    KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id));
  };

  for (int i = 0; i < Kernel->n_grid; ++i) {
    isl_id *Id = isl_id_list_get_id(Kernel->block_ids, i);
    addId(Id, IntrinsicsBID[i]);
  }

  for (int i = 0; i < Kernel->n_block; ++i) {
    isl_id *Id = isl_id_list_get_id(Kernel->thread_ids, i);
    addId(Id, IntrinsicsTID[i]);
  }
}
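// The net effect is that ppcg's block and thread identifiers (conventionally
// named b0/b1 and t0/t1/t2 in ppcg) become zero-extended i64 reads of the
// corresponding PTX special registers, e.g. b0 -> ctaid.x and t2 -> tid.z.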
void GPUNodeBuilder::prepareKernelArguments(ppcg_kernel *Kernel, Function *FN) {
  auto Arg = FN->arg_begin();
  for (long i = 0; i < Kernel->n_array; i++) {
    if (!ppcg_kernel_requires_array_argument(Kernel, i))
      continue;

    isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set);
    const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl_id_copy(Id));
    isl_id_free(Id);

    if (SAI->getNumberOfDimensions() > 0) {
      Arg++;
      continue;
    }

    Value *Alloca = BlockGen.getOrCreateScalarAlloca(SAI->getBasePtr());
    Value *ArgPtr = &*Arg;
    Type *TypePtr = SAI->getElementType()->getPointerTo();
    Value *TypedArgPtr = Builder.CreatePointerCast(ArgPtr, TypePtr);
    Value *Val = Builder.CreateLoad(TypedArgPtr);
    Builder.CreateStore(Val, Alloca);

    Arg++;
  }
}

void GPUNodeBuilder::createKernelVariables(ppcg_kernel *Kernel, Function *FN) {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();

  for (int i = 0; i < Kernel->n_var; ++i) {
    struct ppcg_kernel_var &Var = Kernel->var[i];
    isl_id *Id = isl_space_get_tuple_id(Var.array->space, isl_dim_set);
    Type *EleTy = ScopArrayInfo::getFromId(Id)->getElementType();

    SmallVector<const SCEV *, 4> Sizes;
    isl_val *V0 = isl_vec_get_element_val(Var.size, 0);
    long Bound = isl_val_get_num_si(V0);
    isl_val_free(V0);
    Sizes.push_back(S.getSE()->getConstant(Builder.getInt64Ty(), Bound));

    ArrayType *ArrayTy = ArrayType::get(EleTy, Bound);
    for (unsigned int j = 1; j < Var.array->n_index; ++j) {
      isl_val *Val = isl_vec_get_element_val(Var.size, j);
      Bound = isl_val_get_num_si(Val);
      isl_val_free(Val);
      Sizes.push_back(S.getSE()->getConstant(Builder.getInt64Ty(), Bound));
      ArrayTy = ArrayType::get(ArrayTy, Bound);
    }

    assert(Var.type == ppcg_access_shared && "Only shared memory supported");

    GlobalVariable *SharedVar = new GlobalVariable(
        *M, ArrayTy, false, GlobalValue::InternalLinkage, 0, Var.name, nullptr,
        GlobalValue::ThreadLocalMode::NotThreadLocal, 3);
    SharedVar->setAlignment(EleTy->getPrimitiveSizeInBits() / 8);
    ConstantAggregateZero *Zero = ConstantAggregateZero::get(ArrayTy);
    SharedVar->setInitializer(Zero);

    Id = isl_id_alloc(S.getIslCtx(), Var.name, nullptr);
    IDToValue[Id] = SharedVar;
    const ScopArrayInfo *SAI = S.getOrCreateScopArrayInfo(
        SharedVar, EleTy, Sizes, ScopArrayInfo::MK_Array);
    LocalArrays.push_back(SharedVar);
    KernelIds.push_back(Id);
    IDToSAI[Id] = SAI;
  }
}

void GPUNodeBuilder::createKernelFunction(ppcg_kernel *Kernel,
                                          SetVector<Value *> &SubtreeValues) {

  std::string Identifier = "kernel_" + std::to_string(Kernel->id);
  GPUModule.reset(new Module(Identifier, Builder.getContext()));
  GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-cuda"));
  GPUModule->setDataLayout(computeNVPTXDataLayout(true /* is64Bit */));

  Function *FN = createKernelFunctionDecl(Kernel, SubtreeValues);

  BasicBlock *PrevBlock = Builder.GetInsertBlock();
  auto EntryBlock = BasicBlock::Create(Builder.getContext(), "entry", FN);

  DominatorTree &DT = P->getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  DT.addNewBlock(EntryBlock, PrevBlock);

  Builder.SetInsertPoint(EntryBlock);
  Builder.CreateRetVoid();
  Builder.SetInsertPoint(EntryBlock, EntryBlock->begin());

  ScopDetection::markFunctionAsInvalid(FN);

  prepareKernelArguments(Kernel, FN);
  createKernelVariables(Kernel, FN);
  insertKernelIntrinsics(Kernel);
}

std::string GPUNodeBuilder::createKernelASM() {
  llvm::Triple GPUTriple(Triple::normalize("nvptx64-nvidia-cuda"));
  std::string ErrMsg;
  auto GPUTarget = TargetRegistry::lookupTarget(GPUTriple.getTriple(), ErrMsg);

  if (!GPUTarget) {
    errs() << ErrMsg << "\n";
    return "";
  }

  TargetOptions Options;
  Options.UnsafeFPMath = FastMath;
  std::unique_ptr<TargetMachine> TargetM(
      GPUTarget->createTargetMachine(GPUTriple.getTriple(), CudaVersion, "",
                                     Options, Optional<Reloc::Model>()));

  SmallString<0> ASMString;
  raw_svector_ostream ASMStream(ASMString);
  llvm::legacy::PassManager PM;

  PM.add(createTargetTransformInfoWrapperPass(TargetM->getTargetIRAnalysis()));

  if (TargetM->addPassesToEmitFile(
          PM, ASMStream, TargetMachine::CGFT_AssemblyFile, true /* verify */)) {
    errs() << "The target does not support generation of this file type!\n";
    return "";
  }

  PM.run(*GPUModule);

  return ASMStream.str();
}

std::string GPUNodeBuilder::finalizeKernelFunction() {
  // Verify the module.
  llvm::legacy::PassManager Passes;
  Passes.add(createVerifierPass());
  Passes.run(*GPUModule);

  if (DumpKernelIR)
    outs() << *GPUModule << "\n";

  // Optimize the module.
  llvm::legacy::PassManager OptPasses;
  PassManagerBuilder PassBuilder;
  PassBuilder.OptLevel = 3;
  PassBuilder.SizeLevel = 0;
  PassBuilder.populateModulePassManager(OptPasses);
  OptPasses.run(*GPUModule);

  std::string Assembly = createKernelASM();

  if (DumpKernelASM)
    outs() << Assembly << "\n";

  GPUModule.reset();
  KernelIDs.clear();

  return Assembly;
}

namespace {
class PPCGCodeGeneration : public ScopPass {
public:
  static char ID;

  /// The scop that is currently processed.
  Scop *S;

  LoopInfo *LI;
  DominatorTree *DT;
  ScalarEvolution *SE;
  const DataLayout *DL;
  RegionInfo *RI;

  PPCGCodeGeneration() : ScopPass(ID) {}

  /// Construct compilation options for PPCG.
  ///
  /// @returns The compilation options.
  ppcg_options *createPPCGOptions() {
    auto DebugOptions =
        (ppcg_debug_options *)malloc(sizeof(ppcg_debug_options));
    auto Options = (ppcg_options *)malloc(sizeof(ppcg_options));

    DebugOptions->dump_schedule_constraints = false;
    DebugOptions->dump_schedule = false;
    DebugOptions->dump_final_schedule = false;
    DebugOptions->dump_sizes = false;

    Options->debug = DebugOptions;

    Options->reschedule = true;
    Options->scale_tile_loops = false;
    Options->wrap = false;

    Options->non_negative_parameters = false;
    Options->ctx = nullptr;
    Options->sizes = nullptr;

    Options->tile_size = 32;

    Options->use_private_memory = false;
    Options->use_shared_memory = SharedMemory;
    Options->max_shared_memory = 48 * 1024;

    Options->target = PPCG_TARGET_CUDA;
    Options->openmp = false;
    Options->linearize_device_arrays = true;
    Options->live_range_reordering = false;

    Options->opencl_compiler_options = nullptr;
    Options->opencl_use_gpu = false;
    Options->opencl_n_include_file = 0;
    Options->opencl_include_files = nullptr;
    Options->opencl_print_kernel_types = false;
    Options->opencl_embed_kernel_code = false;

    Options->save_schedule_file = nullptr;
    Options->load_schedule_file = nullptr;

    return Options;
  }

  /// Get a tagged access relation containing all accesses of type @p AccessTy.
  ///
  /// Instead of a normal access of the form:
  ///
  ///   Stmt[i,j,k] -> Array[f_0(i,j,k), f_1(i,j,k)]
  ///
  /// a tagged access has the form:
  ///
  ///   [Stmt[i,j,k] -> id[]] -> Array[f_0(i,j,k), f_1(i,j,k)]
  ///
  /// where 'id' is an additional space that references the memory access that
  /// triggered the access.
  ///
  /// @param AccessTy The type of the memory accesses to collect.
  ///
  /// @return The relation describing all tagged memory accesses.
  isl_union_map *getTaggedAccesses(enum MemoryAccess::AccessType AccessTy) {
    isl_union_map *Accesses = isl_union_map_empty(S->getParamSpace());

    for (auto &Stmt : *S)
      for (auto &Acc : Stmt)
        if (Acc->getType() == AccessTy) {
          isl_map *Relation = Acc->getAccessRelation();
          Relation = isl_map_intersect_domain(Relation, Stmt.getDomain());

          isl_space *Space = isl_map_get_space(Relation);
          Space = isl_space_range(Space);
          Space = isl_space_from_range(Space);
          Space = isl_space_set_tuple_id(Space, isl_dim_in, Acc->getId());
          isl_map *Universe = isl_map_universe(Space);
          Relation = isl_map_domain_product(Relation, Universe);
          Accesses = isl_union_map_add_map(Accesses, Relation);
        }

    return Accesses;
  }
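  // For instance (names are hypothetical), a read of A[i] in statement
  // Stmt_for_body would be tagged as:
  //
  //   [Stmt_for_body[i] -> Ref0[]] -> MemRef_A[i]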
  /// Get the set of all read accesses, tagged with the access id.
  ///
  /// @see getTaggedAccesses
  isl_union_map *getTaggedReads() {
    return getTaggedAccesses(MemoryAccess::READ);
  }

  /// Get the set of all may (and must) writes, tagged with the access id.
  ///
  /// @see getTaggedAccesses
  isl_union_map *getTaggedMayWrites() {
    return isl_union_map_union(getTaggedAccesses(MemoryAccess::MAY_WRITE),
                               getTaggedAccesses(MemoryAccess::MUST_WRITE));
  }

  /// Get the set of all must writes, tagged with the access id.
  ///
  /// @see getTaggedAccesses
  isl_union_map *getTaggedMustWrites() {
    return getTaggedAccesses(MemoryAccess::MUST_WRITE);
  }

  /// Collect parameter and array names as isl_ids.
  ///
  /// To reason about the different parameters and arrays used, ppcg requires
  /// a list of all isl_ids in use. As PPCG traditionally performs
  /// source-to-source compilation, each of these isl_ids is mapped to the
  /// expression that represents it. As we do not have a corresponding
  /// expression in Polly, we just map each id to a 'zero' expression to match
  /// the data format that ppcg expects.
  ///
  /// @returns A map from the collected ids to 'zero' ast expressions.
  __isl_give isl_id_to_ast_expr *getNames() {
    auto *Names = isl_id_to_ast_expr_alloc(
        S->getIslCtx(),
        S->getNumParams() + std::distance(S->array_begin(), S->array_end()));
    auto *Zero = isl_ast_expr_from_val(isl_val_zero(S->getIslCtx()));
    auto *Space = S->getParamSpace();

    for (int I = 0, E = S->getNumParams(); I < E; ++I) {
      isl_id *Id = isl_space_get_dim_id(Space, isl_dim_param, I);
      Names = isl_id_to_ast_expr_set(Names, Id, isl_ast_expr_copy(Zero));
    }

    for (auto &Array : S->arrays()) {
      auto Id = Array->getBasePtrId();
      Names = isl_id_to_ast_expr_set(Names, Id, isl_ast_expr_copy(Zero));
    }

    isl_space_free(Space);
    isl_ast_expr_free(Zero);

    return Names;
  }

  /// Create a new PPCG scop from the current scop.
  ///
  /// The PPCG scop is initialized with data from the current polly::Scop.
  /// From this initial data, the data-dependences in the PPCG scop are
  /// initialized. We do not use Polly's dependence analysis for now, to
  /// ensure we match the PPCG default behaviour more closely.
  ///
  /// @returns A new ppcg scop.
  ppcg_scop *createPPCGScop() {
    auto PPCGScop = (ppcg_scop *)malloc(sizeof(ppcg_scop));

    PPCGScop->options = createPPCGOptions();

    PPCGScop->start = 0;
    PPCGScop->end = 0;

    PPCGScop->context = S->getContext();
    PPCGScop->domain = S->getDomains();
    PPCGScop->call = nullptr;
    PPCGScop->tagged_reads = getTaggedReads();
    PPCGScop->reads = S->getReads();
    PPCGScop->live_in = nullptr;
    PPCGScop->tagged_may_writes = getTaggedMayWrites();
    PPCGScop->may_writes = S->getWrites();
    PPCGScop->tagged_must_writes = getTaggedMustWrites();
    PPCGScop->must_writes = S->getMustWrites();
    PPCGScop->live_out = nullptr;
    PPCGScop->tagged_must_kills = isl_union_map_empty(S->getParamSpace());
    PPCGScop->tagger = nullptr;

    PPCGScop->independence = nullptr;
    PPCGScop->dep_flow = nullptr;
    PPCGScop->tagged_dep_flow = nullptr;
    PPCGScop->dep_false = nullptr;
    PPCGScop->dep_forced = nullptr;
    PPCGScop->dep_order = nullptr;
    PPCGScop->tagged_dep_order = nullptr;

    PPCGScop->schedule = S->getScheduleTree();
    PPCGScop->names = getNames();

    PPCGScop->pet = nullptr;

    compute_tagger(PPCGScop);
    compute_dependences(PPCGScop);

    return PPCGScop;
  }

  /// Collect the array accesses in a statement.
  ///
  /// @param Stmt The statement for which to collect the accesses.
  ///
  /// @returns A list of array accesses.
  gpu_stmt_access *getStmtAccesses(ScopStmt &Stmt) {
    gpu_stmt_access *Accesses = nullptr;

    for (MemoryAccess *Acc : Stmt) {
      auto Access = isl_alloc_type(S->getIslCtx(), struct gpu_stmt_access);
      Access->read = Acc->isRead();
      Access->write = Acc->isWrite();
      Access->access = Acc->getAccessRelation();
      isl_space *Space = isl_map_get_space(Access->access);
      Space = isl_space_range(Space);
      Space = isl_space_from_range(Space);
      Space = isl_space_set_tuple_id(Space, isl_dim_in, Acc->getId());
      isl_map *Universe = isl_map_universe(Space);
      Access->tagged_access =
          isl_map_domain_product(Acc->getAccessRelation(), Universe);
      Access->exact_write = !Acc->isMayWrite();
      Access->ref_id = Acc->getId();
      Access->next = Accesses;
      Access->n_index = Acc->getScopArrayInfo()->getNumberOfDimensions();
      Accesses = Access;
    }

    return Accesses;
  }

  /// Collect the list of GPU statements.
  ///
  /// Each statement has an id, a pointer to the underlying data structure,
  /// as well as a list with all memory accesses.
  ///
  /// @returns A linked-list of statements.
  gpu_stmt *getStatements() {
    gpu_stmt *Stmts = isl_calloc_array(S->getIslCtx(), struct gpu_stmt,
                                       std::distance(S->begin(), S->end()));

    int i = 0;
    for (auto &Stmt : *S) {
      gpu_stmt *GPUStmt = &Stmts[i];

      GPUStmt->id = Stmt.getDomainId();

      // We use the pet stmt pointer to keep track of the Polly statements.
      GPUStmt->stmt = (pet_stmt *)&Stmt;
      GPUStmt->accesses = getStmtAccesses(Stmt);
      i++;
    }

    return Stmts;
  }

  /// Derive the extent of an array.
  ///
  /// The extent of an array is defined by the set of memory locations for
  /// which a memory access in the iteration domain exists.
  ///
  /// @param Array The array to derive the extent for.
  ///
  /// @returns An isl_set describing the extent of the array.
  __isl_give isl_set *getExtent(ScopArrayInfo *Array) {
    isl_union_map *Accesses = S->getAccesses();
    Accesses = isl_union_map_intersect_domain(Accesses, S->getDomains());
    isl_union_set *AccessUSet = isl_union_map_range(Accesses);
    isl_set *AccessSet =
        isl_union_set_extract_set(AccessUSet, Array->getSpace());
    isl_union_set_free(AccessUSet);

    return AccessSet;
  }
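  // For example, if the only access to MemRef_A is A[i] over the domain
  // { Stmt[i] : 0 <= i < n }, the derived extent is (sketch):
  //
  //   [n] -> { MemRef_A[o] : 0 <= o < n }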
  void setArrayBounds(gpu_array_info &PPCGArray, ScopArrayInfo *Array) {
    if (PPCGArray.n_index > 0) {
      isl_set *Dom = isl_set_copy(PPCGArray.extent);
      Dom = isl_set_project_out(Dom, isl_dim_set, 1, PPCGArray.n_index - 1);
      isl_pw_aff *Bound = isl_set_dim_max(isl_set_copy(Dom), 0);
      isl_set_free(Dom);
      Dom = isl_pw_aff_domain(isl_pw_aff_copy(Bound));
      isl_local_space *LS = isl_local_space_from_space(isl_set_get_space(Dom));
      isl_aff *One = isl_aff_zero_on_domain(LS);
      One = isl_aff_add_constant_si(One, 1);
      Bound = isl_pw_aff_add(Bound, isl_pw_aff_alloc(Dom, One));
      Bound = isl_pw_aff_gist(Bound, S->getContext());
      PPCGArray.bound[0] = Bound;
    }

    for (unsigned i = 1; i < PPCGArray.n_index; ++i) {
      isl_pw_aff *Bound = Array->getDimensionSizePw(i);
      auto LS = isl_pw_aff_get_domain_space(Bound);
      auto Aff = isl_multi_aff_zero(LS);
      Bound = isl_pw_aff_pullback_multi_aff(Bound, Aff);
      PPCGArray.bound[i] = Bound;
    }
  }

  /// Create the arrays for @p PPCGProg.
  ///
  /// @param PPCGProg The program to compute the arrays for.
  void createArrays(gpu_prog *PPCGProg) {
    int i = 0;
    for (auto &Array : S->arrays()) {
      std::string TypeName;
      raw_string_ostream OS(TypeName);

      OS << *Array->getElementType();
      TypeName = OS.str();

      gpu_array_info &PPCGArray = PPCGProg->array[i];

      PPCGArray.space = Array->getSpace();
      PPCGArray.type = strdup(TypeName.c_str());
      PPCGArray.size = Array->getElementType()->getPrimitiveSizeInBits() / 8;
      PPCGArray.name = strdup(Array->getName().c_str());
      PPCGArray.n_index = Array->getNumberOfDimensions();
      PPCGArray.bound =
          isl_alloc_array(S->getIslCtx(), isl_pw_aff *, PPCGArray.n_index);
      PPCGArray.extent = getExtent(Array);
      PPCGArray.n_ref = 0;
      PPCGArray.refs = nullptr;
      PPCGArray.accessed = true;
      PPCGArray.read_only_scalar = false;
      PPCGArray.has_compound_element = false;
      PPCGArray.local = false;
      PPCGArray.declare_local = false;
      PPCGArray.global = false;
      PPCGArray.linearize = false;
      PPCGArray.dep_order = nullptr;
      PPCGArray.user = Array;

      setArrayBounds(PPCGArray, Array);
      i++;

      collect_references(PPCGProg, &PPCGArray);
    }
  }

  /// Create an identity map between the arrays in the scop.
  ///
  /// @returns An identity map between the arrays in the scop.
  isl_union_map *getArrayIdentity() {
    isl_union_map *Maps = isl_union_map_empty(S->getParamSpace());

    for (auto &Array : S->arrays()) {
      isl_space *Space = Array->getSpace();
      Space = isl_space_map_from_set(Space);
      isl_map *Identity = isl_map_identity(Space);
      Maps = isl_union_map_add_map(Maps, Identity);
    }

    return Maps;
  }

  /// Create a default-initialized PPCG GPU program.
  ///
  /// @returns A new gpu program description.
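  ///
  /// ppcg uses the to_inner/to_outer maps to, roughly speaking, translate
  /// between accesses to a compound structure and accesses to its members. As
  /// Polly arrays have no compound members, both maps are plain identities of
  /// the form { MemRef_A[i, j] -> MemRef_A[i, j] } (array name illustrative).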
  gpu_prog *createPPCGProg(ppcg_scop *PPCGScop) {

    if (!PPCGScop)
      return nullptr;

    auto PPCGProg = isl_calloc_type(S->getIslCtx(), struct gpu_prog);

    PPCGProg->ctx = S->getIslCtx();
    PPCGProg->scop = PPCGScop;
    PPCGProg->context = isl_set_copy(PPCGScop->context);
    PPCGProg->read = isl_union_map_copy(PPCGScop->reads);
    PPCGProg->may_write = isl_union_map_copy(PPCGScop->may_writes);
    PPCGProg->must_write = isl_union_map_copy(PPCGScop->must_writes);
    PPCGProg->tagged_must_kill =
        isl_union_map_copy(PPCGScop->tagged_must_kills);
    PPCGProg->to_inner = getArrayIdentity();
    PPCGProg->to_outer = getArrayIdentity();
    PPCGProg->may_persist = compute_may_persist(PPCGProg);
    PPCGProg->any_to_outer = nullptr;
    PPCGProg->array_order = nullptr;
    PPCGProg->n_stmts = std::distance(S->begin(), S->end());
    PPCGProg->stmts = getStatements();
    PPCGProg->n_array = std::distance(S->array_begin(), S->array_end());
    PPCGProg->array = isl_calloc_array(S->getIslCtx(), struct gpu_array_info,
                                       PPCGProg->n_array);

    createArrays(PPCGProg);

    return PPCGProg;
  }

  struct PrintGPUUserData {
    struct cuda_info *CudaInfo;
    struct gpu_prog *PPCGProg;
    std::vector<ppcg_kernel *> Kernels;
  };

  /// Print a user statement node in the host code.
  ///
  /// We use ppcg's printing facilities to print the actual statement and
  /// additionally build up a list of all kernels that are encountered in the
  /// host ast.
  ///
  /// @param P The printer to print to.
  /// @param Options The printing options to use.
  /// @param Node The node to print.
  /// @param User A user pointer to carry additional data. This pointer is
  ///             expected to be of type PrintGPUUserData.
  ///
  /// @returns A printer to which the output has been printed.
  static __isl_give isl_printer *
  printHostUser(__isl_take isl_printer *P,
                __isl_take isl_ast_print_options *Options,
                __isl_take isl_ast_node *Node, void *User) {
    auto Data = (struct PrintGPUUserData *)User;
    auto Id = isl_ast_node_get_annotation(Node);

    if (Id) {
      bool IsUser = !strcmp(isl_id_get_name(Id), "user");

      // If this is a user statement, format it ourselves as ppcg would
      // otherwise try to call pet functionality that is not available in
      // Polly.
      if (IsUser) {
        P = isl_printer_start_line(P);
        P = isl_printer_print_ast_node(P, Node);
        P = isl_printer_end_line(P);
        isl_id_free(Id);
        isl_ast_print_options_free(Options);
        return P;
      }

      auto Kernel = (struct ppcg_kernel *)isl_id_get_user(Id);
      isl_id_free(Id);
      Data->Kernels.push_back(Kernel);
    }

    return print_host_user(P, Options, Node, User);
  }

  /// Print C code corresponding to the control flow in @p Kernel.
  ///
  /// @param Kernel The kernel to print.
  void printKernel(ppcg_kernel *Kernel) {
    auto *P = isl_printer_to_str(S->getIslCtx());
    P = isl_printer_set_output_format(P, ISL_FORMAT_C);
    auto *Options = isl_ast_print_options_alloc(S->getIslCtx());
    P = isl_ast_node_print(Kernel->tree, P, Options);
    char *String = isl_printer_get_str(P);
    printf("%s\n", String);
    free(String);
    isl_printer_free(P);
  }

  /// Print C code corresponding to the GPU code described by @p Tree.
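  ///
  /// The host code is printed first under a '# host' header, followed by one
  /// '# kernel<id>' section for each kernel encountered in the host AST, e.g.:
  ///
  ///   # host
  ///   ...
  ///   # kernel0
  ///   ...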
  ///
  /// @param Tree An AST describing GPU code.
  /// @param PPCGProg The PPCG program from which @p Tree has been constructed.
  void printGPUTree(isl_ast_node *Tree, gpu_prog *PPCGProg) {
    auto *P = isl_printer_to_str(S->getIslCtx());
    P = isl_printer_set_output_format(P, ISL_FORMAT_C);

    PrintGPUUserData Data;
    Data.PPCGProg = PPCGProg;

    auto *Options = isl_ast_print_options_alloc(S->getIslCtx());
    Options =
        isl_ast_print_options_set_print_user(Options, printHostUser, &Data);
    P = isl_ast_node_print(Tree, P, Options);
    char *String = isl_printer_get_str(P);
    printf("# host\n");
    printf("%s\n", String);
    free(String);
    isl_printer_free(P);

    for (auto Kernel : Data.Kernels) {
      printf("# kernel%d\n", Kernel->id);
      printKernel(Kernel);
    }
  }

  /// Generate a GPU program using PPCG.
  ///
  /// GPU mapping consists of multiple steps:
  ///
  ///  1) Compute a new schedule for the program.
  ///  2) Map the schedule to the GPU.
  ///  3) Generate code for the new schedule.
  ///
  /// We do not use the Polly ScheduleOptimizer here, as the schedule
  /// optimizer is mostly CPU specific. Instead, we use PPCG's GPU code
  /// generation strategy directly from this pass.
  gpu_gen *generateGPU(ppcg_scop *PPCGScop, gpu_prog *PPCGProg) {

    auto PPCGGen = isl_calloc_type(S->getIslCtx(), struct gpu_gen);

    PPCGGen->ctx = S->getIslCtx();
    PPCGGen->options = PPCGScop->options;
    PPCGGen->print = nullptr;
    PPCGGen->print_user = nullptr;
    PPCGGen->build_ast_expr = &pollyBuildAstExprForStmt;
    PPCGGen->prog = PPCGProg;
    PPCGGen->tree = nullptr;
    PPCGGen->types.n = 0;
    PPCGGen->types.name = nullptr;
    PPCGGen->sizes = nullptr;
    PPCGGen->used_sizes = nullptr;
    PPCGGen->kernel_id = 0;

    // Set the isl scheduling strategy to the same strategy PPCG uses.
    isl_options_set_schedule_outer_coincidence(PPCGGen->ctx, true);
    isl_options_set_schedule_maximize_band_depth(PPCGGen->ctx, true);
    isl_options_set_schedule_whole_component(PPCGGen->ctx, false);

    isl_schedule *Schedule = get_schedule(PPCGGen);

    int HasPermutable = has_any_permutable_node(Schedule);

    if (!HasPermutable || HasPermutable < 0) {
      Schedule = isl_schedule_free(Schedule);
    } else {
      Schedule = map_to_device(PPCGGen, Schedule);
      PPCGGen->tree = generate_code(PPCGGen, isl_schedule_copy(Schedule));
    }

    if (DumpSchedule) {
      isl_printer *P = isl_printer_to_str(S->getIslCtx());
      P = isl_printer_set_yaml_style(P, ISL_YAML_STYLE_BLOCK);
      P = isl_printer_print_str(P, "Schedule\n");
      P = isl_printer_print_str(P, "========\n");
      if (Schedule)
        P = isl_printer_print_schedule(P, Schedule);
      else
        P = isl_printer_print_str(P, "No schedule found\n");

      char *String = isl_printer_get_str(P);
      printf("%s\n", String);
      free(String);
      isl_printer_free(P);
    }

    if (DumpCode) {
      printf("Code\n");
      printf("====\n");
      if (PPCGGen->tree)
        printGPUTree(PPCGGen->tree, PPCGProg);
      else
        printf("No code generated\n");
    }

    isl_schedule_free(Schedule);

    return PPCGGen;
  }

  /// Free a gpu_gen structure.
  ///
  /// @param PPCGGen The gpu_gen object to free.
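  ///
  /// Note that PPCGGen->options is shared with the surrounding ppcg_scop and
  /// is released separately in freeOptions(), so it must not be freed here.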
  void freePPCGGen(gpu_gen *PPCGGen) {
    isl_ast_node_free(PPCGGen->tree);
    isl_union_map_free(PPCGGen->sizes);
    isl_union_map_free(PPCGGen->used_sizes);
    free(PPCGGen);
  }

  /// Free the options in the ppcg scop structure.
  ///
  /// ppcg does not free these options for us, so to avoid leaks we free them
  /// ourselves.
  ///
  /// @param PPCGScop The scop referencing the options to free.
  void freeOptions(ppcg_scop *PPCGScop) {
    free(PPCGScop->options->debug);
    PPCGScop->options->debug = nullptr;
    free(PPCGScop->options);
    PPCGScop->options = nullptr;
  }

  /// Generate code for a given GPU AST described by @p Root.
  ///
  /// @param Root An isl_ast_node pointing to the root of the GPU AST.
  /// @param Prog The GPU program to generate code for.
  void generateCode(__isl_take isl_ast_node *Root, gpu_prog *Prog) {
    ScopAnnotator Annotator;
    Annotator.buildAliasScopes(*S);

    Region *R = &S->getRegion();

    simplifyRegion(R, DT, LI, RI);

    BasicBlock *EnteringBB = R->getEnteringBlock();

    PollyIRBuilder Builder = createPollyIRBuilder(EnteringBB, Annotator);

    GPUNodeBuilder NodeBuilder(Builder, Annotator, this, *DL, *LI, *SE, *DT,
                               *S, Prog);

    // Only build the run-time condition and parameters _after_ having
    // introduced the conditional branch. This is important, as the
    // conditional branch will guard the original scop from new induction
    // variables that the SCEVExpander may introduce while code generating the
    // parameters and which may introduce scalar dependences that prevent us
    // from correctly code generating this scop.
    BasicBlock *StartBlock =
        executeScopConditionally(*S, this, Builder.getTrue());

    // TODO: Handle LICM.
    // TODO: Verify run-time checks.
    auto SplitBlock = StartBlock->getSinglePredecessor();
    Builder.SetInsertPoint(SplitBlock->getTerminator());
    NodeBuilder.addParameters(S->getContext());
    Builder.SetInsertPoint(&*StartBlock->begin());

    NodeBuilder.initializeAfterRTH();
    NodeBuilder.create(Root);
    NodeBuilder.finalize();
  }

  bool runOnScop(Scop &CurrentScop) override {
    S = &CurrentScop;
    LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
    DL = &S->getRegion().getEntry()->getParent()->getParent()->getDataLayout();
    RI = &getAnalysis<RegionInfoPass>().getRegionInfo();

    // We currently do not support scops with invariant loads.
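    // Hoisted invariant loads would need to be preloaded before the generated
    // code runs, which the GPU code-generation path does not model yet.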
    if (S->hasInvariantAccesses())
      return false;

    auto PPCGScop = createPPCGScop();
    auto PPCGProg = createPPCGProg(PPCGScop);
    auto PPCGGen = generateGPU(PPCGScop, PPCGProg);

    if (PPCGGen->tree)
      generateCode(isl_ast_node_copy(PPCGGen->tree), PPCGProg);

    freeOptions(PPCGScop);
    freePPCGGen(PPCGGen);
    gpu_prog_free(PPCGProg);
    ppcg_scop_free(PPCGScop);

    return true;
  }

  void printScop(raw_ostream &, Scop &) const override {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<RegionInfoPass>();
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addRequired<ScopDetection>();
    AU.addRequired<ScopInfoRegionPass>();
    AU.addRequired<LoopInfoWrapperPass>();

    AU.addPreserved<AAResultsWrapperPass>();
    AU.addPreserved<BasicAAWrapperPass>();
    AU.addPreserved<LoopInfoWrapperPass>();
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addPreserved<GlobalsAAWrapperPass>();
    AU.addPreserved<PostDominatorTreeWrapperPass>();
    AU.addPreserved<ScopDetection>();
    AU.addPreserved<ScalarEvolutionWrapperPass>();
    AU.addPreserved<SCEVAAWrapperPass>();

    // FIXME: We do not yet add regions for the newly generated code to the
    // region tree.
    AU.addPreserved<RegionInfoPass>();
    AU.addPreserved<ScopInfoRegionPass>();
  }
};
} // namespace

char PPCGCodeGeneration::ID = 1;

Pass *polly::createPPCGCodeGenerationPass() { return new PPCGCodeGeneration(); }

INITIALIZE_PASS_BEGIN(PPCGCodeGeneration, "polly-codegen-ppcg",
                      "Polly - Apply PPCG translation to SCOP", false, false)
INITIALIZE_PASS_DEPENDENCY(DependenceInfo);
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass);
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass);
INITIALIZE_PASS_DEPENDENCY(RegionInfoPass);
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass);
INITIALIZE_PASS_DEPENDENCY(ScopDetection);
INITIALIZE_PASS_END(PPCGCodeGeneration, "polly-codegen-ppcg",
                    "Polly - Apply PPCG translation to SCOP", false, false)