//===------ PPCGCodeGeneration.cpp - Polly Accelerator Code Generation. ---===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// Take a scop created by ScopInfo and map it to GPU code using the ppcg
// GPU mapping strategy.
//
//===----------------------------------------------------------------------===//

#include "polly/CodeGen/IslNodeBuilder.h"
#include "polly/CodeGen/Utils.h"
#include "polly/DependenceInfo.h"
#include "polly/LinkAllPasses.h"
#include "polly/Options.h"
#include "polly/ScopInfo.h"
#include "polly/Support/SCEVValidator.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"

#include "isl/union_map.h"

// ppcg is a plain C library; wrap its headers to get C linkage.
extern "C" {
#include "ppcg/cuda.h"
#include "ppcg/gpu.h"
#include "ppcg/gpu_print.h"
#include "ppcg/ppcg.h"
#include "ppcg/schedule.h"
}

#include "llvm/Support/Debug.h"

using namespace polly;
using namespace llvm;

#define DEBUG_TYPE "polly-codegen-ppcg"

// Debugging aid: dump the schedule ppcg computed for the GPU mapping.
static cl::opt<bool> DumpSchedule("polly-acc-dump-schedule",
                                  cl::desc("Dump the computed GPU Schedule"),
                                  cl::Hidden, cl::init(false), cl::ZeroOrMore,
                                  cl::cat(PollyCategory));

// Debugging aid: dump the CUDA-style C code ppcg would have emitted.
static cl::opt<bool>
    DumpCode("polly-acc-dump-code",
             cl::desc("Dump C code describing the GPU mapping"), cl::Hidden,
             cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));

// Debugging aid: dump the LLVM-IR of each generated kernel module.
static cl::opt<bool> DumpKernelIR("polly-acc-dump-kernel-ir",
                                  cl::desc("Dump the kernel LLVM-IR"),
                                  cl::Hidden, cl::init(false), cl::ZeroOrMore,
                                  cl::cat(PollyCategory));

// Debugging aid: dump the PTX assembly of each generated kernel.
static cl::opt<bool> DumpKernelASM("polly-acc-dump-kernel-asm",
                                   cl::desc("Dump the kernel assembly code"),
                                   cl::Hidden, cl::init(false), cl::ZeroOrMore,
                                   cl::cat(PollyCategory));

// Forwarded to the NVPTX backend as TargetOptions::UnsafeFPMath.
static cl::opt<bool> FastMath("polly-acc-fastmath",
                              cl::desc("Allow unsafe math optimizations"),
                              cl::Hidden, cl::init(false), cl::ZeroOrMore,
                              cl::cat(PollyCategory));

// The compute capability (sub-target CPU string) handed to the NVPTX backend.
static cl::opt<std::string>
    CudaVersion("polly-acc-cuda-version",
                cl::desc("The CUDA version to compile for"), cl::Hidden,
                cl::init("sm_30"), cl::ZeroOrMore, cl::cat(PollyCategory));

/// Create the ast expressions for a ScopStmt.
///
/// This function is a callback to generate the ast expressions for each
/// of the scheduled ScopStmts.
88 static __isl_give isl_id_to_ast_expr *pollyBuildAstExprForStmt( 89 void *StmtT, isl_ast_build *Build, 90 isl_multi_pw_aff *(*FunctionIndex)(__isl_take isl_multi_pw_aff *MPA, 91 isl_id *Id, void *User), 92 void *UserIndex, 93 isl_ast_expr *(*FunctionExpr)(isl_ast_expr *Expr, isl_id *Id, void *User), 94 void *UserExpr) { 95 96 ScopStmt *Stmt = (ScopStmt *)StmtT; 97 98 isl_ctx *Ctx; 99 100 if (!Stmt || !Build) 101 return NULL; 102 103 Ctx = isl_ast_build_get_ctx(Build); 104 isl_id_to_ast_expr *RefToExpr = isl_id_to_ast_expr_alloc(Ctx, 0); 105 106 for (MemoryAccess *Acc : *Stmt) { 107 isl_map *AddrFunc = Acc->getAddressFunction(); 108 AddrFunc = isl_map_intersect_domain(AddrFunc, Stmt->getDomain()); 109 isl_id *RefId = Acc->getId(); 110 isl_pw_multi_aff *PMA = isl_pw_multi_aff_from_map(AddrFunc); 111 isl_multi_pw_aff *MPA = isl_multi_pw_aff_from_pw_multi_aff(PMA); 112 MPA = isl_multi_pw_aff_coalesce(MPA); 113 MPA = FunctionIndex(MPA, RefId, UserIndex); 114 isl_ast_expr *Access = isl_ast_build_access_from_multi_pw_aff(Build, MPA); 115 Access = FunctionExpr(Access, RefId, UserExpr); 116 RefToExpr = isl_id_to_ast_expr_set(RefToExpr, RefId, Access); 117 } 118 119 return RefToExpr; 120 } 121 122 /// Generate code for a GPU specific isl AST. 123 /// 124 /// The GPUNodeBuilder augments the general existing IslNodeBuilder, which 125 /// generates code for general-prupose AST nodes, with special functionality 126 /// for generating GPU specific user nodes. 127 /// 128 /// @see GPUNodeBuilder::createUser 129 class GPUNodeBuilder : public IslNodeBuilder { 130 public: 131 GPUNodeBuilder(PollyIRBuilder &Builder, ScopAnnotator &Annotator, Pass *P, 132 const DataLayout &DL, LoopInfo &LI, ScalarEvolution &SE, 133 DominatorTree &DT, Scop &S, gpu_prog *Prog) 134 : IslNodeBuilder(Builder, Annotator, P, DL, LI, SE, DT, S), Prog(Prog) { 135 getExprBuilder().setIDToSAI(&IDToSAI); 136 } 137 138 /// Create after-run-time-check initialization code. 
139 void initializeAfterRTH(); 140 141 /// Finalize the generated scop. 142 virtual void finalize(); 143 144 private: 145 /// A vector of array base pointers for which a new ScopArrayInfo was created. 146 /// 147 /// This vector is used to delete the ScopArrayInfo when it is not needed any 148 /// more. 149 std::vector<Value *> LocalArrays; 150 151 /// A list of device arrays that has been allocated. 152 std::vector<Value *> AllocatedDevArrays; 153 154 /// The current GPU context. 155 Value *GPUContext; 156 157 /// A module containing GPU code. 158 /// 159 /// This pointer is only set in case we are currently generating GPU code. 160 std::unique_ptr<Module> GPUModule; 161 162 /// The GPU program we generate code for. 163 gpu_prog *Prog; 164 165 /// Class to free isl_ids. 166 class IslIdDeleter { 167 public: 168 void operator()(__isl_take isl_id *Id) { isl_id_free(Id); }; 169 }; 170 171 /// A set containing all isl_ids allocated in a GPU kernel. 172 /// 173 /// By releasing this set all isl_ids will be freed. 174 std::set<std::unique_ptr<isl_id, IslIdDeleter>> KernelIDs; 175 176 IslExprBuilder::IDToScopArrayInfoTy IDToSAI; 177 178 /// Create code for user-defined AST nodes. 179 /// 180 /// These AST nodes can be of type: 181 /// 182 /// - ScopStmt: A computational statement (TODO) 183 /// - Kernel: A GPU kernel call (TODO) 184 /// - Data-Transfer: A GPU <-> CPU data-transfer (TODO) 185 /// - In-kernel synchronization 186 /// - In-kernel memory copy statement 187 /// 188 /// @param UserStmt The ast node to generate code for. 189 virtual void createUser(__isl_take isl_ast_node *UserStmt); 190 191 /// Find llvm::Values referenced in GPU kernel. 192 /// 193 /// @param Kernel The kernel to scan for llvm::Values 194 /// 195 /// @returns A set of values referenced by the kernel. 196 SetVector<Value *> getReferencesInKernel(ppcg_kernel *Kernel); 197 198 /// Create GPU kernel. 199 /// 200 /// Code generate the kernel described by @p KernelStmt. 
201 /// 202 /// @param KernelStmt The ast node to generate kernel code for. 203 void createKernel(__isl_take isl_ast_node *KernelStmt); 204 205 /// Create kernel function. 206 /// 207 /// Create a kernel function located in a newly created module that can serve 208 /// as target for device code generation. Set the Builder to point to the 209 /// start block of this newly created function. 210 /// 211 /// @param Kernel The kernel to generate code for. 212 /// @param SubtreeValues The set of llvm::Values referenced by this kernel. 213 void createKernelFunction(ppcg_kernel *Kernel, 214 SetVector<Value *> &SubtreeValues); 215 216 /// Create the declaration of a kernel function. 217 /// 218 /// The kernel function takes as arguments: 219 /// 220 /// - One i8 pointer for each external array reference used in the kernel. 221 /// - Host iterators 222 /// - Parameters 223 /// - Other LLVM Value references (TODO) 224 /// 225 /// @param Kernel The kernel to generate the function declaration for. 226 /// @param SubtreeValues The set of llvm::Values referenced by this kernel. 227 /// 228 /// @returns The newly declared function. 229 Function *createKernelFunctionDecl(ppcg_kernel *Kernel, 230 SetVector<Value *> &SubtreeValues); 231 232 /// Insert intrinsic functions to obtain thread and block ids. 233 /// 234 /// @param The kernel to generate the intrinsic functions for. 235 void insertKernelIntrinsics(ppcg_kernel *Kernel); 236 237 /// Create code for a ScopStmt called in @p Expr. 238 /// 239 /// @param Expr The expression containing the call. 240 /// @param KernelStmt The kernel statement referenced in the call. 241 void createScopStmt(isl_ast_expr *Expr, ppcg_kernel_stmt *KernelStmt); 242 243 /// Create an in-kernel synchronization call. 244 void createKernelSync(); 245 246 /// Create a PTX assembly string for the current GPU kernel. 247 /// 248 /// @returns A string containing the corresponding PTX assembly code. 
249 std::string createKernelASM(); 250 251 /// Remove references from the dominator tree to the kernel function @p F. 252 /// 253 /// @param F The function to remove references to. 254 void clearDominators(Function *F); 255 256 /// Remove references from scalar evolution to the kernel function @p F. 257 /// 258 /// @param F The function to remove references to. 259 void clearScalarEvolution(Function *F); 260 261 /// Remove references from loop info to the kernel function @p F. 262 /// 263 /// @param F The function to remove references to. 264 void clearLoops(Function *F); 265 266 /// Finalize the generation of the kernel function. 267 /// 268 /// Free the LLVM-IR module corresponding to the kernel and -- if requested -- 269 /// dump its IR to stderr. 270 void finalizeKernelFunction(); 271 272 /// Create code that allocates memory to store arrays on device. 273 void allocateDeviceArrays(); 274 275 /// Free all allocated device arrays. 276 void freeDeviceArrays(); 277 278 /// Create a call to initialize the GPU context. 279 /// 280 /// @returns A pointer to the newly initialized context. 281 Value *createCallInitContext(); 282 283 /// Create a call to free the GPU context. 284 /// 285 /// @param Context A pointer to an initialized GPU context. 286 void createCallFreeContext(Value *Context); 287 288 /// Create a call to allocate memory on the device. 289 /// 290 /// @param Size The size of memory to allocate 291 /// 292 /// @returns A pointer that identifies this allocation. 293 Value *createCallAllocateMemoryForDevice(Value *Size); 294 295 /// Create a call to free a device array. 296 /// 297 /// @param Array The device array to free. 
298 void createCallFreeDeviceMemory(Value *Array); 299 }; 300 301 void GPUNodeBuilder::initializeAfterRTH() { 302 GPUContext = createCallInitContext(); 303 allocateDeviceArrays(); 304 } 305 306 void GPUNodeBuilder::finalize() { 307 freeDeviceArrays(); 308 createCallFreeContext(GPUContext); 309 IslNodeBuilder::finalize(); 310 } 311 312 void GPUNodeBuilder::allocateDeviceArrays() { 313 isl_ast_build *Build = isl_ast_build_from_context(S.getContext()); 314 315 for (int i = 0; i < Prog->n_array; ++i) { 316 gpu_array_info *Array = &Prog->array[i]; 317 std::string DevArrayName("p_dev_array_"); 318 DevArrayName.append(Array->name); 319 320 Value *ArraySize = ConstantInt::get(Builder.getInt64Ty(), Array->size); 321 322 if (!gpu_array_is_scalar(Array)) { 323 auto OffsetDimZero = isl_pw_aff_copy(Array->bound[0]); 324 isl_ast_expr *Res = isl_ast_build_expr_from_pw_aff(Build, OffsetDimZero); 325 326 for (unsigned int i = 1; i < Array->n_index; i++) { 327 isl_pw_aff *Bound_I = isl_pw_aff_copy(Array->bound[i]); 328 isl_ast_expr *Expr = isl_ast_build_expr_from_pw_aff(Build, Bound_I); 329 Res = isl_ast_expr_mul(Res, Expr); 330 } 331 332 Value *NumElements = ExprBuilder.create(Res); 333 ArraySize = Builder.CreateMul(ArraySize, NumElements); 334 } 335 336 Value *DevArray = createCallAllocateMemoryForDevice(ArraySize); 337 DevArray->setName(DevArrayName); 338 AllocatedDevArrays.push_back(DevArray); 339 } 340 341 isl_ast_build_free(Build); 342 } 343 344 void GPUNodeBuilder::freeDeviceArrays() { 345 for (auto &Array : AllocatedDevArrays) 346 createCallFreeDeviceMemory(Array); 347 } 348 349 void GPUNodeBuilder::createCallFreeDeviceMemory(Value *Array) { 350 const char *Name = "polly_freeDeviceMemory"; 351 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 352 Function *F = M->getFunction(Name); 353 354 // If F is not available, declare it. 
355 if (!F) { 356 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 357 std::vector<Type *> Args; 358 Args.push_back(Builder.getInt8PtrTy()); 359 FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); 360 F = Function::Create(Ty, Linkage, Name, M); 361 } 362 363 Builder.CreateCall(F, {Array}); 364 } 365 366 Value *GPUNodeBuilder::createCallAllocateMemoryForDevice(Value *Size) { 367 const char *Name = "polly_allocateMemoryForDevice"; 368 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 369 Function *F = M->getFunction(Name); 370 371 // If F is not available, declare it. 372 if (!F) { 373 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 374 std::vector<Type *> Args; 375 Args.push_back(Builder.getInt64Ty()); 376 FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false); 377 F = Function::Create(Ty, Linkage, Name, M); 378 } 379 380 return Builder.CreateCall(F, {Size}); 381 } 382 383 Value *GPUNodeBuilder::createCallInitContext() { 384 const char *Name = "polly_initContext"; 385 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 386 Function *F = M->getFunction(Name); 387 388 // If F is not available, declare it. 389 if (!F) { 390 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 391 std::vector<Type *> Args; 392 FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false); 393 F = Function::Create(Ty, Linkage, Name, M); 394 } 395 396 return Builder.CreateCall(F, {}); 397 } 398 399 void GPUNodeBuilder::createCallFreeContext(Value *Context) { 400 const char *Name = "polly_freeContext"; 401 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 402 Function *F = M->getFunction(Name); 403 404 // If F is not available, declare it. 
405 if (!F) { 406 GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; 407 std::vector<Type *> Args; 408 Args.push_back(Builder.getInt8PtrTy()); 409 FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false); 410 F = Function::Create(Ty, Linkage, Name, M); 411 } 412 413 Builder.CreateCall(F, {Context}); 414 } 415 416 /// Check if one string is a prefix of another. 417 /// 418 /// @param String The string in which to look for the prefix. 419 /// @param Prefix The prefix to look for. 420 static bool isPrefix(std::string String, std::string Prefix) { 421 return String.find(Prefix) == 0; 422 } 423 424 void GPUNodeBuilder::createUser(__isl_take isl_ast_node *UserStmt) { 425 isl_ast_expr *Expr = isl_ast_node_user_get_expr(UserStmt); 426 isl_ast_expr *StmtExpr = isl_ast_expr_get_op_arg(Expr, 0); 427 isl_id *Id = isl_ast_expr_get_id(StmtExpr); 428 isl_id_free(Id); 429 isl_ast_expr_free(StmtExpr); 430 431 const char *Str = isl_id_get_name(Id); 432 if (!strcmp(Str, "kernel")) { 433 createKernel(UserStmt); 434 isl_ast_expr_free(Expr); 435 return; 436 } 437 438 if (isPrefix(Str, "to_device") || isPrefix(Str, "from_device")) { 439 // TODO: Insert memory copies 440 isl_ast_expr_free(Expr); 441 isl_ast_node_free(UserStmt); 442 return; 443 } 444 445 isl_id *Anno = isl_ast_node_get_annotation(UserStmt); 446 struct ppcg_kernel_stmt *KernelStmt = 447 (struct ppcg_kernel_stmt *)isl_id_get_user(Anno); 448 isl_id_free(Anno); 449 450 switch (KernelStmt->type) { 451 case ppcg_kernel_domain: 452 createScopStmt(Expr, KernelStmt); 453 isl_ast_node_free(UserStmt); 454 return; 455 case ppcg_kernel_copy: 456 // TODO: Create kernel copy stmt 457 isl_ast_expr_free(Expr); 458 isl_ast_node_free(UserStmt); 459 return; 460 case ppcg_kernel_sync: 461 createKernelSync(); 462 isl_ast_expr_free(Expr); 463 isl_ast_node_free(UserStmt); 464 return; 465 } 466 467 isl_ast_expr_free(Expr); 468 isl_ast_node_free(UserStmt); 469 return; 470 } 471 472 void 
GPUNodeBuilder::createScopStmt(isl_ast_expr *Expr, 473 ppcg_kernel_stmt *KernelStmt) { 474 auto Stmt = (ScopStmt *)KernelStmt->u.d.stmt->stmt; 475 isl_id_to_ast_expr *Indexes = KernelStmt->u.d.ref2expr; 476 477 LoopToScevMapT LTS; 478 LTS.insert(OutsideLoopIterations.begin(), OutsideLoopIterations.end()); 479 480 createSubstitutions(Expr, Stmt, LTS); 481 482 if (Stmt->isBlockStmt()) 483 BlockGen.copyStmt(*Stmt, LTS, Indexes); 484 else 485 assert(0 && "Region statement not supported\n"); 486 } 487 488 void GPUNodeBuilder::createKernelSync() { 489 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 490 auto *Sync = Intrinsic::getDeclaration(M, Intrinsic::nvvm_barrier0); 491 Builder.CreateCall(Sync, {}); 492 } 493 494 /// Collect llvm::Values referenced from @p Node 495 /// 496 /// This function only applies to isl_ast_nodes that are user_nodes referring 497 /// to a ScopStmt. All other node types are ignore. 498 /// 499 /// @param Node The node to collect references for. 500 /// @param User A user pointer used as storage for the data that is collected. 501 /// 502 /// @returns isl_bool_true if data could be collected successfully. 
isl_bool collectReferencesInGPUStmt(__isl_keep isl_ast_node *Node, void *User) {
  if (isl_ast_node_get_type(Node) != isl_ast_node_user)
    return isl_bool_true;

  // Extract the name of the statement this user node calls.
  isl_ast_expr *Expr = isl_ast_node_user_get_expr(Node);
  isl_ast_expr *StmtExpr = isl_ast_expr_get_op_arg(Expr, 0);
  isl_id *Id = isl_ast_expr_get_id(StmtExpr);
  const char *Str = isl_id_get_name(Id);
  isl_id_free(Id);
  isl_ast_expr_free(StmtExpr);
  isl_ast_expr_free(Expr);

  // NOTE(review): Str points into the id's name and is read after our
  // references were dropped; this appears to rely on @p Node keeping the
  // underlying expression (and thus the id) alive -- confirm.
  if (!isPrefix(Str, "Stmt"))
    return isl_bool_true;

  // The annotation of a statement node holds the ppcg_kernel_stmt, which in
  // turn references the polly::ScopStmt it was generated from.
  Id = isl_ast_node_get_annotation(Node);
  auto *KernelStmt = (ppcg_kernel_stmt *)isl_id_get_user(Id);
  auto Stmt = (ScopStmt *)KernelStmt->u.d.stmt->stmt;
  isl_id_free(Id);

  // Accumulate the statement's references into the SubtreeReferences struct
  // passed through @p User.
  addReferencesFromStmt(Stmt, User);

  return isl_bool_true;
}

SetVector<Value *> GPUNodeBuilder::getReferencesInKernel(ppcg_kernel *Kernel) {
  SetVector<Value *> SubtreeValues;
  SetVector<const SCEV *> SCEVs;
  SetVector<const Loop *> Loops;
  SubtreeReferences References = {
      LI, SE, S, ValueMap, SubtreeValues, SCEVs, getBlockGenerator()};

  // Start with every value the builder currently knows about ...
  for (const auto &I : IDToValue)
    SubtreeValues.insert(I.second);

  // ... and add all values referenced from statements inside the kernel tree.
  isl_ast_node_foreach_descendant_top_down(
      Kernel->tree, collectReferencesInGPUStmt, &References);

  // Add values that are only referenced through SCEV expressions.
  for (const SCEV *Expr : SCEVs)
    findValues(Expr, SE, SubtreeValues);

  // Array base pointers are passed to the kernel separately, drop them here.
  for (auto &SAI : S.arrays())
    SubtreeValues.remove(SAI.second->getBasePtr());

  // Scop parameters are also passed as explicit kernel arguments.
  isl_space *Space = S.getParamSpace();
  for (long i = 0; i < isl_space_dim(Space, isl_dim_param); i++) {
    isl_id *Id = isl_space_get_dim_id(Space, isl_dim_param, i);
    assert(IDToValue.count(Id));
    Value *Val = IDToValue[Id];
    SubtreeValues.remove(Val);
    isl_id_free(Id);
  }
  isl_space_free(Space);

  // As are the host loop iterators of the kernel's space.
  for (long i = 0; i < isl_space_dim(Kernel->space, isl_dim_set); i++) {
    isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i);
    assert(IDToValue.count(Id));
    Value *Val = IDToValue[Id];
    SubtreeValues.remove(Val);
    isl_id_free(Id);
  }

  return SubtreeValues;
}

void GPUNodeBuilder::clearDominators(Function *F) {
  // Collect the blocks in post-order so children are erased before their
  // parents, which eraseNode requires.
  DomTreeNode *N = DT.getNode(&F->getEntryBlock());
  std::vector<BasicBlock *> Nodes;
  for (po_iterator<DomTreeNode *> I = po_begin(N), E = po_end(N); I != E; ++I)
    Nodes.push_back(I->getBlock());

  for (BasicBlock *BB : Nodes)
    DT.eraseNode(BB);
}

void GPUNodeBuilder::clearScalarEvolution(Function *F) {
  for (BasicBlock &BB : *F) {
    Loop *L = LI.getLoopFor(&BB);
    if (L)
      SE.forgetLoop(L);
  }
}

void GPUNodeBuilder::clearLoops(Function *F) {
  for (BasicBlock &BB : *F) {
    Loop *L = LI.getLoopFor(&BB);
    // Invalidate SCEV results for the loop before removing the block from
    // loop info (redundant with clearScalarEvolution when both are called,
    // but forgetLoop is idempotent).
    if (L)
      SE.forgetLoop(L);
    LI.removeBlock(&BB);
  }
}

void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) {
  // The kernel description is attached to the AST node as an annotation.
  isl_id *Id = isl_ast_node_get_annotation(KernelStmt);
  ppcg_kernel *Kernel = (ppcg_kernel *)isl_id_get_user(Id);
  isl_id_free(Id);
  isl_ast_node_free(KernelStmt);

  SetVector<Value *> SubtreeValues = getReferencesInKernel(Kernel);

  assert(Kernel->tree && "Device AST of kernel node is empty");

  // Remember the host state so we can restore it after generating the
  // device-side code.
  Instruction &HostInsertPoint = *Builder.GetInsertPoint();
  IslExprBuilder::IDToValueTy HostIDs = IDToValue;
  ValueMapT HostValueMap = ValueMap;

  // NOTE(review): Loops is never populated here, so the loop below is
  // currently a no-op -- presumably a placeholder for forwarding outer loop
  // iteration values; confirm against the callers/intended design.
  SetVector<const Loop *> Loops;

  // Create for all loops we depend on values that contain the current loop
  // iteration. These values are necessary to generate code for SCEVs that
  // depend on such loops. As a result we need to pass them to the subfunction.
  for (const Loop *L : Loops) {
    const SCEV *OuterLIV = SE.getAddRecExpr(SE.getUnknown(Builder.getInt64(0)),
                                            SE.getUnknown(Builder.getInt64(1)),
                                            L, SCEV::FlagAnyWrap);
    Value *V = generateSCEV(OuterLIV);
    OutsideLoopIterations[L] = SE.getUnknown(V);
    SubtreeValues.insert(V);
  }

  createKernelFunction(Kernel, SubtreeValues);

  // Generate the device code for the kernel's AST.
  create(isl_ast_node_copy(Kernel->tree));

  // Drop all analysis references to the just-generated kernel function; it
  // lives in a separate module and must not be visible to host analyses.
  Function *F = Builder.GetInsertBlock()->getParent();
  clearDominators(F);
  clearScalarEvolution(F);
  clearLoops(F);

  // Restore the host state saved above.
  Builder.SetInsertPoint(&HostInsertPoint);
  IDToValue = HostIDs;

  ValueMap = HostValueMap;
  ScalarMap.clear();
  PHIOpMap.clear();
  EscapeMap.clear();
  IDToSAI.clear();
  Annotator.resetAlternativeAliasBases();
  // Drop the ScopArrayInfo objects created for kernel-local arrays.
  for (auto &BasePtr : LocalArrays)
    S.invalidateScopArrayInfo(BasePtr, ScopArrayInfo::MK_Array);
  LocalArrays.clear();

  finalizeKernelFunction();
}

/// Compute the DataLayout string for the NVPTX backend.
///
/// @param is64Bit Are we looking for a 64 bit architecture?
///
/// @returns The computed data layout string.
651 static std::string computeNVPTXDataLayout(bool is64Bit) { 652 std::string Ret = "e"; 653 654 if (!is64Bit) 655 Ret += "-p:32:32"; 656 657 Ret += "-i64:64-v16:16-v32:32-n16:32:64"; 658 659 return Ret; 660 } 661 662 Function * 663 GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel, 664 SetVector<Value *> &SubtreeValues) { 665 std::vector<Type *> Args; 666 std::string Identifier = "kernel_" + std::to_string(Kernel->id); 667 668 for (long i = 0; i < Prog->n_array; i++) { 669 if (!ppcg_kernel_requires_array_argument(Kernel, i)) 670 continue; 671 672 Args.push_back(Builder.getInt8PtrTy()); 673 } 674 675 int NumHostIters = isl_space_dim(Kernel->space, isl_dim_set); 676 677 for (long i = 0; i < NumHostIters; i++) 678 Args.push_back(Builder.getInt64Ty()); 679 680 int NumVars = isl_space_dim(Kernel->space, isl_dim_param); 681 682 for (long i = 0; i < NumVars; i++) 683 Args.push_back(Builder.getInt64Ty()); 684 685 for (auto *V : SubtreeValues) 686 Args.push_back(V->getType()); 687 688 auto *FT = FunctionType::get(Builder.getVoidTy(), Args, false); 689 auto *FN = Function::Create(FT, Function::ExternalLinkage, Identifier, 690 GPUModule.get()); 691 FN->setCallingConv(CallingConv::PTX_Kernel); 692 693 auto Arg = FN->arg_begin(); 694 for (long i = 0; i < Kernel->n_array; i++) { 695 if (!ppcg_kernel_requires_array_argument(Kernel, i)) 696 continue; 697 698 Arg->setName(Kernel->array[i].array->name); 699 700 isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set); 701 const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl_id_copy(Id)); 702 Type *EleTy = SAI->getElementType(); 703 Value *Val = &*Arg; 704 SmallVector<const SCEV *, 4> Sizes; 705 isl_ast_build *Build = 706 isl_ast_build_from_context(isl_set_copy(Prog->context)); 707 for (long j = 1; j < Kernel->array[i].array->n_index; j++) { 708 isl_ast_expr *DimSize = isl_ast_build_expr_from_pw_aff( 709 Build, isl_pw_aff_copy(Kernel->array[i].array->bound[j])); 710 auto V = ExprBuilder.create(DimSize); 
711 Sizes.push_back(SE.getSCEV(V)); 712 } 713 const ScopArrayInfo *SAIRep = 714 S.getOrCreateScopArrayInfo(Val, EleTy, Sizes, ScopArrayInfo::MK_Array); 715 LocalArrays.push_back(Val); 716 717 isl_ast_build_free(Build); 718 isl_id_free(Id); 719 IDToSAI[Id] = SAIRep; 720 Arg++; 721 } 722 723 for (long i = 0; i < NumHostIters; i++) { 724 isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i); 725 Arg->setName(isl_id_get_name(Id)); 726 IDToValue[Id] = &*Arg; 727 KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id)); 728 Arg++; 729 } 730 731 for (long i = 0; i < NumVars; i++) { 732 isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_param, i); 733 Arg->setName(isl_id_get_name(Id)); 734 IDToValue[Id] = &*Arg; 735 KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id)); 736 Arg++; 737 } 738 739 for (auto *V : SubtreeValues) { 740 Arg->setName(V->getName()); 741 ValueMap[V] = &*Arg; 742 Arg++; 743 } 744 745 return FN; 746 } 747 748 void GPUNodeBuilder::insertKernelIntrinsics(ppcg_kernel *Kernel) { 749 Intrinsic::ID IntrinsicsBID[] = {Intrinsic::nvvm_read_ptx_sreg_ctaid_x, 750 Intrinsic::nvvm_read_ptx_sreg_ctaid_y}; 751 752 Intrinsic::ID IntrinsicsTID[] = {Intrinsic::nvvm_read_ptx_sreg_tid_x, 753 Intrinsic::nvvm_read_ptx_sreg_tid_y, 754 Intrinsic::nvvm_read_ptx_sreg_tid_z}; 755 756 auto addId = [this](__isl_take isl_id *Id, Intrinsic::ID Intr) mutable { 757 std::string Name = isl_id_get_name(Id); 758 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 759 Function *IntrinsicFn = Intrinsic::getDeclaration(M, Intr); 760 Value *Val = Builder.CreateCall(IntrinsicFn, {}); 761 Val = Builder.CreateIntCast(Val, Builder.getInt64Ty(), false, Name); 762 IDToValue[Id] = Val; 763 KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id)); 764 }; 765 766 for (int i = 0; i < Kernel->n_grid; ++i) { 767 isl_id *Id = isl_id_list_get_id(Kernel->block_ids, i); 768 addId(Id, IntrinsicsBID[i]); 769 } 770 771 for (int i = 0; i < Kernel->n_block; ++i) { 772 
isl_id *Id = isl_id_list_get_id(Kernel->thread_ids, i); 773 addId(Id, IntrinsicsTID[i]); 774 } 775 } 776 777 void GPUNodeBuilder::createKernelFunction(ppcg_kernel *Kernel, 778 SetVector<Value *> &SubtreeValues) { 779 780 std::string Identifier = "kernel_" + std::to_string(Kernel->id); 781 GPUModule.reset(new Module(Identifier, Builder.getContext())); 782 GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-cuda")); 783 GPUModule->setDataLayout(computeNVPTXDataLayout(true /* is64Bit */)); 784 785 Function *FN = createKernelFunctionDecl(Kernel, SubtreeValues); 786 787 BasicBlock *PrevBlock = Builder.GetInsertBlock(); 788 auto EntryBlock = BasicBlock::Create(Builder.getContext(), "entry", FN); 789 790 DominatorTree &DT = P->getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 791 DT.addNewBlock(EntryBlock, PrevBlock); 792 793 Builder.SetInsertPoint(EntryBlock); 794 Builder.CreateRetVoid(); 795 Builder.SetInsertPoint(EntryBlock, EntryBlock->begin()); 796 797 insertKernelIntrinsics(Kernel); 798 } 799 800 std::string GPUNodeBuilder::createKernelASM() { 801 llvm::Triple GPUTriple(Triple::normalize("nvptx64-nvidia-cuda")); 802 std::string ErrMsg; 803 auto GPUTarget = TargetRegistry::lookupTarget(GPUTriple.getTriple(), ErrMsg); 804 805 if (!GPUTarget) { 806 errs() << ErrMsg << "\n"; 807 return ""; 808 } 809 810 TargetOptions Options; 811 Options.UnsafeFPMath = FastMath; 812 std::unique_ptr<TargetMachine> TargetM( 813 GPUTarget->createTargetMachine(GPUTriple.getTriple(), CudaVersion, "", 814 Options, Optional<Reloc::Model>())); 815 816 SmallString<0> ASMString; 817 raw_svector_ostream ASMStream(ASMString); 818 llvm::legacy::PassManager PM; 819 820 PM.add(createTargetTransformInfoWrapperPass(TargetM->getTargetIRAnalysis())); 821 822 if (TargetM->addPassesToEmitFile( 823 PM, ASMStream, TargetMachine::CGFT_AssemblyFile, true /* verify */)) { 824 errs() << "The target does not support generation of this file type!\n"; 825 return ""; 826 } 827 828 PM.run(*GPUModule); 829 
830 return ASMStream.str(); 831 } 832 833 void GPUNodeBuilder::finalizeKernelFunction() { 834 // Verify module. 835 llvm::legacy::PassManager Passes; 836 Passes.add(createVerifierPass()); 837 Passes.run(*GPUModule); 838 839 if (DumpKernelIR) 840 outs() << *GPUModule << "\n"; 841 842 // Optimize module. 843 llvm::legacy::PassManager OptPasses; 844 PassManagerBuilder PassBuilder; 845 PassBuilder.OptLevel = 3; 846 PassBuilder.SizeLevel = 0; 847 PassBuilder.populateModulePassManager(OptPasses); 848 OptPasses.run(*GPUModule); 849 850 std::string Assembly = createKernelASM(); 851 852 if (DumpKernelASM) 853 outs() << Assembly << "\n"; 854 855 GPUModule.release(); 856 KernelIDs.clear(); 857 } 858 859 namespace { 860 class PPCGCodeGeneration : public ScopPass { 861 public: 862 static char ID; 863 864 /// The scop that is currently processed. 865 Scop *S; 866 867 LoopInfo *LI; 868 DominatorTree *DT; 869 ScalarEvolution *SE; 870 const DataLayout *DL; 871 RegionInfo *RI; 872 873 PPCGCodeGeneration() : ScopPass(ID) {} 874 875 /// Construct compilation options for PPCG. 876 /// 877 /// @returns The compilation options. 
878 ppcg_options *createPPCGOptions() { 879 auto DebugOptions = 880 (ppcg_debug_options *)malloc(sizeof(ppcg_debug_options)); 881 auto Options = (ppcg_options *)malloc(sizeof(ppcg_options)); 882 883 DebugOptions->dump_schedule_constraints = false; 884 DebugOptions->dump_schedule = false; 885 DebugOptions->dump_final_schedule = false; 886 DebugOptions->dump_sizes = false; 887 888 Options->debug = DebugOptions; 889 890 Options->reschedule = true; 891 Options->scale_tile_loops = false; 892 Options->wrap = false; 893 894 Options->non_negative_parameters = false; 895 Options->ctx = nullptr; 896 Options->sizes = nullptr; 897 898 Options->tile_size = 32; 899 900 Options->use_private_memory = false; 901 Options->use_shared_memory = false; 902 Options->max_shared_memory = 0; 903 904 Options->target = PPCG_TARGET_CUDA; 905 Options->openmp = false; 906 Options->linearize_device_arrays = true; 907 Options->live_range_reordering = false; 908 909 Options->opencl_compiler_options = nullptr; 910 Options->opencl_use_gpu = false; 911 Options->opencl_n_include_file = 0; 912 Options->opencl_include_files = nullptr; 913 Options->opencl_print_kernel_types = false; 914 Options->opencl_embed_kernel_code = false; 915 916 Options->save_schedule_file = nullptr; 917 Options->load_schedule_file = nullptr; 918 919 return Options; 920 } 921 922 /// Get a tagged access relation containing all accesses of type @p AccessTy. 923 /// 924 /// Instead of a normal access of the form: 925 /// 926 /// Stmt[i,j,k] -> Array[f_0(i,j,k), f_1(i,j,k)] 927 /// 928 /// a tagged access has the form 929 /// 930 /// [Stmt[i,j,k] -> id[]] -> Array[f_0(i,j,k), f_1(i,j,k)] 931 /// 932 /// where 'id' is an additional space that references the memory access that 933 /// triggered the access. 934 /// 935 /// @param AccessTy The type of the memory accesses to collect. 936 /// 937 /// @return The relation describing all tagged memory accesses. 
938 isl_union_map *getTaggedAccesses(enum MemoryAccess::AccessType AccessTy) { 939 isl_union_map *Accesses = isl_union_map_empty(S->getParamSpace()); 940 941 for (auto &Stmt : *S) 942 for (auto &Acc : Stmt) 943 if (Acc->getType() == AccessTy) { 944 isl_map *Relation = Acc->getAccessRelation(); 945 Relation = isl_map_intersect_domain(Relation, Stmt.getDomain()); 946 947 isl_space *Space = isl_map_get_space(Relation); 948 Space = isl_space_range(Space); 949 Space = isl_space_from_range(Space); 950 Space = isl_space_set_tuple_id(Space, isl_dim_in, Acc->getId()); 951 isl_map *Universe = isl_map_universe(Space); 952 Relation = isl_map_domain_product(Relation, Universe); 953 Accesses = isl_union_map_add_map(Accesses, Relation); 954 } 955 956 return Accesses; 957 } 958 959 /// Get the set of all read accesses, tagged with the access id. 960 /// 961 /// @see getTaggedAccesses 962 isl_union_map *getTaggedReads() { 963 return getTaggedAccesses(MemoryAccess::READ); 964 } 965 966 /// Get the set of all may (and must) accesses, tagged with the access id. 967 /// 968 /// @see getTaggedAccesses 969 isl_union_map *getTaggedMayWrites() { 970 return isl_union_map_union(getTaggedAccesses(MemoryAccess::MAY_WRITE), 971 getTaggedAccesses(MemoryAccess::MUST_WRITE)); 972 } 973 974 /// Get the set of all must accesses, tagged with the access id. 975 /// 976 /// @see getTaggedAccesses 977 isl_union_map *getTaggedMustWrites() { 978 return getTaggedAccesses(MemoryAccess::MUST_WRITE); 979 } 980 981 /// Collect parameter and array names as isl_ids. 982 /// 983 /// To reason about the different parameters and arrays used, ppcg requires 984 /// a list of all isl_ids in use. As PPCG traditionally performs 985 /// source-to-source compilation each of these isl_ids is mapped to the 986 /// expression that represents it. As we do not have a corresponding 987 /// expression in Polly, we just map each id to a 'zero' expression to match 988 /// the data format that ppcg expects. 
///
/// @returns A map from collected ids to 'zero' ast expressions.
__isl_give isl_id_to_ast_expr *getNames() {
  // Reserve one slot per parameter plus one per array.
  auto *Names = isl_id_to_ast_expr_alloc(
      S->getIslCtx(),
      S->getNumParams() + std::distance(S->array_begin(), S->array_end()));
  auto *Zero = isl_ast_expr_from_val(isl_val_zero(S->getIslCtx()));
  auto *Space = S->getParamSpace();

  // Map every parameter id to the shared placeholder 'zero' expression.
  for (int I = 0, E = S->getNumParams(); I < E; ++I) {
    isl_id *Id = isl_space_get_dim_id(Space, isl_dim_param, I);
    Names = isl_id_to_ast_expr_set(Names, Id, isl_ast_expr_copy(Zero));
  }

  // Map every array base-pointer id to the placeholder as well.
  for (auto &Array : S->arrays()) {
    auto Id = Array.second->getBasePtrId();
    Names = isl_id_to_ast_expr_set(Names, Id, isl_ast_expr_copy(Zero));
  }

  // Release our references; the map holds its own copies of Zero.
  isl_space_free(Space);
  isl_ast_expr_free(Zero);

  return Names;
}

/// Create a new PPCG scop from the current scop.
///
/// The PPCG scop is initialized with data from the current polly::Scop. From
/// this initial data, the data-dependences in the PPCG scop are initialized.
/// We do not use Polly's dependence analysis for now, to ensure we match
/// the PPCG default behaviour more closely.
///
/// @returns A new ppcg scop.
ppcg_scop *createPPCGScop() {
  auto PPCGScop = (ppcg_scop *)malloc(sizeof(ppcg_scop));

  PPCGScop->options = createPPCGOptions();

  // Polly does not track source locations, so the source byte range is
  // simply zeroed.
  PPCGScop->start = 0;
  PPCGScop->end = 0;

  PPCGScop->context = S->getContext();
  PPCGScop->domain = S->getDomains();
  PPCGScop->call = nullptr;
  PPCGScop->tagged_reads = getTaggedReads();
  PPCGScop->reads = S->getReads();
  PPCGScop->live_in = nullptr;
  PPCGScop->tagged_may_writes = getTaggedMayWrites();
  PPCGScop->may_writes = S->getWrites();
  PPCGScop->tagged_must_writes = getTaggedMustWrites();
  PPCGScop->must_writes = S->getMustWrites();
  PPCGScop->live_out = nullptr;
  // No must-kill accesses are modeled; use an empty relation.
  PPCGScop->tagged_must_kills = isl_union_map_empty(S->getParamSpace());
  PPCGScop->tagger = nullptr;

  // Dependences are not taken from Polly; they are recomputed below via
  // compute_dependences() to match ppcg's default behaviour.
  PPCGScop->independence = nullptr;
  PPCGScop->dep_flow = nullptr;
  PPCGScop->tagged_dep_flow = nullptr;
  PPCGScop->dep_false = nullptr;
  PPCGScop->dep_forced = nullptr;
  PPCGScop->dep_order = nullptr;
  PPCGScop->tagged_dep_order = nullptr;

  PPCGScop->schedule = S->getScheduleTree();
  PPCGScop->names = getNames();

  PPCGScop->pet = nullptr;

  compute_tagger(PPCGScop);
  compute_dependences(PPCGScop);

  return PPCGScop;
}

/// Collect the array accesses in a statement.
///
/// @param Stmt The statement for which to collect the accesses.
///
/// @returns A list of array accesses.
gpu_stmt_access *getStmtAccesses(ScopStmt &Stmt) {
  gpu_stmt_access *Accesses = nullptr;

  // Build a singly-linked list with one gpu_stmt_access per memory access.
  // New entries are prepended, so the list ends up in reverse access order.
  for (MemoryAccess *Acc : Stmt) {
    auto Access = isl_alloc_type(S->getIslCtx(), struct gpu_stmt_access);
    Access->read = Acc->isRead();
    Access->write = Acc->isWrite();
    Access->access = Acc->getAccessRelation();
    // Tag the access relation with the access id, analogous to
    // getTaggedAccesses: [Stmt[...] -> id[]] -> Array[...].
    isl_space *Space = isl_map_get_space(Access->access);
    Space = isl_space_range(Space);
    Space = isl_space_from_range(Space);
    Space = isl_space_set_tuple_id(Space, isl_dim_in, Acc->getId());
    isl_map *Universe = isl_map_universe(Space);
    Access->tagged_access =
        isl_map_domain_product(Acc->getAccessRelation(), Universe);
    // NOTE(review): exact_write is set for *any* write, including
    // may-writes -- confirm whether ppcg expects this to hold only for
    // must-writes.
    Access->exact_write = Acc->isWrite();
    Access->ref_id = Acc->getId();
    Access->next = Accesses;
    Accesses = Access;
  }

  return Accesses;
}

/// Collect the list of GPU statements.
///
/// Each statement has an id, a pointer to the underlying data structure,
/// as well as a list with all memory accesses (built by getStmtAccesses).
///
/// @returns A linked-list of statements.
gpu_stmt *getStatements() {
  gpu_stmt *Stmts = isl_calloc_array(S->getIslCtx(), struct gpu_stmt,
                                     std::distance(S->begin(), S->end()));

  int i = 0;
  for (auto &Stmt : *S) {
    gpu_stmt *GPUStmt = &Stmts[i];

    GPUStmt->id = Stmt.getDomainId();

    // We use the pet stmt pointer to keep track of the Polly statements.
    GPUStmt->stmt = (pet_stmt *)&Stmt;
    GPUStmt->accesses = getStmtAccesses(Stmt);
    i++;
  }

  return Stmts;
}

/// Derive the extent of an array.
///
/// The extent of an array is defined by the set of memory locations for
/// which a memory access in the iteration domain exists.
///
/// @param Array The array to derive the extent for.
///
/// @returns An isl_set describing the extent of the array.
__isl_give isl_set *getExtent(ScopArrayInfo *Array) {
  // Collect every accessed location in the scop, restricted to the executed
  // iterations, and keep only the part that lives in this array's space.
  isl_union_map *Accesses = S->getAccesses();
  Accesses = isl_union_map_intersect_domain(Accesses, S->getDomains());
  isl_union_set *AccessUSet = isl_union_map_range(Accesses);
  isl_set *AccessSet =
      isl_union_set_extract_set(AccessUSet, Array->getSpace());
  isl_union_set_free(AccessUSet);

  return AccessSet;
}

/// Derive the bounds of an array.
///
/// For the first dimension we derive the bound of the array from the extent
/// of this dimension. For inner dimensions we obtain their size directly from
/// ScopArrayInfo.
///
/// @param PPCGArray The array to compute bounds for.
/// @param Array The polly array from which to take the information.
void setArrayBounds(gpu_array_info &PPCGArray, ScopArrayInfo *Array) {
  if (PPCGArray.n_index > 0) {
    // Project the extent onto its outermost dimension and use
    // 'max(dim 0) + 1' as that dimension's bound.
    isl_set *Dom = isl_set_copy(PPCGArray.extent);
    Dom = isl_set_project_out(Dom, isl_dim_set, 1, PPCGArray.n_index - 1);
    isl_pw_aff *Bound = isl_set_dim_max(isl_set_copy(Dom), 0);
    isl_set_free(Dom);
    // Add the constant 1 as a piecewise affine expression defined over the
    // same domain as the maximum.
    Dom = isl_pw_aff_domain(isl_pw_aff_copy(Bound));
    isl_local_space *LS = isl_local_space_from_space(isl_set_get_space(Dom));
    isl_aff *One = isl_aff_zero_on_domain(LS);
    One = isl_aff_add_constant_si(One, 1);
    Bound = isl_pw_aff_add(Bound, isl_pw_aff_alloc(Dom, One));
    // Simplify the bound under the assumptions of the scop's context.
    Bound = isl_pw_aff_gist(Bound, S->getContext());
    PPCGArray.bound[0] = Bound;
  }

  // Inner dimensions: take the statically known size from ScopArrayInfo and
  // pull it back so its domain matches what the bound array stores.
  for (unsigned i = 1; i < PPCGArray.n_index; ++i) {
    isl_pw_aff *Bound = Array->getDimensionSizePw(i);
    auto LS = isl_pw_aff_get_domain_space(Bound);
    auto Aff = isl_multi_aff_zero(LS);
    Bound = isl_pw_aff_pullback_multi_aff(Bound, Aff);
    PPCGArray.bound[i] = Bound;
  }
}

/// Create the arrays for @p PPCGProg.
1171 /// 1172 /// @param PPCGProg The program to compute the arrays for. 1173 void createArrays(gpu_prog *PPCGProg) { 1174 int i = 0; 1175 for (auto &Element : S->arrays()) { 1176 ScopArrayInfo *Array = Element.second.get(); 1177 1178 std::string TypeName; 1179 raw_string_ostream OS(TypeName); 1180 1181 OS << *Array->getElementType(); 1182 TypeName = OS.str(); 1183 1184 gpu_array_info &PPCGArray = PPCGProg->array[i]; 1185 1186 PPCGArray.space = Array->getSpace(); 1187 PPCGArray.type = strdup(TypeName.c_str()); 1188 PPCGArray.size = Array->getElementType()->getPrimitiveSizeInBits() / 8; 1189 PPCGArray.name = strdup(Array->getName().c_str()); 1190 PPCGArray.extent = nullptr; 1191 PPCGArray.n_index = Array->getNumberOfDimensions(); 1192 PPCGArray.bound = 1193 isl_alloc_array(S->getIslCtx(), isl_pw_aff *, PPCGArray.n_index); 1194 PPCGArray.extent = getExtent(Array); 1195 PPCGArray.n_ref = 0; 1196 PPCGArray.refs = nullptr; 1197 PPCGArray.accessed = true; 1198 PPCGArray.read_only_scalar = false; 1199 PPCGArray.has_compound_element = false; 1200 PPCGArray.local = false; 1201 PPCGArray.declare_local = false; 1202 PPCGArray.global = false; 1203 PPCGArray.linearize = false; 1204 PPCGArray.dep_order = nullptr; 1205 1206 setArrayBounds(PPCGArray, Array); 1207 i++; 1208 1209 collect_references(PPCGProg, &PPCGArray); 1210 } 1211 } 1212 1213 /// Create an identity map between the arrays in the scop. 1214 /// 1215 /// @returns An identity map between the arrays in the scop. 1216 isl_union_map *getArrayIdentity() { 1217 isl_union_map *Maps = isl_union_map_empty(S->getParamSpace()); 1218 1219 for (auto &Item : S->arrays()) { 1220 ScopArrayInfo *Array = Item.second.get(); 1221 isl_space *Space = Array->getSpace(); 1222 Space = isl_space_map_from_set(Space); 1223 isl_map *Identity = isl_map_identity(Space); 1224 Maps = isl_union_map_add_map(Maps, Identity); 1225 } 1226 1227 return Maps; 1228 } 1229 1230 /// Create a default-initialized PPCG GPU program. 
///
/// @returns A new gpu program description.
gpu_prog *createPPCGProg(ppcg_scop *PPCGScop) {

  if (!PPCGScop)
    return nullptr;

  // isl_calloc_type zero-initializes all fields not set explicitly below.
  auto PPCGProg = isl_calloc_type(S->getIslCtx(), struct gpu_prog);

  PPCGProg->ctx = S->getIslCtx();
  PPCGProg->scop = PPCGScop;
  PPCGProg->context = isl_set_copy(PPCGScop->context);
  PPCGProg->read = isl_union_map_copy(PPCGScop->reads);
  PPCGProg->may_write = isl_union_map_copy(PPCGScop->may_writes);
  PPCGProg->must_write = isl_union_map_copy(PPCGScop->must_writes);
  PPCGProg->tagged_must_kill =
      isl_union_map_copy(PPCGScop->tagged_must_kills);
  // Map each array to itself; see getArrayIdentity().
  PPCGProg->to_inner = getArrayIdentity();
  PPCGProg->to_outer = getArrayIdentity();
  PPCGProg->may_persist = compute_may_persist(PPCGProg);
  PPCGProg->any_to_outer = nullptr;
  PPCGProg->array_order = nullptr;
  PPCGProg->n_stmts = std::distance(S->begin(), S->end());
  PPCGProg->stmts = getStatements();
  PPCGProg->n_array = std::distance(S->array_begin(), S->array_end());
  PPCGProg->array = isl_calloc_array(S->getIslCtx(), struct gpu_array_info,
                                     PPCGProg->n_array);

  createArrays(PPCGProg);

  return PPCGProg;
}

// User data threaded through ppcg's AST printing callbacks.
struct PrintGPUUserData {
  struct cuda_info *CudaInfo;
  struct gpu_prog *PPCGProg;
  // Kernels encountered while printing the host AST, collected by
  // printHostUser.
  std::vector<ppcg_kernel *> Kernels;
};

/// Print a user statement node in the host code.
///
/// We use ppcg's printing facilities to print the actual statement and
/// additionally build up a list of all kernels that are encountered in the
/// host ast.
///
/// @param P The printer to print to
/// @param Options The printing options to use
/// @param Node The node to print
/// @param User A user pointer to carry additional data. This pointer is
///             expected to be of type PrintGPUUserData.
///
/// @returns A printer to which the output has been printed.
1283 static __isl_give isl_printer * 1284 printHostUser(__isl_take isl_printer *P, 1285 __isl_take isl_ast_print_options *Options, 1286 __isl_take isl_ast_node *Node, void *User) { 1287 auto Data = (struct PrintGPUUserData *)User; 1288 auto Id = isl_ast_node_get_annotation(Node); 1289 1290 if (Id) { 1291 bool IsUser = !strcmp(isl_id_get_name(Id), "user"); 1292 1293 // If this is a user statement, format it ourselves as ppcg would 1294 // otherwise try to call pet functionality that is not available in 1295 // Polly. 1296 if (IsUser) { 1297 P = isl_printer_start_line(P); 1298 P = isl_printer_print_ast_node(P, Node); 1299 P = isl_printer_end_line(P); 1300 isl_id_free(Id); 1301 isl_ast_print_options_free(Options); 1302 return P; 1303 } 1304 1305 auto Kernel = (struct ppcg_kernel *)isl_id_get_user(Id); 1306 isl_id_free(Id); 1307 Data->Kernels.push_back(Kernel); 1308 } 1309 1310 return print_host_user(P, Options, Node, User); 1311 } 1312 1313 /// Print C code corresponding to the control flow in @p Kernel. 1314 /// 1315 /// @param Kernel The kernel to print 1316 void printKernel(ppcg_kernel *Kernel) { 1317 auto *P = isl_printer_to_str(S->getIslCtx()); 1318 P = isl_printer_set_output_format(P, ISL_FORMAT_C); 1319 auto *Options = isl_ast_print_options_alloc(S->getIslCtx()); 1320 P = isl_ast_node_print(Kernel->tree, P, Options); 1321 char *String = isl_printer_get_str(P); 1322 printf("%s\n", String); 1323 free(String); 1324 isl_printer_free(P); 1325 } 1326 1327 /// Print C code corresponding to the GPU code described by @p Tree. 1328 /// 1329 /// @param Tree An AST describing GPU code 1330 /// @param PPCGProg The PPCG program from which @Tree has been constructed. 
1331 void printGPUTree(isl_ast_node *Tree, gpu_prog *PPCGProg) { 1332 auto *P = isl_printer_to_str(S->getIslCtx()); 1333 P = isl_printer_set_output_format(P, ISL_FORMAT_C); 1334 1335 PrintGPUUserData Data; 1336 Data.PPCGProg = PPCGProg; 1337 1338 auto *Options = isl_ast_print_options_alloc(S->getIslCtx()); 1339 Options = 1340 isl_ast_print_options_set_print_user(Options, printHostUser, &Data); 1341 P = isl_ast_node_print(Tree, P, Options); 1342 char *String = isl_printer_get_str(P); 1343 printf("# host\n"); 1344 printf("%s\n", String); 1345 free(String); 1346 isl_printer_free(P); 1347 1348 for (auto Kernel : Data.Kernels) { 1349 printf("# kernel%d\n", Kernel->id); 1350 printKernel(Kernel); 1351 } 1352 } 1353 1354 // Generate a GPU program using PPCG. 1355 // 1356 // GPU mapping consists of multiple steps: 1357 // 1358 // 1) Compute new schedule for the program. 1359 // 2) Map schedule to GPU (TODO) 1360 // 3) Generate code for new schedule (TODO) 1361 // 1362 // We do not use here the Polly ScheduleOptimizer, as the schedule optimizer 1363 // is mostly CPU specific. Instead, we use PPCG's GPU code generation 1364 // strategy directly from this pass. 1365 gpu_gen *generateGPU(ppcg_scop *PPCGScop, gpu_prog *PPCGProg) { 1366 1367 auto PPCGGen = isl_calloc_type(S->getIslCtx(), struct gpu_gen); 1368 1369 PPCGGen->ctx = S->getIslCtx(); 1370 PPCGGen->options = PPCGScop->options; 1371 PPCGGen->print = nullptr; 1372 PPCGGen->print_user = nullptr; 1373 PPCGGen->build_ast_expr = &pollyBuildAstExprForStmt; 1374 PPCGGen->prog = PPCGProg; 1375 PPCGGen->tree = nullptr; 1376 PPCGGen->types.n = 0; 1377 PPCGGen->types.name = nullptr; 1378 PPCGGen->sizes = nullptr; 1379 PPCGGen->used_sizes = nullptr; 1380 PPCGGen->kernel_id = 0; 1381 1382 // Set scheduling strategy to same strategy PPCG is using. 
1383 isl_options_set_schedule_outer_coincidence(PPCGGen->ctx, true); 1384 isl_options_set_schedule_maximize_band_depth(PPCGGen->ctx, true); 1385 isl_options_set_schedule_whole_component(PPCGGen->ctx, false); 1386 1387 isl_schedule *Schedule = get_schedule(PPCGGen); 1388 1389 int has_permutable = has_any_permutable_node(Schedule); 1390 1391 if (!has_permutable || has_permutable < 0) { 1392 Schedule = isl_schedule_free(Schedule); 1393 } else { 1394 Schedule = map_to_device(PPCGGen, Schedule); 1395 PPCGGen->tree = generate_code(PPCGGen, isl_schedule_copy(Schedule)); 1396 } 1397 1398 if (DumpSchedule) { 1399 isl_printer *P = isl_printer_to_str(S->getIslCtx()); 1400 P = isl_printer_set_yaml_style(P, ISL_YAML_STYLE_BLOCK); 1401 P = isl_printer_print_str(P, "Schedule\n"); 1402 P = isl_printer_print_str(P, "========\n"); 1403 if (Schedule) 1404 P = isl_printer_print_schedule(P, Schedule); 1405 else 1406 P = isl_printer_print_str(P, "No schedule found\n"); 1407 1408 printf("%s\n", isl_printer_get_str(P)); 1409 isl_printer_free(P); 1410 } 1411 1412 if (DumpCode) { 1413 printf("Code\n"); 1414 printf("====\n"); 1415 if (PPCGGen->tree) 1416 printGPUTree(PPCGGen->tree, PPCGProg); 1417 else 1418 printf("No code generated\n"); 1419 } 1420 1421 isl_schedule_free(Schedule); 1422 1423 return PPCGGen; 1424 } 1425 1426 /// Free gpu_gen structure. 1427 /// 1428 /// @param PPCGGen The ppcg_gen object to free. 1429 void freePPCGGen(gpu_gen *PPCGGen) { 1430 isl_ast_node_free(PPCGGen->tree); 1431 isl_union_map_free(PPCGGen->sizes); 1432 isl_union_map_free(PPCGGen->used_sizes); 1433 free(PPCGGen); 1434 } 1435 1436 /// Free the options in the ppcg scop structure. 1437 /// 1438 /// ppcg is not freeing these options for us. To avoid leaks we do this 1439 /// ourselves. 1440 /// 1441 /// @param PPCGScop The scop referencing the options to free. 
void freeOptions(ppcg_scop *PPCGScop) {
  // Free the debug options first, then the options object itself, clearing
  // the pointers so the freed memory is not reachable afterwards.
  free(PPCGScop->options->debug);
  PPCGScop->options->debug = nullptr;
  free(PPCGScop->options);
  PPCGScop->options = nullptr;
}

/// Generate code for a given GPU AST described by @p Root.
///
/// @param Root An isl_ast_node pointing to the root of the GPU AST.
/// @param Prog The GPU Program to generate code for.
void generateCode(__isl_take isl_ast_node *Root, gpu_prog *Prog) {
  ScopAnnotator Annotator;
  Annotator.buildAliasScopes(*S);

  Region *R = &S->getRegion();

  // Ensure the region has a single entering and a single exiting edge so it
  // can be versioned below.
  simplifyRegion(R, DT, LI, RI);

  BasicBlock *EnteringBB = R->getEnteringBlock();

  PollyIRBuilder Builder = createPollyIRBuilder(EnteringBB, Annotator);

  GPUNodeBuilder NodeBuilder(Builder, Annotator, this, *DL, *LI, *SE, *DT, *S,
                             Prog);

  // Only build the run-time condition and parameters _after_ having
  // introduced the conditional branch. This is important as the conditional
  // branch will guard the original scop from new induction variables that
  // the SCEVExpander may introduce while code generating the parameters and
  // which may introduce scalar dependences that prevent us from correctly
  // code generating this scop.
  BasicBlock *StartBlock =
      executeScopConditionally(*S, this, Builder.getTrue());

  // TODO: Handle LICM
  // TODO: Verify run-time checks
  // Materialize the scop's parameters in the block that guards the new
  // code, then emit the GPU code starting at StartBlock.
  auto SplitBlock = StartBlock->getSinglePredecessor();
  Builder.SetInsertPoint(SplitBlock->getTerminator());
  NodeBuilder.addParameters(S->getContext());
  Builder.SetInsertPoint(&*StartBlock->begin());

  NodeBuilder.initializeAfterRTH();
  NodeBuilder.create(Root);
  NodeBuilder.finalize();
}

bool runOnScop(Scop &CurrentScop) override {
  // Cache the scop and the analyses code generation depends on.
  S = &CurrentScop;
  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
  DL = &S->getRegion().getEntry()->getParent()->getParent()->getDataLayout();
  RI = &getAnalysis<RegionInfoPass>().getRegionInfo();

  // We currently do not support scops with invariant loads.
  if (S->hasInvariantAccesses())
    return false;

  // Translate the scop into ppcg's representation, run ppcg's GPU mapping
  // and, if an AST was produced, emit LLVM-IR for it.
  auto PPCGScop = createPPCGScop();
  auto PPCGProg = createPPCGProg(PPCGScop);
  auto PPCGGen = generateGPU(PPCGScop, PPCGProg);

  if (PPCGGen->tree)
    generateCode(isl_ast_node_copy(PPCGGen->tree), PPCGProg);

  // Tear everything down in reverse dependency order; the options are not
  // released by ppcg_scop_free and need explicit freeing.
  freeOptions(PPCGScop);
  freePPCGGen(PPCGGen);
  gpu_prog_free(PPCGProg);
  ppcg_scop_free(PPCGScop);

  return true;
}

void printScop(raw_ostream &, Scop &) const override {}

void getAnalysisUsage(AnalysisUsage &AU) const override {
  AU.addRequired<DominatorTreeWrapperPass>();
  AU.addRequired<RegionInfoPass>();
  AU.addRequired<ScalarEvolutionWrapperPass>();
  AU.addRequired<ScopDetection>();
  AU.addRequired<ScopInfoRegionPass>();
  AU.addRequired<LoopInfoWrapperPass>();

  AU.addPreserved<AAResultsWrapperPass>();
  AU.addPreserved<BasicAAWrapperPass>();
  AU.addPreserved<LoopInfoWrapperPass>();
  AU.addPreserved<DominatorTreeWrapperPass>();
  AU.addPreserved<GlobalsAAWrapperPass>();
  AU.addPreserved<PostDominatorTreeWrapperPass>();
  AU.addPreserved<ScopDetection>();
  AU.addPreserved<ScalarEvolutionWrapperPass>();
  AU.addPreserved<SCEVAAWrapperPass>();

  // FIXME: We do not yet add regions for the newly generated code to the
  // region tree.
  AU.addPreserved<RegionInfoPass>();
  AU.addPreserved<ScopInfoRegionPass>();
}
};
}

char PPCGCodeGeneration::ID = 1;

Pass *polly::createPPCGCodeGenerationPass() { return new PPCGCodeGeneration(); }

// Register the pass and its analysis dependencies with the legacy pass
// manager.
INITIALIZE_PASS_BEGIN(PPCGCodeGeneration, "polly-codegen-ppcg",
                      "Polly - Apply PPCG translation to SCOP", false, false)
INITIALIZE_PASS_DEPENDENCY(DependenceInfo);
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass);
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass);
INITIALIZE_PASS_DEPENDENCY(RegionInfoPass);
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass);
INITIALIZE_PASS_DEPENDENCY(ScopDetection);
INITIALIZE_PASS_END(PPCGCodeGeneration, "polly-codegen-ppcg",
                    "Polly - Apply PPCG translation to SCOP", false, false)