//===------ PPCGCodeGeneration.cpp - Polly Accelerator Code Generation. ---===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// Take a scop created by ScopInfo and map it to GPU code using the ppcg
// GPU mapping strategy.
//
//===----------------------------------------------------------------------===//

#include "polly/CodeGen/IslNodeBuilder.h"
#include "polly/CodeGen/Utils.h"
#include "polly/DependenceInfo.h"
#include "polly/LinkAllPasses.h"
#include "polly/Options.h"
#include "polly/ScopInfo.h"
#include "polly/Support/SCEVValidator.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"

#include "isl/union_map.h"

// ppcg is a plain C library; give its headers C linkage.
extern "C" {
#include "ppcg/cuda.h"
#include "ppcg/gpu.h"
#include "ppcg/gpu_print.h"
#include "ppcg/ppcg.h"
#include "ppcg/schedule.h"
}

#include "llvm/Support/Debug.h"

using namespace polly;
using namespace llvm;

#define DEBUG_TYPE "polly-codegen-ppcg"

// Command-line flags controlling debug output and code generation of the
// GPU backend. All are hidden and off by default.

static cl::opt<bool> DumpSchedule("polly-acc-dump-schedule",
                                  cl::desc("Dump the computed GPU Schedule"),
                                  cl::Hidden, cl::init(false), cl::ZeroOrMore,
                                  cl::cat(PollyCategory));

static cl::opt<bool>
    DumpCode("polly-acc-dump-code",
             cl::desc("Dump C code describing the GPU mapping"), cl::Hidden,
             cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));

static cl::opt<bool> DumpKernelIR("polly-acc-dump-kernel-ir",
                                  cl::desc("Dump the kernel LLVM-IR"),
                                  cl::Hidden, cl::init(false), cl::ZeroOrMore,
                                  cl::cat(PollyCategory));

static cl::opt<bool> DumpKernelASM("polly-acc-dump-kernel-asm",
                                   cl::desc("Dump the kernel assembly code"),
                                   cl::Hidden, cl::init(false), cl::ZeroOrMore,
                                   cl::cat(PollyCategory));

static cl::opt<bool> FastMath("polly-acc-fastmath",
                              cl::desc("Allow unsafe math optimizations"),
                              cl::Hidden, cl::init(false), cl::ZeroOrMore,
                              cl::cat(PollyCategory));

static cl::opt<std::string>
    CudaVersion("polly-acc-cuda-version",
                cl::desc("The CUDA version to compile for"), cl::Hidden,
                cl::init("sm_30"), cl::ZeroOrMore, cl::cat(PollyCategory));

/// Create the ast expressions for a ScopStmt.
///
/// This function is a callback to generate the ast expressions for each
/// of the scheduled ScopStmts.
88 static __isl_give isl_id_to_ast_expr *pollyBuildAstExprForStmt( 89 void *StmtT, isl_ast_build *Build, 90 isl_multi_pw_aff *(*FunctionIndex)(__isl_take isl_multi_pw_aff *MPA, 91 isl_id *Id, void *User), 92 void *UserIndex, 93 isl_ast_expr *(*FunctionExpr)(isl_ast_expr *Expr, isl_id *Id, void *User), 94 void *UserExpr) { 95 96 ScopStmt *Stmt = (ScopStmt *)StmtT; 97 98 isl_ctx *Ctx; 99 100 if (!Stmt || !Build) 101 return NULL; 102 103 Ctx = isl_ast_build_get_ctx(Build); 104 isl_id_to_ast_expr *RefToExpr = isl_id_to_ast_expr_alloc(Ctx, 0); 105 106 for (MemoryAccess *Acc : *Stmt) { 107 isl_map *AddrFunc = Acc->getAddressFunction(); 108 AddrFunc = isl_map_intersect_domain(AddrFunc, Stmt->getDomain()); 109 isl_id *RefId = Acc->getId(); 110 isl_pw_multi_aff *PMA = isl_pw_multi_aff_from_map(AddrFunc); 111 isl_multi_pw_aff *MPA = isl_multi_pw_aff_from_pw_multi_aff(PMA); 112 MPA = isl_multi_pw_aff_coalesce(MPA); 113 MPA = FunctionIndex(MPA, RefId, UserIndex); 114 isl_ast_expr *Access = isl_ast_build_access_from_multi_pw_aff(Build, MPA); 115 Access = FunctionExpr(Access, RefId, UserExpr); 116 RefToExpr = isl_id_to_ast_expr_set(RefToExpr, RefId, Access); 117 } 118 119 return RefToExpr; 120 } 121 122 /// Generate code for a GPU specific isl AST. 123 /// 124 /// The GPUNodeBuilder augments the general existing IslNodeBuilder, which 125 /// generates code for general-prupose AST nodes, with special functionality 126 /// for generating GPU specific user nodes. 127 /// 128 /// @see GPUNodeBuilder::createUser 129 class GPUNodeBuilder : public IslNodeBuilder { 130 public: 131 GPUNodeBuilder(PollyIRBuilder &Builder, ScopAnnotator &Annotator, Pass *P, 132 const DataLayout &DL, LoopInfo &LI, ScalarEvolution &SE, 133 DominatorTree &DT, Scop &S, gpu_prog *Prog) 134 : IslNodeBuilder(Builder, Annotator, P, DL, LI, SE, DT, S), Prog(Prog) { 135 getExprBuilder().setIDToSAI(&IDToSAI); 136 } 137 138 private: 139 /// A vector of array base pointers for which a new ScopArrayInfo was created. 
140 /// 141 /// This vector is used to delete the ScopArrayInfo when it is not needed any 142 /// more. 143 std::vector<Value *> LocalArrays; 144 145 /// A module containing GPU code. 146 /// 147 /// This pointer is only set in case we are currently generating GPU code. 148 std::unique_ptr<Module> GPUModule; 149 150 /// The GPU program we generate code for. 151 gpu_prog *Prog; 152 153 /// Class to free isl_ids. 154 class IslIdDeleter { 155 public: 156 void operator()(__isl_take isl_id *Id) { isl_id_free(Id); }; 157 }; 158 159 /// A set containing all isl_ids allocated in a GPU kernel. 160 /// 161 /// By releasing this set all isl_ids will be freed. 162 std::set<std::unique_ptr<isl_id, IslIdDeleter>> KernelIDs; 163 164 IslExprBuilder::IDToScopArrayInfoTy IDToSAI; 165 166 /// Create code for user-defined AST nodes. 167 /// 168 /// These AST nodes can be of type: 169 /// 170 /// - ScopStmt: A computational statement (TODO) 171 /// - Kernel: A GPU kernel call (TODO) 172 /// - Data-Transfer: A GPU <-> CPU data-transfer (TODO) 173 /// - In-kernel synchronization 174 /// - In-kernel memory copy statement 175 /// 176 /// @param UserStmt The ast node to generate code for. 177 virtual void createUser(__isl_take isl_ast_node *UserStmt); 178 179 /// Find llvm::Values referenced in GPU kernel. 180 /// 181 /// @param Kernel The kernel to scan for llvm::Values 182 /// 183 /// @returns A set of values referenced by the kernel. 184 SetVector<Value *> getReferencesInKernel(ppcg_kernel *Kernel); 185 186 /// Create GPU kernel. 187 /// 188 /// Code generate the kernel described by @p KernelStmt. 189 /// 190 /// @param KernelStmt The ast node to generate kernel code for. 191 void createKernel(__isl_take isl_ast_node *KernelStmt); 192 193 /// Create kernel function. 194 /// 195 /// Create a kernel function located in a newly created module that can serve 196 /// as target for device code generation. Set the Builder to point to the 197 /// start block of this newly created function. 
198 /// 199 /// @param Kernel The kernel to generate code for. 200 /// @param SubtreeValues The set of llvm::Values referenced by this kernel. 201 void createKernelFunction(ppcg_kernel *Kernel, 202 SetVector<Value *> &SubtreeValues); 203 204 /// Create the declaration of a kernel function. 205 /// 206 /// The kernel function takes as arguments: 207 /// 208 /// - One i8 pointer for each external array reference used in the kernel. 209 /// - Host iterators 210 /// - Parameters 211 /// - Other LLVM Value references (TODO) 212 /// 213 /// @param Kernel The kernel to generate the function declaration for. 214 /// @param SubtreeValues The set of llvm::Values referenced by this kernel. 215 /// 216 /// @returns The newly declared function. 217 Function *createKernelFunctionDecl(ppcg_kernel *Kernel, 218 SetVector<Value *> &SubtreeValues); 219 220 /// Insert intrinsic functions to obtain thread and block ids. 221 /// 222 /// @param The kernel to generate the intrinsic functions for. 223 void insertKernelIntrinsics(ppcg_kernel *Kernel); 224 225 /// Create code for a ScopStmt called in @p Expr. 226 /// 227 /// @param Expr The expression containing the call. 228 /// @param KernelStmt The kernel statement referenced in the call. 229 void createScopStmt(isl_ast_expr *Expr, ppcg_kernel_stmt *KernelStmt); 230 231 /// Create an in-kernel synchronization call. 232 void createKernelSync(); 233 234 /// Create a PTX assembly string for the current GPU kernel. 235 /// 236 /// @returns A string containing the corresponding PTX assembly code. 237 std::string createKernelASM(); 238 239 /// Remove references from the dominator tree to the kernel function @p F. 240 /// 241 /// @param F The function to remove references to. 242 void clearDominators(Function *F); 243 244 /// Remove references from scalar evolution to the kernel function @p F. 245 /// 246 /// @param F The function to remove references to. 
247 void clearScalarEvolution(Function *F); 248 249 /// Remove references from loop info to the kernel function @p F. 250 /// 251 /// @param F The function to remove references to. 252 void clearLoops(Function *F); 253 254 /// Finalize the generation of the kernel function. 255 /// 256 /// Free the LLVM-IR module corresponding to the kernel and -- if requested -- 257 /// dump its IR to stderr. 258 void finalizeKernelFunction(); 259 }; 260 261 /// Check if one string is a prefix of another. 262 /// 263 /// @param String The string in which to look for the prefix. 264 /// @param Prefix The prefix to look for. 265 static bool isPrefix(std::string String, std::string Prefix) { 266 return String.find(Prefix) == 0; 267 } 268 269 void GPUNodeBuilder::createUser(__isl_take isl_ast_node *UserStmt) { 270 isl_ast_expr *Expr = isl_ast_node_user_get_expr(UserStmt); 271 isl_ast_expr *StmtExpr = isl_ast_expr_get_op_arg(Expr, 0); 272 isl_id *Id = isl_ast_expr_get_id(StmtExpr); 273 isl_id_free(Id); 274 isl_ast_expr_free(StmtExpr); 275 276 const char *Str = isl_id_get_name(Id); 277 if (!strcmp(Str, "kernel")) { 278 createKernel(UserStmt); 279 isl_ast_expr_free(Expr); 280 return; 281 } 282 283 if (isPrefix(Str, "to_device") || isPrefix(Str, "from_device")) { 284 // TODO: Insert memory copies 285 isl_ast_expr_free(Expr); 286 isl_ast_node_free(UserStmt); 287 return; 288 } 289 290 isl_id *Anno = isl_ast_node_get_annotation(UserStmt); 291 struct ppcg_kernel_stmt *KernelStmt = 292 (struct ppcg_kernel_stmt *)isl_id_get_user(Anno); 293 isl_id_free(Anno); 294 295 switch (KernelStmt->type) { 296 case ppcg_kernel_domain: 297 createScopStmt(Expr, KernelStmt); 298 isl_ast_node_free(UserStmt); 299 return; 300 case ppcg_kernel_copy: 301 // TODO: Create kernel copy stmt 302 isl_ast_expr_free(Expr); 303 isl_ast_node_free(UserStmt); 304 return; 305 case ppcg_kernel_sync: 306 createKernelSync(); 307 isl_ast_expr_free(Expr); 308 isl_ast_node_free(UserStmt); 309 return; 310 } 311 312 
isl_ast_expr_free(Expr); 313 isl_ast_node_free(UserStmt); 314 return; 315 } 316 317 void GPUNodeBuilder::createScopStmt(isl_ast_expr *Expr, 318 ppcg_kernel_stmt *KernelStmt) { 319 auto Stmt = (ScopStmt *)KernelStmt->u.d.stmt->stmt; 320 isl_id_to_ast_expr *Indexes = KernelStmt->u.d.ref2expr; 321 322 LoopToScevMapT LTS; 323 LTS.insert(OutsideLoopIterations.begin(), OutsideLoopIterations.end()); 324 325 createSubstitutions(Expr, Stmt, LTS); 326 327 if (Stmt->isBlockStmt()) 328 BlockGen.copyStmt(*Stmt, LTS, Indexes); 329 else 330 assert(0 && "Region statement not supported\n"); 331 } 332 333 void GPUNodeBuilder::createKernelSync() { 334 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 335 auto *Sync = Intrinsic::getDeclaration(M, Intrinsic::nvvm_barrier0); 336 Builder.CreateCall(Sync, {}); 337 } 338 339 /// Collect llvm::Values referenced from @p Node 340 /// 341 /// This function only applies to isl_ast_nodes that are user_nodes referring 342 /// to a ScopStmt. All other node types are ignore. 343 /// 344 /// @param Node The node to collect references for. 345 /// @param User A user pointer used as storage for the data that is collected. 346 /// 347 /// @returns isl_bool_true if data could be collected successfully. 
isl_bool collectReferencesInGPUStmt(__isl_keep isl_ast_node *Node, void *User) {
  // Only user nodes can reference a ScopStmt; skip everything else.
  if (isl_ast_node_get_type(Node) != isl_ast_node_user)
    return isl_bool_true;

  isl_ast_expr *Expr = isl_ast_node_user_get_expr(Node);
  isl_ast_expr *StmtExpr = isl_ast_expr_get_op_arg(Expr, 0);
  isl_id *Id = isl_ast_expr_get_id(StmtExpr);
  const char *Str = isl_id_get_name(Id);
  isl_id_free(Id);
  isl_ast_expr_free(StmtExpr);
  isl_ast_expr_free(Expr);

  // Only nodes named "Stmt..." wrap a polly::ScopStmt.
  if (!isPrefix(Str, "Stmt"))
    return isl_bool_true;

  // The annotation carries a ppcg_kernel_stmt whose pet stmt pointer was
  // repurposed to store the polly::ScopStmt (see getStatements()).
  Id = isl_ast_node_get_annotation(Node);
  auto *KernelStmt = (ppcg_kernel_stmt *)isl_id_get_user(Id);
  auto Stmt = (ScopStmt *)KernelStmt->u.d.stmt->stmt;
  isl_id_free(Id);

  addReferencesFromStmt(Stmt, User);

  return isl_bool_true;
}

SetVector<Value *> GPUNodeBuilder::getReferencesInKernel(ppcg_kernel *Kernel) {
  SetVector<Value *> SubtreeValues;
  SetVector<const SCEV *> SCEVs;
  SetVector<const Loop *> Loops;
  SubtreeReferences References = {
      LI, SE, S, ValueMap, SubtreeValues, SCEVs, getBlockGenerator()};

  // Start with all values the host-side code generator already knows about.
  for (const auto &I : IDToValue)
    SubtreeValues.insert(I.second);

  // Add all values referenced from statements inside the kernel's AST.
  isl_ast_node_foreach_descendant_top_down(
      Kernel->tree, collectReferencesInGPUStmt, &References);

  for (const SCEV *Expr : SCEVs)
    findValues(Expr, SE, SubtreeValues);

  // Array base pointers are passed explicitly as kernel arguments; drop them.
  for (auto &SAI : S.arrays())
    SubtreeValues.remove(SAI.second->getBasePtr());

  // Scop parameters are passed explicitly as well; drop them too.
  isl_space *Space = S.getParamSpace();
  for (long i = 0; i < isl_space_dim(Space, isl_dim_param); i++) {
    isl_id *Id = isl_space_get_dim_id(Space, isl_dim_param, i);
    assert(IDToValue.count(Id));
    Value *Val = IDToValue[Id];
    SubtreeValues.remove(Val);
    isl_id_free(Id);
  }
  isl_space_free(Space);

  // Host iterators become explicit kernel arguments; drop them as well.
  for (long i = 0; i < isl_space_dim(Kernel->space, isl_dim_set); i++) {
    isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i);
    assert(IDToValue.count(Id));
    Value *Val = IDToValue[Id];
    SubtreeValues.remove(Val);
    isl_id_free(Id);
  }

  return SubtreeValues;
}

void GPUNodeBuilder::clearDominators(Function *F) {
  // Collect the blocks in post-order first: erasing a node whose children
  // are still present is not allowed, so children must be erased before
  // their parents.
  DomTreeNode *N = DT.getNode(&F->getEntryBlock());
  std::vector<BasicBlock *> Nodes;
  for (po_iterator<DomTreeNode *> I = po_begin(N), E = po_end(N); I != E; ++I)
    Nodes.push_back(I->getBlock());

  for (BasicBlock *BB : Nodes)
    DT.eraseNode(BB);
}

void GPUNodeBuilder::clearScalarEvolution(Function *F) {
  for (BasicBlock &BB : *F) {
    Loop *L = LI.getLoopFor(&BB);
    if (L)
      SE.forgetLoop(L);
  }
}

void GPUNodeBuilder::clearLoops(Function *F) {
  for (BasicBlock &BB : *F) {
    Loop *L = LI.getLoopFor(&BB);
    // NOTE(review): SE.forgetLoop here repeats what clearScalarEvolution
    // already did for the same blocks -- presumably harmless; confirm.
    if (L)
      SE.forgetLoop(L);
    LI.removeBlock(&BB);
  }
}

void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) {
  isl_id *Id = isl_ast_node_get_annotation(KernelStmt);
  ppcg_kernel *Kernel = (ppcg_kernel *)isl_id_get_user(Id);
  isl_id_free(Id);
  isl_ast_node_free(KernelStmt);

  SetVector<Value *> SubtreeValues = getReferencesInKernel(Kernel);

  assert(Kernel->tree && "Device AST of kernel node is empty");

  // Remember the host-side code generation state so it can be restored
  // after the kernel (which uses the same builder and maps) is emitted.
  Instruction &HostInsertPoint = *Builder.GetInsertPoint();
  IslExprBuilder::IDToValueTy HostIDs = IDToValue;
  ValueMapT HostValueMap = ValueMap;

  // NOTE(review): Loops is never populated, so the loop below is currently
  // a no-op placeholder -- presumably getReferencesInKernel is meant to
  // report the loops the kernel depends on; confirm before relying on it.
  SetVector<const Loop *> Loops;

  // Create for all loops we depend on values that contain the current loop
  // iteration. These values are necessary to generate code for SCEVs that
  // depend on such loops. As a result we need to pass them to the subfunction.
  for (const Loop *L : Loops) {
    const SCEV *OuterLIV = SE.getAddRecExpr(SE.getUnknown(Builder.getInt64(0)),
                                            SE.getUnknown(Builder.getInt64(1)),
                                            L, SCEV::FlagAnyWrap);
    Value *V = generateSCEV(OuterLIV);
    OutsideLoopIterations[L] = SE.getUnknown(V);
    SubtreeValues.insert(V);
  }

  createKernelFunction(Kernel, SubtreeValues);

  // Generate the device code for the kernel's AST into the GPU module.
  create(isl_ast_node_copy(Kernel->tree));

  // Scrub all references to the (temporary) kernel function from the host
  // analyses before switching back to host code generation.
  Function *F = Builder.GetInsertBlock()->getParent();
  clearDominators(F);
  clearScalarEvolution(F);
  clearLoops(F);

  // Restore the host-side state saved above.
  Builder.SetInsertPoint(&HostInsertPoint);
  IDToValue = HostIDs;

  ValueMap = HostValueMap;
  ScalarMap.clear();
  PHIOpMap.clear();
  EscapeMap.clear();
  IDToSAI.clear();
  Annotator.resetAlternativeAliasBases();
  for (auto &BasePtr : LocalArrays)
    S.invalidateScopArrayInfo(BasePtr, ScopArrayInfo::MK_Array);
  LocalArrays.clear();

  finalizeKernelFunction();
}

/// Compute the DataLayout string for the NVPTX backend.
///
/// @param is64Bit Are we looking for a 64 bit architecture?
496 static std::string computeNVPTXDataLayout(bool is64Bit) { 497 std::string Ret = "e"; 498 499 if (!is64Bit) 500 Ret += "-p:32:32"; 501 502 Ret += "-i64:64-v16:16-v32:32-n16:32:64"; 503 504 return Ret; 505 } 506 507 Function * 508 GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel, 509 SetVector<Value *> &SubtreeValues) { 510 std::vector<Type *> Args; 511 std::string Identifier = "kernel_" + std::to_string(Kernel->id); 512 513 for (long i = 0; i < Prog->n_array; i++) { 514 if (!ppcg_kernel_requires_array_argument(Kernel, i)) 515 continue; 516 517 Args.push_back(Builder.getInt8PtrTy()); 518 } 519 520 int NumHostIters = isl_space_dim(Kernel->space, isl_dim_set); 521 522 for (long i = 0; i < NumHostIters; i++) 523 Args.push_back(Builder.getInt64Ty()); 524 525 int NumVars = isl_space_dim(Kernel->space, isl_dim_param); 526 527 for (long i = 0; i < NumVars; i++) 528 Args.push_back(Builder.getInt64Ty()); 529 530 for (auto *V : SubtreeValues) 531 Args.push_back(V->getType()); 532 533 auto *FT = FunctionType::get(Builder.getVoidTy(), Args, false); 534 auto *FN = Function::Create(FT, Function::ExternalLinkage, Identifier, 535 GPUModule.get()); 536 FN->setCallingConv(CallingConv::PTX_Kernel); 537 538 auto Arg = FN->arg_begin(); 539 for (long i = 0; i < Kernel->n_array; i++) { 540 if (!ppcg_kernel_requires_array_argument(Kernel, i)) 541 continue; 542 543 Arg->setName(Kernel->array[i].array->name); 544 545 isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set); 546 const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl_id_copy(Id)); 547 Type *EleTy = SAI->getElementType(); 548 Value *Val = &*Arg; 549 SmallVector<const SCEV *, 4> Sizes; 550 isl_ast_build *Build = 551 isl_ast_build_from_context(isl_set_copy(Prog->context)); 552 for (long j = 1; j < Kernel->array[i].array->n_index; j++) { 553 isl_ast_expr *DimSize = isl_ast_build_expr_from_pw_aff( 554 Build, isl_pw_aff_copy(Kernel->array[i].array->bound[j])); 555 auto V = ExprBuilder.create(DimSize); 
556 Sizes.push_back(SE.getSCEV(V)); 557 } 558 const ScopArrayInfo *SAIRep = 559 S.getOrCreateScopArrayInfo(Val, EleTy, Sizes, ScopArrayInfo::MK_Array); 560 LocalArrays.push_back(Val); 561 562 isl_ast_build_free(Build); 563 isl_id_free(Id); 564 IDToSAI[Id] = SAIRep; 565 Arg++; 566 } 567 568 for (long i = 0; i < NumHostIters; i++) { 569 isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i); 570 Arg->setName(isl_id_get_name(Id)); 571 IDToValue[Id] = &*Arg; 572 KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id)); 573 Arg++; 574 } 575 576 for (long i = 0; i < NumVars; i++) { 577 isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_param, i); 578 Arg->setName(isl_id_get_name(Id)); 579 IDToValue[Id] = &*Arg; 580 KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id)); 581 Arg++; 582 } 583 584 for (auto *V : SubtreeValues) { 585 Arg->setName(V->getName()); 586 ValueMap[V] = &*Arg; 587 Arg++; 588 } 589 590 return FN; 591 } 592 593 void GPUNodeBuilder::insertKernelIntrinsics(ppcg_kernel *Kernel) { 594 Intrinsic::ID IntrinsicsBID[] = {Intrinsic::nvvm_read_ptx_sreg_ctaid_x, 595 Intrinsic::nvvm_read_ptx_sreg_ctaid_y}; 596 597 Intrinsic::ID IntrinsicsTID[] = {Intrinsic::nvvm_read_ptx_sreg_tid_x, 598 Intrinsic::nvvm_read_ptx_sreg_tid_y, 599 Intrinsic::nvvm_read_ptx_sreg_tid_z}; 600 601 auto addId = [this](__isl_take isl_id *Id, Intrinsic::ID Intr) mutable { 602 std::string Name = isl_id_get_name(Id); 603 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 604 Function *IntrinsicFn = Intrinsic::getDeclaration(M, Intr); 605 Value *Val = Builder.CreateCall(IntrinsicFn, {}); 606 Val = Builder.CreateIntCast(Val, Builder.getInt64Ty(), false, Name); 607 IDToValue[Id] = Val; 608 KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id)); 609 }; 610 611 for (int i = 0; i < Kernel->n_grid; ++i) { 612 isl_id *Id = isl_id_list_get_id(Kernel->block_ids, i); 613 addId(Id, IntrinsicsBID[i]); 614 } 615 616 for (int i = 0; i < Kernel->n_block; ++i) { 617 
isl_id *Id = isl_id_list_get_id(Kernel->thread_ids, i); 618 addId(Id, IntrinsicsTID[i]); 619 } 620 } 621 622 void GPUNodeBuilder::createKernelFunction(ppcg_kernel *Kernel, 623 SetVector<Value *> &SubtreeValues) { 624 625 std::string Identifier = "kernel_" + std::to_string(Kernel->id); 626 GPUModule.reset(new Module(Identifier, Builder.getContext())); 627 GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-cuda")); 628 GPUModule->setDataLayout(computeNVPTXDataLayout(true /* is64Bit */)); 629 630 Function *FN = createKernelFunctionDecl(Kernel, SubtreeValues); 631 632 BasicBlock *PrevBlock = Builder.GetInsertBlock(); 633 auto EntryBlock = BasicBlock::Create(Builder.getContext(), "entry", FN); 634 635 DominatorTree &DT = P->getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 636 DT.addNewBlock(EntryBlock, PrevBlock); 637 638 Builder.SetInsertPoint(EntryBlock); 639 Builder.CreateRetVoid(); 640 Builder.SetInsertPoint(EntryBlock, EntryBlock->begin()); 641 642 insertKernelIntrinsics(Kernel); 643 } 644 645 std::string GPUNodeBuilder::createKernelASM() { 646 llvm::Triple GPUTriple(Triple::normalize("nvptx64-nvidia-cuda")); 647 std::string ErrMsg; 648 auto GPUTarget = TargetRegistry::lookupTarget(GPUTriple.getTriple(), ErrMsg); 649 650 if (!GPUTarget) { 651 errs() << ErrMsg << "\n"; 652 return ""; 653 } 654 655 TargetOptions Options; 656 Options.UnsafeFPMath = FastMath; 657 std::unique_ptr<TargetMachine> TargetM( 658 GPUTarget->createTargetMachine(GPUTriple.getTriple(), CudaVersion, "", 659 Options, Optional<Reloc::Model>())); 660 661 SmallString<0> ASMString; 662 raw_svector_ostream ASMStream(ASMString); 663 llvm::legacy::PassManager PM; 664 665 PM.add(createTargetTransformInfoWrapperPass(TargetM->getTargetIRAnalysis())); 666 667 if (TargetM->addPassesToEmitFile( 668 PM, ASMStream, TargetMachine::CGFT_AssemblyFile, true /* verify */)) { 669 errs() << "The target does not support generation of this file type!\n"; 670 return ""; 671 } 672 673 PM.run(*GPUModule); 674 
675 return ASMStream.str(); 676 } 677 678 void GPUNodeBuilder::finalizeKernelFunction() { 679 // Verify module. 680 llvm::legacy::PassManager Passes; 681 Passes.add(createVerifierPass()); 682 Passes.run(*GPUModule); 683 684 if (DumpKernelIR) 685 outs() << *GPUModule << "\n"; 686 687 // Optimize module. 688 llvm::legacy::PassManager OptPasses; 689 PassManagerBuilder PassBuilder; 690 PassBuilder.OptLevel = 3; 691 PassBuilder.SizeLevel = 0; 692 PassBuilder.populateModulePassManager(OptPasses); 693 OptPasses.run(*GPUModule); 694 695 std::string Assembly = createKernelASM(); 696 697 if (DumpKernelASM) 698 outs() << Assembly << "\n"; 699 700 GPUModule.release(); 701 KernelIDs.clear(); 702 } 703 704 namespace { 705 class PPCGCodeGeneration : public ScopPass { 706 public: 707 static char ID; 708 709 /// The scop that is currently processed. 710 Scop *S; 711 712 LoopInfo *LI; 713 DominatorTree *DT; 714 ScalarEvolution *SE; 715 const DataLayout *DL; 716 RegionInfo *RI; 717 718 PPCGCodeGeneration() : ScopPass(ID) {} 719 720 /// Construct compilation options for PPCG. 721 /// 722 /// @returns The compilation options. 
723 ppcg_options *createPPCGOptions() { 724 auto DebugOptions = 725 (ppcg_debug_options *)malloc(sizeof(ppcg_debug_options)); 726 auto Options = (ppcg_options *)malloc(sizeof(ppcg_options)); 727 728 DebugOptions->dump_schedule_constraints = false; 729 DebugOptions->dump_schedule = false; 730 DebugOptions->dump_final_schedule = false; 731 DebugOptions->dump_sizes = false; 732 733 Options->debug = DebugOptions; 734 735 Options->reschedule = true; 736 Options->scale_tile_loops = false; 737 Options->wrap = false; 738 739 Options->non_negative_parameters = false; 740 Options->ctx = nullptr; 741 Options->sizes = nullptr; 742 743 Options->tile_size = 32; 744 745 Options->use_private_memory = false; 746 Options->use_shared_memory = false; 747 Options->max_shared_memory = 0; 748 749 Options->target = PPCG_TARGET_CUDA; 750 Options->openmp = false; 751 Options->linearize_device_arrays = true; 752 Options->live_range_reordering = false; 753 754 Options->opencl_compiler_options = nullptr; 755 Options->opencl_use_gpu = false; 756 Options->opencl_n_include_file = 0; 757 Options->opencl_include_files = nullptr; 758 Options->opencl_print_kernel_types = false; 759 Options->opencl_embed_kernel_code = false; 760 761 Options->save_schedule_file = nullptr; 762 Options->load_schedule_file = nullptr; 763 764 return Options; 765 } 766 767 /// Get a tagged access relation containing all accesses of type @p AccessTy. 768 /// 769 /// Instead of a normal access of the form: 770 /// 771 /// Stmt[i,j,k] -> Array[f_0(i,j,k), f_1(i,j,k)] 772 /// 773 /// a tagged access has the form 774 /// 775 /// [Stmt[i,j,k] -> id[]] -> Array[f_0(i,j,k), f_1(i,j,k)] 776 /// 777 /// where 'id' is an additional space that references the memory access that 778 /// triggered the access. 779 /// 780 /// @param AccessTy The type of the memory accesses to collect. 781 /// 782 /// @return The relation describing all tagged memory accesses. 
783 isl_union_map *getTaggedAccesses(enum MemoryAccess::AccessType AccessTy) { 784 isl_union_map *Accesses = isl_union_map_empty(S->getParamSpace()); 785 786 for (auto &Stmt : *S) 787 for (auto &Acc : Stmt) 788 if (Acc->getType() == AccessTy) { 789 isl_map *Relation = Acc->getAccessRelation(); 790 Relation = isl_map_intersect_domain(Relation, Stmt.getDomain()); 791 792 isl_space *Space = isl_map_get_space(Relation); 793 Space = isl_space_range(Space); 794 Space = isl_space_from_range(Space); 795 Space = isl_space_set_tuple_id(Space, isl_dim_in, Acc->getId()); 796 isl_map *Universe = isl_map_universe(Space); 797 Relation = isl_map_domain_product(Relation, Universe); 798 Accesses = isl_union_map_add_map(Accesses, Relation); 799 } 800 801 return Accesses; 802 } 803 804 /// Get the set of all read accesses, tagged with the access id. 805 /// 806 /// @see getTaggedAccesses 807 isl_union_map *getTaggedReads() { 808 return getTaggedAccesses(MemoryAccess::READ); 809 } 810 811 /// Get the set of all may (and must) accesses, tagged with the access id. 812 /// 813 /// @see getTaggedAccesses 814 isl_union_map *getTaggedMayWrites() { 815 return isl_union_map_union(getTaggedAccesses(MemoryAccess::MAY_WRITE), 816 getTaggedAccesses(MemoryAccess::MUST_WRITE)); 817 } 818 819 /// Get the set of all must accesses, tagged with the access id. 820 /// 821 /// @see getTaggedAccesses 822 isl_union_map *getTaggedMustWrites() { 823 return getTaggedAccesses(MemoryAccess::MUST_WRITE); 824 } 825 826 /// Collect parameter and array names as isl_ids. 827 /// 828 /// To reason about the different parameters and arrays used, ppcg requires 829 /// a list of all isl_ids in use. As PPCG traditionally performs 830 /// source-to-source compilation each of these isl_ids is mapped to the 831 /// expression that represents it. As we do not have a corresponding 832 /// expression in Polly, we just map each id to a 'zero' expression to match 833 /// the data format that ppcg expects. 
834 /// 835 /// @returns Retun a map from collected ids to 'zero' ast expressions. 836 __isl_give isl_id_to_ast_expr *getNames() { 837 auto *Names = isl_id_to_ast_expr_alloc( 838 S->getIslCtx(), 839 S->getNumParams() + std::distance(S->array_begin(), S->array_end())); 840 auto *Zero = isl_ast_expr_from_val(isl_val_zero(S->getIslCtx())); 841 auto *Space = S->getParamSpace(); 842 843 for (int I = 0, E = S->getNumParams(); I < E; ++I) { 844 isl_id *Id = isl_space_get_dim_id(Space, isl_dim_param, I); 845 Names = isl_id_to_ast_expr_set(Names, Id, isl_ast_expr_copy(Zero)); 846 } 847 848 for (auto &Array : S->arrays()) { 849 auto Id = Array.second->getBasePtrId(); 850 Names = isl_id_to_ast_expr_set(Names, Id, isl_ast_expr_copy(Zero)); 851 } 852 853 isl_space_free(Space); 854 isl_ast_expr_free(Zero); 855 856 return Names; 857 } 858 859 /// Create a new PPCG scop from the current scop. 860 /// 861 /// The PPCG scop is initialized with data from the current polly::Scop. From 862 /// this initial data, the data-dependences in the PPCG scop are initialized. 863 /// We do not use Polly's dependence analysis for now, to ensure we match 864 /// the PPCG default behaviour more closely. 865 /// 866 /// @returns A new ppcg scop. 
  ppcg_scop *createPPCGScop() {
    // ppcg frees this struct with free(), so it is malloc'ed.
    auto PPCGScop = (ppcg_scop *)malloc(sizeof(ppcg_scop));

    PPCGScop->options = createPPCGOptions();

    // No source locations available; we do not come from a C source file.
    PPCGScop->start = 0;
    PPCGScop->end = 0;

    PPCGScop->context = S->getContext();
    PPCGScop->domain = S->getDomains();
    PPCGScop->call = nullptr;
    PPCGScop->tagged_reads = getTaggedReads();
    PPCGScop->reads = S->getReads();
    PPCGScop->live_in = nullptr;
    PPCGScop->tagged_may_writes = getTaggedMayWrites();
    PPCGScop->may_writes = S->getWrites();
    PPCGScop->tagged_must_writes = getTaggedMustWrites();
    PPCGScop->must_writes = S->getMustWrites();
    PPCGScop->live_out = nullptr;
    // Polly has no kill statements, so the must-kill set is empty.
    PPCGScop->tagged_must_kills = isl_union_map_empty(S->getParamSpace());
    PPCGScop->tagger = nullptr;

    // Dependence information is left unset here; it is filled in by
    // compute_dependences() below.
    PPCGScop->independence = nullptr;
    PPCGScop->dep_flow = nullptr;
    PPCGScop->tagged_dep_flow = nullptr;
    PPCGScop->dep_false = nullptr;
    PPCGScop->dep_forced = nullptr;
    PPCGScop->dep_order = nullptr;
    PPCGScop->tagged_dep_order = nullptr;

    PPCGScop->schedule = S->getScheduleTree();
    PPCGScop->names = getNames();

    // No pet representation exists; statements are tracked via gpu_stmt.
    PPCGScop->pet = nullptr;

    compute_tagger(PPCGScop);
    compute_dependences(PPCGScop);

    return PPCGScop;
  }

  /// Collect the array acesses in a statement.
  ///
  /// @param Stmt The statement for which to collect the accesses.
  ///
  /// @returns A list of array accesses.
  gpu_stmt_access *getStmtAccesses(ScopStmt &Stmt) {
    gpu_stmt_access *Accesses = nullptr;

    for (MemoryAccess *Acc : Stmt) {
      auto Access = isl_alloc_type(S->getIslCtx(), struct gpu_stmt_access);
      Access->read = Acc->isRead();
      Access->write = Acc->isWrite();
      Access->access = Acc->getAccessRelation();
      // Build the tag relation [Stmt[...] -> ref_id[]] -> Array[...], the
      // same tagging scheme used in getTaggedAccesses().
      isl_space *Space = isl_map_get_space(Access->access);
      Space = isl_space_range(Space);
      Space = isl_space_from_range(Space);
      Space = isl_space_set_tuple_id(Space, isl_dim_in, Acc->getId());
      isl_map *Universe = isl_map_universe(Space);
      Access->tagged_access =
          isl_map_domain_product(Acc->getAccessRelation(), Universe);
      Access->exact_write = Acc->isWrite();
      Access->ref_id = Acc->getId();
      // Prepend to the singly-linked list; the resulting order is the
      // reverse of the statement's access order.
      Access->next = Accesses;
      Accesses = Access;
    }

    return Accesses;
  }

  /// Collect the list of GPU statements.
  ///
  /// Each statement has an id, a pointer to the underlying data structure,
  /// as well as a list with all memory accesses.
  ///
  /// @returns An array of statements, one per ScopStmt in the scop.
  gpu_stmt *getStatements() {
    gpu_stmt *Stmts = isl_calloc_array(S->getIslCtx(), struct gpu_stmt,
                                       std::distance(S->begin(), S->end()));

    int i = 0;
    for (auto &Stmt : *S) {
      gpu_stmt *GPUStmt = &Stmts[i];

      GPUStmt->id = Stmt.getDomainId();

      // We use the pet stmt pointer to keep track of the Polly statements.
      GPUStmt->stmt = (pet_stmt *)&Stmt;
      GPUStmt->accesses = getStmtAccesses(Stmt);
      i++;
    }

    return Stmts;
  }

  /// Derive the extent of an array.
  ///
  /// The extent of an array is defined by the set of memory locations for
  /// which a memory access in the iteration domain exists.
  ///
  /// @param Array The array to derive the extent for.
  ///
  /// @returns An isl_set describing the extent of the array.
__isl_give isl_set *getExtent(ScopArrayInfo *Array) {
  // Restrict all accesses to the iteration domain and take the range: the
  // set of accessed memory locations. The extent of @p Array is the portion
  // of that set living in the array's space.
  isl_union_map *Accesses = S->getAccesses();
  Accesses = isl_union_map_intersect_domain(Accesses, S->getDomains());
  isl_union_set *AccessUSet = isl_union_map_range(Accesses);
  isl_set *AccessSet =
      isl_union_set_extract_set(AccessUSet, Array->getSpace());
  isl_union_set_free(AccessUSet);

  return AccessSet;
}

/// Derive the bounds of an array.
///
/// For the first dimension we derive the bound of the array from the extent
/// of this dimension. For inner dimensions we obtain their size directly from
/// ScopArrayInfo.
///
/// @param PPCGArray The array to compute bounds for.
/// @param Array The polly array from which to take the information.
void setArrayBounds(gpu_array_info &PPCGArray, ScopArrayInfo *Array) {
  if (PPCGArray.n_index > 0) {
    // Project the extent onto its outermost dimension and compute the
    // maximal index used in that dimension.
    isl_set *Dom = isl_set_copy(PPCGArray.extent);
    Dom = isl_set_project_out(Dom, isl_dim_set, 1, PPCGArray.n_index - 1);
    isl_pw_aff *Bound = isl_set_dim_max(isl_set_copy(Dom), 0);
    isl_set_free(Dom);
    // The bound (size) of the dimension is the maximal index plus one.
    Dom = isl_pw_aff_domain(isl_pw_aff_copy(Bound));
    isl_local_space *LS = isl_local_space_from_space(isl_set_get_space(Dom));
    isl_aff *One = isl_aff_zero_on_domain(LS);
    One = isl_aff_add_constant_si(One, 1);
    Bound = isl_pw_aff_add(Bound, isl_pw_aff_alloc(Dom, One));
    // Simplify the bound with respect to the assumptions that hold in the
    // scop's context.
    Bound = isl_pw_aff_gist(Bound, S->getContext());
    PPCGArray.bound[0] = Bound;
  }

  // Inner dimensions: take the size directly from ScopArrayInfo. The
  // pullback over a zero multi-aff presumably detaches the size expression
  // from its original domain -- TODO confirm against ppcg's expectations
  // for gpu_array_info::bound.
  for (unsigned i = 1; i < PPCGArray.n_index; ++i) {
    isl_pw_aff *Bound = Array->getDimensionSizePw(i);
    auto LS = isl_pw_aff_get_domain_space(Bound);
    auto Aff = isl_multi_aff_zero(LS);
    Bound = isl_pw_aff_pullback_multi_aff(Bound, Aff);
    PPCGArray.bound[i] = Bound;
  }
}

/// Create the arrays for @p PPCGProg.
///
/// @param PPCGProg The program to compute the arrays for.
1018 void createArrays(gpu_prog *PPCGProg) { 1019 int i = 0; 1020 for (auto &Element : S->arrays()) { 1021 ScopArrayInfo *Array = Element.second.get(); 1022 1023 std::string TypeName; 1024 raw_string_ostream OS(TypeName); 1025 1026 OS << *Array->getElementType(); 1027 TypeName = OS.str(); 1028 1029 gpu_array_info &PPCGArray = PPCGProg->array[i]; 1030 1031 PPCGArray.space = Array->getSpace(); 1032 PPCGArray.type = strdup(TypeName.c_str()); 1033 PPCGArray.size = Array->getElementType()->getPrimitiveSizeInBits() / 8; 1034 PPCGArray.name = strdup(Array->getName().c_str()); 1035 PPCGArray.extent = nullptr; 1036 PPCGArray.n_index = Array->getNumberOfDimensions(); 1037 PPCGArray.bound = 1038 isl_alloc_array(S->getIslCtx(), isl_pw_aff *, PPCGArray.n_index); 1039 PPCGArray.extent = getExtent(Array); 1040 PPCGArray.n_ref = 0; 1041 PPCGArray.refs = nullptr; 1042 PPCGArray.accessed = true; 1043 PPCGArray.read_only_scalar = false; 1044 PPCGArray.has_compound_element = false; 1045 PPCGArray.local = false; 1046 PPCGArray.declare_local = false; 1047 PPCGArray.global = false; 1048 PPCGArray.linearize = false; 1049 PPCGArray.dep_order = nullptr; 1050 1051 setArrayBounds(PPCGArray, Array); 1052 i++; 1053 1054 collect_references(PPCGProg, &PPCGArray); 1055 } 1056 } 1057 1058 /// Create an identity map between the arrays in the scop. 1059 /// 1060 /// @returns An identity map between the arrays in the scop. 1061 isl_union_map *getArrayIdentity() { 1062 isl_union_map *Maps = isl_union_map_empty(S->getParamSpace()); 1063 1064 for (auto &Item : S->arrays()) { 1065 ScopArrayInfo *Array = Item.second.get(); 1066 isl_space *Space = Array->getSpace(); 1067 Space = isl_space_map_from_set(Space); 1068 isl_map *Identity = isl_map_identity(Space); 1069 Maps = isl_union_map_add_map(Maps, Identity); 1070 } 1071 1072 return Maps; 1073 } 1074 1075 /// Create a default-initialized PPCG GPU program. 1076 /// 1077 /// @returns A new gpu grogram description. 
gpu_prog *createPPCGProg(ppcg_scop *PPCGScop) {

  if (!PPCGScop)
    return nullptr;

  auto PPCGProg = isl_calloc_type(S->getIslCtx(), struct gpu_prog);

  PPCGProg->ctx = S->getIslCtx();
  PPCGProg->scop = PPCGScop;
  // The program holds its own references to the scop's sets and maps.
  PPCGProg->context = isl_set_copy(PPCGScop->context);
  PPCGProg->read = isl_union_map_copy(PPCGScop->reads);
  PPCGProg->may_write = isl_union_map_copy(PPCGScop->may_writes);
  PPCGProg->must_write = isl_union_map_copy(PPCGScop->must_writes);
  PPCGProg->tagged_must_kill =
      isl_union_map_copy(PPCGScop->tagged_must_kills);
  // Map each array identically to itself (see getArrayIdentity).
  PPCGProg->to_inner = getArrayIdentity();
  PPCGProg->to_outer = getArrayIdentity();
  // NOTE(review): compute_may_persist() is invoked before n_stmts, stmts,
  // n_array and array below are initialized -- verify it does not depend
  // on them.
  PPCGProg->may_persist = compute_may_persist(PPCGProg);
  PPCGProg->any_to_outer = nullptr;
  PPCGProg->array_order = nullptr;
  PPCGProg->n_stmts = std::distance(S->begin(), S->end());
  PPCGProg->stmts = getStatements();
  PPCGProg->n_array = std::distance(S->array_begin(), S->array_end());
  PPCGProg->array = isl_calloc_array(S->getIslCtx(), struct gpu_array_info,
                                     PPCGProg->n_array);

  createArrays(PPCGProg);

  return PPCGProg;
}

/// User data carried through ppcg's host-code printing callbacks.
struct PrintGPUUserData {
  struct cuda_info *CudaInfo;
  struct gpu_prog *PPCGProg;
  // Kernels encountered while printing the host AST (filled by
  // printHostUser).
  std::vector<ppcg_kernel *> Kernels;
};

/// Print a user statement node in the host code.
///
/// We use ppcg's printing facilities to print the actual statement and
/// additionally build up a list of all kernels that are encountered in the
/// host ast.
///
/// @param P The printer to print to
/// @param Options The printing options to use
/// @param Node The node to print
/// @param User A user pointer to carry additional data. This pointer is
///             expected to be of type PrintGPUUserData.
///
/// @returns A printer to which the output has been printed.
1128 static __isl_give isl_printer * 1129 printHostUser(__isl_take isl_printer *P, 1130 __isl_take isl_ast_print_options *Options, 1131 __isl_take isl_ast_node *Node, void *User) { 1132 auto Data = (struct PrintGPUUserData *)User; 1133 auto Id = isl_ast_node_get_annotation(Node); 1134 1135 if (Id) { 1136 bool IsUser = !strcmp(isl_id_get_name(Id), "user"); 1137 1138 // If this is a user statement, format it ourselves as ppcg would 1139 // otherwise try to call pet functionality that is not available in 1140 // Polly. 1141 if (IsUser) { 1142 P = isl_printer_start_line(P); 1143 P = isl_printer_print_ast_node(P, Node); 1144 P = isl_printer_end_line(P); 1145 isl_id_free(Id); 1146 isl_ast_print_options_free(Options); 1147 return P; 1148 } 1149 1150 auto Kernel = (struct ppcg_kernel *)isl_id_get_user(Id); 1151 isl_id_free(Id); 1152 Data->Kernels.push_back(Kernel); 1153 } 1154 1155 return print_host_user(P, Options, Node, User); 1156 } 1157 1158 /// Print C code corresponding to the control flow in @p Kernel. 1159 /// 1160 /// @param Kernel The kernel to print 1161 void printKernel(ppcg_kernel *Kernel) { 1162 auto *P = isl_printer_to_str(S->getIslCtx()); 1163 P = isl_printer_set_output_format(P, ISL_FORMAT_C); 1164 auto *Options = isl_ast_print_options_alloc(S->getIslCtx()); 1165 P = isl_ast_node_print(Kernel->tree, P, Options); 1166 char *String = isl_printer_get_str(P); 1167 printf("%s\n", String); 1168 free(String); 1169 isl_printer_free(P); 1170 } 1171 1172 /// Print C code corresponding to the GPU code described by @p Tree. 1173 /// 1174 /// @param Tree An AST describing GPU code 1175 /// @param PPCGProg The PPCG program from which @Tree has been constructed. 
1176 void printGPUTree(isl_ast_node *Tree, gpu_prog *PPCGProg) { 1177 auto *P = isl_printer_to_str(S->getIslCtx()); 1178 P = isl_printer_set_output_format(P, ISL_FORMAT_C); 1179 1180 PrintGPUUserData Data; 1181 Data.PPCGProg = PPCGProg; 1182 1183 auto *Options = isl_ast_print_options_alloc(S->getIslCtx()); 1184 Options = 1185 isl_ast_print_options_set_print_user(Options, printHostUser, &Data); 1186 P = isl_ast_node_print(Tree, P, Options); 1187 char *String = isl_printer_get_str(P); 1188 printf("# host\n"); 1189 printf("%s\n", String); 1190 free(String); 1191 isl_printer_free(P); 1192 1193 for (auto Kernel : Data.Kernels) { 1194 printf("# kernel%d\n", Kernel->id); 1195 printKernel(Kernel); 1196 } 1197 } 1198 1199 // Generate a GPU program using PPCG. 1200 // 1201 // GPU mapping consists of multiple steps: 1202 // 1203 // 1) Compute new schedule for the program. 1204 // 2) Map schedule to GPU (TODO) 1205 // 3) Generate code for new schedule (TODO) 1206 // 1207 // We do not use here the Polly ScheduleOptimizer, as the schedule optimizer 1208 // is mostly CPU specific. Instead, we use PPCG's GPU code generation 1209 // strategy directly from this pass. 1210 gpu_gen *generateGPU(ppcg_scop *PPCGScop, gpu_prog *PPCGProg) { 1211 1212 auto PPCGGen = isl_calloc_type(S->getIslCtx(), struct gpu_gen); 1213 1214 PPCGGen->ctx = S->getIslCtx(); 1215 PPCGGen->options = PPCGScop->options; 1216 PPCGGen->print = nullptr; 1217 PPCGGen->print_user = nullptr; 1218 PPCGGen->build_ast_expr = &pollyBuildAstExprForStmt; 1219 PPCGGen->prog = PPCGProg; 1220 PPCGGen->tree = nullptr; 1221 PPCGGen->types.n = 0; 1222 PPCGGen->types.name = nullptr; 1223 PPCGGen->sizes = nullptr; 1224 PPCGGen->used_sizes = nullptr; 1225 PPCGGen->kernel_id = 0; 1226 1227 // Set scheduling strategy to same strategy PPCG is using. 
1228 isl_options_set_schedule_outer_coincidence(PPCGGen->ctx, true); 1229 isl_options_set_schedule_maximize_band_depth(PPCGGen->ctx, true); 1230 isl_options_set_schedule_whole_component(PPCGGen->ctx, false); 1231 1232 isl_schedule *Schedule = get_schedule(PPCGGen); 1233 1234 int has_permutable = has_any_permutable_node(Schedule); 1235 1236 if (!has_permutable || has_permutable < 0) { 1237 Schedule = isl_schedule_free(Schedule); 1238 } else { 1239 Schedule = map_to_device(PPCGGen, Schedule); 1240 PPCGGen->tree = generate_code(PPCGGen, isl_schedule_copy(Schedule)); 1241 } 1242 1243 if (DumpSchedule) { 1244 isl_printer *P = isl_printer_to_str(S->getIslCtx()); 1245 P = isl_printer_set_yaml_style(P, ISL_YAML_STYLE_BLOCK); 1246 P = isl_printer_print_str(P, "Schedule\n"); 1247 P = isl_printer_print_str(P, "========\n"); 1248 if (Schedule) 1249 P = isl_printer_print_schedule(P, Schedule); 1250 else 1251 P = isl_printer_print_str(P, "No schedule found\n"); 1252 1253 printf("%s\n", isl_printer_get_str(P)); 1254 isl_printer_free(P); 1255 } 1256 1257 if (DumpCode) { 1258 printf("Code\n"); 1259 printf("====\n"); 1260 if (PPCGGen->tree) 1261 printGPUTree(PPCGGen->tree, PPCGProg); 1262 else 1263 printf("No code generated\n"); 1264 } 1265 1266 isl_schedule_free(Schedule); 1267 1268 return PPCGGen; 1269 } 1270 1271 /// Free gpu_gen structure. 1272 /// 1273 /// @param PPCGGen The ppcg_gen object to free. 1274 void freePPCGGen(gpu_gen *PPCGGen) { 1275 isl_ast_node_free(PPCGGen->tree); 1276 isl_union_map_free(PPCGGen->sizes); 1277 isl_union_map_free(PPCGGen->used_sizes); 1278 free(PPCGGen); 1279 } 1280 1281 /// Free the options in the ppcg scop structure. 1282 /// 1283 /// ppcg is not freeing these options for us. To avoid leaks we do this 1284 /// ourselves. 1285 /// 1286 /// @param PPCGScop The scop referencing the options to free. 
void freeOptions(ppcg_scop *PPCGScop) {
  // The debug sub-structure is allocated separately and must be released
  // before the options themselves.
  free(PPCGScop->options->debug);
  PPCGScop->options->debug = nullptr;
  free(PPCGScop->options);
  PPCGScop->options = nullptr;
}

/// Generate code for a given GPU AST described by @p Root.
///
/// @param Root An isl_ast_node pointing to the root of the GPU AST.
/// @param Prog The GPU Program to generate code for.
void generateCode(__isl_take isl_ast_node *Root, gpu_prog *Prog) {
  ScopAnnotator Annotator;
  Annotator.buildAliasScopes(*S);

  Region *R = &S->getRegion();

  simplifyRegion(R, DT, LI, RI);

  BasicBlock *EnteringBB = R->getEnteringBlock();

  PollyIRBuilder Builder = createPollyIRBuilder(EnteringBB, Annotator);

  GPUNodeBuilder NodeBuilder(Builder, Annotator, this, *DL, *LI, *SE, *DT, *S,
                             Prog);

  // Only build the run-time condition and parameters _after_ having
  // introduced the conditional branch. This is important as the conditional
  // branch will guard the original scop from new induction variables that
  // the SCEVExpander may introduce while code generating the parameters and
  // which may introduce scalar dependences that prevent us from correctly
  // code generating this scop.
  BasicBlock *StartBlock =
      executeScopConditionally(*S, this, Builder.getTrue());

  // TODO: Handle LICM
  // TODO: Verify run-time checks
  // Parameters are materialized in the split block (before the branch),
  // while the GPU code itself is emitted into the start block.
  auto SplitBlock = StartBlock->getSinglePredecessor();
  Builder.SetInsertPoint(SplitBlock->getTerminator());
  NodeBuilder.addParameters(S->getContext());
  Builder.SetInsertPoint(&*StartBlock->begin());
  NodeBuilder.create(Root);
  NodeBuilder.finalizeSCoP(*S);
}

bool runOnScop(Scop &CurrentScop) override {
  S = &CurrentScop;
  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
  DL = &S->getRegion().getEntry()->getParent()->getParent()->getDataLayout();
  RI = &getAnalysis<RegionInfoPass>().getRegionInfo();

  // We currently do not support scops with invariant loads.
  if (S->hasInvariantAccesses())
    return false;

  auto PPCGScop = createPPCGScop();
  auto PPCGProg = createPPCGProg(PPCGScop);
  auto PPCGGen = generateGPU(PPCGScop, PPCGProg);

  // Only emit host/device IR if PPCG actually produced a GPU AST.
  if (PPCGGen->tree)
    generateCode(isl_ast_node_copy(PPCGGen->tree), PPCGProg);

  freeOptions(PPCGScop);
  freePPCGGen(PPCGGen);
  gpu_prog_free(PPCGProg);
  ppcg_scop_free(PPCGScop);

  // NOTE(review): we return true (IR modified) even when no GPU AST was
  // generated and the function was left untouched -- conservative, but
  // verify this is intended.
  return true;
}

void printScop(raw_ostream &, Scop &) const override {}

void getAnalysisUsage(AnalysisUsage &AU) const override {
  AU.addRequired<DominatorTreeWrapperPass>();
  AU.addRequired<RegionInfoPass>();
  AU.addRequired<ScalarEvolutionWrapperPass>();
  AU.addRequired<ScopDetection>();
  AU.addRequired<ScopInfoRegionPass>();
  AU.addRequired<LoopInfoWrapperPass>();

  AU.addPreserved<AAResultsWrapperPass>();
  AU.addPreserved<BasicAAWrapperPass>();
  AU.addPreserved<LoopInfoWrapperPass>();
  AU.addPreserved<DominatorTreeWrapperPass>();
  AU.addPreserved<GlobalsAAWrapperPass>();
  AU.addPreserved<PostDominatorTreeWrapperPass>();
  AU.addPreserved<ScopDetection>();
  AU.addPreserved<ScalarEvolutionWrapperPass>();
  AU.addPreserved<SCEVAAWrapperPass>();

  // FIXME: We do not yet add regions for the newly generated code to the
  // region tree.
  AU.addPreserved<RegionInfoPass>();
  AU.addPreserved<ScopInfoRegionPass>();
}
};
}

// Pass identifier; LLVM uses only its address.
char PPCGCodeGeneration::ID = 1;

Pass *polly::createPPCGCodeGenerationPass() { return new PPCGCodeGeneration(); }

INITIALIZE_PASS_BEGIN(PPCGCodeGeneration, "polly-codegen-ppcg",
                      "Polly - Apply PPCG translation to SCOP", false, false)
INITIALIZE_PASS_DEPENDENCY(DependenceInfo);
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass);
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass);
INITIALIZE_PASS_DEPENDENCY(RegionInfoPass);
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass);
INITIALIZE_PASS_DEPENDENCY(ScopDetection);
INITIALIZE_PASS_END(PPCGCodeGeneration, "polly-codegen-ppcg",
                    "Polly - Apply PPCG translation to SCOP", false, false)