//===------ PPCGCodeGeneration.cpp - Polly Accelerator Code Generation. ---===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// Take a scop created by ScopInfo and map it to GPU code using the ppcg
// GPU mapping strategy.
//
//===----------------------------------------------------------------------===//

#include "polly/CodeGen/IslNodeBuilder.h"
#include "polly/CodeGen/Utils.h"
#include "polly/DependenceInfo.h"
#include "polly/LinkAllPasses.h"
#include "polly/Options.h"
#include "polly/ScopInfo.h"
#include "polly/Support/SCEVValidator.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Target/TargetMachine.h"

#include "isl/union_map.h"

extern "C" {
#include "ppcg/cuda.h"
#include "ppcg/gpu.h"
#include "ppcg/gpu_print.h"
#include "ppcg/ppcg.h"
#include "ppcg/schedule.h"
}

#include "llvm/Support/Debug.h"

using namespace polly;
using namespace llvm;

#define DEBUG_TYPE "polly-codegen-ppcg"

static cl::opt<bool> DumpSchedule("polly-acc-dump-schedule",
                                  cl::desc("Dump the computed GPU Schedule"),
                                  cl::Hidden, cl::init(false), cl::ZeroOrMore,
                                  cl::cat(PollyCategory));

static cl::opt<bool>
    DumpCode("polly-acc-dump-code",
             cl::desc("Dump C code describing the GPU mapping"), cl::Hidden,
             cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));

static cl::opt<bool> DumpKernelIR("polly-acc-dump-kernel-ir",
                                  cl::desc("Dump the kernel LLVM-IR"),
                                  cl::Hidden, cl::init(false), cl::ZeroOrMore,
                                  cl::cat(PollyCategory));

static cl::opt<bool> DumpKernelASM("polly-acc-dump-kernel-asm",
                                   cl::desc("Dump the kernel assembly code"),
                                   cl::Hidden, cl::init(false), cl::ZeroOrMore,
                                   cl::cat(PollyCategory));

static cl::opt<bool> FastMath("polly-acc-fastmath",
                              cl::desc("Allow unsafe math optimizations"),
                              cl::Hidden, cl::init(false), cl::ZeroOrMore,
                              cl::cat(PollyCategory));

static cl::opt<std::string>
    CudaVersion("polly-acc-cuda-version",
                cl::desc("The CUDA version to compile for"), cl::Hidden,
                cl::init("sm_30"), cl::ZeroOrMore, cl::cat(PollyCategory));

/// Create the ast expressions for a ScopStmt.
///
/// This function is a callback used to generate the ast expressions for each
/// of the scheduled ScopStmts.
static __isl_give isl_id_to_ast_expr *pollyBuildAstExprForStmt(
    void *StmtT, isl_ast_build *Build,
    isl_multi_pw_aff *(*FunctionIndex)(__isl_take isl_multi_pw_aff *MPA,
                                       isl_id *Id, void *User),
    void *UserIndex,
    isl_ast_expr *(*FunctionExpr)(isl_ast_expr *Expr, isl_id *Id, void *User),
    void *UserExpr) {

  ScopStmt *Stmt = (ScopStmt *)StmtT;

  isl_ctx *Ctx;

  if (!Stmt || !Build)
    return NULL;

  Ctx = isl_ast_build_get_ctx(Build);
  isl_id_to_ast_expr *RefToExpr = isl_id_to_ast_expr_alloc(Ctx, 0);

  for (MemoryAccess *Acc : *Stmt) {
    isl_map *AddrFunc = Acc->getAddressFunction();
    AddrFunc = isl_map_intersect_domain(AddrFunc, Stmt->getDomain());
    isl_id *RefId = Acc->getId();
    isl_pw_multi_aff *PMA = isl_pw_multi_aff_from_map(AddrFunc);
    isl_multi_pw_aff *MPA = isl_multi_pw_aff_from_pw_multi_aff(PMA);
    MPA = isl_multi_pw_aff_coalesce(MPA);
    MPA = FunctionIndex(MPA, RefId, UserIndex);
    isl_ast_expr *Access = isl_ast_build_access_from_multi_pw_aff(Build, MPA);
    Access = FunctionExpr(Access, RefId, UserExpr);
    RefToExpr = isl_id_to_ast_expr_set(RefToExpr, RefId, Access);
  }

  return RefToExpr;
}

/// Generate code for a GPU specific isl AST.
///
/// The GPUNodeBuilder augments the general existing IslNodeBuilder, which
/// generates code for general-purpose AST nodes, with special functionality
/// for generating GPU specific user nodes.
///
/// @see GPUNodeBuilder::createUser
class GPUNodeBuilder : public IslNodeBuilder {
public:
  GPUNodeBuilder(PollyIRBuilder &Builder, ScopAnnotator &Annotator, Pass *P,
                 const DataLayout &DL, LoopInfo &LI, ScalarEvolution &SE,
                 DominatorTree &DT, Scop &S, gpu_prog *Prog)
      : IslNodeBuilder(Builder, Annotator, P, DL, LI, SE, DT, S), Prog(Prog) {
    getExprBuilder().setIDToSAI(&IDToSAI);
  }

private:
  /// A vector of array base pointers for which a new ScopArrayInfo was
  /// created.
  ///
  /// This vector is used to delete the ScopArrayInfo when it is not needed
  /// any more.
  std::vector<Value *> LocalArrays;

  /// A module containing GPU code.
  ///
  /// This pointer is only set in case we are currently generating GPU code.
  std::unique_ptr<Module> GPUModule;

  /// The GPU program we generate code for.
  gpu_prog *Prog;

  /// Class to free isl_ids.
  class IslIdDeleter {
  public:
    void operator()(__isl_take isl_id *Id) { isl_id_free(Id); };
  };

  /// A set containing all isl_ids allocated in a GPU kernel.
  ///
  /// By releasing this set all isl_ids will be freed.
  std::set<std::unique_ptr<isl_id, IslIdDeleter>> KernelIDs;

  IslExprBuilder::IDToScopArrayInfoTy IDToSAI;

  /// Create code for user-defined AST nodes.
  ///
  /// These AST nodes can be of type:
  ///
  ///   - ScopStmt:      A computational statement (TODO)
  ///   - Kernel:        A GPU kernel call (TODO)
  ///   - Data-Transfer: A GPU <-> CPU data-transfer (TODO)
  ///   - In-kernel synchronization
  ///   - In-kernel memory copy statement
  ///
  /// @param UserStmt The ast node to generate code for.
  virtual void createUser(__isl_take isl_ast_node *UserStmt);

  /// Find llvm::Values referenced in GPU kernel.
  ///
  /// @param Kernel The kernel to scan for llvm::Values.
  ///
  /// @returns A set of values referenced by the kernel.
  SetVector<Value *> getReferencesInKernel(ppcg_kernel *Kernel);

  /// Create GPU kernel.
  ///
  /// Generate code for the kernel described by @p KernelStmt.
  ///
  /// @param KernelStmt The ast node to generate kernel code for.
  void createKernel(__isl_take isl_ast_node *KernelStmt);

  /// Create kernel function.
  ///
  /// Create a kernel function located in a newly created module that can serve
  /// as target for device code generation. Set the Builder to point to the
  /// start block of this newly created function.
  ///
  /// @param Kernel The kernel to generate code for.
  /// @param SubtreeValues The set of llvm::Values referenced by this kernel.
  void createKernelFunction(ppcg_kernel *Kernel,
                            SetVector<Value *> &SubtreeValues);

  /// Create the declaration of a kernel function.
  ///
  /// The kernel function takes as arguments:
  ///
  ///   - One i8 pointer for each external array reference used in the kernel.
  ///   - Host iterators
  ///   - Parameters
  ///   - Other LLVM Value references (TODO)
  ///
  /// @param Kernel The kernel to generate the function declaration for.
  /// @param SubtreeValues The set of llvm::Values referenced by this kernel.
  ///
  /// @returns The newly declared function.
  Function *createKernelFunctionDecl(ppcg_kernel *Kernel,
                                     SetVector<Value *> &SubtreeValues);

  /// Insert intrinsic functions to obtain thread and block ids.
  ///
  /// @param Kernel The kernel to generate the intrinsic functions for.
  void insertKernelIntrinsics(ppcg_kernel *Kernel);

  /// Create code for a ScopStmt called in @p Expr.
  ///
  /// @param Expr The expression containing the call.
  /// @param KernelStmt The kernel statement referenced in the call.
  void createScopStmt(isl_ast_expr *Expr, ppcg_kernel_stmt *KernelStmt);

  /// Create an in-kernel synchronization call.
  void createKernelSync();

  /// Create a PTX assembly string for the current GPU kernel.
  ///
  /// @returns A string containing the corresponding PTX assembly code.
  std::string createKernelASM();

  /// Remove references from the dominator tree to the kernel function @p F.
  ///
  /// @param F The function to remove references to.
  void clearDominators(Function *F);

  /// Remove references from scalar evolution to the kernel function @p F.
  ///
  /// @param F The function to remove references to.
  void clearScalarEvolution(Function *F);

  /// Remove references from loop info to the kernel function @p F.
  ///
  /// @param F The function to remove references to.
  void clearLoops(Function *F);

  /// Finalize the generation of the kernel function.
  ///
  /// Free the LLVM-IR module corresponding to the kernel and -- if requested --
  /// dump its IR to stderr.
  void finalizeKernelFunction();
};

/// Check if one string is a prefix of another.
///
/// @param String The string in which to look for the prefix.
/// @param Prefix The prefix to look for.
static bool isPrefix(std::string String, std::string Prefix) {
  return String.find(Prefix) == 0;
}

void GPUNodeBuilder::createUser(__isl_take isl_ast_node *UserStmt) {
  isl_ast_expr *Expr = isl_ast_node_user_get_expr(UserStmt);
  isl_ast_expr *StmtExpr = isl_ast_expr_get_op_arg(Expr, 0);
  isl_id *Id = isl_ast_expr_get_id(StmtExpr);
  const char *Str = isl_id_get_name(Id);
  isl_id_free(Id);
  isl_ast_expr_free(StmtExpr);

  if (!strcmp(Str, "kernel")) {
    createKernel(UserStmt);
    isl_ast_expr_free(Expr);
    return;
  }

  if (isPrefix(Str, "to_device") || isPrefix(Str, "from_device")) {
    // TODO: Insert memory copies
    isl_ast_expr_free(Expr);
    isl_ast_node_free(UserStmt);
    return;
  }

  isl_id *Anno = isl_ast_node_get_annotation(UserStmt);
  struct ppcg_kernel_stmt *KernelStmt =
      (struct ppcg_kernel_stmt *)isl_id_get_user(Anno);
  isl_id_free(Anno);

  switch (KernelStmt->type) {
  case ppcg_kernel_domain:
    createScopStmt(Expr, KernelStmt);
    isl_ast_node_free(UserStmt);
    return;
  case ppcg_kernel_copy:
    // TODO: Create kernel copy stmt
    isl_ast_expr_free(Expr);
    isl_ast_node_free(UserStmt);
    return;
  case ppcg_kernel_sync:
    createKernelSync();
    isl_ast_expr_free(Expr);
    isl_ast_node_free(UserStmt);
    return;
  }

  isl_ast_expr_free(Expr);
  isl_ast_node_free(UserStmt);
  return;
}

void GPUNodeBuilder::createScopStmt(isl_ast_expr *Expr,
                                    ppcg_kernel_stmt *KernelStmt) {
  auto Stmt = (ScopStmt *)KernelStmt->u.d.stmt->stmt;
  isl_id_to_ast_expr *Indexes = KernelStmt->u.d.ref2expr;

  LoopToScevMapT LTS;
  LTS.insert(OutsideLoopIterations.begin(), OutsideLoopIterations.end());

  createSubstitutions(Expr, Stmt, LTS);

  if (Stmt->isBlockStmt())
    BlockGen.copyStmt(*Stmt, LTS, Indexes);
  else
    assert(0 && "Region statement not supported\n");
}

void GPUNodeBuilder::createKernelSync() {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  auto *Sync = Intrinsic::getDeclaration(M, Intrinsic::nvvm_barrier0);
  Builder.CreateCall(Sync, {});
}

/// Collect llvm::Values referenced from @p Node.
///
/// This function only applies to isl_ast_nodes that are user_nodes referring
/// to a ScopStmt. All other node types are ignored.
///
/// @param Node The node to collect references for.
/// @param User A user pointer used as storage for the data that is collected.
///
/// @returns isl_bool_true if data could be collected successfully.
isl_bool collectReferencesInGPUStmt(__isl_keep isl_ast_node *Node, void *User) {
  if (isl_ast_node_get_type(Node) != isl_ast_node_user)
    return isl_bool_true;

  isl_ast_expr *Expr = isl_ast_node_user_get_expr(Node);
  isl_ast_expr *StmtExpr = isl_ast_expr_get_op_arg(Expr, 0);
  isl_id *Id = isl_ast_expr_get_id(StmtExpr);
  const char *Str = isl_id_get_name(Id);
  isl_id_free(Id);
  isl_ast_expr_free(StmtExpr);
  isl_ast_expr_free(Expr);

  if (!isPrefix(Str, "Stmt"))
    return isl_bool_true;

  Id = isl_ast_node_get_annotation(Node);
  auto *KernelStmt = (ppcg_kernel_stmt *)isl_id_get_user(Id);
  auto Stmt = (ScopStmt *)KernelStmt->u.d.stmt->stmt;
  isl_id_free(Id);

  addReferencesFromStmt(Stmt, User);

  return isl_bool_true;
}

SetVector<Value *> GPUNodeBuilder::getReferencesInKernel(ppcg_kernel *Kernel) {
  SetVector<Value *> SubtreeValues;
  SetVector<const SCEV *> SCEVs;
  SetVector<const Loop *> Loops;
  SubtreeReferences References = {
      LI, SE, S, ValueMap, SubtreeValues, SCEVs, getBlockGenerator()};

  for (const auto &I : IDToValue)
    SubtreeValues.insert(I.second);

  isl_ast_node_foreach_descendant_top_down(
      Kernel->tree, collectReferencesInGPUStmt, &References);

  for (const SCEV *Expr : SCEVs)
    findValues(Expr, SE, SubtreeValues);

  for (auto &SAI : S.arrays())
    SubtreeValues.remove(SAI.second->getBasePtr());

  isl_space *Space = S.getParamSpace();
  for (long i = 0; i < isl_space_dim(Space, isl_dim_param); i++) {
    isl_id *Id = isl_space_get_dim_id(Space, isl_dim_param, i);
    assert(IDToValue.count(Id));
    Value *Val = IDToValue[Id];
    SubtreeValues.remove(Val);
    isl_id_free(Id);
  }
  isl_space_free(Space);

  for (long i = 0; i < isl_space_dim(Kernel->space, isl_dim_set); i++) {
    isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i);
    assert(IDToValue.count(Id));
    Value *Val = IDToValue[Id];
    SubtreeValues.remove(Val);
    isl_id_free(Id);
  }

  return SubtreeValues;
}

void GPUNodeBuilder::clearDominators(Function *F) {
  DomTreeNode *N = DT.getNode(&F->getEntryBlock());
  std::vector<BasicBlock *> Nodes;
  for (po_iterator<DomTreeNode *> I = po_begin(N), E = po_end(N); I != E; ++I)
    Nodes.push_back(I->getBlock());

  for (BasicBlock *BB : Nodes)
    DT.eraseNode(BB);
}

void GPUNodeBuilder::clearScalarEvolution(Function *F) {
  for (BasicBlock &BB : *F) {
    Loop *L = LI.getLoopFor(&BB);
    if (L)
      SE.forgetLoop(L);
  }
}

void GPUNodeBuilder::clearLoops(Function *F) {
  for (BasicBlock &BB : *F) {
    Loop *L = LI.getLoopFor(&BB);
    if (L)
      SE.forgetLoop(L);
    LI.removeBlock(&BB);
  }
}

void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) {
  isl_id *Id = isl_ast_node_get_annotation(KernelStmt);
  ppcg_kernel *Kernel = (ppcg_kernel *)isl_id_get_user(Id);
  isl_id_free(Id);
  isl_ast_node_free(KernelStmt);

  SetVector<Value *> SubtreeValues = getReferencesInKernel(Kernel);

  assert(Kernel->tree && "Device AST of kernel node is empty");

  Instruction &HostInsertPoint = *Builder.GetInsertPoint();
  IslExprBuilder::IDToValueTy HostIDs = IDToValue;
  ValueMapT HostValueMap = ValueMap;

  SetVector<const Loop *> Loops;

  // For each loop we depend on, create a value that contains the current loop
  // iteration. These values are necessary to generate code for SCEVs that
  // depend on such loops. As a result we need to pass them to the subfunction.
  for (const Loop *L : Loops) {
    const SCEV *OuterLIV = SE.getAddRecExpr(SE.getUnknown(Builder.getInt64(0)),
                                            SE.getUnknown(Builder.getInt64(1)),
                                            L, SCEV::FlagAnyWrap);
    Value *V = generateSCEV(OuterLIV);
    OutsideLoopIterations[L] = SE.getUnknown(V);
    SubtreeValues.insert(V);
  }

  createKernelFunction(Kernel, SubtreeValues);

  create(isl_ast_node_copy(Kernel->tree));

  Function *F = Builder.GetInsertBlock()->getParent();
  clearDominators(F);
  clearScalarEvolution(F);
  clearLoops(F);

  Builder.SetInsertPoint(&HostInsertPoint);
  IDToValue = HostIDs;

  ValueMap = HostValueMap;
  ScalarMap.clear();
  PHIOpMap.clear();
  EscapeMap.clear();
  IDToSAI.clear();
  Annotator.resetAlternativeAliasBases();
  for (auto &BasePtr : LocalArrays)
    S.invalidateScopArrayInfo(BasePtr, ScopArrayInfo::MK_Array);
  LocalArrays.clear();

  finalizeKernelFunction();
}

/// Compute the DataLayout string for the NVPTX backend.
///
/// @param is64Bit Are we looking for a 64 bit architecture?
static std::string computeNVPTXDataLayout(bool is64Bit) {
  std::string Ret = "e";

  if (!is64Bit)
    Ret += "-p:32:32";

  Ret += "-i64:64-v16:16-v32:32-n16:32:64";

  return Ret;
}

Function *
GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel,
                                         SetVector<Value *> &SubtreeValues) {
  std::vector<Type *> Args;
  std::string Identifier = "kernel_" + std::to_string(Kernel->id);

  for (long i = 0; i < Prog->n_array; i++) {
    if (!ppcg_kernel_requires_array_argument(Kernel, i))
      continue;

    Args.push_back(Builder.getInt8PtrTy());
  }

  int NumHostIters = isl_space_dim(Kernel->space, isl_dim_set);

  for (long i = 0; i < NumHostIters; i++)
    Args.push_back(Builder.getInt64Ty());

  int NumVars = isl_space_dim(Kernel->space, isl_dim_param);

  for (long i = 0; i < NumVars; i++)
    Args.push_back(Builder.getInt64Ty());

  for (auto *V : SubtreeValues)
    Args.push_back(V->getType());

  auto *FT = FunctionType::get(Builder.getVoidTy(), Args, false);
  auto *FN = Function::Create(FT, Function::ExternalLinkage, Identifier,
                              GPUModule.get());
  FN->setCallingConv(CallingConv::PTX_Kernel);

  auto Arg = FN->arg_begin();
  for (long i = 0; i < Kernel->n_array; i++) {
    if (!ppcg_kernel_requires_array_argument(Kernel, i))
      continue;

    Arg->setName(Kernel->array[i].array->name);

    isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set);
    const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl_id_copy(Id));
    Type *EleTy = SAI->getElementType();
    Value *Val = &*Arg;
    SmallVector<const SCEV *, 4> Sizes;
    isl_ast_build *Build =
        isl_ast_build_from_context(isl_set_copy(Prog->context));
    for (long j = 1; j < Kernel->array[i].array->n_index; j++) {
      isl_ast_expr *DimSize = isl_ast_build_expr_from_pw_aff(
          Build, isl_pw_aff_copy(Kernel->array[i].array->bound[j]));
      auto V = ExprBuilder.create(DimSize);
      Sizes.push_back(SE.getSCEV(V));
    }
    const ScopArrayInfo *SAIRep =
        S.getOrCreateScopArrayInfo(Val, EleTy, Sizes, ScopArrayInfo::MK_Array);
    LocalArrays.push_back(Val);

    isl_ast_build_free(Build);
    IDToSAI[Id] = SAIRep;
    isl_id_free(Id);
    Arg++;
  }

  for (long i = 0; i < NumHostIters; i++) {
    isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i);
    Arg->setName(isl_id_get_name(Id));
    IDToValue[Id] = &*Arg;
    KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id));
    Arg++;
  }

  for (long i = 0; i < NumVars; i++) {
    isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_param, i);
    Arg->setName(isl_id_get_name(Id));
    IDToValue[Id] = &*Arg;
    KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id));
    Arg++;
  }

  for (auto *V : SubtreeValues) {
    Arg->setName(V->getName());
    ValueMap[V] = &*Arg;
    Arg++;
  }

  return FN;
}

void GPUNodeBuilder::insertKernelIntrinsics(ppcg_kernel *Kernel) {
  Intrinsic::ID IntrinsicsBID[] = {Intrinsic::nvvm_read_ptx_sreg_ctaid_x,
                                   Intrinsic::nvvm_read_ptx_sreg_ctaid_y};

  Intrinsic::ID IntrinsicsTID[] = {Intrinsic::nvvm_read_ptx_sreg_tid_x,
                                   Intrinsic::nvvm_read_ptx_sreg_tid_y,
                                   Intrinsic::nvvm_read_ptx_sreg_tid_z};

  auto addId = [this](__isl_take isl_id *Id, Intrinsic::ID Intr) mutable {
    std::string Name = isl_id_get_name(Id);
    Module *M = Builder.GetInsertBlock()->getParent()->getParent();
    Function *IntrinsicFn = Intrinsic::getDeclaration(M, Intr);
    Value *Val = Builder.CreateCall(IntrinsicFn, {});
    Val = Builder.CreateIntCast(Val, Builder.getInt64Ty(), false, Name);
    IDToValue[Id] = Val;
    KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id));
  };

  for (int i = 0; i < Kernel->n_grid; ++i) {
    isl_id *Id = isl_id_list_get_id(Kernel->block_ids, i);
    addId(Id, IntrinsicsBID[i]);
  }

  for (int i = 0; i < Kernel->n_block; ++i) {
    isl_id *Id = isl_id_list_get_id(Kernel->thread_ids, i);
    addId(Id, IntrinsicsTID[i]);
  }
}

void GPUNodeBuilder::createKernelFunction(ppcg_kernel *Kernel,
                                          SetVector<Value *> &SubtreeValues) {

  std::string Identifier = "kernel_" + std::to_string(Kernel->id);
  GPUModule.reset(new Module(Identifier, Builder.getContext()));
  GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-cuda"));
  GPUModule->setDataLayout(computeNVPTXDataLayout(true /* is64Bit */));

  Function *FN = createKernelFunctionDecl(Kernel, SubtreeValues);

  BasicBlock *PrevBlock = Builder.GetInsertBlock();
  auto EntryBlock = BasicBlock::Create(Builder.getContext(), "entry", FN);

  DominatorTree &DT = P->getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  DT.addNewBlock(EntryBlock, PrevBlock);

  Builder.SetInsertPoint(EntryBlock);
  Builder.CreateRetVoid();
  Builder.SetInsertPoint(EntryBlock, EntryBlock->begin());

  insertKernelIntrinsics(Kernel);
}

std::string GPUNodeBuilder::createKernelASM() {
  llvm::Triple GPUTriple(Triple::normalize("nvptx64-nvidia-cuda"));
  std::string ErrMsg;
  auto GPUTarget = TargetRegistry::lookupTarget(GPUTriple.getTriple(), ErrMsg);

  if (!GPUTarget) {
    errs() << ErrMsg << "\n";
    return "";
  }

  TargetOptions Options;
  Options.UnsafeFPMath = FastMath;
  std::unique_ptr<TargetMachine> TargetM(
      GPUTarget->createTargetMachine(GPUTriple.getTriple(), CudaVersion, "",
                                     Options, Optional<Reloc::Model>()));

  SmallString<0> ASMString;
  raw_svector_ostream ASMStream(ASMString);
  llvm::legacy::PassManager PM;

  PM.add(createTargetTransformInfoWrapperPass(TargetM->getTargetIRAnalysis()));

  if (TargetM->addPassesToEmitFile(
          PM, ASMStream, TargetMachine::CGFT_AssemblyFile, true /* verify */)) {
    errs() << "The target does not support generation of this file type!\n";
"The target does not support generation of this file type!\n"; 668 return ""; 669 } 670 671 PM.run(*GPUModule); 672 673 return ASMStream.str(); 674 } 675 676 void GPUNodeBuilder::finalizeKernelFunction() { 677 678 if (DumpKernelIR) 679 outs() << *GPUModule << "\n"; 680 681 std::string Assembly = createKernelASM(); 682 683 if (DumpKernelASM) 684 outs() << Assembly << "\n"; 685 686 GPUModule.release(); 687 KernelIDs.clear(); 688 } 689 690 namespace { 691 class PPCGCodeGeneration : public ScopPass { 692 public: 693 static char ID; 694 695 /// The scop that is currently processed. 696 Scop *S; 697 698 LoopInfo *LI; 699 DominatorTree *DT; 700 ScalarEvolution *SE; 701 const DataLayout *DL; 702 RegionInfo *RI; 703 704 PPCGCodeGeneration() : ScopPass(ID) {} 705 706 /// Construct compilation options for PPCG. 707 /// 708 /// @returns The compilation options. 709 ppcg_options *createPPCGOptions() { 710 auto DebugOptions = 711 (ppcg_debug_options *)malloc(sizeof(ppcg_debug_options)); 712 auto Options = (ppcg_options *)malloc(sizeof(ppcg_options)); 713 714 DebugOptions->dump_schedule_constraints = false; 715 DebugOptions->dump_schedule = false; 716 DebugOptions->dump_final_schedule = false; 717 DebugOptions->dump_sizes = false; 718 719 Options->debug = DebugOptions; 720 721 Options->reschedule = true; 722 Options->scale_tile_loops = false; 723 Options->wrap = false; 724 725 Options->non_negative_parameters = false; 726 Options->ctx = nullptr; 727 Options->sizes = nullptr; 728 729 Options->tile_size = 32; 730 731 Options->use_private_memory = false; 732 Options->use_shared_memory = false; 733 Options->max_shared_memory = 0; 734 735 Options->target = PPCG_TARGET_CUDA; 736 Options->openmp = false; 737 Options->linearize_device_arrays = true; 738 Options->live_range_reordering = false; 739 740 Options->opencl_compiler_options = nullptr; 741 Options->opencl_use_gpu = false; 742 Options->opencl_n_include_file = 0; 743 Options->opencl_include_files = nullptr; 744 Options->opencl_print_kernel_types = false; 745 Options->opencl_embed_kernel_code = false; 746 747 Options->save_schedule_file = nullptr; 748 Options->load_schedule_file = nullptr; 749 750 return Options; 751 } 752 753 /// Get a tagged access relation containing all accesses of type @p AccessTy. 754 /// 755 /// Instead of a normal access of the form: 756 /// 757 /// Stmt[i,j,k] -> Array[f_0(i,j,k), f_1(i,j,k)] 758 /// 759 /// a tagged access has the form 760 /// 761 /// [Stmt[i,j,k] -> id[]] -> Array[f_0(i,j,k), f_1(i,j,k)] 762 /// 763 /// where 'id' is an additional space that references the memory access that 764 /// triggered the access. 765 /// 766 /// @param AccessTy The type of the memory accesses to collect. 767 /// 768 /// @return The relation describing all tagged memory accesses. 
  isl_union_map *getTaggedAccesses(enum MemoryAccess::AccessType AccessTy) {
    isl_union_map *Accesses = isl_union_map_empty(S->getParamSpace());

    for (auto &Stmt : *S)
      for (auto &Acc : Stmt)
        if (Acc->getType() == AccessTy) {
          isl_map *Relation = Acc->getAccessRelation();
          Relation = isl_map_intersect_domain(Relation, Stmt.getDomain());

          isl_space *Space = isl_map_get_space(Relation);
          Space = isl_space_range(Space);
          Space = isl_space_from_range(Space);
          Space = isl_space_set_tuple_id(Space, isl_dim_in, Acc->getId());
          isl_map *Universe = isl_map_universe(Space);
          Relation = isl_map_domain_product(Relation, Universe);
          Accesses = isl_union_map_add_map(Accesses, Relation);
        }

    return Accesses;
  }

  /// Get the set of all read accesses, tagged with the access id.
  ///
  /// @see getTaggedAccesses
  isl_union_map *getTaggedReads() {
    return getTaggedAccesses(MemoryAccess::READ);
  }

  /// Get the set of all may (and must) write accesses, tagged with the
  /// access id.
  ///
  /// @see getTaggedAccesses
  isl_union_map *getTaggedMayWrites() {
    return isl_union_map_union(getTaggedAccesses(MemoryAccess::MAY_WRITE),
                               getTaggedAccesses(MemoryAccess::MUST_WRITE));
  }

  /// Get the set of all must write accesses, tagged with the access id.
  ///
  /// @see getTaggedAccesses
  isl_union_map *getTaggedMustWrites() {
    return getTaggedAccesses(MemoryAccess::MUST_WRITE);
  }

  /// Collect parameter and array names as isl_ids.
  ///
  /// To reason about the different parameters and arrays used, ppcg requires
  /// a list of all isl_ids in use. As PPCG traditionally performs
  /// source-to-source compilation, each of these isl_ids is mapped to the
  /// expression that represents it. As we do not have a corresponding
  /// expression in Polly, we just map each id to a 'zero' expression to match
  /// the data format that ppcg expects.
  ///
  /// @returns Return a map from collected ids to 'zero' ast expressions.
  __isl_give isl_id_to_ast_expr *getNames() {
    auto *Names = isl_id_to_ast_expr_alloc(
        S->getIslCtx(),
        S->getNumParams() + std::distance(S->array_begin(), S->array_end()));
    auto *Zero = isl_ast_expr_from_val(isl_val_zero(S->getIslCtx()));
    auto *Space = S->getParamSpace();

    for (int I = 0, E = S->getNumParams(); I < E; ++I) {
      isl_id *Id = isl_space_get_dim_id(Space, isl_dim_param, I);
      Names = isl_id_to_ast_expr_set(Names, Id, isl_ast_expr_copy(Zero));
    }

    for (auto &Array : S->arrays()) {
      auto Id = Array.second->getBasePtrId();
      Names = isl_id_to_ast_expr_set(Names, Id, isl_ast_expr_copy(Zero));
    }

    isl_space_free(Space);
    isl_ast_expr_free(Zero);

    return Names;
  }

  /// Create a new PPCG scop from the current scop.
  ///
  /// The PPCG scop is initialized with data from the current polly::Scop. From
  /// this initial data, the data-dependences in the PPCG scop are initialized.
  /// We do not use Polly's dependence analysis for now, to ensure we match
  /// the PPCG default behaviour more closely.
  ///
  /// @returns A new ppcg scop.
  ppcg_scop *createPPCGScop() {
    auto PPCGScop = (ppcg_scop *)malloc(sizeof(ppcg_scop));

    PPCGScop->options = createPPCGOptions();

    PPCGScop->start = 0;
    PPCGScop->end = 0;

    PPCGScop->context = S->getContext();
    PPCGScop->domain = S->getDomains();
    PPCGScop->call = nullptr;
    PPCGScop->tagged_reads = getTaggedReads();
    PPCGScop->reads = S->getReads();
    PPCGScop->live_in = nullptr;
    PPCGScop->tagged_may_writes = getTaggedMayWrites();
    PPCGScop->may_writes = S->getWrites();
    PPCGScop->tagged_must_writes = getTaggedMustWrites();
    PPCGScop->must_writes = S->getMustWrites();
    PPCGScop->live_out = nullptr;
    PPCGScop->tagged_must_kills = isl_union_map_empty(S->getParamSpace());
    PPCGScop->tagger = nullptr;

    PPCGScop->independence = nullptr;
    PPCGScop->dep_flow = nullptr;
    PPCGScop->tagged_dep_flow = nullptr;
    PPCGScop->dep_false = nullptr;
    PPCGScop->dep_forced = nullptr;
    PPCGScop->dep_order = nullptr;
    PPCGScop->tagged_dep_order = nullptr;

    PPCGScop->schedule = S->getScheduleTree();
    PPCGScop->names = getNames();

    PPCGScop->pet = nullptr;

    compute_tagger(PPCGScop);
    compute_dependences(PPCGScop);

    return PPCGScop;
  }

  /// Collect the array accesses in a statement.
  ///
  /// @param Stmt The statement for which to collect the accesses.
  ///
  /// @returns A list of array accesses.
  gpu_stmt_access *getStmtAccesses(ScopStmt &Stmt) {
    gpu_stmt_access *Accesses = nullptr;

    for (MemoryAccess *Acc : Stmt) {
      auto Access = isl_alloc_type(S->getIslCtx(), struct gpu_stmt_access);
      Access->read = Acc->isRead();
      Access->write = Acc->isWrite();
      Access->access = Acc->getAccessRelation();
      isl_space *Space = isl_map_get_space(Access->access);
      Space = isl_space_range(Space);
      Space = isl_space_from_range(Space);
      Space = isl_space_set_tuple_id(Space, isl_dim_in, Acc->getId());
      isl_map *Universe = isl_map_universe(Space);
      Access->tagged_access =
          isl_map_domain_product(Acc->getAccessRelation(), Universe);
      Access->exact_write = Acc->isWrite();
      Access->ref_id = Acc->getId();
      Access->next = Accesses;
      Accesses = Access;
    }

    return Accesses;
  }

  /// Collect the list of GPU statements.
  ///
  /// Each statement has an id, a pointer to the underlying data structure,
  /// as well as a list with all memory accesses.
  ///
  /// @returns A linked-list of statements.
  gpu_stmt *getStatements() {
    gpu_stmt *Stmts = isl_calloc_array(S->getIslCtx(), struct gpu_stmt,
                                       std::distance(S->begin(), S->end()));

    int i = 0;
    for (auto &Stmt : *S) {
      gpu_stmt *GPUStmt = &Stmts[i];

      GPUStmt->id = Stmt.getDomainId();

      // We use the pet stmt pointer to keep track of the Polly statements.
      GPUStmt->stmt = (pet_stmt *)&Stmt;
      GPUStmt->accesses = getStmtAccesses(Stmt);
      i++;
    }

    return Stmts;
  }

  /// Derive the extent of an array.
  ///
  /// The extent of an array is defined by the set of memory locations for
  /// which a memory access in the iteration domain exists.
  ///
  /// @param Array The array to derive the extent for.
  ///
  /// @returns An isl_set describing the extent of the array.
  __isl_give isl_set *getExtent(ScopArrayInfo *Array) {
    isl_union_map *Accesses = S->getAccesses();
    Accesses = isl_union_map_intersect_domain(Accesses, S->getDomains());
    isl_union_set *AccessUSet = isl_union_map_range(Accesses);
    isl_set *AccessSet =
        isl_union_set_extract_set(AccessUSet, Array->getSpace());
    isl_union_set_free(AccessUSet);

    return AccessSet;
  }

  /// Derive the bounds of an array.
  ///
  /// For the first dimension we derive the bound of the array from the extent
  /// of this dimension. For inner dimensions we obtain their size directly
  /// from ScopArrayInfo.
  ///
  /// @param PPCGArray The array to compute bounds for.
  /// @param Array The polly array from which to take the information.
  void setArrayBounds(gpu_array_info &PPCGArray, ScopArrayInfo *Array) {
    if (PPCGArray.n_index > 0) {
      // The bound of the outermost dimension is the maximal index accessed in
      // this dimension plus one, i.e., the number of elements in that
      // dimension.
      isl_set *Dom = isl_set_copy(PPCGArray.extent);
      Dom = isl_set_project_out(Dom, isl_dim_set, 1, PPCGArray.n_index - 1);
      isl_pw_aff *Bound = isl_set_dim_max(isl_set_copy(Dom), 0);
      isl_set_free(Dom);
      Dom = isl_pw_aff_domain(isl_pw_aff_copy(Bound));
      isl_local_space *LS = isl_local_space_from_space(isl_set_get_space(Dom));
      isl_aff *One = isl_aff_zero_on_domain(LS);
      One = isl_aff_add_constant_si(One, 1);
      Bound = isl_pw_aff_add(Bound, isl_pw_aff_alloc(Dom, One));
      Bound = isl_pw_aff_gist(Bound, S->getContext());
      PPCGArray.bound[0] = Bound;
    }

    for (unsigned i = 1; i < PPCGArray.n_index; ++i) {
      isl_pw_aff *Bound = Array->getDimensionSizePw(i);
      auto LS = isl_pw_aff_get_domain_space(Bound);
      auto Aff = isl_multi_aff_zero(LS);
      Bound = isl_pw_aff_pullback_multi_aff(Bound, Aff);
      PPCGArray.bound[i] = Bound;
    }
  }

  /// Create the arrays for @p PPCGProg.
  ///
  /// @param PPCGProg The program to compute the arrays for.
  void createArrays(gpu_prog *PPCGProg) {
    int i = 0;
    for (auto &Element : S->arrays()) {
      ScopArrayInfo *Array = Element.second.get();

      std::string TypeName;
      raw_string_ostream OS(TypeName);

      OS << *Array->getElementType();
      TypeName = OS.str();

      gpu_array_info &PPCGArray = PPCGProg->array[i];

      PPCGArray.space = Array->getSpace();
      PPCGArray.type = strdup(TypeName.c_str());
      PPCGArray.size = Array->getElementType()->getPrimitiveSizeInBits() / 8;
      PPCGArray.name = strdup(Array->getName().c_str());
      PPCGArray.extent = nullptr;
      PPCGArray.n_index = Array->getNumberOfDimensions();
      PPCGArray.bound =
          isl_alloc_array(S->getIslCtx(), isl_pw_aff *, PPCGArray.n_index);
      PPCGArray.extent = getExtent(Array);
      PPCGArray.n_ref = 0;
      PPCGArray.refs = nullptr;
      PPCGArray.accessed = true;
      PPCGArray.read_only_scalar = false;
      PPCGArray.has_compound_element = false;
      PPCGArray.local = false;
      PPCGArray.declare_local = false;
      PPCGArray.global = false;
      PPCGArray.linearize = false;
      PPCGArray.dep_order = nullptr;

      setArrayBounds(PPCGArray, Array);
      i++;

      collect_references(PPCGProg, &PPCGArray);
    }
  }

  /// Create an identity map between the arrays in the scop.
  ///
  /// @returns An identity map between the arrays in the scop.
  isl_union_map *getArrayIdentity() {
    isl_union_map *Maps = isl_union_map_empty(S->getParamSpace());

    for (auto &Item : S->arrays()) {
      ScopArrayInfo *Array = Item.second.get();
      isl_space *Space = Array->getSpace();
      Space = isl_space_map_from_set(Space);
      isl_map *Identity = isl_map_identity(Space);
      Maps = isl_union_map_add_map(Maps, Identity);
    }

    return Maps;
  }

  /// Create a default-initialized PPCG GPU program.
  ///
  /// @returns A new GPU program description.
  gpu_prog *createPPCGProg(ppcg_scop *PPCGScop) {

    if (!PPCGScop)
      return nullptr;

    auto PPCGProg = isl_calloc_type(S->getIslCtx(), struct gpu_prog);

    PPCGProg->ctx = S->getIslCtx();
    PPCGProg->scop = PPCGScop;
    PPCGProg->context = isl_set_copy(PPCGScop->context);
    PPCGProg->read = isl_union_map_copy(PPCGScop->reads);
    PPCGProg->may_write = isl_union_map_copy(PPCGScop->may_writes);
    PPCGProg->must_write = isl_union_map_copy(PPCGScop->must_writes);
    PPCGProg->tagged_must_kill =
        isl_union_map_copy(PPCGScop->tagged_must_kills);
    PPCGProg->to_inner = getArrayIdentity();
    PPCGProg->to_outer = getArrayIdentity();
    PPCGProg->may_persist = compute_may_persist(PPCGProg);
    PPCGProg->any_to_outer = nullptr;
    PPCGProg->array_order = nullptr;
    PPCGProg->n_stmts = std::distance(S->begin(), S->end());
    PPCGProg->stmts = getStatements();
    PPCGProg->n_array = std::distance(S->array_begin(), S->array_end());
    PPCGProg->array = isl_calloc_array(S->getIslCtx(), struct gpu_array_info,
                                       PPCGProg->n_array);

    createArrays(PPCGProg);

    return PPCGProg;
  }

  struct PrintGPUUserData {
    struct cuda_info *CudaInfo;
    struct gpu_prog *PPCGProg;
    std::vector<ppcg_kernel *> Kernels;
  };

  /// Print a user statement node in the host code.
  ///
  /// We use ppcg's printing facilities to print the actual statement and
  /// additionally build up a list of all kernels that are encountered in the
  /// host ast.
  ///
  /// @param P The printer to print to
  /// @param Options The printing options to use
  /// @param Node The node to print
  /// @param User A user pointer to carry additional data. This pointer is
  ///             expected to be of type PrintGPUUserData.
  ///
  /// @returns A printer to which the output has been printed.
  static __isl_give isl_printer *
  printHostUser(__isl_take isl_printer *P,
                __isl_take isl_ast_print_options *Options,
                __isl_take isl_ast_node *Node, void *User) {
    auto Data = (struct PrintGPUUserData *)User;
    auto Id = isl_ast_node_get_annotation(Node);

    if (Id) {
      bool IsUser = !strcmp(isl_id_get_name(Id), "user");

      // If this is a user statement, format it ourselves as ppcg would
      // otherwise try to call pet functionality that is not available in
      // Polly.
      if (IsUser) {
        P = isl_printer_start_line(P);
        P = isl_printer_print_ast_node(P, Node);
        P = isl_printer_end_line(P);
        isl_id_free(Id);
        isl_ast_print_options_free(Options);
        return P;
      }

      auto Kernel = (struct ppcg_kernel *)isl_id_get_user(Id);
      isl_id_free(Id);
      Data->Kernels.push_back(Kernel);
    }

    return print_host_user(P, Options, Node, User);
  }

  /// Print C code corresponding to the control flow in @p Kernel.
  ///
  /// @param Kernel The kernel to print
  void printKernel(ppcg_kernel *Kernel) {
    auto *P = isl_printer_to_str(S->getIslCtx());
    P = isl_printer_set_output_format(P, ISL_FORMAT_C);
    auto *Options = isl_ast_print_options_alloc(S->getIslCtx());
    P = isl_ast_node_print(Kernel->tree, P, Options);
    char *String = isl_printer_get_str(P);
    printf("%s\n", String);
    free(String);
    isl_printer_free(P);
  }

  /// Print C code corresponding to the GPU code described by @p Tree.
  ///
  /// @param Tree An AST describing GPU code
  /// @param PPCGProg The PPCG program from which @p Tree has been constructed.
  void printGPUTree(isl_ast_node *Tree, gpu_prog *PPCGProg) {
    auto *P = isl_printer_to_str(S->getIslCtx());
    P = isl_printer_set_output_format(P, ISL_FORMAT_C);

    PrintGPUUserData Data;
    Data.PPCGProg = PPCGProg;

    auto *Options = isl_ast_print_options_alloc(S->getIslCtx());
    Options =
        isl_ast_print_options_set_print_user(Options, printHostUser, &Data);
    P = isl_ast_node_print(Tree, P, Options);
    char *String = isl_printer_get_str(P);
    printf("# host\n");
    printf("%s\n", String);
    free(String);
    isl_printer_free(P);

    for (auto Kernel : Data.Kernels) {
      printf("# kernel%d\n", Kernel->id);
      printKernel(Kernel);
    }
  }

  // Generate a GPU program using PPCG.
  //
  // GPU mapping consists of multiple steps:
  //
  //  1) Compute a new schedule for the program.
  //  2) Map the schedule to the GPU (TODO)
  //  3) Generate code for the new schedule (TODO)
  //
  // We do not use the Polly ScheduleOptimizer here, as the schedule optimizer
  // is mostly CPU specific. Instead, we use PPCG's GPU code generation
  // strategy directly from this pass.
  gpu_gen *generateGPU(ppcg_scop *PPCGScop, gpu_prog *PPCGProg) {

    auto PPCGGen = isl_calloc_type(S->getIslCtx(), struct gpu_gen);

    PPCGGen->ctx = S->getIslCtx();
    PPCGGen->options = PPCGScop->options;
    PPCGGen->print = nullptr;
    PPCGGen->print_user = nullptr;
    PPCGGen->build_ast_expr = &pollyBuildAstExprForStmt;
    PPCGGen->prog = PPCGProg;
    PPCGGen->tree = nullptr;
    PPCGGen->types.n = 0;
    PPCGGen->types.name = nullptr;
    PPCGGen->sizes = nullptr;
    PPCGGen->used_sizes = nullptr;
    PPCGGen->kernel_id = 0;

    // Set the scheduling strategy to the same strategy PPCG uses.
    isl_options_set_schedule_outer_coincidence(PPCGGen->ctx, true);
    isl_options_set_schedule_maximize_band_depth(PPCGGen->ctx, true);
    isl_options_set_schedule_whole_component(PPCGGen->ctx, false);

    isl_schedule *Schedule = get_schedule(PPCGGen);

    int has_permutable = has_any_permutable_node(Schedule);

    // Without at least one permutable band there is nothing PPCG can map to
    // the device, so in that case (or on error) we skip code generation.
    if (!has_permutable || has_permutable < 0) {
      Schedule = isl_schedule_free(Schedule);
    } else {
      Schedule = map_to_device(PPCGGen, Schedule);
      PPCGGen->tree = generate_code(PPCGGen, isl_schedule_copy(Schedule));
    }

    if (DumpSchedule) {
      isl_printer *P = isl_printer_to_str(S->getIslCtx());
      P = isl_printer_set_yaml_style(P, ISL_YAML_STYLE_BLOCK);
      P = isl_printer_print_str(P, "Schedule\n");
      P = isl_printer_print_str(P, "========\n");
      if (Schedule)
        P = isl_printer_print_schedule(P, Schedule);
      else
        P = isl_printer_print_str(P, "No schedule found\n");

      printf("%s\n", isl_printer_get_str(P));
      isl_printer_free(P);
    }

    if (DumpCode) {
      printf("Code\n");
      printf("====\n");
      if (PPCGGen->tree)
        printGPUTree(PPCGGen->tree, PPCGProg);
      else
        printf("No code generated\n");
    }

    isl_schedule_free(Schedule);

    return PPCGGen;
  }

  /// Free a gpu_gen structure.
  ///
  /// @param PPCGGen The gpu_gen object to free.
  void freePPCGGen(gpu_gen *PPCGGen) {
    isl_ast_node_free(PPCGGen->tree);
    isl_union_map_free(PPCGGen->sizes);
    isl_union_map_free(PPCGGen->used_sizes);
    free(PPCGGen);
  }

  /// Free the options in the ppcg scop structure.
  ///
  /// ppcg does not free these options for us. To avoid leaks we do this
  /// ourselves.
  ///
  /// @param PPCGScop The scop referencing the options to free.
  void freeOptions(ppcg_scop *PPCGScop) {
    free(PPCGScop->options->debug);
    PPCGScop->options->debug = nullptr;
    free(PPCGScop->options);
    PPCGScop->options = nullptr;
  }

  /// Generate code for a given GPU AST described by @p Root.
  ///
  /// @param Root An isl_ast_node pointing to the root of the GPU AST.
  /// @param Prog The GPU Program to generate code for.
  void generateCode(__isl_take isl_ast_node *Root, gpu_prog *Prog) {
    ScopAnnotator Annotator;
    Annotator.buildAliasScopes(*S);

    Region *R = &S->getRegion();

    simplifyRegion(R, DT, LI, RI);

    BasicBlock *EnteringBB = R->getEnteringBlock();

    PollyIRBuilder Builder = createPollyIRBuilder(EnteringBB, Annotator);

    GPUNodeBuilder NodeBuilder(Builder, Annotator, this, *DL, *LI, *SE, *DT, *S,
                               Prog);

    // Only build the run-time condition and parameters _after_ having
    // introduced the conditional branch. This is important as the conditional
    // branch will guard the original scop from new induction variables that
    // the SCEVExpander may introduce while code generating the parameters and
    // which may introduce scalar dependences that prevent us from correctly
    // code generating this scop.
    BasicBlock *StartBlock =
        executeScopConditionally(*S, this, Builder.getTrue());

    // TODO: Handle LICM
    // TODO: Verify run-time checks
    auto SplitBlock = StartBlock->getSinglePredecessor();
    Builder.SetInsertPoint(SplitBlock->getTerminator());
    NodeBuilder.addParameters(S->getContext());
    Builder.SetInsertPoint(&*StartBlock->begin());
    NodeBuilder.create(Root);
    NodeBuilder.finalizeSCoP(*S);
  }

  bool runOnScop(Scop &CurrentScop) override {
    S = &CurrentScop;
    LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
    DL = &S->getRegion().getEntry()->getParent()->getParent()->getDataLayout();
    RI = &getAnalysis<RegionInfoPass>().getRegionInfo();

    // We currently do not support scops with invariant loads.
    if (S->hasInvariantAccesses())
      return false;

    auto PPCGScop = createPPCGScop();
    auto PPCGProg = createPPCGProg(PPCGScop);
    auto PPCGGen = generateGPU(PPCGScop, PPCGProg);

    if (PPCGGen->tree)
      generateCode(isl_ast_node_copy(PPCGGen->tree), PPCGProg);

    freeOptions(PPCGScop);
    freePPCGGen(PPCGGen);
    gpu_prog_free(PPCGProg);
    ppcg_scop_free(PPCGScop);

    return true;
  }

  void printScop(raw_ostream &, Scop &) const override {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<RegionInfoPass>();
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addRequired<ScopDetection>();
    AU.addRequired<ScopInfoRegionPass>();
    AU.addRequired<LoopInfoWrapperPass>();

    AU.addPreserved<AAResultsWrapperPass>();
    AU.addPreserved<BasicAAWrapperPass>();
    AU.addPreserved<LoopInfoWrapperPass>();
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addPreserved<GlobalsAAWrapperPass>();
    AU.addPreserved<PostDominatorTreeWrapperPass>();
    AU.addPreserved<ScopDetection>();
    AU.addPreserved<ScalarEvolutionWrapperPass>();
    AU.addPreserved<SCEVAAWrapperPass>();

    // FIXME: We do not yet add regions for the newly generated code to the
    // region tree.
    AU.addPreserved<RegionInfoPass>();
    AU.addPreserved<ScopInfoRegionPass>();
  }
};
} // namespace

char PPCGCodeGeneration::ID = 1;

Pass *polly::createPPCGCodeGenerationPass() { return new PPCGCodeGeneration(); }

INITIALIZE_PASS_BEGIN(PPCGCodeGeneration, "polly-codegen-ppcg",
                      "Polly - Apply PPCG translation to SCOP", false, false)
INITIALIZE_PASS_DEPENDENCY(DependenceInfo);
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass);
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass);
INITIALIZE_PASS_DEPENDENCY(RegionInfoPass);
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass);
INITIALIZE_PASS_DEPENDENCY(ScopDetection);
INITIALIZE_PASS_END(PPCGCodeGeneration, "polly-codegen-ppcg",
                    "Polly - Apply PPCG translation to SCOP", false, false)