1 //===------ CodeGeneration.cpp - Code generate the Scops. -----------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // The CodeGeneration pass takes a Scop created by ScopInfo and translates it 11 // back to LLVM-IR using Cloog. 12 // 13 // The Scop describes the high level memory behaviour of a control flow region. 14 // Transformation passes can update the schedule (execution order) of statements 15 // in the Scop. Cloog is used to generate an abstract syntax tree (clast) that 16 // reflects the updated execution order. This clast is used to create new 17 // LLVM-IR that is computational equivalent to the original control flow region, 18 // but executes its code in the new execution order defined by the changed 19 // scattering. 20 // 21 //===----------------------------------------------------------------------===// 22 23 #include "polly/CodeGen/Cloog.h" 24 #ifdef CLOOG_FOUND 25 26 #include "polly/Dependences.h" 27 #include "polly/LinkAllPasses.h" 28 #include "polly/Options.h" 29 #include "polly/ScopInfo.h" 30 #include "polly/TempScopInfo.h" 31 #include "polly/CodeGen/CodeGeneration.h" 32 #include "polly/CodeGen/BlockGenerators.h" 33 #include "polly/CodeGen/LoopGenerators.h" 34 #include "polly/CodeGen/PTXGenerator.h" 35 #include "polly/CodeGen/Utils.h" 36 #include "polly/Support/GICHelper.h" 37 #include "polly/Support/ScopHelper.h" 38 39 #include "llvm/IR/Module.h" 40 #include "llvm/ADT/SetVector.h" 41 #include "llvm/ADT/PostOrderIterator.h" 42 #include "llvm/Analysis/LoopInfo.h" 43 #include "llvm/Analysis/ScalarEvolutionExpander.h" 44 #include "llvm/Support/Debug.h" 45 #include "llvm/IR/DataLayout.h" 46 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 47 48 #define CLOOG_INT_GMP 1 49 #include "cloog/cloog.h" 50 #include "cloog/isl/cloog.h" 51 52 #include "isl/aff.h" 53 54 #include <vector> 55 #include <utility> 56 57 using namespace polly; 58 using namespace llvm; 59 60 #define DEBUG_TYPE "polly-codegen" 61 62 struct isl_set; 63 64 namespace polly { 65 static cl::opt<bool> 66 OpenMP("enable-polly-openmp", cl::desc("Generate OpenMP parallel code"), 67 cl::value_desc("OpenMP code generation enabled if true"), 68 cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory)); 69 70 #ifdef GPU_CODEGEN 71 static cl::opt<bool> 72 GPGPU("enable-polly-gpgpu", cl::desc("Generate GPU parallel code"), 73 cl::Hidden, cl::value_desc("GPGPU code generation enabled if true"), 74 cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory)); 75 76 static cl::opt<std::string> 77 GPUTriple("polly-gpgpu-triple", 78 cl::desc("Target triple for GPU code generation"), cl::Hidden, 79 cl::init(""), cl::cat(PollyCategory)); 80 #endif /* GPU_CODEGEN */ 81 82 typedef DenseMap<const char *, Value *> CharMapT; 83 84 /// Class to generate LLVM-IR that calculates the value of a clast_expr. 85 class ClastExpCodeGen { 86 PollyIRBuilder &Builder; 87 const CharMapT &IVS; 88 89 Value *codegen(const clast_name *e, Type *Ty); 90 Value *codegen(const clast_term *e, Type *Ty); 91 Value *codegen(const clast_binary *e, Type *Ty); 92 Value *codegen(const clast_reduction *r, Type *Ty); 93 94 public: 95 // A generator for clast expressions. 96 // 97 // @param B The IRBuilder that defines where the code to calculate the 98 // clast expressions should be inserted. 99 // @param IVMAP A Map that translates strings describing the induction 100 // variables to the Values* that represent these variables 101 // on the LLVM side. 102 ClastExpCodeGen(PollyIRBuilder &B, CharMapT &IVMap); 103 104 // Generates code to calculate a given clast expression. 105 // 106 // @param e The expression to calculate. 107 // @return The Value that holds the result. 108 Value *codegen(const clast_expr *e, Type *Ty); 109 }; 110 111 Value *ClastExpCodeGen::codegen(const clast_name *e, Type *Ty) { 112 CharMapT::const_iterator I = IVS.find(e->name); 113 114 assert(I != IVS.end() && "Clast name not found"); 115 116 return Builder.CreateSExtOrBitCast(I->second, Ty); 117 } 118 119 static APInt APInt_from_MPZ(const mpz_t mpz) { 120 uint64_t *p = nullptr; 121 size_t sz; 122 123 p = (uint64_t *)mpz_export(p, &sz, -1, sizeof(uint64_t), 0, 0, mpz); 124 125 if (p) { 126 APInt A((unsigned)mpz_sizeinbase(mpz, 2), (unsigned)sz, p); 127 A = A.zext(A.getBitWidth() + 1); 128 free(p); 129 130 if (mpz_sgn(mpz) == -1) 131 return -A; 132 else 133 return A; 134 } else { 135 uint64_t val = 0; 136 return APInt(1, 1, &val); 137 } 138 } 139 140 Value *ClastExpCodeGen::codegen(const clast_term *e, Type *Ty) { 141 APInt a = APInt_from_MPZ(e->val); 142 143 Value *ConstOne = ConstantInt::get(Builder.getContext(), a); 144 ConstOne = Builder.CreateSExtOrBitCast(ConstOne, Ty); 145 146 if (!e->var) 147 return ConstOne; 148 149 Value *var = codegen(e->var, Ty); 150 return Builder.CreateMul(ConstOne, var); 151 } 152 153 Value *ClastExpCodeGen::codegen(const clast_binary *e, Type *Ty) { 154 Value *LHS = codegen(e->LHS, Ty); 155 156 APInt RHS_AP = APInt_from_MPZ(e->RHS); 157 158 Value *RHS = ConstantInt::get(Builder.getContext(), RHS_AP); 159 RHS = Builder.CreateSExtOrBitCast(RHS, Ty); 160 161 switch (e->type) { 162 case clast_bin_mod: 163 return Builder.CreateSRem(LHS, RHS); 164 case clast_bin_fdiv: { 165 // floord(n,d) ((n < 0) ? (n - d + 1) : n) / d 166 Value *One = ConstantInt::get(Ty, 1); 167 Value *Zero = ConstantInt::get(Ty, 0); 168 Value *Sum1 = Builder.CreateSub(LHS, RHS); 169 Value *Sum2 = Builder.CreateAdd(Sum1, One); 170 Value *isNegative = Builder.CreateICmpSLT(LHS, Zero); 171 Value *Dividend = Builder.CreateSelect(isNegative, Sum2, LHS); 172 return Builder.CreateSDiv(Dividend, RHS); 173 } 174 case clast_bin_cdiv: { 175 // ceild(n,d) ((n < 0) ? n : (n + d - 1)) / d 176 Value *One = ConstantInt::get(Ty, 1); 177 Value *Zero = ConstantInt::get(Ty, 0); 178 Value *Sum1 = Builder.CreateAdd(LHS, RHS); 179 Value *Sum2 = Builder.CreateSub(Sum1, One); 180 Value *isNegative = Builder.CreateICmpSLT(LHS, Zero); 181 Value *Dividend = Builder.CreateSelect(isNegative, LHS, Sum2); 182 return Builder.CreateSDiv(Dividend, RHS); 183 } 184 case clast_bin_div: 185 return Builder.CreateSDiv(LHS, RHS); 186 } 187 188 llvm_unreachable("Unknown clast binary expression type"); 189 } 190 191 Value *ClastExpCodeGen::codegen(const clast_reduction *r, Type *Ty) { 192 assert((r->type == clast_red_min || r->type == clast_red_max || 193 r->type == clast_red_sum) && 194 "Clast reduction type not supported"); 195 Value *old = codegen(r->elts[0], Ty); 196 197 for (int i = 1; i < r->n; ++i) { 198 Value *exprValue = codegen(r->elts[i], Ty); 199 200 switch (r->type) { 201 case clast_red_min: { 202 Value *cmp = Builder.CreateICmpSLT(old, exprValue); 203 old = Builder.CreateSelect(cmp, old, exprValue); 204 break; 205 } 206 case clast_red_max: { 207 Value *cmp = Builder.CreateICmpSGT(old, exprValue); 208 old = Builder.CreateSelect(cmp, old, exprValue); 209 break; 210 } 211 case clast_red_sum: 212 old = Builder.CreateAdd(old, exprValue); 213 break; 214 } 215 } 216 217 return old; 218 } 219 220 ClastExpCodeGen::ClastExpCodeGen(PollyIRBuilder &B, CharMapT &IVMap) 221 : Builder(B), IVS(IVMap) {} 222 223 Value *ClastExpCodeGen::codegen(const clast_expr *e, Type *Ty) { 224 switch (e->type) { 225 case clast_expr_name: 226 return codegen((const clast_name *)e, Ty); 227 case clast_expr_term: 228 return codegen((const clast_term *)e, Ty); 229 case clast_expr_bin: 230 return codegen((const clast_binary *)e, Ty); 231 case clast_expr_red: 232 return codegen((const clast_reduction *)e, Ty); 233 } 234 235 llvm_unreachable("Unknown clast expression!"); 236 } 237 238 class ClastStmtCodeGen { 239 public: 240 const std::vector<std::string> &getParallelLoops(); 241 242 private: 243 // The Scop we code generate. 244 Scop *S; 245 Pass *P; 246 LoopInfo &LI; 247 ScalarEvolution &SE; 248 DominatorTree &DT; 249 250 // The Builder specifies the current location to code generate at. 251 PollyIRBuilder &Builder; 252 253 // Map the Values from the old code to their counterparts in the new code. 254 ValueMapT ValueMap; 255 256 // Map the loops from the old code to expressions function of the induction 257 // variables in the new code. For example, when the code generator produces 258 // this AST: 259 // 260 // for (int c1 = 0; c1 <= 1023; c1 += 1) 261 // for (int c2 = 0; c2 <= 1023; c2 += 1) 262 // Stmt(c2 + 3, c1); 263 // 264 // LoopToScev is a map associating: 265 // "outer loop in the old loop nest" -> SCEV("c2 + 3"), 266 // "inner loop in the old loop nest" -> SCEV("c1"). 267 LoopToScevMapT LoopToScev; 268 269 // clastVars maps from the textual representation of a clast variable to its 270 // current *Value. clast variables are scheduling variables, original 271 // induction variables or parameters. They are used either in loop bounds or 272 // to define the statement instance that is executed. 273 // 274 // for (s = 0; s < n + 3; ++i) 275 // for (t = s; t < m; ++j) 276 // Stmt(i = s + 3 * m, j = t); 277 // 278 // {s,t,i,j,n,m} is the set of clast variables in this clast. 279 CharMapT ClastVars; 280 281 // Codegenerator for clast expressions. 282 ClastExpCodeGen ExpGen; 283 284 // Do we currently generate parallel code? 285 bool parallelCodeGeneration; 286 287 std::vector<std::string> parallelLoops; 288 289 void codegen(const clast_assignment *a); 290 291 void codegen(const clast_assignment *a, ScopStmt *Statement, 292 unsigned Dimension, int vectorDim, 293 std::vector<ValueMapT> *VectorVMap = 0, 294 std::vector<LoopToScevMapT> *VLTS = 0); 295 296 void codegenSubstitutions(const clast_stmt *Assignment, ScopStmt *Statement, 297 int vectorDim = 0, 298 std::vector<ValueMapT> *VectorVMap = 0, 299 std::vector<LoopToScevMapT> *VLTS = 0); 300 301 void codegen(const clast_user_stmt *u, std::vector<Value *> *IVS = nullptr, 302 const char *iterator = nullptr, 303 __isl_take isl_set *scatteringDomain = 0); 304 305 void codegen(const clast_block *b); 306 307 /// @brief Create a classical sequential loop. 308 void codegenForSequential(const clast_for *f); 309 310 /// @brief Create OpenMP structure values. 311 /// 312 /// Create a list of values that has to be stored into the OpenMP subfuncition 313 /// structure. 314 SetVector<Value *> getOMPValues(const clast_stmt *Body); 315 316 /// @brief Update ClastVars and ValueMap according to a value map. 317 /// 318 /// @param VMap A map from old to new values. 319 void updateWithValueMap(OMPGenerator::ValueToValueMapTy &VMap); 320 321 /// @brief Create an OpenMP parallel for loop. 322 /// 323 /// This loop reflects a loop as if it would have been created by an OpenMP 324 /// statement. 325 void codegenForOpenMP(const clast_for *f); 326 327 #ifdef GPU_CODEGEN 328 /// @brief Create GPGPU device memory access values. 329 /// 330 /// Create a list of values that will be set to be parameters of the GPGPU 331 /// subfunction. These parameters represent device memory base addresses 332 /// and the size in bytes. 333 SetVector<Value *> getGPUValues(unsigned &OutputBytes); 334 335 /// @brief Create a GPU parallel for loop. 336 /// 337 /// This loop reflects a loop as if it would have been created by a GPU 338 /// statement. 339 void codegenForGPGPU(const clast_for *F); 340 341 /// @brief Get innermost for loop. 342 const clast_stmt *getScheduleInfo(const clast_for *F, 343 std::vector<int> &NumIters, 344 unsigned &LoopDepth, 345 unsigned &NonPLoopDepth); 346 #endif /* GPU_CODEGEN */ 347 348 /// @brief Check if a loop is parallel 349 /// 350 /// Detect if a clast_for loop can be executed in parallel. 351 /// 352 /// @param For The clast for loop to check. 353 /// 354 /// @return bool Returns true if the incoming clast_for statement can 355 /// execute in parallel. 356 bool isParallelFor(const clast_for *For); 357 358 bool isInnermostLoop(const clast_for *f); 359 360 /// @brief Get the number of loop iterations for this loop. 361 /// @param f The clast for loop to check. 362 int getNumberOfIterations(const clast_for *f); 363 364 /// @brief Create vector instructions for this loop. 365 void codegenForVector(const clast_for *f); 366 367 void codegen(const clast_for *f); 368 369 Value *codegen(const clast_equation *eq); 370 371 void codegen(const clast_guard *g); 372 373 void codegen(const clast_stmt *stmt); 374 375 void addParameters(const CloogNames *names); 376 377 IntegerType *getIntPtrTy(); 378 379 public: 380 void codegen(const clast_root *r); 381 382 ClastStmtCodeGen(Scop *scop, PollyIRBuilder &B, Pass *P); 383 }; 384 } 385 386 IntegerType *ClastStmtCodeGen::getIntPtrTy() { 387 return P->getAnalysis<DataLayoutPass>().getDataLayout().getIntPtrType( 388 Builder.getContext()); 389 } 390 391 const std::vector<std::string> &ClastStmtCodeGen::getParallelLoops() { 392 return parallelLoops; 393 } 394 395 void ClastStmtCodeGen::codegen(const clast_assignment *a) { 396 Value *V = ExpGen.codegen(a->RHS, getIntPtrTy()); 397 ClastVars[a->LHS] = V; 398 } 399 400 void ClastStmtCodeGen::codegen(const clast_assignment *A, ScopStmt *Stmt, 401 unsigned Dim, int VectorDim, 402 std::vector<ValueMapT> *VectorVMap, 403 std::vector<LoopToScevMapT> *VLTS) { 404 Value *RHS; 405 406 assert(!A->LHS && "Statement assignments do not have left hand side"); 407 408 RHS = ExpGen.codegen(A->RHS, Builder.getInt64Ty()); 409 410 const llvm::SCEV *URHS = S->getSE()->getUnknown(RHS); 411 if (VLTS) 412 (*VLTS)[VectorDim][Stmt->getLoopForDimension(Dim)] = URHS; 413 LoopToScev[Stmt->getLoopForDimension(Dim)] = URHS; 414 415 const PHINode *PN = Stmt->getInductionVariableForDimension(Dim); 416 if (PN) { 417 RHS = Builder.CreateTruncOrBitCast(RHS, PN->getType()); 418 419 if (VectorVMap) 420 (*VectorVMap)[VectorDim][PN] = RHS; 421 422 ValueMap[PN] = RHS; 423 } 424 } 425 426 void ClastStmtCodeGen::codegenSubstitutions(const clast_stmt *Assignment, 427 ScopStmt *Statement, int vectorDim, 428 std::vector<ValueMapT> *VectorVMap, 429 std::vector<LoopToScevMapT> *VLTS) { 430 int Dimension = 0; 431 432 while (Assignment) { 433 assert(CLAST_STMT_IS_A(Assignment, stmt_ass) && 434 "Substitions are expected to be assignments"); 435 codegen((const clast_assignment *)Assignment, Statement, Dimension, 436 vectorDim, VectorVMap, VLTS); 437 Assignment = Assignment->next; 438 Dimension++; 439 } 440 } 441 442 // Takes the cloog specific domain and translates it into a map Statement -> 443 // PartialSchedule, where the PartialSchedule contains all the dimensions that 444 // have been code generated up to this point. 445 static __isl_give isl_map *extractPartialSchedule(ScopStmt *Statement, 446 __isl_take isl_set *Domain) { 447 isl_map *Schedule = Statement->getScattering(); 448 int ScheduledDimensions = isl_set_dim(Domain, isl_dim_set); 449 int UnscheduledDimensions = 450 isl_map_dim(Schedule, isl_dim_out) - ScheduledDimensions; 451 452 isl_set_free(Domain); 453 454 return isl_map_project_out(Schedule, isl_dim_out, ScheduledDimensions, 455 UnscheduledDimensions); 456 } 457 458 void ClastStmtCodeGen::codegen(const clast_user_stmt *u, 459 std::vector<Value *> *IVS, const char *iterator, 460 __isl_take isl_set *Domain) { 461 ScopStmt *Statement = (ScopStmt *)u->statement->usr; 462 463 if (u->substitutions) 464 codegenSubstitutions(u->substitutions, Statement); 465 466 int VectorDimensions = IVS ? IVS->size() : 1; 467 468 if (VectorDimensions == 1) { 469 BlockGenerator::generate(Builder, *Statement, ValueMap, LoopToScev, P, LI, 470 SE); 471 isl_set_free(Domain); 472 return; 473 } 474 475 VectorValueMapT VectorMap(VectorDimensions); 476 std::vector<LoopToScevMapT> VLTS(VectorDimensions); 477 478 if (IVS) { 479 assert(u->substitutions && "Substitutions expected!"); 480 int i = 0; 481 for (std::vector<Value *>::iterator II = IVS->begin(), IE = IVS->end(); 482 II != IE; ++II) { 483 ClastVars[iterator] = *II; 484 codegenSubstitutions(u->substitutions, Statement, i, &VectorMap, &VLTS); 485 i++; 486 } 487 } 488 489 // Copy the current value map into all vector maps if the key wasn't 490 // available yet. This is needed in case vector codegen is performed in 491 // OpenMP subfunctions. 492 for (auto KV : ValueMap) 493 for (int i = 0; i < VectorDimensions; ++i) 494 VectorMap[i].insert(KV); 495 496 isl_map *Schedule = extractPartialSchedule(Statement, Domain); 497 VectorBlockGenerator::generate(Builder, *Statement, VectorMap, VLTS, Schedule, 498 P, LI, SE); 499 isl_map_free(Schedule); 500 } 501 502 void ClastStmtCodeGen::codegen(const clast_block *b) { 503 if (b->body) 504 codegen(b->body); 505 } 506 507 void ClastStmtCodeGen::codegenForSequential(const clast_for *f) { 508 Value *LowerBound, *UpperBound, *IV, *Stride; 509 BasicBlock *ExitBlock; 510 Type *IntPtrTy = getIntPtrTy(); 511 512 LowerBound = ExpGen.codegen(f->LB, IntPtrTy); 513 UpperBound = ExpGen.codegen(f->UB, IntPtrTy); 514 Stride = Builder.getInt(APInt_from_MPZ(f->stride)); 515 516 IV = createLoop(LowerBound, UpperBound, Stride, Builder, P, LI, DT, ExitBlock, 517 CmpInst::ICMP_SLE); 518 519 // Add loop iv to symbols. 520 ClastVars[f->iterator] = IV; 521 522 if (f->body) 523 codegen(f->body); 524 525 // Loop is finished, so remove its iv from the live symbols. 526 ClastVars.erase(f->iterator); 527 Builder.SetInsertPoint(ExitBlock->begin()); 528 } 529 530 // Helper class to determine all scalar parameters used in the basic blocks of a 531 // clast. Scalar parameters are scalar variables defined outside of the SCoP. 532 class ParameterVisitor : public ClastVisitor { 533 std::set<Value *> Values; 534 535 public: 536 ParameterVisitor() : ClastVisitor(), Values() {} 537 538 void visitUser(const clast_user_stmt *Stmt) { 539 const ScopStmt *S = static_cast<const ScopStmt *>(Stmt->statement->usr); 540 const BasicBlock *BB = S->getBasicBlock(); 541 542 // Check all the operands of instructions in the basic block. 543 for (const Instruction &Inst : *BB) { 544 for (Value *SrcVal : Inst.operands()) { 545 if (Instruction *OpInst = dyn_cast<Instruction>(SrcVal)) 546 if (S->getParent()->getRegion().contains(OpInst)) 547 continue; 548 549 if (isa<Instruction>(SrcVal) || isa<Argument>(SrcVal)) 550 Values.insert(SrcVal); 551 } 552 } 553 } 554 555 // Iterator to iterate over the values found. 556 typedef std::set<Value *>::const_iterator const_iterator; 557 inline const_iterator begin() const { return Values.begin(); } 558 inline const_iterator end() const { return Values.end(); } 559 }; 560 561 SetVector<Value *> ClastStmtCodeGen::getOMPValues(const clast_stmt *Body) { 562 SetVector<Value *> Values; 563 564 // The clast variables 565 for (CharMapT::iterator I = ClastVars.begin(), E = ClastVars.end(); I != E; 566 I++) 567 Values.insert(I->second); 568 569 // Find the temporaries that are referenced in the clast statements' 570 // basic blocks but are not defined by these blocks (e.g., references 571 // to function arguments or temporaries defined before the start of 572 // the SCoP). 573 ParameterVisitor Params; 574 Params.visit(Body); 575 576 for (ParameterVisitor::const_iterator PI = Params.begin(), PE = Params.end(); 577 PI != PE; ++PI) { 578 Value *V = *PI; 579 Values.insert(V); 580 DEBUG(dbgs() << "Adding temporary for OMP copy-in: " << *V << "\n"); 581 } 582 583 return Values; 584 } 585 586 void 587 ClastStmtCodeGen::updateWithValueMap(OMPGenerator::ValueToValueMapTy &VMap) { 588 std::set<Value *> Inserted; 589 590 for (CharMapT::iterator I = ClastVars.begin(), E = ClastVars.end(); I != E; 591 I++) { 592 ClastVars[I->first] = VMap[I->second]; 593 Inserted.insert(I->second); 594 } 595 596 for (OMPGenerator::ValueToValueMapTy::iterator I = VMap.begin(), 597 E = VMap.end(); 598 I != E; ++I) { 599 if (Inserted.count(I->first)) 600 continue; 601 602 ValueMap[I->first] = I->second; 603 } 604 } 605 606 static void clearDomtree(Function *F, DominatorTree &DT) { 607 DomTreeNode *N = DT.getNode(&F->getEntryBlock()); 608 std::vector<BasicBlock *> Nodes; 609 for (po_iterator<DomTreeNode *> I = po_begin(N), E = po_end(N); I != E; ++I) 610 Nodes.push_back(I->getBlock()); 611 612 for (std::vector<BasicBlock *>::iterator I = Nodes.begin(), E = Nodes.end(); 613 I != E; ++I) 614 DT.eraseNode(*I); 615 } 616 617 void ClastStmtCodeGen::codegenForOpenMP(const clast_for *For) { 618 Value *Stride, *LB, *UB, *IV; 619 BasicBlock::iterator LoopBody; 620 IntegerType *IntPtrTy = getIntPtrTy(); 621 SetVector<Value *> Values; 622 OMPGenerator::ValueToValueMapTy VMap; 623 OMPGenerator OMPGen(Builder, P); 624 625 Stride = Builder.getInt(APInt_from_MPZ(For->stride)); 626 Stride = Builder.CreateSExtOrBitCast(Stride, IntPtrTy); 627 LB = ExpGen.codegen(For->LB, IntPtrTy); 628 UB = ExpGen.codegen(For->UB, IntPtrTy); 629 630 Values = getOMPValues(For->body); 631 632 IV = OMPGen.createParallelLoop(LB, UB, Stride, Values, VMap, &LoopBody); 633 BasicBlock::iterator AfterLoop = Builder.GetInsertPoint(); 634 Builder.SetInsertPoint(LoopBody); 635 636 // Save the current values. 637 const ValueMapT ValueMapCopy = ValueMap; 638 const CharMapT ClastVarsCopy = ClastVars; 639 640 updateWithValueMap(VMap); 641 ClastVars[For->iterator] = IV; 642 643 if (For->body) 644 codegen(For->body); 645 646 // Restore the original values. 647 ValueMap = ValueMapCopy; 648 ClastVars = ClastVarsCopy; 649 650 clearDomtree((*LoopBody).getParent()->getParent(), 651 P->getAnalysis<DominatorTreeWrapperPass>().getDomTree()); 652 653 Builder.SetInsertPoint(AfterLoop); 654 } 655 656 #ifdef GPU_CODEGEN 657 static unsigned getArraySizeInBytes(const ArrayType *AT) { 658 unsigned Bytes = AT->getNumElements(); 659 if (const ArrayType *T = dyn_cast<ArrayType>(AT->getElementType())) 660 Bytes *= getArraySizeInBytes(T); 661 else 662 Bytes *= AT->getElementType()->getPrimitiveSizeInBits() / 8; 663 664 return Bytes; 665 } 666 667 SetVector<Value *> ClastStmtCodeGen::getGPUValues(unsigned &OutputBytes) { 668 SetVector<Value *> Values; 669 OutputBytes = 0; 670 671 // Record the memory reference base addresses. 672 for (ScopStmt *Stmt : *S) { 673 for (MemoryAccess *MA : *Stmt) { 674 Value *BaseAddr = MA->getBaseAddr(); 675 Values.insert((BaseAddr)); 676 677 // FIXME: we assume that there is one and only one array to be written 678 // in a SCoP. 679 int NumWrites = 0; 680 if (MA->isWrite()) { 681 ++NumWrites; 682 assert(NumWrites <= 1 && 683 "We support at most one array to be written in a SCoP."); 684 if (const PointerType *PT = 685 dyn_cast<PointerType>(BaseAddr->getType())) { 686 Type *T = PT->getArrayElementType(); 687 const ArrayType *ATy = dyn_cast<ArrayType>(T); 688 OutputBytes = getArraySizeInBytes(ATy); 689 } 690 } 691 } 692 } 693 694 return Values; 695 } 696 697 const clast_stmt *ClastStmtCodeGen::getScheduleInfo(const clast_for *F, 698 std::vector<int> &NumIters, 699 unsigned &LoopDepth, 700 unsigned &NonPLoopDepth) { 701 clast_stmt *Stmt = (clast_stmt *)F; 702 const clast_for *Result; 703 bool NonParaFlag = false; 704 LoopDepth = 0; 705 NonPLoopDepth = 0; 706 707 while (Stmt) { 708 if (CLAST_STMT_IS_A(Stmt, stmt_for)) { 709 const clast_for *T = (clast_for *)Stmt; 710 if (isParallelFor(T)) { 711 if (!NonParaFlag) { 712 NumIters.push_back(getNumberOfIterations(T)); 713 Result = T; 714 } 715 } else 716 NonParaFlag = true; 717 718 Stmt = T->body; 719 LoopDepth++; 720 continue; 721 } 722 Stmt = Stmt->next; 723 } 724 725 assert(NumIters.size() == 4 && 726 "The loops should be tiled into 4-depth parallel loops and an " 727 "innermost non-parallel one (if exist)."); 728 NonPLoopDepth = LoopDepth - NumIters.size(); 729 assert(NonPLoopDepth <= 1 && 730 "We support only one innermost non-parallel loop currently."); 731 return (const clast_stmt *)Result->body; 732 } 733 734 void ClastStmtCodeGen::codegenForGPGPU(const clast_for *F) { 735 BasicBlock::iterator LoopBody; 736 SetVector<Value *> Values; 737 SetVector<Value *> IVS; 738 std::vector<int> NumIterations; 739 PTXGenerator::ValueToValueMapTy VMap; 740 741 assert(!GPUTriple.empty() && 742 "Target triple should be set properly for GPGPU code generation."); 743 PTXGenerator PTXGen(Builder, P, GPUTriple); 744 745 // Get original IVS and ScopStmt 746 unsigned TiledLoopDepth, NonPLoopDepth; 747 const clast_stmt *InnerStmt = 748 getScheduleInfo(F, NumIterations, TiledLoopDepth, NonPLoopDepth); 749 const clast_stmt *TmpStmt; 750 const clast_user_stmt *U; 751 const clast_for *InnerFor; 752 if (CLAST_STMT_IS_A(InnerStmt, stmt_for)) { 753 InnerFor = (const clast_for *)InnerStmt; 754 TmpStmt = InnerFor->body; 755 } else 756 TmpStmt = InnerStmt; 757 U = (const clast_user_stmt *)TmpStmt; 758 ScopStmt *Statement = (ScopStmt *)U->statement->usr; 759 for (unsigned i = 0; i < Statement->getNumIterators() - NonPLoopDepth; i++) { 760 const Value *IV = Statement->getInductionVariableForDimension(i); 761 IVS.insert(const_cast<Value *>(IV)); 762 } 763 764 unsigned OutBytes; 765 Values = getGPUValues(OutBytes); 766 PTXGen.setOutputBytes(OutBytes); 767 PTXGen.startGeneration(Values, IVS, VMap, &LoopBody); 768 769 BasicBlock::iterator AfterLoop = Builder.GetInsertPoint(); 770 Builder.SetInsertPoint(LoopBody); 771 772 BasicBlock *AfterBB = 0; 773 if (NonPLoopDepth) { 774 Value *LowerBound, *UpperBound, *IV, *Stride; 775 Type *IntPtrTy = getIntPtrTy(); 776 LowerBound = ExpGen.codegen(InnerFor->LB, IntPtrTy); 777 UpperBound = ExpGen.codegen(InnerFor->UB, IntPtrTy); 778 Stride = Builder.getInt(APInt_from_MPZ(InnerFor->stride)); 779 IV = createLoop(LowerBound, UpperBound, Stride, Builder, P, LI, DT, AfterBB, 780 CmpInst::ICMP_SLE); 781 const Value *OldIV_ = Statement->getInductionVariableForDimension(2); 782 Value *OldIV = const_cast<Value *>(OldIV_); 783 VMap.insert(std::make_pair(OldIV, IV)); 784 } 785 786 updateWithValueMap(VMap); 787 788 BlockGenerator::generate(Builder, *Statement, ValueMap, LoopToScev, P, LI, 789 SE); 790 791 if (AfterBB) 792 Builder.SetInsertPoint(AfterBB->begin()); 793 794 // FIXME: The replacement of the host base address with the parameter of ptx 795 // subfunction should have been done by updateWithValueMap. We use the 796 // following codes to avoid affecting other parts of Polly. This should be 797 // fixed later. 798 Function *FN = Builder.GetInsertBlock()->getParent(); 799 for (unsigned j = 0; j < Values.size(); j++) { 800 Value *baseAddr = Values[j]; 801 for (Function::iterator B = FN->begin(); B != FN->end(); ++B) { 802 for (BasicBlock::iterator I = B->begin(); I != B->end(); ++I) 803 I->replaceUsesOfWith(baseAddr, ValueMap[baseAddr]); 804 } 805 } 806 Builder.SetInsertPoint(AfterLoop); 807 PTXGen.setLaunchingParameters(NumIterations[0], NumIterations[1], 808 NumIterations[2], NumIterations[3]); 809 PTXGen.finishGeneration(FN); 810 } 811 #endif 812 813 bool ClastStmtCodeGen::isInnermostLoop(const clast_for *f) { 814 const clast_stmt *stmt = f->body; 815 816 while (stmt) { 817 if (!CLAST_STMT_IS_A(stmt, stmt_user)) 818 return false; 819 820 stmt = stmt->next; 821 } 822 823 return true; 824 } 825 826 int ClastStmtCodeGen::getNumberOfIterations(const clast_for *For) { 827 isl_set *LoopDomain = isl_set_copy(isl_set_from_cloog_domain(For->domain)); 828 int NumberOfIterations = polly::getNumberOfIterations(LoopDomain); 829 if (NumberOfIterations == -1) 830 return -1; 831 return NumberOfIterations / mpz_get_si(For->stride) + 1; 832 } 833 834 void ClastStmtCodeGen::codegenForVector(const clast_for *F) { 835 DEBUG(dbgs() << "Vectorizing loop '" << F->iterator << "'\n";); 836 int VectorWidth = getNumberOfIterations(F); 837 838 Value *LB = ExpGen.codegen(F->LB, getIntPtrTy()); 839 840 APInt Stride = APInt_from_MPZ(F->stride); 841 IntegerType *LoopIVType = dyn_cast<IntegerType>(LB->getType()); 842 Stride = Stride.zext(LoopIVType->getBitWidth()); 843 Value *StrideValue = ConstantInt::get(LoopIVType, Stride); 844 845 std::vector<Value *> IVS(VectorWidth); 846 IVS[0] = LB; 847 848 for (int i = 1; i < VectorWidth; i++) 849 IVS[i] = Builder.CreateAdd(IVS[i - 1], StrideValue, "p_vector_iv"); 850 851 isl_set *Domain = isl_set_copy(isl_set_from_cloog_domain(F->domain)); 852 853 // Add loop iv to symbols. 854 ClastVars[F->iterator] = LB; 855 856 const clast_stmt *Stmt = F->body; 857 858 while (Stmt) { 859 codegen((const clast_user_stmt *)Stmt, &IVS, F->iterator, 860 isl_set_copy(Domain)); 861 Stmt = Stmt->next; 862 } 863 864 // Loop is finished, so remove its iv from the live symbols. 865 isl_set_free(Domain); 866 ClastVars.erase(F->iterator); 867 } 868 869 static isl_union_map *getCombinedScheduleForSpace(Scop *S, unsigned dimLevel) { 870 isl_space *Space = S->getParamSpace(); 871 isl_union_map *schedule = isl_union_map_empty(Space); 872 873 for (ScopStmt *Stmt : *S) { 874 unsigned remainingDimensions = Stmt->getNumScattering() - dimLevel; 875 isl_map *Scattering = isl_map_project_out( 876 Stmt->getScattering(), isl_dim_out, dimLevel, remainingDimensions); 877 schedule = isl_union_map_add_map(schedule, Scattering); 878 } 879 880 return schedule; 881 } 882 883 bool ClastStmtCodeGen::isParallelFor(const clast_for *f) { 884 isl_set *Domain = isl_set_copy(isl_set_from_cloog_domain(f->domain)); 885 assert(Domain && "Cannot access domain of loop"); 886 887 Dependences &D = P->getAnalysis<Dependences>(); 888 isl_union_map *Deps = 889 D.getDependences(Dependences::TYPE_RAW | Dependences::TYPE_WAW | 890 Dependences::TYPE_WAR | Dependences::TYPE_RAW); 891 isl_union_map *Schedule = 892 getCombinedScheduleForSpace(S, isl_set_n_dim(Domain)); 893 Schedule = 894 isl_union_map_intersect_range(Schedule, isl_union_set_from_set(Domain)); 895 bool IsParallel = D.isParallel(Schedule, Deps); 896 isl_union_map_free(Schedule); 897 return IsParallel; 898 } 899 900 void ClastStmtCodeGen::codegen(const clast_for *f) { 901 bool Vector = PollyVectorizerChoice != VECTORIZER_NONE; 902 if ((Vector || OpenMP) && isParallelFor(f)) { 903 if (Vector && isInnermostLoop(f) && (-1 != getNumberOfIterations(f)) && 904 (getNumberOfIterations(f) <= 16)) { 905 codegenForVector(f); 906 return; 907 } 908 909 if (OpenMP && !parallelCodeGeneration) { 910 parallelCodeGeneration = true; 911 parallelLoops.push_back(f->iterator); 912 codegenForOpenMP(f); 913 parallelCodeGeneration = false; 914 return; 915 } 916 } 917 918 #ifdef GPU_CODEGEN 919 if (GPGPU && isParallelFor(f)) { 920 if (!parallelCodeGeneration) { 921 parallelCodeGeneration = true; 922 parallelLoops.push_back(f->iterator); 923 codegenForGPGPU(f); 924 parallelCodeGeneration = false; 925 return; 926 } 927 } 928 #endif 929 930 codegenForSequential(f); 931 } 932 933 Value *ClastStmtCodeGen::codegen(const clast_equation *eq) { 934 Value *LHS = ExpGen.codegen(eq->LHS, getIntPtrTy()); 935 Value *RHS = ExpGen.codegen(eq->RHS, getIntPtrTy()); 936 CmpInst::Predicate P; 937 938 if (eq->sign == 0) 939 P = ICmpInst::ICMP_EQ; 940 else if (eq->sign > 0) 941 P = ICmpInst::ICMP_SGE; 942 else 943 P = ICmpInst::ICMP_SLE; 944 945 return Builder.CreateICmp(P, LHS, RHS); 946 } 947 948 void ClastStmtCodeGen::codegen(const clast_guard *g) { 949 Function *F = Builder.GetInsertBlock()->getParent(); 950 LLVMContext &Context = F->getContext(); 951 952 BasicBlock *CondBB = 953 SplitBlock(Builder.GetInsertBlock(), Builder.GetInsertPoint(), P); 954 CondBB->setName("polly.cond"); 955 BasicBlock *MergeBB = SplitBlock(CondBB, CondBB->begin(), P); 956 MergeBB->setName("polly.merge"); 957 BasicBlock *ThenBB = BasicBlock::Create(Context, "polly.then", F); 958 959 DominatorTree &DT = P->getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 960 DT.addNewBlock(ThenBB, CondBB); 961 DT.changeImmediateDominator(MergeBB, CondBB); 962 963 CondBB->getTerminator()->eraseFromParent(); 964 965 Builder.SetInsertPoint(CondBB); 966 967 Value *Predicate = codegen(&(g->eq[0])); 968 969 for (int i = 1; i < g->n; ++i) { 970 Value *TmpPredicate = codegen(&(g->eq[i])); 971 Predicate = Builder.CreateAnd(Predicate, TmpPredicate); 972 } 973 974 Builder.CreateCondBr(Predicate, ThenBB, MergeBB); 975 Builder.SetInsertPoint(ThenBB); 976 Builder.CreateBr(MergeBB); 977 Builder.SetInsertPoint(ThenBB->begin()); 978 979 LoopInfo &LI = P->getAnalysis<LoopInfo>(); 980 Loop *L = LI.getLoopFor(CondBB); 981 if (L) 982 L->addBasicBlockToLoop(ThenBB, LI.getBase()); 983 984 codegen(g->then); 985 986 Builder.SetInsertPoint(MergeBB->begin()); 987 } 988 989 void ClastStmtCodeGen::codegen(const clast_stmt *stmt) { 990 if (CLAST_STMT_IS_A(stmt, stmt_root)) 991 assert(false && "No second root statement expected"); 992 else if (CLAST_STMT_IS_A(stmt, stmt_ass)) 993 codegen((const clast_assignment *)stmt); 994 else if (CLAST_STMT_IS_A(stmt, stmt_user)) 995 codegen((const clast_user_stmt *)stmt); 996 else if (CLAST_STMT_IS_A(stmt, stmt_block)) 997 codegen((const clast_block *)stmt); 998 else if (CLAST_STMT_IS_A(stmt, stmt_for)) 999 codegen((const clast_for *)stmt); 1000 else if (CLAST_STMT_IS_A(stmt, stmt_guard)) 1001 codegen((const clast_guard *)stmt); 1002 1003 if (stmt->next) 1004 codegen(stmt->next); 1005 } 1006 1007 void ClastStmtCodeGen::addParameters(const CloogNames *names) { 1008 SCEVExpander Rewriter(P->getAnalysis<ScalarEvolution>(), "polly"); 1009 1010 int i = 0; 1011 for (Scop::param_iterator PI = S->param_begin(), PE = S->param_end(); 1012 PI != PE; ++PI) { 1013 assert(i < names->nb_parameters && "Not enough parameter names"); 1014 1015 const SCEV *Param = *PI; 1016 Type *Ty = Param->getType(); 1017 1018 Instruction *insertLocation = --(Builder.GetInsertBlock()->end()); 1019 Value *V = Rewriter.expandCodeFor(Param, Ty, insertLocation); 1020 ClastVars[names->parameters[i]] = V; 1021 1022 ++i; 1023 } 1024 } 1025 1026 void ClastStmtCodeGen::codegen(const clast_root *r) { 1027 addParameters(r->names); 1028 1029 parallelCodeGeneration = false; 1030 1031 const clast_stmt *stmt = (const clast_stmt *)r; 1032 if (stmt->next) 1033 codegen(stmt->next); 1034 } 1035 1036 ClastStmtCodeGen::ClastStmtCodeGen(Scop *scop, PollyIRBuilder &B, Pass *P) 1037 : S(scop), P(P), LI(P->getAnalysis<LoopInfo>()), 1038 SE(P->getAnalysis<ScalarEvolution>()), 1039 DT(P->getAnalysis<DominatorTreeWrapperPass>().getDomTree()), Builder(B), 1040 ExpGen(Builder, ClastVars) {} 1041 1042 namespace { 1043 class CodeGeneration : public ScopPass { 1044 std::vector<std::string> ParallelLoops; 1045 1046 public: 1047 static char ID; 1048 1049 CodeGeneration() : ScopPass(ID) {} 1050 1051 bool runOnScop(Scop &S) { 1052 ParallelLoops.clear(); 1053 1054 assert(!S.getRegion().isTopLevelRegion() && 1055 "Top level regions are not supported"); 1056 1057 simplifyRegion(&S, this); 1058 1059 BasicBlock *StartBlock = executeScopConditionally(S, this); 1060 1061 PollyIRBuilder Builder(StartBlock->begin()); 1062 1063 ClastStmtCodeGen CodeGen(&S, Builder, this); 1064 CloogInfo &C = getAnalysis<CloogInfo>(); 1065 CodeGen.codegen(C.getClast()); 1066 1067 ParallelLoops.insert(ParallelLoops.begin(), 1068 CodeGen.getParallelLoops().begin(), 1069 CodeGen.getParallelLoops().end()); 1070 return true; 1071 } 1072 1073 virtual void printScop(raw_ostream &OS) const { 1074 for (std::vector<std::string>::const_iterator PI = ParallelLoops.begin(), 1075 PE = ParallelLoops.end(); 1076 PI != PE; ++PI) 1077 OS << "Parallel loop with iterator '" << *PI << "' generated\n"; 1078 } 1079 1080 virtual void getAnalysisUsage(AnalysisUsage &AU) const { 1081 AU.addRequired<CloogInfo>(); 1082 AU.addRequired<Dependences>(); 1083 AU.addRequired<DominatorTreeWrapperPass>(); 1084 AU.addRequired<RegionInfoPass>(); 1085 AU.addRequired<ScalarEvolution>(); 1086 AU.addRequired<ScopDetection>(); 1087 AU.addRequired<ScopInfo>(); 1088 AU.addRequired<DataLayoutPass>(); 1089 AU.addRequired<LoopInfo>(); 1090 1091 AU.addPreserved<CloogInfo>(); 1092 AU.addPreserved<Dependences>(); 1093 AU.addPreserved<LoopInfo>(); 1094 AU.addPreserved<DominatorTreeWrapperPass>(); 1095 AU.addPreserved<ScopDetection>(); 1096 AU.addPreserved<ScalarEvolution>(); 1097 1098 // FIXME: We do not yet add regions for the newly generated code to the 1099 // region tree. 1100 AU.addPreserved<RegionInfoPass>(); 1101 AU.addPreserved<TempScopInfo>(); 1102 AU.addPreserved<ScopInfo>(); 1103 AU.addPreservedID(IndependentBlocksID); 1104 } 1105 }; 1106 } 1107 1108 char CodeGeneration::ID = 1; 1109 1110 Pass *polly::createCodeGenerationPass() { return new CodeGeneration(); } 1111 1112 INITIALIZE_PASS_BEGIN(CodeGeneration, "polly-codegen", 1113 "Polly - Create LLVM-IR from SCoPs", false, false); 1114 INITIALIZE_PASS_DEPENDENCY(CloogInfo); 1115 INITIALIZE_PASS_DEPENDENCY(Dependences); 1116 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass); 1117 INITIALIZE_PASS_DEPENDENCY(RegionInfoPass); 1118 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution); 1119 INITIALIZE_PASS_DEPENDENCY(ScopDetection); 1120 INITIALIZE_PASS_DEPENDENCY(DataLayoutPass); 1121 INITIALIZE_PASS_END(CodeGeneration, "polly-codegen", 1122 "Polly - Create LLVM-IR from SCoPs", false, false) 1123 1124 #endif // CLOOG_FOUND 1125