1 //===------ CodeGeneration.cpp - Code generate the Scops. -----------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // The CodeGeneration pass takes a Scop created by ScopInfo and translates it 11 // back to LLVM-IR using Cloog. 12 // 13 // The Scop describes the high level memory behaviour of a control flow region. 14 // Transformation passes can update the schedule (execution order) of statements 15 // in the Scop. Cloog is used to generate an abstract syntax tree (clast) that 16 // reflects the updated execution order. This clast is used to create new 17 // LLVM-IR that is computational equivalent to the original control flow region, 18 // but executes its code in the new execution order defined by the changed 19 // scattering. 20 // 21 //===----------------------------------------------------------------------===// 22 23 #include "polly/CodeGen/Cloog.h" 24 #ifdef CLOOG_FOUND 25 26 #define DEBUG_TYPE "polly-codegen" 27 #include "polly/Dependences.h" 28 #include "polly/LinkAllPasses.h" 29 #include "polly/Options.h" 30 #include "polly/ScopInfo.h" 31 #include "polly/TempScopInfo.h" 32 #include "polly/CodeGen/CodeGeneration.h" 33 #include "polly/CodeGen/BlockGenerators.h" 34 #include "polly/CodeGen/LoopGenerators.h" 35 #include "polly/CodeGen/PTXGenerator.h" 36 #include "polly/CodeGen/Utils.h" 37 #include "polly/Support/GICHelper.h" 38 #include "polly/Support/ScopHelper.h" 39 40 #include "llvm/IR/Module.h" 41 #include "llvm/ADT/SetVector.h" 42 #include "llvm/ADT/PostOrderIterator.h" 43 #include "llvm/Analysis/LoopInfo.h" 44 #include "llvm/Analysis/ScalarEvolutionExpander.h" 45 #include "llvm/Support/Debug.h" 46 #include "llvm/IR/DataLayout.h" 47 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 48 49 #define CLOOG_INT_GMP 1 50 #include "cloog/cloog.h" 51 #include "cloog/isl/cloog.h" 52 53 #include "isl/aff.h" 54 55 #include <vector> 56 #include <utility> 57 58 using namespace polly; 59 using namespace llvm; 60 61 struct isl_set; 62 63 namespace polly { 64 static cl::opt<bool> 65 OpenMP("enable-polly-openmp", cl::desc("Generate OpenMP parallel code"), 66 cl::value_desc("OpenMP code generation enabled if true"), 67 cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory)); 68 69 #ifdef GPU_CODEGEN 70 static cl::opt<bool> 71 GPGPU("enable-polly-gpgpu", cl::desc("Generate GPU parallel code"), cl::Hidden, 72 cl::value_desc("GPGPU code generation enabled if true"), cl::init(false), 73 cl::ZeroOrMore, cl::cat(PollyCategory)); 74 75 static cl::opt<std::string> 76 GPUTriple("polly-gpgpu-triple", 77 cl::desc("Target triple for GPU code generation"), cl::Hidden, 78 cl::init(""), cl::cat(PollyCategory)); 79 #endif /* GPU_CODEGEN */ 80 81 typedef DenseMap<const char *, Value *> CharMapT; 82 83 /// Class to generate LLVM-IR that calculates the value of a clast_expr. 84 class ClastExpCodeGen { 85 PollyIRBuilder &Builder; 86 const CharMapT &IVS; 87 88 Value *codegen(const clast_name *e, Type *Ty); 89 Value *codegen(const clast_term *e, Type *Ty); 90 Value *codegen(const clast_binary *e, Type *Ty); 91 Value *codegen(const clast_reduction *r, Type *Ty); 92 93 public: 94 // A generator for clast expressions. 95 // 96 // @param B The IRBuilder that defines where the code to calculate the 97 // clast expressions should be inserted. 98 // @param IVMAP A Map that translates strings describing the induction 99 // variables to the Values* that represent these variables 100 // on the LLVM side. 101 ClastExpCodeGen(PollyIRBuilder &B, CharMapT &IVMap); 102 103 // Generates code to calculate a given clast expression. 104 // 105 // @param e The expression to calculate. 106 // @return The Value that holds the result. 107 Value *codegen(const clast_expr *e, Type *Ty); 108 }; 109 110 Value *ClastExpCodeGen::codegen(const clast_name *e, Type *Ty) { 111 CharMapT::const_iterator I = IVS.find(e->name); 112 113 assert(I != IVS.end() && "Clast name not found"); 114 115 return Builder.CreateSExtOrBitCast(I->second, Ty); 116 } 117 118 static APInt APInt_from_MPZ(const mpz_t mpz) { 119 uint64_t *p = NULL; 120 size_t sz; 121 122 p = (uint64_t *)mpz_export(p, &sz, -1, sizeof(uint64_t), 0, 0, mpz); 123 124 if (p) { 125 APInt A((unsigned)mpz_sizeinbase(mpz, 2), (unsigned)sz, p); 126 A = A.zext(A.getBitWidth() + 1); 127 free(p); 128 129 if (mpz_sgn(mpz) == -1) 130 return -A; 131 else 132 return A; 133 } else { 134 uint64_t val = 0; 135 return APInt(1, 1, &val); 136 } 137 } 138 139 Value *ClastExpCodeGen::codegen(const clast_term *e, Type *Ty) { 140 APInt a = APInt_from_MPZ(e->val); 141 142 Value *ConstOne = ConstantInt::get(Builder.getContext(), a); 143 ConstOne = Builder.CreateSExtOrBitCast(ConstOne, Ty); 144 145 if (!e->var) 146 return ConstOne; 147 148 Value *var = codegen(e->var, Ty); 149 return Builder.CreateMul(ConstOne, var); 150 } 151 152 Value *ClastExpCodeGen::codegen(const clast_binary *e, Type *Ty) { 153 Value *LHS = codegen(e->LHS, Ty); 154 155 APInt RHS_AP = APInt_from_MPZ(e->RHS); 156 157 Value *RHS = ConstantInt::get(Builder.getContext(), RHS_AP); 158 RHS = Builder.CreateSExtOrBitCast(RHS, Ty); 159 160 switch (e->type) { 161 case clast_bin_mod: 162 return Builder.CreateSRem(LHS, RHS); 163 case clast_bin_fdiv: { 164 // floord(n,d) ((n < 0) ? (n - d + 1) : n) / d 165 Value *One = ConstantInt::get(Ty, 1); 166 Value *Zero = ConstantInt::get(Ty, 0); 167 Value *Sum1 = Builder.CreateSub(LHS, RHS); 168 Value *Sum2 = Builder.CreateAdd(Sum1, One); 169 Value *isNegative = Builder.CreateICmpSLT(LHS, Zero); 170 Value *Dividend = Builder.CreateSelect(isNegative, Sum2, LHS); 171 return Builder.CreateSDiv(Dividend, RHS); 172 } 173 case clast_bin_cdiv: { 174 // ceild(n,d) ((n < 0) ? n : (n + d - 1)) / d 175 Value *One = ConstantInt::get(Ty, 1); 176 Value *Zero = ConstantInt::get(Ty, 0); 177 Value *Sum1 = Builder.CreateAdd(LHS, RHS); 178 Value *Sum2 = Builder.CreateSub(Sum1, One); 179 Value *isNegative = Builder.CreateICmpSLT(LHS, Zero); 180 Value *Dividend = Builder.CreateSelect(isNegative, LHS, Sum2); 181 return Builder.CreateSDiv(Dividend, RHS); 182 } 183 case clast_bin_div: 184 return Builder.CreateSDiv(LHS, RHS); 185 } 186 187 llvm_unreachable("Unknown clast binary expression type"); 188 } 189 190 Value *ClastExpCodeGen::codegen(const clast_reduction *r, Type *Ty) { 191 assert((r->type == clast_red_min || r->type == clast_red_max || 192 r->type == clast_red_sum) && 193 "Clast reduction type not supported"); 194 Value *old = codegen(r->elts[0], Ty); 195 196 for (int i = 1; i < r->n; ++i) { 197 Value *exprValue = codegen(r->elts[i], Ty); 198 199 switch (r->type) { 200 case clast_red_min: { 201 Value *cmp = Builder.CreateICmpSLT(old, exprValue); 202 old = Builder.CreateSelect(cmp, old, exprValue); 203 break; 204 } 205 case clast_red_max: { 206 Value *cmp = Builder.CreateICmpSGT(old, exprValue); 207 old = Builder.CreateSelect(cmp, old, exprValue); 208 break; 209 } 210 case clast_red_sum: 211 old = Builder.CreateAdd(old, exprValue); 212 break; 213 } 214 } 215 216 return old; 217 } 218 219 ClastExpCodeGen::ClastExpCodeGen(PollyIRBuilder &B, CharMapT &IVMap) 220 : Builder(B), IVS(IVMap) {} 221 222 Value *ClastExpCodeGen::codegen(const clast_expr *e, Type *Ty) { 223 switch (e->type) { 224 case clast_expr_name: 225 return codegen((const clast_name *)e, Ty); 226 case clast_expr_term: 227 return codegen((const clast_term *)e, Ty); 228 case clast_expr_bin: 229 return codegen((const clast_binary *)e, Ty); 230 case clast_expr_red: 231 return codegen((const clast_reduction *)e, Ty); 232 } 233 234 llvm_unreachable("Unknown clast expression!"); 235 } 236 237 class ClastStmtCodeGen { 238 public: 239 const std::vector<std::string> &getParallelLoops(); 240 241 private: 242 // The Scop we code generate. 243 Scop *S; 244 Pass *P; 245 246 // The Builder specifies the current location to code generate at. 247 PollyIRBuilder &Builder; 248 249 // Map the Values from the old code to their counterparts in the new code. 250 ValueMapT ValueMap; 251 252 // Map the loops from the old code to expressions function of the induction 253 // variables in the new code. For example, when the code generator produces 254 // this AST: 255 // 256 // for (int c1 = 0; c1 <= 1023; c1 += 1) 257 // for (int c2 = 0; c2 <= 1023; c2 += 1) 258 // Stmt(c2 + 3, c1); 259 // 260 // LoopToScev is a map associating: 261 // "outer loop in the old loop nest" -> SCEV("c2 + 3"), 262 // "inner loop in the old loop nest" -> SCEV("c1"). 263 LoopToScevMapT LoopToScev; 264 265 // clastVars maps from the textual representation of a clast variable to its 266 // current *Value. clast variables are scheduling variables, original 267 // induction variables or parameters. They are used either in loop bounds or 268 // to define the statement instance that is executed. 269 // 270 // for (s = 0; s < n + 3; ++i) 271 // for (t = s; t < m; ++j) 272 // Stmt(i = s + 3 * m, j = t); 273 // 274 // {s,t,i,j,n,m} is the set of clast variables in this clast. 275 CharMapT ClastVars; 276 277 // Codegenerator for clast expressions. 278 ClastExpCodeGen ExpGen; 279 280 // Do we currently generate parallel code? 281 bool parallelCodeGeneration; 282 283 std::vector<std::string> parallelLoops; 284 285 void codegen(const clast_assignment *a); 286 287 void codegen(const clast_assignment *a, ScopStmt *Statement, 288 unsigned Dimension, int vectorDim, 289 std::vector<ValueMapT> *VectorVMap = 0, 290 std::vector<LoopToScevMapT> *VLTS = 0); 291 292 void codegenSubstitutions(const clast_stmt *Assignment, ScopStmt *Statement, 293 int vectorDim = 0, 294 std::vector<ValueMapT> *VectorVMap = 0, 295 std::vector<LoopToScevMapT> *VLTS = 0); 296 297 void codegen(const clast_user_stmt *u, std::vector<Value *> *IVS = NULL, 298 const char *iterator = NULL, 299 __isl_take isl_set *scatteringDomain = 0); 300 301 void codegen(const clast_block *b); 302 303 /// @brief Create a classical sequential loop. 304 void codegenForSequential(const clast_for *f); 305 306 /// @brief Create OpenMP structure values. 307 /// 308 /// Create a list of values that has to be stored into the OpenMP subfuncition 309 /// structure. 310 SetVector<Value *> getOMPValues(const clast_stmt *Body); 311 312 /// @brief Update ClastVars and ValueMap according to a value map. 313 /// 314 /// @param VMap A map from old to new values. 315 void updateWithValueMap(OMPGenerator::ValueToValueMapTy &VMap); 316 317 /// @brief Create an OpenMP parallel for loop. 318 /// 319 /// This loop reflects a loop as if it would have been created by an OpenMP 320 /// statement. 321 void codegenForOpenMP(const clast_for *f); 322 323 #ifdef GPU_CODEGEN 324 /// @brief Create GPGPU device memory access values. 325 /// 326 /// Create a list of values that will be set to be parameters of the GPGPU 327 /// subfunction. These parameters represent device memory base addresses 328 /// and the size in bytes. 329 SetVector<Value *> getGPUValues(unsigned &OutputBytes); 330 331 /// @brief Create a GPU parallel for loop. 332 /// 333 /// This loop reflects a loop as if it would have been created by a GPU 334 /// statement. 335 void codegenForGPGPU(const clast_for *F); 336 337 /// @brief Get innermost for loop. 338 const clast_stmt *getScheduleInfo(const clast_for *F, 339 std::vector<int> &NumIters, 340 unsigned &LoopDepth, 341 unsigned &NonPLoopDepth); 342 #endif /* GPU_CODEGEN */ 343 344 /// @brief Check if a loop is parallel 345 /// 346 /// Detect if a clast_for loop can be executed in parallel. 347 /// 348 /// @param For The clast for loop to check. 349 /// 350 /// @return bool Returns true if the incoming clast_for statement can 351 /// execute in parallel. 352 bool isParallelFor(const clast_for *For); 353 354 bool isInnermostLoop(const clast_for *f); 355 356 /// @brief Get the number of loop iterations for this loop. 357 /// @param f The clast for loop to check. 358 int getNumberOfIterations(const clast_for *f); 359 360 /// @brief Create vector instructions for this loop. 361 void codegenForVector(const clast_for *f); 362 363 void codegen(const clast_for *f); 364 365 Value *codegen(const clast_equation *eq); 366 367 void codegen(const clast_guard *g); 368 369 void codegen(const clast_stmt *stmt); 370 371 void addParameters(const CloogNames *names); 372 373 IntegerType *getIntPtrTy(); 374 375 public: 376 void codegen(const clast_root *r); 377 378 ClastStmtCodeGen(Scop *scop, PollyIRBuilder &B, Pass *P); 379 }; 380 } 381 382 IntegerType *ClastStmtCodeGen::getIntPtrTy() { 383 return P->getAnalysis<DataLayoutPass>().getDataLayout().getIntPtrType( 384 Builder.getContext()); 385 } 386 387 const std::vector<std::string> &ClastStmtCodeGen::getParallelLoops() { 388 return parallelLoops; 389 } 390 391 void ClastStmtCodeGen::codegen(const clast_assignment *a) { 392 Value *V = ExpGen.codegen(a->RHS, getIntPtrTy()); 393 ClastVars[a->LHS] = V; 394 } 395 396 void ClastStmtCodeGen::codegen(const clast_assignment *A, ScopStmt *Stmt, 397 unsigned Dim, int VectorDim, 398 std::vector<ValueMapT> *VectorVMap, 399 std::vector<LoopToScevMapT> *VLTS) { 400 Value *RHS; 401 402 assert(!A->LHS && "Statement assignments do not have left hand side"); 403 404 RHS = ExpGen.codegen(A->RHS, Builder.getInt64Ty()); 405 406 const llvm::SCEV *URHS = S->getSE()->getUnknown(RHS); 407 if (VLTS) 408 (*VLTS)[VectorDim][Stmt->getLoopForDimension(Dim)] = URHS; 409 LoopToScev[Stmt->getLoopForDimension(Dim)] = URHS; 410 411 const PHINode *PN = Stmt->getInductionVariableForDimension(Dim); 412 if (PN) { 413 RHS = Builder.CreateTruncOrBitCast(RHS, PN->getType()); 414 415 if (VectorVMap) 416 (*VectorVMap)[VectorDim][PN] = RHS; 417 418 ValueMap[PN] = RHS; 419 } 420 } 421 422 void ClastStmtCodeGen::codegenSubstitutions(const clast_stmt *Assignment, 423 ScopStmt *Statement, int vectorDim, 424 std::vector<ValueMapT> *VectorVMap, 425 std::vector<LoopToScevMapT> *VLTS) { 426 int Dimension = 0; 427 428 while (Assignment) { 429 assert(CLAST_STMT_IS_A(Assignment, stmt_ass) && 430 "Substitions are expected to be assignments"); 431 codegen((const clast_assignment *)Assignment, Statement, Dimension, 432 vectorDim, VectorVMap, VLTS); 433 Assignment = Assignment->next; 434 Dimension++; 435 } 436 } 437 438 // Takes the cloog specific domain and translates it into a map Statement -> 439 // PartialSchedule, where the PartialSchedule contains all the dimensions that 440 // have been code generated up to this point. 441 static __isl_give isl_map *extractPartialSchedule(ScopStmt *Statement, 442 __isl_take isl_set *Domain) { 443 isl_map *Schedule = Statement->getScattering(); 444 int ScheduledDimensions = isl_set_dim(Domain, isl_dim_set); 445 int UnscheduledDimensions = 446 isl_map_dim(Schedule, isl_dim_out) - ScheduledDimensions; 447 448 isl_set_free(Domain); 449 450 return isl_map_project_out(Schedule, isl_dim_out, ScheduledDimensions, 451 UnscheduledDimensions); 452 } 453 454 void ClastStmtCodeGen::codegen(const clast_user_stmt *u, 455 std::vector<Value *> *IVS, const char *iterator, 456 __isl_take isl_set *Domain) { 457 ScopStmt *Statement = (ScopStmt *)u->statement->usr; 458 459 if (u->substitutions) 460 codegenSubstitutions(u->substitutions, Statement); 461 462 int VectorDimensions = IVS ? IVS->size() : 1; 463 464 if (VectorDimensions == 1) { 465 BlockGenerator::generate(Builder, *Statement, ValueMap, LoopToScev, P); 466 isl_set_free(Domain); 467 return; 468 } 469 470 VectorValueMapT VectorMap(VectorDimensions); 471 std::vector<LoopToScevMapT> VLTS(VectorDimensions); 472 473 if (IVS) { 474 assert(u->substitutions && "Substitutions expected!"); 475 int i = 0; 476 for (std::vector<Value *>::iterator II = IVS->begin(), IE = IVS->end(); 477 II != IE; ++II) { 478 ClastVars[iterator] = *II; 479 codegenSubstitutions(u->substitutions, Statement, i, &VectorMap, &VLTS); 480 i++; 481 } 482 } 483 484 // Copy the current value map into all vector maps if the key wasn't 485 // available yet. This is needed in case vector codegen is performed in 486 // OpenMP subfunctions. 487 for (auto KV : ValueMap) 488 for (int i = 0; i < VectorDimensions; ++i) 489 VectorMap[i].insert(KV); 490 491 isl_map *Schedule = extractPartialSchedule(Statement, Domain); 492 VectorBlockGenerator::generate(Builder, *Statement, VectorMap, VLTS, Schedule, 493 P); 494 isl_map_free(Schedule); 495 } 496 497 void ClastStmtCodeGen::codegen(const clast_block *b) { 498 if (b->body) 499 codegen(b->body); 500 } 501 502 void ClastStmtCodeGen::codegenForSequential(const clast_for *f) { 503 Value *LowerBound, *UpperBound, *IV, *Stride; 504 BasicBlock *ExitBlock; 505 Type *IntPtrTy = getIntPtrTy(); 506 507 LowerBound = ExpGen.codegen(f->LB, IntPtrTy); 508 UpperBound = ExpGen.codegen(f->UB, IntPtrTy); 509 Stride = Builder.getInt(APInt_from_MPZ(f->stride)); 510 511 IV = createLoop(LowerBound, UpperBound, Stride, Builder, P, ExitBlock, 512 CmpInst::ICMP_SLE); 513 514 // Add loop iv to symbols. 515 ClastVars[f->iterator] = IV; 516 517 if (f->body) 518 codegen(f->body); 519 520 // Loop is finished, so remove its iv from the live symbols. 521 ClastVars.erase(f->iterator); 522 Builder.SetInsertPoint(ExitBlock->begin()); 523 } 524 525 // Helper class to determine all scalar parameters used in the basic blocks of a 526 // clast. Scalar parameters are scalar variables defined outside of the SCoP. 527 class ParameterVisitor : public ClastVisitor { 528 std::set<Value *> Values; 529 530 public: 531 ParameterVisitor() : ClastVisitor(), Values() {} 532 533 void visitUser(const clast_user_stmt *Stmt) { 534 const ScopStmt *S = static_cast<const ScopStmt *>(Stmt->statement->usr); 535 const BasicBlock *BB = S->getBasicBlock(); 536 537 // Check all the operands of instructions in the basic block. 538 for (BasicBlock::const_iterator BI = BB->begin(), BE = BB->end(); BI != BE; 539 ++BI) { 540 const Instruction &Inst = *BI; 541 for (Instruction::const_op_iterator II = Inst.op_begin(), 542 IE = Inst.op_end(); 543 II != IE; ++II) { 544 Value *SrcVal = *II; 545 546 if (Instruction *OpInst = dyn_cast<Instruction>(SrcVal)) 547 if (S->getParent()->getRegion().contains(OpInst)) 548 continue; 549 550 if (isa<Instruction>(SrcVal) || isa<Argument>(SrcVal)) 551 Values.insert(SrcVal); 552 } 553 } 554 } 555 556 // Iterator to iterate over the values found. 557 typedef std::set<Value *>::const_iterator const_iterator; 558 inline const_iterator begin() const { return Values.begin(); } 559 inline const_iterator end() const { return Values.end(); } 560 }; 561 562 SetVector<Value *> ClastStmtCodeGen::getOMPValues(const clast_stmt *Body) { 563 SetVector<Value *> Values; 564 565 // The clast variables 566 for (CharMapT::iterator I = ClastVars.begin(), E = ClastVars.end(); I != E; 567 I++) 568 Values.insert(I->second); 569 570 // Find the temporaries that are referenced in the clast statements' 571 // basic blocks but are not defined by these blocks (e.g., references 572 // to function arguments or temporaries defined before the start of 573 // the SCoP). 574 ParameterVisitor Params; 575 Params.visit(Body); 576 577 for (ParameterVisitor::const_iterator PI = Params.begin(), PE = Params.end(); 578 PI != PE; ++PI) { 579 Value *V = *PI; 580 Values.insert(V); 581 DEBUG(dbgs() << "Adding temporary for OMP copy-in: " << *V << "\n"); 582 } 583 584 return Values; 585 } 586 587 void 588 ClastStmtCodeGen::updateWithValueMap(OMPGenerator::ValueToValueMapTy &VMap) { 589 std::set<Value *> Inserted; 590 591 for (CharMapT::iterator I = ClastVars.begin(), E = ClastVars.end(); I != E; 592 I++) { 593 ClastVars[I->first] = VMap[I->second]; 594 Inserted.insert(I->second); 595 } 596 597 for (OMPGenerator::ValueToValueMapTy::iterator I = VMap.begin(), 598 E = VMap.end(); 599 I != E; ++I) { 600 if (Inserted.count(I->first)) 601 continue; 602 603 ValueMap[I->first] = I->second; 604 } 605 } 606 607 static void clearDomtree(Function *F, DominatorTree &DT) { 608 DomTreeNode *N = DT.getNode(&F->getEntryBlock()); 609 std::vector<BasicBlock *> Nodes; 610 for (po_iterator<DomTreeNode *> I = po_begin(N), E = po_end(N); I != E; ++I) 611 Nodes.push_back(I->getBlock()); 612 613 for (std::vector<BasicBlock *>::iterator I = Nodes.begin(), E = Nodes.end(); 614 I != E; ++I) 615 DT.eraseNode(*I); 616 } 617 618 void ClastStmtCodeGen::codegenForOpenMP(const clast_for *For) { 619 Value *Stride, *LB, *UB, *IV; 620 BasicBlock::iterator LoopBody; 621 IntegerType *IntPtrTy = getIntPtrTy(); 622 SetVector<Value *> Values; 623 OMPGenerator::ValueToValueMapTy VMap; 624 OMPGenerator OMPGen(Builder, P); 625 626 Stride = Builder.getInt(APInt_from_MPZ(For->stride)); 627 Stride = Builder.CreateSExtOrBitCast(Stride, IntPtrTy); 628 LB = ExpGen.codegen(For->LB, IntPtrTy); 629 UB = ExpGen.codegen(For->UB, IntPtrTy); 630 631 Values = getOMPValues(For->body); 632 633 IV = OMPGen.createParallelLoop(LB, UB, Stride, Values, VMap, &LoopBody); 634 BasicBlock::iterator AfterLoop = Builder.GetInsertPoint(); 635 Builder.SetInsertPoint(LoopBody); 636 637 // Save the current values. 638 const ValueMapT ValueMapCopy = ValueMap; 639 const CharMapT ClastVarsCopy = ClastVars; 640 641 updateWithValueMap(VMap); 642 ClastVars[For->iterator] = IV; 643 644 if (For->body) 645 codegen(For->body); 646 647 // Restore the original values. 648 ValueMap = ValueMapCopy; 649 ClastVars = ClastVarsCopy; 650 651 clearDomtree((*LoopBody).getParent()->getParent(), 652 P->getAnalysis<DominatorTreeWrapperPass>().getDomTree()); 653 654 Builder.SetInsertPoint(AfterLoop); 655 } 656 657 #ifdef GPU_CODEGEN 658 static unsigned getArraySizeInBytes(const ArrayType *AT) { 659 unsigned Bytes = AT->getNumElements(); 660 if (const ArrayType *T = dyn_cast<ArrayType>(AT->getElementType())) 661 Bytes *= getArraySizeInBytes(T); 662 else 663 Bytes *= AT->getElementType()->getPrimitiveSizeInBits() / 8; 664 665 return Bytes; 666 } 667 668 SetVector<Value *> ClastStmtCodeGen::getGPUValues(unsigned &OutputBytes) { 669 SetVector<Value *> Values; 670 OutputBytes = 0; 671 672 // Record the memory reference base addresses. 673 for (Scop::iterator SI = S->begin(), SE = S->end(); SI != SE; ++SI) { 674 ScopStmt *Stmt = *SI; 675 for (SmallVector<MemoryAccess *, 8>::iterator I = Stmt->memacc_begin(), 676 E = Stmt->memacc_end(); 677 I != E; ++I) { 678 Value *BaseAddr = const_cast<Value *>((*I)->getBaseAddr()); 679 Values.insert((BaseAddr)); 680 681 // FIXME: we assume that there is one and only one array to be written 682 // in a SCoP. 683 int NumWrites = 0; 684 if ((*I)->isWrite()) { 685 ++NumWrites; 686 assert(NumWrites <= 1 && 687 "We support at most one array to be written in a SCoP."); 688 if (const PointerType *PT = 689 dyn_cast<PointerType>(BaseAddr->getType())) { 690 Type *T = PT->getArrayElementType(); 691 const ArrayType *ATy = dyn_cast<ArrayType>(T); 692 OutputBytes = getArraySizeInBytes(ATy); 693 } 694 } 695 } 696 } 697 698 return Values; 699 } 700 701 const clast_stmt *ClastStmtCodeGen::getScheduleInfo(const clast_for *F, 702 std::vector<int> &NumIters, 703 unsigned &LoopDepth, 704 unsigned &NonPLoopDepth) { 705 clast_stmt *Stmt = (clast_stmt *)F; 706 const clast_for *Result; 707 bool NonParaFlag = false; 708 LoopDepth = 0; 709 NonPLoopDepth = 0; 710 711 while (Stmt) { 712 if (CLAST_STMT_IS_A(Stmt, stmt_for)) { 713 const clast_for *T = (clast_for *)Stmt; 714 if (isParallelFor(T)) { 715 if (!NonParaFlag) { 716 NumIters.push_back(getNumberOfIterations(T)); 717 Result = T; 718 } 719 } else 720 NonParaFlag = true; 721 722 Stmt = T->body; 723 LoopDepth++; 724 continue; 725 } 726 Stmt = Stmt->next; 727 } 728 729 assert(NumIters.size() == 4 && 730 "The loops should be tiled into 4-depth parallel loops and an " 731 "innermost non-parallel one (if exist)."); 732 NonPLoopDepth = LoopDepth - NumIters.size(); 733 assert(NonPLoopDepth <= 1 && 734 "We support only one innermost non-parallel loop currently."); 735 return (const clast_stmt *)Result->body; 736 } 737 738 void ClastStmtCodeGen::codegenForGPGPU(const clast_for *F) { 739 BasicBlock::iterator LoopBody; 740 SetVector<Value *> Values; 741 SetVector<Value *> IVS; 742 std::vector<int> NumIterations; 743 PTXGenerator::ValueToValueMapTy VMap; 744 745 assert(!GPUTriple.empty() && 746 "Target triple should be set properly for GPGPU code generation."); 747 PTXGenerator PTXGen(Builder, P, GPUTriple); 748 749 // Get original IVS and ScopStmt 750 unsigned TiledLoopDepth, NonPLoopDepth; 751 const clast_stmt *InnerStmt = 752 getScheduleInfo(F, NumIterations, TiledLoopDepth, NonPLoopDepth); 753 const clast_stmt *TmpStmt; 754 const clast_user_stmt *U; 755 const clast_for *InnerFor; 756 if (CLAST_STMT_IS_A(InnerStmt, stmt_for)) { 757 InnerFor = (const clast_for *)InnerStmt; 758 TmpStmt = InnerFor->body; 759 } else 760 TmpStmt = InnerStmt; 761 U = (const clast_user_stmt *)TmpStmt; 762 ScopStmt *Statement = (ScopStmt *)U->statement->usr; 763 for (unsigned i = 0; i < Statement->getNumIterators() - NonPLoopDepth; i++) { 764 const Value *IV = Statement->getInductionVariableForDimension(i); 765 IVS.insert(const_cast<Value *>(IV)); 766 } 767 768 unsigned OutBytes; 769 Values = getGPUValues(OutBytes); 770 PTXGen.setOutputBytes(OutBytes); 771 PTXGen.startGeneration(Values, IVS, VMap, &LoopBody); 772 773 BasicBlock::iterator AfterLoop = Builder.GetInsertPoint(); 774 Builder.SetInsertPoint(LoopBody); 775 776 BasicBlock *AfterBB = 0; 777 if (NonPLoopDepth) { 778 Value *LowerBound, *UpperBound, *IV, *Stride; 779 Type *IntPtrTy = getIntPtrTy(); 780 LowerBound = ExpGen.codegen(InnerFor->LB, IntPtrTy); 781 UpperBound = ExpGen.codegen(InnerFor->UB, IntPtrTy); 782 Stride = Builder.getInt(APInt_from_MPZ(InnerFor->stride)); 783 IV = createLoop(LowerBound, UpperBound, Stride, Builder, P, AfterBB, 784 CmpInst::ICMP_SLE); 785 const Value *OldIV_ = Statement->getInductionVariableForDimension(2); 786 Value *OldIV = const_cast<Value *>(OldIV_); 787 VMap.insert(std::make_pair<Value *, Value *>(OldIV, IV)); 788 } 789 790 updateWithValueMap(VMap); 791 792 BlockGenerator::generate(Builder, *Statement, ValueMap, P); 793 794 if (AfterBB) 795 Builder.SetInsertPoint(AfterBB->begin()); 796 797 // FIXME: The replacement of the host base address with the parameter of ptx 798 // subfunction should have been done by updateWithValueMap. We use the 799 // following codes to avoid affecting other parts of Polly. This should be 800 // fixed later. 801 Function *FN = Builder.GetInsertBlock()->getParent(); 802 for (unsigned j = 0; j < Values.size(); j++) { 803 Value *baseAddr = Values[j]; 804 for (Function::iterator B = FN->begin(); B != FN->end(); ++B) { 805 for (BasicBlock::iterator I = B->begin(); I != B->end(); ++I) 806 I->replaceUsesOfWith(baseAddr, ValueMap[baseAddr]); 807 } 808 } 809 Builder.SetInsertPoint(AfterLoop); 810 PTXGen.setLaunchingParameters(NumIterations[0], NumIterations[1], 811 NumIterations[2], NumIterations[3]); 812 PTXGen.finishGeneration(FN); 813 } 814 #endif 815 816 bool ClastStmtCodeGen::isInnermostLoop(const clast_for *f) { 817 const clast_stmt *stmt = f->body; 818 819 while (stmt) { 820 if (!CLAST_STMT_IS_A(stmt, stmt_user)) 821 return false; 822 823 stmt = stmt->next; 824 } 825 826 return true; 827 } 828 829 int ClastStmtCodeGen::getNumberOfIterations(const clast_for *For) { 830 isl_set *LoopDomain = isl_set_copy(isl_set_from_cloog_domain(For->domain)); 831 int NumberOfIterations = polly::getNumberOfIterations(LoopDomain); 832 if (NumberOfIterations == -1) 833 return -1; 834 return NumberOfIterations / mpz_get_si(For->stride) + 1; 835 } 836 837 void ClastStmtCodeGen::codegenForVector(const clast_for *F) { 838 DEBUG(dbgs() << "Vectorizing loop '" << F->iterator << "'\n";); 839 int VectorWidth = getNumberOfIterations(F); 840 841 Value *LB = ExpGen.codegen(F->LB, getIntPtrTy()); 842 843 APInt Stride = APInt_from_MPZ(F->stride); 844 IntegerType *LoopIVType = dyn_cast<IntegerType>(LB->getType()); 845 Stride = Stride.zext(LoopIVType->getBitWidth()); 846 Value *StrideValue = ConstantInt::get(LoopIVType, Stride); 847 848 std::vector<Value *> IVS(VectorWidth); 849 IVS[0] = LB; 850 851 for (int i = 1; i < VectorWidth; i++) 852 IVS[i] = Builder.CreateAdd(IVS[i - 1], StrideValue, "p_vector_iv"); 853 854 isl_set *Domain = isl_set_copy(isl_set_from_cloog_domain(F->domain)); 855 856 // Add loop iv to symbols. 857 ClastVars[F->iterator] = LB; 858 859 const clast_stmt *Stmt = F->body; 860 861 while (Stmt) { 862 codegen((const clast_user_stmt *)Stmt, &IVS, F->iterator, 863 isl_set_copy(Domain)); 864 Stmt = Stmt->next; 865 } 866 867 // Loop is finished, so remove its iv from the live symbols. 868 isl_set_free(Domain); 869 ClastVars.erase(F->iterator); 870 } 871 872 bool ClastStmtCodeGen::isParallelFor(const clast_for *f) { 873 isl_set *Domain = isl_set_copy(isl_set_from_cloog_domain(f->domain)); 874 assert(Domain && "Cannot access domain of loop"); 875 876 Dependences &D = P->getAnalysis<Dependences>(); 877 878 return D.isParallelDimension(Domain, isl_set_n_dim(Domain)); 879 } 880 881 void ClastStmtCodeGen::codegen(const clast_for *f) { 882 bool Vector = PollyVectorizerChoice != VECTORIZER_NONE; 883 if ((Vector || OpenMP) && isParallelFor(f)) { 884 if (Vector && isInnermostLoop(f) && (-1 != getNumberOfIterations(f)) && 885 (getNumberOfIterations(f) <= 16)) { 886 codegenForVector(f); 887 return; 888 } 889 890 if (OpenMP && !parallelCodeGeneration) { 891 parallelCodeGeneration = true; 892 parallelLoops.push_back(f->iterator); 893 codegenForOpenMP(f); 894 parallelCodeGeneration = false; 895 return; 896 } 897 } 898 899 #ifdef GPU_CODEGEN 900 if (GPGPU && isParallelFor(f)) { 901 if (!parallelCodeGeneration) { 902 parallelCodeGeneration = true; 903 parallelLoops.push_back(f->iterator); 904 codegenForGPGPU(f); 905 parallelCodeGeneration = false; 906 return; 907 } 908 } 909 #endif 910 911 codegenForSequential(f); 912 } 913 914 Value *ClastStmtCodeGen::codegen(const clast_equation *eq) { 915 Value *LHS = ExpGen.codegen(eq->LHS, getIntPtrTy()); 916 Value *RHS = ExpGen.codegen(eq->RHS, getIntPtrTy()); 917 CmpInst::Predicate P; 918 919 if (eq->sign == 0) 920 P = ICmpInst::ICMP_EQ; 921 else if (eq->sign > 0) 922 P = ICmpInst::ICMP_SGE; 923 else 924 P = ICmpInst::ICMP_SLE; 925 926 return Builder.CreateICmp(P, LHS, RHS); 927 } 928 929 void ClastStmtCodeGen::codegen(const clast_guard *g) { 930 Function *F = Builder.GetInsertBlock()->getParent(); 931 LLVMContext &Context = F->getContext(); 932 933 BasicBlock *CondBB = 934 SplitBlock(Builder.GetInsertBlock(), Builder.GetInsertPoint(), P); 935 CondBB->setName("polly.cond"); 936 BasicBlock *MergeBB = SplitBlock(CondBB, CondBB->begin(), P); 937 MergeBB->setName("polly.merge"); 938 BasicBlock *ThenBB = BasicBlock::Create(Context, "polly.then", F); 939 940 DominatorTree &DT = P->getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 941 DT.addNewBlock(ThenBB, CondBB); 942 DT.changeImmediateDominator(MergeBB, CondBB); 943 944 CondBB->getTerminator()->eraseFromParent(); 945 946 Builder.SetInsertPoint(CondBB); 947 948 Value *Predicate = codegen(&(g->eq[0])); 949 950 for (int i = 1; i < g->n; ++i) { 951 Value *TmpPredicate = codegen(&(g->eq[i])); 952 Predicate = Builder.CreateAnd(Predicate, TmpPredicate); 953 } 954 955 Builder.CreateCondBr(Predicate, ThenBB, MergeBB); 956 Builder.SetInsertPoint(ThenBB); 957 Builder.CreateBr(MergeBB); 958 Builder.SetInsertPoint(ThenBB->begin()); 959 960 LoopInfo &LI = P->getAnalysis<LoopInfo>(); 961 Loop *L = LI.getLoopFor(CondBB); 962 if (L) 963 L->addBasicBlockToLoop(ThenBB, LI.getBase()); 964 965 codegen(g->then); 966 967 Builder.SetInsertPoint(MergeBB->begin()); 968 } 969 970 void ClastStmtCodeGen::codegen(const clast_stmt *stmt) { 971 if (CLAST_STMT_IS_A(stmt, stmt_root)) 972 assert(false && "No second root statement expected"); 973 else if (CLAST_STMT_IS_A(stmt, stmt_ass)) 974 codegen((const clast_assignment *)stmt); 975 else if (CLAST_STMT_IS_A(stmt, stmt_user)) 976 codegen((const clast_user_stmt *)stmt); 977 else if (CLAST_STMT_IS_A(stmt, stmt_block)) 978 codegen((const clast_block *)stmt); 979 else if (CLAST_STMT_IS_A(stmt, stmt_for)) 980 codegen((const clast_for *)stmt); 981 else if (CLAST_STMT_IS_A(stmt, stmt_guard)) 982 codegen((const clast_guard *)stmt); 983 984 if (stmt->next) 985 codegen(stmt->next); 986 } 987 988 void ClastStmtCodeGen::addParameters(const CloogNames *names) { 989 SCEVExpander Rewriter(P->getAnalysis<ScalarEvolution>(), "polly"); 990 991 int i = 0; 992 for (Scop::param_iterator PI = S->param_begin(), PE = S->param_end(); 993 PI != PE; ++PI) { 994 assert(i < names->nb_parameters && "Not enough parameter names"); 995 996 const SCEV *Param = *PI; 997 Type *Ty = Param->getType(); 998 999 Instruction *insertLocation = --(Builder.GetInsertBlock()->end()); 1000 Value *V = Rewriter.expandCodeFor(Param, Ty, insertLocation); 1001 ClastVars[names->parameters[i]] = V; 1002 1003 ++i; 1004 } 1005 } 1006 1007 void ClastStmtCodeGen::codegen(const clast_root *r) { 1008 addParameters(r->names); 1009 1010 parallelCodeGeneration = false; 1011 1012 const clast_stmt *stmt = (const clast_stmt *)r; 1013 if (stmt->next) 1014 codegen(stmt->next); 1015 } 1016 1017 ClastStmtCodeGen::ClastStmtCodeGen(Scop *scop, PollyIRBuilder &B, Pass *P) 1018 : S(scop), P(P), Builder(B), ExpGen(Builder, ClastVars) {} 1019 1020 namespace { 1021 class CodeGeneration : public ScopPass { 1022 std::vector<std::string> ParallelLoops; 1023 1024 public: 1025 static char ID; 1026 1027 CodeGeneration() : ScopPass(ID) {} 1028 1029 bool runOnScop(Scop &S) { 1030 ParallelLoops.clear(); 1031 1032 assert(!S.getRegion().isTopLevelRegion() && 1033 "Top level regions are not supported"); 1034 1035 simplifyRegion(&S, this); 1036 1037 BasicBlock *StartBlock = executeScopConditionally(S, this); 1038 1039 PollyIRBuilder Builder(StartBlock->begin()); 1040 1041 ClastStmtCodeGen CodeGen(&S, Builder, this); 1042 CloogInfo &C = getAnalysis<CloogInfo>(); 1043 CodeGen.codegen(C.getClast()); 1044 1045 ParallelLoops.insert(ParallelLoops.begin(), 1046 CodeGen.getParallelLoops().begin(), 1047 CodeGen.getParallelLoops().end()); 1048 return true; 1049 } 1050 1051 virtual void printScop(raw_ostream &OS) const { 1052 for (std::vector<std::string>::const_iterator PI = ParallelLoops.begin(), 1053 PE = ParallelLoops.end(); 1054 PI != PE; ++PI) 1055 OS << "Parallel loop with iterator '" << *PI << "' generated\n"; 1056 } 1057 1058 virtual void getAnalysisUsage(AnalysisUsage &AU) const { 1059 AU.addRequired<CloogInfo>(); 1060 AU.addRequired<Dependences>(); 1061 AU.addRequired<DominatorTreeWrapperPass>(); 1062 AU.addRequired<RegionInfo>(); 1063 AU.addRequired<ScalarEvolution>(); 1064 AU.addRequired<ScopDetection>(); 1065 AU.addRequired<ScopInfo>(); 1066 AU.addRequired<DataLayoutPass>(); 1067 AU.addRequired<LoopInfo>(); 1068 1069 AU.addPreserved<CloogInfo>(); 1070 AU.addPreserved<Dependences>(); 1071 AU.addPreserved<LoopInfo>(); 1072 AU.addPreserved<DominatorTreeWrapperPass>(); 1073 AU.addPreserved<ScopDetection>(); 1074 AU.addPreserved<ScalarEvolution>(); 1075 1076 // FIXME: We do not yet add regions for the newly generated code to the 1077 // region tree. 1078 AU.addPreserved<RegionInfo>(); 1079 AU.addPreserved<TempScopInfo>(); 1080 AU.addPreserved<ScopInfo>(); 1081 AU.addPreservedID(IndependentBlocksID); 1082 } 1083 }; 1084 } 1085 1086 char CodeGeneration::ID = 1; 1087 1088 Pass *polly::createCodeGenerationPass() { return new CodeGeneration(); } 1089 1090 INITIALIZE_PASS_BEGIN(CodeGeneration, "polly-codegen", 1091 "Polly - Create LLVM-IR from SCoPs", false, false); 1092 INITIALIZE_PASS_DEPENDENCY(CloogInfo); 1093 INITIALIZE_PASS_DEPENDENCY(Dependences); 1094 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass); 1095 INITIALIZE_PASS_DEPENDENCY(RegionInfo); 1096 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution); 1097 INITIALIZE_PASS_DEPENDENCY(ScopDetection); 1098 INITIALIZE_PASS_DEPENDENCY(DataLayoutPass); 1099 INITIALIZE_PASS_END(CodeGeneration, "polly-codegen", 1100 "Polly - Create LLVM-IR from SCoPs", false, false) 1101 1102 #endif // CLOOG_FOUND 1103