//===------ CodeGeneration.cpp - Code generate the Scops. -----------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// The CodeGeneration pass takes a Scop created by ScopInfo and translates it
// back to LLVM-IR using Cloog.
//
// The Scop describes the high level memory behaviour of a control flow region.
// Transformation passes can update the schedule (execution order) of
// statements in the Scop. Cloog is used to generate an abstract syntax tree
// (clast) that reflects the updated execution order. This clast is used to
// create new LLVM-IR that is computationally equivalent to the original
// control flow region, but executes its code in the new execution order
// defined by the changed scattering.
//
//===----------------------------------------------------------------------===//

#include "polly/CodeGen/Cloog.h"
#ifdef CLOOG_FOUND

#define DEBUG_TYPE "polly-codegen"
#include "polly/Dependences.h"
#include "polly/LinkAllPasses.h"
#include "polly/ScopInfo.h"
#include "polly/TempScopInfo.h"
#include "polly/CodeGen/CodeGeneration.h"
#include "polly/CodeGen/BlockGenerators.h"
#include "polly/CodeGen/LoopGenerators.h"
#include "polly/CodeGen/PTXGenerator.h"
#include "polly/CodeGen/Utils.h"
#include "polly/Support/GICHelper.h"

#include "llvm/Module.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/DataLayout.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

#define CLOOG_INT_GMP 1
#include "cloog/cloog.h"
#include "cloog/isl/cloog.h"

#include "isl/aff.h"

#include <vector>
#include <utility>

using namespace polly;
using namespace llvm;

struct isl_set;

namespace polly {
static cl::opt<bool>
OpenMP("enable-polly-openmp",
       cl::desc("Generate OpenMP parallel code"), cl::Hidden,
       cl::value_desc("OpenMP code generation enabled if true"),
       cl::init(false), cl::ZeroOrMore);

#ifdef GPU_CODEGEN
static cl::opt<bool>
GPGPU("enable-polly-gpgpu",
      cl::desc("Generate GPU parallel code"), cl::Hidden,
      cl::value_desc("GPGPU code generation enabled if true"),
      cl::init(false), cl::ZeroOrMore);

static cl::opt<std::string>
GPUTriple("polly-gpgpu-triple",
          cl::desc("Target triple for GPU code generation"),
          cl::Hidden, cl::init(""));
#endif /* GPU_CODEGEN */

typedef DenseMap<const char*, Value*> CharMapT;

/// Class to generate LLVM-IR that calculates the value of a clast_expr.
class ClastExpCodeGen {
  IRBuilder<> &Builder;
  const CharMapT &IVS;

  Value *codegen(const clast_name *e, Type *Ty);
  Value *codegen(const clast_term *e, Type *Ty);
  Value *codegen(const clast_binary *e, Type *Ty);
  Value *codegen(const clast_reduction *r, Type *Ty);

public:
  // A generator for clast expressions.
  //
  // @param B     The IRBuilder that defines where the code to calculate the
  //              clast expressions should be inserted.
  // @param IVMap A map that translates strings describing the induction
  //              variables to the Value*s that represent these variables
  //              on the LLVM side.
  ClastExpCodeGen(IRBuilder<> &B, CharMapT &IVMap);

  // Generates code to calculate a given clast expression.
  //
  // @param e The expression to calculate.
  // @return The Value that holds the result.
  Value *codegen(const clast_expr *e, Type *Ty);
};

Value *ClastExpCodeGen::codegen(const clast_name *e, Type *Ty) {
  CharMapT::const_iterator I = IVS.find(e->name);

  assert(I != IVS.end() && "Clast name not found");

  return Builder.CreateSExtOrBitCast(I->second, Ty);
}

Value *ClastExpCodeGen::codegen(const clast_term *e, Type *Ty) {
  APInt a = APInt_from_MPZ(e->val);

  Value *ConstOne = ConstantInt::get(Builder.getContext(), a);
  ConstOne = Builder.CreateSExtOrBitCast(ConstOne, Ty);

  if (!e->var)
    return ConstOne;

  Value *var = codegen(e->var, Ty);
  return Builder.CreateMul(ConstOne, var);
}

Value *ClastExpCodeGen::codegen(const clast_binary *e, Type *Ty) {
  Value *LHS = codegen(e->LHS, Ty);

  APInt RHS_AP = APInt_from_MPZ(e->RHS);

  Value *RHS = ConstantInt::get(Builder.getContext(), RHS_AP);
  RHS = Builder.CreateSExtOrBitCast(RHS, Ty);

  switch (e->type) {
  case clast_bin_mod:
    return Builder.CreateSRem(LHS, RHS);
  case clast_bin_fdiv: {
    // floord(n, d): ((n < 0) ? (n - d + 1) : n) / d
    Value *One = ConstantInt::get(Ty, 1);
    Value *Zero = ConstantInt::get(Ty, 0);
    Value *Sum1 = Builder.CreateSub(LHS, RHS);
    Value *Sum2 = Builder.CreateAdd(Sum1, One);
    Value *isNegative = Builder.CreateICmpSLT(LHS, Zero);
    Value *Dividend = Builder.CreateSelect(isNegative, Sum2, LHS);
    return Builder.CreateSDiv(Dividend, RHS);
  }
  case clast_bin_cdiv: {
    // ceild(n, d): ((n < 0) ? n : (n + d - 1)) / d
    Value *One = ConstantInt::get(Ty, 1);
    Value *Zero = ConstantInt::get(Ty, 0);
    Value *Sum1 = Builder.CreateAdd(LHS, RHS);
    Value *Sum2 = Builder.CreateSub(Sum1, One);
    Value *isNegative = Builder.CreateICmpSLT(LHS, Zero);
    Value *Dividend = Builder.CreateSelect(isNegative, LHS, Sum2);
    return Builder.CreateSDiv(Dividend, RHS);
  }
  case clast_bin_div:
    return Builder.CreateSDiv(LHS, RHS);
  }

  llvm_unreachable("Unknown clast binary expression type");
}
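
// Illustrative examples of the floord/ceild expansions above (they assume a
// positive divisor d, which the expansions require anyway):
//
//   floord(-7, 4): -7 < 0, so the dividend becomes -7 - 4 + 1 = -10 and
//                  -10 sdiv 4 = -2 = floor(-7 / 4).
//   floord( 7, 4):  7 >= 0, so 7 sdiv 4 = 1 = floor(7 / 4).
//   ceild ( 7, 4):  7 >= 0, so the dividend becomes 7 + 4 - 1 = 10 and
//                  10 sdiv 4 = 2 = ceil(7 / 4).
//   ceild (-7, 4): -7 < 0, so -7 sdiv 4 = -1 = ceil(-7 / 4).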

Value *ClastExpCodeGen::codegen(const clast_reduction *r, Type *Ty) {
  assert((r->type == clast_red_min || r->type == clast_red_max ||
          r->type == clast_red_sum) && "Clast reduction type not supported");
  Value *old = codegen(r->elts[0], Ty);

  for (int i = 1; i < r->n; ++i) {
    Value *exprValue = codegen(r->elts[i], Ty);

    switch (r->type) {
    case clast_red_min: {
      Value *cmp = Builder.CreateICmpSLT(old, exprValue);
      old = Builder.CreateSelect(cmp, old, exprValue);
      break;
    }
    case clast_red_max: {
      Value *cmp = Builder.CreateICmpSGT(old, exprValue);
      old = Builder.CreateSelect(cmp, old, exprValue);
      break;
    }
    case clast_red_sum:
      old = Builder.CreateAdd(old, exprValue);
      break;
    }
  }

  return old;
}

ClastExpCodeGen::ClastExpCodeGen(IRBuilder<> &B, CharMapT &IVMap)
  : Builder(B), IVS(IVMap) {}

Value *ClastExpCodeGen::codegen(const clast_expr *e, Type *Ty) {
  switch (e->type) {
  case clast_expr_name:
    return codegen((const clast_name *)e, Ty);
  case clast_expr_term:
    return codegen((const clast_term *)e, Ty);
  case clast_expr_bin:
    return codegen((const clast_binary *)e, Ty);
  case clast_expr_red:
    return codegen((const clast_reduction *)e, Ty);
  }

  llvm_unreachable("Unknown clast expression!");
}
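
// As an illustration, ClastExpCodeGen would lower an expression such as
// "32 * t1 + floord(n, 4)" roughly to (exact value names and the integer type
// depend on the IRBuilder state and the requested type Ty):
//
//   %t1.ty = sext i32 %t1 to i64
//   %mul   = mul i64 32, %t1.ty
//   %n.ty  = sext i32 %n to i64
//   %fdiv  = ... ; floord expansion from codegen(clast_binary) above
//   %sum   = add i64 %mul, %fdiv
//
// where %t1 and %n are the Values registered for the clast names "t1" and "n"
// in the IVS map.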

class ClastStmtCodeGen {
public:
  const std::vector<std::string> &getParallelLoops();

private:
  // The Scop we code generate.
  Scop *S;
  Pass *P;

  // The Builder specifies the current location to code generate at.
  IRBuilder<> &Builder;

  // Map the Values from the old code to their counterparts in the new code.
  ValueMapT ValueMap;

  // ClastVars maps from the textual representation of a clast variable to its
  // current Value*. Clast variables are scheduling variables, original
  // induction variables or parameters. They are used either in loop bounds or
  // to define the statement instance that is executed.
  //
  //   for (s = 0; s < n + 3; ++s)
  //     for (t = s; t < m; ++t)
  //       Stmt(i = s + 3 * m, j = t);
  //
  // {s,t,i,j,n,m} is the set of clast variables in this clast.
  CharMapT ClastVars;

  // Code generator for clast expressions.
  ClastExpCodeGen ExpGen;

  // Do we currently generate parallel code?
  bool parallelCodeGeneration;

  std::vector<std::string> parallelLoops;

  void codegen(const clast_assignment *a);

  void codegen(const clast_assignment *a, ScopStmt *Statement,
               unsigned Dimension, int vectorDim,
               std::vector<ValueMapT> *VectorVMap = 0);

  void codegenSubstitutions(const clast_stmt *Assignment,
                            ScopStmt *Statement, int vectorDim = 0,
                            std::vector<ValueMapT> *VectorVMap = 0);

  void codegen(const clast_user_stmt *u, std::vector<Value*> *IVS = NULL,
               const char *iterator = NULL, isl_set *scatteringDomain = 0);

  void codegen(const clast_block *b);

  /// @brief Create a classical sequential loop.
  void codegenForSequential(const clast_for *f);

  /// @brief Create OpenMP structure values.
  ///
  /// Create a list of values that have to be stored into the OpenMP
  /// subfunction structure.
  SetVector<Value*> getOMPValues(const clast_stmt *Body);

  /// @brief Update ClastVars and ValueMap according to a value map.
  ///
  /// @param VMap A map from old to new values.
  void updateWithValueMap(OMPGenerator::ValueToValueMapTy &VMap);

  /// @brief Create an OpenMP parallel for loop.
  ///
  /// This loop reflects a loop as if it had been created by an OpenMP
  /// statement.
  void codegenForOpenMP(const clast_for *f);

#ifdef GPU_CODEGEN
  /// @brief Create GPGPU device memory access values.
  ///
  /// Create a list of values that will be set to be parameters of the GPGPU
  /// subfunction. These parameters represent device memory base addresses
  /// and the size in bytes.
  SetVector<Value*> getGPUValues(unsigned &OutputBytes);

  /// @brief Create a GPU parallel for loop.
  ///
  /// This loop reflects a loop as if it had been created by a GPU
  /// statement.
  void codegenForGPGPU(const clast_for *F);

  /// @brief Get the innermost for loop.
  const clast_stmt *getScheduleInfo(const clast_for *F,
                                    std::vector<int> &NumIters,
                                    unsigned &LoopDepth,
                                    unsigned &NonPLoopDepth);
#endif /* GPU_CODEGEN */

  /// @brief Check if a loop is parallel.
  ///
  /// Detect if a clast_for loop can be executed in parallel.
  ///
  /// @param For The clast for loop to check.
  ///
  /// @return Returns true if the incoming clast_for statement can execute in
  ///         parallel.
  bool isParallelFor(const clast_for *For);

  bool isInnermostLoop(const clast_for *f);

  /// @brief Get the number of loop iterations for this loop.
  /// @param f The clast for loop to check.
  int getNumberOfIterations(const clast_for *f);

  /// @brief Create vector instructions for this loop.
  void codegenForVector(const clast_for *f);

  void codegen(const clast_for *f);

  Value *codegen(const clast_equation *eq);

  void codegen(const clast_guard *g);

  void codegen(const clast_stmt *stmt);

  void addParameters(const CloogNames *names);

  IntegerType *getIntPtrTy();

public:
  void codegen(const clast_root *r);

  ClastStmtCodeGen(Scop *scop, IRBuilder<> &B, Pass *P);
};
}

IntegerType *ClastStmtCodeGen::getIntPtrTy() {
  return P->getAnalysis<DataLayout>().getIntPtrType(Builder.getContext());
}

const std::vector<std::string> &ClastStmtCodeGen::getParallelLoops() {
  return parallelLoops;
}

void ClastStmtCodeGen::codegen(const clast_assignment *a) {
  Value *V = ExpGen.codegen(a->RHS, getIntPtrTy());
  ClastVars[a->LHS] = V;
}

void ClastStmtCodeGen::codegen(const clast_assignment *A, ScopStmt *Stmt,
                               unsigned Dim, int VectorDim,
                               std::vector<ValueMapT> *VectorVMap) {
  const PHINode *PN;
  Value *RHS;

  assert(!A->LHS && "Statement assignments do not have left hand side");

  PN = Stmt->getInductionVariableForDimension(Dim);
  RHS = ExpGen.codegen(A->RHS, Builder.getInt64Ty());
  RHS = Builder.CreateTruncOrBitCast(RHS, PN->getType());

  if (VectorVMap)
    (*VectorVMap)[VectorDim][PN] = RHS;

  ValueMap[PN] = RHS;
}

void ClastStmtCodeGen::codegenSubstitutions(const clast_stmt *Assignment,
                                            ScopStmt *Statement, int vectorDim,
                                            std::vector<ValueMapT> *VectorVMap) {
  int Dimension = 0;

  while (Assignment) {
    assert(CLAST_STMT_IS_A(Assignment, stmt_ass) &&
           "Substitutions are expected to be assignments");
    codegen((const clast_assignment *)Assignment, Statement, Dimension,
            vectorDim, VectorVMap);
    Assignment = Assignment->next;
    Dimension++;
  }
}

void ClastStmtCodeGen::codegen(const clast_user_stmt *u,
                               std::vector<Value*> *IVS, const char *iterator,
                               isl_set *Domain) {
  ScopStmt *Statement = (ScopStmt *)u->statement->usr;

  if (u->substitutions)
    codegenSubstitutions(u->substitutions, Statement);

  int VectorDimensions = IVS ? IVS->size() : 1;

  if (VectorDimensions == 1) {
    BlockGenerator::generate(Builder, *Statement, ValueMap, P);
    return;
  }

  VectorValueMapT VectorMap(VectorDimensions);

  if (IVS) {
    assert(u->substitutions && "Substitutions expected!");
    int i = 0;
    for (std::vector<Value*>::iterator II = IVS->begin(), IE = IVS->end();
         II != IE; ++II) {
      ClastVars[iterator] = *II;
      codegenSubstitutions(u->substitutions, Statement, i, &VectorMap);
      i++;
    }
  }

  VectorBlockGenerator::generate(Builder, *Statement, VectorMap, Domain, P);
}

void ClastStmtCodeGen::codegen(const clast_block *b) {
  if (b->body)
    codegen(b->body);
}

void ClastStmtCodeGen::codegenForSequential(const clast_for *f) {
  Value *LowerBound, *UpperBound, *IV, *Stride;
  BasicBlock *AfterBB;
  Type *IntPtrTy = getIntPtrTy();

  LowerBound = ExpGen.codegen(f->LB, IntPtrTy);
  UpperBound = ExpGen.codegen(f->UB, IntPtrTy);
  Stride = Builder.getInt(APInt_from_MPZ(f->stride));

  IV = createLoop(LowerBound, UpperBound, Stride, Builder, P, AfterBB,
                  CmpInst::ICMP_SLE);

  // Add loop iv to symbols.
  ClastVars[f->iterator] = IV;

  if (f->body)
    codegen(f->body);

  // Loop is finished, so remove its iv from the live symbols.
  ClastVars.erase(f->iterator);
  Builder.SetInsertPoint(AfterBB->begin());
}
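
// Conceptually, codegenForSequential turns a clast loop
//
//   for (c1 = LB; c1 <= UB; c1 += stride)   // ICMP_SLE is the exit predicate
//     body
//
// into a single natural loop whose induction variable is bound to
// ClastVars["c1"] while the body is generated. The precise CFG (guard,
// preheader and latch blocks) is created by createLoop in LoopGenerators;
// once the body has been emitted, the insert point is moved to AfterBB, the
// block following the loop.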

// Helper class to determine all scalar parameters used in the basic blocks of
// a clast. Scalar parameters are scalar variables defined outside of the SCoP.
class ParameterVisitor : public ClastVisitor {
  std::set<Value *> Values;

public:
  ParameterVisitor() : ClastVisitor(), Values() {}

  void visitUser(const clast_user_stmt *Stmt) {
    const ScopStmt *S = static_cast<const ScopStmt *>(Stmt->statement->usr);
    const BasicBlock *BB = S->getBasicBlock();

    // Check all the operands of instructions in the basic block.
    for (BasicBlock::const_iterator BI = BB->begin(), BE = BB->end(); BI != BE;
         ++BI) {
      const Instruction &Inst = *BI;
      for (Instruction::const_op_iterator II = Inst.op_begin(),
           IE = Inst.op_end(); II != IE; ++II) {
        Value *SrcVal = *II;

        if (Instruction *OpInst = dyn_cast<Instruction>(SrcVal))
          if (S->getParent()->getRegion().contains(OpInst))
            continue;

        if (isa<Instruction>(SrcVal) || isa<Argument>(SrcVal))
          Values.insert(SrcVal);
      }
    }
  }

  // Iterator to iterate over the values found.
  typedef std::set<Value *>::const_iterator const_iterator;
  inline const_iterator begin() const { return Values.begin(); }
  inline const_iterator end() const { return Values.end(); }
};

SetVector<Value*> ClastStmtCodeGen::getOMPValues(const clast_stmt *Body) {
  SetVector<Value*> Values;

  // The clast variables.
  for (CharMapT::iterator I = ClastVars.begin(), E = ClastVars.end();
       I != E; I++)
    Values.insert(I->second);

  // Find the temporaries that are referenced in the clast statements'
  // basic blocks but are not defined by these blocks (e.g., references
  // to function arguments or temporaries defined before the start of
  // the SCoP).
  ParameterVisitor Params;
  Params.visit(Body);

  for (ParameterVisitor::const_iterator PI = Params.begin(), PE = Params.end();
       PI != PE; ++PI) {
    Value *V = *PI;
    Values.insert(V);
    DEBUG(dbgs() << "Adding temporary for OMP copy-in: " << *V << "\n");
  }

  return Values;
}
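
// As an illustration, for a SCoP like
//
//   for (i = 0; i < n; i++)
//     A[i] = B[i] + c;
//
// the returned set would typically contain the Value computed for the
// parameter n (already present in ClastVars), any clast variables of
// enclosing loops, and the values %A, %B and %c that are defined outside the
// SCoP but referenced inside it (collected by ParameterVisitor). All of these
// have to be copied into the OpenMP subfunction structure.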

void ClastStmtCodeGen::updateWithValueMap(
    OMPGenerator::ValueToValueMapTy &VMap) {
  std::set<Value*> Inserted;

  for (CharMapT::iterator I = ClastVars.begin(), E = ClastVars.end();
       I != E; I++) {
    ClastVars[I->first] = VMap[I->second];
    Inserted.insert(I->second);
  }

  for (OMPGenerator::ValueToValueMapTy::iterator I = VMap.begin(),
       E = VMap.end(); I != E; ++I) {
    if (Inserted.count(I->first))
      continue;

    ValueMap[I->first] = I->second;
  }
}

static void clearDomtree(Function *F, DominatorTree &DT) {
  DomTreeNode *N = DT.getNode(&F->getEntryBlock());
  std::vector<BasicBlock*> Nodes;
  for (po_iterator<DomTreeNode*> I = po_begin(N), E = po_end(N); I != E; ++I)
    Nodes.push_back(I->getBlock());

  for (std::vector<BasicBlock*>::iterator I = Nodes.begin(), E = Nodes.end();
       I != E; ++I)
    DT.eraseNode(*I);
}

void ClastStmtCodeGen::codegenForOpenMP(const clast_for *For) {
  Value *Stride, *LB, *UB, *IV;
  BasicBlock::iterator LoopBody;
  IntegerType *IntPtrTy = getIntPtrTy();
  SetVector<Value*> Values;
  OMPGenerator::ValueToValueMapTy VMap;
  OMPGenerator OMPGen(Builder, P);

  Stride = Builder.getInt(APInt_from_MPZ(For->stride));
  Stride = Builder.CreateSExtOrBitCast(Stride, IntPtrTy);
  LB = ExpGen.codegen(For->LB, IntPtrTy);
  UB = ExpGen.codegen(For->UB, IntPtrTy);

  Values = getOMPValues(For->body);

  IV = OMPGen.createParallelLoop(LB, UB, Stride, Values, VMap, &LoopBody);
  BasicBlock::iterator AfterLoop = Builder.GetInsertPoint();
  Builder.SetInsertPoint(LoopBody);

  // Save the current values.
  const ValueMapT ValueMapCopy = ValueMap;
  const CharMapT ClastVarsCopy = ClastVars;

  updateWithValueMap(VMap);
  ClastVars[For->iterator] = IV;

  if (For->body)
    codegen(For->body);

  // Restore the original values.
  ValueMap = ValueMapCopy;
  ClastVars = ClastVarsCopy;

  clearDomtree((*LoopBody).getParent()->getParent(),
               P->getAnalysis<DominatorTree>());

  Builder.SetInsertPoint(AfterLoop);
}

#ifdef GPU_CODEGEN
static unsigned getArraySizeInBytes(const ArrayType *AT) {
  unsigned Bytes = AT->getNumElements();
  if (const ArrayType *T = dyn_cast<ArrayType>(AT->getElementType()))
    Bytes *= getArraySizeInBytes(T);
  else
    Bytes *= AT->getElementType()->getPrimitiveSizeInBits() / 8;

  return Bytes;
}
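
// For example, an array type [8 x [4 x float]] yields
// 8 * (4 * (32 / 8)) = 128 bytes.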

SetVector<Value*> ClastStmtCodeGen::getGPUValues(unsigned &OutputBytes) {
  SetVector<Value*> Values;
  OutputBytes = 0;

  // Record the memory reference base addresses.
  for (Scop::iterator SI = S->begin(), SE = S->end(); SI != SE; ++SI) {
    ScopStmt *Stmt = *SI;
    for (SmallVector<MemoryAccess*, 8>::iterator I = Stmt->memacc_begin(),
         E = Stmt->memacc_end(); I != E; ++I) {
      Value *BaseAddr = const_cast<Value*>((*I)->getBaseAddr());
      Values.insert(BaseAddr);

      // FIXME: We assume that there is one and only one array to be written
      // in a SCoP.
      int NumWrites = 0;
      if ((*I)->isWrite()) {
        ++NumWrites;
        assert(NumWrites <= 1 &&
               "We support at most one array to be written in a SCoP.");
        if (const PointerType *PT =
                dyn_cast<PointerType>(BaseAddr->getType())) {
          Type *T = PT->getArrayElementType();
          const ArrayType *ATy = dyn_cast<ArrayType>(T);
          OutputBytes = getArraySizeInBytes(ATy);
        }
      }
    }
  }

  return Values;
}

const clast_stmt *ClastStmtCodeGen::getScheduleInfo(const clast_for *F,
                                                    std::vector<int> &NumIters,
                                                    unsigned &LoopDepth,
                                                    unsigned &NonPLoopDepth) {
  clast_stmt *Stmt = (clast_stmt *)F;
  const clast_for *Result;
  bool NonParaFlag = false;
  LoopDepth = 0;
  NonPLoopDepth = 0;

  while (Stmt) {
    if (CLAST_STMT_IS_A(Stmt, stmt_for)) {
      const clast_for *T = (clast_for *)Stmt;
      if (isParallelFor(T)) {
        if (!NonParaFlag) {
          NumIters.push_back(getNumberOfIterations(T));
          Result = T;
        }
      } else
        NonParaFlag = true;

      Stmt = T->body;
      LoopDepth++;
      continue;
    }
    Stmt = Stmt->next;
  }

  assert(NumIters.size() == 4 &&
         "The loops should be tiled into 4-depth parallel loops and an "
         "innermost non-parallel one (if it exists).");
  NonPLoopDepth = LoopDepth - NumIters.size();
  assert(NonPLoopDepth <= 1 &&
         "We support only one innermost non-parallel loop currently.");
  return (const clast_stmt *)Result->body;
}

void ClastStmtCodeGen::codegenForGPGPU(const clast_for *F) {
  BasicBlock::iterator LoopBody;
  SetVector<Value *> Values;
  SetVector<Value *> IVS;
  std::vector<int> NumIterations;
  PTXGenerator::ValueToValueMapTy VMap;

  assert(!GPUTriple.empty() &&
         "Target triple should be set properly for GPGPU code generation.");
  PTXGenerator PTXGen(Builder, P, GPUTriple);

  // Get the original IVS and ScopStmt.
  unsigned TiledLoopDepth, NonPLoopDepth;
  const clast_stmt *InnerStmt = getScheduleInfo(F, NumIterations,
                                                TiledLoopDepth, NonPLoopDepth);
  const clast_stmt *TmpStmt;
  const clast_user_stmt *U;
  const clast_for *InnerFor;
  if (CLAST_STMT_IS_A(InnerStmt, stmt_for)) {
    InnerFor = (const clast_for *)InnerStmt;
    TmpStmt = InnerFor->body;
  } else
    TmpStmt = InnerStmt;
  U = (const clast_user_stmt *)TmpStmt;
  ScopStmt *Statement = (ScopStmt *)U->statement->usr;
  for (unsigned i = 0; i < Statement->getNumIterators() - NonPLoopDepth; i++) {
    const Value *IV = Statement->getInductionVariableForDimension(i);
    IVS.insert(const_cast<Value *>(IV));
  }

  unsigned OutBytes;
  Values = getGPUValues(OutBytes);
  PTXGen.setOutputBytes(OutBytes);
  PTXGen.startGeneration(Values, IVS, VMap, &LoopBody);

  BasicBlock::iterator AfterLoop = Builder.GetInsertPoint();
  Builder.SetInsertPoint(LoopBody);

  BasicBlock *AfterBB = 0;
  if (NonPLoopDepth) {
    Value *LowerBound, *UpperBound, *IV, *Stride;
    Type *IntPtrTy = getIntPtrTy();
    LowerBound = ExpGen.codegen(InnerFor->LB, IntPtrTy);
    UpperBound = ExpGen.codegen(InnerFor->UB, IntPtrTy);
    Stride = Builder.getInt(APInt_from_MPZ(InnerFor->stride));
    IV = createLoop(LowerBound, UpperBound, Stride, Builder, P, AfterBB,
                    CmpInst::ICMP_SLE);
    const Value *OldIV_ = Statement->getInductionVariableForDimension(2);
    Value *OldIV = const_cast<Value *>(OldIV_);
    VMap.insert(std::make_pair<Value*, Value*>(OldIV, IV));
  }

  updateWithValueMap(VMap);

  BlockGenerator::generate(Builder, *Statement, ValueMap, P);

  if (AfterBB)
    Builder.SetInsertPoint(AfterBB->begin());

  // FIXME: The replacement of the host base address with the parameter of the
  // ptx subfunction should have been done by updateWithValueMap. We use the
  // following code to avoid affecting other parts of Polly. This should be
  // fixed later.
  Function *FN = Builder.GetInsertBlock()->getParent();
  for (unsigned j = 0; j < Values.size(); j++) {
    Value *baseAddr = Values[j];
    for (Function::iterator B = FN->begin(); B != FN->end(); ++B) {
      for (BasicBlock::iterator I = B->begin(); I != B->end(); ++I)
        I->replaceUsesOfWith(baseAddr, ValueMap[baseAddr]);
    }
  }
  Builder.SetInsertPoint(AfterLoop);
  PTXGen.setLaunchingParameters(NumIterations[0], NumIterations[1],
                                NumIterations[2], NumIterations[3]);
  PTXGen.finishGeneration(FN);
}
#endif

bool ClastStmtCodeGen::isInnermostLoop(const clast_for *f) {
  const clast_stmt *stmt = f->body;

  while (stmt) {
    if (!CLAST_STMT_IS_A(stmt, stmt_user))
      return false;

    stmt = stmt->next;
  }

  return true;
}

int ClastStmtCodeGen::getNumberOfIterations(const clast_for *f) {
  isl_set *loopDomain = isl_set_copy(isl_set_from_cloog_domain(f->domain));
  isl_set *tmp = isl_set_copy(loopDomain);

  // Calculate a map similar to the identity map, but with the last input
  // and output dimension not related.
  //   [i0, i1, i2, i3] -> [i0, i1, i2, o0]
  isl_space *Space = isl_set_get_space(loopDomain);
  Space = isl_space_drop_outputs(Space,
                                 isl_set_dim(loopDomain, isl_dim_set) - 2, 1);
  Space = isl_space_map_from_set(Space);
  isl_map *identity = isl_map_identity(Space);
  identity = isl_map_add_dims(identity, isl_dim_in, 1);
  identity = isl_map_add_dims(identity, isl_dim_out, 1);

  isl_map *map = isl_map_from_domain_and_range(tmp, loopDomain);
  map = isl_map_intersect(map, identity);

  isl_map *lexmax = isl_map_lexmax(isl_map_copy(map));
  isl_map *lexmin = isl_map_lexmin(map);
  isl_map *sub = isl_map_sum(lexmax, isl_map_neg(lexmin));

  isl_set *elements = isl_map_range(sub);

  if (!isl_set_is_singleton(elements)) {
    isl_set_free(elements);
    return -1;
  }

  isl_point *p = isl_set_sample_point(elements);

  isl_int v;
  isl_int_init(v);
  isl_point_get_coordinate(p, isl_dim_set, isl_set_n_dim(loopDomain) - 1, &v);
  int numberIterations = isl_int_get_si(v);
  isl_int_clear(v);
  isl_point_free(p);

  return (numberIterations) / isl_int_get_si(f->stride) + 1;
}
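
// For example, if the innermost dimension of the loop domain covers
// 0 <= i <= 7 and the stride is 2, lexmax - lexmin evaluates to 7 in that
// dimension and the function returns 7 / 2 + 1 = 4 iterations
// (i = 0, 2, 4, 6). A return value of -1 indicates that the iteration count
// is not a constant.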

void ClastStmtCodeGen::codegenForVector(const clast_for *F) {
  DEBUG(dbgs() << "Vectorizing loop '" << F->iterator << "'\n";);
  int VectorWidth = getNumberOfIterations(F);

  Value *LB = ExpGen.codegen(F->LB, getIntPtrTy());

  APInt Stride = APInt_from_MPZ(F->stride);
  IntegerType *LoopIVType = dyn_cast<IntegerType>(LB->getType());
  Stride = Stride.zext(LoopIVType->getBitWidth());
  Value *StrideValue = ConstantInt::get(LoopIVType, Stride);

  std::vector<Value*> IVS(VectorWidth);
  IVS[0] = LB;

  for (int i = 1; i < VectorWidth; i++)
    IVS[i] = Builder.CreateAdd(IVS[i-1], StrideValue, "p_vector_iv");

  isl_set *Domain = isl_set_from_cloog_domain(F->domain);

  // Add loop iv to symbols.
  ClastVars[F->iterator] = LB;

  const clast_stmt *Stmt = F->body;

  while (Stmt) {
    codegen((const clast_user_stmt *)Stmt, &IVS, F->iterator,
            isl_set_copy(Domain));
    Stmt = Stmt->next;
  }

  // Loop is finished, so remove its iv from the live symbols.
  isl_set_free(Domain);
  ClastVars.erase(F->iterator);
}

bool ClastStmtCodeGen::isParallelFor(const clast_for *f) {
  isl_set *Domain = isl_set_from_cloog_domain(f->domain);
  assert(Domain && "Cannot access domain of loop");

  Dependences &D = P->getAnalysis<Dependences>();

  return D.isParallelDimension(isl_set_copy(Domain), isl_set_n_dim(Domain));
}
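
/// @brief Generate code for a clast for loop, choosing a code generation
///        strategy.
///
/// A parallel loop is vectorized if the vectorizer is enabled, the loop is
/// innermost and its iteration count is a known constant of at most 16. It is
/// turned into an OpenMP parallel loop if OpenMP code generation is enabled
/// and we are not already generating parallel code, and it is handed to the
/// GPGPU backend if that backend is compiled in and enabled. All remaining
/// loops are generated as ordinary sequential loops.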
void ClastStmtCodeGen::codegen(const clast_for *f) {
  bool Vector = PollyVectorizerChoice != VECTORIZER_NONE;
  if ((Vector || OpenMP) && isParallelFor(f)) {
    if (Vector && isInnermostLoop(f) && (-1 != getNumberOfIterations(f)) &&
        (getNumberOfIterations(f) <= 16)) {
      codegenForVector(f);
      return;
    }

    if (OpenMP && !parallelCodeGeneration) {
      parallelCodeGeneration = true;
      parallelLoops.push_back(f->iterator);
      codegenForOpenMP(f);
      parallelCodeGeneration = false;
      return;
    }
  }

#ifdef GPU_CODEGEN
  if (GPGPU && isParallelFor(f)) {
    if (!parallelCodeGeneration) {
      parallelCodeGeneration = true;
      parallelLoops.push_back(f->iterator);
      codegenForGPGPU(f);
      parallelCodeGeneration = false;
      return;
    }
  }
#endif

  codegenForSequential(f);
}

Value *ClastStmtCodeGen::codegen(const clast_equation *eq) {
  Value *LHS = ExpGen.codegen(eq->LHS, getIntPtrTy());
  Value *RHS = ExpGen.codegen(eq->RHS, getIntPtrTy());
  CmpInst::Predicate P;

  if (eq->sign == 0)
    P = ICmpInst::ICMP_EQ;
  else if (eq->sign > 0)
    P = ICmpInst::ICMP_SGE;
  else
    P = ICmpInst::ICMP_SLE;

  return Builder.CreateICmp(P, LHS, RHS);
}

void ClastStmtCodeGen::codegen(const clast_guard *g) {
  Function *F = Builder.GetInsertBlock()->getParent();
  LLVMContext &Context = F->getContext();

  BasicBlock *CondBB = SplitBlock(Builder.GetInsertBlock(),
                                  Builder.GetInsertPoint(), P);
  CondBB->setName("polly.cond");
  BasicBlock *MergeBB = SplitBlock(CondBB, CondBB->begin(), P);
  MergeBB->setName("polly.merge");
  BasicBlock *ThenBB = BasicBlock::Create(Context, "polly.then", F);

  DominatorTree &DT = P->getAnalysis<DominatorTree>();
  DT.addNewBlock(ThenBB, CondBB);
  DT.changeImmediateDominator(MergeBB, CondBB);

  CondBB->getTerminator()->eraseFromParent();

  Builder.SetInsertPoint(CondBB);

  Value *Predicate = codegen(&(g->eq[0]));

  for (int i = 1; i < g->n; ++i) {
    Value *TmpPredicate = codegen(&(g->eq[i]));
    Predicate = Builder.CreateAnd(Predicate, TmpPredicate);
  }

  Builder.CreateCondBr(Predicate, ThenBB, MergeBB);
  Builder.SetInsertPoint(ThenBB);
  Builder.CreateBr(MergeBB);
  Builder.SetInsertPoint(ThenBB->begin());

  codegen(g->then);

  Builder.SetInsertPoint(MergeBB->begin());
}

void ClastStmtCodeGen::codegen(const clast_stmt *stmt) {
  if (CLAST_STMT_IS_A(stmt, stmt_root))
    assert(false && "No second root statement expected");
  else if (CLAST_STMT_IS_A(stmt, stmt_ass))
    codegen((const clast_assignment *)stmt);
  else if (CLAST_STMT_IS_A(stmt, stmt_user))
    codegen((const clast_user_stmt *)stmt);
  else if (CLAST_STMT_IS_A(stmt, stmt_block))
    codegen((const clast_block *)stmt);
  else if (CLAST_STMT_IS_A(stmt, stmt_for))
    codegen((const clast_for *)stmt);
  else if (CLAST_STMT_IS_A(stmt, stmt_guard))
    codegen((const clast_guard *)stmt);

  if (stmt->next)
    codegen(stmt->next);
}

void ClastStmtCodeGen::addParameters(const CloogNames *names) {
  SCEVExpander Rewriter(P->getAnalysis<ScalarEvolution>(), "polly");

  int i = 0;
  for (Scop::param_iterator PI = S->param_begin(), PE = S->param_end();
       PI != PE; ++PI) {
    assert(i < names->nb_parameters && "Not enough parameter names");

    const SCEV *Param = *PI;
    Type *Ty = Param->getType();

    Instruction *insertLocation = --(Builder.GetInsertBlock()->end());
    Value *V = Rewriter.expandCodeFor(Param, Ty, insertLocation);
    ClastVars[names->parameters[i]] = V;

    ++i;
  }
}

void ClastStmtCodeGen::codegen(const clast_root *r) {
  addParameters(r->names);

  parallelCodeGeneration = false;

  const clast_stmt *stmt = (const clast_stmt *)r;
  if (stmt->next)
    codegen(stmt->next);
}

ClastStmtCodeGen::ClastStmtCodeGen(Scop *scop, IRBuilder<> &B, Pass *P)
  : S(scop), P(P), Builder(B), ExpGen(Builder, ClastVars) {}

namespace {
class CodeGeneration : public ScopPass {
  std::vector<std::string> ParallelLoops;

public:
  static char ID;

  CodeGeneration() : ScopPass(ID) {}

  bool runOnScop(Scop &S) {
    ParallelLoops.clear();

    assert(S.getRegion().isSimple() && "Only simple regions are supported");

    BasicBlock *StartBlock = executeScopConditionally(S, this);

    IRBuilder<> Builder(StartBlock->begin());

    ClastStmtCodeGen CodeGen(&S, Builder, this);
    CloogInfo &C = getAnalysis<CloogInfo>();
    CodeGen.codegen(C.getClast());

    ParallelLoops.insert(ParallelLoops.begin(),
                         CodeGen.getParallelLoops().begin(),
                         CodeGen.getParallelLoops().end());
    return true;
  }

  virtual void printScop(raw_ostream &OS) const {
    for (std::vector<std::string>::const_iterator PI = ParallelLoops.begin(),
         PE = ParallelLoops.end(); PI != PE; ++PI)
      OS << "Parallel loop with iterator '" << *PI << "' generated\n";
  }

  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
    AU.addRequired<CloogInfo>();
    AU.addRequired<Dependences>();
    AU.addRequired<DominatorTree>();
    AU.addRequired<RegionInfo>();
    AU.addRequired<ScalarEvolution>();
    AU.addRequired<ScopDetection>();
    AU.addRequired<ScopInfo>();
    AU.addRequired<DataLayout>();

    AU.addPreserved<CloogInfo>();
    AU.addPreserved<Dependences>();

    // FIXME: We do not create LoopInfo for the newly generated loops.
    AU.addPreserved<LoopInfo>();
    AU.addPreserved<DominatorTree>();
    AU.addPreserved<ScopDetection>();
    AU.addPreserved<ScalarEvolution>();

    // FIXME: We do not yet add regions for the newly generated code to the
    // region tree.
    AU.addPreserved<RegionInfo>();
    AU.addPreserved<TempScopInfo>();
    AU.addPreserved<ScopInfo>();
    AU.addPreservedID(IndependentBlocksID);
  }
};
}
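
// Typical invocation (illustrative only; the exact pipeline depends on how
// Polly is built and which preparation passes have already been run):
//
//   opt -load LLVMPolly.so -polly-codegen -enable-polly-openmp \
//       input.ll -S -o output.ll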

char CodeGeneration::ID = 1;

INITIALIZE_PASS_BEGIN(CodeGeneration, "polly-codegen",
                      "Polly - Create LLVM-IR from SCoPs", false, false)
INITIALIZE_PASS_DEPENDENCY(CloogInfo)
INITIALIZE_PASS_DEPENDENCY(Dependences)
INITIALIZE_PASS_DEPENDENCY(DominatorTree)
INITIALIZE_PASS_DEPENDENCY(RegionInfo)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
INITIALIZE_PASS_DEPENDENCY(ScopDetection)
INITIALIZE_PASS_DEPENDENCY(DataLayout)
INITIALIZE_PASS_END(CodeGeneration, "polly-codegen",
                    "Polly - Create LLVM-IR from SCoPs", false, false)

Pass *polly::createCodeGenerationPass() {
  return new CodeGeneration();
}

#endif // CLOOG_FOUND