1 //===------ CodeGeneration.cpp - Code generate the Scops. -----------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // The CodeGeneration pass takes a Scop created by ScopInfo and translates it 11 // back to LLVM-IR using Cloog. 12 // 13 // The Scop describes the high level memory behaviour of a control flow region. 14 // Transformation passes can update the schedule (execution order) of statements 15 // in the Scop. Cloog is used to generate an abstract syntax tree (clast) that 16 // reflects the updated execution order. This clast is used to create new 17 // LLVM-IR that is computational equivalent to the original control flow region, 18 // but executes its code in the new execution order defined by the changed 19 // scattering. 20 // 21 //===----------------------------------------------------------------------===// 22 23 #include "polly/CodeGen/Cloog.h" 24 #ifdef CLOOG_FOUND 25 26 #define DEBUG_TYPE "polly-codegen" 27 #include "polly/Dependences.h" 28 #include "polly/LinkAllPasses.h" 29 #include "polly/ScopInfo.h" 30 #include "polly/TempScopInfo.h" 31 #include "polly/CodeGen/CodeGeneration.h" 32 #include "polly/CodeGen/BlockGenerators.h" 33 #include "polly/CodeGen/LoopGenerators.h" 34 #include "polly/CodeGen/PTXGenerator.h" 35 #include "polly/CodeGen/Utils.h" 36 #include "polly/Support/GICHelper.h" 37 38 #include "llvm/Module.h" 39 #include "llvm/ADT/SetVector.h" 40 #include "llvm/ADT/PostOrderIterator.h" 41 #include "llvm/Analysis/LoopInfo.h" 42 #include "llvm/Analysis/ScalarEvolutionExpander.h" 43 #include "llvm/Support/CommandLine.h" 44 #include "llvm/Support/Debug.h" 45 #include "llvm/DataLayout.h" 46 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 47 48 #define CLOOG_INT_GMP 1 49 #include "cloog/cloog.h" 50 #include "cloog/isl/cloog.h" 51 52 #include "isl/aff.h" 53 54 #include <vector> 55 #include <utility> 56 57 using namespace polly; 58 using namespace llvm; 59 60 struct isl_set; 61 62 namespace polly { 63 static cl::opt<bool> 64 OpenMP("enable-polly-openmp", 65 cl::desc("Generate OpenMP parallel code"), cl::Hidden, 66 cl::value_desc("OpenMP code generation enabled if true"), 67 cl::init(false), cl::ZeroOrMore); 68 69 #ifdef GPU_CODEGEN 70 static cl::opt<bool> 71 GPGPU("enable-polly-gpgpu", 72 cl::desc("Generate GPU parallel code"), cl::Hidden, 73 cl::value_desc("GPGPU code generation enabled if true"), 74 cl::init(false), cl::ZeroOrMore); 75 76 static cl::opt<std::string> 77 GPUTriple("polly-gpgpu-triple", 78 cl::desc("Target triple for GPU code generation"), 79 cl::Hidden, cl::init("")); 80 #endif /* GPU_CODEGEN */ 81 82 static cl::opt<bool> 83 AtLeastOnce("enable-polly-atLeastOnce", 84 cl::desc("Give polly the hint, that every loop is executed at least" 85 "once"), cl::Hidden, 86 cl::value_desc("OpenMP code generation enabled if true"), 87 cl::init(false), cl::ZeroOrMore); 88 89 typedef DenseMap<const char*, Value*> CharMapT; 90 91 /// Class to generate LLVM-IR that calculates the value of a clast_expr. 92 class ClastExpCodeGen { 93 IRBuilder<> &Builder; 94 const CharMapT &IVS; 95 96 Value *codegen(const clast_name *e, Type *Ty); 97 Value *codegen(const clast_term *e, Type *Ty); 98 Value *codegen(const clast_binary *e, Type *Ty); 99 Value *codegen(const clast_reduction *r, Type *Ty); 100 public: 101 102 // A generator for clast expressions. 103 // 104 // @param B The IRBuilder that defines where the code to calculate the 105 // clast expressions should be inserted. 106 // @param IVMAP A Map that translates strings describing the induction 107 // variables to the Values* that represent these variables 108 // on the LLVM side. 109 ClastExpCodeGen(IRBuilder<> &B, CharMapT &IVMap); 110 111 // Generates code to calculate a given clast expression. 112 // 113 // @param e The expression to calculate. 114 // @return The Value that holds the result. 115 Value *codegen(const clast_expr *e, Type *Ty); 116 }; 117 118 Value *ClastExpCodeGen::codegen(const clast_name *e, Type *Ty) { 119 CharMapT::const_iterator I = IVS.find(e->name); 120 121 assert(I != IVS.end() && "Clast name not found"); 122 123 return Builder.CreateSExtOrBitCast(I->second, Ty); 124 } 125 126 Value *ClastExpCodeGen::codegen(const clast_term *e, Type *Ty) { 127 APInt a = APInt_from_MPZ(e->val); 128 129 Value *ConstOne = ConstantInt::get(Builder.getContext(), a); 130 ConstOne = Builder.CreateSExtOrBitCast(ConstOne, Ty); 131 132 if (!e->var) 133 return ConstOne; 134 135 Value *var = codegen(e->var, Ty); 136 return Builder.CreateMul(ConstOne, var); 137 } 138 139 Value *ClastExpCodeGen::codegen(const clast_binary *e, Type *Ty) { 140 Value *LHS = codegen(e->LHS, Ty); 141 142 APInt RHS_AP = APInt_from_MPZ(e->RHS); 143 144 Value *RHS = ConstantInt::get(Builder.getContext(), RHS_AP); 145 RHS = Builder.CreateSExtOrBitCast(RHS, Ty); 146 147 switch (e->type) { 148 case clast_bin_mod: 149 return Builder.CreateSRem(LHS, RHS); 150 case clast_bin_fdiv: 151 { 152 // floord(n,d) ((n < 0) ? (n - d + 1) : n) / d 153 Value *One = ConstantInt::get(Ty, 1); 154 Value *Zero = ConstantInt::get(Ty, 0); 155 Value *Sum1 = Builder.CreateSub(LHS, RHS); 156 Value *Sum2 = Builder.CreateAdd(Sum1, One); 157 Value *isNegative = Builder.CreateICmpSLT(LHS, Zero); 158 Value *Dividend = Builder.CreateSelect(isNegative, Sum2, LHS); 159 return Builder.CreateSDiv(Dividend, RHS); 160 } 161 case clast_bin_cdiv: 162 { 163 // ceild(n,d) ((n < 0) ? n : (n + d - 1)) / d 164 Value *One = ConstantInt::get(Ty, 1); 165 Value *Zero = ConstantInt::get(Ty, 0); 166 Value *Sum1 = Builder.CreateAdd(LHS, RHS); 167 Value *Sum2 = Builder.CreateSub(Sum1, One); 168 Value *isNegative = Builder.CreateICmpSLT(LHS, Zero); 169 Value *Dividend = Builder.CreateSelect(isNegative, LHS, Sum2); 170 return Builder.CreateSDiv(Dividend, RHS); 171 } 172 case clast_bin_div: 173 return Builder.CreateSDiv(LHS, RHS); 174 }; 175 176 llvm_unreachable("Unknown clast binary expression type"); 177 } 178 179 Value *ClastExpCodeGen::codegen(const clast_reduction *r, Type *Ty) { 180 assert(( r->type == clast_red_min 181 || r->type == clast_red_max 182 || r->type == clast_red_sum) 183 && "Clast reduction type not supported"); 184 Value *old = codegen(r->elts[0], Ty); 185 186 for (int i=1; i < r->n; ++i) { 187 Value *exprValue = codegen(r->elts[i], Ty); 188 189 switch (r->type) { 190 case clast_red_min: 191 { 192 Value *cmp = Builder.CreateICmpSLT(old, exprValue); 193 old = Builder.CreateSelect(cmp, old, exprValue); 194 break; 195 } 196 case clast_red_max: 197 { 198 Value *cmp = Builder.CreateICmpSGT(old, exprValue); 199 old = Builder.CreateSelect(cmp, old, exprValue); 200 break; 201 } 202 case clast_red_sum: 203 old = Builder.CreateAdd(old, exprValue); 204 break; 205 } 206 } 207 208 return old; 209 } 210 211 ClastExpCodeGen::ClastExpCodeGen(IRBuilder<> &B, CharMapT &IVMap) 212 : Builder(B), IVS(IVMap) {} 213 214 Value *ClastExpCodeGen::codegen(const clast_expr *e, Type *Ty) { 215 switch(e->type) { 216 case clast_expr_name: 217 return codegen((const clast_name *)e, Ty); 218 case clast_expr_term: 219 return codegen((const clast_term *)e, Ty); 220 case clast_expr_bin: 221 return codegen((const clast_binary *)e, Ty); 222 case clast_expr_red: 223 return codegen((const clast_reduction *)e, Ty); 224 } 225 226 llvm_unreachable("Unknown clast expression!"); 227 } 228 229 class ClastStmtCodeGen { 230 public: 231 const std::vector<std::string> &getParallelLoops(); 232 233 private: 234 // The Scop we code generate. 235 Scop *S; 236 Pass *P; 237 238 // The Builder specifies the current location to code generate at. 239 IRBuilder<> &Builder; 240 241 // Map the Values from the old code to their counterparts in the new code. 242 ValueMapT ValueMap; 243 244 // clastVars maps from the textual representation of a clast variable to its 245 // current *Value. clast variables are scheduling variables, original 246 // induction variables or parameters. They are used either in loop bounds or 247 // to define the statement instance that is executed. 248 // 249 // for (s = 0; s < n + 3; ++i) 250 // for (t = s; t < m; ++j) 251 // Stmt(i = s + 3 * m, j = t); 252 // 253 // {s,t,i,j,n,m} is the set of clast variables in this clast. 254 CharMapT ClastVars; 255 256 // Codegenerator for clast expressions. 257 ClastExpCodeGen ExpGen; 258 259 // Do we currently generate parallel code? 260 bool parallelCodeGeneration; 261 262 std::vector<std::string> parallelLoops; 263 264 void codegen(const clast_assignment *a); 265 266 void codegen(const clast_assignment *a, ScopStmt *Statement, 267 unsigned Dimension, int vectorDim, 268 std::vector<ValueMapT> *VectorVMap = 0); 269 270 void codegenSubstitutions(const clast_stmt *Assignment, 271 ScopStmt *Statement, int vectorDim = 0, 272 std::vector<ValueMapT> *VectorVMap = 0); 273 274 void codegen(const clast_user_stmt *u, std::vector<Value*> *IVS = NULL, 275 const char *iterator = NULL, isl_set *scatteringDomain = 0); 276 277 void codegen(const clast_block *b); 278 279 /// @brief Create a classical sequential loop. 280 void codegenForSequential(const clast_for *f); 281 282 /// @brief Create OpenMP structure values. 283 /// 284 /// Create a list of values that has to be stored into the OpenMP subfuncition 285 /// structure. 286 SetVector<Value*> getOMPValues(const clast_stmt *Body); 287 288 /// @brief Update ClastVars and ValueMap according to a value map. 289 /// 290 /// @param VMap A map from old to new values. 291 void updateWithValueMap(OMPGenerator::ValueToValueMapTy &VMap); 292 293 /// @brief Create an OpenMP parallel for loop. 294 /// 295 /// This loop reflects a loop as if it would have been created by an OpenMP 296 /// statement. 297 void codegenForOpenMP(const clast_for *f); 298 299 #ifdef GPU_CODEGEN 300 /// @brief Create GPGPU device memory access values. 301 /// 302 /// Create a list of values that will be set to be parameters of the GPGPU 303 /// subfunction. These parameters represent device memory base addresses 304 /// and the size in bytes. 305 SetVector<Value*> getGPUValues(unsigned &OutputBytes); 306 307 /// @brief Create a GPU parallel for loop. 308 /// 309 /// This loop reflects a loop as if it would have been created by a GPU 310 /// statement. 311 void codegenForGPGPU(const clast_for *F); 312 313 /// @brief Get innermost for loop. 314 const clast_stmt *getScheduleInfo(const clast_for *F, 315 std::vector<int> &NumIters, 316 unsigned &LoopDepth, 317 unsigned &NonPLoopDepth); 318 #endif /* GPU_CODEGEN */ 319 320 /// @brief Check if a loop is parallel 321 /// 322 /// Detect if a clast_for loop can be executed in parallel. 323 /// 324 /// @param f The clast for loop to check. 325 /// 326 /// @return bool Returns true if the incoming clast_for statement can 327 /// execute in parallel. 328 bool isParallelFor(const clast_for *For); 329 330 bool isInnermostLoop(const clast_for *f); 331 332 /// @brief Get the number of loop iterations for this loop. 333 /// @param f The clast for loop to check. 334 int getNumberOfIterations(const clast_for *f); 335 336 /// @brief Create vector instructions for this loop. 337 void codegenForVector(const clast_for *f); 338 339 void codegen(const clast_for *f); 340 341 Value *codegen(const clast_equation *eq); 342 343 void codegen(const clast_guard *g); 344 345 void codegen(const clast_stmt *stmt); 346 347 void addParameters(const CloogNames *names); 348 349 IntegerType *getIntPtrTy(); 350 351 public: 352 void codegen(const clast_root *r); 353 354 ClastStmtCodeGen(Scop *scop, IRBuilder<> &B, Pass *P); 355 }; 356 } 357 358 IntegerType *ClastStmtCodeGen::getIntPtrTy() { 359 // FIXME: This might need to get a proper address space. Hard code 0 for now. 360 return P->getAnalysis<DataLayout>().getIntPtrType(Builder.getContext(), 0u); 361 } 362 363 const std::vector<std::string> &ClastStmtCodeGen::getParallelLoops() { 364 return parallelLoops; 365 } 366 367 void ClastStmtCodeGen::codegen(const clast_assignment *a) { 368 Value *V= ExpGen.codegen(a->RHS, getIntPtrTy()); 369 ClastVars[a->LHS] = V; 370 } 371 372 void ClastStmtCodeGen::codegen(const clast_assignment *A, ScopStmt *Stmt, 373 unsigned Dim, int VectorDim, 374 std::vector<ValueMapT> *VectorVMap) { 375 const PHINode *PN; 376 Value *RHS; 377 378 assert(!A->LHS && "Statement assignments do not have left hand side"); 379 380 PN = Stmt->getInductionVariableForDimension(Dim); 381 RHS = ExpGen.codegen(A->RHS, Builder.getInt64Ty()); 382 RHS = Builder.CreateTruncOrBitCast(RHS, PN->getType()); 383 384 if (VectorVMap) 385 (*VectorVMap)[VectorDim][PN] = RHS; 386 387 ValueMap[PN] = RHS; 388 } 389 390 void ClastStmtCodeGen::codegenSubstitutions(const clast_stmt *Assignment, 391 ScopStmt *Statement, int vectorDim, 392 std::vector<ValueMapT> *VectorVMap) { 393 int Dimension = 0; 394 395 while (Assignment) { 396 assert(CLAST_STMT_IS_A(Assignment, stmt_ass) 397 && "Substitions are expected to be assignments"); 398 codegen((const clast_assignment *)Assignment, Statement, Dimension, 399 vectorDim, VectorVMap); 400 Assignment = Assignment->next; 401 Dimension++; 402 } 403 } 404 405 void ClastStmtCodeGen::codegen(const clast_user_stmt *u, 406 std::vector<Value*> *IVS , const char *iterator, 407 isl_set *Domain) { 408 ScopStmt *Statement = (ScopStmt *)u->statement->usr; 409 410 if (u->substitutions) 411 codegenSubstitutions(u->substitutions, Statement); 412 413 int VectorDimensions = IVS ? IVS->size() : 1; 414 415 if (VectorDimensions == 1) { 416 BlockGenerator::generate(Builder, *Statement, ValueMap, P); 417 return; 418 } 419 420 VectorValueMapT VectorMap(VectorDimensions); 421 422 if (IVS) { 423 assert (u->substitutions && "Substitutions expected!"); 424 int i = 0; 425 for (std::vector<Value*>::iterator II = IVS->begin(), IE = IVS->end(); 426 II != IE; ++II) { 427 ClastVars[iterator] = *II; 428 codegenSubstitutions(u->substitutions, Statement, i, &VectorMap); 429 i++; 430 } 431 } 432 433 VectorBlockGenerator::generate(Builder, *Statement, VectorMap, Domain, P); 434 } 435 436 void ClastStmtCodeGen::codegen(const clast_block *b) { 437 if (b->body) 438 codegen(b->body); 439 } 440 441 void ClastStmtCodeGen::codegenForSequential(const clast_for *f) { 442 Value *LowerBound, *UpperBound, *IV, *Stride; 443 BasicBlock *AfterBB; 444 Type *IntPtrTy = getIntPtrTy(); 445 446 LowerBound = ExpGen.codegen(f->LB, IntPtrTy); 447 UpperBound = ExpGen.codegen(f->UB, IntPtrTy); 448 Stride = Builder.getInt(APInt_from_MPZ(f->stride)); 449 450 IV = createLoop(LowerBound, UpperBound, Stride, Builder, P, AfterBB, 451 CmpInst::ICMP_SLE); 452 453 // Add loop iv to symbols. 454 ClastVars[f->iterator] = IV; 455 456 if (f->body) 457 codegen(f->body); 458 459 // Loop is finished, so remove its iv from the live symbols. 460 ClastVars.erase(f->iterator); 461 Builder.SetInsertPoint(AfterBB->begin()); 462 } 463 464 // Helper class to determine all scalar parameters used in the basic blocks of a 465 // clast. Scalar parameters are scalar variables defined outside of the SCoP. 466 class ParameterVisitor : public ClastVisitor { 467 std::set<Value *> Values; 468 public: 469 ParameterVisitor() : ClastVisitor(), Values() { } 470 471 void visitUser(const clast_user_stmt *Stmt) { 472 const ScopStmt *S = static_cast<const ScopStmt *>(Stmt->statement->usr); 473 const BasicBlock *BB = S->getBasicBlock(); 474 475 // Check all the operands of instructions in the basic block. 476 for (BasicBlock::const_iterator BI = BB->begin(), BE = BB->end(); BI != BE; 477 ++BI) { 478 const Instruction &Inst = *BI; 479 for (Instruction::const_op_iterator II = Inst.op_begin(), 480 IE = Inst.op_end(); II != IE; ++II) { 481 Value *SrcVal = *II; 482 483 if (Instruction *OpInst = dyn_cast<Instruction>(SrcVal)) 484 if (S->getParent()->getRegion().contains(OpInst)) 485 continue; 486 487 if (isa<Instruction>(SrcVal) || isa<Argument>(SrcVal)) 488 Values.insert(SrcVal); 489 } 490 } 491 } 492 493 // Iterator to iterate over the values found. 494 typedef std::set<Value *>::const_iterator const_iterator; 495 inline const_iterator begin() const { return Values.begin(); } 496 inline const_iterator end() const { return Values.end(); } 497 }; 498 499 SetVector<Value*> ClastStmtCodeGen::getOMPValues(const clast_stmt *Body) { 500 SetVector<Value*> Values; 501 502 // The clast variables 503 for (CharMapT::iterator I = ClastVars.begin(), E = ClastVars.end(); 504 I != E; I++) 505 Values.insert(I->second); 506 507 // The memory reference base addresses 508 for (Scop::iterator SI = S->begin(), SE = S->end(); SI != SE; ++SI) { 509 ScopStmt *Stmt = *SI; 510 for (SmallVector<MemoryAccess*, 8>::iterator I = Stmt->memacc_begin(), 511 E = Stmt->memacc_end(); I != E; ++I) { 512 Value *BaseAddr = const_cast<Value*>((*I)->getBaseAddr()); 513 Values.insert((BaseAddr)); 514 } 515 } 516 517 // Find the temporaries that are referenced in the clast statements' 518 // basic blocks but are not defined by these blocks (e.g., references 519 // to function arguments or temporaries defined before the start of 520 // the SCoP). 521 ParameterVisitor Params; 522 Params.visit(Body); 523 524 for (ParameterVisitor::const_iterator PI = Params.begin(), PE = Params.end(); 525 PI != PE; ++PI) { 526 Value *V = *PI; 527 Values.insert(V); 528 DEBUG(dbgs() << "Adding temporary for OMP copy-in: " << *V << "\n"); 529 } 530 531 return Values; 532 } 533 534 void ClastStmtCodeGen::updateWithValueMap( 535 OMPGenerator::ValueToValueMapTy &VMap) { 536 std::set<Value*> Inserted; 537 538 for (CharMapT::iterator I = ClastVars.begin(), E = ClastVars.end(); 539 I != E; I++) { 540 ClastVars[I->first] = VMap[I->second]; 541 Inserted.insert(I->second); 542 } 543 544 for (OMPGenerator::ValueToValueMapTy::iterator I = VMap.begin(), 545 E = VMap.end(); I != E; ++I) { 546 if (Inserted.count(I->first)) 547 continue; 548 549 ValueMap[I->first] = I->second; 550 } 551 } 552 553 static void clearDomtree(Function *F, DominatorTree &DT) { 554 DomTreeNode *N = DT.getNode(&F->getEntryBlock()); 555 std::vector<BasicBlock*> Nodes; 556 for (po_iterator<DomTreeNode*> I = po_begin(N), E = po_end(N); I != E; ++I) 557 Nodes.push_back(I->getBlock()); 558 559 for (std::vector<BasicBlock*>::iterator I = Nodes.begin(), E = Nodes.end(); 560 I != E; ++I) 561 DT.eraseNode(*I); 562 } 563 564 void ClastStmtCodeGen::codegenForOpenMP(const clast_for *For) { 565 Value *Stride, *LB, *UB, *IV; 566 BasicBlock::iterator LoopBody; 567 IntegerType *IntPtrTy = getIntPtrTy(); 568 SetVector<Value*> Values; 569 OMPGenerator::ValueToValueMapTy VMap; 570 OMPGenerator OMPGen(Builder, P); 571 572 Stride = Builder.getInt(APInt_from_MPZ(For->stride)); 573 Stride = Builder.CreateSExtOrBitCast(Stride, IntPtrTy); 574 LB = ExpGen.codegen(For->LB, IntPtrTy); 575 UB = ExpGen.codegen(For->UB, IntPtrTy); 576 577 Values = getOMPValues(For->body); 578 579 IV = OMPGen.createParallelLoop(LB, UB, Stride, Values, VMap, &LoopBody); 580 BasicBlock::iterator AfterLoop = Builder.GetInsertPoint(); 581 Builder.SetInsertPoint(LoopBody); 582 583 // Save the current values. 584 const ValueMapT ValueMapCopy = ValueMap; 585 const CharMapT ClastVarsCopy = ClastVars; 586 587 updateWithValueMap(VMap); 588 ClastVars[For->iterator] = IV; 589 590 if (For->body) 591 codegen(For->body); 592 593 // Restore the original values. 594 ValueMap = ValueMapCopy; 595 ClastVars = ClastVarsCopy; 596 597 clearDomtree((*LoopBody).getParent()->getParent(), 598 P->getAnalysis<DominatorTree>()); 599 600 Builder.SetInsertPoint(AfterLoop); 601 } 602 603 #ifdef GPU_CODEGEN 604 static unsigned getArraySizeInBytes(const ArrayType *AT) { 605 unsigned Bytes = AT->getNumElements(); 606 if (const ArrayType *T = dyn_cast<ArrayType>(AT->getElementType())) 607 Bytes *= getArraySizeInBytes(T); 608 else 609 Bytes *= AT->getElementType()->getPrimitiveSizeInBits() / 8; 610 611 return Bytes; 612 } 613 614 SetVector<Value*> ClastStmtCodeGen::getGPUValues(unsigned &OutputBytes) { 615 SetVector<Value*> Values; 616 OutputBytes = 0; 617 618 // Record the memory reference base addresses. 619 for (Scop::iterator SI = S->begin(), SE = S->end(); SI != SE; ++SI) { 620 ScopStmt *Stmt = *SI; 621 for (SmallVector<MemoryAccess*, 8>::iterator I = Stmt->memacc_begin(), 622 E = Stmt->memacc_end(); I != E; ++I) { 623 Value *BaseAddr = const_cast<Value*>((*I)->getBaseAddr()); 624 Values.insert((BaseAddr)); 625 626 // FIXME: we assume that there is one and only one array to be written 627 // in a SCoP. 628 int NumWrites = 0; 629 if ((*I)->isWrite()) { 630 ++NumWrites; 631 assert(NumWrites <= 1 && 632 "We support at most one array to be written in a SCoP."); 633 if (const PointerType * PT = 634 dyn_cast<PointerType>(BaseAddr->getType())) { 635 Type *T = PT->getArrayElementType(); 636 const ArrayType *ATy = dyn_cast<ArrayType>(T); 637 OutputBytes = getArraySizeInBytes(ATy); 638 } 639 } 640 } 641 } 642 643 return Values; 644 } 645 646 const clast_stmt *ClastStmtCodeGen::getScheduleInfo(const clast_for *F, 647 std::vector<int> &NumIters, 648 unsigned &LoopDepth, 649 unsigned &NonPLoopDepth) { 650 clast_stmt *Stmt = (clast_stmt *)F; 651 const clast_for *Result; 652 bool NonParaFlag = false; 653 LoopDepth = 0; 654 NonPLoopDepth = 0; 655 656 while (Stmt) { 657 if (CLAST_STMT_IS_A(Stmt, stmt_for)) { 658 const clast_for *T = (clast_for *) Stmt; 659 if (isParallelFor(T)) { 660 if (!NonParaFlag) { 661 NumIters.push_back(getNumberOfIterations(T)); 662 Result = T; 663 } 664 } else 665 NonParaFlag = true; 666 667 Stmt = T->body; 668 LoopDepth++; 669 continue; 670 } 671 Stmt = Stmt->next; 672 } 673 674 assert(NumIters.size() == 4 && 675 "The loops should be tiled into 4-depth parallel loops and an " 676 "innermost non-parallel one (if exist)."); 677 NonPLoopDepth = LoopDepth - NumIters.size(); 678 assert(NonPLoopDepth <= 1 679 && "We support only one innermost non-parallel loop currently."); 680 return (const clast_stmt *)Result->body; 681 } 682 683 void ClastStmtCodeGen::codegenForGPGPU(const clast_for *F) { 684 BasicBlock::iterator LoopBody; 685 SetVector<Value *> Values; 686 SetVector<Value *> IVS; 687 std::vector<int> NumIterations; 688 PTXGenerator::ValueToValueMapTy VMap; 689 690 assert(!GPUTriple.empty() 691 && "Target triple should be set properly for GPGPU code generation."); 692 PTXGenerator PTXGen(Builder, P, GPUTriple); 693 694 // Get original IVS and ScopStmt 695 unsigned TiledLoopDepth, NonPLoopDepth; 696 const clast_stmt *InnerStmt = getScheduleInfo(F, NumIterations, 697 TiledLoopDepth, NonPLoopDepth); 698 const clast_stmt *TmpStmt; 699 const clast_user_stmt *U; 700 const clast_for *InnerFor; 701 if (CLAST_STMT_IS_A(InnerStmt, stmt_for)) { 702 InnerFor = (const clast_for *)InnerStmt; 703 TmpStmt = InnerFor->body; 704 } else 705 TmpStmt = InnerStmt; 706 U = (const clast_user_stmt *) TmpStmt; 707 ScopStmt *Statement = (ScopStmt *) U->statement->usr; 708 for (unsigned i = 0; i < Statement->getNumIterators() - NonPLoopDepth; i++) { 709 const Value* IV = Statement->getInductionVariableForDimension(i); 710 IVS.insert(const_cast<Value *>(IV)); 711 } 712 713 unsigned OutBytes; 714 Values = getGPUValues(OutBytes); 715 PTXGen.setOutputBytes(OutBytes); 716 PTXGen.startGeneration(Values, IVS, VMap, &LoopBody); 717 718 BasicBlock::iterator AfterLoop = Builder.GetInsertPoint(); 719 Builder.SetInsertPoint(LoopBody); 720 721 BasicBlock *AfterBB = 0; 722 if (NonPLoopDepth) { 723 Value *LowerBound, *UpperBound, *IV, *Stride; 724 Type *IntPtrTy = getIntPtrTy(); 725 LowerBound = ExpGen.codegen(InnerFor->LB, IntPtrTy); 726 UpperBound = ExpGen.codegen(InnerFor->UB, IntPtrTy); 727 Stride = Builder.getInt(APInt_from_MPZ(InnerFor->stride)); 728 IV = createLoop(LowerBound, UpperBound, Stride, Builder, P, AfterBB); 729 const Value *OldIV_ = Statement->getInductionVariableForDimension(2); 730 Value *OldIV = const_cast<Value *>(OldIV_); 731 VMap.insert(std::make_pair<Value*, Value*>(OldIV, IV)); 732 } 733 734 // Preserve the current values. 735 const ValueMapT ValueMapCopy = ValueMap; 736 const CharMapT ClastVarsCopy = ClastVars; 737 updateWithVMap(VMap); 738 739 BlockGenerator::generate(Builder, *Statement, ValueMap, P); 740 741 // Restore the original values. 742 ValueMap = ValueMapCopy; 743 ClastVars = ClastVarsCopy; 744 745 if (AfterBB) 746 Builder.SetInsertPoint(AfterBB->begin()); 747 748 // FIXME: The replacement of the host base address with the parameter of ptx 749 // subfunction should have been done by updateWithValueMap. We use the 750 // following codes to avoid affecting other parts of Polly. This should be 751 // fixed later. 752 Function *FN = Builder.GetInsertBlock()->getParent(); 753 for (unsigned j = 0; j < Values.size(); j++) { 754 Value *baseAddr = Values[j]; 755 for (Function::iterator B = FN->begin(); B != FN->end(); ++B) { 756 for (BasicBlock::iterator I = B->begin(); I != B->end(); ++I) 757 I->replaceUsesOfWith(baseAddr, ValueMap[baseAddr]); 758 } 759 } 760 Builder.SetInsertPoint(AfterLoop); 761 PTXGen.setLaunchingParameters(NumIterations[0], NumIterations[1], 762 NumIterations[2], NumIterations[3]); 763 PTXGen.finishGeneration(FN); 764 } 765 #endif 766 767 bool ClastStmtCodeGen::isInnermostLoop(const clast_for *f) { 768 const clast_stmt *stmt = f->body; 769 770 while (stmt) { 771 if (!CLAST_STMT_IS_A(stmt, stmt_user)) 772 return false; 773 774 stmt = stmt->next; 775 } 776 777 return true; 778 } 779 780 int ClastStmtCodeGen::getNumberOfIterations(const clast_for *f) { 781 isl_set *loopDomain = isl_set_copy(isl_set_from_cloog_domain(f->domain)); 782 isl_set *tmp = isl_set_copy(loopDomain); 783 784 // Calculate a map similar to the identity map, but with the last input 785 // and output dimension not related. 786 // [i0, i1, i2, i3] -> [i0, i1, i2, o0] 787 isl_space *Space = isl_set_get_space(loopDomain); 788 Space = isl_space_drop_outputs(Space, 789 isl_set_dim(loopDomain, isl_dim_set) - 2, 1); 790 Space = isl_space_map_from_set(Space); 791 isl_map *identity = isl_map_identity(Space); 792 identity = isl_map_add_dims(identity, isl_dim_in, 1); 793 identity = isl_map_add_dims(identity, isl_dim_out, 1); 794 795 isl_map *map = isl_map_from_domain_and_range(tmp, loopDomain); 796 map = isl_map_intersect(map, identity); 797 798 isl_map *lexmax = isl_map_lexmax(isl_map_copy(map)); 799 isl_map *lexmin = isl_map_lexmin(map); 800 isl_map *sub = isl_map_sum(lexmax, isl_map_neg(lexmin)); 801 802 isl_set *elements = isl_map_range(sub); 803 804 if (!isl_set_is_singleton(elements)) { 805 isl_set_free(elements); 806 return -1; 807 } 808 809 isl_point *p = isl_set_sample_point(elements); 810 811 isl_int v; 812 isl_int_init(v); 813 isl_point_get_coordinate(p, isl_dim_set, isl_set_n_dim(loopDomain) - 1, &v); 814 int numberIterations = isl_int_get_si(v); 815 isl_int_clear(v); 816 isl_point_free(p); 817 818 return (numberIterations) / isl_int_get_si(f->stride) + 1; 819 } 820 821 void ClastStmtCodeGen::codegenForVector(const clast_for *F) { 822 DEBUG(dbgs() << "Vectorizing loop '" << F->iterator << "'\n";); 823 int VectorWidth = getNumberOfIterations(F); 824 825 Value *LB = ExpGen.codegen(F->LB, getIntPtrTy()); 826 827 APInt Stride = APInt_from_MPZ(F->stride); 828 IntegerType *LoopIVType = dyn_cast<IntegerType>(LB->getType()); 829 Stride = Stride.zext(LoopIVType->getBitWidth()); 830 Value *StrideValue = ConstantInt::get(LoopIVType, Stride); 831 832 std::vector<Value*> IVS(VectorWidth); 833 IVS[0] = LB; 834 835 for (int i = 1; i < VectorWidth; i++) 836 IVS[i] = Builder.CreateAdd(IVS[i-1], StrideValue, "p_vector_iv"); 837 838 isl_set *Domain = isl_set_from_cloog_domain(F->domain); 839 840 // Add loop iv to symbols. 841 ClastVars[F->iterator] = LB; 842 843 const clast_stmt *Stmt = F->body; 844 845 while (Stmt) { 846 codegen((const clast_user_stmt *)Stmt, &IVS, F->iterator, 847 isl_set_copy(Domain)); 848 Stmt = Stmt->next; 849 } 850 851 // Loop is finished, so remove its iv from the live symbols. 852 isl_set_free(Domain); 853 ClastVars.erase(F->iterator); 854 } 855 856 857 bool ClastStmtCodeGen::isParallelFor(const clast_for *f) { 858 isl_set *Domain = isl_set_from_cloog_domain(f->domain); 859 assert(Domain && "Cannot access domain of loop"); 860 861 Dependences &D = P->getAnalysis<Dependences>(); 862 863 return D.isParallelDimension(isl_set_copy(Domain), isl_set_n_dim(Domain)); 864 } 865 866 void ClastStmtCodeGen::codegen(const clast_for *f) { 867 bool Vector = PollyVectorizerChoice != VECTORIZER_NONE; 868 if ((Vector || OpenMP) && isParallelFor(f)) { 869 if (Vector && isInnermostLoop(f) && (-1 != getNumberOfIterations(f)) 870 && (getNumberOfIterations(f) <= 16)) { 871 codegenForVector(f); 872 return; 873 } 874 875 if (OpenMP && !parallelCodeGeneration) { 876 parallelCodeGeneration = true; 877 parallelLoops.push_back(f->iterator); 878 codegenForOpenMP(f); 879 parallelCodeGeneration = false; 880 return; 881 } 882 } 883 884 #ifdef GPU_CODEGEN 885 if (GPGPU && isParallelFor(f)) { 886 if (!parallelCodeGeneration) { 887 parallelCodeGeneration = true; 888 parallelLoops.push_back(f->iterator); 889 codegenForGPGPU(f); 890 parallelCodeGeneration = false; 891 return; 892 } 893 } 894 #endif 895 896 codegenForSequential(f); 897 } 898 899 Value *ClastStmtCodeGen::codegen(const clast_equation *eq) { 900 Value *LHS = ExpGen.codegen(eq->LHS, getIntPtrTy()); 901 Value *RHS = ExpGen.codegen(eq->RHS, getIntPtrTy()); 902 CmpInst::Predicate P; 903 904 if (eq->sign == 0) 905 P = ICmpInst::ICMP_EQ; 906 else if (eq->sign > 0) 907 P = ICmpInst::ICMP_SGE; 908 else 909 P = ICmpInst::ICMP_SLE; 910 911 return Builder.CreateICmp(P, LHS, RHS); 912 } 913 914 void ClastStmtCodeGen::codegen(const clast_guard *g) { 915 Function *F = Builder.GetInsertBlock()->getParent(); 916 LLVMContext &Context = F->getContext(); 917 918 BasicBlock *CondBB = SplitBlock(Builder.GetInsertBlock(), 919 Builder.GetInsertPoint(), P); 920 CondBB->setName("polly.cond"); 921 BasicBlock *MergeBB = SplitBlock(CondBB, CondBB->begin(), P); 922 MergeBB->setName("polly.merge"); 923 BasicBlock *ThenBB = BasicBlock::Create(Context, "polly.then", F); 924 925 DominatorTree &DT = P->getAnalysis<DominatorTree>(); 926 DT.addNewBlock(ThenBB, CondBB); 927 DT.changeImmediateDominator(MergeBB, CondBB); 928 929 CondBB->getTerminator()->eraseFromParent(); 930 931 Builder.SetInsertPoint(CondBB); 932 933 Value *Predicate = codegen(&(g->eq[0])); 934 935 for (int i = 1; i < g->n; ++i) { 936 Value *TmpPredicate = codegen(&(g->eq[i])); 937 Predicate = Builder.CreateAnd(Predicate, TmpPredicate); 938 } 939 940 Builder.CreateCondBr(Predicate, ThenBB, MergeBB); 941 Builder.SetInsertPoint(ThenBB); 942 Builder.CreateBr(MergeBB); 943 Builder.SetInsertPoint(ThenBB->begin()); 944 945 codegen(g->then); 946 947 Builder.SetInsertPoint(MergeBB->begin()); 948 } 949 950 void ClastStmtCodeGen::codegen(const clast_stmt *stmt) { 951 if (CLAST_STMT_IS_A(stmt, stmt_root)) 952 assert(false && "No second root statement expected"); 953 else if (CLAST_STMT_IS_A(stmt, stmt_ass)) 954 codegen((const clast_assignment *)stmt); 955 else if (CLAST_STMT_IS_A(stmt, stmt_user)) 956 codegen((const clast_user_stmt *)stmt); 957 else if (CLAST_STMT_IS_A(stmt, stmt_block)) 958 codegen((const clast_block *)stmt); 959 else if (CLAST_STMT_IS_A(stmt, stmt_for)) 960 codegen((const clast_for *)stmt); 961 else if (CLAST_STMT_IS_A(stmt, stmt_guard)) 962 codegen((const clast_guard *)stmt); 963 964 if (stmt->next) 965 codegen(stmt->next); 966 } 967 968 void ClastStmtCodeGen::addParameters(const CloogNames *names) { 969 SCEVExpander Rewriter(P->getAnalysis<ScalarEvolution>(), "polly"); 970 971 int i = 0; 972 for (Scop::param_iterator PI = S->param_begin(), PE = S->param_end(); 973 PI != PE; ++PI) { 974 assert(i < names->nb_parameters && "Not enough parameter names"); 975 976 const SCEV *Param = *PI; 977 Type *Ty = Param->getType(); 978 979 Instruction *insertLocation = --(Builder.GetInsertBlock()->end()); 980 Value *V = Rewriter.expandCodeFor(Param, Ty, insertLocation); 981 ClastVars[names->parameters[i]] = V; 982 983 ++i; 984 } 985 } 986 987 void ClastStmtCodeGen::codegen(const clast_root *r) { 988 addParameters(r->names); 989 990 parallelCodeGeneration = false; 991 992 const clast_stmt *stmt = (const clast_stmt*) r; 993 if (stmt->next) 994 codegen(stmt->next); 995 } 996 997 ClastStmtCodeGen::ClastStmtCodeGen(Scop *scop, IRBuilder<> &B, Pass *P) : 998 S(scop), P(P), Builder(B), ExpGen(Builder, ClastVars) {} 999 1000 namespace { 1001 class CodeGeneration : public ScopPass { 1002 std::vector<std::string> ParallelLoops; 1003 1004 public: 1005 static char ID; 1006 1007 CodeGeneration() : ScopPass(ID) {} 1008 1009 1010 bool runOnScop(Scop &S) { 1011 ParallelLoops.clear(); 1012 1013 assert(S.getRegion().isSimple() && "Only simple regions are supported"); 1014 1015 BasicBlock *StartBlock = executeScopConditionally(S, this); 1016 1017 IRBuilder<> Builder(StartBlock->begin()); 1018 1019 ClastStmtCodeGen CodeGen(&S, Builder, this); 1020 CloogInfo &C = getAnalysis<CloogInfo>(); 1021 CodeGen.codegen(C.getClast()); 1022 1023 ParallelLoops.insert(ParallelLoops.begin(), 1024 CodeGen.getParallelLoops().begin(), 1025 CodeGen.getParallelLoops().end()); 1026 return true; 1027 } 1028 1029 virtual void printScop(raw_ostream &OS) const { 1030 for (std::vector<std::string>::const_iterator PI = ParallelLoops.begin(), 1031 PE = ParallelLoops.end(); PI != PE; ++PI) 1032 OS << "Parallel loop with iterator '" << *PI << "' generated\n"; 1033 } 1034 1035 virtual void getAnalysisUsage(AnalysisUsage &AU) const { 1036 AU.addRequired<CloogInfo>(); 1037 AU.addRequired<Dependences>(); 1038 AU.addRequired<DominatorTree>(); 1039 AU.addRequired<RegionInfo>(); 1040 AU.addRequired<ScalarEvolution>(); 1041 AU.addRequired<ScopDetection>(); 1042 AU.addRequired<ScopInfo>(); 1043 AU.addRequired<DataLayout>(); 1044 1045 AU.addPreserved<CloogInfo>(); 1046 AU.addPreserved<Dependences>(); 1047 1048 // FIXME: We do not create LoopInfo for the newly generated loops. 1049 AU.addPreserved<LoopInfo>(); 1050 AU.addPreserved<DominatorTree>(); 1051 AU.addPreserved<ScopDetection>(); 1052 AU.addPreserved<ScalarEvolution>(); 1053 1054 // FIXME: We do not yet add regions for the newly generated code to the 1055 // region tree. 1056 AU.addPreserved<RegionInfo>(); 1057 AU.addPreserved<TempScopInfo>(); 1058 AU.addPreserved<ScopInfo>(); 1059 AU.addPreservedID(IndependentBlocksID); 1060 } 1061 }; 1062 } 1063 1064 char CodeGeneration::ID = 1; 1065 1066 INITIALIZE_PASS_BEGIN(CodeGeneration, "polly-codegen", 1067 "Polly - Create LLVM-IR from SCoPs", false, false) 1068 INITIALIZE_PASS_DEPENDENCY(CloogInfo) 1069 INITIALIZE_PASS_DEPENDENCY(Dependences) 1070 INITIALIZE_PASS_DEPENDENCY(DominatorTree) 1071 INITIALIZE_PASS_DEPENDENCY(RegionInfo) 1072 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) 1073 INITIALIZE_PASS_DEPENDENCY(ScopDetection) 1074 INITIALIZE_PASS_DEPENDENCY(DataLayout) 1075 INITIALIZE_PASS_END(CodeGeneration, "polly-codegen", 1076 "Polly - Create LLVM-IR from SCoPs", false, false) 1077 1078 Pass *polly::createCodeGenerationPass() { 1079 return new CodeGeneration(); 1080 } 1081 1082 #endif // CLOOG_FOUND 1083