//===------ CodeGeneration.cpp - Code generate the Scops. -----------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// The CodeGeneration pass takes a Scop created by ScopInfo and translates it
// back to LLVM-IR using Cloog.
//
// The Scop describes the high level memory behaviour of a control flow region.
// Transformation passes can update the schedule (execution order) of statements
// in the Scop. Cloog is used to generate an abstract syntax tree (clast) that
// reflects the updated execution order. This clast is used to create new
// LLVM-IR that is computationally equivalent to the original control flow
// region, but executes its code in the new execution order defined by the
// changed scattering.
//
//===----------------------------------------------------------------------===//

#include "polly/CodeGen/Cloog.h"
#ifdef CLOOG_FOUND

#define DEBUG_TYPE "polly-codegen"
#include "polly/Dependences.h"
#include "polly/LinkAllPasses.h"
#include "polly/ScopInfo.h"
#include "polly/TempScopInfo.h"
#include "polly/CodeGen/CodeGeneration.h"
#include "polly/CodeGen/BlockGenerators.h"
#include "polly/CodeGen/LoopGenerators.h"
#include "polly/CodeGen/PTXGenerator.h"
#include "polly/CodeGen/Utils.h"
#include "polly/Support/GICHelper.h"
#include "polly/Support/ScopHelper.h"

#include "llvm/IR/Module.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

#define CLOOG_INT_GMP 1
#include "cloog/cloog.h"
#include "cloog/isl/cloog.h"

#include "isl/aff.h"

#include <vector>
#include <utility>

using namespace polly;
using namespace llvm;

struct isl_set;

namespace polly {
static cl::opt<bool>
    OpenMP("enable-polly-openmp", cl::desc("Generate OpenMP parallel code"),
           cl::Hidden, cl::value_desc("OpenMP code generation enabled if true"),
           cl::init(false), cl::ZeroOrMore);

#ifdef GPU_CODEGEN
static cl::opt<bool>
    GPGPU("enable-polly-gpgpu", cl::desc("Generate GPU parallel code"),
          cl::Hidden, cl::value_desc("GPGPU code generation enabled if true"),
          cl::init(false), cl::ZeroOrMore);

static cl::opt<std::string> GPUTriple(
    "polly-gpgpu-triple", cl::desc("Target triple for GPU code generation"),
    cl::Hidden, cl::init(""));
#endif /* GPU_CODEGEN */

// Maps the textual name of a clast variable (induction variable, parameter,
// ...) to the LLVM Value currently representing it.
typedef DenseMap<const char *, Value *> CharMapT;

/// Class to generate LLVM-IR that calculates the value of a clast_expr.
class ClastExpCodeGen {
  IRBuilder<> &Builder;
  const CharMapT &IVS;

  Value *codegen(const clast_name *e, Type *Ty);
  Value *codegen(const clast_term *e, Type *Ty);
  Value *codegen(const clast_binary *e, Type *Ty);
  Value *codegen(const clast_reduction *r, Type *Ty);

public:
  // A generator for clast expressions.
  //
  // @param B The IRBuilder that defines where the code to calculate the
  //          clast expressions should be inserted.
  // @param IVMAP A Map that translates strings describing the induction
  //              variables to the Values* that represent these variables
  //              on the LLVM side.
  ClastExpCodeGen(IRBuilder<> &B, CharMapT &IVMap);

  // Generates code to calculate a given clast expression.
  //
  // @param e The expression to calculate.
  // @return The Value that holds the result.
  Value *codegen(const clast_expr *e, Type *Ty);
};

// Look up a clast variable by name and cast it to the requested type. The
// name must already have been registered in the IVS map.
Value *ClastExpCodeGen::codegen(const clast_name *e, Type *Ty) {
  CharMapT::const_iterator I = IVS.find(e->name);

  assert(I != IVS.end() && "Clast name not found");

  return Builder.CreateSExtOrBitCast(I->second, Ty);
}

// Generate code for a clast term 'val * var'. If no variable is attached,
// the term is just the constant 'val'.
Value *ClastExpCodeGen::codegen(const clast_term *e, Type *Ty) {
  APInt a = APInt_from_MPZ(e->val);

  Value *ConstOne = ConstantInt::get(Builder.getContext(), a);
  ConstOne = Builder.CreateSExtOrBitCast(ConstOne, Ty);

  if (!e->var)
    return ConstOne;

  Value *var = codegen(e->var, Ty);
  return Builder.CreateMul(ConstOne, var);
}

// Generate code for a binary clast operation (mod, floor-div, ceil-div or
// exact div) with a constant right-hand side.
Value *ClastExpCodeGen::codegen(const clast_binary *e, Type *Ty) {
  Value *LHS = codegen(e->LHS, Ty);

  APInt RHS_AP = APInt_from_MPZ(e->RHS);

  Value *RHS = ConstantInt::get(Builder.getContext(), RHS_AP);
  RHS = Builder.CreateSExtOrBitCast(RHS, Ty);

  switch (e->type) {
  case clast_bin_mod:
    return Builder.CreateSRem(LHS, RHS);
  case clast_bin_fdiv: {
    // floord(n,d) ((n < 0) ? (n - d + 1) : n) / d
    Value *One = ConstantInt::get(Ty, 1);
    Value *Zero = ConstantInt::get(Ty, 0);
    Value *Sum1 = Builder.CreateSub(LHS, RHS);
    Value *Sum2 = Builder.CreateAdd(Sum1, One);
    Value *isNegative = Builder.CreateICmpSLT(LHS, Zero);
    Value *Dividend = Builder.CreateSelect(isNegative, Sum2, LHS);
    return Builder.CreateSDiv(Dividend, RHS);
  }
  case clast_bin_cdiv: {
    // ceild(n,d) ((n < 0) ? n : (n + d - 1)) / d
    Value *One = ConstantInt::get(Ty, 1);
    Value *Zero = ConstantInt::get(Ty, 0);
    Value *Sum1 = Builder.CreateAdd(LHS, RHS);
    Value *Sum2 = Builder.CreateSub(Sum1, One);
    Value *isNegative = Builder.CreateICmpSLT(LHS, Zero);
    Value *Dividend = Builder.CreateSelect(isNegative, LHS, Sum2);
    return Builder.CreateSDiv(Dividend, RHS);
  }
  case clast_bin_div:
    return Builder.CreateSDiv(LHS, RHS);
  }

  llvm_unreachable("Unknown clast binary expression type");
}

// Generate code for an n-ary min/max/sum reduction by folding the operands
// left to right.
Value *ClastExpCodeGen::codegen(const clast_reduction *r, Type *Ty) {
  assert((r->type == clast_red_min || r->type == clast_red_max ||
          r->type == clast_red_sum) && "Clast reduction type not supported");
  Value *old = codegen(r->elts[0], Ty);

  for (int i = 1; i < r->n; ++i) {
    Value *exprValue = codegen(r->elts[i], Ty);

    switch (r->type) {
    case clast_red_min: {
      Value *cmp = Builder.CreateICmpSLT(old, exprValue);
      old = Builder.CreateSelect(cmp, old, exprValue);
      break;
    }
    case clast_red_max: {
      Value *cmp = Builder.CreateICmpSGT(old, exprValue);
      old = Builder.CreateSelect(cmp, old, exprValue);
      break;
    }
    case clast_red_sum:
      old = Builder.CreateAdd(old, exprValue);
      break;
    }
  }

  return old;
}

ClastExpCodeGen::ClastExpCodeGen(IRBuilder<> &B, CharMapT &IVMap)
    : Builder(B), IVS(IVMap) {}

// Dispatch on the dynamic clast expression type to the matching overload.
Value *ClastExpCodeGen::codegen(const clast_expr *e, Type *Ty) {
  switch (e->type) {
  case clast_expr_name:
    return codegen((const clast_name *)e, Ty);
  case clast_expr_term:
    return codegen((const clast_term *)e, Ty);
  case clast_expr_bin:
    return codegen((const clast_binary *)e, Ty);
  case clast_expr_red:
    return codegen((const clast_reduction *)e, Ty);
  }

  llvm_unreachable("Unknown clast expression!");
}

// Walks a clast AST and emits the corresponding LLVM-IR at the location of
// the IRBuilder, recording which generated loops are parallel on the way.
class ClastStmtCodeGen {
public:
  const std::vector<std::string> &getParallelLoops();

private:
  // The Scop we code generate.
  Scop *S;
  Pass *P;

  // The Builder specifies the current location to code generate at.
  IRBuilder<> &Builder;

  // Map the Values from the old code to their counterparts in the new code.
  ValueMapT ValueMap;

  // Map the loops from the old code to expressions that are functions of the
  // induction variables in the new code. For example, when the code generator
  // produces this AST:
  //
  //   for (int c1 = 0; c1 <= 1023; c1 += 1)
  //     for (int c2 = 0; c2 <= 1023; c2 += 1)
  //       Stmt(c2 + 3, c1);
  //
  // LoopToScev is a map associating:
  //   "outer loop in the old loop nest" -> SCEV("c2 + 3"),
  //   "inner loop in the old loop nest" -> SCEV("c1").
  LoopToScevMapT LoopToScev;

  // clastVars maps from the textual representation of a clast variable to its
  // current *Value. clast variables are scheduling variables, original
  // induction variables or parameters. They are used either in loop bounds or
  // to define the statement instance that is executed.
  //
  //   for (s = 0; s < n + 3; ++s)
  //     for (t = s; t < m; ++t)
  //       Stmt(i = s + 3 * m, j = t);
  //
  // {s,t,i,j,n,m} is the set of clast variables in this clast.
  CharMapT ClastVars;

  // Code generator for clast expressions.
  ClastExpCodeGen ExpGen;

  // Do we currently generate parallel code?
  bool parallelCodeGeneration;

  // Names of the loops that were emitted as parallel (OpenMP/GPGPU) loops.
  std::vector<std::string> parallelLoops;

  void codegen(const clast_assignment *a);

  void
  codegen(const clast_assignment *a, ScopStmt *Statement, unsigned Dimension,
          int vectorDim, std::vector<ValueMapT> *VectorVMap = 0,
          std::vector<LoopToScevMapT> *VLTS = 0);

  void codegenSubstitutions(const clast_stmt *Assignment, ScopStmt *Statement,
                            int vectorDim = 0,
                            std::vector<ValueMapT> *VectorVMap = 0,
                            std::vector<LoopToScevMapT> *VLTS = 0);

  void codegen(const clast_user_stmt *u, std::vector<Value *> *IVS = NULL,
               const char *iterator = NULL, isl_set *scatteringDomain = 0);

  void codegen(const clast_block *b);

  /// @brief Create a classical sequential loop.
  void codegenForSequential(const clast_for *f);

  /// @brief Create OpenMP structure values.
  ///
  /// Create a list of values that has to be stored into the OpenMP subfunction
  /// structure.
  SetVector<Value *> getOMPValues(const clast_stmt *Body);

  /// @brief Update ClastVars and ValueMap according to a value map.
  ///
  /// @param VMap A map from old to new values.
  void updateWithValueMap(OMPGenerator::ValueToValueMapTy &VMap);

  /// @brief Create an OpenMP parallel for loop.
  ///
  /// This loop reflects a loop as if it would have been created by an OpenMP
  /// statement.
  void codegenForOpenMP(const clast_for *f);

#ifdef GPU_CODEGEN
  /// @brief Create GPGPU device memory access values.
  ///
  /// Create a list of values that will be set to be parameters of the GPGPU
  /// subfunction. These parameters represent device memory base addresses
  /// and the size in bytes.
  SetVector<Value *> getGPUValues(unsigned &OutputBytes);

  /// @brief Create a GPU parallel for loop.
  ///
  /// This loop reflects a loop as if it would have been created by a GPU
  /// statement.
  void codegenForGPGPU(const clast_for *F);

  /// @brief Get innermost for loop.
  const clast_stmt *
  getScheduleInfo(const clast_for *F, std::vector<int> &NumIters,
                  unsigned &LoopDepth, unsigned &NonPLoopDepth);
#endif /* GPU_CODEGEN */

  /// @brief Check if a loop is parallel
  ///
  /// Detect if a clast_for loop can be executed in parallel.
  ///
  /// @param For The clast for loop to check.
  ///
  /// @return bool Returns true if the incoming clast_for statement can
  ///              execute in parallel.
  bool isParallelFor(const clast_for *For);

  bool isInnermostLoop(const clast_for *f);

  /// @brief Get the number of loop iterations for this loop.
  /// @param f The clast for loop to check.
  int getNumberOfIterations(const clast_for *f);

  /// @brief Create vector instructions for this loop.
  void codegenForVector(const clast_for *f);

  void codegen(const clast_for *f);

  Value *codegen(const clast_equation *eq);

  void codegen(const clast_guard *g);

  void codegen(const clast_stmt *stmt);

  void addParameters(const CloogNames *names);

  IntegerType *getIntPtrTy();

public:
  void codegen(const clast_root *r);

  ClastStmtCodeGen(Scop *scop, IRBuilder<> &B, Pass *P);
};
}

// The pointer-sized integer type of the target, used for loop bounds and
// clast expression values.
IntegerType *ClastStmtCodeGen::getIntPtrTy() {
  return P->getAnalysis<DataLayout>().getIntPtrType(Builder.getContext());
}

const std::vector<std::string> &ClastStmtCodeGen::getParallelLoops() {
  return parallelLoops;
}

// Evaluate a top-level clast assignment and bind the result to its variable
// name so later expressions can reference it.
void ClastStmtCodeGen::codegen(const clast_assignment *a) {
  Value *V = ExpGen.codegen(a->RHS, getIntPtrTy());
  ClastVars[a->LHS] = V;
}

// Evaluate one substitution of a statement (the expression that defines the
// value of an original iterator in dimension 'Dim') and record it in the
// scalar and, when vectorizing, per-lane value maps.
void ClastStmtCodeGen::codegen(
    const clast_assignment *A, ScopStmt *Stmt, unsigned Dim, int VectorDim,
    std::vector<ValueMapT> *VectorVMap, std::vector<LoopToScevMapT> *VLTS) {
  Value *RHS;

  assert(!A->LHS && "Statement assignments do not have left hand side");

  RHS = ExpGen.codegen(A->RHS, Builder.getInt64Ty());

  const llvm::SCEV *URHS = S->getSE()->getUnknown(RHS);
  if (VLTS)
    (*VLTS)[VectorDim][Stmt->getLoopForDimension(Dim)] = URHS;
  LoopToScev[Stmt->getLoopForDimension(Dim)] = URHS;

  const PHINode *PN = Stmt->getInductionVariableForDimension(Dim);
  if (PN) {
    // Match the bit width of the original induction variable.
    RHS = Builder.CreateTruncOrBitCast(RHS, PN->getType());

    if (VectorVMap)
      (*VectorVMap)[VectorDim][PN] = RHS;

    ValueMap[PN] = RHS;
  }
}

// Walk the substitution list of a user statement and code generate each
// entry; the list position gives the statement dimension.
void ClastStmtCodeGen::codegenSubstitutions(
    const clast_stmt *Assignment, ScopStmt *Statement, int vectorDim,
    std::vector<ValueMapT> *VectorVMap, std::vector<LoopToScevMapT> *VLTS) {
  int Dimension = 0;

  while (Assignment) {
    assert(CLAST_STMT_IS_A(Assignment, stmt_ass) &&
           "Substitions are expected to be assignments");
    codegen((const clast_assignment *)Assignment, Statement, Dimension,
            vectorDim, VectorVMap, VLTS);
    Assignment = Assignment->next;
    Dimension++;
  }
}

// Takes the cloog specific domain and translates it into a map Statement ->
// PartialSchedule, where the PartialSchedule contains all the dimensions that
// have been code generated up to this point.
static __isl_give isl_map *
extractPartialSchedule(ScopStmt *Statement, isl_set *Domain) {
  isl_map *Schedule = Statement->getScattering();
  int ScheduledDimensions = isl_set_dim(Domain, isl_dim_set);
  int UnscheduledDimensions =
      isl_map_dim(Schedule, isl_dim_out) - ScheduledDimensions;

  // Drop the not-yet-generated output dimensions, keeping only the schedule
  // prefix that corresponds to the dimensions of 'Domain'.
  return isl_map_project_out(Schedule, isl_dim_out, ScheduledDimensions,
                             UnscheduledDimensions);
}

// Generate code for a statement instance. With a single vector dimension the
// scalar BlockGenerator is used; otherwise one set of substitutions is
// evaluated per vector lane and the VectorBlockGenerator is invoked.
void
ClastStmtCodeGen::codegen(const clast_user_stmt *u, std::vector<Value *> *IVS,
                          const char *iterator, isl_set *Domain) {
  ScopStmt *Statement = (ScopStmt *)u->statement->usr;

  if (u->substitutions)
    codegenSubstitutions(u->substitutions, Statement);

  int VectorDimensions = IVS ? IVS->size() : 1;

  if (VectorDimensions == 1) {
    BlockGenerator::generate(Builder, *Statement, ValueMap, LoopToScev, P);
    return;
  }

  VectorValueMapT VectorMap(VectorDimensions);
  std::vector<LoopToScevMapT> VLTS(VectorDimensions);

  if (IVS) {
    assert(u->substitutions && "Substitutions expected!");
    int i = 0;
    // Re-evaluate the substitutions once per vector lane, with the iterator
    // bound to that lane's induction value.
    for (std::vector<Value *>::iterator II = IVS->begin(), IE = IVS->end();
         II != IE; ++II) {
      ClastVars[iterator] = *II;
      codegenSubstitutions(u->substitutions, Statement, i, &VectorMap, &VLTS);
      i++;
    }
  }

  isl_map *Schedule = extractPartialSchedule(Statement, Domain);
  VectorBlockGenerator::generate(Builder, *Statement, VectorMap, VLTS, Schedule,
                                 P);
  isl_map_free(Schedule);
}

void ClastStmtCodeGen::codegen(const clast_block *b) {
  if (b->body)
    codegen(b->body);
}

// Emit a sequential loop: compute bounds and stride, create the loop
// skeleton, generate the body with the new IV bound, and continue emitting
// after the loop exit.
void ClastStmtCodeGen::codegenForSequential(const clast_for *f) {
  Value *LowerBound, *UpperBound, *IV, *Stride;
  BasicBlock *AfterBB;
  Type *IntPtrTy = getIntPtrTy();

  LowerBound = ExpGen.codegen(f->LB, IntPtrTy);
  UpperBound = ExpGen.codegen(f->UB, IntPtrTy);
  Stride = Builder.getInt(APInt_from_MPZ(f->stride));

  IV = createLoop(LowerBound, UpperBound, Stride, Builder, P, AfterBB,
                  CmpInst::ICMP_SLE);

  // Add loop iv to symbols.
  ClastVars[f->iterator] = IV;

  if (f->body)
    codegen(f->body);

  // Loop is finished, so remove its iv from the live symbols.
  ClastVars.erase(f->iterator);
  Builder.SetInsertPoint(AfterBB->begin());
}

// Helper class to determine all scalar parameters used in the basic blocks of a
// clast. Scalar parameters are scalar variables defined outside of the SCoP.
class ParameterVisitor : public ClastVisitor {
  std::set<Value *> Values;

public:
  ParameterVisitor() : ClastVisitor(), Values() {}

  void visitUser(const clast_user_stmt *Stmt) {
    const ScopStmt *S = static_cast<const ScopStmt *>(Stmt->statement->usr);
    const BasicBlock *BB = S->getBasicBlock();

    // Check all the operands of instructions in the basic block.
    for (BasicBlock::const_iterator BI = BB->begin(), BE = BB->end(); BI != BE;
         ++BI) {
      const Instruction &Inst = *BI;
      for (Instruction::const_op_iterator II = Inst.op_begin(),
                                          IE = Inst.op_end();
           II != IE; ++II) {
        Value *SrcVal = *II;

        // Operands defined inside the SCoP region are not parameters.
        if (Instruction *OpInst = dyn_cast<Instruction>(SrcVal))
          if (S->getParent()->getRegion().contains(OpInst))
            continue;

        if (isa<Instruction>(SrcVal) || isa<Argument>(SrcVal))
          Values.insert(SrcVal);
      }
    }
  }

  // Iterator to iterate over the values found.
  typedef std::set<Value *>::const_iterator const_iterator;
  inline const_iterator begin() const { return Values.begin(); }
  inline const_iterator end() const { return Values.end(); }
};

// Collect every value that must be copied into the OpenMP subfunction: the
// live clast variables plus all out-of-SCoP scalars referenced by the body.
SetVector<Value *> ClastStmtCodeGen::getOMPValues(const clast_stmt *Body) {
  SetVector<Value *> Values;

  // The clast variables
  for (CharMapT::iterator I = ClastVars.begin(), E = ClastVars.end(); I != E;
       I++)
    Values.insert(I->second);

  // Find the temporaries that are referenced in the clast statements'
  // basic blocks but are not defined by these blocks (e.g., references
  // to function arguments or temporaries defined before the start of
  // the SCoP).
  ParameterVisitor Params;
  Params.visit(Body);

  for (ParameterVisitor::const_iterator PI = Params.begin(), PE = Params.end();
       PI != PE; ++PI) {
    Value *V = *PI;
    Values.insert(V);
    DEBUG(dbgs() << "Adding temporary for OMP copy-in: " << *V << "\n");
  }

  return Values;
}

// Rewrite ClastVars through VMap and merge the remaining VMap entries into
// ValueMap, so code generated inside the subfunction uses the copied-in
// values instead of the host-side originals.
void
ClastStmtCodeGen::updateWithValueMap(OMPGenerator::ValueToValueMapTy &VMap) {
  std::set<Value *> Inserted;

  for (CharMapT::iterator I = ClastVars.begin(), E = ClastVars.end(); I != E;
       I++) {
    ClastVars[I->first] = VMap[I->second];
    Inserted.insert(I->second);
  }

  for (OMPGenerator::ValueToValueMapTy::iterator I = VMap.begin(),
                                                 E = VMap.end();
       I != E; ++I) {
    if (Inserted.count(I->first))
      continue;

    ValueMap[I->first] = I->second;
  }
}

// Remove every block dominated by F's entry from the dominator tree, in
// post order so children are erased before their parents.
static void clearDomtree(Function *F, DominatorTree &DT) {
  DomTreeNode *N = DT.getNode(&F->getEntryBlock());
  std::vector<BasicBlock *> Nodes;
  for (po_iterator<DomTreeNode *> I = po_begin(N), E = po_end(N); I != E; ++I)
    Nodes.push_back(I->getBlock());

  for (std::vector<BasicBlock *>::iterator I = Nodes.begin(), E = Nodes.end();
       I != E; ++I)
    DT.eraseNode(*I);
}

// Emit an OpenMP-parallel loop: outline the body into a subfunction via
// OMPGenerator, regenerate the body with values remapped to the copied-in
// ones, then restore the host-side maps for code emitted after the loop.
void ClastStmtCodeGen::codegenForOpenMP(const clast_for *For) {
  Value *Stride, *LB, *UB, *IV;
  BasicBlock::iterator LoopBody;
  IntegerType *IntPtrTy = getIntPtrTy();
  SetVector<Value *> Values;
  OMPGenerator::ValueToValueMapTy VMap;
  OMPGenerator OMPGen(Builder, P);

  Stride = Builder.getInt(APInt_from_MPZ(For->stride));
  Stride = Builder.CreateSExtOrBitCast(Stride, IntPtrTy);
  LB = ExpGen.codegen(For->LB, IntPtrTy);
  UB = ExpGen.codegen(For->UB, IntPtrTy);

  Values = getOMPValues(For->body);

  IV = OMPGen.createParallelLoop(LB, UB, Stride, Values, VMap, &LoopBody);
  BasicBlock::iterator AfterLoop = Builder.GetInsertPoint();
  Builder.SetInsertPoint(LoopBody);

  // Save the current values.
  const ValueMapT ValueMapCopy = ValueMap;
  const CharMapT ClastVarsCopy = ClastVars;

  updateWithValueMap(VMap);
  ClastVars[For->iterator] = IV;

  if (For->body)
    codegen(For->body);

  // Restore the original values.
  ValueMap = ValueMapCopy;
  ClastVars = ClastVarsCopy;

  // The subfunction's CFG invalidates the dominator info computed for it.
  clearDomtree((*LoopBody).getParent()->getParent(),
               P->getAnalysis<DominatorTree>());

  Builder.SetInsertPoint(AfterLoop);
}

#ifdef GPU_CODEGEN
// Total size in bytes of a (possibly nested) array type, computed from the
// element count and the primitive size of the innermost element type.
static unsigned getArraySizeInBytes(const ArrayType *AT) {
  unsigned Bytes = AT->getNumElements();
  if (const ArrayType *T = dyn_cast<ArrayType>(AT->getElementType()))
    Bytes *= getArraySizeInBytes(T);
  else
    Bytes *= AT->getElementType()->getPrimitiveSizeInBits() / 8;

  return Bytes;
}

SetVector<Value *> ClastStmtCodeGen::getGPUValues(unsigned &OutputBytes) {
  SetVector<Value *> Values;
  OutputBytes = 0;

  // Record the memory reference base addresses.
  for (Scop::iterator SI = S->begin(), SE = S->end(); SI != SE; ++SI) {
    ScopStmt *Stmt = *SI;
    for (SmallVector<MemoryAccess *, 8>::iterator I = Stmt->memacc_begin(),
                                                  E = Stmt->memacc_end();
         I != E; ++I) {
      Value *BaseAddr = const_cast<Value *>((*I)->getBaseAddr());
      Values.insert((BaseAddr));

      // FIXME: we assume that there is one and only one array to be written
      // in a SCoP.
      int NumWrites = 0;
      if ((*I)->isWrite()) {
        ++NumWrites;
        assert(NumWrites <= 1 &&
               "We support at most one array to be written in a SCoP.");
        if (const PointerType *PT =
                dyn_cast<PointerType>(BaseAddr->getType())) {
          Type *T = PT->getArrayElementType();
          const ArrayType *ATy = dyn_cast<ArrayType>(T);
          OutputBytes = getArraySizeInBytes(ATy);
        }
      }
    }
  }

  return Values;
}

// Walk the loop nest below F, collecting the iteration count of each
// parallel loop and locating the innermost (possibly non-parallel) loop.
// NOTE(review): 'Result' is only assigned inside the parallel-loop branch;
// if no parallel loop is found it is used uninitialized at the return --
// verify the 4-depth-tiling precondition actually guarantees an assignment.
const clast_stmt *ClastStmtCodeGen::getScheduleInfo(
    const clast_for *F, std::vector<int> &NumIters, unsigned &LoopDepth,
    unsigned &NonPLoopDepth) {
  clast_stmt *Stmt = (clast_stmt *)F;
  const clast_for *Result;
  bool NonParaFlag = false;
  LoopDepth = 0;
  NonPLoopDepth = 0;

  while (Stmt) {
    if (CLAST_STMT_IS_A(Stmt, stmt_for)) {
      const clast_for *T = (clast_for *)Stmt;
      if (isParallelFor(T)) {
        if (!NonParaFlag) {
          NumIters.push_back(getNumberOfIterations(T));
          Result = T;
        }
      } else
        NonParaFlag = true;

      Stmt = T->body;
      LoopDepth++;
      continue;
    }
    Stmt = Stmt->next;
  }

  assert(NumIters.size() == 4 &&
         "The loops should be tiled into 4-depth parallel loops and an "
         "innermost non-parallel one (if exist).");
  NonPLoopDepth = LoopDepth - NumIters.size();
  assert(NonPLoopDepth <= 1 &&
         "We support only one innermost non-parallel loop currently.");
  return (const clast_stmt *)Result->body;
}

// Emit a GPGPU kernel for the tiled loop nest rooted at F: outline the body
// through PTXGenerator, map the original IVs to kernel values, and emit an
// inner sequential loop for the non-parallel depth if present.
void ClastStmtCodeGen::codegenForGPGPU(const clast_for *F) {
  BasicBlock::iterator LoopBody;
  SetVector<Value *> Values;
  SetVector<Value *> IVS;
  std::vector<int> NumIterations;
  PTXGenerator::ValueToValueMapTy VMap;

  assert(!GPUTriple.empty() &&
         "Target triple should be set properly for GPGPU code generation.");
  PTXGenerator PTXGen(Builder, P, GPUTriple);

  // Get original IVS and ScopStmt
  unsigned TiledLoopDepth, NonPLoopDepth;
  const clast_stmt *InnerStmt =
      getScheduleInfo(F, NumIterations, TiledLoopDepth, NonPLoopDepth);
  const clast_stmt *TmpStmt;
  const clast_user_stmt *U;
  const clast_for *InnerFor;
  if (CLAST_STMT_IS_A(InnerStmt, stmt_for)) {
    InnerFor = (const clast_for *)InnerStmt;
    TmpStmt = InnerFor->body;
  } else
    TmpStmt = InnerStmt;
  U = (const clast_user_stmt *)TmpStmt;
  ScopStmt *Statement = (ScopStmt *)U->statement->usr;
  for (unsigned i = 0; i < Statement->getNumIterators() - NonPLoopDepth; i++) {
    const Value *IV = Statement->getInductionVariableForDimension(i);
    IVS.insert(const_cast<Value *>(IV));
  }

  unsigned OutBytes;
  Values = getGPUValues(OutBytes);
  PTXGen.setOutputBytes(OutBytes);
  PTXGen.startGeneration(Values, IVS, VMap, &LoopBody);

  BasicBlock::iterator AfterLoop = Builder.GetInsertPoint();
  Builder.SetInsertPoint(LoopBody);

  BasicBlock *AfterBB = 0;
  if (NonPLoopDepth) {
    // Emit the innermost non-parallel loop sequentially inside the kernel.
    Value *LowerBound, *UpperBound, *IV, *Stride;
    Type *IntPtrTy = getIntPtrTy();
    LowerBound = ExpGen.codegen(InnerFor->LB, IntPtrTy);
    UpperBound = ExpGen.codegen(InnerFor->UB, IntPtrTy);
    Stride = Builder.getInt(APInt_from_MPZ(InnerFor->stride));
    IV = createLoop(LowerBound, UpperBound, Stride, Builder, P, AfterBB,
                    CmpInst::ICMP_SLE);
    const Value *OldIV_ = Statement->getInductionVariableForDimension(2);
    Value *OldIV = const_cast<Value *>(OldIV_);
    // NOTE(review): explicit template arguments on make_pair with lvalue
    // arguments are ill-formed under C++11 -- confirm against the build mode.
    VMap.insert(std::make_pair<Value *, Value *>(OldIV, IV));
  }

  updateWithValueMap(VMap);

  BlockGenerator::generate(Builder, *Statement, ValueMap, P);

  if (AfterBB)
    Builder.SetInsertPoint(AfterBB->begin());

  // FIXME: The replacement of the host base address with the parameter of ptx
  // subfunction should have been done by updateWithValueMap. We use the
  // following codes to avoid affecting other parts of Polly. This should be
  // fixed later.
  Function *FN = Builder.GetInsertBlock()->getParent();
  for (unsigned j = 0; j < Values.size(); j++) {
    Value *baseAddr = Values[j];
    for (Function::iterator B = FN->begin(); B != FN->end(); ++B) {
      for (BasicBlock::iterator I = B->begin(); I != B->end(); ++I)
        I->replaceUsesOfWith(baseAddr, ValueMap[baseAddr]);
    }
  }
  Builder.SetInsertPoint(AfterLoop);
  PTXGen.setLaunchingParameters(NumIterations[0], NumIterations[1],
                                NumIterations[2], NumIterations[3]);
  PTXGen.finishGeneration(FN);
}
#endif

// A loop is innermost when its body consists only of user statements.
bool ClastStmtCodeGen::isInnermostLoop(const clast_for *f) {
  const clast_stmt *stmt = f->body;

  while (stmt) {
    if (!CLAST_STMT_IS_A(stmt, stmt_user))
      return false;

    stmt = stmt->next;
  }

  return true;
}

// Number of iterations of For, or -1 if it cannot be determined statically.
int ClastStmtCodeGen::getNumberOfIterations(const clast_for *For) {
  isl_set *LoopDomain = isl_set_copy(isl_set_from_cloog_domain(For->domain));
  int NumberOfIterations = polly::getNumberOfIterations(LoopDomain);
  if (NumberOfIterations == -1)
    return -1;
  return NumberOfIterations / isl_int_get_si(For->stride) + 1;
}

// Emit the loop fully unrolled as straight-line vector code: one induction
// value per lane, then code generate each user statement once across all
// lanes.
void ClastStmtCodeGen::codegenForVector(const clast_for *F) {
  DEBUG(dbgs() << "Vectorizing loop '" << F->iterator << "'\n";);
  int VectorWidth = getNumberOfIterations(F);

  Value *LB = ExpGen.codegen(F->LB, getIntPtrTy());

  APInt Stride = APInt_from_MPZ(F->stride);
  IntegerType *LoopIVType = dyn_cast<IntegerType>(LB->getType());
  Stride = Stride.zext(LoopIVType->getBitWidth());
  Value *StrideValue = ConstantInt::get(LoopIVType, Stride);

  std::vector<Value *> IVS(VectorWidth);
  IVS[0] = LB;

  for (int i = 1; i < VectorWidth; i++)
    IVS[i] = Builder.CreateAdd(IVS[i - 1], StrideValue, "p_vector_iv");

  isl_set *Domain = isl_set_from_cloog_domain(F->domain);

  // Add loop iv to symbols.
  ClastVars[F->iterator] = LB;

  const clast_stmt *Stmt = F->body;

  while (Stmt) {
    codegen((const clast_user_stmt *)Stmt, &IVS, F->iterator,
            isl_set_copy(Domain));
    Stmt = Stmt->next;
  }

  // Loop is finished, so remove its iv from the live symbols.
  isl_set_free(Domain);
  ClastVars.erase(F->iterator);
}

// Ask the Dependences analysis whether the loop's dimension carries no
// dependences and may therefore run in parallel.
bool ClastStmtCodeGen::isParallelFor(const clast_for *f) {
  isl_set *Domain = isl_set_from_cloog_domain(f->domain);
  assert(Domain && "Cannot access domain of loop");

  Dependences &D = P->getAnalysis<Dependences>();

  return D.isParallelDimension(isl_set_copy(Domain), isl_set_n_dim(Domain));
}

// Choose the code generation strategy for a loop: short innermost parallel
// loops are vectorized, the outermost parallel loop may become an OpenMP or
// GPGPU loop, and everything else falls back to a sequential loop.
void ClastStmtCodeGen::codegen(const clast_for *f) {
  bool Vector = PollyVectorizerChoice != VECTORIZER_NONE;
  if ((Vector || OpenMP) && isParallelFor(f)) {
    if (Vector && isInnermostLoop(f) && (-1 != getNumberOfIterations(f)) &&
        (getNumberOfIterations(f) <= 16)) {
      codegenForVector(f);
      return;
    }

    // Only the outermost parallel loop is turned into an OpenMP loop.
    if (OpenMP && !parallelCodeGeneration) {
      parallelCodeGeneration = true;
      parallelLoops.push_back(f->iterator);
      codegenForOpenMP(f);
      parallelCodeGeneration = false;
      return;
    }
  }

#ifdef GPU_CODEGEN
  if (GPGPU && isParallelFor(f)) {
    if (!parallelCodeGeneration) {
      parallelCodeGeneration = true;
      parallelLoops.push_back(f->iterator);
      codegenForGPGPU(f);
      parallelCodeGeneration = false;
      return;
    }
  }
#endif

  codegenForSequential(f);
}

// Translate a clast (in)equality into the corresponding signed icmp.
Value *ClastStmtCodeGen::codegen(const clast_equation *eq) {
  Value *LHS = ExpGen.codegen(eq->LHS, getIntPtrTy());
  Value *RHS = ExpGen.codegen(eq->RHS, getIntPtrTy());
  CmpInst::Predicate P;

  if (eq->sign == 0)
    P = ICmpInst::ICMP_EQ;
  else if (eq->sign > 0)
    P = ICmpInst::ICMP_SGE;
  else
    P = ICmpInst::ICMP_SLE;

  return Builder.CreateICmp(P, LHS, RHS);
}

// Emit an if-guard: split the current block into cond/then/merge, AND all
// guard equations into one predicate and generate the guarded body in the
// then-block.
void ClastStmtCodeGen::codegen(const clast_guard *g) {
  Function *F = Builder.GetInsertBlock()->getParent();
  LLVMContext &Context = F->getContext();

  BasicBlock *CondBB =
      SplitBlock(Builder.GetInsertBlock(), Builder.GetInsertPoint(), P);
  CondBB->setName("polly.cond");
  BasicBlock *MergeBB = SplitBlock(CondBB, CondBB->begin(), P);
  MergeBB->setName("polly.merge");
  BasicBlock *ThenBB = BasicBlock::Create(Context, "polly.then", F);

  DominatorTree &DT = P->getAnalysis<DominatorTree>();
  DT.addNewBlock(ThenBB, CondBB);
  DT.changeImmediateDominator(MergeBB, CondBB);

  CondBB->getTerminator()->eraseFromParent();

  Builder.SetInsertPoint(CondBB);

  Value *Predicate = codegen(&(g->eq[0]));

  for (int i = 1; i < g->n; ++i) {
    Value *TmpPredicate = codegen(&(g->eq[i]));
    Predicate = Builder.CreateAnd(Predicate, TmpPredicate);
  }

  Builder.CreateCondBr(Predicate, ThenBB, MergeBB);
  Builder.SetInsertPoint(ThenBB);
  Builder.CreateBr(MergeBB);
  Builder.SetInsertPoint(ThenBB->begin());

  codegen(g->then);

  Builder.SetInsertPoint(MergeBB->begin());
}

// Dispatch on the dynamic clast statement type, then continue with the next
// statement in the list.
void ClastStmtCodeGen::codegen(const clast_stmt *stmt) {
  if (CLAST_STMT_IS_A(stmt, stmt_root))
    assert(false && "No second root statement expected");
  else if (CLAST_STMT_IS_A(stmt, stmt_ass))
    codegen((const clast_assignment *)stmt);
  else if (CLAST_STMT_IS_A(stmt, stmt_user))
    codegen((const clast_user_stmt *)stmt);
  else if (CLAST_STMT_IS_A(stmt, stmt_block))
    codegen((const clast_block *)stmt);
  else if (CLAST_STMT_IS_A(stmt, stmt_for))
    codegen((const clast_for *)stmt);
  else if (CLAST_STMT_IS_A(stmt, stmt_guard))
    codegen((const clast_guard *)stmt);

  if (stmt->next)
    codegen(stmt->next);
}

// Materialize every Scop parameter with SCEVExpander and bind it to the
// CLooG-assigned parameter name so clast expressions can refer to it.
void ClastStmtCodeGen::addParameters(const CloogNames *names) {
  SCEVExpander Rewriter(P->getAnalysis<ScalarEvolution>(), "polly");

  int i = 0;
  for (Scop::param_iterator PI = S->param_begin(), PE = S->param_end();
       PI != PE; ++PI) {
    assert(i < names->nb_parameters && "Not enough parameter names");

    const SCEV *Param = *PI;
    Type *Ty = Param->getType();

    // Expand right before the current block's terminator.
    Instruction *insertLocation = --(Builder.GetInsertBlock()->end());
    Value *V = Rewriter.expandCodeFor(Param, Ty, insertLocation);
    ClastVars[names->parameters[i]] = V;

    ++i;
  }
}

// Entry point: bind the parameters, then generate the statements following
// the root.
void ClastStmtCodeGen::codegen(const clast_root *r) {
  addParameters(r->names);

  parallelCodeGeneration = false;

  const clast_stmt *stmt = (const clast_stmt *)r;
  if (stmt->next)
    codegen(stmt->next);
}

ClastStmtCodeGen::ClastStmtCodeGen(Scop *scop, IRBuilder<> &B, Pass *P)
    : S(scop), P(P), Builder(B), ExpGen(Builder, ClastVars) {}

namespace {
// The pass driver: conditionally executes the Scop, then code generates the
// clast produced by CloogInfo into the new code path.
class CodeGeneration : public ScopPass {
  std::vector<std::string> ParallelLoops;

public:
  static char ID;

  CodeGeneration() : ScopPass(ID) {}

  bool runOnScop(Scop &S) {
    ParallelLoops.clear();

    assert(!S.getRegion().isTopLevelRegion()
           && "Top level regions are not supported");

    simplifyRegion(&S, this);

    // Guard the generated code with the runtime condition; StartBlock is
    // the branch where the optimized version is emitted.
    BasicBlock *StartBlock = executeScopConditionally(S, this);

    IRBuilder<> Builder(StartBlock->begin());

    ClastStmtCodeGen CodeGen(&S, Builder, this);
    CloogInfo &C = getAnalysis<CloogInfo>();
    CodeGen.codegen(C.getClast());

    ParallelLoops.insert(ParallelLoops.begin(),
                         CodeGen.getParallelLoops().begin(),
                         CodeGen.getParallelLoops().end());
    return true;
  }

  virtual void printScop(raw_ostream &OS) const {
    for (std::vector<std::string>::const_iterator PI = ParallelLoops.begin(),
                                                  PE = ParallelLoops.end();
         PI != PE; ++PI)
      OS << "Parallel loop with iterator '" << *PI << "' generated\n";
  }

  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
    AU.addRequired<CloogInfo>();
    AU.addRequired<Dependences>();
    AU.addRequired<DominatorTree>();
    AU.addRequired<RegionInfo>();
    AU.addRequired<ScalarEvolution>();
    AU.addRequired<ScopDetection>();
    AU.addRequired<ScopInfo>();
    AU.addRequired<DataLayout>();
    AU.addRequired<LoopInfo>();

    AU.addPreserved<CloogInfo>();
    AU.addPreserved<Dependences>();

    // FIXME: We do not create LoopInfo for the newly generated loops.
    AU.addPreserved<LoopInfo>();
    AU.addPreserved<DominatorTree>();
    AU.addPreserved<ScopDetection>();
    AU.addPreserved<ScalarEvolution>();

    // FIXME: We do not yet add regions for the newly generated code to the
    // region tree.
    AU.addPreserved<RegionInfo>();
    AU.addPreserved<TempScopInfo>();
    AU.addPreserved<ScopInfo>();
    AU.addPreservedID(IndependentBlocksID);
  }
};
}

char CodeGeneration::ID = 1;

Pass *polly::createCodeGenerationPass() { return new CodeGeneration(); }

INITIALIZE_PASS_BEGIN(CodeGeneration, "polly-codegen",
                      "Polly - Create LLVM-IR from SCoPs", false, false);
INITIALIZE_PASS_DEPENDENCY(CloogInfo);
INITIALIZE_PASS_DEPENDENCY(Dependences);
INITIALIZE_PASS_DEPENDENCY(DominatorTree);
INITIALIZE_PASS_DEPENDENCY(RegionInfo);
INITIALIZE_PASS_DEPENDENCY(ScalarEvolution);
INITIALIZE_PASS_DEPENDENCY(ScopDetection);
INITIALIZE_PASS_DEPENDENCY(DataLayout);
INITIALIZE_PASS_END(CodeGeneration, "polly-codegen",
                    "Polly - Create LLVM-IR from SCoPs", false, false)

#endif // CLOOG_FOUND