//===------ CodeGeneration.cpp - Code generate the Scops. -----------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// The CodeGeneration pass takes a Scop created by ScopInfo and translates it
// back to LLVM-IR using Cloog.
//
// The Scop describes the high level memory behaviour of a control flow region.
// Transformation passes can update the schedule (execution order) of statements
// in the Scop. Cloog is used to generate an abstract syntax tree (clast) that
// reflects the updated execution order. This clast is used to create new
// LLVM-IR that is computationally equivalent to the original control flow
// region, but executes its code in the new execution order defined by the
// changed scattering.
//
//===----------------------------------------------------------------------===//

#include "polly/CodeGen/Cloog.h"
#ifdef CLOOG_FOUND

#define DEBUG_TYPE "polly-codegen"
#include "polly/Dependences.h"
#include "polly/LinkAllPasses.h"
#include "polly/Options.h"
#include "polly/ScopInfo.h"
#include "polly/TempScopInfo.h"
#include "polly/CodeGen/CodeGeneration.h"
#include "polly/CodeGen/BlockGenerators.h"
#include "polly/CodeGen/LoopGenerators.h"
#include "polly/CodeGen/PTXGenerator.h"
#include "polly/CodeGen/Utils.h"
#include "polly/Support/GICHelper.h"
#include "polly/Support/ScopHelper.h"

#include "llvm/IR/Module.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Support/Debug.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

#define CLOOG_INT_GMP 1
#include "cloog/cloog.h"
#include "cloog/isl/cloog.h"

#include "isl/aff.h"

#include <vector>
#include <utility>

using namespace polly;
using namespace llvm;

struct isl_set;

namespace polly {
static cl::opt<bool>
OpenMP("enable-polly-openmp", cl::desc("Generate OpenMP parallel code"),
       cl::value_desc("OpenMP code generation enabled if true"),
       cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));

#ifdef GPU_CODEGEN
static cl::opt<bool>
GPGPU("enable-polly-gpgpu", cl::desc("Generate GPU parallel code"), cl::Hidden,
      cl::value_desc("GPGPU code generation enabled if true"), cl::init(false),
      cl::ZeroOrMore, cl::cat(PollyCategory));

static cl::opt<std::string>
GPUTriple("polly-gpgpu-triple",
          cl::desc("Target triple for GPU code generation"), cl::Hidden,
          cl::init(""), cl::cat(PollyCategory));
#endif /* GPU_CODEGEN */

typedef DenseMap<const char *, Value *> CharMapT;

/// Class to generate LLVM-IR that calculates the value of a clast_expr.
class ClastExpCodeGen {
  IRBuilder<> &Builder;
  const CharMapT &IVS;

  Value *codegen(const clast_name *e, Type *Ty);
  Value *codegen(const clast_term *e, Type *Ty);
  Value *codegen(const clast_binary *e, Type *Ty);
  Value *codegen(const clast_reduction *r, Type *Ty);

public:
  // A generator for clast expressions.
  //
  // @param B     The IRBuilder that defines where the code to calculate the
  //              clast expressions should be inserted.
  // @param IVMap The map that translates strings describing the induction
  //              variables to the Value*s that represent these variables
  //              on the LLVM side.
  ClastExpCodeGen(IRBuilder<> &B, CharMapT &IVMap);

  // Generates code to calculate a given clast expression.
  //
  // @param e The expression to calculate.
  // @return The Value that holds the result.
  Value *codegen(const clast_expr *e, Type *Ty);
};

Value *ClastExpCodeGen::codegen(const clast_name *e, Type *Ty) {
  CharMapT::const_iterator I = IVS.find(e->name);

  assert(I != IVS.end() && "Clast name not found");

  return Builder.CreateSExtOrBitCast(I->second, Ty);
}

Value *ClastExpCodeGen::codegen(const clast_term *e, Type *Ty) {
  APInt a = APInt_from_MPZ(e->val);

  Value *ConstOne = ConstantInt::get(Builder.getContext(), a);
  ConstOne = Builder.CreateSExtOrBitCast(ConstOne, Ty);

  if (!e->var)
    return ConstOne;

  Value *var = codegen(e->var, Ty);
  return Builder.CreateMul(ConstOne, var);
}

Value *ClastExpCodeGen::codegen(const clast_binary *e, Type *Ty) {
  Value *LHS = codegen(e->LHS, Ty);

  APInt RHS_AP = APInt_from_MPZ(e->RHS);

  Value *RHS = ConstantInt::get(Builder.getContext(), RHS_AP);
  RHS = Builder.CreateSExtOrBitCast(RHS, Ty);

  switch (e->type) {
  case clast_bin_mod:
    return Builder.CreateSRem(LHS, RHS);
  case clast_bin_fdiv: {
    // floord(n, d): ((n < 0) ? (n - d + 1) : n) / d
    Value *One = ConstantInt::get(Ty, 1);
    Value *Zero = ConstantInt::get(Ty, 0);
    Value *Sum1 = Builder.CreateSub(LHS, RHS);
    Value *Sum2 = Builder.CreateAdd(Sum1, One);
    Value *isNegative = Builder.CreateICmpSLT(LHS, Zero);
    Value *Dividend = Builder.CreateSelect(isNegative, Sum2, LHS);
    return Builder.CreateSDiv(Dividend, RHS);
  }
  case clast_bin_cdiv: {
    // ceild(n, d): ((n < 0) ? n : (n + d - 1)) / d
    Value *One = ConstantInt::get(Ty, 1);
    Value *Zero = ConstantInt::get(Ty, 0);
    Value *Sum1 = Builder.CreateAdd(LHS, RHS);
    Value *Sum2 = Builder.CreateSub(Sum1, One);
    Value *isNegative = Builder.CreateICmpSLT(LHS, Zero);
    Value *Dividend = Builder.CreateSelect(isNegative, LHS, Sum2);
    return Builder.CreateSDiv(Dividend, RHS);
  }
  case clast_bin_div:
    return Builder.CreateSDiv(LHS, RHS);
  }

  llvm_unreachable("Unknown clast binary expression type");
}

Value *ClastExpCodeGen::codegen(const clast_reduction *r, Type *Ty) {
  assert((r->type == clast_red_min || r->type == clast_red_max ||
          r->type == clast_red_sum) &&
         "Clast reduction type not supported");
  Value *old = codegen(r->elts[0], Ty);

  for (int i = 1; i < r->n; ++i) {
    Value *exprValue = codegen(r->elts[i], Ty);

    switch (r->type) {
    case clast_red_min: {
      Value *cmp = Builder.CreateICmpSLT(old, exprValue);
      old = Builder.CreateSelect(cmp, old, exprValue);
      break;
    }
    case clast_red_max: {
      Value *cmp = Builder.CreateICmpSGT(old, exprValue);
      old = Builder.CreateSelect(cmp, old, exprValue);
      break;
    }
    case clast_red_sum:
      old = Builder.CreateAdd(old, exprValue);
      break;
    }
  }

  return old;
}

ClastExpCodeGen::ClastExpCodeGen(IRBuilder<> &B, CharMapT &IVMap)
    : Builder(B), IVS(IVMap) {}

Value *ClastExpCodeGen::codegen(const clast_expr *e, Type *Ty) {
  switch (e->type) {
  case clast_expr_name:
    return codegen((const clast_name *)e, Ty);
  case clast_expr_term:
    return codegen((const clast_term *)e, Ty);
  case clast_expr_bin:
    return codegen((const clast_binary *)e, Ty);
  case clast_expr_red:
    return codegen((const clast_reduction *)e, Ty);
  }

  llvm_unreachable("Unknown clast expression!");
}

class ClastStmtCodeGen {
public:
  const std::vector<std::string> &getParallelLoops();

private:
  // The Scop we code generate.
  Scop *S;
  Pass *P;

  // The Builder specifies the current location to code generate at.
  IRBuilder<> &Builder;

  // Map the Values from the old code to their counterparts in the new code.
  ValueMapT ValueMap;

  // Map the loops from the old code to expressions that are functions of the
  // induction variables in the new code. For example, when the code generator
  // produces this AST:
  //
  //   for (int c1 = 0; c1 <= 1023; c1 += 1)
  //     for (int c2 = 0; c2 <= 1023; c2 += 1)
  //       Stmt(c2 + 3, c1);
  //
  // LoopToScev is a map associating:
  //   "outer loop in the old loop nest" -> SCEV("c2 + 3"),
  //   "inner loop in the old loop nest" -> SCEV("c1").
  LoopToScevMapT LoopToScev;

  // ClastVars maps from the textual representation of a clast variable to its
  // current Value*. clast variables are scheduling variables, original
  // induction variables or parameters. They are used either in loop bounds or
  // to define the statement instance that is executed.
  //
  //   for (s = 0; s < n + 3; ++s)
  //     for (t = s; t < m; ++t)
  //       Stmt(i = s + 3 * m, j = t);
  //
  // {s,t,i,j,n,m} is the set of clast variables in this clast.
  CharMapT ClastVars;

  // Code generator for clast expressions.
  ClastExpCodeGen ExpGen;

  // Do we currently generate parallel code?
  bool parallelCodeGeneration;

  std::vector<std::string> parallelLoops;

  void codegen(const clast_assignment *a);

  void codegen(const clast_assignment *a, ScopStmt *Statement,
               unsigned Dimension, int vectorDim,
               std::vector<ValueMapT> *VectorVMap = 0,
               std::vector<LoopToScevMapT> *VLTS = 0);

  void codegenSubstitutions(const clast_stmt *Assignment, ScopStmt *Statement,
                            int vectorDim = 0,
                            std::vector<ValueMapT> *VectorVMap = 0,
                            std::vector<LoopToScevMapT> *VLTS = 0);

  void codegen(const clast_user_stmt *u, std::vector<Value *> *IVS = NULL,
               const char *iterator = NULL,
               __isl_take isl_set *scatteringDomain = 0);

  void codegen(const clast_block *b);

  /// @brief Create a classical sequential loop.
  void codegenForSequential(const clast_for *f);

  /// @brief Create OpenMP structure values.
  ///
  /// Create a list of values that have to be stored into the OpenMP
  /// subfunction structure.
  SetVector<Value *> getOMPValues(const clast_stmt *Body);

  /// @brief Update ClastVars and ValueMap according to a value map.
  ///
  /// @param VMap A map from old to new values.
  void updateWithValueMap(OMPGenerator::ValueToValueMapTy &VMap);

  /// @brief Create an OpenMP parallel for loop.
  ///
  /// This loop reflects a loop as if it would have been created by an OpenMP
  /// statement.
  void codegenForOpenMP(const clast_for *f);

#ifdef GPU_CODEGEN
  /// @brief Create GPGPU device memory access values.
  ///
  /// Create a list of values that will be set to be parameters of the GPGPU
  /// subfunction. These parameters represent device memory base addresses
  /// and the size in bytes.
  SetVector<Value *> getGPUValues(unsigned &OutputBytes);

  /// @brief Create a GPU parallel for loop.
  ///
  /// This loop reflects a loop as if it would have been created by a GPU
  /// statement.
  void codegenForGPGPU(const clast_for *F);

  /// @brief Get the innermost for loop.
  const clast_stmt *getScheduleInfo(const clast_for *F,
                                    std::vector<int> &NumIters,
                                    unsigned &LoopDepth,
                                    unsigned &NonPLoopDepth);
#endif /* GPU_CODEGEN */

  /// @brief Check if a loop is parallel.
  ///
  /// Detect if a clast_for loop can be executed in parallel.
  ///
  /// @param For The clast for loop to check.
  ///
  /// @return Returns true if the incoming clast_for statement can be
  ///         executed in parallel.
  bool isParallelFor(const clast_for *For);

  bool isInnermostLoop(const clast_for *f);

  /// @brief Get the number of loop iterations for this loop.
  /// @param f The clast for loop to check.
  int getNumberOfIterations(const clast_for *f);

  /// @brief Create vector instructions for this loop.
  void codegenForVector(const clast_for *f);

  void codegen(const clast_for *f);

  Value *codegen(const clast_equation *eq);

  void codegen(const clast_guard *g);

  void codegen(const clast_stmt *stmt);

  void addParameters(const CloogNames *names);

  IntegerType *getIntPtrTy();

public:
  void codegen(const clast_root *r);

  ClastStmtCodeGen(Scop *scop, IRBuilder<> &B, Pass *P);
};
}

IntegerType *ClastStmtCodeGen::getIntPtrTy() {
  return P->getAnalysis<DataLayout>().getIntPtrType(Builder.getContext());
}

const std::vector<std::string> &ClastStmtCodeGen::getParallelLoops() {
  return parallelLoops;
}

void ClastStmtCodeGen::codegen(const clast_assignment *a) {
  Value *V = ExpGen.codegen(a->RHS, getIntPtrTy());
  ClastVars[a->LHS] = V;
}

void ClastStmtCodeGen::codegen(const clast_assignment *A, ScopStmt *Stmt,
                               unsigned Dim, int VectorDim,
                               std::vector<ValueMapT> *VectorVMap,
                               std::vector<LoopToScevMapT> *VLTS) {
  Value *RHS;

  assert(!A->LHS && "Statement assignments do not have left hand sides");

  RHS = ExpGen.codegen(A->RHS, Builder.getInt64Ty());

  const llvm::SCEV *URHS = S->getSE()->getUnknown(RHS);
  if (VLTS)
    (*VLTS)[VectorDim][Stmt->getLoopForDimension(Dim)] = URHS;
  LoopToScev[Stmt->getLoopForDimension(Dim)] = URHS;

  const PHINode *PN = Stmt->getInductionVariableForDimension(Dim);
  if (PN) {
    RHS = Builder.CreateTruncOrBitCast(RHS, PN->getType());

    if (VectorVMap)
      (*VectorVMap)[VectorDim][PN] = RHS;

    ValueMap[PN] = RHS;
  }
}

void ClastStmtCodeGen::codegenSubstitutions(const clast_stmt *Assignment,
                                            ScopStmt *Statement, int vectorDim,
                                            std::vector<ValueMapT> *VectorVMap,
                                            std::vector<LoopToScevMapT> *VLTS) {
  int Dimension = 0;

  while (Assignment) {
    assert(CLAST_STMT_IS_A(Assignment, stmt_ass) &&
           "Substitutions are expected to be assignments");
    codegen((const clast_assignment *)Assignment, Statement, Dimension,
            vectorDim, VectorVMap, VLTS);
    Assignment = Assignment->next;
    Dimension++;
  }
}

// Takes the Cloog-specific domain and translates it into a map Statement ->
// PartialSchedule, where the PartialSchedule contains all the dimensions that
// have been code generated up to this point.
static __isl_give isl_map *extractPartialSchedule(ScopStmt *Statement,
                                                  __isl_take isl_set *Domain) {
  isl_map *Schedule = Statement->getScattering();
  int ScheduledDimensions = isl_set_dim(Domain, isl_dim_set);
  int UnscheduledDimensions =
      isl_map_dim(Schedule, isl_dim_out) - ScheduledDimensions;

  isl_set_free(Domain);

  return isl_map_project_out(Schedule, isl_dim_out, ScheduledDimensions,
                             UnscheduledDimensions);
}

void ClastStmtCodeGen::codegen(const clast_user_stmt *u,
                               std::vector<Value *> *IVS, const char *iterator,
                               __isl_take isl_set *Domain) {
  ScopStmt *Statement = (ScopStmt *)u->statement->usr;

  if (u->substitutions)
    codegenSubstitutions(u->substitutions, Statement);

  int VectorDimensions = IVS ? IVS->size() : 1;

  if (VectorDimensions == 1) {
    BlockGenerator::generate(Builder, *Statement, ValueMap, LoopToScev, P);
    return;
  }

  VectorValueMapT VectorMap(VectorDimensions);
  std::vector<LoopToScevMapT> VLTS(VectorDimensions);

  if (IVS) {
    assert(u->substitutions && "Substitutions expected!");
    int i = 0;
    for (std::vector<Value *>::iterator II = IVS->begin(), IE = IVS->end();
         II != IE; ++II) {
      ClastVars[iterator] = *II;
      codegenSubstitutions(u->substitutions, Statement, i, &VectorMap, &VLTS);
      i++;
    }
  }

  isl_map *Schedule = extractPartialSchedule(Statement, Domain);
  VectorBlockGenerator::generate(Builder, *Statement, VectorMap, VLTS, Schedule,
                                 P);
  isl_map_free(Schedule);
}

void ClastStmtCodeGen::codegen(const clast_block *b) {
  if (b->body)
    codegen(b->body);
}

void ClastStmtCodeGen::codegenForSequential(const clast_for *f) {
  Value *LowerBound, *UpperBound, *IV, *Stride;
  BasicBlock *ExitBlock;
  Type *IntPtrTy = getIntPtrTy();

  LowerBound = ExpGen.codegen(f->LB, IntPtrTy);
  UpperBound = ExpGen.codegen(f->UB, IntPtrTy);
  Stride = Builder.getInt(APInt_from_MPZ(f->stride));

  IV = createLoop(LowerBound, UpperBound, Stride, Builder, P, ExitBlock,
                  CmpInst::ICMP_SLE);

  // Add loop iv to symbols.
  ClastVars[f->iterator] = IV;

  if (f->body)
    codegen(f->body);

  // Loop is finished, so remove its iv from the live symbols.
  ClastVars.erase(f->iterator);
  Builder.SetInsertPoint(ExitBlock->begin());
}

// Helper class to determine all scalar parameters used in the basic blocks of
// a clast. Scalar parameters are scalar variables defined outside of the SCoP.
class ParameterVisitor : public ClastVisitor {
  std::set<Value *> Values;

public:
  ParameterVisitor() : ClastVisitor(), Values() {}

  void visitUser(const clast_user_stmt *Stmt) {
    const ScopStmt *S = static_cast<const ScopStmt *>(Stmt->statement->usr);
    const BasicBlock *BB = S->getBasicBlock();

    // Check all the operands of instructions in the basic block.
    for (BasicBlock::const_iterator BI = BB->begin(), BE = BB->end(); BI != BE;
         ++BI) {
      const Instruction &Inst = *BI;
      for (Instruction::const_op_iterator II = Inst.op_begin(),
                                          IE = Inst.op_end();
           II != IE; ++II) {
        Value *SrcVal = *II;

        if (Instruction *OpInst = dyn_cast<Instruction>(SrcVal))
          if (S->getParent()->getRegion().contains(OpInst))
            continue;

        if (isa<Instruction>(SrcVal) || isa<Argument>(SrcVal))
          Values.insert(SrcVal);
      }
    }
  }

  // Iterator to iterate over the values found.
  typedef std::set<Value *>::const_iterator const_iterator;
  inline const_iterator begin() const { return Values.begin(); }
  inline const_iterator end() const { return Values.end(); }
};

SetVector<Value *> ClastStmtCodeGen::getOMPValues(const clast_stmt *Body) {
  SetVector<Value *> Values;

  // The clast variables
  for (CharMapT::iterator I = ClastVars.begin(), E = ClastVars.end(); I != E;
       I++)
    Values.insert(I->second);

  // Find the temporaries that are referenced in the clast statements'
  // basic blocks but are not defined by these blocks (e.g., references
  // to function arguments or temporaries defined before the start of
  // the SCoP).
  ParameterVisitor Params;
  Params.visit(Body);

  for (ParameterVisitor::const_iterator PI = Params.begin(), PE = Params.end();
       PI != PE; ++PI) {
    Value *V = *PI;
    Values.insert(V);
    DEBUG(dbgs() << "Adding temporary for OMP copy-in: " << *V << "\n");
  }

  return Values;
}

void
ClastStmtCodeGen::updateWithValueMap(OMPGenerator::ValueToValueMapTy &VMap) {
  std::set<Value *> Inserted;

  for (CharMapT::iterator I = ClastVars.begin(), E = ClastVars.end(); I != E;
       I++) {
    ClastVars[I->first] = VMap[I->second];
    Inserted.insert(I->second);
  }

  for (OMPGenerator::ValueToValueMapTy::iterator I = VMap.begin(),
                                                 E = VMap.end();
       I != E; ++I) {
    if (Inserted.count(I->first))
      continue;

    ValueMap[I->first] = I->second;
  }
}

static void clearDomtree(Function *F, DominatorTree &DT) {
  DomTreeNode *N = DT.getNode(&F->getEntryBlock());
  std::vector<BasicBlock *> Nodes;
  for (po_iterator<DomTreeNode *> I = po_begin(N), E = po_end(N); I != E; ++I)
    Nodes.push_back(I->getBlock());

  for (std::vector<BasicBlock *>::iterator I = Nodes.begin(), E = Nodes.end();
       I != E; ++I)
    DT.eraseNode(*I);
}

void ClastStmtCodeGen::codegenForOpenMP(const clast_for *For) {
  Value *Stride, *LB, *UB, *IV;
  BasicBlock::iterator LoopBody;
  IntegerType *IntPtrTy = getIntPtrTy();
  SetVector<Value *> Values;
  OMPGenerator::ValueToValueMapTy VMap;
  OMPGenerator OMPGen(Builder, P);

  Stride = Builder.getInt(APInt_from_MPZ(For->stride));
  Stride = Builder.CreateSExtOrBitCast(Stride, IntPtrTy);
  LB = ExpGen.codegen(For->LB, IntPtrTy);
  UB = ExpGen.codegen(For->UB, IntPtrTy);

  Values = getOMPValues(For->body);

  IV = OMPGen.createParallelLoop(LB, UB, Stride, Values, VMap, &LoopBody);
  BasicBlock::iterator AfterLoop = Builder.GetInsertPoint();
  Builder.SetInsertPoint(LoopBody);

  // Save the current values.
  const ValueMapT ValueMapCopy = ValueMap;
  const CharMapT ClastVarsCopy = ClastVars;

  updateWithValueMap(VMap);
  ClastVars[For->iterator] = IV;

  if (For->body)
    codegen(For->body);

  // Restore the original values.
  ValueMap = ValueMapCopy;
  ClastVars = ClastVarsCopy;

  clearDomtree((*LoopBody).getParent()->getParent(),
               P->getAnalysis<DominatorTree>());

  Builder.SetInsertPoint(AfterLoop);
}

#ifdef GPU_CODEGEN
static unsigned getArraySizeInBytes(const ArrayType *AT) {
  unsigned Bytes = AT->getNumElements();
  if (const ArrayType *T = dyn_cast<ArrayType>(AT->getElementType()))
    Bytes *= getArraySizeInBytes(T);
  else
    Bytes *= AT->getElementType()->getPrimitiveSizeInBits() / 8;

  return Bytes;
}

SetVector<Value *> ClastStmtCodeGen::getGPUValues(unsigned &OutputBytes) {
  SetVector<Value *> Values;
  OutputBytes = 0;

  // Record the memory reference base addresses.
  for (Scop::iterator SI = S->begin(), SE = S->end(); SI != SE; ++SI) {
    ScopStmt *Stmt = *SI;
    for (SmallVector<MemoryAccess *, 8>::iterator I = Stmt->memacc_begin(),
                                                  E = Stmt->memacc_end();
         I != E; ++I) {
      Value *BaseAddr = const_cast<Value *>((*I)->getBaseAddr());
      Values.insert((BaseAddr));

      // FIXME: we assume that there is one and only one array to be written
      // in a SCoP.
      int NumWrites = 0;
      if ((*I)->isWrite()) {
        ++NumWrites;
        assert(NumWrites <= 1 &&
               "We support at most one array to be written in a SCoP.");
        if (const PointerType *PT =
                dyn_cast<PointerType>(BaseAddr->getType())) {
          Type *T = PT->getArrayElementType();
          const ArrayType *ATy = dyn_cast<ArrayType>(T);
          OutputBytes = getArraySizeInBytes(ATy);
        }
      }
    }
  }

  return Values;
}

const clast_stmt *ClastStmtCodeGen::getScheduleInfo(const clast_for *F,
                                                    std::vector<int> &NumIters,
                                                    unsigned &LoopDepth,
                                                    unsigned &NonPLoopDepth) {
  clast_stmt *Stmt = (clast_stmt *)F;
  const clast_for *Result;
  bool NonParaFlag = false;
  LoopDepth = 0;
  NonPLoopDepth = 0;

  while (Stmt) {
    if (CLAST_STMT_IS_A(Stmt, stmt_for)) {
      const clast_for *T = (clast_for *)Stmt;
      if (isParallelFor(T)) {
        if (!NonParaFlag) {
          NumIters.push_back(getNumberOfIterations(T));
          Result = T;
        }
      } else
        NonParaFlag = true;

      Stmt = T->body;
      LoopDepth++;
      continue;
    }
    Stmt = Stmt->next;
  }

  assert(NumIters.size() == 4 &&
         "The loops should be tiled into 4-depth parallel loops and an "
         "innermost non-parallel one (if it exists).");
  NonPLoopDepth = LoopDepth - NumIters.size();
  assert(NonPLoopDepth <= 1 &&
         "We support only one innermost non-parallel loop currently.");
  return (const clast_stmt *)Result->body;
}

void ClastStmtCodeGen::codegenForGPGPU(const clast_for *F) {
  BasicBlock::iterator LoopBody;
  SetVector<Value *> Values;
  SetVector<Value *> IVS;
  std::vector<int> NumIterations;
  PTXGenerator::ValueToValueMapTy VMap;

  assert(!GPUTriple.empty() &&
         "Target triple should be set properly for GPGPU code generation.");
  PTXGenerator PTXGen(Builder, P, GPUTriple);

  // Get the original IVS and ScopStmt.
  unsigned TiledLoopDepth, NonPLoopDepth;
  const clast_stmt *InnerStmt =
      getScheduleInfo(F, NumIterations, TiledLoopDepth, NonPLoopDepth);
  const clast_stmt *TmpStmt;
  const clast_user_stmt *U;
  const clast_for *InnerFor;
  if (CLAST_STMT_IS_A(InnerStmt, stmt_for)) {
    InnerFor = (const clast_for *)InnerStmt;
    TmpStmt = InnerFor->body;
  } else
    TmpStmt = InnerStmt;
  U = (const clast_user_stmt *)TmpStmt;
  ScopStmt *Statement = (ScopStmt *)U->statement->usr;
  for (unsigned i = 0; i < Statement->getNumIterators() - NonPLoopDepth; i++) {
    const Value *IV = Statement->getInductionVariableForDimension(i);
    IVS.insert(const_cast<Value *>(IV));
  }

  unsigned OutBytes;
  Values = getGPUValues(OutBytes);
  PTXGen.setOutputBytes(OutBytes);
  PTXGen.startGeneration(Values, IVS, VMap, &LoopBody);

  BasicBlock::iterator AfterLoop = Builder.GetInsertPoint();
  Builder.SetInsertPoint(LoopBody);

  BasicBlock *AfterBB = 0;
  if (NonPLoopDepth) {
    Value *LowerBound, *UpperBound, *IV, *Stride;
    Type *IntPtrTy = getIntPtrTy();
    LowerBound = ExpGen.codegen(InnerFor->LB, IntPtrTy);
    UpperBound = ExpGen.codegen(InnerFor->UB, IntPtrTy);
    Stride = Builder.getInt(APInt_from_MPZ(InnerFor->stride));
    IV = createLoop(LowerBound, UpperBound, Stride, Builder, P, AfterBB,
                    CmpInst::ICMP_SLE);
    const Value *OldIV_ = Statement->getInductionVariableForDimension(2);
    Value *OldIV = const_cast<Value *>(OldIV_);
    VMap.insert(std::make_pair<Value *, Value *>(OldIV, IV));
  }

  updateWithValueMap(VMap);

  BlockGenerator::generate(Builder, *Statement, ValueMap, P);

  if (AfterBB)
    Builder.SetInsertPoint(AfterBB->begin());

  // FIXME: The replacement of the host base address with the parameter of the
  // ptx subfunction should have been done by updateWithValueMap. We use the
  // following code to avoid affecting other parts of Polly. This should be
  // fixed later.
  Function *FN = Builder.GetInsertBlock()->getParent();
  for (unsigned j = 0; j < Values.size(); j++) {
    Value *baseAddr = Values[j];
    for (Function::iterator B = FN->begin(); B != FN->end(); ++B) {
      for (BasicBlock::iterator I = B->begin(); I != B->end(); ++I)
        I->replaceUsesOfWith(baseAddr, ValueMap[baseAddr]);
    }
  }
  Builder.SetInsertPoint(AfterLoop);
  PTXGen.setLaunchingParameters(NumIterations[0], NumIterations[1],
                                NumIterations[2], NumIterations[3]);
  PTXGen.finishGeneration(FN);
}
#endif

bool ClastStmtCodeGen::isInnermostLoop(const clast_for *f) {
  const clast_stmt *stmt = f->body;

  while (stmt) {
    if (!CLAST_STMT_IS_A(stmt, stmt_user))
      return false;

    stmt = stmt->next;
  }

  return true;
}

int ClastStmtCodeGen::getNumberOfIterations(const clast_for *For) {
  isl_set *LoopDomain = isl_set_copy(isl_set_from_cloog_domain(For->domain));
  int NumberOfIterations = polly::getNumberOfIterations(LoopDomain);
  if (NumberOfIterations == -1)
    return -1;
  return NumberOfIterations / isl_int_get_si(For->stride) + 1;
}

void ClastStmtCodeGen::codegenForVector(const clast_for *F) {
  DEBUG(dbgs() << "Vectorizing loop '" << F->iterator << "'\n";);
  int VectorWidth = getNumberOfIterations(F);

  Value *LB = ExpGen.codegen(F->LB, getIntPtrTy());

  APInt Stride = APInt_from_MPZ(F->stride);
  IntegerType *LoopIVType = dyn_cast<IntegerType>(LB->getType());
  Stride = Stride.zext(LoopIVType->getBitWidth());
  Value *StrideValue = ConstantInt::get(LoopIVType, Stride);

  std::vector<Value *> IVS(VectorWidth);
  IVS[0] = LB;

  for (int i = 1; i < VectorWidth; i++)
    IVS[i] = Builder.CreateAdd(IVS[i - 1], StrideValue, "p_vector_iv");

  isl_set *Domain = isl_set_copy(isl_set_from_cloog_domain(F->domain));

  // Add loop iv to symbols.
  ClastVars[F->iterator] = LB;

  const clast_stmt *Stmt = F->body;

  while (Stmt) {
    codegen((const clast_user_stmt *)Stmt, &IVS, F->iterator,
            isl_set_copy(Domain));
    Stmt = Stmt->next;
  }

  // Loop is finished, so remove its iv from the live symbols.
  isl_set_free(Domain);
  ClastVars.erase(F->iterator);
}

bool ClastStmtCodeGen::isParallelFor(const clast_for *f) {
  isl_set *Domain = isl_set_copy(isl_set_from_cloog_domain(f->domain));
  assert(Domain && "Cannot access domain of loop");

  Dependences &D = P->getAnalysis<Dependences>();

  return D.isParallelDimension(Domain, isl_set_n_dim(Domain));
}

void ClastStmtCodeGen::codegen(const clast_for *f) {
  bool Vector = PollyVectorizerChoice != VECTORIZER_NONE;
  if ((Vector || OpenMP) && isParallelFor(f)) {
    if (Vector && isInnermostLoop(f) && (-1 != getNumberOfIterations(f)) &&
        (getNumberOfIterations(f) <= 16)) {
      codegenForVector(f);
      return;
    }

    if (OpenMP && !parallelCodeGeneration) {
      parallelCodeGeneration = true;
      parallelLoops.push_back(f->iterator);
      codegenForOpenMP(f);
      parallelCodeGeneration = false;
      return;
    }
  }

#ifdef GPU_CODEGEN
  if (GPGPU && isParallelFor(f)) {
    if (!parallelCodeGeneration) {
      parallelCodeGeneration = true;
      parallelLoops.push_back(f->iterator);
      codegenForGPGPU(f);
      parallelCodeGeneration = false;
      return;
    }
  }
#endif

  codegenForSequential(f);
}

Value *ClastStmtCodeGen::codegen(const clast_equation *eq) {
  Value *LHS = ExpGen.codegen(eq->LHS, getIntPtrTy());
  Value *RHS = ExpGen.codegen(eq->RHS, getIntPtrTy());
  CmpInst::Predicate P;

  if (eq->sign == 0)
    P = ICmpInst::ICMP_EQ;
  else if (eq->sign > 0)
    P = ICmpInst::ICMP_SGE;
  else
    P = ICmpInst::ICMP_SLE;

  return Builder.CreateICmp(P, LHS, RHS);
}

void ClastStmtCodeGen::codegen(const clast_guard *g) {
  Function *F = Builder.GetInsertBlock()->getParent();
  LLVMContext &Context = F->getContext();

  BasicBlock *CondBB =
      SplitBlock(Builder.GetInsertBlock(), Builder.GetInsertPoint(), P);
  CondBB->setName("polly.cond");
  BasicBlock *MergeBB = SplitBlock(CondBB, CondBB->begin(), P);
  MergeBB->setName("polly.merge");
  BasicBlock *ThenBB = BasicBlock::Create(Context, "polly.then", F);

  DominatorTree &DT = P->getAnalysis<DominatorTree>();
  DT.addNewBlock(ThenBB, CondBB);
  DT.changeImmediateDominator(MergeBB, CondBB);

  CondBB->getTerminator()->eraseFromParent();

  Builder.SetInsertPoint(CondBB);

  Value *Predicate = codegen(&(g->eq[0]));

  for (int i = 1; i < g->n; ++i) {
    Value *TmpPredicate = codegen(&(g->eq[i]));
    Predicate = Builder.CreateAnd(Predicate, TmpPredicate);
  }

  Builder.CreateCondBr(Predicate, ThenBB, MergeBB);
  Builder.SetInsertPoint(ThenBB);
  Builder.CreateBr(MergeBB);
  Builder.SetInsertPoint(ThenBB->begin());

  LoopInfo &LI = P->getAnalysis<LoopInfo>();
  Loop *L = LI.getLoopFor(CondBB);
  if (L) {
    L->addBasicBlockToLoop(ThenBB, LI.getBase());
    L->addBasicBlockToLoop(MergeBB, LI.getBase());
  }

  codegen(g->then);

  Builder.SetInsertPoint(MergeBB->begin());
}

void ClastStmtCodeGen::codegen(const clast_stmt *stmt) {
  if (CLAST_STMT_IS_A(stmt, stmt_root))
    assert(false && "No second root statement expected");
  else if (CLAST_STMT_IS_A(stmt, stmt_ass))
    codegen((const clast_assignment *)stmt);
  else if (CLAST_STMT_IS_A(stmt, stmt_user))
    codegen((const clast_user_stmt *)stmt);
  else if (CLAST_STMT_IS_A(stmt, stmt_block))
    codegen((const clast_block *)stmt);
  else if (CLAST_STMT_IS_A(stmt, stmt_for))
    codegen((const clast_for *)stmt);
  else if (CLAST_STMT_IS_A(stmt, stmt_guard))
    codegen((const clast_guard *)stmt);

  if (stmt->next)
    codegen(stmt->next);
}

void ClastStmtCodeGen::addParameters(const CloogNames *names) {
  SCEVExpander Rewriter(P->getAnalysis<ScalarEvolution>(), "polly");

  int i = 0;
  for (Scop::param_iterator PI = S->param_begin(), PE = S->param_end();
       PI != PE; ++PI) {
    assert(i < names->nb_parameters && "Not enough parameter names");

    const SCEV *Param = *PI;
    Type *Ty = Param->getType();

    Instruction *insertLocation = --(Builder.GetInsertBlock()->end());
    Value *V = Rewriter.expandCodeFor(Param, Ty, insertLocation);
    ClastVars[names->parameters[i]] = V;

    ++i;
  }
}

void ClastStmtCodeGen::codegen(const clast_root *r) {
  addParameters(r->names);

  parallelCodeGeneration = false;

  const clast_stmt *stmt = (const clast_stmt *)r;
  if (stmt->next)
    codegen(stmt->next);
}

ClastStmtCodeGen::ClastStmtCodeGen(Scop *scop, IRBuilder<> &B, Pass *P)
    : S(scop), P(P), Builder(B), ExpGen(Builder, ClastVars) {}

namespace {
class CodeGeneration : public ScopPass {
  std::vector<std::string> ParallelLoops;

public:
  static char ID;

  CodeGeneration() : ScopPass(ID) {}

  bool runOnScop(Scop &S) {
    ParallelLoops.clear();

    assert(!S.getRegion().isTopLevelRegion() &&
           "Top level regions are not supported");

    simplifyRegion(&S, this);

    BasicBlock *StartBlock = executeScopConditionally(S, this);

    IRBuilder<> Builder(StartBlock->begin());

    ClastStmtCodeGen CodeGen(&S, Builder, this);
    CloogInfo &C = getAnalysis<CloogInfo>();
    CodeGen.codegen(C.getClast());

    ParallelLoops.insert(ParallelLoops.begin(),
                         CodeGen.getParallelLoops().begin(),
                         CodeGen.getParallelLoops().end());
    return true;
  }

  virtual void printScop(raw_ostream &OS) const {
    for (std::vector<std::string>::const_iterator PI = ParallelLoops.begin(),
                                                  PE = ParallelLoops.end();
         PI != PE; ++PI)
      OS << "Parallel loop with iterator '" << *PI << "' generated\n";
  }

  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
    AU.addRequired<CloogInfo>();
    AU.addRequired<Dependences>();
    AU.addRequired<DominatorTree>();
    AU.addRequired<RegionInfo>();
    AU.addRequired<ScalarEvolution>();
    AU.addRequired<ScopDetection>();
    AU.addRequired<ScopInfo>();
    AU.addRequired<DataLayout>();
    AU.addRequired<LoopInfo>();

    AU.addPreserved<CloogInfo>();
    AU.addPreserved<Dependences>();
    AU.addPreserved<LoopInfo>();
    AU.addPreserved<DominatorTree>();
    AU.addPreserved<ScopDetection>();
    AU.addPreserved<ScalarEvolution>();

    // FIXME: We do not yet add regions for the newly generated code to the
    // region tree.
    AU.addPreserved<RegionInfo>();
    AU.addPreserved<TempScopInfo>();
    AU.addPreserved<ScopInfo>();
    AU.addPreservedID(IndependentBlocksID);
  }
};
}

char CodeGeneration::ID = 1;

Pass *polly::createCodeGenerationPass() { return new CodeGeneration(); }

INITIALIZE_PASS_BEGIN(CodeGeneration, "polly-codegen",
                      "Polly - Create LLVM-IR from SCoPs", false, false);
INITIALIZE_PASS_DEPENDENCY(CloogInfo);
INITIALIZE_PASS_DEPENDENCY(Dependences);
INITIALIZE_PASS_DEPENDENCY(DominatorTree);
INITIALIZE_PASS_DEPENDENCY(RegionInfo);
INITIALIZE_PASS_DEPENDENCY(ScalarEvolution);
INITIALIZE_PASS_DEPENDENCY(ScopDetection);
INITIALIZE_PASS_DEPENDENCY(DataLayout);
INITIALIZE_PASS_END(CodeGeneration, "polly-codegen",
                    "Polly - Create LLVM-IR from SCoPs", false, false)

#endif // CLOOG_FOUND