1 //===------ CodeGeneration.cpp - Code generate the Scops. -----------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // The CodeGeneration pass takes a Scop created by ScopInfo and translates it 11 // back to LLVM-IR using Cloog. 12 // 13 // The Scop describes the high level memory behaviour of a control flow region. 14 // Transformation passes can update the schedule (execution order) of statements 15 // in the Scop. Cloog is used to generate an abstract syntax tree (clast) that 16 // reflects the updated execution order. This clast is used to create new 17 // LLVM-IR that is computational equivalent to the original control flow region, 18 // but executes its code in the new execution order defined by the changed 19 // scattering. 20 // 21 //===----------------------------------------------------------------------===// 22 23 #include "polly/CodeGen/Cloog.h" 24 #ifdef CLOOG_FOUND 25 26 #define DEBUG_TYPE "polly-codegen" 27 #include "polly/Dependences.h" 28 #include "polly/LinkAllPasses.h" 29 #include "polly/ScopInfo.h" 30 #include "polly/TempScopInfo.h" 31 #include "polly/CodeGen/CodeGeneration.h" 32 #include "polly/CodeGen/BlockGenerators.h" 33 #include "polly/CodeGen/LoopGenerators.h" 34 #include "polly/CodeGen/PTXGenerator.h" 35 #include "polly/CodeGen/Utils.h" 36 #include "polly/Support/GICHelper.h" 37 38 #include "llvm/IR/Module.h" 39 #include "llvm/ADT/SetVector.h" 40 #include "llvm/ADT/PostOrderIterator.h" 41 #include "llvm/Analysis/LoopInfo.h" 42 #include "llvm/Analysis/ScalarEvolutionExpander.h" 43 #include "llvm/Support/CommandLine.h" 44 #include "llvm/Support/Debug.h" 45 #include "llvm/IR/DataLayout.h" 46 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 47 48 #define CLOOG_INT_GMP 1 49 #include "cloog/cloog.h" 50 #include "cloog/isl/cloog.h" 51 52 #include "isl/aff.h" 53 54 #include <vector> 55 #include <utility> 56 57 using namespace polly; 58 using namespace llvm; 59 60 struct isl_set; 61 62 namespace polly { 63 static cl::opt<bool> 64 OpenMP("enable-polly-openmp", cl::desc("Generate OpenMP parallel code"), 65 cl::Hidden, cl::value_desc("OpenMP code generation enabled if true"), 66 cl::init(false), cl::ZeroOrMore); 67 68 #ifdef GPU_CODEGEN 69 static cl::opt<bool> 70 GPGPU("enable-polly-gpgpu", cl::desc("Generate GPU parallel code"), cl::Hidden, 71 cl::value_desc("GPGPU code generation enabled if true"), cl::init(false), 72 cl::ZeroOrMore); 73 74 static cl::opt<std::string> GPUTriple( 75 "polly-gpgpu-triple", cl::desc("Target triple for GPU code generation"), 76 cl::Hidden, cl::init("")); 77 #endif /* GPU_CODEGEN */ 78 79 typedef DenseMap<const char *, Value *> CharMapT; 80 81 /// Class to generate LLVM-IR that calculates the value of a clast_expr. 82 class ClastExpCodeGen { 83 IRBuilder<> &Builder; 84 const CharMapT &IVS; 85 86 Value *codegen(const clast_name *e, Type *Ty); 87 Value *codegen(const clast_term *e, Type *Ty); 88 Value *codegen(const clast_binary *e, Type *Ty); 89 Value *codegen(const clast_reduction *r, Type *Ty); 90 public: 91 92 // A generator for clast expressions. 93 // 94 // @param B The IRBuilder that defines where the code to calculate the 95 // clast expressions should be inserted. 96 // @param IVMAP A Map that translates strings describing the induction 97 // variables to the Values* that represent these variables 98 // on the LLVM side. 99 ClastExpCodeGen(IRBuilder<> &B, CharMapT &IVMap); 100 101 // Generates code to calculate a given clast expression. 102 // 103 // @param e The expression to calculate. 104 // @return The Value that holds the result. 105 Value *codegen(const clast_expr *e, Type *Ty); 106 }; 107 108 Value *ClastExpCodeGen::codegen(const clast_name *e, Type *Ty) { 109 CharMapT::const_iterator I = IVS.find(e->name); 110 111 assert(I != IVS.end() && "Clast name not found"); 112 113 return Builder.CreateSExtOrBitCast(I->second, Ty); 114 } 115 116 Value *ClastExpCodeGen::codegen(const clast_term *e, Type *Ty) { 117 APInt a = APInt_from_MPZ(e->val); 118 119 Value *ConstOne = ConstantInt::get(Builder.getContext(), a); 120 ConstOne = Builder.CreateSExtOrBitCast(ConstOne, Ty); 121 122 if (!e->var) 123 return ConstOne; 124 125 Value *var = codegen(e->var, Ty); 126 return Builder.CreateMul(ConstOne, var); 127 } 128 129 Value *ClastExpCodeGen::codegen(const clast_binary *e, Type *Ty) { 130 Value *LHS = codegen(e->LHS, Ty); 131 132 APInt RHS_AP = APInt_from_MPZ(e->RHS); 133 134 Value *RHS = ConstantInt::get(Builder.getContext(), RHS_AP); 135 RHS = Builder.CreateSExtOrBitCast(RHS, Ty); 136 137 switch (e->type) { 138 case clast_bin_mod: 139 return Builder.CreateSRem(LHS, RHS); 140 case clast_bin_fdiv: { 141 // floord(n,d) ((n < 0) ? (n - d + 1) : n) / d 142 Value *One = ConstantInt::get(Ty, 1); 143 Value *Zero = ConstantInt::get(Ty, 0); 144 Value *Sum1 = Builder.CreateSub(LHS, RHS); 145 Value *Sum2 = Builder.CreateAdd(Sum1, One); 146 Value *isNegative = Builder.CreateICmpSLT(LHS, Zero); 147 Value *Dividend = Builder.CreateSelect(isNegative, Sum2, LHS); 148 return Builder.CreateSDiv(Dividend, RHS); 149 } 150 case clast_bin_cdiv: { 151 // ceild(n,d) ((n < 0) ? n : (n + d - 1)) / d 152 Value *One = ConstantInt::get(Ty, 1); 153 Value *Zero = ConstantInt::get(Ty, 0); 154 Value *Sum1 = Builder.CreateAdd(LHS, RHS); 155 Value *Sum2 = Builder.CreateSub(Sum1, One); 156 Value *isNegative = Builder.CreateICmpSLT(LHS, Zero); 157 Value *Dividend = Builder.CreateSelect(isNegative, LHS, Sum2); 158 return Builder.CreateSDiv(Dividend, RHS); 159 } 160 case clast_bin_div: 161 return Builder.CreateSDiv(LHS, RHS); 162 } 163 164 llvm_unreachable("Unknown clast binary expression type"); 165 } 166 167 Value *ClastExpCodeGen::codegen(const clast_reduction *r, Type *Ty) { 168 assert((r->type == clast_red_min || r->type == clast_red_max || 169 r->type == clast_red_sum) && "Clast reduction type not supported"); 170 Value *old = codegen(r->elts[0], Ty); 171 172 for (int i = 1; i < r->n; ++i) { 173 Value *exprValue = codegen(r->elts[i], Ty); 174 175 switch (r->type) { 176 case clast_red_min: { 177 Value *cmp = Builder.CreateICmpSLT(old, exprValue); 178 old = Builder.CreateSelect(cmp, old, exprValue); 179 break; 180 } 181 case clast_red_max: { 182 Value *cmp = Builder.CreateICmpSGT(old, exprValue); 183 old = Builder.CreateSelect(cmp, old, exprValue); 184 break; 185 } 186 case clast_red_sum: 187 old = Builder.CreateAdd(old, exprValue); 188 break; 189 } 190 } 191 192 return old; 193 } 194 195 ClastExpCodeGen::ClastExpCodeGen(IRBuilder<> &B, CharMapT &IVMap) 196 : Builder(B), IVS(IVMap) {} 197 198 Value *ClastExpCodeGen::codegen(const clast_expr *e, Type *Ty) { 199 switch (e->type) { 200 case clast_expr_name: 201 return codegen((const clast_name *)e, Ty); 202 case clast_expr_term: 203 return codegen((const clast_term *)e, Ty); 204 case clast_expr_bin: 205 return codegen((const clast_binary *)e, Ty); 206 case clast_expr_red: 207 return codegen((const clast_reduction *)e, Ty); 208 } 209 210 llvm_unreachable("Unknown clast expression!"); 211 } 212 213 class ClastStmtCodeGen { 214 public: 215 const std::vector<std::string> &getParallelLoops(); 216 217 private: 218 // The Scop we code generate. 219 Scop *S; 220 Pass *P; 221 222 // The Builder specifies the current location to code generate at. 223 IRBuilder<> &Builder; 224 225 // Map the Values from the old code to their counterparts in the new code. 226 ValueMapT ValueMap; 227 228 // Map the loops from the old code to expressions function of the induction 229 // variables in the new code. For example, when the code generator produces 230 // this AST: 231 // 232 // for (int c1 = 0; c1 <= 1023; c1 += 1) 233 // for (int c2 = 0; c2 <= 1023; c2 += 1) 234 // Stmt(c2 + 3, c1); 235 // 236 // LoopToScev is a map associating: 237 // "outer loop in the old loop nest" -> SCEV("c2 + 3"), 238 // "inner loop in the old loop nest" -> SCEV("c1"). 239 LoopToScevMapT LoopToScev; 240 241 // clastVars maps from the textual representation of a clast variable to its 242 // current *Value. clast variables are scheduling variables, original 243 // induction variables or parameters. They are used either in loop bounds or 244 // to define the statement instance that is executed. 245 // 246 // for (s = 0; s < n + 3; ++i) 247 // for (t = s; t < m; ++j) 248 // Stmt(i = s + 3 * m, j = t); 249 // 250 // {s,t,i,j,n,m} is the set of clast variables in this clast. 251 CharMapT ClastVars; 252 253 // Codegenerator for clast expressions. 254 ClastExpCodeGen ExpGen; 255 256 // Do we currently generate parallel code? 257 bool parallelCodeGeneration; 258 259 std::vector<std::string> parallelLoops; 260 261 void codegen(const clast_assignment *a); 262 263 void codegen(const clast_assignment *a, ScopStmt *Statement, 264 unsigned Dimension, int vectorDim, 265 std::vector<ValueMapT> *VectorVMap = 0, 266 std::vector<LoopToScevMapT> *VLTS = 0); 267 268 void codegenSubstitutions(const clast_stmt *Assignment, ScopStmt *Statement, 269 int vectorDim = 0, 270 std::vector<ValueMapT> *VectorVMap = 0, 271 std::vector<LoopToScevMapT> *VLTS = 0); 272 273 void codegen(const clast_user_stmt *u, std::vector<Value *> *IVS = NULL, 274 const char *iterator = NULL, isl_set *scatteringDomain = 0); 275 276 void codegen(const clast_block *b); 277 278 /// @brief Create a classical sequential loop. 279 void codegenForSequential(const clast_for *f); 280 281 /// @brief Create OpenMP structure values. 282 /// 283 /// Create a list of values that has to be stored into the OpenMP subfuncition 284 /// structure. 285 SetVector<Value *> getOMPValues(const clast_stmt *Body); 286 287 /// @brief Update ClastVars and ValueMap according to a value map. 288 /// 289 /// @param VMap A map from old to new values. 290 void updateWithValueMap(OMPGenerator::ValueToValueMapTy &VMap); 291 292 /// @brief Create an OpenMP parallel for loop. 293 /// 294 /// This loop reflects a loop as if it would have been created by an OpenMP 295 /// statement. 296 void codegenForOpenMP(const clast_for *f); 297 298 #ifdef GPU_CODEGEN 299 /// @brief Create GPGPU device memory access values. 300 /// 301 /// Create a list of values that will be set to be parameters of the GPGPU 302 /// subfunction. These parameters represent device memory base addresses 303 /// and the size in bytes. 304 SetVector<Value *> getGPUValues(unsigned &OutputBytes); 305 306 /// @brief Create a GPU parallel for loop. 307 /// 308 /// This loop reflects a loop as if it would have been created by a GPU 309 /// statement. 310 void codegenForGPGPU(const clast_for *F); 311 312 /// @brief Get innermost for loop. 313 const clast_stmt * 314 getScheduleInfo(const clast_for *F, std::vector<int> &NumIters, 315 unsigned &LoopDepth, unsigned &NonPLoopDepth); 316 #endif /* GPU_CODEGEN */ 317 318 /// @brief Check if a loop is parallel 319 /// 320 /// Detect if a clast_for loop can be executed in parallel. 321 /// 322 /// @param For The clast for loop to check. 323 /// 324 /// @return bool Returns true if the incoming clast_for statement can 325 /// execute in parallel. 326 bool isParallelFor(const clast_for *For); 327 328 bool isInnermostLoop(const clast_for *f); 329 330 /// @brief Get the number of loop iterations for this loop. 331 /// @param f The clast for loop to check. 332 int getNumberOfIterations(const clast_for *f); 333 334 /// @brief Create vector instructions for this loop. 335 void codegenForVector(const clast_for *f); 336 337 void codegen(const clast_for *f); 338 339 Value *codegen(const clast_equation *eq); 340 341 void codegen(const clast_guard *g); 342 343 void codegen(const clast_stmt *stmt); 344 345 void addParameters(const CloogNames *names); 346 347 IntegerType *getIntPtrTy(); 348 349 public: 350 void codegen(const clast_root *r); 351 352 ClastStmtCodeGen(Scop *scop, IRBuilder<> &B, Pass *P); 353 }; 354 } 355 356 IntegerType *ClastStmtCodeGen::getIntPtrTy() { 357 return P->getAnalysis<DataLayout>().getIntPtrType(Builder.getContext()); 358 } 359 360 const std::vector<std::string> &ClastStmtCodeGen::getParallelLoops() { 361 return parallelLoops; 362 } 363 364 void ClastStmtCodeGen::codegen(const clast_assignment *a) { 365 Value *V = ExpGen.codegen(a->RHS, getIntPtrTy()); 366 ClastVars[a->LHS] = V; 367 } 368 369 void ClastStmtCodeGen::codegen(const clast_assignment *A, ScopStmt *Stmt, 370 unsigned Dim, int VectorDim, 371 std::vector<ValueMapT> *VectorVMap, 372 std::vector<LoopToScevMapT> *VLTS) { 373 Value *RHS; 374 375 assert(!A->LHS && "Statement assignments do not have left hand side"); 376 377 RHS = ExpGen.codegen(A->RHS, Builder.getInt64Ty()); 378 379 const llvm::SCEV *URHS = S->getSE()->getUnknown(RHS); 380 if (VLTS) 381 (*VLTS)[VectorDim][Stmt->getLoopForDimension(Dim)] = URHS; 382 LoopToScev[Stmt->getLoopForDimension(Dim)] = URHS; 383 384 const PHINode *PN = Stmt->getInductionVariableForDimension(Dim); 385 if (PN) { 386 RHS = Builder.CreateTruncOrBitCast(RHS, PN->getType()); 387 388 if (VectorVMap) 389 (*VectorVMap)[VectorDim][PN] = RHS; 390 391 ValueMap[PN] = RHS; 392 } 393 } 394 395 void ClastStmtCodeGen::codegenSubstitutions( 396 const clast_stmt *Assignment, ScopStmt *Statement, int vectorDim, 397 std::vector<ValueMapT> *VectorVMap, std::vector<LoopToScevMapT> *VLTS) { 398 int Dimension = 0; 399 400 while (Assignment) { 401 assert(CLAST_STMT_IS_A(Assignment, stmt_ass) && 402 "Substitions are expected to be assignments"); 403 codegen((const clast_assignment *)Assignment, Statement, Dimension, 404 vectorDim, VectorVMap, VLTS); 405 Assignment = Assignment->next; 406 Dimension++; 407 } 408 } 409 410 // Takes the cloog specific domain and translates it into a map Statement -> 411 // PartialSchedule, where the PartialSchedule contains all the dimensions that 412 // have been code generated up to this point. 413 static __isl_give isl_map * 414 extractPartialSchedule(ScopStmt *Statement, isl_set *Domain) { 415 isl_map *Schedule = Statement->getScattering(); 416 int ScheduledDimensions = isl_set_dim(Domain, isl_dim_set); 417 int UnscheduledDimensions = 418 isl_map_dim(Schedule, isl_dim_out) - ScheduledDimensions; 419 420 return isl_map_project_out(Schedule, isl_dim_out, ScheduledDimensions, 421 UnscheduledDimensions); 422 } 423 424 void ClastStmtCodeGen::codegen(const clast_user_stmt *u, 425 std::vector<Value *> *IVS, const char *iterator, 426 isl_set *Domain) { 427 ScopStmt *Statement = (ScopStmt *)u->statement->usr; 428 429 if (u->substitutions) 430 codegenSubstitutions(u->substitutions, Statement); 431 432 int VectorDimensions = IVS ? IVS->size() : 1; 433 434 if (VectorDimensions == 1) { 435 BlockGenerator::generate(Builder, *Statement, ValueMap, LoopToScev, P); 436 return; 437 } 438 439 VectorValueMapT VectorMap(VectorDimensions); 440 std::vector<LoopToScevMapT> VLTS(VectorDimensions); 441 442 if (IVS) { 443 assert(u->substitutions && "Substitutions expected!"); 444 int i = 0; 445 for (std::vector<Value *>::iterator II = IVS->begin(), IE = IVS->end(); 446 II != IE; ++II) { 447 ClastVars[iterator] = *II; 448 codegenSubstitutions(u->substitutions, Statement, i, &VectorMap, &VLTS); 449 i++; 450 } 451 } 452 453 isl_map *Schedule = extractPartialSchedule(Statement, Domain); 454 VectorBlockGenerator::generate(Builder, *Statement, VectorMap, VLTS, Schedule, 455 P); 456 isl_map_free(Schedule); 457 } 458 459 void ClastStmtCodeGen::codegen(const clast_block *b) { 460 if (b->body) 461 codegen(b->body); 462 } 463 464 void ClastStmtCodeGen::codegenForSequential(const clast_for *f) { 465 Value *LowerBound, *UpperBound, *IV, *Stride; 466 BasicBlock *AfterBB; 467 Type *IntPtrTy = getIntPtrTy(); 468 469 LowerBound = ExpGen.codegen(f->LB, IntPtrTy); 470 UpperBound = ExpGen.codegen(f->UB, IntPtrTy); 471 Stride = Builder.getInt(APInt_from_MPZ(f->stride)); 472 473 IV = createLoop(LowerBound, UpperBound, Stride, Builder, P, AfterBB, 474 CmpInst::ICMP_SLE); 475 476 // Add loop iv to symbols. 477 ClastVars[f->iterator] = IV; 478 479 if (f->body) 480 codegen(f->body); 481 482 // Loop is finished, so remove its iv from the live symbols. 483 ClastVars.erase(f->iterator); 484 Builder.SetInsertPoint(AfterBB->begin()); 485 } 486 487 // Helper class to determine all scalar parameters used in the basic blocks of a 488 // clast. Scalar parameters are scalar variables defined outside of the SCoP. 489 class ParameterVisitor : public ClastVisitor { 490 std::set<Value *> Values; 491 public: 492 ParameterVisitor() : ClastVisitor(), Values() {} 493 494 void visitUser(const clast_user_stmt *Stmt) { 495 const ScopStmt *S = static_cast<const ScopStmt *>(Stmt->statement->usr); 496 const BasicBlock *BB = S->getBasicBlock(); 497 498 // Check all the operands of instructions in the basic block. 499 for (BasicBlock::const_iterator BI = BB->begin(), BE = BB->end(); BI != BE; 500 ++BI) { 501 const Instruction &Inst = *BI; 502 for (Instruction::const_op_iterator II = Inst.op_begin(), 503 IE = Inst.op_end(); 504 II != IE; ++II) { 505 Value *SrcVal = *II; 506 507 if (Instruction *OpInst = dyn_cast<Instruction>(SrcVal)) 508 if (S->getParent()->getRegion().contains(OpInst)) 509 continue; 510 511 if (isa<Instruction>(SrcVal) || isa<Argument>(SrcVal)) 512 Values.insert(SrcVal); 513 } 514 } 515 } 516 517 // Iterator to iterate over the values found. 518 typedef std::set<Value *>::const_iterator const_iterator; 519 inline const_iterator begin() const { return Values.begin(); } 520 inline const_iterator end() const { return Values.end(); } 521 }; 522 523 SetVector<Value *> ClastStmtCodeGen::getOMPValues(const clast_stmt *Body) { 524 SetVector<Value *> Values; 525 526 // The clast variables 527 for (CharMapT::iterator I = ClastVars.begin(), E = ClastVars.end(); I != E; 528 I++) 529 Values.insert(I->second); 530 531 // Find the temporaries that are referenced in the clast statements' 532 // basic blocks but are not defined by these blocks (e.g., references 533 // to function arguments or temporaries defined before the start of 534 // the SCoP). 535 ParameterVisitor Params; 536 Params.visit(Body); 537 538 for (ParameterVisitor::const_iterator PI = Params.begin(), PE = Params.end(); 539 PI != PE; ++PI) { 540 Value *V = *PI; 541 Values.insert(V); 542 DEBUG(dbgs() << "Adding temporary for OMP copy-in: " << *V << "\n"); 543 } 544 545 return Values; 546 } 547 548 void ClastStmtCodeGen::updateWithValueMap( 549 OMPGenerator::ValueToValueMapTy &VMap) { 550 std::set<Value *> Inserted; 551 552 for (CharMapT::iterator I = ClastVars.begin(), E = ClastVars.end(); I != E; 553 I++) { 554 ClastVars[I->first] = VMap[I->second]; 555 Inserted.insert(I->second); 556 } 557 558 for (OMPGenerator::ValueToValueMapTy::iterator I = VMap.begin(), 559 E = VMap.end(); 560 I != E; ++I) { 561 if (Inserted.count(I->first)) 562 continue; 563 564 ValueMap[I->first] = I->second; 565 } 566 } 567 568 static void clearDomtree(Function *F, DominatorTree &DT) { 569 DomTreeNode *N = DT.getNode(&F->getEntryBlock()); 570 std::vector<BasicBlock *> Nodes; 571 for (po_iterator<DomTreeNode *> I = po_begin(N), E = po_end(N); I != E; ++I) 572 Nodes.push_back(I->getBlock()); 573 574 for (std::vector<BasicBlock *>::iterator I = Nodes.begin(), E = Nodes.end(); 575 I != E; ++I) 576 DT.eraseNode(*I); 577 } 578 579 void ClastStmtCodeGen::codegenForOpenMP(const clast_for *For) { 580 Value *Stride, *LB, *UB, *IV; 581 BasicBlock::iterator LoopBody; 582 IntegerType *IntPtrTy = getIntPtrTy(); 583 SetVector<Value *> Values; 584 OMPGenerator::ValueToValueMapTy VMap; 585 OMPGenerator OMPGen(Builder, P); 586 587 Stride = Builder.getInt(APInt_from_MPZ(For->stride)); 588 Stride = Builder.CreateSExtOrBitCast(Stride, IntPtrTy); 589 LB = ExpGen.codegen(For->LB, IntPtrTy); 590 UB = ExpGen.codegen(For->UB, IntPtrTy); 591 592 Values = getOMPValues(For->body); 593 594 IV = OMPGen.createParallelLoop(LB, UB, Stride, Values, VMap, &LoopBody); 595 BasicBlock::iterator AfterLoop = Builder.GetInsertPoint(); 596 Builder.SetInsertPoint(LoopBody); 597 598 // Save the current values. 599 const ValueMapT ValueMapCopy = ValueMap; 600 const CharMapT ClastVarsCopy = ClastVars; 601 602 updateWithValueMap(VMap); 603 ClastVars[For->iterator] = IV; 604 605 if (For->body) 606 codegen(For->body); 607 608 // Restore the original values. 609 ValueMap = ValueMapCopy; 610 ClastVars = ClastVarsCopy; 611 612 clearDomtree((*LoopBody).getParent()->getParent(), 613 P->getAnalysis<DominatorTree>()); 614 615 Builder.SetInsertPoint(AfterLoop); 616 } 617 618 #ifdef GPU_CODEGEN 619 static unsigned getArraySizeInBytes(const ArrayType *AT) { 620 unsigned Bytes = AT->getNumElements(); 621 if (const ArrayType *T = dyn_cast<ArrayType>(AT->getElementType())) 622 Bytes *= getArraySizeInBytes(T); 623 else 624 Bytes *= AT->getElementType()->getPrimitiveSizeInBits() / 8; 625 626 return Bytes; 627 } 628 629 SetVector<Value *> ClastStmtCodeGen::getGPUValues(unsigned &OutputBytes) { 630 SetVector<Value *> Values; 631 OutputBytes = 0; 632 633 // Record the memory reference base addresses. 634 for (Scop::iterator SI = S->begin(), SE = S->end(); SI != SE; ++SI) { 635 ScopStmt *Stmt = *SI; 636 for (SmallVector<MemoryAccess *, 8>::iterator I = Stmt->memacc_begin(), 637 E = Stmt->memacc_end(); 638 I != E; ++I) { 639 Value *BaseAddr = const_cast<Value *>((*I)->getBaseAddr()); 640 Values.insert((BaseAddr)); 641 642 // FIXME: we assume that there is one and only one array to be written 643 // in a SCoP. 644 int NumWrites = 0; 645 if ((*I)->isWrite()) { 646 ++NumWrites; 647 assert(NumWrites <= 1 && 648 "We support at most one array to be written in a SCoP."); 649 if (const PointerType *PT = 650 dyn_cast<PointerType>(BaseAddr->getType())) { 651 Type *T = PT->getArrayElementType(); 652 const ArrayType *ATy = dyn_cast<ArrayType>(T); 653 OutputBytes = getArraySizeInBytes(ATy); 654 } 655 } 656 } 657 } 658 659 return Values; 660 } 661 662 const clast_stmt *ClastStmtCodeGen::getScheduleInfo( 663 const clast_for *F, std::vector<int> &NumIters, unsigned &LoopDepth, 664 unsigned &NonPLoopDepth) { 665 clast_stmt *Stmt = (clast_stmt *)F; 666 const clast_for *Result; 667 bool NonParaFlag = false; 668 LoopDepth = 0; 669 NonPLoopDepth = 0; 670 671 while (Stmt) { 672 if (CLAST_STMT_IS_A(Stmt, stmt_for)) { 673 const clast_for *T = (clast_for *)Stmt; 674 if (isParallelFor(T)) { 675 if (!NonParaFlag) { 676 NumIters.push_back(getNumberOfIterations(T)); 677 Result = T; 678 } 679 } else 680 NonParaFlag = true; 681 682 Stmt = T->body; 683 LoopDepth++; 684 continue; 685 } 686 Stmt = Stmt->next; 687 } 688 689 assert(NumIters.size() == 4 && 690 "The loops should be tiled into 4-depth parallel loops and an " 691 "innermost non-parallel one (if exist)."); 692 NonPLoopDepth = LoopDepth - NumIters.size(); 693 assert(NonPLoopDepth <= 1 && 694 "We support only one innermost non-parallel loop currently."); 695 return (const clast_stmt *)Result->body; 696 } 697 698 void ClastStmtCodeGen::codegenForGPGPU(const clast_for *F) { 699 BasicBlock::iterator LoopBody; 700 SetVector<Value *> Values; 701 SetVector<Value *> IVS; 702 std::vector<int> NumIterations; 703 PTXGenerator::ValueToValueMapTy VMap; 704 705 assert(!GPUTriple.empty() && 706 "Target triple should be set properly for GPGPU code generation."); 707 PTXGenerator PTXGen(Builder, P, GPUTriple); 708 709 // Get original IVS and ScopStmt 710 unsigned TiledLoopDepth, NonPLoopDepth; 711 const clast_stmt *InnerStmt = 712 getScheduleInfo(F, NumIterations, TiledLoopDepth, NonPLoopDepth); 713 const clast_stmt *TmpStmt; 714 const clast_user_stmt *U; 715 const clast_for *InnerFor; 716 if (CLAST_STMT_IS_A(InnerStmt, stmt_for)) { 717 InnerFor = (const clast_for *)InnerStmt; 718 TmpStmt = InnerFor->body; 719 } else 720 TmpStmt = InnerStmt; 721 U = (const clast_user_stmt *)TmpStmt; 722 ScopStmt *Statement = (ScopStmt *)U->statement->usr; 723 for (unsigned i = 0; i < Statement->getNumIterators() - NonPLoopDepth; i++) { 724 const Value *IV = Statement->getInductionVariableForDimension(i); 725 IVS.insert(const_cast<Value *>(IV)); 726 } 727 728 unsigned OutBytes; 729 Values = getGPUValues(OutBytes); 730 PTXGen.setOutputBytes(OutBytes); 731 PTXGen.startGeneration(Values, IVS, VMap, &LoopBody); 732 733 BasicBlock::iterator AfterLoop = Builder.GetInsertPoint(); 734 Builder.SetInsertPoint(LoopBody); 735 736 BasicBlock *AfterBB = 0; 737 if (NonPLoopDepth) { 738 Value *LowerBound, *UpperBound, *IV, *Stride; 739 Type *IntPtrTy = getIntPtrTy(); 740 LowerBound = ExpGen.codegen(InnerFor->LB, IntPtrTy); 741 UpperBound = ExpGen.codegen(InnerFor->UB, IntPtrTy); 742 Stride = Builder.getInt(APInt_from_MPZ(InnerFor->stride)); 743 IV = createLoop(LowerBound, UpperBound, Stride, Builder, P, AfterBB, 744 CmpInst::ICMP_SLE); 745 const Value *OldIV_ = Statement->getInductionVariableForDimension(2); 746 Value *OldIV = const_cast<Value *>(OldIV_); 747 VMap.insert(std::make_pair<Value *, Value *>(OldIV, IV)); 748 } 749 750 updateWithValueMap(VMap); 751 752 BlockGenerator::generate(Builder, *Statement, ValueMap, P); 753 754 if (AfterBB) 755 Builder.SetInsertPoint(AfterBB->begin()); 756 757 // FIXME: The replacement of the host base address with the parameter of ptx 758 // subfunction should have been done by updateWithValueMap. We use the 759 // following codes to avoid affecting other parts of Polly. This should be 760 // fixed later. 761 Function *FN = Builder.GetInsertBlock()->getParent(); 762 for (unsigned j = 0; j < Values.size(); j++) { 763 Value *baseAddr = Values[j]; 764 for (Function::iterator B = FN->begin(); B != FN->end(); ++B) { 765 for (BasicBlock::iterator I = B->begin(); I != B->end(); ++I) 766 I->replaceUsesOfWith(baseAddr, ValueMap[baseAddr]); 767 } 768 } 769 Builder.SetInsertPoint(AfterLoop); 770 PTXGen.setLaunchingParameters(NumIterations[0], NumIterations[1], 771 NumIterations[2], NumIterations[3]); 772 PTXGen.finishGeneration(FN); 773 } 774 #endif 775 776 bool ClastStmtCodeGen::isInnermostLoop(const clast_for *f) { 777 const clast_stmt *stmt = f->body; 778 779 while (stmt) { 780 if (!CLAST_STMT_IS_A(stmt, stmt_user)) 781 return false; 782 783 stmt = stmt->next; 784 } 785 786 return true; 787 } 788 789 int ClastStmtCodeGen::getNumberOfIterations(const clast_for *For) { 790 isl_set *LoopDomain = isl_set_copy(isl_set_from_cloog_domain(For->domain)); 791 int NumberOfIterations = polly::getNumberOfIterations(LoopDomain); 792 if (NumberOfIterations == -1) 793 return -1; 794 return NumberOfIterations / isl_int_get_si(For->stride) + 1; 795 } 796 797 void ClastStmtCodeGen::codegenForVector(const clast_for *F) { 798 DEBUG(dbgs() << "Vectorizing loop '" << F->iterator << "'\n";); 799 int VectorWidth = getNumberOfIterations(F); 800 801 Value *LB = ExpGen.codegen(F->LB, getIntPtrTy()); 802 803 APInt Stride = APInt_from_MPZ(F->stride); 804 IntegerType *LoopIVType = dyn_cast<IntegerType>(LB->getType()); 805 Stride = Stride.zext(LoopIVType->getBitWidth()); 806 Value *StrideValue = ConstantInt::get(LoopIVType, Stride); 807 808 std::vector<Value *> IVS(VectorWidth); 809 IVS[0] = LB; 810 811 for (int i = 1; i < VectorWidth; i++) 812 IVS[i] = Builder.CreateAdd(IVS[i - 1], StrideValue, "p_vector_iv"); 813 814 isl_set *Domain = isl_set_from_cloog_domain(F->domain); 815 816 // Add loop iv to symbols. 817 ClastVars[F->iterator] = LB; 818 819 const clast_stmt *Stmt = F->body; 820 821 while (Stmt) { 822 codegen((const clast_user_stmt *)Stmt, &IVS, F->iterator, 823 isl_set_copy(Domain)); 824 Stmt = Stmt->next; 825 } 826 827 // Loop is finished, so remove its iv from the live symbols. 828 isl_set_free(Domain); 829 ClastVars.erase(F->iterator); 830 } 831 832 bool ClastStmtCodeGen::isParallelFor(const clast_for *f) { 833 isl_set *Domain = isl_set_from_cloog_domain(f->domain); 834 assert(Domain && "Cannot access domain of loop"); 835 836 Dependences &D = P->getAnalysis<Dependences>(); 837 838 return D.isParallelDimension(isl_set_copy(Domain), isl_set_n_dim(Domain)); 839 } 840 841 void ClastStmtCodeGen::codegen(const clast_for *f) { 842 bool Vector = PollyVectorizerChoice != VECTORIZER_NONE; 843 if ((Vector || OpenMP) && isParallelFor(f)) { 844 if (Vector && isInnermostLoop(f) && (-1 != getNumberOfIterations(f)) && 845 (getNumberOfIterations(f) <= 16)) { 846 codegenForVector(f); 847 return; 848 } 849 850 if (OpenMP && !parallelCodeGeneration) { 851 parallelCodeGeneration = true; 852 parallelLoops.push_back(f->iterator); 853 codegenForOpenMP(f); 854 parallelCodeGeneration = false; 855 return; 856 } 857 } 858 859 #ifdef GPU_CODEGEN 860 if (GPGPU && isParallelFor(f)) { 861 if (!parallelCodeGeneration) { 862 parallelCodeGeneration = true; 863 parallelLoops.push_back(f->iterator); 864 codegenForGPGPU(f); 865 parallelCodeGeneration = false; 866 return; 867 } 868 } 869 #endif 870 871 codegenForSequential(f); 872 } 873 874 Value *ClastStmtCodeGen::codegen(const clast_equation *eq) { 875 Value *LHS = ExpGen.codegen(eq->LHS, getIntPtrTy()); 876 Value *RHS = ExpGen.codegen(eq->RHS, getIntPtrTy()); 877 CmpInst::Predicate P; 878 879 if (eq->sign == 0) 880 P = ICmpInst::ICMP_EQ; 881 else if (eq->sign > 0) 882 P = ICmpInst::ICMP_SGE; 883 else 884 P = ICmpInst::ICMP_SLE; 885 886 return Builder.CreateICmp(P, LHS, RHS); 887 } 888 889 void ClastStmtCodeGen::codegen(const clast_guard *g) { 890 Function *F = Builder.GetInsertBlock()->getParent(); 891 LLVMContext &Context = F->getContext(); 892 893 BasicBlock *CondBB = 894 SplitBlock(Builder.GetInsertBlock(), Builder.GetInsertPoint(), P); 895 CondBB->setName("polly.cond"); 896 BasicBlock *MergeBB = SplitBlock(CondBB, CondBB->begin(), P); 897 MergeBB->setName("polly.merge"); 898 BasicBlock *ThenBB = BasicBlock::Create(Context, "polly.then", F); 899 900 DominatorTree &DT = P->getAnalysis<DominatorTree>(); 901 DT.addNewBlock(ThenBB, CondBB); 902 DT.changeImmediateDominator(MergeBB, CondBB); 903 904 CondBB->getTerminator()->eraseFromParent(); 905 906 Builder.SetInsertPoint(CondBB); 907 908 Value *Predicate = codegen(&(g->eq[0])); 909 910 for (int i = 1; i < g->n; ++i) { 911 Value *TmpPredicate = codegen(&(g->eq[i])); 912 Predicate = Builder.CreateAnd(Predicate, TmpPredicate); 913 } 914 915 Builder.CreateCondBr(Predicate, ThenBB, MergeBB); 916 Builder.SetInsertPoint(ThenBB); 917 Builder.CreateBr(MergeBB); 918 Builder.SetInsertPoint(ThenBB->begin()); 919 920 codegen(g->then); 921 922 Builder.SetInsertPoint(MergeBB->begin()); 923 } 924 925 void ClastStmtCodeGen::codegen(const clast_stmt *stmt) { 926 if (CLAST_STMT_IS_A(stmt, stmt_root)) 927 assert(false && "No second root statement expected"); 928 else if (CLAST_STMT_IS_A(stmt, stmt_ass)) 929 codegen((const clast_assignment *)stmt); 930 else if (CLAST_STMT_IS_A(stmt, stmt_user)) 931 codegen((const clast_user_stmt *)stmt); 932 else if (CLAST_STMT_IS_A(stmt, stmt_block)) 933 codegen((const clast_block *)stmt); 934 else if (CLAST_STMT_IS_A(stmt, stmt_for)) 935 codegen((const clast_for *)stmt); 936 else if (CLAST_STMT_IS_A(stmt, stmt_guard)) 937 codegen((const clast_guard *)stmt); 938 939 if (stmt->next) 940 codegen(stmt->next); 941 } 942 943 void ClastStmtCodeGen::addParameters(const CloogNames *names) { 944 SCEVExpander Rewriter(P->getAnalysis<ScalarEvolution>(), "polly"); 945 946 int i = 0; 947 for (Scop::param_iterator PI = S->param_begin(), PE = S->param_end(); 948 PI != PE; ++PI) { 949 assert(i < names->nb_parameters && "Not enough parameter names"); 950 951 const SCEV *Param = *PI; 952 Type *Ty = Param->getType(); 953 954 Instruction *insertLocation = --(Builder.GetInsertBlock()->end()); 955 Value *V = Rewriter.expandCodeFor(Param, Ty, insertLocation); 956 ClastVars[names->parameters[i]] = V; 957 958 ++i; 959 } 960 } 961 962 void ClastStmtCodeGen::codegen(const clast_root *r) { 963 addParameters(r->names); 964 965 parallelCodeGeneration = false; 966 967 const clast_stmt *stmt = (const clast_stmt *)r; 968 if (stmt->next) 969 codegen(stmt->next); 970 } 971 972 ClastStmtCodeGen::ClastStmtCodeGen(Scop *scop, IRBuilder<> &B, Pass *P) 973 : S(scop), P(P), Builder(B), ExpGen(Builder, ClastVars) { 974 } 975 976 namespace { 977 class CodeGeneration : public ScopPass { 978 std::vector<std::string> ParallelLoops; 979 980 public: 981 static char ID; 982 983 CodeGeneration() : ScopPass(ID) {} 984 985 bool runOnScop(Scop &S) { 986 ParallelLoops.clear(); 987 988 assert(S.getRegion().isSimple() && "Only simple regions are supported"); 989 990 BasicBlock *StartBlock = executeScopConditionally(S, this); 991 992 IRBuilder<> Builder(StartBlock->begin()); 993 994 ClastStmtCodeGen CodeGen(&S, Builder, this); 995 CloogInfo &C = getAnalysis<CloogInfo>(); 996 CodeGen.codegen(C.getClast()); 997 998 ParallelLoops.insert(ParallelLoops.begin(), 999 CodeGen.getParallelLoops().begin(), 1000 CodeGen.getParallelLoops().end()); 1001 return true; 1002 } 1003 1004 virtual void printScop(raw_ostream &OS) const { 1005 for (std::vector<std::string>::const_iterator PI = ParallelLoops.begin(), 1006 PE = ParallelLoops.end(); 1007 PI != PE; ++PI) 1008 OS << "Parallel loop with iterator '" << *PI << "' generated\n"; 1009 } 1010 1011 virtual void getAnalysisUsage(AnalysisUsage &AU) const { 1012 AU.addRequired<CloogInfo>(); 1013 AU.addRequired<Dependences>(); 1014 AU.addRequired<DominatorTree>(); 1015 AU.addRequired<RegionInfo>(); 1016 AU.addRequired<ScalarEvolution>(); 1017 AU.addRequired<ScopDetection>(); 1018 AU.addRequired<ScopInfo>(); 1019 AU.addRequired<DataLayout>(); 1020 AU.addRequired<LoopInfo>(); 1021 1022 AU.addPreserved<CloogInfo>(); 1023 AU.addPreserved<Dependences>(); 1024 1025 // FIXME: We do not create LoopInfo for the newly generated loops. 1026 AU.addPreserved<LoopInfo>(); 1027 AU.addPreserved<DominatorTree>(); 1028 AU.addPreserved<ScopDetection>(); 1029 AU.addPreserved<ScalarEvolution>(); 1030 1031 // FIXME: We do not yet add regions for the newly generated code to the 1032 // region tree. 1033 AU.addPreserved<RegionInfo>(); 1034 AU.addPreserved<TempScopInfo>(); 1035 AU.addPreserved<ScopInfo>(); 1036 AU.addPreservedID(IndependentBlocksID); 1037 } 1038 }; 1039 } 1040 1041 char CodeGeneration::ID = 1; 1042 1043 Pass *polly::createCodeGenerationPass() { 1044 return new CodeGeneration(); 1045 } 1046 1047 INITIALIZE_PASS_BEGIN(CodeGeneration, "polly-codegen", 1048 "Polly - Create LLVM-IR from SCoPs", false, false); 1049 INITIALIZE_PASS_DEPENDENCY(CloogInfo); 1050 INITIALIZE_PASS_DEPENDENCY(Dependences); 1051 INITIALIZE_PASS_DEPENDENCY(DominatorTree); 1052 INITIALIZE_PASS_DEPENDENCY(RegionInfo); 1053 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution); 1054 INITIALIZE_PASS_DEPENDENCY(ScopDetection); 1055 INITIALIZE_PASS_DEPENDENCY(DataLayout); 1056 INITIALIZE_PASS_END(CodeGeneration, "polly-codegen", 1057 "Polly - Create LLVM-IR from SCoPs", false, false) 1058 1059 #endif // CLOOG_FOUND 1060