1 //===------ CodeGeneration.cpp - Code generate the Scops. -----------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // The CodeGeneration pass takes a Scop created by ScopInfo and translates it
11 // back to LLVM-IR using Cloog.
12 //
13 // The Scop describes the high level memory behaviour of a control flow region.
14 // Transformation passes can update the schedule (execution order) of statements
15 // in the Scop. Cloog is used to generate an abstract syntax tree (clast) that
16 // reflects the updated execution order. This clast is used to create new
17 // LLVM-IR that is computational equivalent to the original control flow region,
18 // but executes its code in the new execution order defined by the changed
19 // scattering.
20 //
21 //===----------------------------------------------------------------------===//
22 
23 #include "polly/CodeGen/Cloog.h"
24 #ifdef CLOOG_FOUND
25 
26 #define DEBUG_TYPE "polly-codegen"
27 #include "polly/Dependences.h"
28 #include "polly/LinkAllPasses.h"
29 #include "polly/Options.h"
30 #include "polly/ScopInfo.h"
31 #include "polly/TempScopInfo.h"
32 #include "polly/CodeGen/CodeGeneration.h"
33 #include "polly/CodeGen/BlockGenerators.h"
34 #include "polly/CodeGen/LoopGenerators.h"
35 #include "polly/CodeGen/PTXGenerator.h"
36 #include "polly/CodeGen/Utils.h"
37 #include "polly/Support/GICHelper.h"
38 #include "polly/Support/ScopHelper.h"
39 
40 #include "llvm/IR/Module.h"
41 #include "llvm/ADT/SetVector.h"
42 #include "llvm/ADT/PostOrderIterator.h"
43 #include "llvm/Analysis/LoopInfo.h"
44 #include "llvm/Analysis/ScalarEvolutionExpander.h"
45 #include "llvm/Support/Debug.h"
46 #include "llvm/IR/DataLayout.h"
47 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
48 
49 #define CLOOG_INT_GMP 1
50 #include "cloog/cloog.h"
51 #include "cloog/isl/cloog.h"
52 
53 #include "isl/aff.h"
54 
55 #include <vector>
56 #include <utility>
57 
58 using namespace polly;
59 using namespace llvm;
60 
61 struct isl_set;
62 
63 namespace polly {
64 static cl::opt<bool>
65 OpenMP("enable-polly-openmp", cl::desc("Generate OpenMP parallel code"),
66        cl::value_desc("OpenMP code generation enabled if true"),
67        cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));
68 
69 #ifdef GPU_CODEGEN
70 static cl::opt<bool>
71 GPGPU("enable-polly-gpgpu", cl::desc("Generate GPU parallel code"), cl::Hidden,
72       cl::value_desc("GPGPU code generation enabled if true"), cl::init(false),
73       cl::ZeroOrMore, cl::cat(PollyCategory));
74 
75 static cl::opt<std::string>
76 GPUTriple("polly-gpgpu-triple",
77           cl::desc("Target triple for GPU code generation"), cl::Hidden,
78           cl::init(""), cl::cat(PollyCategory));
79 #endif /* GPU_CODEGEN */
80 
81 typedef DenseMap<const char *, Value *> CharMapT;
82 
83 /// Class to generate LLVM-IR that calculates the value of a clast_expr.
84 class ClastExpCodeGen {
85   IRBuilder<> &Builder;
86   const CharMapT &IVS;
87 
88   Value *codegen(const clast_name *e, Type *Ty);
89   Value *codegen(const clast_term *e, Type *Ty);
90   Value *codegen(const clast_binary *e, Type *Ty);
91   Value *codegen(const clast_reduction *r, Type *Ty);
92 
93 public:
94   // A generator for clast expressions.
95   //
96   // @param B The IRBuilder that defines where the code to calculate the
97   //          clast expressions should be inserted.
98   // @param IVMAP A Map that translates strings describing the induction
99   //              variables to the Values* that represent these variables
100   //              on the LLVM side.
101   ClastExpCodeGen(IRBuilder<> &B, CharMapT &IVMap);
102 
103   // Generates code to calculate a given clast expression.
104   //
105   // @param e The expression to calculate.
106   // @return The Value that holds the result.
107   Value *codegen(const clast_expr *e, Type *Ty);
108 };
109 
110 Value *ClastExpCodeGen::codegen(const clast_name *e, Type *Ty) {
111   CharMapT::const_iterator I = IVS.find(e->name);
112 
113   assert(I != IVS.end() && "Clast name not found");
114 
115   return Builder.CreateSExtOrBitCast(I->second, Ty);
116 }
117 
118 Value *ClastExpCodeGen::codegen(const clast_term *e, Type *Ty) {
119   APInt a = APInt_from_MPZ(e->val);
120 
121   Value *ConstOne = ConstantInt::get(Builder.getContext(), a);
122   ConstOne = Builder.CreateSExtOrBitCast(ConstOne, Ty);
123 
124   if (!e->var)
125     return ConstOne;
126 
127   Value *var = codegen(e->var, Ty);
128   return Builder.CreateMul(ConstOne, var);
129 }
130 
131 Value *ClastExpCodeGen::codegen(const clast_binary *e, Type *Ty) {
132   Value *LHS = codegen(e->LHS, Ty);
133 
134   APInt RHS_AP = APInt_from_MPZ(e->RHS);
135 
136   Value *RHS = ConstantInt::get(Builder.getContext(), RHS_AP);
137   RHS = Builder.CreateSExtOrBitCast(RHS, Ty);
138 
139   switch (e->type) {
140   case clast_bin_mod:
141     return Builder.CreateSRem(LHS, RHS);
142   case clast_bin_fdiv: {
143     // floord(n,d) ((n < 0) ? (n - d + 1) : n) / d
144     Value *One = ConstantInt::get(Ty, 1);
145     Value *Zero = ConstantInt::get(Ty, 0);
146     Value *Sum1 = Builder.CreateSub(LHS, RHS);
147     Value *Sum2 = Builder.CreateAdd(Sum1, One);
148     Value *isNegative = Builder.CreateICmpSLT(LHS, Zero);
149     Value *Dividend = Builder.CreateSelect(isNegative, Sum2, LHS);
150     return Builder.CreateSDiv(Dividend, RHS);
151   }
152   case clast_bin_cdiv: {
153     // ceild(n,d) ((n < 0) ? n : (n + d - 1)) / d
154     Value *One = ConstantInt::get(Ty, 1);
155     Value *Zero = ConstantInt::get(Ty, 0);
156     Value *Sum1 = Builder.CreateAdd(LHS, RHS);
157     Value *Sum2 = Builder.CreateSub(Sum1, One);
158     Value *isNegative = Builder.CreateICmpSLT(LHS, Zero);
159     Value *Dividend = Builder.CreateSelect(isNegative, LHS, Sum2);
160     return Builder.CreateSDiv(Dividend, RHS);
161   }
162   case clast_bin_div:
163     return Builder.CreateSDiv(LHS, RHS);
164   }
165 
166   llvm_unreachable("Unknown clast binary expression type");
167 }
168 
169 Value *ClastExpCodeGen::codegen(const clast_reduction *r, Type *Ty) {
170   assert((r->type == clast_red_min || r->type == clast_red_max ||
171           r->type == clast_red_sum) &&
172          "Clast reduction type not supported");
173   Value *old = codegen(r->elts[0], Ty);
174 
175   for (int i = 1; i < r->n; ++i) {
176     Value *exprValue = codegen(r->elts[i], Ty);
177 
178     switch (r->type) {
179     case clast_red_min: {
180       Value *cmp = Builder.CreateICmpSLT(old, exprValue);
181       old = Builder.CreateSelect(cmp, old, exprValue);
182       break;
183     }
184     case clast_red_max: {
185       Value *cmp = Builder.CreateICmpSGT(old, exprValue);
186       old = Builder.CreateSelect(cmp, old, exprValue);
187       break;
188     }
189     case clast_red_sum:
190       old = Builder.CreateAdd(old, exprValue);
191       break;
192     }
193   }
194 
195   return old;
196 }
197 
198 ClastExpCodeGen::ClastExpCodeGen(IRBuilder<> &B, CharMapT &IVMap)
199     : Builder(B), IVS(IVMap) {}
200 
201 Value *ClastExpCodeGen::codegen(const clast_expr *e, Type *Ty) {
202   switch (e->type) {
203   case clast_expr_name:
204     return codegen((const clast_name *)e, Ty);
205   case clast_expr_term:
206     return codegen((const clast_term *)e, Ty);
207   case clast_expr_bin:
208     return codegen((const clast_binary *)e, Ty);
209   case clast_expr_red:
210     return codegen((const clast_reduction *)e, Ty);
211   }
212 
213   llvm_unreachable("Unknown clast expression!");
214 }
215 
216 class ClastStmtCodeGen {
217 public:
218   const std::vector<std::string> &getParallelLoops();
219 
220 private:
221   // The Scop we code generate.
222   Scop *S;
223   Pass *P;
224 
225   // The Builder specifies the current location to code generate at.
226   IRBuilder<> &Builder;
227 
228   // Map the Values from the old code to their counterparts in the new code.
229   ValueMapT ValueMap;
230 
231   // Map the loops from the old code to expressions function of the induction
232   // variables in the new code.  For example, when the code generator produces
233   // this AST:
234   //
235   //   for (int c1 = 0; c1 <= 1023; c1 += 1)
236   //     for (int c2 = 0; c2 <= 1023; c2 += 1)
237   //       Stmt(c2 + 3, c1);
238   //
239   // LoopToScev is a map associating:
240   //   "outer loop in the old loop nest" -> SCEV("c2 + 3"),
241   //   "inner loop in the old loop nest" -> SCEV("c1").
242   LoopToScevMapT LoopToScev;
243 
244   // clastVars maps from the textual representation of a clast variable to its
245   // current *Value. clast variables are scheduling variables, original
246   // induction variables or parameters. They are used either in loop bounds or
247   // to define the statement instance that is executed.
248   //
249   //   for (s = 0; s < n + 3; ++i)
250   //     for (t = s; t < m; ++j)
251   //       Stmt(i = s + 3 * m, j = t);
252   //
253   // {s,t,i,j,n,m} is the set of clast variables in this clast.
254   CharMapT ClastVars;
255 
256   // Codegenerator for clast expressions.
257   ClastExpCodeGen ExpGen;
258 
259   // Do we currently generate parallel code?
260   bool parallelCodeGeneration;
261 
262   std::vector<std::string> parallelLoops;
263 
264   void codegen(const clast_assignment *a);
265 
266   void codegen(const clast_assignment *a, ScopStmt *Statement,
267                unsigned Dimension, int vectorDim,
268                std::vector<ValueMapT> *VectorVMap = 0,
269                std::vector<LoopToScevMapT> *VLTS = 0);
270 
271   void codegenSubstitutions(const clast_stmt *Assignment, ScopStmt *Statement,
272                             int vectorDim = 0,
273                             std::vector<ValueMapT> *VectorVMap = 0,
274                             std::vector<LoopToScevMapT> *VLTS = 0);
275 
276   void codegen(const clast_user_stmt *u, std::vector<Value *> *IVS = NULL,
277                const char *iterator = NULL,
278                __isl_take isl_set *scatteringDomain = 0);
279 
280   void codegen(const clast_block *b);
281 
282   /// @brief Create a classical sequential loop.
283   void codegenForSequential(const clast_for *f);
284 
285   /// @brief Create OpenMP structure values.
286   ///
287   /// Create a list of values that has to be stored into the OpenMP subfuncition
288   /// structure.
289   SetVector<Value *> getOMPValues(const clast_stmt *Body);
290 
291   /// @brief Update ClastVars and ValueMap according to a value map.
292   ///
293   /// @param VMap A map from old to new values.
294   void updateWithValueMap(OMPGenerator::ValueToValueMapTy &VMap);
295 
296   /// @brief Create an OpenMP parallel for loop.
297   ///
298   /// This loop reflects a loop as if it would have been created by an OpenMP
299   /// statement.
300   void codegenForOpenMP(const clast_for *f);
301 
302 #ifdef GPU_CODEGEN
303   /// @brief Create GPGPU device memory access values.
304   ///
305   /// Create a list of values that will be set to be parameters of the GPGPU
306   /// subfunction. These parameters represent device memory base addresses
307   /// and the size in bytes.
308   SetVector<Value *> getGPUValues(unsigned &OutputBytes);
309 
310   /// @brief Create a GPU parallel for loop.
311   ///
312   /// This loop reflects a loop as if it would have been created by a GPU
313   /// statement.
314   void codegenForGPGPU(const clast_for *F);
315 
316   /// @brief Get innermost for loop.
317   const clast_stmt *getScheduleInfo(const clast_for *F,
318                                     std::vector<int> &NumIters,
319                                     unsigned &LoopDepth,
320                                     unsigned &NonPLoopDepth);
321 #endif /* GPU_CODEGEN */
322 
323   /// @brief Check if a loop is parallel
324   ///
325   /// Detect if a clast_for loop can be executed in parallel.
326   ///
327   /// @param For The clast for loop to check.
328   ///
329   /// @return bool Returns true if the incoming clast_for statement can
330   ///              execute in parallel.
331   bool isParallelFor(const clast_for *For);
332 
333   bool isInnermostLoop(const clast_for *f);
334 
335   /// @brief Get the number of loop iterations for this loop.
336   /// @param f The clast for loop to check.
337   int getNumberOfIterations(const clast_for *f);
338 
339   /// @brief Create vector instructions for this loop.
340   void codegenForVector(const clast_for *f);
341 
342   void codegen(const clast_for *f);
343 
344   Value *codegen(const clast_equation *eq);
345 
346   void codegen(const clast_guard *g);
347 
348   void codegen(const clast_stmt *stmt);
349 
350   void addParameters(const CloogNames *names);
351 
352   IntegerType *getIntPtrTy();
353 
354 public:
355   void codegen(const clast_root *r);
356 
357   ClastStmtCodeGen(Scop *scop, IRBuilder<> &B, Pass *P);
358 };
359 }
360 
361 IntegerType *ClastStmtCodeGen::getIntPtrTy() {
362   return P->getAnalysis<DataLayout>().getIntPtrType(Builder.getContext());
363 }
364 
365 const std::vector<std::string> &ClastStmtCodeGen::getParallelLoops() {
366   return parallelLoops;
367 }
368 
369 void ClastStmtCodeGen::codegen(const clast_assignment *a) {
370   Value *V = ExpGen.codegen(a->RHS, getIntPtrTy());
371   ClastVars[a->LHS] = V;
372 }
373 
374 void ClastStmtCodeGen::codegen(const clast_assignment *A, ScopStmt *Stmt,
375                                unsigned Dim, int VectorDim,
376                                std::vector<ValueMapT> *VectorVMap,
377                                std::vector<LoopToScevMapT> *VLTS) {
378   Value *RHS;
379 
380   assert(!A->LHS && "Statement assignments do not have left hand side");
381 
382   RHS = ExpGen.codegen(A->RHS, Builder.getInt64Ty());
383 
384   const llvm::SCEV *URHS = S->getSE()->getUnknown(RHS);
385   if (VLTS)
386     (*VLTS)[VectorDim][Stmt->getLoopForDimension(Dim)] = URHS;
387   LoopToScev[Stmt->getLoopForDimension(Dim)] = URHS;
388 
389   const PHINode *PN = Stmt->getInductionVariableForDimension(Dim);
390   if (PN) {
391     RHS = Builder.CreateTruncOrBitCast(RHS, PN->getType());
392 
393     if (VectorVMap)
394       (*VectorVMap)[VectorDim][PN] = RHS;
395 
396     ValueMap[PN] = RHS;
397   }
398 }
399 
400 void ClastStmtCodeGen::codegenSubstitutions(const clast_stmt *Assignment,
401                                             ScopStmt *Statement, int vectorDim,
402                                             std::vector<ValueMapT> *VectorVMap,
403                                             std::vector<LoopToScevMapT> *VLTS) {
404   int Dimension = 0;
405 
406   while (Assignment) {
407     assert(CLAST_STMT_IS_A(Assignment, stmt_ass) &&
408            "Substitions are expected to be assignments");
409     codegen((const clast_assignment *)Assignment, Statement, Dimension,
410             vectorDim, VectorVMap, VLTS);
411     Assignment = Assignment->next;
412     Dimension++;
413   }
414 }
415 
416 // Takes the cloog specific domain and translates it into a map Statement ->
417 // PartialSchedule, where the PartialSchedule contains all the dimensions that
418 // have been code generated up to this point.
419 static __isl_give isl_map *extractPartialSchedule(ScopStmt *Statement,
420                                                   __isl_take isl_set *Domain) {
421   isl_map *Schedule = Statement->getScattering();
422   int ScheduledDimensions = isl_set_dim(Domain, isl_dim_set);
423   int UnscheduledDimensions =
424       isl_map_dim(Schedule, isl_dim_out) - ScheduledDimensions;
425 
426   isl_set_free(Domain);
427 
428   return isl_map_project_out(Schedule, isl_dim_out, ScheduledDimensions,
429                              UnscheduledDimensions);
430 }
431 
432 void ClastStmtCodeGen::codegen(const clast_user_stmt *u,
433                                std::vector<Value *> *IVS, const char *iterator,
434                                __isl_take isl_set *Domain) {
435   ScopStmt *Statement = (ScopStmt *)u->statement->usr;
436 
437   if (u->substitutions)
438     codegenSubstitutions(u->substitutions, Statement);
439 
440   int VectorDimensions = IVS ? IVS->size() : 1;
441 
442   if (VectorDimensions == 1) {
443     BlockGenerator::generate(Builder, *Statement, ValueMap, LoopToScev, P);
444     return;
445   }
446 
447   VectorValueMapT VectorMap(VectorDimensions);
448   std::vector<LoopToScevMapT> VLTS(VectorDimensions);
449 
450   if (IVS) {
451     assert(u->substitutions && "Substitutions expected!");
452     int i = 0;
453     for (std::vector<Value *>::iterator II = IVS->begin(), IE = IVS->end();
454          II != IE; ++II) {
455       ClastVars[iterator] = *II;
456       codegenSubstitutions(u->substitutions, Statement, i, &VectorMap, &VLTS);
457       i++;
458     }
459   }
460 
461   isl_map *Schedule = extractPartialSchedule(Statement, Domain);
462   VectorBlockGenerator::generate(Builder, *Statement, VectorMap, VLTS, Schedule,
463                                  P);
464   isl_map_free(Schedule);
465 }
466 
467 void ClastStmtCodeGen::codegen(const clast_block *b) {
468   if (b->body)
469     codegen(b->body);
470 }
471 
472 void ClastStmtCodeGen::codegenForSequential(const clast_for *f) {
473   Value *LowerBound, *UpperBound, *IV, *Stride;
474   BasicBlock *ExitBlock;
475   Type *IntPtrTy = getIntPtrTy();
476 
477   LowerBound = ExpGen.codegen(f->LB, IntPtrTy);
478   UpperBound = ExpGen.codegen(f->UB, IntPtrTy);
479   Stride = Builder.getInt(APInt_from_MPZ(f->stride));
480 
481   IV = createLoop(LowerBound, UpperBound, Stride, Builder, P, ExitBlock,
482                   CmpInst::ICMP_SLE);
483 
484   // Add loop iv to symbols.
485   ClastVars[f->iterator] = IV;
486 
487   if (f->body)
488     codegen(f->body);
489 
490   // Loop is finished, so remove its iv from the live symbols.
491   ClastVars.erase(f->iterator);
492   Builder.SetInsertPoint(ExitBlock->begin());
493 }
494 
495 // Helper class to determine all scalar parameters used in the basic blocks of a
496 // clast. Scalar parameters are scalar variables defined outside of the SCoP.
497 class ParameterVisitor : public ClastVisitor {
498   std::set<Value *> Values;
499 
500 public:
501   ParameterVisitor() : ClastVisitor(), Values() {}
502 
503   void visitUser(const clast_user_stmt *Stmt) {
504     const ScopStmt *S = static_cast<const ScopStmt *>(Stmt->statement->usr);
505     const BasicBlock *BB = S->getBasicBlock();
506 
507     // Check all the operands of instructions in the basic block.
508     for (BasicBlock::const_iterator BI = BB->begin(), BE = BB->end(); BI != BE;
509          ++BI) {
510       const Instruction &Inst = *BI;
511       for (Instruction::const_op_iterator II = Inst.op_begin(),
512                                           IE = Inst.op_end();
513            II != IE; ++II) {
514         Value *SrcVal = *II;
515 
516         if (Instruction *OpInst = dyn_cast<Instruction>(SrcVal))
517           if (S->getParent()->getRegion().contains(OpInst))
518             continue;
519 
520         if (isa<Instruction>(SrcVal) || isa<Argument>(SrcVal))
521           Values.insert(SrcVal);
522       }
523     }
524   }
525 
526   // Iterator to iterate over the values found.
527   typedef std::set<Value *>::const_iterator const_iterator;
528   inline const_iterator begin() const { return Values.begin(); }
529   inline const_iterator end() const { return Values.end(); }
530 };
531 
532 SetVector<Value *> ClastStmtCodeGen::getOMPValues(const clast_stmt *Body) {
533   SetVector<Value *> Values;
534 
535   // The clast variables
536   for (CharMapT::iterator I = ClastVars.begin(), E = ClastVars.end(); I != E;
537        I++)
538     Values.insert(I->second);
539 
540   // Find the temporaries that are referenced in the clast statements'
541   // basic blocks but are not defined by these blocks (e.g., references
542   // to function arguments or temporaries defined before the start of
543   // the SCoP).
544   ParameterVisitor Params;
545   Params.visit(Body);
546 
547   for (ParameterVisitor::const_iterator PI = Params.begin(), PE = Params.end();
548        PI != PE; ++PI) {
549     Value *V = *PI;
550     Values.insert(V);
551     DEBUG(dbgs() << "Adding temporary for OMP copy-in: " << *V << "\n");
552   }
553 
554   return Values;
555 }
556 
557 void
558 ClastStmtCodeGen::updateWithValueMap(OMPGenerator::ValueToValueMapTy &VMap) {
559   std::set<Value *> Inserted;
560 
561   for (CharMapT::iterator I = ClastVars.begin(), E = ClastVars.end(); I != E;
562        I++) {
563     ClastVars[I->first] = VMap[I->second];
564     Inserted.insert(I->second);
565   }
566 
567   for (OMPGenerator::ValueToValueMapTy::iterator I = VMap.begin(),
568                                                  E = VMap.end();
569        I != E; ++I) {
570     if (Inserted.count(I->first))
571       continue;
572 
573     ValueMap[I->first] = I->second;
574   }
575 }
576 
577 static void clearDomtree(Function *F, DominatorTree &DT) {
578   DomTreeNode *N = DT.getNode(&F->getEntryBlock());
579   std::vector<BasicBlock *> Nodes;
580   for (po_iterator<DomTreeNode *> I = po_begin(N), E = po_end(N); I != E; ++I)
581     Nodes.push_back(I->getBlock());
582 
583   for (std::vector<BasicBlock *>::iterator I = Nodes.begin(), E = Nodes.end();
584        I != E; ++I)
585     DT.eraseNode(*I);
586 }
587 
588 void ClastStmtCodeGen::codegenForOpenMP(const clast_for *For) {
589   Value *Stride, *LB, *UB, *IV;
590   BasicBlock::iterator LoopBody;
591   IntegerType *IntPtrTy = getIntPtrTy();
592   SetVector<Value *> Values;
593   OMPGenerator::ValueToValueMapTy VMap;
594   OMPGenerator OMPGen(Builder, P);
595 
596   Stride = Builder.getInt(APInt_from_MPZ(For->stride));
597   Stride = Builder.CreateSExtOrBitCast(Stride, IntPtrTy);
598   LB = ExpGen.codegen(For->LB, IntPtrTy);
599   UB = ExpGen.codegen(For->UB, IntPtrTy);
600 
601   Values = getOMPValues(For->body);
602 
603   IV = OMPGen.createParallelLoop(LB, UB, Stride, Values, VMap, &LoopBody);
604   BasicBlock::iterator AfterLoop = Builder.GetInsertPoint();
605   Builder.SetInsertPoint(LoopBody);
606 
607   // Save the current values.
608   const ValueMapT ValueMapCopy = ValueMap;
609   const CharMapT ClastVarsCopy = ClastVars;
610 
611   updateWithValueMap(VMap);
612   ClastVars[For->iterator] = IV;
613 
614   if (For->body)
615     codegen(For->body);
616 
617   // Restore the original values.
618   ValueMap = ValueMapCopy;
619   ClastVars = ClastVarsCopy;
620 
621   clearDomtree((*LoopBody).getParent()->getParent(),
622                P->getAnalysis<DominatorTree>());
623 
624   Builder.SetInsertPoint(AfterLoop);
625 }
626 
627 #ifdef GPU_CODEGEN
628 static unsigned getArraySizeInBytes(const ArrayType *AT) {
629   unsigned Bytes = AT->getNumElements();
630   if (const ArrayType *T = dyn_cast<ArrayType>(AT->getElementType()))
631     Bytes *= getArraySizeInBytes(T);
632   else
633     Bytes *= AT->getElementType()->getPrimitiveSizeInBits() / 8;
634 
635   return Bytes;
636 }
637 
638 SetVector<Value *> ClastStmtCodeGen::getGPUValues(unsigned &OutputBytes) {
639   SetVector<Value *> Values;
640   OutputBytes = 0;
641 
642   // Record the memory reference base addresses.
643   for (Scop::iterator SI = S->begin(), SE = S->end(); SI != SE; ++SI) {
644     ScopStmt *Stmt = *SI;
645     for (SmallVector<MemoryAccess *, 8>::iterator I = Stmt->memacc_begin(),
646                                                   E = Stmt->memacc_end();
647          I != E; ++I) {
648       Value *BaseAddr = const_cast<Value *>((*I)->getBaseAddr());
649       Values.insert((BaseAddr));
650 
651       // FIXME: we assume that there is one and only one array to be written
652       // in a SCoP.
653       int NumWrites = 0;
654       if ((*I)->isWrite()) {
655         ++NumWrites;
656         assert(NumWrites <= 1 &&
657                "We support at most one array to be written in a SCoP.");
658         if (const PointerType *PT =
659                 dyn_cast<PointerType>(BaseAddr->getType())) {
660           Type *T = PT->getArrayElementType();
661           const ArrayType *ATy = dyn_cast<ArrayType>(T);
662           OutputBytes = getArraySizeInBytes(ATy);
663         }
664       }
665     }
666   }
667 
668   return Values;
669 }
670 
671 const clast_stmt *ClastStmtCodeGen::getScheduleInfo(const clast_for *F,
672                                                     std::vector<int> &NumIters,
673                                                     unsigned &LoopDepth,
674                                                     unsigned &NonPLoopDepth) {
675   clast_stmt *Stmt = (clast_stmt *)F;
676   const clast_for *Result;
677   bool NonParaFlag = false;
678   LoopDepth = 0;
679   NonPLoopDepth = 0;
680 
681   while (Stmt) {
682     if (CLAST_STMT_IS_A(Stmt, stmt_for)) {
683       const clast_for *T = (clast_for *)Stmt;
684       if (isParallelFor(T)) {
685         if (!NonParaFlag) {
686           NumIters.push_back(getNumberOfIterations(T));
687           Result = T;
688         }
689       } else
690         NonParaFlag = true;
691 
692       Stmt = T->body;
693       LoopDepth++;
694       continue;
695     }
696     Stmt = Stmt->next;
697   }
698 
699   assert(NumIters.size() == 4 &&
700          "The loops should be tiled into 4-depth parallel loops and an "
701          "innermost non-parallel one (if exist).");
702   NonPLoopDepth = LoopDepth - NumIters.size();
703   assert(NonPLoopDepth <= 1 &&
704          "We support only one innermost non-parallel loop currently.");
705   return (const clast_stmt *)Result->body;
706 }
707 
708 void ClastStmtCodeGen::codegenForGPGPU(const clast_for *F) {
709   BasicBlock::iterator LoopBody;
710   SetVector<Value *> Values;
711   SetVector<Value *> IVS;
712   std::vector<int> NumIterations;
713   PTXGenerator::ValueToValueMapTy VMap;
714 
715   assert(!GPUTriple.empty() &&
716          "Target triple should be set properly for GPGPU code generation.");
717   PTXGenerator PTXGen(Builder, P, GPUTriple);
718 
719   // Get original IVS and ScopStmt
720   unsigned TiledLoopDepth, NonPLoopDepth;
721   const clast_stmt *InnerStmt =
722       getScheduleInfo(F, NumIterations, TiledLoopDepth, NonPLoopDepth);
723   const clast_stmt *TmpStmt;
724   const clast_user_stmt *U;
725   const clast_for *InnerFor;
726   if (CLAST_STMT_IS_A(InnerStmt, stmt_for)) {
727     InnerFor = (const clast_for *)InnerStmt;
728     TmpStmt = InnerFor->body;
729   } else
730     TmpStmt = InnerStmt;
731   U = (const clast_user_stmt *)TmpStmt;
732   ScopStmt *Statement = (ScopStmt *)U->statement->usr;
733   for (unsigned i = 0; i < Statement->getNumIterators() - NonPLoopDepth; i++) {
734     const Value *IV = Statement->getInductionVariableForDimension(i);
735     IVS.insert(const_cast<Value *>(IV));
736   }
737 
738   unsigned OutBytes;
739   Values = getGPUValues(OutBytes);
740   PTXGen.setOutputBytes(OutBytes);
741   PTXGen.startGeneration(Values, IVS, VMap, &LoopBody);
742 
743   BasicBlock::iterator AfterLoop = Builder.GetInsertPoint();
744   Builder.SetInsertPoint(LoopBody);
745 
746   BasicBlock *AfterBB = 0;
747   if (NonPLoopDepth) {
748     Value *LowerBound, *UpperBound, *IV, *Stride;
749     Type *IntPtrTy = getIntPtrTy();
750     LowerBound = ExpGen.codegen(InnerFor->LB, IntPtrTy);
751     UpperBound = ExpGen.codegen(InnerFor->UB, IntPtrTy);
752     Stride = Builder.getInt(APInt_from_MPZ(InnerFor->stride));
753     IV = createLoop(LowerBound, UpperBound, Stride, Builder, P, AfterBB,
754                     CmpInst::ICMP_SLE);
755     const Value *OldIV_ = Statement->getInductionVariableForDimension(2);
756     Value *OldIV = const_cast<Value *>(OldIV_);
757     VMap.insert(std::make_pair<Value *, Value *>(OldIV, IV));
758   }
759 
760   updateWithValueMap(VMap);
761 
762   BlockGenerator::generate(Builder, *Statement, ValueMap, P);
763 
764   if (AfterBB)
765     Builder.SetInsertPoint(AfterBB->begin());
766 
767   // FIXME: The replacement of the host base address with the parameter of ptx
768   // subfunction should have been done by updateWithValueMap. We use the
769   // following codes to avoid affecting other parts of Polly. This should be
770   // fixed later.
771   Function *FN = Builder.GetInsertBlock()->getParent();
772   for (unsigned j = 0; j < Values.size(); j++) {
773     Value *baseAddr = Values[j];
774     for (Function::iterator B = FN->begin(); B != FN->end(); ++B) {
775       for (BasicBlock::iterator I = B->begin(); I != B->end(); ++I)
776         I->replaceUsesOfWith(baseAddr, ValueMap[baseAddr]);
777     }
778   }
779   Builder.SetInsertPoint(AfterLoop);
780   PTXGen.setLaunchingParameters(NumIterations[0], NumIterations[1],
781                                 NumIterations[2], NumIterations[3]);
782   PTXGen.finishGeneration(FN);
783 }
784 #endif
785 
786 bool ClastStmtCodeGen::isInnermostLoop(const clast_for *f) {
787   const clast_stmt *stmt = f->body;
788 
789   while (stmt) {
790     if (!CLAST_STMT_IS_A(stmt, stmt_user))
791       return false;
792 
793     stmt = stmt->next;
794   }
795 
796   return true;
797 }
798 
799 int ClastStmtCodeGen::getNumberOfIterations(const clast_for *For) {
800   isl_set *LoopDomain = isl_set_copy(isl_set_from_cloog_domain(For->domain));
801   int NumberOfIterations = polly::getNumberOfIterations(LoopDomain);
802   if (NumberOfIterations == -1)
803     return -1;
804   return NumberOfIterations / isl_int_get_si(For->stride) + 1;
805 }
806 
807 void ClastStmtCodeGen::codegenForVector(const clast_for *F) {
808   DEBUG(dbgs() << "Vectorizing loop '" << F->iterator << "'\n";);
809   int VectorWidth = getNumberOfIterations(F);
810 
811   Value *LB = ExpGen.codegen(F->LB, getIntPtrTy());
812 
813   APInt Stride = APInt_from_MPZ(F->stride);
814   IntegerType *LoopIVType = dyn_cast<IntegerType>(LB->getType());
815   Stride = Stride.zext(LoopIVType->getBitWidth());
816   Value *StrideValue = ConstantInt::get(LoopIVType, Stride);
817 
818   std::vector<Value *> IVS(VectorWidth);
819   IVS[0] = LB;
820 
821   for (int i = 1; i < VectorWidth; i++)
822     IVS[i] = Builder.CreateAdd(IVS[i - 1], StrideValue, "p_vector_iv");
823 
824   isl_set *Domain = isl_set_copy(isl_set_from_cloog_domain(F->domain));
825 
826   // Add loop iv to symbols.
827   ClastVars[F->iterator] = LB;
828 
829   const clast_stmt *Stmt = F->body;
830 
831   while (Stmt) {
832     codegen((const clast_user_stmt *)Stmt, &IVS, F->iterator,
833             isl_set_copy(Domain));
834     Stmt = Stmt->next;
835   }
836 
837   // Loop is finished, so remove its iv from the live symbols.
838   isl_set_free(Domain);
839   ClastVars.erase(F->iterator);
840 }
841 
842 bool ClastStmtCodeGen::isParallelFor(const clast_for *f) {
843   isl_set *Domain = isl_set_copy(isl_set_from_cloog_domain(f->domain));
844   assert(Domain && "Cannot access domain of loop");
845 
846   Dependences &D = P->getAnalysis<Dependences>();
847 
848   return D.isParallelDimension(Domain, isl_set_n_dim(Domain));
849 }
850 
851 void ClastStmtCodeGen::codegen(const clast_for *f) {
852   bool Vector = PollyVectorizerChoice != VECTORIZER_NONE;
853   if ((Vector || OpenMP) && isParallelFor(f)) {
854     if (Vector && isInnermostLoop(f) && (-1 != getNumberOfIterations(f)) &&
855         (getNumberOfIterations(f) <= 16)) {
856       codegenForVector(f);
857       return;
858     }
859 
860     if (OpenMP && !parallelCodeGeneration) {
861       parallelCodeGeneration = true;
862       parallelLoops.push_back(f->iterator);
863       codegenForOpenMP(f);
864       parallelCodeGeneration = false;
865       return;
866     }
867   }
868 
869 #ifdef GPU_CODEGEN
870   if (GPGPU && isParallelFor(f)) {
871     if (!parallelCodeGeneration) {
872       parallelCodeGeneration = true;
873       parallelLoops.push_back(f->iterator);
874       codegenForGPGPU(f);
875       parallelCodeGeneration = false;
876       return;
877     }
878   }
879 #endif
880 
881   codegenForSequential(f);
882 }
883 
884 Value *ClastStmtCodeGen::codegen(const clast_equation *eq) {
885   Value *LHS = ExpGen.codegen(eq->LHS, getIntPtrTy());
886   Value *RHS = ExpGen.codegen(eq->RHS, getIntPtrTy());
887   CmpInst::Predicate P;
888 
889   if (eq->sign == 0)
890     P = ICmpInst::ICMP_EQ;
891   else if (eq->sign > 0)
892     P = ICmpInst::ICMP_SGE;
893   else
894     P = ICmpInst::ICMP_SLE;
895 
896   return Builder.CreateICmp(P, LHS, RHS);
897 }
898 
899 void ClastStmtCodeGen::codegen(const clast_guard *g) {
900   Function *F = Builder.GetInsertBlock()->getParent();
901   LLVMContext &Context = F->getContext();
902 
903   BasicBlock *CondBB =
904       SplitBlock(Builder.GetInsertBlock(), Builder.GetInsertPoint(), P);
905   CondBB->setName("polly.cond");
906   BasicBlock *MergeBB = SplitBlock(CondBB, CondBB->begin(), P);
907   MergeBB->setName("polly.merge");
908   BasicBlock *ThenBB = BasicBlock::Create(Context, "polly.then", F);
909 
910   DominatorTree &DT = P->getAnalysis<DominatorTree>();
911   DT.addNewBlock(ThenBB, CondBB);
912   DT.changeImmediateDominator(MergeBB, CondBB);
913 
914   CondBB->getTerminator()->eraseFromParent();
915 
916   Builder.SetInsertPoint(CondBB);
917 
918   Value *Predicate = codegen(&(g->eq[0]));
919 
920   for (int i = 1; i < g->n; ++i) {
921     Value *TmpPredicate = codegen(&(g->eq[i]));
922     Predicate = Builder.CreateAnd(Predicate, TmpPredicate);
923   }
924 
925   Builder.CreateCondBr(Predicate, ThenBB, MergeBB);
926   Builder.SetInsertPoint(ThenBB);
927   Builder.CreateBr(MergeBB);
928   Builder.SetInsertPoint(ThenBB->begin());
929 
930   LoopInfo &LI = P->getAnalysis<LoopInfo>();
931   Loop *L = LI.getLoopFor(CondBB);
932   if (L)
933     L->addBasicBlockToLoop(ThenBB, LI.getBase());
934 
935   codegen(g->then);
936 
937   Builder.SetInsertPoint(MergeBB->begin());
938 }
939 
940 void ClastStmtCodeGen::codegen(const clast_stmt *stmt) {
941   if (CLAST_STMT_IS_A(stmt, stmt_root))
942     assert(false && "No second root statement expected");
943   else if (CLAST_STMT_IS_A(stmt, stmt_ass))
944     codegen((const clast_assignment *)stmt);
945   else if (CLAST_STMT_IS_A(stmt, stmt_user))
946     codegen((const clast_user_stmt *)stmt);
947   else if (CLAST_STMT_IS_A(stmt, stmt_block))
948     codegen((const clast_block *)stmt);
949   else if (CLAST_STMT_IS_A(stmt, stmt_for))
950     codegen((const clast_for *)stmt);
951   else if (CLAST_STMT_IS_A(stmt, stmt_guard))
952     codegen((const clast_guard *)stmt);
953 
954   if (stmt->next)
955     codegen(stmt->next);
956 }
957 
958 void ClastStmtCodeGen::addParameters(const CloogNames *names) {
959   SCEVExpander Rewriter(P->getAnalysis<ScalarEvolution>(), "polly");
960 
961   int i = 0;
962   for (Scop::param_iterator PI = S->param_begin(), PE = S->param_end();
963        PI != PE; ++PI) {
964     assert(i < names->nb_parameters && "Not enough parameter names");
965 
966     const SCEV *Param = *PI;
967     Type *Ty = Param->getType();
968 
969     Instruction *insertLocation = --(Builder.GetInsertBlock()->end());
970     Value *V = Rewriter.expandCodeFor(Param, Ty, insertLocation);
971     ClastVars[names->parameters[i]] = V;
972 
973     ++i;
974   }
975 }
976 
977 void ClastStmtCodeGen::codegen(const clast_root *r) {
978   addParameters(r->names);
979 
980   parallelCodeGeneration = false;
981 
982   const clast_stmt *stmt = (const clast_stmt *)r;
983   if (stmt->next)
984     codegen(stmt->next);
985 }
986 
987 ClastStmtCodeGen::ClastStmtCodeGen(Scop *scop, IRBuilder<> &B, Pass *P)
988     : S(scop), P(P), Builder(B), ExpGen(Builder, ClastVars) {}
989 
990 namespace {
991 class CodeGeneration : public ScopPass {
992   std::vector<std::string> ParallelLoops;
993 
994 public:
995   static char ID;
996 
997   CodeGeneration() : ScopPass(ID) {}
998 
999   bool runOnScop(Scop &S) {
1000     ParallelLoops.clear();
1001 
1002     assert(!S.getRegion().isTopLevelRegion() &&
1003            "Top level regions are not supported");
1004 
1005     simplifyRegion(&S, this);
1006 
1007     BasicBlock *StartBlock = executeScopConditionally(S, this);
1008 
1009     IRBuilder<> Builder(StartBlock->begin());
1010 
1011     ClastStmtCodeGen CodeGen(&S, Builder, this);
1012     CloogInfo &C = getAnalysis<CloogInfo>();
1013     CodeGen.codegen(C.getClast());
1014 
1015     ParallelLoops.insert(ParallelLoops.begin(),
1016                          CodeGen.getParallelLoops().begin(),
1017                          CodeGen.getParallelLoops().end());
1018     return true;
1019   }
1020 
1021   virtual void printScop(raw_ostream &OS) const {
1022     for (std::vector<std::string>::const_iterator PI = ParallelLoops.begin(),
1023                                                   PE = ParallelLoops.end();
1024          PI != PE; ++PI)
1025       OS << "Parallel loop with iterator '" << *PI << "' generated\n";
1026   }
1027 
1028   virtual void getAnalysisUsage(AnalysisUsage &AU) const {
1029     AU.addRequired<CloogInfo>();
1030     AU.addRequired<Dependences>();
1031     AU.addRequired<DominatorTree>();
1032     AU.addRequired<RegionInfo>();
1033     AU.addRequired<ScalarEvolution>();
1034     AU.addRequired<ScopDetection>();
1035     AU.addRequired<ScopInfo>();
1036     AU.addRequired<DataLayout>();
1037     AU.addRequired<LoopInfo>();
1038 
1039     AU.addPreserved<CloogInfo>();
1040     AU.addPreserved<Dependences>();
1041     AU.addPreserved<LoopInfo>();
1042     AU.addPreserved<DominatorTree>();
1043     AU.addPreserved<ScopDetection>();
1044     AU.addPreserved<ScalarEvolution>();
1045 
1046     // FIXME: We do not yet add regions for the newly generated code to the
1047     //        region tree.
1048     AU.addPreserved<RegionInfo>();
1049     AU.addPreserved<TempScopInfo>();
1050     AU.addPreserved<ScopInfo>();
1051     AU.addPreservedID(IndependentBlocksID);
1052   }
1053 };
1054 }
1055 
1056 char CodeGeneration::ID = 1;
1057 
1058 Pass *polly::createCodeGenerationPass() { return new CodeGeneration(); }
1059 
1060 INITIALIZE_PASS_BEGIN(CodeGeneration, "polly-codegen",
1061                       "Polly - Create LLVM-IR from SCoPs", false, false);
1062 INITIALIZE_PASS_DEPENDENCY(CloogInfo);
1063 INITIALIZE_PASS_DEPENDENCY(Dependences);
1064 INITIALIZE_PASS_DEPENDENCY(DominatorTree);
1065 INITIALIZE_PASS_DEPENDENCY(RegionInfo);
1066 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution);
1067 INITIALIZE_PASS_DEPENDENCY(ScopDetection);
1068 INITIALIZE_PASS_DEPENDENCY(DataLayout);
1069 INITIALIZE_PASS_END(CodeGeneration, "polly-codegen",
1070                     "Polly - Create LLVM-IR from SCoPs", false, false)
1071 
1072 #endif // CLOOG_FOUND
1073