1 //===------ CodeGeneration.cpp - Code generate the Scops. -----------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // The CodeGeneration pass takes a Scop created by ScopInfo and translates it
11 // back to LLVM-IR using Cloog.
12 //
13 // The Scop describes the high level memory behaviour of a control flow region.
14 // Transformation passes can update the schedule (execution order) of statements
15 // in the Scop. Cloog is used to generate an abstract syntax tree (clast) that
16 // reflects the updated execution order. This clast is used to create new
17 // LLVM-IR that is computational equivalent to the original control flow region,
18 // but executes its code in the new execution order defined by the changed
19 // scattering.
20 //
21 //===----------------------------------------------------------------------===//
22 
23 #include "polly/CodeGen/Cloog.h"
24 #ifdef CLOOG_FOUND
25 
26 #define DEBUG_TYPE "polly-codegen"
27 #include "polly/Dependences.h"
28 #include "polly/LinkAllPasses.h"
29 #include "polly/Options.h"
30 #include "polly/ScopInfo.h"
31 #include "polly/TempScopInfo.h"
32 #include "polly/CodeGen/CodeGeneration.h"
33 #include "polly/CodeGen/BlockGenerators.h"
34 #include "polly/CodeGen/LoopGenerators.h"
35 #include "polly/CodeGen/PTXGenerator.h"
36 #include "polly/CodeGen/Utils.h"
37 #include "polly/Support/GICHelper.h"
38 #include "polly/Support/ScopHelper.h"
39 
40 #include "llvm/IR/Module.h"
41 #include "llvm/ADT/SetVector.h"
42 #include "llvm/ADT/PostOrderIterator.h"
43 #include "llvm/Analysis/LoopInfo.h"
44 #include "llvm/Analysis/ScalarEvolutionExpander.h"
45 #include "llvm/Support/Debug.h"
46 #include "llvm/IR/DataLayout.h"
47 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
48 
49 #define CLOOG_INT_GMP 1
50 #include "cloog/cloog.h"
51 #include "cloog/isl/cloog.h"
52 
53 #include "isl/aff.h"
54 
55 #include <vector>
56 #include <utility>
57 
58 using namespace polly;
59 using namespace llvm;
60 
61 struct isl_set;
62 
63 namespace polly {
64 static cl::opt<bool>
65 OpenMP("enable-polly-openmp", cl::desc("Generate OpenMP parallel code"),
66        cl::value_desc("OpenMP code generation enabled if true"),
67        cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));
68 
69 #ifdef GPU_CODEGEN
70 static cl::opt<bool>
71 GPGPU("enable-polly-gpgpu", cl::desc("Generate GPU parallel code"), cl::Hidden,
72       cl::value_desc("GPGPU code generation enabled if true"), cl::init(false),
73       cl::ZeroOrMore, cl::cat(PollyCategory));
74 
75 static cl::opt<std::string>
76 GPUTriple("polly-gpgpu-triple",
77           cl::desc("Target triple for GPU code generation"), cl::Hidden,
78           cl::init(""), cl::cat(PollyCategory));
79 #endif /* GPU_CODEGEN */
80 
81 typedef DenseMap<const char *, Value *> CharMapT;
82 
83 /// Class to generate LLVM-IR that calculates the value of a clast_expr.
84 class ClastExpCodeGen {
85   IRBuilder<> &Builder;
86   const CharMapT &IVS;
87 
88   Value *codegen(const clast_name *e, Type *Ty);
89   Value *codegen(const clast_term *e, Type *Ty);
90   Value *codegen(const clast_binary *e, Type *Ty);
91   Value *codegen(const clast_reduction *r, Type *Ty);
92 
93 public:
94 
95   // A generator for clast expressions.
96   //
97   // @param B The IRBuilder that defines where the code to calculate the
98   //          clast expressions should be inserted.
99   // @param IVMAP A Map that translates strings describing the induction
100   //              variables to the Values* that represent these variables
101   //              on the LLVM side.
102   ClastExpCodeGen(IRBuilder<> &B, CharMapT &IVMap);
103 
104   // Generates code to calculate a given clast expression.
105   //
106   // @param e The expression to calculate.
107   // @return The Value that holds the result.
108   Value *codegen(const clast_expr *e, Type *Ty);
109 };
110 
111 Value *ClastExpCodeGen::codegen(const clast_name *e, Type *Ty) {
112   CharMapT::const_iterator I = IVS.find(e->name);
113 
114   assert(I != IVS.end() && "Clast name not found");
115 
116   return Builder.CreateSExtOrBitCast(I->second, Ty);
117 }
118 
119 Value *ClastExpCodeGen::codegen(const clast_term *e, Type *Ty) {
120   APInt a = APInt_from_MPZ(e->val);
121 
122   Value *ConstOne = ConstantInt::get(Builder.getContext(), a);
123   ConstOne = Builder.CreateSExtOrBitCast(ConstOne, Ty);
124 
125   if (!e->var)
126     return ConstOne;
127 
128   Value *var = codegen(e->var, Ty);
129   return Builder.CreateMul(ConstOne, var);
130 }
131 
132 Value *ClastExpCodeGen::codegen(const clast_binary *e, Type *Ty) {
133   Value *LHS = codegen(e->LHS, Ty);
134 
135   APInt RHS_AP = APInt_from_MPZ(e->RHS);
136 
137   Value *RHS = ConstantInt::get(Builder.getContext(), RHS_AP);
138   RHS = Builder.CreateSExtOrBitCast(RHS, Ty);
139 
140   switch (e->type) {
141   case clast_bin_mod:
142     return Builder.CreateSRem(LHS, RHS);
143   case clast_bin_fdiv: {
144     // floord(n,d) ((n < 0) ? (n - d + 1) : n) / d
145     Value *One = ConstantInt::get(Ty, 1);
146     Value *Zero = ConstantInt::get(Ty, 0);
147     Value *Sum1 = Builder.CreateSub(LHS, RHS);
148     Value *Sum2 = Builder.CreateAdd(Sum1, One);
149     Value *isNegative = Builder.CreateICmpSLT(LHS, Zero);
150     Value *Dividend = Builder.CreateSelect(isNegative, Sum2, LHS);
151     return Builder.CreateSDiv(Dividend, RHS);
152   }
153   case clast_bin_cdiv: {
154     // ceild(n,d) ((n < 0) ? n : (n + d - 1)) / d
155     Value *One = ConstantInt::get(Ty, 1);
156     Value *Zero = ConstantInt::get(Ty, 0);
157     Value *Sum1 = Builder.CreateAdd(LHS, RHS);
158     Value *Sum2 = Builder.CreateSub(Sum1, One);
159     Value *isNegative = Builder.CreateICmpSLT(LHS, Zero);
160     Value *Dividend = Builder.CreateSelect(isNegative, LHS, Sum2);
161     return Builder.CreateSDiv(Dividend, RHS);
162   }
163   case clast_bin_div:
164     return Builder.CreateSDiv(LHS, RHS);
165   }
166 
167   llvm_unreachable("Unknown clast binary expression type");
168 }
169 
170 Value *ClastExpCodeGen::codegen(const clast_reduction *r, Type *Ty) {
171   assert((r->type == clast_red_min || r->type == clast_red_max ||
172           r->type == clast_red_sum) &&
173          "Clast reduction type not supported");
174   Value *old = codegen(r->elts[0], Ty);
175 
176   for (int i = 1; i < r->n; ++i) {
177     Value *exprValue = codegen(r->elts[i], Ty);
178 
179     switch (r->type) {
180     case clast_red_min: {
181       Value *cmp = Builder.CreateICmpSLT(old, exprValue);
182       old = Builder.CreateSelect(cmp, old, exprValue);
183       break;
184     }
185     case clast_red_max: {
186       Value *cmp = Builder.CreateICmpSGT(old, exprValue);
187       old = Builder.CreateSelect(cmp, old, exprValue);
188       break;
189     }
190     case clast_red_sum:
191       old = Builder.CreateAdd(old, exprValue);
192       break;
193     }
194   }
195 
196   return old;
197 }
198 
199 ClastExpCodeGen::ClastExpCodeGen(IRBuilder<> &B, CharMapT &IVMap)
200     : Builder(B), IVS(IVMap) {}
201 
202 Value *ClastExpCodeGen::codegen(const clast_expr *e, Type *Ty) {
203   switch (e->type) {
204   case clast_expr_name:
205     return codegen((const clast_name *)e, Ty);
206   case clast_expr_term:
207     return codegen((const clast_term *)e, Ty);
208   case clast_expr_bin:
209     return codegen((const clast_binary *)e, Ty);
210   case clast_expr_red:
211     return codegen((const clast_reduction *)e, Ty);
212   }
213 
214   llvm_unreachable("Unknown clast expression!");
215 }
216 
217 class ClastStmtCodeGen {
218 public:
219   const std::vector<std::string> &getParallelLoops();
220 
221 private:
222   // The Scop we code generate.
223   Scop *S;
224   Pass *P;
225 
226   // The Builder specifies the current location to code generate at.
227   IRBuilder<> &Builder;
228 
229   // Map the Values from the old code to their counterparts in the new code.
230   ValueMapT ValueMap;
231 
232   // Map the loops from the old code to expressions function of the induction
233   // variables in the new code.  For example, when the code generator produces
234   // this AST:
235   //
236   //   for (int c1 = 0; c1 <= 1023; c1 += 1)
237   //     for (int c2 = 0; c2 <= 1023; c2 += 1)
238   //       Stmt(c2 + 3, c1);
239   //
240   // LoopToScev is a map associating:
241   //   "outer loop in the old loop nest" -> SCEV("c2 + 3"),
242   //   "inner loop in the old loop nest" -> SCEV("c1").
243   LoopToScevMapT LoopToScev;
244 
245   // clastVars maps from the textual representation of a clast variable to its
246   // current *Value. clast variables are scheduling variables, original
247   // induction variables or parameters. They are used either in loop bounds or
248   // to define the statement instance that is executed.
249   //
250   //   for (s = 0; s < n + 3; ++i)
251   //     for (t = s; t < m; ++j)
252   //       Stmt(i = s + 3 * m, j = t);
253   //
254   // {s,t,i,j,n,m} is the set of clast variables in this clast.
255   CharMapT ClastVars;
256 
257   // Codegenerator for clast expressions.
258   ClastExpCodeGen ExpGen;
259 
260   // Do we currently generate parallel code?
261   bool parallelCodeGeneration;
262 
263   std::vector<std::string> parallelLoops;
264 
265   void codegen(const clast_assignment *a);
266 
267   void codegen(const clast_assignment *a, ScopStmt *Statement,
268                unsigned Dimension, int vectorDim,
269                std::vector<ValueMapT> *VectorVMap = 0,
270                std::vector<LoopToScevMapT> *VLTS = 0);
271 
272   void codegenSubstitutions(const clast_stmt *Assignment, ScopStmt *Statement,
273                             int vectorDim = 0,
274                             std::vector<ValueMapT> *VectorVMap = 0,
275                             std::vector<LoopToScevMapT> *VLTS = 0);
276 
277   void codegen(const clast_user_stmt *u, std::vector<Value *> *IVS = NULL,
278                const char *iterator = NULL,
279                __isl_take isl_set *scatteringDomain = 0);
280 
281   void codegen(const clast_block *b);
282 
283   /// @brief Create a classical sequential loop.
284   void codegenForSequential(const clast_for *f);
285 
286   /// @brief Create OpenMP structure values.
287   ///
288   /// Create a list of values that has to be stored into the OpenMP subfuncition
289   /// structure.
290   SetVector<Value *> getOMPValues(const clast_stmt *Body);
291 
292   /// @brief Update ClastVars and ValueMap according to a value map.
293   ///
294   /// @param VMap A map from old to new values.
295   void updateWithValueMap(OMPGenerator::ValueToValueMapTy &VMap);
296 
297   /// @brief Create an OpenMP parallel for loop.
298   ///
299   /// This loop reflects a loop as if it would have been created by an OpenMP
300   /// statement.
301   void codegenForOpenMP(const clast_for *f);
302 
303 #ifdef GPU_CODEGEN
304   /// @brief Create GPGPU device memory access values.
305   ///
306   /// Create a list of values that will be set to be parameters of the GPGPU
307   /// subfunction. These parameters represent device memory base addresses
308   /// and the size in bytes.
309   SetVector<Value *> getGPUValues(unsigned &OutputBytes);
310 
311   /// @brief Create a GPU parallel for loop.
312   ///
313   /// This loop reflects a loop as if it would have been created by a GPU
314   /// statement.
315   void codegenForGPGPU(const clast_for *F);
316 
317   /// @brief Get innermost for loop.
318   const clast_stmt *getScheduleInfo(const clast_for *F,
319                                     std::vector<int> &NumIters,
320                                     unsigned &LoopDepth,
321                                     unsigned &NonPLoopDepth);
322 #endif /* GPU_CODEGEN */
323 
324   /// @brief Check if a loop is parallel
325   ///
326   /// Detect if a clast_for loop can be executed in parallel.
327   ///
328   /// @param For The clast for loop to check.
329   ///
330   /// @return bool Returns true if the incoming clast_for statement can
331   ///              execute in parallel.
332   bool isParallelFor(const clast_for *For);
333 
334   bool isInnermostLoop(const clast_for *f);
335 
336   /// @brief Get the number of loop iterations for this loop.
337   /// @param f The clast for loop to check.
338   int getNumberOfIterations(const clast_for *f);
339 
340   /// @brief Create vector instructions for this loop.
341   void codegenForVector(const clast_for *f);
342 
343   void codegen(const clast_for *f);
344 
345   Value *codegen(const clast_equation *eq);
346 
347   void codegen(const clast_guard *g);
348 
349   void codegen(const clast_stmt *stmt);
350 
351   void addParameters(const CloogNames *names);
352 
353   IntegerType *getIntPtrTy();
354 
355 public:
356   void codegen(const clast_root *r);
357 
358   ClastStmtCodeGen(Scop *scop, IRBuilder<> &B, Pass *P);
359 };
360 }
361 
362 IntegerType *ClastStmtCodeGen::getIntPtrTy() {
363   return P->getAnalysis<DataLayout>().getIntPtrType(Builder.getContext());
364 }
365 
366 const std::vector<std::string> &ClastStmtCodeGen::getParallelLoops() {
367   return parallelLoops;
368 }
369 
370 void ClastStmtCodeGen::codegen(const clast_assignment *a) {
371   Value *V = ExpGen.codegen(a->RHS, getIntPtrTy());
372   ClastVars[a->LHS] = V;
373 }
374 
375 void ClastStmtCodeGen::codegen(const clast_assignment *A, ScopStmt *Stmt,
376                                unsigned Dim, int VectorDim,
377                                std::vector<ValueMapT> *VectorVMap,
378                                std::vector<LoopToScevMapT> *VLTS) {
379   Value *RHS;
380 
381   assert(!A->LHS && "Statement assignments do not have left hand side");
382 
383   RHS = ExpGen.codegen(A->RHS, Builder.getInt64Ty());
384 
385   const llvm::SCEV *URHS = S->getSE()->getUnknown(RHS);
386   if (VLTS)
387     (*VLTS)[VectorDim][Stmt->getLoopForDimension(Dim)] = URHS;
388   LoopToScev[Stmt->getLoopForDimension(Dim)] = URHS;
389 
390   const PHINode *PN = Stmt->getInductionVariableForDimension(Dim);
391   if (PN) {
392     RHS = Builder.CreateTruncOrBitCast(RHS, PN->getType());
393 
394     if (VectorVMap)
395       (*VectorVMap)[VectorDim][PN] = RHS;
396 
397     ValueMap[PN] = RHS;
398   }
399 }
400 
401 void ClastStmtCodeGen::codegenSubstitutions(const clast_stmt *Assignment,
402                                             ScopStmt *Statement, int vectorDim,
403                                             std::vector<ValueMapT> *VectorVMap,
404                                             std::vector<LoopToScevMapT> *VLTS) {
405   int Dimension = 0;
406 
407   while (Assignment) {
408     assert(CLAST_STMT_IS_A(Assignment, stmt_ass) &&
409            "Substitions are expected to be assignments");
410     codegen((const clast_assignment *)Assignment, Statement, Dimension,
411             vectorDim, VectorVMap, VLTS);
412     Assignment = Assignment->next;
413     Dimension++;
414   }
415 }
416 
417 // Takes the cloog specific domain and translates it into a map Statement ->
418 // PartialSchedule, where the PartialSchedule contains all the dimensions that
419 // have been code generated up to this point.
420 static __isl_give isl_map *extractPartialSchedule(ScopStmt *Statement,
421                                                   __isl_take isl_set *Domain) {
422   isl_map *Schedule = Statement->getScattering();
423   int ScheduledDimensions = isl_set_dim(Domain, isl_dim_set);
424   int UnscheduledDimensions =
425       isl_map_dim(Schedule, isl_dim_out) - ScheduledDimensions;
426 
427   isl_set_free(Domain);
428 
429   return isl_map_project_out(Schedule, isl_dim_out, ScheduledDimensions,
430                              UnscheduledDimensions);
431 }
432 
433 void ClastStmtCodeGen::codegen(const clast_user_stmt *u,
434                                std::vector<Value *> *IVS, const char *iterator,
435                                __isl_take isl_set *Domain) {
436   ScopStmt *Statement = (ScopStmt *)u->statement->usr;
437 
438   if (u->substitutions)
439     codegenSubstitutions(u->substitutions, Statement);
440 
441   int VectorDimensions = IVS ? IVS->size() : 1;
442 
443   if (VectorDimensions == 1) {
444     BlockGenerator::generate(Builder, *Statement, ValueMap, LoopToScev, P);
445     return;
446   }
447 
448   VectorValueMapT VectorMap(VectorDimensions);
449   std::vector<LoopToScevMapT> VLTS(VectorDimensions);
450 
451   if (IVS) {
452     assert(u->substitutions && "Substitutions expected!");
453     int i = 0;
454     for (std::vector<Value *>::iterator II = IVS->begin(), IE = IVS->end();
455          II != IE; ++II) {
456       ClastVars[iterator] = *II;
457       codegenSubstitutions(u->substitutions, Statement, i, &VectorMap, &VLTS);
458       i++;
459     }
460   }
461 
462   isl_map *Schedule = extractPartialSchedule(Statement, Domain);
463   VectorBlockGenerator::generate(Builder, *Statement, VectorMap, VLTS, Schedule,
464                                  P);
465   isl_map_free(Schedule);
466 }
467 
468 void ClastStmtCodeGen::codegen(const clast_block *b) {
469   if (b->body)
470     codegen(b->body);
471 }
472 
473 void ClastStmtCodeGen::codegenForSequential(const clast_for *f) {
474   Value *LowerBound, *UpperBound, *IV, *Stride;
475   BasicBlock *ExitBlock;
476   Type *IntPtrTy = getIntPtrTy();
477 
478   LowerBound = ExpGen.codegen(f->LB, IntPtrTy);
479   UpperBound = ExpGen.codegen(f->UB, IntPtrTy);
480   Stride = Builder.getInt(APInt_from_MPZ(f->stride));
481 
482   IV = createLoop(LowerBound, UpperBound, Stride, Builder, P, ExitBlock,
483                   CmpInst::ICMP_SLE);
484 
485   // Add loop iv to symbols.
486   ClastVars[f->iterator] = IV;
487 
488   if (f->body)
489     codegen(f->body);
490 
491   // Loop is finished, so remove its iv from the live symbols.
492   ClastVars.erase(f->iterator);
493   Builder.SetInsertPoint(ExitBlock->begin());
494 }
495 
496 // Helper class to determine all scalar parameters used in the basic blocks of a
497 // clast. Scalar parameters are scalar variables defined outside of the SCoP.
498 class ParameterVisitor : public ClastVisitor {
499   std::set<Value *> Values;
500 
501 public:
502   ParameterVisitor() : ClastVisitor(), Values() {}
503 
504   void visitUser(const clast_user_stmt *Stmt) {
505     const ScopStmt *S = static_cast<const ScopStmt *>(Stmt->statement->usr);
506     const BasicBlock *BB = S->getBasicBlock();
507 
508     // Check all the operands of instructions in the basic block.
509     for (BasicBlock::const_iterator BI = BB->begin(), BE = BB->end(); BI != BE;
510          ++BI) {
511       const Instruction &Inst = *BI;
512       for (Instruction::const_op_iterator II = Inst.op_begin(),
513                                           IE = Inst.op_end();
514            II != IE; ++II) {
515         Value *SrcVal = *II;
516 
517         if (Instruction *OpInst = dyn_cast<Instruction>(SrcVal))
518           if (S->getParent()->getRegion().contains(OpInst))
519             continue;
520 
521         if (isa<Instruction>(SrcVal) || isa<Argument>(SrcVal))
522           Values.insert(SrcVal);
523       }
524     }
525   }
526 
527   // Iterator to iterate over the values found.
528   typedef std::set<Value *>::const_iterator const_iterator;
529   inline const_iterator begin() const { return Values.begin(); }
530   inline const_iterator end() const { return Values.end(); }
531 };
532 
533 SetVector<Value *> ClastStmtCodeGen::getOMPValues(const clast_stmt *Body) {
534   SetVector<Value *> Values;
535 
536   // The clast variables
537   for (CharMapT::iterator I = ClastVars.begin(), E = ClastVars.end(); I != E;
538        I++)
539     Values.insert(I->second);
540 
541   // Find the temporaries that are referenced in the clast statements'
542   // basic blocks but are not defined by these blocks (e.g., references
543   // to function arguments or temporaries defined before the start of
544   // the SCoP).
545   ParameterVisitor Params;
546   Params.visit(Body);
547 
548   for (ParameterVisitor::const_iterator PI = Params.begin(), PE = Params.end();
549        PI != PE; ++PI) {
550     Value *V = *PI;
551     Values.insert(V);
552     DEBUG(dbgs() << "Adding temporary for OMP copy-in: " << *V << "\n");
553   }
554 
555   return Values;
556 }
557 
558 void
559 ClastStmtCodeGen::updateWithValueMap(OMPGenerator::ValueToValueMapTy &VMap) {
560   std::set<Value *> Inserted;
561 
562   for (CharMapT::iterator I = ClastVars.begin(), E = ClastVars.end(); I != E;
563        I++) {
564     ClastVars[I->first] = VMap[I->second];
565     Inserted.insert(I->second);
566   }
567 
568   for (OMPGenerator::ValueToValueMapTy::iterator I = VMap.begin(),
569                                                  E = VMap.end();
570        I != E; ++I) {
571     if (Inserted.count(I->first))
572       continue;
573 
574     ValueMap[I->first] = I->second;
575   }
576 }
577 
578 static void clearDomtree(Function *F, DominatorTree &DT) {
579   DomTreeNode *N = DT.getNode(&F->getEntryBlock());
580   std::vector<BasicBlock *> Nodes;
581   for (po_iterator<DomTreeNode *> I = po_begin(N), E = po_end(N); I != E; ++I)
582     Nodes.push_back(I->getBlock());
583 
584   for (std::vector<BasicBlock *>::iterator I = Nodes.begin(), E = Nodes.end();
585        I != E; ++I)
586     DT.eraseNode(*I);
587 }
588 
589 void ClastStmtCodeGen::codegenForOpenMP(const clast_for *For) {
590   Value *Stride, *LB, *UB, *IV;
591   BasicBlock::iterator LoopBody;
592   IntegerType *IntPtrTy = getIntPtrTy();
593   SetVector<Value *> Values;
594   OMPGenerator::ValueToValueMapTy VMap;
595   OMPGenerator OMPGen(Builder, P);
596 
597   Stride = Builder.getInt(APInt_from_MPZ(For->stride));
598   Stride = Builder.CreateSExtOrBitCast(Stride, IntPtrTy);
599   LB = ExpGen.codegen(For->LB, IntPtrTy);
600   UB = ExpGen.codegen(For->UB, IntPtrTy);
601 
602   Values = getOMPValues(For->body);
603 
604   IV = OMPGen.createParallelLoop(LB, UB, Stride, Values, VMap, &LoopBody);
605   BasicBlock::iterator AfterLoop = Builder.GetInsertPoint();
606   Builder.SetInsertPoint(LoopBody);
607 
608   // Save the current values.
609   const ValueMapT ValueMapCopy = ValueMap;
610   const CharMapT ClastVarsCopy = ClastVars;
611 
612   updateWithValueMap(VMap);
613   ClastVars[For->iterator] = IV;
614 
615   if (For->body)
616     codegen(For->body);
617 
618   // Restore the original values.
619   ValueMap = ValueMapCopy;
620   ClastVars = ClastVarsCopy;
621 
622   clearDomtree((*LoopBody).getParent()->getParent(),
623                P->getAnalysis<DominatorTree>());
624 
625   Builder.SetInsertPoint(AfterLoop);
626 }
627 
628 #ifdef GPU_CODEGEN
629 static unsigned getArraySizeInBytes(const ArrayType *AT) {
630   unsigned Bytes = AT->getNumElements();
631   if (const ArrayType *T = dyn_cast<ArrayType>(AT->getElementType()))
632     Bytes *= getArraySizeInBytes(T);
633   else
634     Bytes *= AT->getElementType()->getPrimitiveSizeInBits() / 8;
635 
636   return Bytes;
637 }
638 
639 SetVector<Value *> ClastStmtCodeGen::getGPUValues(unsigned &OutputBytes) {
640   SetVector<Value *> Values;
641   OutputBytes = 0;
642 
643   // Record the memory reference base addresses.
644   for (Scop::iterator SI = S->begin(), SE = S->end(); SI != SE; ++SI) {
645     ScopStmt *Stmt = *SI;
646     for (SmallVector<MemoryAccess *, 8>::iterator I = Stmt->memacc_begin(),
647                                                   E = Stmt->memacc_end();
648          I != E; ++I) {
649       Value *BaseAddr = const_cast<Value *>((*I)->getBaseAddr());
650       Values.insert((BaseAddr));
651 
652       // FIXME: we assume that there is one and only one array to be written
653       // in a SCoP.
654       int NumWrites = 0;
655       if ((*I)->isWrite()) {
656         ++NumWrites;
657         assert(NumWrites <= 1 &&
658                "We support at most one array to be written in a SCoP.");
659         if (const PointerType *PT =
660                 dyn_cast<PointerType>(BaseAddr->getType())) {
661           Type *T = PT->getArrayElementType();
662           const ArrayType *ATy = dyn_cast<ArrayType>(T);
663           OutputBytes = getArraySizeInBytes(ATy);
664         }
665       }
666     }
667   }
668 
669   return Values;
670 }
671 
672 const clast_stmt *ClastStmtCodeGen::getScheduleInfo(const clast_for *F,
673                                                     std::vector<int> &NumIters,
674                                                     unsigned &LoopDepth,
675                                                     unsigned &NonPLoopDepth) {
676   clast_stmt *Stmt = (clast_stmt *)F;
677   const clast_for *Result;
678   bool NonParaFlag = false;
679   LoopDepth = 0;
680   NonPLoopDepth = 0;
681 
682   while (Stmt) {
683     if (CLAST_STMT_IS_A(Stmt, stmt_for)) {
684       const clast_for *T = (clast_for *)Stmt;
685       if (isParallelFor(T)) {
686         if (!NonParaFlag) {
687           NumIters.push_back(getNumberOfIterations(T));
688           Result = T;
689         }
690       } else
691         NonParaFlag = true;
692 
693       Stmt = T->body;
694       LoopDepth++;
695       continue;
696     }
697     Stmt = Stmt->next;
698   }
699 
700   assert(NumIters.size() == 4 &&
701          "The loops should be tiled into 4-depth parallel loops and an "
702          "innermost non-parallel one (if exist).");
703   NonPLoopDepth = LoopDepth - NumIters.size();
704   assert(NonPLoopDepth <= 1 &&
705          "We support only one innermost non-parallel loop currently.");
706   return (const clast_stmt *)Result->body;
707 }
708 
709 void ClastStmtCodeGen::codegenForGPGPU(const clast_for *F) {
710   BasicBlock::iterator LoopBody;
711   SetVector<Value *> Values;
712   SetVector<Value *> IVS;
713   std::vector<int> NumIterations;
714   PTXGenerator::ValueToValueMapTy VMap;
715 
716   assert(!GPUTriple.empty() &&
717          "Target triple should be set properly for GPGPU code generation.");
718   PTXGenerator PTXGen(Builder, P, GPUTriple);
719 
720   // Get original IVS and ScopStmt
721   unsigned TiledLoopDepth, NonPLoopDepth;
722   const clast_stmt *InnerStmt =
723       getScheduleInfo(F, NumIterations, TiledLoopDepth, NonPLoopDepth);
724   const clast_stmt *TmpStmt;
725   const clast_user_stmt *U;
726   const clast_for *InnerFor;
727   if (CLAST_STMT_IS_A(InnerStmt, stmt_for)) {
728     InnerFor = (const clast_for *)InnerStmt;
729     TmpStmt = InnerFor->body;
730   } else
731     TmpStmt = InnerStmt;
732   U = (const clast_user_stmt *)TmpStmt;
733   ScopStmt *Statement = (ScopStmt *)U->statement->usr;
734   for (unsigned i = 0; i < Statement->getNumIterators() - NonPLoopDepth; i++) {
735     const Value *IV = Statement->getInductionVariableForDimension(i);
736     IVS.insert(const_cast<Value *>(IV));
737   }
738 
739   unsigned OutBytes;
740   Values = getGPUValues(OutBytes);
741   PTXGen.setOutputBytes(OutBytes);
742   PTXGen.startGeneration(Values, IVS, VMap, &LoopBody);
743 
744   BasicBlock::iterator AfterLoop = Builder.GetInsertPoint();
745   Builder.SetInsertPoint(LoopBody);
746 
747   BasicBlock *AfterBB = 0;
748   if (NonPLoopDepth) {
749     Value *LowerBound, *UpperBound, *IV, *Stride;
750     Type *IntPtrTy = getIntPtrTy();
751     LowerBound = ExpGen.codegen(InnerFor->LB, IntPtrTy);
752     UpperBound = ExpGen.codegen(InnerFor->UB, IntPtrTy);
753     Stride = Builder.getInt(APInt_from_MPZ(InnerFor->stride));
754     IV = createLoop(LowerBound, UpperBound, Stride, Builder, P, AfterBB,
755                     CmpInst::ICMP_SLE);
756     const Value *OldIV_ = Statement->getInductionVariableForDimension(2);
757     Value *OldIV = const_cast<Value *>(OldIV_);
758     VMap.insert(std::make_pair<Value *, Value *>(OldIV, IV));
759   }
760 
761   updateWithValueMap(VMap);
762 
763   BlockGenerator::generate(Builder, *Statement, ValueMap, P);
764 
765   if (AfterBB)
766     Builder.SetInsertPoint(AfterBB->begin());
767 
768   // FIXME: The replacement of the host base address with the parameter of ptx
769   // subfunction should have been done by updateWithValueMap. We use the
770   // following codes to avoid affecting other parts of Polly. This should be
771   // fixed later.
772   Function *FN = Builder.GetInsertBlock()->getParent();
773   for (unsigned j = 0; j < Values.size(); j++) {
774     Value *baseAddr = Values[j];
775     for (Function::iterator B = FN->begin(); B != FN->end(); ++B) {
776       for (BasicBlock::iterator I = B->begin(); I != B->end(); ++I)
777         I->replaceUsesOfWith(baseAddr, ValueMap[baseAddr]);
778     }
779   }
780   Builder.SetInsertPoint(AfterLoop);
781   PTXGen.setLaunchingParameters(NumIterations[0], NumIterations[1],
782                                 NumIterations[2], NumIterations[3]);
783   PTXGen.finishGeneration(FN);
784 }
785 #endif
786 
787 bool ClastStmtCodeGen::isInnermostLoop(const clast_for *f) {
788   const clast_stmt *stmt = f->body;
789 
790   while (stmt) {
791     if (!CLAST_STMT_IS_A(stmt, stmt_user))
792       return false;
793 
794     stmt = stmt->next;
795   }
796 
797   return true;
798 }
799 
800 int ClastStmtCodeGen::getNumberOfIterations(const clast_for *For) {
801   isl_set *LoopDomain = isl_set_copy(isl_set_from_cloog_domain(For->domain));
802   int NumberOfIterations = polly::getNumberOfIterations(LoopDomain);
803   if (NumberOfIterations == -1)
804     return -1;
805   return NumberOfIterations / isl_int_get_si(For->stride) + 1;
806 }
807 
808 void ClastStmtCodeGen::codegenForVector(const clast_for *F) {
809   DEBUG(dbgs() << "Vectorizing loop '" << F->iterator << "'\n";);
810   int VectorWidth = getNumberOfIterations(F);
811 
812   Value *LB = ExpGen.codegen(F->LB, getIntPtrTy());
813 
814   APInt Stride = APInt_from_MPZ(F->stride);
815   IntegerType *LoopIVType = dyn_cast<IntegerType>(LB->getType());
816   Stride = Stride.zext(LoopIVType->getBitWidth());
817   Value *StrideValue = ConstantInt::get(LoopIVType, Stride);
818 
819   std::vector<Value *> IVS(VectorWidth);
820   IVS[0] = LB;
821 
822   for (int i = 1; i < VectorWidth; i++)
823     IVS[i] = Builder.CreateAdd(IVS[i - 1], StrideValue, "p_vector_iv");
824 
825   isl_set *Domain = isl_set_copy(isl_set_from_cloog_domain(F->domain));
826 
827   // Add loop iv to symbols.
828   ClastVars[F->iterator] = LB;
829 
830   const clast_stmt *Stmt = F->body;
831 
832   while (Stmt) {
833     codegen((const clast_user_stmt *)Stmt, &IVS, F->iterator,
834             isl_set_copy(Domain));
835     Stmt = Stmt->next;
836   }
837 
838   // Loop is finished, so remove its iv from the live symbols.
839   isl_set_free(Domain);
840   ClastVars.erase(F->iterator);
841 }
842 
843 bool ClastStmtCodeGen::isParallelFor(const clast_for *f) {
844   isl_set *Domain = isl_set_copy(isl_set_from_cloog_domain(f->domain));
845   assert(Domain && "Cannot access domain of loop");
846 
847   Dependences &D = P->getAnalysis<Dependences>();
848 
849   return D.isParallelDimension(Domain, isl_set_n_dim(Domain));
850 }
851 
852 void ClastStmtCodeGen::codegen(const clast_for *f) {
853   bool Vector = PollyVectorizerChoice != VECTORIZER_NONE;
854   if ((Vector || OpenMP) && isParallelFor(f)) {
855     if (Vector && isInnermostLoop(f) && (-1 != getNumberOfIterations(f)) &&
856         (getNumberOfIterations(f) <= 16)) {
857       codegenForVector(f);
858       return;
859     }
860 
861     if (OpenMP && !parallelCodeGeneration) {
862       parallelCodeGeneration = true;
863       parallelLoops.push_back(f->iterator);
864       codegenForOpenMP(f);
865       parallelCodeGeneration = false;
866       return;
867     }
868   }
869 
870 #ifdef GPU_CODEGEN
871   if (GPGPU && isParallelFor(f)) {
872     if (!parallelCodeGeneration) {
873       parallelCodeGeneration = true;
874       parallelLoops.push_back(f->iterator);
875       codegenForGPGPU(f);
876       parallelCodeGeneration = false;
877       return;
878     }
879   }
880 #endif
881 
882   codegenForSequential(f);
883 }
884 
885 Value *ClastStmtCodeGen::codegen(const clast_equation *eq) {
886   Value *LHS = ExpGen.codegen(eq->LHS, getIntPtrTy());
887   Value *RHS = ExpGen.codegen(eq->RHS, getIntPtrTy());
888   CmpInst::Predicate P;
889 
890   if (eq->sign == 0)
891     P = ICmpInst::ICMP_EQ;
892   else if (eq->sign > 0)
893     P = ICmpInst::ICMP_SGE;
894   else
895     P = ICmpInst::ICMP_SLE;
896 
897   return Builder.CreateICmp(P, LHS, RHS);
898 }
899 
900 void ClastStmtCodeGen::codegen(const clast_guard *g) {
901   Function *F = Builder.GetInsertBlock()->getParent();
902   LLVMContext &Context = F->getContext();
903 
904   BasicBlock *CondBB =
905       SplitBlock(Builder.GetInsertBlock(), Builder.GetInsertPoint(), P);
906   CondBB->setName("polly.cond");
907   BasicBlock *MergeBB = SplitBlock(CondBB, CondBB->begin(), P);
908   MergeBB->setName("polly.merge");
909   BasicBlock *ThenBB = BasicBlock::Create(Context, "polly.then", F);
910 
911   DominatorTree &DT = P->getAnalysis<DominatorTree>();
912   DT.addNewBlock(ThenBB, CondBB);
913   DT.changeImmediateDominator(MergeBB, CondBB);
914 
915   CondBB->getTerminator()->eraseFromParent();
916 
917   Builder.SetInsertPoint(CondBB);
918 
919   Value *Predicate = codegen(&(g->eq[0]));
920 
921   for (int i = 1; i < g->n; ++i) {
922     Value *TmpPredicate = codegen(&(g->eq[i]));
923     Predicate = Builder.CreateAnd(Predicate, TmpPredicate);
924   }
925 
926   Builder.CreateCondBr(Predicate, ThenBB, MergeBB);
927   Builder.SetInsertPoint(ThenBB);
928   Builder.CreateBr(MergeBB);
929   Builder.SetInsertPoint(ThenBB->begin());
930 
931   LoopInfo &LI = P->getAnalysis<LoopInfo>();
932   Loop *L = LI.getLoopFor(CondBB);
933   if (L) {
934     L->addBasicBlockToLoop(ThenBB, LI.getBase());
935     L->addBasicBlockToLoop(MergeBB, LI.getBase());
936   }
937 
938   codegen(g->then);
939 
940   Builder.SetInsertPoint(MergeBB->begin());
941 }
942 
943 void ClastStmtCodeGen::codegen(const clast_stmt *stmt) {
944   if (CLAST_STMT_IS_A(stmt, stmt_root))
945     assert(false && "No second root statement expected");
946   else if (CLAST_STMT_IS_A(stmt, stmt_ass))
947     codegen((const clast_assignment *)stmt);
948   else if (CLAST_STMT_IS_A(stmt, stmt_user))
949     codegen((const clast_user_stmt *)stmt);
950   else if (CLAST_STMT_IS_A(stmt, stmt_block))
951     codegen((const clast_block *)stmt);
952   else if (CLAST_STMT_IS_A(stmt, stmt_for))
953     codegen((const clast_for *)stmt);
954   else if (CLAST_STMT_IS_A(stmt, stmt_guard))
955     codegen((const clast_guard *)stmt);
956 
957   if (stmt->next)
958     codegen(stmt->next);
959 }
960 
961 void ClastStmtCodeGen::addParameters(const CloogNames *names) {
962   SCEVExpander Rewriter(P->getAnalysis<ScalarEvolution>(), "polly");
963 
964   int i = 0;
965   for (Scop::param_iterator PI = S->param_begin(), PE = S->param_end();
966        PI != PE; ++PI) {
967     assert(i < names->nb_parameters && "Not enough parameter names");
968 
969     const SCEV *Param = *PI;
970     Type *Ty = Param->getType();
971 
972     Instruction *insertLocation = --(Builder.GetInsertBlock()->end());
973     Value *V = Rewriter.expandCodeFor(Param, Ty, insertLocation);
974     ClastVars[names->parameters[i]] = V;
975 
976     ++i;
977   }
978 }
979 
980 void ClastStmtCodeGen::codegen(const clast_root *r) {
981   addParameters(r->names);
982 
983   parallelCodeGeneration = false;
984 
985   const clast_stmt *stmt = (const clast_stmt *)r;
986   if (stmt->next)
987     codegen(stmt->next);
988 }
989 
990 ClastStmtCodeGen::ClastStmtCodeGen(Scop *scop, IRBuilder<> &B, Pass *P)
991     : S(scop), P(P), Builder(B), ExpGen(Builder, ClastVars) {}
992 
993 namespace {
994 class CodeGeneration : public ScopPass {
995   std::vector<std::string> ParallelLoops;
996 
997 public:
998   static char ID;
999 
1000   CodeGeneration() : ScopPass(ID) {}
1001 
1002   bool runOnScop(Scop &S) {
1003     ParallelLoops.clear();
1004 
1005     assert(!S.getRegion().isTopLevelRegion() &&
1006            "Top level regions are not supported");
1007 
1008     simplifyRegion(&S, this);
1009 
1010     BasicBlock *StartBlock = executeScopConditionally(S, this);
1011 
1012     IRBuilder<> Builder(StartBlock->begin());
1013 
1014     ClastStmtCodeGen CodeGen(&S, Builder, this);
1015     CloogInfo &C = getAnalysis<CloogInfo>();
1016     CodeGen.codegen(C.getClast());
1017 
1018     ParallelLoops.insert(ParallelLoops.begin(),
1019                          CodeGen.getParallelLoops().begin(),
1020                          CodeGen.getParallelLoops().end());
1021     return true;
1022   }
1023 
1024   virtual void printScop(raw_ostream &OS) const {
1025     for (std::vector<std::string>::const_iterator PI = ParallelLoops.begin(),
1026                                                   PE = ParallelLoops.end();
1027          PI != PE; ++PI)
1028       OS << "Parallel loop with iterator '" << *PI << "' generated\n";
1029   }
1030 
1031   virtual void getAnalysisUsage(AnalysisUsage &AU) const {
1032     AU.addRequired<CloogInfo>();
1033     AU.addRequired<Dependences>();
1034     AU.addRequired<DominatorTree>();
1035     AU.addRequired<RegionInfo>();
1036     AU.addRequired<ScalarEvolution>();
1037     AU.addRequired<ScopDetection>();
1038     AU.addRequired<ScopInfo>();
1039     AU.addRequired<DataLayout>();
1040     AU.addRequired<LoopInfo>();
1041 
1042     AU.addPreserved<CloogInfo>();
1043     AU.addPreserved<Dependences>();
1044     AU.addPreserved<LoopInfo>();
1045     AU.addPreserved<DominatorTree>();
1046     AU.addPreserved<ScopDetection>();
1047     AU.addPreserved<ScalarEvolution>();
1048 
1049     // FIXME: We do not yet add regions for the newly generated code to the
1050     //        region tree.
1051     AU.addPreserved<RegionInfo>();
1052     AU.addPreserved<TempScopInfo>();
1053     AU.addPreserved<ScopInfo>();
1054     AU.addPreservedID(IndependentBlocksID);
1055   }
1056 };
1057 }
1058 
1059 char CodeGeneration::ID = 1;
1060 
1061 Pass *polly::createCodeGenerationPass() { return new CodeGeneration(); }
1062 
1063 INITIALIZE_PASS_BEGIN(CodeGeneration, "polly-codegen",
1064                       "Polly - Create LLVM-IR from SCoPs", false, false);
1065 INITIALIZE_PASS_DEPENDENCY(CloogInfo);
1066 INITIALIZE_PASS_DEPENDENCY(Dependences);
1067 INITIALIZE_PASS_DEPENDENCY(DominatorTree);
1068 INITIALIZE_PASS_DEPENDENCY(RegionInfo);
1069 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution);
1070 INITIALIZE_PASS_DEPENDENCY(ScopDetection);
1071 INITIALIZE_PASS_DEPENDENCY(DataLayout);
1072 INITIALIZE_PASS_END(CodeGeneration, "polly-codegen",
1073                     "Polly - Create LLVM-IR from SCoPs", false, false)
1074 
1075 #endif // CLOOG_FOUND
1076