1 //===------ CodeGeneration.cpp - Code generate the Scops. -----------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // The CodeGeneration pass takes a Scop created by ScopInfo and translates it
11 // back to LLVM-IR using Cloog.
12 //
13 // The Scop describes the high level memory behaviour of a control flow region.
14 // Transformation passes can update the schedule (execution order) of statements
15 // in the Scop. Cloog is used to generate an abstract syntax tree (clast) that
16 // reflects the updated execution order. This clast is used to create new
17 // LLVM-IR that is computational equivalent to the original control flow region,
18 // but executes its code in the new execution order defined by the changed
19 // scattering.
20 //
21 //===----------------------------------------------------------------------===//
22 
23 #define DEBUG_TYPE "polly-codegen"
24 
25 #include "polly/Cloog.h"
26 #include "polly/CodeGeneration.h"
27 #include "polly/Dependences.h"
28 #include "polly/LinkAllPasses.h"
29 #include "polly/ScopInfo.h"
30 #include "polly/TempScopInfo.h"
31 #include "polly/Support/GICHelper.h"
32 #include "polly/LoopGenerators.h"
33 
34 #include "llvm/Module.h"
35 #include "llvm/ADT/SetVector.h"
36 #include "llvm/ADT/PostOrderIterator.h"
37 #include "llvm/Analysis/LoopInfo.h"
38 #include "llvm/Analysis/ScalarEvolutionExpander.h"
39 #include "llvm/Support/CommandLine.h"
40 #include "llvm/Support/Debug.h"
41 #include "llvm/Support/IRBuilder.h"
42 #include "llvm/Target/TargetData.h"
43 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
44 
45 #define CLOOG_INT_GMP 1
46 #include "cloog/cloog.h"
47 #include "cloog/isl/cloog.h"
48 
49 #include "isl/aff.h"
50 
51 #include <vector>
52 #include <utility>
53 
54 using namespace polly;
55 using namespace llvm;
56 
57 struct isl_set;
58 
59 namespace polly {
60 
61 bool EnablePollyVector;
62 
63 static cl::opt<bool, true>
64 Vector("enable-polly-vector",
65        cl::desc("Enable polly vector code generation"), cl::Hidden,
66        cl::location(EnablePollyVector), cl::init(false), cl::ZeroOrMore);
67 
68 static cl::opt<bool>
69 OpenMP("enable-polly-openmp",
70        cl::desc("Generate OpenMP parallel code"), cl::Hidden,
71        cl::value_desc("OpenMP code generation enabled if true"),
72        cl::init(false), cl::ZeroOrMore);
73 
74 static cl::opt<bool>
75 AtLeastOnce("enable-polly-atLeastOnce",
76        cl::desc("Give polly the hint, that every loop is executed at least"
77                 "once"), cl::Hidden,
78        cl::value_desc("OpenMP code generation enabled if true"),
79        cl::init(false), cl::ZeroOrMore);
80 
81 static cl::opt<bool>
82 Aligned("enable-polly-aligned",
83        cl::desc("Assumed aligned memory accesses."), cl::Hidden,
84        cl::value_desc("OpenMP code generation enabled if true"),
85        cl::init(false), cl::ZeroOrMore);
86 
87 typedef DenseMap<const Value*, Value*> ValueMapT;
88 typedef DenseMap<const char*, Value*> CharMapT;
89 typedef std::vector<ValueMapT> VectorValueMapT;
90 
91 class IslGenerator;
92 
93 class IslGenerator {
94 public:
95   IslGenerator(IRBuilder<> &Builder, std::vector<Value *> &IVS) :
96     Builder(Builder), IVS(IVS) {}
97   Value *generateIslInt(__isl_take isl_int Int);
98   Value *generateIslAff(__isl_take isl_aff *Aff);
99   Value *generateIslPwAff(__isl_take isl_pw_aff *PwAff);
100 
101 private:
102   typedef struct {
103     Value *Result;
104     class IslGenerator *Generator;
105   } IslGenInfo;
106 
107   IRBuilder<> &Builder;
108   std::vector<Value *> &IVS;
109   static int mergeIslAffValues(__isl_take isl_set *Set,
110                                __isl_take isl_aff *Aff, void *User);
111 };
112 
113 Value *IslGenerator::generateIslInt(isl_int Int) {
114   mpz_t IntMPZ;
115   mpz_init(IntMPZ);
116   isl_int_get_gmp(Int, IntMPZ);
117   Value *IntValue = Builder.getInt(APInt_from_MPZ(IntMPZ));
118   mpz_clear(IntMPZ);
119   return IntValue;
120 }
121 
122 Value *IslGenerator::generateIslAff(__isl_take isl_aff *Aff) {
123   Value *Result;
124   Value *ConstValue;
125   isl_int ConstIsl;
126 
127   isl_int_init(ConstIsl);
128   isl_aff_get_constant(Aff, &ConstIsl);
129   ConstValue = generateIslInt(ConstIsl);
130   Type *Ty = Builder.getInt64Ty();
131 
132   // FIXME: We should give the constant and coefficients the right type. Here
133   // we force it into i64.
134   Result = Builder.CreateSExtOrBitCast(ConstValue, Ty);
135 
136   unsigned int NbInputDims = isl_aff_dim(Aff, isl_dim_in);
137 
138   assert((IVS.size() == NbInputDims) && "The Dimension of Induction Variables"
139          "must match the dimension of the affine space.");
140 
141   isl_int CoefficientIsl;
142   isl_int_init(CoefficientIsl);
143 
144   for (unsigned int i = 0; i < NbInputDims; ++i) {
145     Value *CoefficientValue;
146     isl_aff_get_coefficient(Aff, isl_dim_in, i, &CoefficientIsl);
147 
148     if (isl_int_is_zero(CoefficientIsl))
149       continue;
150 
151     CoefficientValue = generateIslInt(CoefficientIsl);
152     CoefficientValue = Builder.CreateIntCast(CoefficientValue, Ty, true);
153     Value *IV = Builder.CreateIntCast(IVS[i], Ty, true);
154     Value *PAdd = Builder.CreateMul(CoefficientValue, IV, "p_mul_coeff");
155     Result = Builder.CreateAdd(Result, PAdd, "p_sum_coeff");
156   }
157 
158   isl_int_clear(CoefficientIsl);
159   isl_int_clear(ConstIsl);
160   isl_aff_free(Aff);
161 
162   return Result;
163 }
164 
165 int IslGenerator::mergeIslAffValues(__isl_take isl_set *Set,
166                                     __isl_take isl_aff *Aff, void *User) {
167   IslGenInfo *GenInfo = (IslGenInfo *)User;
168 
169   assert((GenInfo->Result == NULL) && "Result is already set."
170          "Currently only single isl_aff is supported");
171   assert(isl_set_plain_is_universe(Set)
172          && "Code generation failed because the set is not universe");
173 
174   GenInfo->Result = GenInfo->Generator->generateIslAff(Aff);
175 
176   isl_set_free(Set);
177   return 0;
178 }
179 
180 Value *IslGenerator::generateIslPwAff(__isl_take isl_pw_aff *PwAff) {
181   IslGenInfo User;
182   User.Result = NULL;
183   User.Generator = this;
184   isl_pw_aff_foreach_piece(PwAff, mergeIslAffValues, &User);
185   assert(User.Result && "Code generation for isl_pw_aff failed");
186 
187   isl_pw_aff_free(PwAff);
188   return User.Result;
189 }
190 
191 /// @brief Generate a new basic block for a polyhedral statement.
192 ///
193 /// The only public function exposed is generate().
194 class BlockGenerator {
195 public:
196   /// @brief Generate a new BasicBlock for a ScopStmt.
197   ///
198   /// @param Builder   The LLVM-IR Builder used to generate the statement. The
199   ///                  code is generated at the location, the Builder points to.
200   /// @param Stmt      The statement to code generate.
201   /// @param GlobalMap A map that defines for certain Values referenced from the
202   ///                  original code new Values they should be replaced with.
203   /// @param P         A reference to the pass this function is called from.
204   ///                  The pass is needed to update other analysis.
205   static void generate(IRBuilder<> &Builder, ScopStmt &Stmt,
206                        ValueMapT &GlobalMap, Pass *P) {
207     BlockGenerator Generator(Builder, Stmt, P);
208     Generator.copyBB(GlobalMap);
209   }
210 
211 protected:
212   IRBuilder<> &Builder;
213   ScopStmt &Statement;
214   Pass *P;
215 
216   BlockGenerator(IRBuilder<> &B, ScopStmt &Stmt, Pass *P);
217 
218   /// @brief Get the new version of a Value.
219   ///
220   /// @param Old       The old Value.
221   /// @param BBMap     A mapping from old values to their new values
222   ///                  (for values recalculated within this basic block).
223   /// @param GlobalMap A mapping from old values to their new values
224   ///                  (for values recalculated in the new ScoP, but not
225   ///                   within this basic block).
226   ///
227   /// @returns  o The old value, if it is still valid.
228   ///           o The new value, if available.
229   ///           o NULL, if no value is found.
230   Value *getNewValue(const Value *Old, ValueMapT &BBMap, ValueMapT &GlobalMap);
231 
232   void copyInstScalar(const Instruction *Inst, ValueMapT &BBMap,
233                       ValueMapT &GlobalMap);
234 
235   /// @brief Get the memory access offset to be added to the base address
236   std::vector<Value*> getMemoryAccessIndex(__isl_keep isl_map *AccessRelation,
237                                            Value *BaseAddress, ValueMapT &BBMap,
238                                            ValueMapT &GlobalMap);
239 
240   /// @brief Get the new operand address according to the changed access in
241   ///        JSCOP file.
242   Value *getNewAccessOperand(__isl_keep isl_map *NewAccessRelation,
243                              Value *BaseAddress, const Value *OldOperand,
244                              ValueMapT &BBMap, ValueMapT &GlobalMap);
245 
246   /// @brief Generate the operand address
247   Value *generateLocationAccessed(const Instruction *Inst,
248                                   const Value *Pointer, ValueMapT &BBMap,
249                                   ValueMapT &GlobalMap);
250 
251   Value *generateScalarLoad(const LoadInst *load, ValueMapT &BBMap,
252                             ValueMapT &GlobalMap);
253 
254   Value *generateScalarStore(const StoreInst *store, ValueMapT &BBMap,
255                              ValueMapT &GlobalMap);
256 
257   /// @brief Copy a single Instruction.
258   ///
259   /// This copies a single Instruction and updates references to old values
260   /// with references to new values, as defined by GlobalMap and BBMap.
261   ///
262   /// @param BBMap     A mapping from old values to their new values
263   ///                  (for values recalculated within this basic block).
264   /// @param GlobalMap A mapping from old values to their new values
265   ///                  (for values recalculated in the new ScoP, but not
266   ///                  within this basic block).
267   void copyInstruction(const Instruction *Inst, ValueMapT &BBMap,
268                        ValueMapT &GlobalMap);
269 
270   /// @brief Copy the basic block.
271   ///
272   /// This copies the entire basic block and updates references to old values
273   /// with references to new values, as defined by GlobalMap.
274   ///
275   /// @param GlobalMap A mapping from old values to their new values
276   ///                  (for values recalculated in the new ScoP, but not
277   ///                  within this basic block).
278   void copyBB(ValueMapT &GlobalMap);
279 };
280 
281 BlockGenerator::BlockGenerator(IRBuilder<> &B, ScopStmt &Stmt, Pass *P):
282   Builder(B), Statement(Stmt), P(P) {}
283 
284 Value *BlockGenerator::getNewValue(const Value *Old, ValueMapT &BBMap,
285                                    ValueMapT &GlobalMap) {
286   // We assume constants never change.
287   // This avoids map lookups for many calls to this function.
288   if (isa<Constant>(Old))
289     return const_cast<Value*>(Old);
290 
291   if (GlobalMap.count(Old)) {
292     Value *New = GlobalMap[Old];
293 
294     if (Old->getType()->getScalarSizeInBits()
295         < New->getType()->getScalarSizeInBits())
296       New = Builder.CreateTruncOrBitCast(New, Old->getType());
297 
298     return New;
299   }
300 
301   if (BBMap.count(Old)) {
302     return BBMap[Old];
303   }
304 
305   // 'Old' is within the original SCoP, but was not rewritten.
306   //
307   // Such values appear, if they only calculate information already available in
308   // the polyhedral description (e.g.  an induction variable increment). They
309   // can be safely ignored.
310   if (const Instruction *Inst = dyn_cast<Instruction>(Old))
311     if (Statement.getParent()->getRegion().contains(Inst->getParent()))
312       return NULL;
313 
314   // Everything else is probably a scop-constant value defined as global,
315   // function parameter or an instruction not within the scop.
316   return const_cast<Value*>(Old);
317 }
318 
319 void BlockGenerator::copyInstScalar(const Instruction *Inst, ValueMapT &BBMap,
320                                     ValueMapT &GlobalMap) {
321   Instruction *NewInst = Inst->clone();
322 
323   // Replace old operands with the new ones.
324   for (Instruction::const_op_iterator OI = Inst->op_begin(),
325        OE = Inst->op_end(); OI != OE; ++OI) {
326     Value *OldOperand = *OI;
327     Value *NewOperand = getNewValue(OldOperand, BBMap, GlobalMap);
328 
329     if (!NewOperand) {
330       assert(!isa<StoreInst>(NewInst)
331              && "Store instructions are always needed!");
332       delete NewInst;
333       return;
334     }
335 
336     NewInst->replaceUsesOfWith(OldOperand, NewOperand);
337   }
338 
339   Builder.Insert(NewInst);
340   BBMap[Inst] = NewInst;
341 
342   if (!NewInst->getType()->isVoidTy())
343     NewInst->setName("p_" + Inst->getName());
344 }
345 
346 std::vector<Value*> BlockGenerator::getMemoryAccessIndex(
347   __isl_keep isl_map *AccessRelation, Value *BaseAddress,
348   ValueMapT &BBMap, ValueMapT &GlobalMap) {
349 
350   assert((isl_map_dim(AccessRelation, isl_dim_out) == 1)
351          && "Only single dimensional access functions supported");
352 
353   std::vector<Value *> IVS;
354   for (unsigned i = 0; i < Statement.getNumIterators(); ++i) {
355     const Value *OriginalIV = Statement.getInductionVariableForDimension(i);
356     Value *NewIV = getNewValue(OriginalIV, BBMap, GlobalMap);
357     IVS.push_back(NewIV);
358   }
359 
360   isl_pw_aff *PwAff = isl_map_dim_max(isl_map_copy(AccessRelation), 0);
361   IslGenerator IslGen(Builder, IVS);
362   Value *OffsetValue = IslGen.generateIslPwAff(PwAff);
363 
364   Type *Ty = Builder.getInt64Ty();
365   OffsetValue = Builder.CreateIntCast(OffsetValue, Ty, true);
366 
367   std::vector<Value*> IndexArray;
368   Value *NullValue = Constant::getNullValue(Ty);
369   IndexArray.push_back(NullValue);
370   IndexArray.push_back(OffsetValue);
371   return IndexArray;
372 }
373 
374 Value *BlockGenerator::getNewAccessOperand(
375   __isl_keep isl_map *NewAccessRelation, Value *BaseAddress, const Value
376   *OldOperand, ValueMapT &BBMap, ValueMapT &GlobalMap) {
377   std::vector<Value*> IndexArray = getMemoryAccessIndex(NewAccessRelation,
378                                                         BaseAddress,
379                                                         BBMap, GlobalMap);
380   Value *NewOperand = Builder.CreateGEP(BaseAddress, IndexArray,
381                                         "p_newarrayidx_");
382   return NewOperand;
383 }
384 
385 Value *BlockGenerator::generateLocationAccessed(const Instruction *Inst,
386                                                 const Value *Pointer,
387                                                 ValueMapT &BBMap,
388                                                 ValueMapT &GlobalMap) {
389   MemoryAccess &Access = Statement.getAccessFor(Inst);
390   isl_map *CurrentAccessRelation = Access.getAccessRelation();
391   isl_map *NewAccessRelation = Access.getNewAccessRelation();
392 
393   assert(isl_map_has_equal_space(CurrentAccessRelation, NewAccessRelation)
394          && "Current and new access function use different spaces");
395 
396   Value *NewPointer;
397 
398   if (!NewAccessRelation) {
399     NewPointer = getNewValue(Pointer, BBMap, GlobalMap);
400   } else {
401     Value *BaseAddress = const_cast<Value*>(Access.getBaseAddr());
402     NewPointer = getNewAccessOperand(NewAccessRelation, BaseAddress, Pointer,
403                                      BBMap, GlobalMap);
404   }
405 
406   isl_map_free(CurrentAccessRelation);
407   isl_map_free(NewAccessRelation);
408   return NewPointer;
409 }
410 
411 Value *BlockGenerator::generateScalarLoad(const LoadInst *Load,
412                                           ValueMapT &BBMap,
413                                           ValueMapT &GlobalMap) {
414   const Value *Pointer = Load->getPointerOperand();
415   const Instruction *Inst = dyn_cast<Instruction>(Load);
416   Value *NewPointer = generateLocationAccessed(Inst, Pointer, BBMap, GlobalMap);
417   Value *ScalarLoad = Builder.CreateLoad(NewPointer,
418                                          Load->getName() + "_p_scalar_");
419   return ScalarLoad;
420 }
421 
422 Value *BlockGenerator::generateScalarStore(const StoreInst *Store,
423                                            ValueMapT &BBMap,
424                                            ValueMapT &GlobalMap) {
425   const Value *Pointer = Store->getPointerOperand();
426   Value *NewPointer = generateLocationAccessed(Store, Pointer, BBMap,
427                                                GlobalMap);
428   Value *ValueOperand = getNewValue(Store->getValueOperand(), BBMap, GlobalMap);
429 
430   return Builder.CreateStore(ValueOperand, NewPointer);
431 }
432 
433 void BlockGenerator::copyInstruction(const Instruction *Inst,
434                                      ValueMapT &BBMap, ValueMapT &GlobalMap) {
435   // Terminator instructions control the control flow. They are explicitly
436   // expressed in the clast and do not need to be copied.
437   if (Inst->isTerminator())
438     return;
439 
440   if (const LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
441     BBMap[Load] = generateScalarLoad(Load, BBMap, GlobalMap);
442     return;
443   }
444 
445   if (const StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
446     BBMap[Store] = generateScalarStore(Store, BBMap, GlobalMap);
447     return;
448   }
449 
450   copyInstScalar(Inst, BBMap, GlobalMap);
451 }
452 
453 
454 void BlockGenerator::copyBB(ValueMapT &GlobalMap) {
455   BasicBlock *BB = Statement.getBasicBlock();
456   BasicBlock *CopyBB = SplitBlock(Builder.GetInsertBlock(),
457                                   Builder.GetInsertPoint(), P);
458   CopyBB->setName("polly.stmt." + BB->getName());
459   Builder.SetInsertPoint(CopyBB->begin());
460 
461   ValueMapT BBMap;
462 
463   for (BasicBlock::const_iterator II = BB->begin(), IE = BB->end(); II != IE;
464        ++II)
465       copyInstruction(II, BBMap, GlobalMap);
466 }
467 
468 /// @brief Generate a new vector basic block for a polyhedral statement.
469 ///
470 /// The only public function exposed is generate().
471 class VectorBlockGenerator : BlockGenerator {
472 public:
473   /// @brief Generate a new vector basic block for a ScoPStmt.
474   ///
475   /// This code generation is similar to the normal, scalar code generation,
476   /// except that each instruction is code generated for several vector lanes
477   /// at a time. If possible instructions are issued as actual vector
478   /// instructions, but e.g. for address calculation instructions we currently
479   /// generate scalar instructions for each vector lane.
480   ///
481   /// @param Builder    The LLVM-IR Builder used to generate the statement. The
482   ///                   code is generated at the location, the builder points
483   ///                   to.
484   /// @param Stmt       The statement to code generate.
485   /// @param GlobalMaps A vector of maps that define for certain Values
486   ///                   referenced from the original code new Values they should
487   ///                   be replaced with. Each map in the vector of maps is
488   ///                   used for one vector lane. The number of elements in the
489   ///                   vector defines the width of the generated vector
490   ///                   instructions.
491   /// @param P          A reference to the pass this function is called from.
492   ///                   The pass is needed to update other analysis.
493   static void generate(IRBuilder<> &B, ScopStmt &Stmt,
494                        VectorValueMapT &GlobalMaps, __isl_keep isl_set *Domain,
495                        Pass *P) {
496     VectorBlockGenerator Generator(B, GlobalMaps, Stmt, Domain, P);
497     Generator.copyBB();
498   }
499 
500 private:
501   // This is a vector of global value maps.  The first map is used for the first
502   // vector lane, ...
503   // Each map, contains information about Instructions in the old ScoP, which
504   // are recalculated in the new SCoP. When copying the basic block, we replace
505   // all referenes to the old instructions with their recalculated values.
506   VectorValueMapT &GlobalMaps;
507 
508   isl_set *Domain;
509 
510   VectorBlockGenerator(IRBuilder<> &B, VectorValueMapT &GlobalMaps,
511                        ScopStmt &Stmt, __isl_keep isl_set *Domain, Pass *P);
512 
513   int getVectorWidth();
514 
515   Value *getVectorValue(const Value *Old, ValueMapT &VectorMap,
516                         VectorValueMapT &ScalarMaps);
517 
518   Type *getVectorPtrTy(const Value *V, int Width);
519 
520   /// @brief Load a vector from a set of adjacent scalars
521   ///
522   /// In case a set of scalars is known to be next to each other in memory,
523   /// create a vector load that loads those scalars
524   ///
525   /// %vector_ptr= bitcast double* %p to <4 x double>*
526   /// %vec_full = load <4 x double>* %vector_ptr
527   ///
528   Value *generateStrideOneLoad(const LoadInst *Load, ValueMapT &BBMap);
529 
530   /// @brief Load a vector initialized from a single scalar in memory
531   ///
532   /// In case all elements of a vector are initialized to the same
533   /// scalar value, this value is loaded and shuffeled into all elements
534   /// of the vector.
535   ///
536   /// %splat_one = load <1 x double>* %p
537   /// %splat = shufflevector <1 x double> %splat_one, <1 x
538   ///       double> %splat_one, <4 x i32> zeroinitializer
539   ///
540   Value *generateStrideZeroLoad(const LoadInst *Load, ValueMapT &BBMap);
541 
542   /// @Load a vector from scalars distributed in memory
543   ///
544   /// In case some scalars a distributed randomly in memory. Create a vector
545   /// by loading each scalar and by inserting one after the other into the
546   /// vector.
547   ///
548   /// %scalar_1= load double* %p_1
549   /// %vec_1 = insertelement <2 x double> undef, double %scalar_1, i32 0
550   /// %scalar 2 = load double* %p_2
551   /// %vec_2 = insertelement <2 x double> %vec_1, double %scalar_1, i32 1
552   ///
553   Value *generateUnknownStrideLoad(const LoadInst *Load,
554                                    VectorValueMapT &ScalarMaps);
555 
556   void generateLoad(const LoadInst *Load, ValueMapT &VectorMap,
557                     VectorValueMapT &ScalarMaps);
558 
559   void copyUnaryInst(const UnaryInstruction *Inst, ValueMapT &VectorMap,
560                      VectorValueMapT &ScalarMaps);
561 
562   void copyBinaryInst(const BinaryOperator *Inst, ValueMapT &VectorMap,
563                       VectorValueMapT &ScalarMaps);
564 
565   void copyStore(const StoreInst *Store, ValueMapT &VectorMap,
566                  VectorValueMapT &ScalarMaps);
567 
568   bool hasVectorOperands(const Instruction *Inst, ValueMapT &VectorMap);
569 
570   void copyInstruction(const Instruction *Inst, ValueMapT &VectorMap,
571                        VectorValueMapT &ScalarMaps);
572 
573   void copyBB();
574 };
575 
576 VectorBlockGenerator::VectorBlockGenerator(IRBuilder<> &B,
577   VectorValueMapT &GlobalMaps, ScopStmt &Stmt, __isl_keep isl_set *Domain,
578   Pass *P) : BlockGenerator(B, Stmt, P), GlobalMaps(GlobalMaps),
579   Domain(Domain) {
580     assert(GlobalMaps.size() > 1 && "Only one vector lane found");
581     assert(Domain && "No statement domain provided");
582   }
583 
584 Value *VectorBlockGenerator::getVectorValue(const Value *Old,
585                                             ValueMapT &VectorMap,
586                                             VectorValueMapT &ScalarMaps) {
587   if (VectorMap.count(Old))
588     return VectorMap[Old];
589 
590   int Width = getVectorWidth();
591 
592   Value *Vector = UndefValue::get(VectorType::get(Old->getType(), Width));
593 
594   for (int Lane = 0; Lane < Width; Lane++)
595     Vector = Builder.CreateInsertElement(Vector,
596                                          getNewValue(Old,
597                                                      ScalarMaps[Lane],
598                                                      GlobalMaps[Lane]),
599                                          Builder.getInt32(Lane));
600 
601   VectorMap[Old] = Vector;
602 
603   return Vector;
604 }
605 
606 Type *VectorBlockGenerator::getVectorPtrTy(const Value *Val, int Width) {
607   PointerType *PointerTy = dyn_cast<PointerType>(Val->getType());
608   assert(PointerTy && "PointerType expected");
609 
610   Type *ScalarType = PointerTy->getElementType();
611   VectorType *VectorType = VectorType::get(ScalarType, Width);
612 
613   return PointerType::getUnqual(VectorType);
614 }
615 
616 Value *VectorBlockGenerator::generateStrideOneLoad(const LoadInst *Load,
617                                                    ValueMapT &BBMap) {
618   const Value *Pointer = Load->getPointerOperand();
619   Type *VectorPtrType = getVectorPtrTy(Pointer, getVectorWidth());
620   Value *NewPointer = getNewValue(Pointer, BBMap, GlobalMaps[0]);
621   Value *VectorPtr = Builder.CreateBitCast(NewPointer, VectorPtrType,
622                                            "vector_ptr");
623   LoadInst *VecLoad = Builder.CreateLoad(VectorPtr,
624                                          Load->getName() + "_p_vec_full");
625   if (!Aligned)
626     VecLoad->setAlignment(8);
627 
628   return VecLoad;
629 }
630 
631 Value *VectorBlockGenerator::generateStrideZeroLoad(const LoadInst *Load,
632                                                     ValueMapT &BBMap) {
633   const Value *Pointer = Load->getPointerOperand();
634   Type *VectorPtrType = getVectorPtrTy(Pointer, 1);
635   Value *NewPointer = getNewValue(Pointer, BBMap, GlobalMaps[0]);
636   Value *VectorPtr = Builder.CreateBitCast(NewPointer, VectorPtrType,
637                                            Load->getName() + "_p_vec_p");
638   LoadInst *ScalarLoad= Builder.CreateLoad(VectorPtr,
639                                            Load->getName() + "_p_splat_one");
640 
641   if (!Aligned)
642     ScalarLoad->setAlignment(8);
643 
644   Constant *SplatVector =
645     Constant::getNullValue(VectorType::get(Builder.getInt32Ty(),
646                                            getVectorWidth()));
647 
648   Value *VectorLoad = Builder.CreateShuffleVector(ScalarLoad, ScalarLoad,
649                                                   SplatVector,
650                                                   Load->getName()
651                                                   + "_p_splat");
652   return VectorLoad;
653 }
654 
655 Value *VectorBlockGenerator::generateUnknownStrideLoad(const LoadInst *Load,
656   VectorValueMapT &ScalarMaps) {
657   int VectorWidth = getVectorWidth();
658   const Value *Pointer = Load->getPointerOperand();
659   VectorType *VectorType = VectorType::get(
660     dyn_cast<PointerType>(Pointer->getType())->getElementType(), VectorWidth);
661 
662   Value *Vector = UndefValue::get(VectorType);
663 
664   for (int i = 0; i < VectorWidth; i++) {
665     Value *NewPointer = getNewValue(Pointer, ScalarMaps[i], GlobalMaps[i]);
666     Value *ScalarLoad = Builder.CreateLoad(NewPointer,
667                                            Load->getName() + "_p_scalar_");
668     Vector = Builder.CreateInsertElement(Vector, ScalarLoad,
669                                          Builder.getInt32(i),
670                                          Load->getName() + "_p_vec_");
671   }
672 
673   return Vector;
674 }
675 
676 void VectorBlockGenerator::generateLoad(const LoadInst *Load,
677                                         ValueMapT &VectorMap,
678                                         VectorValueMapT &ScalarMaps) {
679   Value *NewLoad;
680 
681   MemoryAccess &Access = Statement.getAccessFor(Load);
682 
683   if (Access.isStrideZero(isl_set_copy(Domain)))
684     NewLoad = generateStrideZeroLoad(Load, ScalarMaps[0]);
685   else if (Access.isStrideOne(isl_set_copy(Domain)))
686     NewLoad = generateStrideOneLoad(Load, ScalarMaps[0]);
687   else
688     NewLoad = generateUnknownStrideLoad(Load, ScalarMaps);
689 
690   VectorMap[Load] = NewLoad;
691 }
692 
693 void VectorBlockGenerator::copyUnaryInst(const UnaryInstruction *Inst,
694                                          ValueMapT &VectorMap,
695                                          VectorValueMapT &ScalarMaps) {
696   int VectorWidth = getVectorWidth();
697   Value *NewOperand = getVectorValue(Inst->getOperand(0), VectorMap,
698                                      ScalarMaps);
699 
700   assert(isa<CastInst>(Inst) && "Can not generate vector code for instruction");
701 
702   const CastInst *Cast = dyn_cast<CastInst>(Inst);
703   VectorType *DestType = VectorType::get(Inst->getType(), VectorWidth);
704   VectorMap[Inst] = Builder.CreateCast(Cast->getOpcode(), NewOperand, DestType);
705 }
706 
707 void VectorBlockGenerator::copyBinaryInst(const BinaryOperator *Inst,
708                                           ValueMapT &VectorMap,
709                                           VectorValueMapT &ScalarMaps) {
710   Value *OpZero = Inst->getOperand(0);
711   Value *OpOne = Inst->getOperand(1);
712 
713   Value *NewOpZero, *NewOpOne;
714   NewOpZero = getVectorValue(OpZero, VectorMap, ScalarMaps);
715   NewOpOne = getVectorValue(OpOne, VectorMap, ScalarMaps);
716 
717   Value *NewInst = Builder.CreateBinOp(Inst->getOpcode(), NewOpZero,
718                                        NewOpOne,
719                                        Inst->getName() + "p_vec");
720   VectorMap[Inst] = NewInst;
721 }
722 
723 void VectorBlockGenerator::copyStore(const StoreInst *Store,
724                                      ValueMapT &VectorMap,
725                                      VectorValueMapT &ScalarMaps) {
726   int VectorWidth = getVectorWidth();
727 
728   MemoryAccess &Access = Statement.getAccessFor(Store);
729 
730   const Value *Pointer = Store->getPointerOperand();
731   Value *Vector = getVectorValue(Store->getValueOperand(), VectorMap,
732                                    ScalarMaps);
733 
734   if (Access.isStrideOne(isl_set_copy(Domain))) {
735     Type *VectorPtrType = getVectorPtrTy(Pointer, VectorWidth);
736     Value *NewPointer = getNewValue(Pointer, ScalarMaps[0], GlobalMaps[0]);
737 
738     Value *VectorPtr = Builder.CreateBitCast(NewPointer, VectorPtrType,
739                                              "vector_ptr");
740     StoreInst *Store = Builder.CreateStore(Vector, VectorPtr);
741 
742     if (!Aligned)
743       Store->setAlignment(8);
744   } else {
745     for (unsigned i = 0; i < ScalarMaps.size(); i++) {
746       Value *Scalar = Builder.CreateExtractElement(Vector,
747                                                    Builder.getInt32(i));
748       Value *NewPointer = getNewValue(Pointer, ScalarMaps[i], GlobalMaps[i]);
749       Builder.CreateStore(Scalar, NewPointer);
750     }
751   }
752 }
753 
754 bool VectorBlockGenerator::hasVectorOperands(const Instruction *Inst,
755                                              ValueMapT &VectorMap) {
756   for (Instruction::const_op_iterator OI = Inst->op_begin(),
757        OE = Inst->op_end(); OI != OE; ++OI)
758     if (VectorMap.count(*OI))
759       return true;
760   return false;
761 }
762 
763 int VectorBlockGenerator::getVectorWidth() {
764   return GlobalMaps.size();
765 }
766 
767 void VectorBlockGenerator::copyInstruction(const Instruction *Inst,
768                                            ValueMapT &VectorMap,
769                                            VectorValueMapT &ScalarMaps) {
770   // Terminator instructions control the control flow. They are explicitly
771   // expressed in the clast and do not need to be copied.
772   if (Inst->isTerminator())
773     return;
774 
775   if (const LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
776     generateLoad(Load, VectorMap, ScalarMaps);
777     return;
778   }
779 
780   if (hasVectorOperands(Inst, VectorMap)) {
781     if (const StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
782       copyStore(Store, VectorMap, ScalarMaps);
783       return;
784     }
785 
786     if (const UnaryInstruction *Unary = dyn_cast<UnaryInstruction>(Inst)) {
787       copyUnaryInst(Unary, VectorMap, ScalarMaps);
788       return;
789     }
790 
791     if (const BinaryOperator *Binary = dyn_cast<BinaryOperator>(Inst)) {
792       copyBinaryInst(Binary, VectorMap, ScalarMaps);
793       return;
794     }
795 
796     llvm_unreachable("Cannot issue vector code for this instruction");
797   }
798 
799   for (int VectorLane = 0; VectorLane < getVectorWidth(); VectorLane++)
800     copyInstScalar(Inst, ScalarMaps[VectorLane], GlobalMaps[VectorLane]);
801 }
802 
803 void VectorBlockGenerator::copyBB() {
804   BasicBlock *BB = Statement.getBasicBlock();
805   BasicBlock *CopyBB = SplitBlock(Builder.GetInsertBlock(),
806                                   Builder.GetInsertPoint(), P);
807   CopyBB->setName("polly.stmt." + BB->getName());
808   Builder.SetInsertPoint(CopyBB->begin());
809 
810   // Create two maps that store the mapping from the original instructions of
811   // the old basic block to their copies in the new basic block. Those maps
812   // are basic block local.
813   //
814   // As vector code generation is supported there is one map for scalar values
815   // and one for vector values.
816   //
817   // In case we just do scalar code generation, the vectorMap is not used and
818   // the scalarMap has just one dimension, which contains the mapping.
819   //
820   // In case vector code generation is done, an instruction may either appear
821   // in the vector map once (as it is calculating >vectorwidth< values at a
822   // time. Or (if the values are calculated using scalar operations), it
823   // appears once in every dimension of the scalarMap.
824   VectorValueMapT ScalarBlockMap(getVectorWidth());
825   ValueMapT VectorBlockMap;
826 
827   for (BasicBlock::const_iterator II = BB->begin(), IE = BB->end();
828        II != IE; ++II)
829       copyInstruction(II, VectorBlockMap, ScalarBlockMap);
830 }
831 
832 /// Class to generate LLVM-IR that calculates the value of a clast_expr.
833 class ClastExpCodeGen {
834   IRBuilder<> &Builder;
835   const CharMapT &IVS;
836 
837   Value *codegen(const clast_name *e, Type *Ty);
838   Value *codegen(const clast_term *e, Type *Ty);
839   Value *codegen(const clast_binary *e, Type *Ty);
840   Value *codegen(const clast_reduction *r, Type *Ty);
841 public:
842 
843   // A generator for clast expressions.
844   //
845   // @param B The IRBuilder that defines where the code to calculate the
846   //          clast expressions should be inserted.
847   // @param IVMAP A Map that translates strings describing the induction
848   //              variables to the Values* that represent these variables
849   //              on the LLVM side.
850   ClastExpCodeGen(IRBuilder<> &B, CharMapT &IVMap);
851 
852   // Generates code to calculate a given clast expression.
853   //
854   // @param e The expression to calculate.
855   // @return The Value that holds the result.
856   Value *codegen(const clast_expr *e, Type *Ty);
857 };
858 
859 Value *ClastExpCodeGen::codegen(const clast_name *e, Type *Ty) {
860   CharMapT::const_iterator I = IVS.find(e->name);
861 
862   assert(I != IVS.end() && "Clast name not found");
863 
864   return Builder.CreateSExtOrBitCast(I->second, Ty);
865 }
866 
867 Value *ClastExpCodeGen::codegen(const clast_term *e, Type *Ty) {
868   APInt a = APInt_from_MPZ(e->val);
869 
870   Value *ConstOne = ConstantInt::get(Builder.getContext(), a);
871   ConstOne = Builder.CreateSExtOrBitCast(ConstOne, Ty);
872 
873   if (!e->var)
874     return ConstOne;
875 
876   Value *var = codegen(e->var, Ty);
877   return Builder.CreateMul(ConstOne, var);
878 }
879 
880 Value *ClastExpCodeGen::codegen(const clast_binary *e, Type *Ty) {
881   Value *LHS = codegen(e->LHS, Ty);
882 
883   APInt RHS_AP = APInt_from_MPZ(e->RHS);
884 
885   Value *RHS = ConstantInt::get(Builder.getContext(), RHS_AP);
886   RHS = Builder.CreateSExtOrBitCast(RHS, Ty);
887 
888   switch (e->type) {
889   case clast_bin_mod:
890     return Builder.CreateSRem(LHS, RHS);
891   case clast_bin_fdiv:
892     {
893       // floord(n,d) ((n < 0) ? (n - d + 1) : n) / d
894       Value *One = ConstantInt::get(Ty, 1);
895       Value *Zero = ConstantInt::get(Ty, 0);
896       Value *Sum1 = Builder.CreateSub(LHS, RHS);
897       Value *Sum2 = Builder.CreateAdd(Sum1, One);
898       Value *isNegative = Builder.CreateICmpSLT(LHS, Zero);
899       Value *Dividend = Builder.CreateSelect(isNegative, Sum2, LHS);
900       return Builder.CreateSDiv(Dividend, RHS);
901     }
902   case clast_bin_cdiv:
903     {
904       // ceild(n,d) ((n < 0) ? n : (n + d - 1)) / d
905       Value *One = ConstantInt::get(Ty, 1);
906       Value *Zero = ConstantInt::get(Ty, 0);
907       Value *Sum1 = Builder.CreateAdd(LHS, RHS);
908       Value *Sum2 = Builder.CreateSub(Sum1, One);
909       Value *isNegative = Builder.CreateICmpSLT(LHS, Zero);
910       Value *Dividend = Builder.CreateSelect(isNegative, LHS, Sum2);
911       return Builder.CreateSDiv(Dividend, RHS);
912     }
913   case clast_bin_div:
914     return Builder.CreateSDiv(LHS, RHS);
915   };
916 
917   llvm_unreachable("Unknown clast binary expression type");
918 }
919 
920 Value *ClastExpCodeGen::codegen(const clast_reduction *r, Type *Ty) {
921   assert((   r->type == clast_red_min
922              || r->type == clast_red_max
923              || r->type == clast_red_sum)
924          && "Clast reduction type not supported");
925   Value *old = codegen(r->elts[0], Ty);
926 
927   for (int i=1; i < r->n; ++i) {
928     Value *exprValue = codegen(r->elts[i], Ty);
929 
930     switch (r->type) {
931     case clast_red_min:
932       {
933         Value *cmp = Builder.CreateICmpSLT(old, exprValue);
934         old = Builder.CreateSelect(cmp, old, exprValue);
935         break;
936       }
937     case clast_red_max:
938       {
939         Value *cmp = Builder.CreateICmpSGT(old, exprValue);
940         old = Builder.CreateSelect(cmp, old, exprValue);
941         break;
942       }
943     case clast_red_sum:
944       old = Builder.CreateAdd(old, exprValue);
945       break;
946     }
947   }
948 
949   return old;
950 }
951 
952 ClastExpCodeGen::ClastExpCodeGen(IRBuilder<> &B, CharMapT &IVMap)
953   : Builder(B), IVS(IVMap) {}
954 
955 Value *ClastExpCodeGen::codegen(const clast_expr *e, Type *Ty) {
956   switch(e->type) {
957   case clast_expr_name:
958     return codegen((const clast_name *)e, Ty);
959   case clast_expr_term:
960     return codegen((const clast_term *)e, Ty);
961   case clast_expr_bin:
962     return codegen((const clast_binary *)e, Ty);
963   case clast_expr_red:
964     return codegen((const clast_reduction *)e, Ty);
965   }
966 
967   llvm_unreachable("Unknown clast expression!");
968 }
969 
970 class ClastStmtCodeGen {
971 public:
972   const std::vector<std::string> &getParallelLoops();
973 
974 private:
975   // The Scop we code generate.
976   Scop *S;
977   Pass *P;
978 
979   // The Builder specifies the current location to code generate at.
980   IRBuilder<> &Builder;
981 
982   // Map the Values from the old code to their counterparts in the new code.
983   ValueMapT ValueMap;
984 
985   // clastVars maps from the textual representation of a clast variable to its
986   // current *Value. clast variables are scheduling variables, original
987   // induction variables or parameters. They are used either in loop bounds or
988   // to define the statement instance that is executed.
989   //
990   //   for (s = 0; s < n + 3; ++i)
991   //     for (t = s; t < m; ++j)
992   //       Stmt(i = s + 3 * m, j = t);
993   //
994   // {s,t,i,j,n,m} is the set of clast variables in this clast.
995   CharMapT ClastVars;
996 
997   // Codegenerator for clast expressions.
998   ClastExpCodeGen ExpGen;
999 
1000   // Do we currently generate parallel code?
1001   bool parallelCodeGeneration;
1002 
1003   std::vector<std::string> parallelLoops;
1004 
1005   void codegen(const clast_assignment *a);
1006 
1007   void codegen(const clast_assignment *a, ScopStmt *Statement,
1008                unsigned Dimension, int vectorDim,
1009                std::vector<ValueMapT> *VectorVMap = 0);
1010 
1011   void codegenSubstitutions(const clast_stmt *Assignment,
1012                             ScopStmt *Statement, int vectorDim = 0,
1013                             std::vector<ValueMapT> *VectorVMap = 0);
1014 
1015   void codegen(const clast_user_stmt *u, std::vector<Value*> *IVS = NULL,
1016                const char *iterator = NULL, isl_set *scatteringDomain = 0);
1017 
1018   void codegen(const clast_block *b);
1019 
1020   /// @brief Create a classical sequential loop.
1021   void codegenForSequential(const clast_for *f);
1022 
1023   /// @brief Create OpenMP structure values.
1024   ///
1025   /// Create a list of values that has to be stored into the OpenMP subfuncition
1026   /// structure.
1027   SetVector<Value*> getOMPValues();
1028 
1029   /// @brief Update the internal structures according to a Value Map.
1030   ///
1031   /// @param VMap     A map from old to new values.
1032   /// @param Reverse  If true, we assume the update should be reversed.
1033   void updateWithValueMap(OMPGenerator::ValueToValueMapTy &VMap,
1034                           bool Reverse);
1035 
1036   /// @brief Create an OpenMP parallel for loop.
1037   ///
1038   /// This loop reflects a loop as if it would have been created by an OpenMP
1039   /// statement.
1040   void codegenForOpenMP(const clast_for *f);
1041 
1042   bool isInnermostLoop(const clast_for *f);
1043 
1044   /// @brief Get the number of loop iterations for this loop.
1045   /// @param f The clast for loop to check.
1046   int getNumberOfIterations(const clast_for *f);
1047 
1048   /// @brief Create vector instructions for this loop.
1049   void codegenForVector(const clast_for *f);
1050 
1051   void codegen(const clast_for *f);
1052 
1053   Value *codegen(const clast_equation *eq);
1054 
1055   void codegen(const clast_guard *g);
1056 
1057   void codegen(const clast_stmt *stmt);
1058 
1059   void addParameters(const CloogNames *names);
1060 
1061   IntegerType *getIntPtrTy();
1062 
1063   public:
1064   void codegen(const clast_root *r);
1065 
1066   ClastStmtCodeGen(Scop *scop, IRBuilder<> &B, Pass *P);
1067 };
1068 }
1069 
1070 IntegerType *ClastStmtCodeGen::getIntPtrTy() {
1071   return P->getAnalysis<TargetData>().getIntPtrType(Builder.getContext());
1072 }
1073 
1074 const std::vector<std::string> &ClastStmtCodeGen::getParallelLoops() {
1075   return parallelLoops;
1076 }
1077 
1078 void ClastStmtCodeGen::codegen(const clast_assignment *a) {
1079   Value *V= ExpGen.codegen(a->RHS, getIntPtrTy());
1080   ClastVars[a->LHS] = V;
1081 }
1082 
1083 void ClastStmtCodeGen::codegen(const clast_assignment *a, ScopStmt *Statement,
1084                                unsigned Dimension, int vectorDim,
1085                                std::vector<ValueMapT> *VectorVMap) {
1086   Value *RHS = ExpGen.codegen(a->RHS, getIntPtrTy());
1087 
1088   assert(!a->LHS && "Statement assignments do not have left hand side");
1089   const PHINode *PN;
1090   PN = Statement->getInductionVariableForDimension(Dimension);
1091   const Value *V = PN;
1092 
1093   if (VectorVMap)
1094     (*VectorVMap)[vectorDim][V] = RHS;
1095 
1096   ValueMap[V] = RHS;
1097 }
1098 
1099 void ClastStmtCodeGen::codegenSubstitutions(const clast_stmt *Assignment,
1100                                              ScopStmt *Statement, int vectorDim,
1101   std::vector<ValueMapT> *VectorVMap) {
1102   int Dimension = 0;
1103 
1104   while (Assignment) {
1105     assert(CLAST_STMT_IS_A(Assignment, stmt_ass)
1106            && "Substitions are expected to be assignments");
1107     codegen((const clast_assignment *)Assignment, Statement, Dimension,
1108             vectorDim, VectorVMap);
1109     Assignment = Assignment->next;
1110     Dimension++;
1111   }
1112 }
1113 
1114 void ClastStmtCodeGen::codegen(const clast_user_stmt *u,
1115                                std::vector<Value*> *IVS , const char *iterator,
1116                                isl_set *Domain) {
1117   ScopStmt *Statement = (ScopStmt *)u->statement->usr;
1118 
1119   if (u->substitutions)
1120     codegenSubstitutions(u->substitutions, Statement);
1121 
1122   int VectorDimensions = IVS ? IVS->size() : 1;
1123 
1124   if (VectorDimensions == 1) {
1125     BlockGenerator::generate(Builder, *Statement, ValueMap, P);
1126     return;
1127   }
1128 
1129   VectorValueMapT VectorMap(VectorDimensions);
1130 
1131   if (IVS) {
1132     assert (u->substitutions && "Substitutions expected!");
1133     int i = 0;
1134     for (std::vector<Value*>::iterator II = IVS->begin(), IE = IVS->end();
1135          II != IE; ++II) {
1136       ClastVars[iterator] = *II;
1137       codegenSubstitutions(u->substitutions, Statement, i, &VectorMap);
1138       i++;
1139     }
1140   }
1141 
1142   VectorBlockGenerator::generate(Builder, *Statement, VectorMap, Domain, P);
1143 }
1144 
1145 void ClastStmtCodeGen::codegen(const clast_block *b) {
1146   if (b->body)
1147     codegen(b->body);
1148 }
1149 
1150 void ClastStmtCodeGen::codegenForSequential(const clast_for *f) {
1151   Value *LowerBound, *UpperBound, *IV, *Stride;
1152   BasicBlock *AfterBB;
1153   Type *IntPtrTy = getIntPtrTy();
1154 
1155   LowerBound = ExpGen.codegen(f->LB, IntPtrTy);
1156   UpperBound = ExpGen.codegen(f->UB, IntPtrTy);
1157   Stride = Builder.getInt(APInt_from_MPZ(f->stride));
1158 
1159   IV = createLoop(LowerBound, UpperBound, Stride, &Builder, P, &AfterBB);
1160 
1161   // Add loop iv to symbols.
1162   ClastVars[f->iterator] = IV;
1163 
1164   if (f->body)
1165     codegen(f->body);
1166 
1167   // Loop is finished, so remove its iv from the live symbols.
1168   ClastVars.erase(f->iterator);
1169   Builder.SetInsertPoint(AfterBB->begin());
1170 }
1171 
1172 SetVector<Value*> ClastStmtCodeGen::getOMPValues() {
1173   SetVector<Value*> Values;
1174 
1175   // The clast variables
1176   for (CharMapT::iterator I = ClastVars.begin(), E = ClastVars.end();
1177        I != E; I++)
1178     Values.insert(I->second);
1179 
1180   // The memory reference base addresses
1181   for (Scop::iterator SI = S->begin(), SE = S->end(); SI != SE; ++SI) {
1182     ScopStmt *Stmt = *SI;
1183     for (SmallVector<MemoryAccess*, 8>::iterator I = Stmt->memacc_begin(),
1184          E = Stmt->memacc_end(); I != E; ++I) {
1185       Value *BaseAddr = const_cast<Value*>((*I)->getBaseAddr());
1186       Values.insert((BaseAddr));
1187     }
1188   }
1189 
1190   return Values;
1191 }
1192 
1193 void ClastStmtCodeGen::updateWithValueMap(OMPGenerator::ValueToValueMapTy &VMap,
1194                                           bool Reverse) {
1195   std::set<Value*> Inserted;
1196 
1197   if (Reverse) {
1198     OMPGenerator::ValueToValueMapTy ReverseMap;
1199 
1200     for (std::map<Value*, Value*>::iterator I = VMap.begin(), E = VMap.end();
1201          I != E; ++I)
1202        ReverseMap.insert(std::make_pair(I->second, I->first));
1203 
1204     for (CharMapT::iterator I = ClastVars.begin(), E = ClastVars.end();
1205          I != E; I++) {
1206       ClastVars[I->first] = ReverseMap[I->second];
1207       Inserted.insert(I->second);
1208     }
1209 
1210     /// FIXME: At the moment we do not reverse the update of the ValueMap.
1211     ///        This is incomplet, but the failure should be obvious, such that
1212     ///        we can fix this later.
1213     return;
1214   }
1215 
1216   for (CharMapT::iterator I = ClastVars.begin(), E = ClastVars.end();
1217        I != E; I++) {
1218     ClastVars[I->first] = VMap[I->second];
1219     Inserted.insert(I->second);
1220   }
1221 
1222   for (std::map<Value*, Value*>::iterator I = VMap.begin(), E = VMap.end();
1223        I != E; ++I) {
1224     if (Inserted.count(I->first))
1225       continue;
1226 
1227     ValueMap[I->first] = I->second;
1228   }
1229 }
1230 
1231 static void clearDomtree(Function *F, DominatorTree &DT) {
1232   DomTreeNode *N = DT.getNode(&F->getEntryBlock());
1233   std::vector<BasicBlock*> Nodes;
1234   for (po_iterator<DomTreeNode*> I = po_begin(N), E = po_end(N); I != E; ++I)
1235     Nodes.push_back(I->getBlock());
1236 
1237   for (std::vector<BasicBlock*>::iterator I = Nodes.begin(), E = Nodes.end();
1238        I != E; ++I)
1239     DT.eraseNode(*I);
1240 }
1241 
1242 void ClastStmtCodeGen::codegenForOpenMP(const clast_for *For) {
1243   Value *Stride, *LB, *UB, *IV;
1244   BasicBlock::iterator LoopBody;
1245   IntegerType *IntPtrTy = getIntPtrTy();
1246   SetVector<Value*> Values;
1247   OMPGenerator::ValueToValueMapTy VMap;
1248   OMPGenerator OMPGen(Builder, P);
1249 
1250   Stride = Builder.getInt(APInt_from_MPZ(For->stride));
1251   Stride = Builder.CreateSExtOrBitCast(Stride, IntPtrTy);
1252   LB = ExpGen.codegen(For->LB, IntPtrTy);
1253   UB = ExpGen.codegen(For->UB, IntPtrTy);
1254 
1255   Values = getOMPValues();
1256 
1257   IV = OMPGen.createParallelLoop(LB, UB, Stride, Values, VMap, &LoopBody);
1258   BasicBlock::iterator AfterLoop = Builder.GetInsertPoint();
1259   Builder.SetInsertPoint(LoopBody);
1260 
1261   updateWithValueMap(VMap, /* reverse */ false);
1262   ClastVars[For->iterator] = IV;
1263 
1264   if (For->body)
1265     codegen(For->body);
1266 
1267   ClastVars.erase(For->iterator);
1268   updateWithValueMap(VMap, /* reverse */ true);
1269 
1270   clearDomtree((*LoopBody).getParent()->getParent(),
1271                P->getAnalysis<DominatorTree>());
1272 
1273   Builder.SetInsertPoint(AfterLoop);
1274 }
1275 
1276 bool ClastStmtCodeGen::isInnermostLoop(const clast_for *f) {
1277   const clast_stmt *stmt = f->body;
1278 
1279   while (stmt) {
1280     if (!CLAST_STMT_IS_A(stmt, stmt_user))
1281       return false;
1282 
1283     stmt = stmt->next;
1284   }
1285 
1286   return true;
1287 }
1288 
1289 int ClastStmtCodeGen::getNumberOfIterations(const clast_for *f) {
1290   isl_set *loopDomain = isl_set_copy(isl_set_from_cloog_domain(f->domain));
1291   isl_set *tmp = isl_set_copy(loopDomain);
1292 
1293   // Calculate a map similar to the identity map, but with the last input
1294   // and output dimension not related.
1295   //  [i0, i1, i2, i3] -> [i0, i1, i2, o0]
1296   isl_space *Space = isl_set_get_space(loopDomain);
1297   Space = isl_space_drop_outputs(Space,
1298                                  isl_set_dim(loopDomain, isl_dim_set) - 2, 1);
1299   Space = isl_space_map_from_set(Space);
1300   isl_map *identity = isl_map_identity(Space);
1301   identity = isl_map_add_dims(identity, isl_dim_in, 1);
1302   identity = isl_map_add_dims(identity, isl_dim_out, 1);
1303 
1304   isl_map *map = isl_map_from_domain_and_range(tmp, loopDomain);
1305   map = isl_map_intersect(map, identity);
1306 
1307   isl_map *lexmax = isl_map_lexmax(isl_map_copy(map));
1308   isl_map *lexmin = isl_map_lexmin(map);
1309   isl_map *sub = isl_map_sum(lexmax, isl_map_neg(lexmin));
1310 
1311   isl_set *elements = isl_map_range(sub);
1312 
1313   if (!isl_set_is_singleton(elements)) {
1314     isl_set_free(elements);
1315     return -1;
1316   }
1317 
1318   isl_point *p = isl_set_sample_point(elements);
1319 
1320   isl_int v;
1321   isl_int_init(v);
1322   isl_point_get_coordinate(p, isl_dim_set, isl_set_n_dim(loopDomain) - 1, &v);
1323   int numberIterations = isl_int_get_si(v);
1324   isl_int_clear(v);
1325   isl_point_free(p);
1326 
1327   return (numberIterations) / isl_int_get_si(f->stride) + 1;
1328 }
1329 
1330 void ClastStmtCodeGen::codegenForVector(const clast_for *F) {
1331   DEBUG(dbgs() << "Vectorizing loop '" << F->iterator << "'\n";);
1332   int VectorWidth = getNumberOfIterations(F);
1333 
1334   Value *LB = ExpGen.codegen(F->LB, getIntPtrTy());
1335 
1336   APInt Stride = APInt_from_MPZ(F->stride);
1337   IntegerType *LoopIVType = dyn_cast<IntegerType>(LB->getType());
1338   Stride =  Stride.zext(LoopIVType->getBitWidth());
1339   Value *StrideValue = ConstantInt::get(LoopIVType, Stride);
1340 
1341   std::vector<Value*> IVS(VectorWidth);
1342   IVS[0] = LB;
1343 
1344   for (int i = 1; i < VectorWidth; i++)
1345     IVS[i] = Builder.CreateAdd(IVS[i-1], StrideValue, "p_vector_iv");
1346 
1347   isl_set *Domain = isl_set_from_cloog_domain(F->domain);
1348 
1349   // Add loop iv to symbols.
1350   ClastVars[F->iterator] = LB;
1351 
1352   const clast_stmt *Stmt = F->body;
1353 
1354   while (Stmt) {
1355     codegen((const clast_user_stmt *)Stmt, &IVS, F->iterator,
1356             isl_set_copy(Domain));
1357     Stmt = Stmt->next;
1358   }
1359 
1360   // Loop is finished, so remove its iv from the live symbols.
1361   isl_set_free(Domain);
1362   ClastVars.erase(F->iterator);
1363 }
1364 
1365 void ClastStmtCodeGen::codegen(const clast_for *f) {
1366   if ((Vector || OpenMP) && P->getAnalysis<Dependences>().isParallelFor(f)) {
1367     if (Vector && isInnermostLoop(f) && (-1 != getNumberOfIterations(f))
1368         && (getNumberOfIterations(f) <= 16)) {
1369       codegenForVector(f);
1370       return;
1371     }
1372 
1373     if (OpenMP && !parallelCodeGeneration) {
1374       parallelCodeGeneration = true;
1375       parallelLoops.push_back(f->iterator);
1376       codegenForOpenMP(f);
1377       parallelCodeGeneration = false;
1378       return;
1379     }
1380   }
1381 
1382   codegenForSequential(f);
1383 }
1384 
1385 Value *ClastStmtCodeGen::codegen(const clast_equation *eq) {
1386   Value *LHS = ExpGen.codegen(eq->LHS, getIntPtrTy());
1387   Value *RHS = ExpGen.codegen(eq->RHS, getIntPtrTy());
1388   CmpInst::Predicate P;
1389 
1390   if (eq->sign == 0)
1391     P = ICmpInst::ICMP_EQ;
1392   else if (eq->sign > 0)
1393     P = ICmpInst::ICMP_SGE;
1394   else
1395     P = ICmpInst::ICMP_SLE;
1396 
1397   return Builder.CreateICmp(P, LHS, RHS);
1398 }
1399 
1400 void ClastStmtCodeGen::codegen(const clast_guard *g) {
1401   Function *F = Builder.GetInsertBlock()->getParent();
1402   LLVMContext &Context = F->getContext();
1403 
1404   BasicBlock *CondBB = SplitBlock(Builder.GetInsertBlock(),
1405                                       Builder.GetInsertPoint(), P);
1406   CondBB->setName("polly.cond");
1407   BasicBlock *MergeBB = SplitBlock(CondBB, CondBB->begin(), P);
1408   MergeBB->setName("polly.merge");
1409   BasicBlock *ThenBB = BasicBlock::Create(Context, "polly.then", F);
1410 
1411   DominatorTree &DT = P->getAnalysis<DominatorTree>();
1412   DT.addNewBlock(ThenBB, CondBB);
1413   DT.changeImmediateDominator(MergeBB, CondBB);
1414 
1415   CondBB->getTerminator()->eraseFromParent();
1416 
1417   Builder.SetInsertPoint(CondBB);
1418 
1419   Value *Predicate = codegen(&(g->eq[0]));
1420 
1421   for (int i = 1; i < g->n; ++i) {
1422     Value *TmpPredicate = codegen(&(g->eq[i]));
1423     Predicate = Builder.CreateAnd(Predicate, TmpPredicate);
1424   }
1425 
1426   Builder.CreateCondBr(Predicate, ThenBB, MergeBB);
1427   Builder.SetInsertPoint(ThenBB);
1428   Builder.CreateBr(MergeBB);
1429   Builder.SetInsertPoint(ThenBB->begin());
1430 
1431   codegen(g->then);
1432 
1433   Builder.SetInsertPoint(MergeBB->begin());
1434 }
1435 
1436 void ClastStmtCodeGen::codegen(const clast_stmt *stmt) {
1437   if	    (CLAST_STMT_IS_A(stmt, stmt_root))
1438     assert(false && "No second root statement expected");
1439   else if (CLAST_STMT_IS_A(stmt, stmt_ass))
1440     codegen((const clast_assignment *)stmt);
1441   else if (CLAST_STMT_IS_A(stmt, stmt_user))
1442     codegen((const clast_user_stmt *)stmt);
1443   else if (CLAST_STMT_IS_A(stmt, stmt_block))
1444     codegen((const clast_block *)stmt);
1445   else if (CLAST_STMT_IS_A(stmt, stmt_for))
1446     codegen((const clast_for *)stmt);
1447   else if (CLAST_STMT_IS_A(stmt, stmt_guard))
1448     codegen((const clast_guard *)stmt);
1449 
1450   if (stmt->next)
1451     codegen(stmt->next);
1452 }
1453 
1454 void ClastStmtCodeGen::addParameters(const CloogNames *names) {
1455   SCEVExpander Rewriter(P->getAnalysis<ScalarEvolution>(), "polly");
1456 
1457   int i = 0;
1458   for (Scop::param_iterator PI = S->param_begin(), PE = S->param_end();
1459        PI != PE; ++PI) {
1460     assert(i < names->nb_parameters && "Not enough parameter names");
1461 
1462     const SCEV *Param = *PI;
1463     Type *Ty = Param->getType();
1464 
1465     Instruction *insertLocation = --(Builder.GetInsertBlock()->end());
1466     Value *V = Rewriter.expandCodeFor(Param, Ty, insertLocation);
1467     ClastVars[names->parameters[i]] = V;
1468 
1469     ++i;
1470   }
1471 }
1472 
1473 void ClastStmtCodeGen::codegen(const clast_root *r) {
1474   addParameters(r->names);
1475 
1476   parallelCodeGeneration = false;
1477 
1478   const clast_stmt *stmt = (const clast_stmt*) r;
1479   if (stmt->next)
1480     codegen(stmt->next);
1481 }
1482 
1483 ClastStmtCodeGen::ClastStmtCodeGen(Scop *scop, IRBuilder<> &B, Pass *P) :
1484     S(scop), P(P), Builder(B), ExpGen(Builder, ClastVars) {}
1485 
1486 namespace {
1487 class CodeGeneration : public ScopPass {
1488   Region *region;
1489   Scop *S;
1490   DominatorTree *DT;
1491   RegionInfo *RI;
1492 
1493   std::vector<std::string> parallelLoops;
1494 
1495   public:
1496   static char ID;
1497 
1498   CodeGeneration() : ScopPass(ID) {}
1499 
1500   // Split the entry edge of the region and generate a new basic block on this
1501   // edge. This function also updates ScopInfo and RegionInfo.
1502   //
1503   // @param region The region where the entry edge will be splitted.
1504   BasicBlock *splitEdgeAdvanced(Region *region) {
1505     BasicBlock *newBlock;
1506     BasicBlock *splitBlock;
1507 
1508     newBlock = SplitEdge(region->getEnteringBlock(), region->getEntry(), this);
1509 
1510     if (DT->dominates(region->getEntry(), newBlock)) {
1511       BasicBlock *OldBlock = region->getEntry();
1512       std::string OldName = OldBlock->getName();
1513 
1514       // Update ScopInfo.
1515       for (Scop::iterator SI = S->begin(), SE = S->end(); SI != SE; ++SI)
1516         if ((*SI)->getBasicBlock() == OldBlock) {
1517           (*SI)->setBasicBlock(newBlock);
1518           break;
1519         }
1520 
1521       // Update RegionInfo.
1522       splitBlock = OldBlock;
1523       OldBlock->setName("polly.split");
1524       newBlock->setName(OldName);
1525       region->replaceEntry(newBlock);
1526       RI->setRegionFor(newBlock, region);
1527     } else {
1528       RI->setRegionFor(newBlock, region->getParent());
1529       splitBlock = newBlock;
1530     }
1531 
1532     return splitBlock;
1533   }
1534 
1535   // Create a split block that branches either to the old code or to a new basic
1536   // block where the new code can be inserted.
1537   //
1538   // @param Builder A builder that will be set to point to a basic block, where
1539   //                the new code can be generated.
1540   // @return The split basic block.
1541   BasicBlock *addSplitAndStartBlock(IRBuilder<> *Builder) {
1542     BasicBlock *StartBlock, *SplitBlock;
1543 
1544     SplitBlock = splitEdgeAdvanced(region);
1545     SplitBlock->setName("polly.split_new_and_old");
1546     Function *F = SplitBlock->getParent();
1547     StartBlock = BasicBlock::Create(F->getContext(), "polly.start", F);
1548     SplitBlock->getTerminator()->eraseFromParent();
1549     Builder->SetInsertPoint(SplitBlock);
1550     Builder->CreateCondBr(Builder->getTrue(), StartBlock, region->getEntry());
1551     DT->addNewBlock(StartBlock, SplitBlock);
1552     Builder->SetInsertPoint(StartBlock);
1553     return SplitBlock;
1554   }
1555 
1556   // Merge the control flow of the newly generated code with the existing code.
1557   //
1558   // @param SplitBlock The basic block where the control flow was split between
1559   //                   old and new version of the Scop.
1560   // @param Builder    An IRBuilder that points to the last instruction of the
1561   //                   newly generated code.
1562   void mergeControlFlow(BasicBlock *SplitBlock, IRBuilder<> *Builder) {
1563     BasicBlock *MergeBlock;
1564     Region *R = region;
1565 
1566     if (R->getExit()->getSinglePredecessor())
1567       // No splitEdge required.  A block with a single predecessor cannot have
1568       // PHI nodes that would complicate life.
1569       MergeBlock = R->getExit();
1570     else {
1571       MergeBlock = SplitEdge(R->getExitingBlock(), R->getExit(), this);
1572       // SplitEdge will never split R->getExit(), as R->getExit() has more than
1573       // one predecessor. Hence, mergeBlock is always a newly generated block.
1574       R->replaceExit(MergeBlock);
1575     }
1576 
1577     Builder->CreateBr(MergeBlock);
1578     MergeBlock->setName("polly.merge_new_and_old");
1579 
1580     if (DT->dominates(SplitBlock, MergeBlock))
1581       DT->changeImmediateDominator(MergeBlock, SplitBlock);
1582   }
1583 
1584   bool runOnScop(Scop &scop) {
1585     S = &scop;
1586     region = &S->getRegion();
1587     DT = &getAnalysis<DominatorTree>();
1588     RI = &getAnalysis<RegionInfo>();
1589 
1590     parallelLoops.clear();
1591 
1592     assert(region->isSimple() && "Only simple regions are supported");
1593 
1594     // In the CFG the optimized code of the SCoP is generated next to the
1595     // original code. Both the new and the original version of the code remain
1596     // in the CFG. A branch statement decides which version is executed.
1597     // For now, we always execute the new version (the old one is dead code
1598     // eliminated by the cleanup passes). In the future we may decide to execute
1599     // the new version only if certain run time checks succeed. This will be
1600     // useful to support constructs for which we cannot prove all assumptions at
1601     // compile time.
1602     //
1603     // Before transformation:
1604     //
1605     //                        bb0
1606     //                         |
1607     //                     orig_scop
1608     //                         |
1609     //                        bb1
1610     //
1611     // After transformation:
1612     //                        bb0
1613     //                         |
1614     //                  polly.splitBlock
1615     //                     /       \.
1616     //                     |     startBlock
1617     //                     |        |
1618     //               orig_scop   new_scop
1619     //                     \      /
1620     //                      \    /
1621     //                        bb1 (joinBlock)
1622     IRBuilder<> builder(region->getEntry());
1623 
1624     // The builder will be set to startBlock.
1625     BasicBlock *splitBlock = addSplitAndStartBlock(&builder);
1626     BasicBlock *StartBlock = builder.GetInsertBlock();
1627 
1628     mergeControlFlow(splitBlock, &builder);
1629     builder.SetInsertPoint(StartBlock->begin());
1630 
1631     ClastStmtCodeGen CodeGen(S, builder, this);
1632     CloogInfo &C = getAnalysis<CloogInfo>();
1633     CodeGen.codegen(C.getClast());
1634 
1635     parallelLoops.insert(parallelLoops.begin(),
1636                          CodeGen.getParallelLoops().begin(),
1637                          CodeGen.getParallelLoops().end());
1638 
1639     return true;
1640   }
1641 
1642   virtual void printScop(raw_ostream &OS) const {
1643     for (std::vector<std::string>::const_iterator PI = parallelLoops.begin(),
1644          PE = parallelLoops.end(); PI != PE; ++PI)
1645       OS << "Parallel loop with iterator '" << *PI << "' generated\n";
1646   }
1647 
1648   virtual void getAnalysisUsage(AnalysisUsage &AU) const {
1649     AU.addRequired<CloogInfo>();
1650     AU.addRequired<Dependences>();
1651     AU.addRequired<DominatorTree>();
1652     AU.addRequired<RegionInfo>();
1653     AU.addRequired<ScalarEvolution>();
1654     AU.addRequired<ScopDetection>();
1655     AU.addRequired<ScopInfo>();
1656     AU.addRequired<TargetData>();
1657 
1658     AU.addPreserved<CloogInfo>();
1659     AU.addPreserved<Dependences>();
1660 
1661     // FIXME: We do not create LoopInfo for the newly generated loops.
1662     AU.addPreserved<LoopInfo>();
1663     AU.addPreserved<DominatorTree>();
1664     AU.addPreserved<ScopDetection>();
1665     AU.addPreserved<ScalarEvolution>();
1666 
1667     // FIXME: We do not yet add regions for the newly generated code to the
1668     //        region tree.
1669     AU.addPreserved<RegionInfo>();
1670     AU.addPreserved<TempScopInfo>();
1671     AU.addPreserved<ScopInfo>();
1672     AU.addPreservedID(IndependentBlocksID);
1673   }
1674 };
1675 }
1676 
1677 char CodeGeneration::ID = 1;
1678 
1679 INITIALIZE_PASS_BEGIN(CodeGeneration, "polly-codegen",
1680                       "Polly - Create LLVM-IR from SCoPs", false, false)
1681 INITIALIZE_PASS_DEPENDENCY(CloogInfo)
1682 INITIALIZE_PASS_DEPENDENCY(Dependences)
1683 INITIALIZE_PASS_DEPENDENCY(DominatorTree)
1684 INITIALIZE_PASS_DEPENDENCY(RegionInfo)
1685 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
1686 INITIALIZE_PASS_DEPENDENCY(ScopDetection)
1687 INITIALIZE_PASS_DEPENDENCY(TargetData)
1688 INITIALIZE_PASS_END(CodeGeneration, "polly-codegen",
1689                       "Polly - Create LLVM-IR from SCoPs", false, false)
1690 
1691 Pass *polly::createCodeGenerationPass() {
1692   return new CodeGeneration();
1693 }
1694