1 //===------ CodeGeneration.cpp - Code generate the Scops. -----------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // The CodeGeneration pass takes a Scop created by ScopInfo and translates it
11 // back to LLVM-IR using Cloog.
12 //
13 // The Scop describes the high level memory behaviour of a control flow region.
14 // Transformation passes can update the schedule (execution order) of statements
15 // in the Scop. Cloog is used to generate an abstract syntax tree (clast) that
16 // reflects the updated execution order. This clast is used to create new
// LLVM-IR that is computationally equivalent to the original control flow
18 // but executes its code in the new execution order defined by the changed
19 // scattering.
20 //
21 //===----------------------------------------------------------------------===//
22 
23 #define DEBUG_TYPE "polly-codegen"
24 
25 #include "polly/Cloog.h"
26 #include "polly/CodeGeneration.h"
27 #include "polly/Dependences.h"
28 #include "polly/LinkAllPasses.h"
29 #include "polly/ScopInfo.h"
30 #include "polly/TempScopInfo.h"
31 #include "polly/Support/GICHelper.h"
32 #include "polly/LoopGenerators.h"
33 
34 #include "llvm/Module.h"
35 #include "llvm/ADT/SetVector.h"
36 #include "llvm/ADT/PostOrderIterator.h"
37 #include "llvm/Analysis/LoopInfo.h"
38 #include "llvm/Analysis/ScalarEvolutionExpander.h"
39 #include "llvm/Support/CommandLine.h"
40 #include "llvm/Support/Debug.h"
41 #include "llvm/Support/IRBuilder.h"
42 #include "llvm/Target/TargetData.h"
43 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
44 
45 #define CLOOG_INT_GMP 1
46 #include "cloog/cloog.h"
47 #include "cloog/isl/cloog.h"
48 
49 #include "isl/aff.h"
50 
51 #include <vector>
52 #include <utility>
53 
54 using namespace polly;
55 using namespace llvm;
56 
57 struct isl_set;
58 
59 namespace polly {
60 
61 bool EnablePollyVector;
62 
63 static cl::opt<bool, true>
64 Vector("enable-polly-vector",
65        cl::desc("Enable polly vector code generation"), cl::Hidden,
66        cl::location(EnablePollyVector), cl::init(false), cl::ZeroOrMore);
67 
68 static cl::opt<bool>
69 OpenMP("enable-polly-openmp",
70        cl::desc("Generate OpenMP parallel code"), cl::Hidden,
71        cl::value_desc("OpenMP code generation enabled if true"),
72        cl::init(false), cl::ZeroOrMore);
73 
74 static cl::opt<bool>
75 AtLeastOnce("enable-polly-atLeastOnce",
76        cl::desc("Give polly the hint, that every loop is executed at least"
77                 "once"), cl::Hidden,
78        cl::value_desc("OpenMP code generation enabled if true"),
79        cl::init(false), cl::ZeroOrMore);
80 
81 static cl::opt<bool>
82 Aligned("enable-polly-aligned",
83        cl::desc("Assumed aligned memory accesses."), cl::Hidden,
84        cl::value_desc("OpenMP code generation enabled if true"),
85        cl::init(false), cl::ZeroOrMore);
86 
87 static cl::opt<bool>
88 GroupedUnrolling("enable-polly-grouped-unroll",
89                  cl::desc("Perform grouped unrolling, but don't generate SIMD "
90                           "instuctions"), cl::Hidden, cl::init(false),
91                  cl::ZeroOrMore);
92 
93 typedef DenseMap<const Value*, Value*> ValueMapT;
94 typedef DenseMap<const char*, Value*> CharMapT;
95 typedef std::vector<ValueMapT> VectorValueMapT;
96 
/// @brief Helper that creates LLVM-IR evaluating isl affine expressions.
///
/// The expressions are evaluated at the (new) induction variable values
/// provided in IVS; all arithmetic is emitted through the given Builder.
class IslGenerator {
public:
  IslGenerator(IRBuilder<> &Builder, std::vector<Value *> &IVS) :
    Builder(Builder), IVS(IVS) {}
  /// @brief Create an LLVM-IR integer constant from an isl integer.
  Value *generateIslInt(__isl_take isl_int Int);
  /// @brief Create LLVM-IR computing the value of an affine expression.
  Value *generateIslAff(__isl_take isl_aff *Aff);
  /// @brief Create LLVM-IR for a piecewise affine expression.
  ///
  /// Currently only a single piece defined on the universe set is supported
  /// (enforced by asserts in mergeIslAffValues).
  Value *generateIslPwAff(__isl_take isl_pw_aff *PwAff);

private:
  // Carries the generated value from the per-piece callback back to
  // generateIslPwAff().
  typedef struct {
    Value *Result;
    class IslGenerator *Generator;
  } IslGenInfo;

  IRBuilder<> &Builder;
  // New induction variable values, one per input dimension of the affine
  // expressions passed in.
  std::vector<Value *> &IVS;
  // Callback for isl_pw_aff_foreach_piece(); code generates one piece.
  static int mergeIslAffValues(__isl_take isl_set *Set,
                               __isl_take isl_aff *Aff, void *User);
};
116 
117 Value *IslGenerator::generateIslInt(isl_int Int) {
118   mpz_t IntMPZ;
119   mpz_init(IntMPZ);
120   isl_int_get_gmp(Int, IntMPZ);
121   Value *IntValue = Builder.getInt(APInt_from_MPZ(IntMPZ));
122   mpz_clear(IntMPZ);
123   return IntValue;
124 }
125 
126 Value *IslGenerator::generateIslAff(__isl_take isl_aff *Aff) {
127   Value *Result;
128   Value *ConstValue;
129   isl_int ConstIsl;
130 
131   isl_int_init(ConstIsl);
132   isl_aff_get_constant(Aff, &ConstIsl);
133   ConstValue = generateIslInt(ConstIsl);
134   Type *Ty = Builder.getInt64Ty();
135 
136   // FIXME: We should give the constant and coefficients the right type. Here
137   // we force it into i64.
138   Result = Builder.CreateSExtOrBitCast(ConstValue, Ty);
139 
140   unsigned int NbInputDims = isl_aff_dim(Aff, isl_dim_in);
141 
142   assert((IVS.size() == NbInputDims) && "The Dimension of Induction Variables"
143          "must match the dimension of the affine space.");
144 
145   isl_int CoefficientIsl;
146   isl_int_init(CoefficientIsl);
147 
148   for (unsigned int i = 0; i < NbInputDims; ++i) {
149     Value *CoefficientValue;
150     isl_aff_get_coefficient(Aff, isl_dim_in, i, &CoefficientIsl);
151 
152     if (isl_int_is_zero(CoefficientIsl))
153       continue;
154 
155     CoefficientValue = generateIslInt(CoefficientIsl);
156     CoefficientValue = Builder.CreateIntCast(CoefficientValue, Ty, true);
157     Value *IV = Builder.CreateIntCast(IVS[i], Ty, true);
158     Value *PAdd = Builder.CreateMul(CoefficientValue, IV, "p_mul_coeff");
159     Result = Builder.CreateAdd(Result, PAdd, "p_sum_coeff");
160   }
161 
162   isl_int_clear(CoefficientIsl);
163   isl_int_clear(ConstIsl);
164   isl_aff_free(Aff);
165 
166   return Result;
167 }
168 
169 int IslGenerator::mergeIslAffValues(__isl_take isl_set *Set,
170                                     __isl_take isl_aff *Aff, void *User) {
171   IslGenInfo *GenInfo = (IslGenInfo *)User;
172 
173   assert((GenInfo->Result == NULL) && "Result is already set."
174          "Currently only single isl_aff is supported");
175   assert(isl_set_plain_is_universe(Set)
176          && "Code generation failed because the set is not universe");
177 
178   GenInfo->Result = GenInfo->Generator->generateIslAff(Aff);
179 
180   isl_set_free(Set);
181   return 0;
182 }
183 
184 Value *IslGenerator::generateIslPwAff(__isl_take isl_pw_aff *PwAff) {
185   IslGenInfo User;
186   User.Result = NULL;
187   User.Generator = this;
188   isl_pw_aff_foreach_piece(PwAff, mergeIslAffValues, &User);
189   assert(User.Result && "Code generation for isl_pw_aff failed");
190 
191   isl_pw_aff_free(PwAff);
192   return User.Result;
193 }
194 
/// @brief Generate a new basic block for a polyhedral statement.
///
/// The only public function exposed is generate().
class BlockGenerator {
public:
  /// @brief Generate a new BasicBlock for a ScopStmt.
  ///
  /// @param Builder   The LLVM-IR Builder used to generate the statement. The
  ///                  code is generated at the location, the Builder points to.
  /// @param Stmt      The statement to code generate.
  /// @param GlobalMap A map that defines for certain Values referenced from the
  ///                  original code new Values they should be replaced with.
  /// @param P         A reference to the pass this function is called from.
  ///                  The pass is needed to update other analysis.
  static void generate(IRBuilder<> &Builder, ScopStmt &Stmt,
                       ValueMapT &GlobalMap, Pass *P) {
    BlockGenerator Generator(Builder, Stmt, P);
    Generator.copyBB(GlobalMap);
  }

protected:
  // Builder used to emit all new instructions.
  IRBuilder<> &Builder;
  // The polyhedral statement whose basic block is being copied.
  ScopStmt &Statement;
  // The pass that triggered code generation; needed to keep analyses updated.
  Pass *P;

  BlockGenerator(IRBuilder<> &B, ScopStmt &Stmt, Pass *P);

  /// @brief Get the new version of a Value.
  ///
  /// @param Old       The old Value.
  /// @param BBMap     A mapping from old values to their new values
  ///                  (for values recalculated within this basic block).
  /// @param GlobalMap A mapping from old values to their new values
  ///                  (for values recalculated in the new ScoP, but not
  ///                   within this basic block).
  ///
  /// @returns  o The old value, if it is still valid.
  ///           o The new value, if available.
  ///           o NULL, if no value is found.
  Value *getNewValue(const Value *Old, ValueMapT &BBMap, ValueMapT &GlobalMap);

  /// @brief Clone a single non-memory instruction and remap its operands.
  void copyInstScalar(const Instruction *Inst, ValueMapT &BBMap,
                      ValueMapT &GlobalMap);

  /// @brief Get the memory access offset to be added to the base address
  std::vector<Value*> getMemoryAccessIndex(__isl_keep isl_map *AccessRelation,
                                           Value *BaseAddress, ValueMapT &BBMap,
                                           ValueMapT &GlobalMap);

  /// @brief Get the new operand address according to the changed access in
  ///        JSCOP file.
  Value *getNewAccessOperand(__isl_keep isl_map *NewAccessRelation,
                             Value *BaseAddress, ValueMapT &BBMap,
                             ValueMapT &GlobalMap);

  /// @brief Generate the operand address
  Value *generateLocationAccessed(const Instruction *Inst,
                                  const Value *Pointer, ValueMapT &BBMap,
                                  ValueMapT &GlobalMap);

  /// @brief Create a load from the (possibly transformed) location the
  ///        original load accessed.
  Value *generateScalarLoad(const LoadInst *load, ValueMapT &BBMap,
                            ValueMapT &GlobalMap);

  /// @brief Create a store to the (possibly transformed) location the
  ///        original store wrote to.
  Value *generateScalarStore(const StoreInst *store, ValueMapT &BBMap,
                             ValueMapT &GlobalMap);

  /// @brief Copy a single Instruction.
  ///
  /// This copies a single Instruction and updates references to old values
  /// with references to new values, as defined by GlobalMap and BBMap.
  ///
  /// @param BBMap     A mapping from old values to their new values
  ///                  (for values recalculated within this basic block).
  /// @param GlobalMap A mapping from old values to their new values
  ///                  (for values recalculated in the new ScoP, but not
  ///                  within this basic block).
  void copyInstruction(const Instruction *Inst, ValueMapT &BBMap,
                       ValueMapT &GlobalMap);

  /// @brief Copy the basic block.
  ///
  /// This copies the entire basic block and updates references to old values
  /// with references to new values, as defined by GlobalMap.
  ///
  /// @param GlobalMap A mapping from old values to their new values
  ///                  (for values recalculated in the new ScoP, but not
  ///                  within this basic block).
  void copyBB(ValueMapT &GlobalMap);
};
284 
// Only stores the references; the actual work happens in copyBB().
BlockGenerator::BlockGenerator(IRBuilder<> &B, ScopStmt &Stmt, Pass *P):
  Builder(B), Statement(Stmt), P(P) {}
287 
288 Value *BlockGenerator::getNewValue(const Value *Old, ValueMapT &BBMap,
289                                    ValueMapT &GlobalMap) {
290   // We assume constants never change.
291   // This avoids map lookups for many calls to this function.
292   if (isa<Constant>(Old))
293     return const_cast<Value*>(Old);
294 
295   if (GlobalMap.count(Old)) {
296     Value *New = GlobalMap[Old];
297 
298     if (Old->getType()->getScalarSizeInBits()
299         < New->getType()->getScalarSizeInBits())
300       New = Builder.CreateTruncOrBitCast(New, Old->getType());
301 
302     return New;
303   }
304 
305   if (BBMap.count(Old)) {
306     return BBMap[Old];
307   }
308 
309   // 'Old' is within the original SCoP, but was not rewritten.
310   //
311   // Such values appear, if they only calculate information already available in
312   // the polyhedral description (e.g.  an induction variable increment). They
313   // can be safely ignored.
314   if (const Instruction *Inst = dyn_cast<Instruction>(Old))
315     if (Statement.getParent()->getRegion().contains(Inst->getParent()))
316       return NULL;
317 
318   // Everything else is probably a scop-constant value defined as global,
319   // function parameter or an instruction not within the scop.
320   return const_cast<Value*>(Old);
321 }
322 
323 void BlockGenerator::copyInstScalar(const Instruction *Inst, ValueMapT &BBMap,
324                                     ValueMapT &GlobalMap) {
325   Instruction *NewInst = Inst->clone();
326 
327   // Replace old operands with the new ones.
328   for (Instruction::const_op_iterator OI = Inst->op_begin(),
329        OE = Inst->op_end(); OI != OE; ++OI) {
330     Value *OldOperand = *OI;
331     Value *NewOperand = getNewValue(OldOperand, BBMap, GlobalMap);
332 
333     if (!NewOperand) {
334       assert(!isa<StoreInst>(NewInst)
335              && "Store instructions are always needed!");
336       delete NewInst;
337       return;
338     }
339 
340     NewInst->replaceUsesOfWith(OldOperand, NewOperand);
341   }
342 
343   Builder.Insert(NewInst);
344   BBMap[Inst] = NewInst;
345 
346   if (!NewInst->getType()->isVoidTy())
347     NewInst->setName("p_" + Inst->getName());
348 }
349 
350 std::vector<Value*> BlockGenerator::getMemoryAccessIndex(
351   __isl_keep isl_map *AccessRelation, Value *BaseAddress,
352   ValueMapT &BBMap, ValueMapT &GlobalMap) {
353 
354   assert((isl_map_dim(AccessRelation, isl_dim_out) == 1)
355          && "Only single dimensional access functions supported");
356 
357   std::vector<Value *> IVS;
358   for (unsigned i = 0; i < Statement.getNumIterators(); ++i) {
359     const Value *OriginalIV = Statement.getInductionVariableForDimension(i);
360     Value *NewIV = getNewValue(OriginalIV, BBMap, GlobalMap);
361     IVS.push_back(NewIV);
362   }
363 
364   isl_pw_aff *PwAff = isl_map_dim_max(isl_map_copy(AccessRelation), 0);
365   IslGenerator IslGen(Builder, IVS);
366   Value *OffsetValue = IslGen.generateIslPwAff(PwAff);
367 
368   Type *Ty = Builder.getInt64Ty();
369   OffsetValue = Builder.CreateIntCast(OffsetValue, Ty, true);
370 
371   std::vector<Value*> IndexArray;
372   Value *NullValue = Constant::getNullValue(Ty);
373   IndexArray.push_back(NullValue);
374   IndexArray.push_back(OffsetValue);
375   return IndexArray;
376 }
377 
378 Value *BlockGenerator::getNewAccessOperand(
379   __isl_keep isl_map *NewAccessRelation, Value *BaseAddress,
380   ValueMapT &BBMap, ValueMapT &GlobalMap) {
381   std::vector<Value*> IndexArray = getMemoryAccessIndex(NewAccessRelation,
382                                                         BaseAddress,
383                                                         BBMap, GlobalMap);
384   Value *NewOperand = Builder.CreateGEP(BaseAddress, IndexArray,
385                                         "p_newarrayidx_");
386   return NewOperand;
387 }
388 
389 Value *BlockGenerator::generateLocationAccessed(const Instruction *Inst,
390                                                 const Value *Pointer,
391                                                 ValueMapT &BBMap,
392                                                 ValueMapT &GlobalMap) {
393   MemoryAccess &Access = Statement.getAccessFor(Inst);
394   isl_map *CurrentAccessRelation = Access.getAccessRelation();
395   isl_map *NewAccessRelation = Access.getNewAccessRelation();
396 
397   assert(isl_map_has_equal_space(CurrentAccessRelation, NewAccessRelation)
398          && "Current and new access function use different spaces");
399 
400   Value *NewPointer;
401 
402   if (!NewAccessRelation) {
403     NewPointer = getNewValue(Pointer, BBMap, GlobalMap);
404   } else {
405     Value *BaseAddress = const_cast<Value*>(Access.getBaseAddr());
406     NewPointer = getNewAccessOperand(NewAccessRelation, BaseAddress,
407                                      BBMap, GlobalMap);
408   }
409 
410   isl_map_free(CurrentAccessRelation);
411   isl_map_free(NewAccessRelation);
412   return NewPointer;
413 }
414 
415 Value *BlockGenerator::generateScalarLoad(const LoadInst *Load,
416                                           ValueMapT &BBMap,
417                                           ValueMapT &GlobalMap) {
418   const Value *Pointer = Load->getPointerOperand();
419   const Instruction *Inst = dyn_cast<Instruction>(Load);
420   Value *NewPointer = generateLocationAccessed(Inst, Pointer, BBMap, GlobalMap);
421   Value *ScalarLoad = Builder.CreateLoad(NewPointer,
422                                          Load->getName() + "_p_scalar_");
423   return ScalarLoad;
424 }
425 
426 Value *BlockGenerator::generateScalarStore(const StoreInst *Store,
427                                            ValueMapT &BBMap,
428                                            ValueMapT &GlobalMap) {
429   const Value *Pointer = Store->getPointerOperand();
430   Value *NewPointer = generateLocationAccessed(Store, Pointer, BBMap,
431                                                GlobalMap);
432   Value *ValueOperand = getNewValue(Store->getValueOperand(), BBMap, GlobalMap);
433 
434   return Builder.CreateStore(ValueOperand, NewPointer);
435 }
436 
437 void BlockGenerator::copyInstruction(const Instruction *Inst,
438                                      ValueMapT &BBMap, ValueMapT &GlobalMap) {
439   // Terminator instructions control the control flow. They are explicitly
440   // expressed in the clast and do not need to be copied.
441   if (Inst->isTerminator())
442     return;
443 
444   if (const LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
445     BBMap[Load] = generateScalarLoad(Load, BBMap, GlobalMap);
446     return;
447   }
448 
449   if (const StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
450     BBMap[Store] = generateScalarStore(Store, BBMap, GlobalMap);
451     return;
452   }
453 
454   copyInstScalar(Inst, BBMap, GlobalMap);
455 }
456 
457 
458 void BlockGenerator::copyBB(ValueMapT &GlobalMap) {
459   BasicBlock *BB = Statement.getBasicBlock();
460   BasicBlock *CopyBB = SplitBlock(Builder.GetInsertBlock(),
461                                   Builder.GetInsertPoint(), P);
462   CopyBB->setName("polly.stmt." + BB->getName());
463   Builder.SetInsertPoint(CopyBB->begin());
464 
465   ValueMapT BBMap;
466 
467   for (BasicBlock::const_iterator II = BB->begin(), IE = BB->end(); II != IE;
468        ++II)
469       copyInstruction(II, BBMap, GlobalMap);
470 }
471 
/// @brief Generate a new vector basic block for a polyhedral statement.
///
/// The only public function exposed is generate().
class VectorBlockGenerator : BlockGenerator {
public:
  /// @brief Generate a new vector basic block for a ScoPStmt.
  ///
  /// This code generation is similar to the normal, scalar code generation,
  /// except that each instruction is code generated for several vector lanes
  /// at a time. If possible instructions are issued as actual vector
  /// instructions, but e.g. for address calculation instructions we currently
  /// generate scalar instructions for each vector lane.
  ///
  /// @param Builder    The LLVM-IR Builder used to generate the statement. The
  ///                   code is generated at the location, the builder points
  ///                   to.
  /// @param Stmt       The statement to code generate.
  /// @param GlobalMaps A vector of maps that define for certain Values
  ///                   referenced from the original code new Values they should
  ///                   be replaced with. Each map in the vector of maps is
  ///                   used for one vector lane. The number of elements in the
  ///                   vector defines the width of the generated vector
  ///                   instructions.
  /// @param Domain     The statement's domain; used to query the stride of its
  ///                   memory accesses.
  /// @param P          A reference to the pass this function is called from.
  ///                   The pass is needed to update other analysis.
  static void generate(IRBuilder<> &B, ScopStmt &Stmt,
                       VectorValueMapT &GlobalMaps, __isl_keep isl_set *Domain,
                       Pass *P) {
    VectorBlockGenerator Generator(B, GlobalMaps, Stmt, Domain, P);
    Generator.copyBB();
  }

private:
  // This is a vector of global value maps.  The first map is used for the
  // first vector lane, ...
  // Each map contains information about Instructions in the old ScoP, which
  // are recalculated in the new SCoP. When copying the basic block, we replace
  // all references to the old instructions with their recalculated values.
  VectorValueMapT &GlobalMaps;

  // The iteration domain of the statement (not owned by this class).
  isl_set *Domain;

  VectorBlockGenerator(IRBuilder<> &B, VectorValueMapT &GlobalMaps,
                       ScopStmt &Stmt, __isl_keep isl_set *Domain, Pass *P);

  // The vector width, defined by the number of global value maps.
  int getVectorWidth();

  /// @brief Get (or build by lane-wise insertion) the vector version of Old.
  Value *getVectorValue(const Value *Old, ValueMapT &VectorMap,
                        VectorValueMapT &ScalarMaps);

  /// @brief Get a pointer-to-vector type matching V's pointee type.
  Type *getVectorPtrTy(const Value *V, int Width);

  /// @brief Load a vector from a set of adjacent scalars
  ///
  /// In case a set of scalars is known to be next to each other in memory,
  /// create a vector load that loads those scalars
  ///
  /// %vector_ptr= bitcast double* %p to <4 x double>*
  /// %vec_full = load <4 x double>* %vector_ptr
  ///
  Value *generateStrideOneLoad(const LoadInst *Load, ValueMapT &BBMap);

  /// @brief Load a vector initialized from a single scalar in memory
  ///
  /// In case all elements of a vector are initialized to the same
  /// scalar value, this value is loaded and shuffled into all elements
  /// of the vector.
  ///
  /// %splat_one = load <1 x double>* %p
  /// %splat = shufflevector <1 x double> %splat_one, <1 x
  ///       double> %splat_one, <4 x i32> zeroinitializer
  ///
  Value *generateStrideZeroLoad(const LoadInst *Load, ValueMapT &BBMap);

  /// @brief Load a vector from scalars distributed in memory
  ///
  /// In case some scalars are distributed randomly in memory. Create a vector
  /// by loading each scalar and by inserting one after the other into the
  /// vector.
  ///
  /// %scalar_1 = load double* %p_1
  /// %vec_1 = insertelement <2 x double> undef, double %scalar_1, i32 0
  /// %scalar_2 = load double* %p_2
  /// %vec_2 = insertelement <2 x double> %vec_1, double %scalar_2, i32 1
  ///
  Value *generateUnknownStrideLoad(const LoadInst *Load,
                                   VectorValueMapT &ScalarMaps);

  /// @brief Generate a load, choosing the strategy fitting its stride.
  void generateLoad(const LoadInst *Load, ValueMapT &VectorMap,
                    VectorValueMapT &ScalarMaps);

  /// @brief Widen a single-operand (cast) instruction to a vector cast.
  void copyUnaryInst(const UnaryInstruction *Inst, ValueMapT &VectorMap,
                     VectorValueMapT &ScalarMaps);

  /// @brief Widen a binary operator to a vector binary operator.
  void copyBinaryInst(const BinaryOperator *Inst, ValueMapT &VectorMap,
                      VectorValueMapT &ScalarMaps);

  /// @brief Store a vector value, as one wide store or lane by lane.
  void copyStore(const StoreInst *Store, ValueMapT &VectorMap,
                 VectorValueMapT &ScalarMaps);

  /// @brief Copy an instruction once per vector lane as scalar code.
  void copyInstScalarized(const Instruction *Inst, ValueMapT &VectorMap,
                          VectorValueMapT &ScalarMaps);

  /// @brief Extract per-lane scalars for all vector operands of Inst.
  ///
  /// @returns True if at least one operand had a vector version.
  bool extractScalarValues(const Instruction *Inst, ValueMapT &VectorMap,
                           VectorValueMapT &ScalarMaps);

  /// @brief Check whether any operand of Inst has a vector version.
  bool hasVectorOperands(const Instruction *Inst, ValueMapT &VectorMap);

  void copyInstruction(const Instruction *Inst, ValueMapT &VectorMap,
                       VectorValueMapT &ScalarMaps);

  void copyBB();
};
585 
// Vector code generation requires at least two lanes (GlobalMaps defines the
// vector width) and a statement domain to query access strides against.
VectorBlockGenerator::VectorBlockGenerator(IRBuilder<> &B,
  VectorValueMapT &GlobalMaps, ScopStmt &Stmt, __isl_keep isl_set *Domain,
  Pass *P) : BlockGenerator(B, Stmt, P), GlobalMaps(GlobalMaps),
  Domain(Domain) {
    assert(GlobalMaps.size() > 1 && "Only one vector lane found");
    assert(Domain && "No statement domain provided");
  }
593 
594 Value *VectorBlockGenerator::getVectorValue(const Value *Old,
595                                             ValueMapT &VectorMap,
596                                             VectorValueMapT &ScalarMaps) {
597   if (VectorMap.count(Old))
598     return VectorMap[Old];
599 
600   int Width = getVectorWidth();
601 
602   Value *Vector = UndefValue::get(VectorType::get(Old->getType(), Width));
603 
604   for (int Lane = 0; Lane < Width; Lane++)
605     Vector = Builder.CreateInsertElement(Vector,
606                                          getNewValue(Old,
607                                                      ScalarMaps[Lane],
608                                                      GlobalMaps[Lane]),
609                                          Builder.getInt32(Lane));
610 
611   VectorMap[Old] = Vector;
612 
613   return Vector;
614 }
615 
616 Type *VectorBlockGenerator::getVectorPtrTy(const Value *Val, int Width) {
617   PointerType *PointerTy = dyn_cast<PointerType>(Val->getType());
618   assert(PointerTy && "PointerType expected");
619 
620   Type *ScalarType = PointerTy->getElementType();
621   VectorType *VectorType = VectorType::get(ScalarType, Width);
622 
623   return PointerType::getUnqual(VectorType);
624 }
625 
626 Value *VectorBlockGenerator::generateStrideOneLoad(const LoadInst *Load,
627                                                    ValueMapT &BBMap) {
628   const Value *Pointer = Load->getPointerOperand();
629   Type *VectorPtrType = getVectorPtrTy(Pointer, getVectorWidth());
630   Value *NewPointer = getNewValue(Pointer, BBMap, GlobalMaps[0]);
631   Value *VectorPtr = Builder.CreateBitCast(NewPointer, VectorPtrType,
632                                            "vector_ptr");
633   LoadInst *VecLoad = Builder.CreateLoad(VectorPtr,
634                                          Load->getName() + "_p_vec_full");
635   if (!Aligned)
636     VecLoad->setAlignment(8);
637 
638   return VecLoad;
639 }
640 
641 Value *VectorBlockGenerator::generateStrideZeroLoad(const LoadInst *Load,
642                                                     ValueMapT &BBMap) {
643   const Value *Pointer = Load->getPointerOperand();
644   Type *VectorPtrType = getVectorPtrTy(Pointer, 1);
645   Value *NewPointer = getNewValue(Pointer, BBMap, GlobalMaps[0]);
646   Value *VectorPtr = Builder.CreateBitCast(NewPointer, VectorPtrType,
647                                            Load->getName() + "_p_vec_p");
648   LoadInst *ScalarLoad= Builder.CreateLoad(VectorPtr,
649                                            Load->getName() + "_p_splat_one");
650 
651   if (!Aligned)
652     ScalarLoad->setAlignment(8);
653 
654   Constant *SplatVector =
655     Constant::getNullValue(VectorType::get(Builder.getInt32Ty(),
656                                            getVectorWidth()));
657 
658   Value *VectorLoad = Builder.CreateShuffleVector(ScalarLoad, ScalarLoad,
659                                                   SplatVector,
660                                                   Load->getName()
661                                                   + "_p_splat");
662   return VectorLoad;
663 }
664 
665 Value *VectorBlockGenerator::generateUnknownStrideLoad(const LoadInst *Load,
666   VectorValueMapT &ScalarMaps) {
667   int VectorWidth = getVectorWidth();
668   const Value *Pointer = Load->getPointerOperand();
669   VectorType *VectorType = VectorType::get(
670     dyn_cast<PointerType>(Pointer->getType())->getElementType(), VectorWidth);
671 
672   Value *Vector = UndefValue::get(VectorType);
673 
674   for (int i = 0; i < VectorWidth; i++) {
675     Value *NewPointer = getNewValue(Pointer, ScalarMaps[i], GlobalMaps[i]);
676     Value *ScalarLoad = Builder.CreateLoad(NewPointer,
677                                            Load->getName() + "_p_scalar_");
678     Vector = Builder.CreateInsertElement(Vector, ScalarLoad,
679                                          Builder.getInt32(i),
680                                          Load->getName() + "_p_vec_");
681   }
682 
683   return Vector;
684 }
685 
686 void VectorBlockGenerator::generateLoad(const LoadInst *Load,
687                                         ValueMapT &VectorMap,
688                                         VectorValueMapT &ScalarMaps) {
689   if (GroupedUnrolling || !VectorType::isValidElementType(Load->getType())) {
690     for (int i = 0; i < getVectorWidth(); i++)
691       ScalarMaps[i][Load] = generateScalarLoad(Load, ScalarMaps[i],
692                                                GlobalMaps[i]);
693     return;
694   }
695 
696   MemoryAccess &Access = Statement.getAccessFor(Load);
697 
698   Value *NewLoad;
699   if (Access.isStrideZero(isl_set_copy(Domain)))
700     NewLoad = generateStrideZeroLoad(Load, ScalarMaps[0]);
701   else if (Access.isStrideOne(isl_set_copy(Domain)))
702     NewLoad = generateStrideOneLoad(Load, ScalarMaps[0]);
703   else
704     NewLoad = generateUnknownStrideLoad(Load, ScalarMaps);
705 
706   VectorMap[Load] = NewLoad;
707 }
708 
709 void VectorBlockGenerator::copyUnaryInst(const UnaryInstruction *Inst,
710                                          ValueMapT &VectorMap,
711                                          VectorValueMapT &ScalarMaps) {
712   int VectorWidth = getVectorWidth();
713   Value *NewOperand = getVectorValue(Inst->getOperand(0), VectorMap,
714                                      ScalarMaps);
715 
716   assert(isa<CastInst>(Inst) && "Can not generate vector code for instruction");
717 
718   const CastInst *Cast = dyn_cast<CastInst>(Inst);
719   VectorType *DestType = VectorType::get(Inst->getType(), VectorWidth);
720   VectorMap[Inst] = Builder.CreateCast(Cast->getOpcode(), NewOperand, DestType);
721 }
722 
723 void VectorBlockGenerator::copyBinaryInst(const BinaryOperator *Inst,
724                                           ValueMapT &VectorMap,
725                                           VectorValueMapT &ScalarMaps) {
726   Value *OpZero = Inst->getOperand(0);
727   Value *OpOne = Inst->getOperand(1);
728 
729   Value *NewOpZero, *NewOpOne;
730   NewOpZero = getVectorValue(OpZero, VectorMap, ScalarMaps);
731   NewOpOne = getVectorValue(OpOne, VectorMap, ScalarMaps);
732 
733   Value *NewInst = Builder.CreateBinOp(Inst->getOpcode(), NewOpZero,
734                                        NewOpOne,
735                                        Inst->getName() + "p_vec");
736   VectorMap[Inst] = NewInst;
737 }
738 
739 void VectorBlockGenerator::copyStore(const StoreInst *Store,
740                                      ValueMapT &VectorMap,
741                                      VectorValueMapT &ScalarMaps) {
742   int VectorWidth = getVectorWidth();
743 
744   MemoryAccess &Access = Statement.getAccessFor(Store);
745 
746   const Value *Pointer = Store->getPointerOperand();
747   Value *Vector = getVectorValue(Store->getValueOperand(), VectorMap,
748                                    ScalarMaps);
749 
750   if (Access.isStrideOne(isl_set_copy(Domain))) {
751     Type *VectorPtrType = getVectorPtrTy(Pointer, VectorWidth);
752     Value *NewPointer = getNewValue(Pointer, ScalarMaps[0], GlobalMaps[0]);
753 
754     Value *VectorPtr = Builder.CreateBitCast(NewPointer, VectorPtrType,
755                                              "vector_ptr");
756     StoreInst *Store = Builder.CreateStore(Vector, VectorPtr);
757 
758     if (!Aligned)
759       Store->setAlignment(8);
760   } else {
761     for (unsigned i = 0; i < ScalarMaps.size(); i++) {
762       Value *Scalar = Builder.CreateExtractElement(Vector,
763                                                    Builder.getInt32(i));
764       Value *NewPointer = getNewValue(Pointer, ScalarMaps[i], GlobalMaps[i]);
765       Builder.CreateStore(Scalar, NewPointer);
766     }
767   }
768 }
769 
770 bool VectorBlockGenerator::hasVectorOperands(const Instruction *Inst,
771                                              ValueMapT &VectorMap) {
772   for (Instruction::const_op_iterator OI = Inst->op_begin(),
773        OE = Inst->op_end(); OI != OE; ++OI)
774     if (VectorMap.count(*OI))
775       return true;
776   return false;
777 }
778 
779 bool VectorBlockGenerator::extractScalarValues(const Instruction *Inst,
780                                                ValueMapT &VectorMap,
781                                                VectorValueMapT &ScalarMaps) {
782   bool HasVectorOperand = false;
783   int VectorWidth = getVectorWidth();
784 
785   for (Instruction::const_op_iterator OI = Inst->op_begin(),
786        OE = Inst->op_end(); OI != OE; ++OI) {
787     ValueMapT::iterator VecOp = VectorMap.find(*OI);
788 
789     if (VecOp == VectorMap.end())
790       continue;
791 
792     HasVectorOperand = true;
793     Value *NewVector = VecOp->second;
794 
795     for (int i = 0; i < VectorWidth; ++i) {
796       ValueMapT &SM = ScalarMaps[i];
797 
798       // If there is one scalar extracted, all scalar elements should have
799       // already been extracted by the code here. So no need to check for the
800       // existance of all of them.
801       if (SM.count(*OI))
802         break;
803 
804       SM[*OI] = Builder.CreateExtractElement(NewVector, Builder.getInt32(i));
805     }
806   }
807 
808   return HasVectorOperand;
809 }
810 
811 void VectorBlockGenerator::copyInstScalarized(const Instruction *Inst,
812                                               ValueMapT &VectorMap,
813                                               VectorValueMapT &ScalarMaps) {
814   bool HasVectorOperand;
815   int VectorWidth = getVectorWidth();
816 
817   HasVectorOperand = extractScalarValues(Inst, VectorMap, ScalarMaps);
818 
819   for (int VectorLane = 0; VectorLane < getVectorWidth(); VectorLane++)
820     copyInstScalar(Inst, ScalarMaps[VectorLane], GlobalMaps[VectorLane]);
821 
822   if (!VectorType::isValidElementType(Inst->getType()) || !HasVectorOperand)
823     return;
824 
825   // Make the result available as vector value.
826   VectorType *VectorType = VectorType::get(Inst->getType(), VectorWidth);
827   Value *Vector = UndefValue::get(VectorType);
828 
829   for (int i = 0; i < VectorWidth; i++)
830     Vector = Builder.CreateInsertElement(Vector, ScalarMaps[i][Inst],
831                                          Builder.getInt32(i));
832 
833   VectorMap[Inst] = Vector;
834 }
835 
// The vector width equals the number of per-lane global value maps.
int VectorBlockGenerator::getVectorWidth() {
  return GlobalMaps.size();
}
839 
// Copy one instruction into the vectorized code.
//
// Loads are handled first, as they may produce either a vector or per-lane
// scalar values. Stores, unary and binary instructions get a vector version
// if at least one operand already has a vector value; everything else is
// scalarized (generated once per lane).
void VectorBlockGenerator::copyInstruction(const Instruction *Inst,
                                           ValueMapT &VectorMap,
                                           VectorValueMapT &ScalarMaps) {
  // Terminator instructions control the control flow. They are explicitly
  // expressed in the clast and do not need to be copied.
  if (Inst->isTerminator())
    return;

  if (const LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
    generateLoad(Load, VectorMap, ScalarMaps);
    return;
  }

  if (hasVectorOperands(Inst, VectorMap)) {
    if (const StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
      copyStore(Store, VectorMap, ScalarMaps);
      return;
    }

    if (const UnaryInstruction *Unary = dyn_cast<UnaryInstruction>(Inst)) {
      copyUnaryInst(Unary, VectorMap, ScalarMaps);
      return;
    }

    if (const BinaryOperator *Binary = dyn_cast<BinaryOperator>(Inst)) {
      copyBinaryInst(Binary, VectorMap, ScalarMaps);
      return;
    }

    // Fallthrough: We generate scalar instructions, if we don't know how to
    // generate vector code.
  }

  copyInstScalarized(Inst, VectorMap, ScalarMaps);
}
875 
// Copy the basic block of the statement into the newly generated code.
//
// A fresh basic block is split off at the current insert point, named after
// the original statement block, and every instruction of the original block
// is copied into it.
void VectorBlockGenerator::copyBB() {
  BasicBlock *BB = Statement.getBasicBlock();
  BasicBlock *CopyBB = SplitBlock(Builder.GetInsertBlock(),
                                  Builder.GetInsertPoint(), P);
  CopyBB->setName("polly.stmt." + BB->getName());
  Builder.SetInsertPoint(CopyBB->begin());

  // Create two maps that store the mapping from the original instructions of
  // the old basic block to their copies in the new basic block. Those maps
  // are basic block local.
  //
  // As vector code generation is supported there is one map for scalar values
  // and one for vector values.
  //
  // In case we just do scalar code generation, the vectorMap is not used and
  // the scalarMap has just one dimension, which contains the mapping.
  //
  // In case vector code generation is done, an instruction may either appear
  // in the vector map once (as it is calculating >vectorwidth< values at a
  // time), or (if the values are calculated using scalar operations), it
  // appears once in every dimension of the scalarMap.
  VectorValueMapT ScalarBlockMap(getVectorWidth());
  ValueMapT VectorBlockMap;

  for (BasicBlock::const_iterator II = BB->begin(), IE = BB->end();
       II != IE; ++II)
      copyInstruction(II, VectorBlockMap, ScalarBlockMap);
}
904 
/// Class to generate LLVM-IR that calculates the value of a clast_expr.
class ClastExpCodeGen {
  IRBuilder<> &Builder;
  const CharMapT &IVS;

  // Handlers for the individual clast expression kinds; each generates
  // LLVM-IR of type Ty that computes the expression's value.
  Value *codegen(const clast_name *e, Type *Ty);
  Value *codegen(const clast_term *e, Type *Ty);
  Value *codegen(const clast_binary *e, Type *Ty);
  Value *codegen(const clast_reduction *r, Type *Ty);
public:

  // A generator for clast expressions.
  //
  // @param B The IRBuilder that defines where the code to calculate the
  //          clast expressions should be inserted.
  // @param IVMap A Map that translates strings describing the induction
  //              variables to the Values* that represent these variables
  //              on the LLVM side.
  ClastExpCodeGen(IRBuilder<> &B, CharMapT &IVMap);

  // Generates code to calculate a given clast expression.
  //
  // @param e  The expression to calculate.
  // @param Ty The integer type the calculation is performed in.
  // @return The Value that holds the result.
  Value *codegen(const clast_expr *e, Type *Ty);
};
931 
932 Value *ClastExpCodeGen::codegen(const clast_name *e, Type *Ty) {
933   CharMapT::const_iterator I = IVS.find(e->name);
934 
935   assert(I != IVS.end() && "Clast name not found");
936 
937   return Builder.CreateSExtOrBitCast(I->second, Ty);
938 }
939 
940 Value *ClastExpCodeGen::codegen(const clast_term *e, Type *Ty) {
941   APInt a = APInt_from_MPZ(e->val);
942 
943   Value *ConstOne = ConstantInt::get(Builder.getContext(), a);
944   ConstOne = Builder.CreateSExtOrBitCast(ConstOne, Ty);
945 
946   if (!e->var)
947     return ConstOne;
948 
949   Value *var = codegen(e->var, Ty);
950   return Builder.CreateMul(ConstOne, var);
951 }
952 
953 Value *ClastExpCodeGen::codegen(const clast_binary *e, Type *Ty) {
954   Value *LHS = codegen(e->LHS, Ty);
955 
956   APInt RHS_AP = APInt_from_MPZ(e->RHS);
957 
958   Value *RHS = ConstantInt::get(Builder.getContext(), RHS_AP);
959   RHS = Builder.CreateSExtOrBitCast(RHS, Ty);
960 
961   switch (e->type) {
962   case clast_bin_mod:
963     return Builder.CreateSRem(LHS, RHS);
964   case clast_bin_fdiv:
965     {
966       // floord(n,d) ((n < 0) ? (n - d + 1) : n) / d
967       Value *One = ConstantInt::get(Ty, 1);
968       Value *Zero = ConstantInt::get(Ty, 0);
969       Value *Sum1 = Builder.CreateSub(LHS, RHS);
970       Value *Sum2 = Builder.CreateAdd(Sum1, One);
971       Value *isNegative = Builder.CreateICmpSLT(LHS, Zero);
972       Value *Dividend = Builder.CreateSelect(isNegative, Sum2, LHS);
973       return Builder.CreateSDiv(Dividend, RHS);
974     }
975   case clast_bin_cdiv:
976     {
977       // ceild(n,d) ((n < 0) ? n : (n + d - 1)) / d
978       Value *One = ConstantInt::get(Ty, 1);
979       Value *Zero = ConstantInt::get(Ty, 0);
980       Value *Sum1 = Builder.CreateAdd(LHS, RHS);
981       Value *Sum2 = Builder.CreateSub(Sum1, One);
982       Value *isNegative = Builder.CreateICmpSLT(LHS, Zero);
983       Value *Dividend = Builder.CreateSelect(isNegative, LHS, Sum2);
984       return Builder.CreateSDiv(Dividend, RHS);
985     }
986   case clast_bin_div:
987     return Builder.CreateSDiv(LHS, RHS);
988   };
989 
990   llvm_unreachable("Unknown clast binary expression type");
991 }
992 
993 Value *ClastExpCodeGen::codegen(const clast_reduction *r, Type *Ty) {
994   assert((   r->type == clast_red_min
995              || r->type == clast_red_max
996              || r->type == clast_red_sum)
997          && "Clast reduction type not supported");
998   Value *old = codegen(r->elts[0], Ty);
999 
1000   for (int i=1; i < r->n; ++i) {
1001     Value *exprValue = codegen(r->elts[i], Ty);
1002 
1003     switch (r->type) {
1004     case clast_red_min:
1005       {
1006         Value *cmp = Builder.CreateICmpSLT(old, exprValue);
1007         old = Builder.CreateSelect(cmp, old, exprValue);
1008         break;
1009       }
1010     case clast_red_max:
1011       {
1012         Value *cmp = Builder.CreateICmpSGT(old, exprValue);
1013         old = Builder.CreateSelect(cmp, old, exprValue);
1014         break;
1015       }
1016     case clast_red_sum:
1017       old = Builder.CreateAdd(old, exprValue);
1018       break;
1019     }
1020   }
1021 
1022   return old;
1023 }
1024 
// See the class declaration for parameter documentation.
ClastExpCodeGen::ClastExpCodeGen(IRBuilder<> &B, CharMapT &IVMap)
  : Builder(B), IVS(IVMap) {}
1027 
1028 Value *ClastExpCodeGen::codegen(const clast_expr *e, Type *Ty) {
1029   switch(e->type) {
1030   case clast_expr_name:
1031     return codegen((const clast_name *)e, Ty);
1032   case clast_expr_term:
1033     return codegen((const clast_term *)e, Ty);
1034   case clast_expr_bin:
1035     return codegen((const clast_binary *)e, Ty);
1036   case clast_expr_red:
1037     return codegen((const clast_reduction *)e, Ty);
1038   }
1039 
1040   llvm_unreachable("Unknown clast expression!");
1041 }
1042 
// Walk a clast statement tree and generate the corresponding LLVM-IR,
// dispatching each statement kind to a dedicated codegen overload.
class ClastStmtCodeGen {
public:
  const std::vector<std::string> &getParallelLoops();

private:
  // The Scop we code generate.
  Scop *S;
  Pass *P;

  // The Builder specifies the current location to code generate at.
  IRBuilder<> &Builder;

  // Map the Values from the old code to their counterparts in the new code.
  ValueMapT ValueMap;

  // clastVars maps from the textual representation of a clast variable to its
  // current *Value. clast variables are scheduling variables, original
  // induction variables or parameters. They are used either in loop bounds or
  // to define the statement instance that is executed.
  //
  //   for (s = 0; s < n + 3; ++s)
  //     for (t = s; t < m; ++t)
  //       Stmt(i = s + 3 * m, j = t);
  //
  // {s,t,i,j,n,m} is the set of clast variables in this clast.
  CharMapT ClastVars;

  // Codegenerator for clast expressions.
  ClastExpCodeGen ExpGen;

  // Do we currently generate parallel code?
  bool parallelCodeGeneration;

  // Names of the loops that were code generated as parallel loops.
  std::vector<std::string> parallelLoops;

  // Bind the result of a clast assignment to its clast variable.
  void codegen(const clast_assignment *a);

  // Generate one statement substitution for scattering dimension 'Dimension'.
  void codegen(const clast_assignment *a, ScopStmt *Statement,
               unsigned Dimension, int vectorDim,
               std::vector<ValueMapT> *VectorVMap = 0);

  // Generate all substitution assignments attached to a user statement.
  void codegenSubstitutions(const clast_stmt *Assignment,
                            ScopStmt *Statement, int vectorDim = 0,
                            std::vector<ValueMapT> *VectorVMap = 0);

  // Generate a user statement; if IVS is given, one instance per vector lane
  // is generated.
  void codegen(const clast_user_stmt *u, std::vector<Value*> *IVS = NULL,
               const char *iterator = NULL, isl_set *scatteringDomain = 0);

  // Generate the body of a clast block.
  void codegen(const clast_block *b);

  /// @brief Create a classical sequential loop.
  void codegenForSequential(const clast_for *f);

  /// @brief Create OpenMP structure values.
  ///
  /// Create a list of values that has to be stored into the OpenMP subfunction
  /// structure.
  SetVector<Value*> getOMPValues();

  /// @brief Update the internal structures according to a Value Map.
  ///
  /// @param VMap     A map from old to new values.
  /// @param Reverse  If true, we assume the update should be reversed.
  void updateWithValueMap(OMPGenerator::ValueToValueMapTy &VMap,
                          bool Reverse);

  /// @brief Create an OpenMP parallel for loop.
  ///
  /// This loop reflects a loop as if it would have been created by an OpenMP
  /// statement.
  void codegenForOpenMP(const clast_for *f);

  bool isInnermostLoop(const clast_for *f);

  /// @brief Get the number of loop iterations for this loop.
  /// @param f The clast for loop to check.
  int getNumberOfIterations(const clast_for *f);

  /// @brief Create vector instructions for this loop.
  void codegenForVector(const clast_for *f);

  // Dispatch a for loop to vector, OpenMP or sequential code generation.
  void codegen(const clast_for *f);

  // Generate the comparison for one guard equation.
  Value *codegen(const clast_equation *eq);

  // Generate a guarded statement.
  void codegen(const clast_guard *g);

  // Generate code for a list of clast statements.
  void codegen(const clast_stmt *stmt);

  // Make the Scop parameters available under their CLooG names.
  void addParameters(const CloogNames *names);

  IntegerType *getIntPtrTy();

  public:
  void codegen(const clast_root *r);

  ClastStmtCodeGen(Scop *scop, IRBuilder<> &B, Pass *P);
};
1141 }
1142 
// Return the integer type that has the width of a pointer on the target.
IntegerType *ClastStmtCodeGen::getIntPtrTy() {
  return P->getAnalysis<TargetData>().getIntPtrType(Builder.getContext());
}
1146 
// Return the names of the loops for which parallel (OpenMP) code was
// generated.
const std::vector<std::string> &ClastStmtCodeGen::getParallelLoops() {
  return parallelLoops;
}
1150 
1151 void ClastStmtCodeGen::codegen(const clast_assignment *a) {
1152   Value *V= ExpGen.codegen(a->RHS, getIntPtrTy());
1153   ClastVars[a->LHS] = V;
1154 }
1155 
1156 void ClastStmtCodeGen::codegen(const clast_assignment *A, ScopStmt *Stmt,
1157                                unsigned Dim, int VectorDim,
1158                                std::vector<ValueMapT> *VectorVMap) {
1159   const PHINode *PN;
1160   Value *RHS;
1161 
1162   assert(!A->LHS && "Statement assignments do not have left hand side");
1163 
1164   PN = Stmt->getInductionVariableForDimension(Dim);
1165   RHS = ExpGen.codegen(A->RHS, Builder.getInt64Ty());
1166   RHS = Builder.CreateTruncOrBitCast(RHS, PN->getType());
1167 
1168   if (VectorVMap)
1169     (*VectorVMap)[VectorDim][PN] = RHS;
1170 
1171   ValueMap[PN] = RHS;
1172 }
1173 
1174 void ClastStmtCodeGen::codegenSubstitutions(const clast_stmt *Assignment,
1175                                              ScopStmt *Statement, int vectorDim,
1176   std::vector<ValueMapT> *VectorVMap) {
1177   int Dimension = 0;
1178 
1179   while (Assignment) {
1180     assert(CLAST_STMT_IS_A(Assignment, stmt_ass)
1181            && "Substitions are expected to be assignments");
1182     codegen((const clast_assignment *)Assignment, Statement, Dimension,
1183             vectorDim, VectorVMap);
1184     Assignment = Assignment->next;
1185     Dimension++;
1186   }
1187 }
1188 
// Generate code for one user statement. Without IVS a single scalar
// instance is generated; with IVS one instance per vector lane is prepared
// and then emitted as vector code.
void ClastStmtCodeGen::codegen(const clast_user_stmt *u,
                               std::vector<Value*> *IVS , const char *iterator,
                               isl_set *Domain) {
  ScopStmt *Statement = (ScopStmt *)u->statement->usr;

  if (u->substitutions)
    codegenSubstitutions(u->substitutions, Statement);

  int VectorDimensions = IVS ? IVS->size() : 1;

  // Plain scalar code generation.
  if (VectorDimensions == 1) {
    BlockGenerator::generate(Builder, *Statement, ValueMap, P);
    return;
  }

  VectorValueMapT VectorMap(VectorDimensions);

  if (IVS) {
    assert (u->substitutions && "Substitutions expected!");
    // Rebind the loop iterator to each lane's induction variable value and
    // generate the substitutions for that lane.
    for (unsigned Lane = 0; Lane < IVS->size(); ++Lane) {
      ClastVars[iterator] = (*IVS)[Lane];
      codegenSubstitutions(u->substitutions, Statement, Lane, &VectorMap);
    }
  }

  VectorBlockGenerator::generate(Builder, *Statement, VectorMap, Domain, P);
}
1219 
// A clast block only groups other statements; generate its body, if any.
void ClastStmtCodeGen::codegen(const clast_block *b) {
  if (b->body)
    codegen(b->body);
}
1224 
1225 void ClastStmtCodeGen::codegenForSequential(const clast_for *f) {
1226   Value *LowerBound, *UpperBound, *IV, *Stride;
1227   BasicBlock *AfterBB;
1228   Type *IntPtrTy = getIntPtrTy();
1229 
1230   LowerBound = ExpGen.codegen(f->LB, IntPtrTy);
1231   UpperBound = ExpGen.codegen(f->UB, IntPtrTy);
1232   Stride = Builder.getInt(APInt_from_MPZ(f->stride));
1233 
1234   IV = createLoop(LowerBound, UpperBound, Stride, &Builder, P, &AfterBB);
1235 
1236   // Add loop iv to symbols.
1237   ClastVars[f->iterator] = IV;
1238 
1239   if (f->body)
1240     codegen(f->body);
1241 
1242   // Loop is finished, so remove its iv from the live symbols.
1243   ClastVars.erase(f->iterator);
1244   Builder.SetInsertPoint(AfterBB->begin());
1245 }
1246 
1247 SetVector<Value*> ClastStmtCodeGen::getOMPValues() {
1248   SetVector<Value*> Values;
1249 
1250   // The clast variables
1251   for (CharMapT::iterator I = ClastVars.begin(), E = ClastVars.end();
1252        I != E; I++)
1253     Values.insert(I->second);
1254 
1255   // The memory reference base addresses
1256   for (Scop::iterator SI = S->begin(), SE = S->end(); SI != SE; ++SI) {
1257     ScopStmt *Stmt = *SI;
1258     for (SmallVector<MemoryAccess*, 8>::iterator I = Stmt->memacc_begin(),
1259          E = Stmt->memacc_end(); I != E; ++I) {
1260       Value *BaseAddr = const_cast<Value*>((*I)->getBaseAddr());
1261       Values.insert((BaseAddr));
1262     }
1263   }
1264 
1265   return Values;
1266 }
1267 
// Rewrite the internal maps (ClastVars and ValueMap) according to VMap.
//
// @param VMap    A map from old to new values.
// @param Reverse If true, apply the inverse mapping (used when leaving the
//                OpenMP subfunction again).
void ClastStmtCodeGen::updateWithValueMap(OMPGenerator::ValueToValueMapTy &VMap,
                                          bool Reverse) {
  std::set<Value*> Inserted;

  if (Reverse) {
    OMPGenerator::ValueToValueMapTy ReverseMap;

    for (std::map<Value*, Value*>::iterator I = VMap.begin(), E = VMap.end();
         I != E; ++I)
       ReverseMap.insert(std::make_pair(I->second, I->first));

    for (CharMapT::iterator I = ClastVars.begin(), E = ClastVars.end();
         I != E; I++) {
      ClastVars[I->first] = ReverseMap[I->second];
      Inserted.insert(I->second);
    }

    /// FIXME: At the moment we do not reverse the update of the ValueMap.
    ///        This is incomplete, but the failure should be obvious, such that
    ///        we can fix this later.
    return;
  }

  // Replace the values the clast variables are bound to by their
  // counterparts from VMap.
  for (CharMapT::iterator I = ClastVars.begin(), E = ClastVars.end();
       I != E; I++) {
    ClastVars[I->first] = VMap[I->second];
    Inserted.insert(I->second);
  }

  // Every mapping not consumed above is recorded in the ValueMap used
  // during statement code generation.
  for (std::map<Value*, Value*>::iterator I = VMap.begin(), E = VMap.end();
       I != E; ++I) {
    if (Inserted.count(I->first))
      continue;

    ValueMap[I->first] = I->second;
  }
}
1305 
// Erase every dominator tree node of function F. The post-order traversal
// guarantees children are erased before their parents, as eraseNode
// requires.
static void clearDomtree(Function *F, DominatorTree &DT) {
  DomTreeNode *Root = DT.getNode(&F->getEntryBlock());
  std::vector<BasicBlock*> Blocks;

  for (po_iterator<DomTreeNode*> I = po_begin(Root), E = po_end(Root);
       I != E; ++I)
    Blocks.push_back(I->getBlock());

  for (std::vector<BasicBlock*>::iterator I = Blocks.begin(),
       E = Blocks.end(); I != E; ++I)
    DT.eraseNode(*I);
}
1316 
// Generate an OpenMP parallel loop for the clast for statement 'For'.
//
// The loop body is outlined into an OpenMP subfunction by OMPGen. VMap
// describes how the values used inside the body map to the subfunction's
// copies; the internal maps are rewritten accordingly before the body is
// generated and reversed again afterwards.
void ClastStmtCodeGen::codegenForOpenMP(const clast_for *For) {
  Value *Stride, *LB, *UB, *IV;
  BasicBlock::iterator LoopBody;
  IntegerType *IntPtrTy = getIntPtrTy();
  SetVector<Value*> Values;
  OMPGenerator::ValueToValueMapTy VMap;
  OMPGenerator OMPGen(Builder, P);

  Stride = Builder.getInt(APInt_from_MPZ(For->stride));
  Stride = Builder.CreateSExtOrBitCast(Stride, IntPtrTy);
  LB = ExpGen.codegen(For->LB, IntPtrTy);
  UB = ExpGen.codegen(For->UB, IntPtrTy);

  // Every value live in the loop body must be communicated through the
  // OpenMP subfunction structure.
  Values = getOMPValues();

  IV = OMPGen.createParallelLoop(LB, UB, Stride, Values, VMap, &LoopBody);
  BasicBlock::iterator AfterLoop = Builder.GetInsertPoint();
  Builder.SetInsertPoint(LoopBody);

  // Rewrite the maps to the subfunction-local values, generate the body,
  // then undo the rewrite.
  updateWithValueMap(VMap, /* reverse */ false);
  ClastVars[For->iterator] = IV;

  if (For->body)
    codegen(For->body);

  ClastVars.erase(For->iterator);
  updateWithValueMap(VMap, /* reverse */ true);

  // The subfunction's dominator tree was not kept up to date during code
  // generation; erase its nodes so it can be recomputed.
  clearDomtree((*LoopBody).getParent()->getParent(),
               P->getAnalysis<DominatorTree>());

  Builder.SetInsertPoint(AfterLoop);
}
1350 
1351 bool ClastStmtCodeGen::isInnermostLoop(const clast_for *f) {
1352   const clast_stmt *stmt = f->body;
1353 
1354   while (stmt) {
1355     if (!CLAST_STMT_IS_A(stmt, stmt_user))
1356       return false;
1357 
1358     stmt = stmt->next;
1359   }
1360 
1361   return true;
1362 }
1363 
1364 int ClastStmtCodeGen::getNumberOfIterations(const clast_for *f) {
1365   isl_set *loopDomain = isl_set_copy(isl_set_from_cloog_domain(f->domain));
1366   isl_set *tmp = isl_set_copy(loopDomain);
1367 
1368   // Calculate a map similar to the identity map, but with the last input
1369   // and output dimension not related.
1370   //  [i0, i1, i2, i3] -> [i0, i1, i2, o0]
1371   isl_space *Space = isl_set_get_space(loopDomain);
1372   Space = isl_space_drop_outputs(Space,
1373                                  isl_set_dim(loopDomain, isl_dim_set) - 2, 1);
1374   Space = isl_space_map_from_set(Space);
1375   isl_map *identity = isl_map_identity(Space);
1376   identity = isl_map_add_dims(identity, isl_dim_in, 1);
1377   identity = isl_map_add_dims(identity, isl_dim_out, 1);
1378 
1379   isl_map *map = isl_map_from_domain_and_range(tmp, loopDomain);
1380   map = isl_map_intersect(map, identity);
1381 
1382   isl_map *lexmax = isl_map_lexmax(isl_map_copy(map));
1383   isl_map *lexmin = isl_map_lexmin(map);
1384   isl_map *sub = isl_map_sum(lexmax, isl_map_neg(lexmin));
1385 
1386   isl_set *elements = isl_map_range(sub);
1387 
1388   if (!isl_set_is_singleton(elements)) {
1389     isl_set_free(elements);
1390     return -1;
1391   }
1392 
1393   isl_point *p = isl_set_sample_point(elements);
1394 
1395   isl_int v;
1396   isl_int_init(v);
1397   isl_point_get_coordinate(p, isl_dim_set, isl_set_n_dim(loopDomain) - 1, &v);
1398   int numberIterations = isl_int_get_si(v);
1399   isl_int_clear(v);
1400   isl_point_free(p);
1401 
1402   return (numberIterations) / isl_int_get_si(f->stride) + 1;
1403 }
1404 
// Generate a vectorized version of loop F: the loop is fully unrolled into
// one lane per iteration, with a precomputed induction variable value for
// each lane.
void ClastStmtCodeGen::codegenForVector(const clast_for *F) {
  DEBUG(dbgs() << "Vectorizing loop '" << F->iterator << "'\n";);
  int VectorWidth = getNumberOfIterations(F);

  Value *LB = ExpGen.codegen(F->LB, getIntPtrTy());

  APInt Stride = APInt_from_MPZ(F->stride);
  // The lower bound is generated with an integer type; use cast<> so a
  // violated expectation triggers an assertion instead of a null
  // dereference (was dyn_cast<> with no null check).
  IntegerType *LoopIVType = cast<IntegerType>(LB->getType());
  Stride = Stride.zext(LoopIVType->getBitWidth());
  Value *StrideValue = ConstantInt::get(LoopIVType, Stride);

  // Precompute the induction variable of every lane: LB, LB + S, LB + 2S...
  std::vector<Value*> IVS(VectorWidth);
  IVS[0] = LB;

  for (int i = 1; i < VectorWidth; i++)
    IVS[i] = Builder.CreateAdd(IVS[i-1], StrideValue, "p_vector_iv");

  isl_set *Domain = isl_set_from_cloog_domain(F->domain);

  // Add loop iv to symbols.
  ClastVars[F->iterator] = LB;

  for (const clast_stmt *Stmt = F->body; Stmt; Stmt = Stmt->next)
    codegen((const clast_user_stmt *)Stmt, &IVS, F->iterator,
            isl_set_copy(Domain));

  // Loop is finished, so remove its iv from the live symbols.
  isl_set_free(Domain);
  ClastVars.erase(F->iterator);
}
1439 
1440 void ClastStmtCodeGen::codegen(const clast_for *f) {
1441   if ((Vector || OpenMP) && P->getAnalysis<Dependences>().isParallelFor(f)) {
1442     if (Vector && isInnermostLoop(f) && (-1 != getNumberOfIterations(f))
1443         && (getNumberOfIterations(f) <= 16)) {
1444       codegenForVector(f);
1445       return;
1446     }
1447 
1448     if (OpenMP && !parallelCodeGeneration) {
1449       parallelCodeGeneration = true;
1450       parallelLoops.push_back(f->iterator);
1451       codegenForOpenMP(f);
1452       parallelCodeGeneration = false;
1453       return;
1454     }
1455   }
1456 
1457   codegenForSequential(f);
1458 }
1459 
1460 Value *ClastStmtCodeGen::codegen(const clast_equation *eq) {
1461   Value *LHS = ExpGen.codegen(eq->LHS, getIntPtrTy());
1462   Value *RHS = ExpGen.codegen(eq->RHS, getIntPtrTy());
1463   CmpInst::Predicate P;
1464 
1465   if (eq->sign == 0)
1466     P = ICmpInst::ICMP_EQ;
1467   else if (eq->sign > 0)
1468     P = ICmpInst::ICMP_SGE;
1469   else
1470     P = ICmpInst::ICMP_SLE;
1471 
1472   return Builder.CreateICmp(P, LHS, RHS);
1473 }
1474 
// Generate a run-time guard: the guarded code is only executed if all
// equations of the guard hold.
//
// CFG created here: CondBB conditionally branches to ThenBB (which holds
// the guarded code and falls through to MergeBB) or directly to MergeBB.
void ClastStmtCodeGen::codegen(const clast_guard *g) {
  Function *F = Builder.GetInsertBlock()->getParent();
  LLVMContext &Context = F->getContext();

  BasicBlock *CondBB = SplitBlock(Builder.GetInsertBlock(),
                                      Builder.GetInsertPoint(), P);
  CondBB->setName("polly.cond");
  BasicBlock *MergeBB = SplitBlock(CondBB, CondBB->begin(), P);
  MergeBB->setName("polly.merge");
  BasicBlock *ThenBB = BasicBlock::Create(Context, "polly.then", F);

  DominatorTree &DT = P->getAnalysis<DominatorTree>();
  DT.addNewBlock(ThenBB, CondBB);
  DT.changeImmediateDominator(MergeBB, CondBB);

  // SplitBlock inserted an unconditional branch; drop it so the
  // conditional branch on the guard predicate can be emitted instead.
  CondBB->getTerminator()->eraseFromParent();

  Builder.SetInsertPoint(CondBB);

  // And together all equations to form the guard predicate.
  Value *Predicate = codegen(&(g->eq[0]));

  for (int i = 1; i < g->n; ++i) {
    Value *TmpPredicate = codegen(&(g->eq[i]));
    Predicate = Builder.CreateAnd(Predicate, TmpPredicate);
  }

  Builder.CreateCondBr(Predicate, ThenBB, MergeBB);
  Builder.SetInsertPoint(ThenBB);
  Builder.CreateBr(MergeBB);
  Builder.SetInsertPoint(ThenBB->begin());

  codegen(g->then);

  Builder.SetInsertPoint(MergeBB->begin());
}
1510 
1511 void ClastStmtCodeGen::codegen(const clast_stmt *stmt) {
1512   if	    (CLAST_STMT_IS_A(stmt, stmt_root))
1513     assert(false && "No second root statement expected");
1514   else if (CLAST_STMT_IS_A(stmt, stmt_ass))
1515     codegen((const clast_assignment *)stmt);
1516   else if (CLAST_STMT_IS_A(stmt, stmt_user))
1517     codegen((const clast_user_stmt *)stmt);
1518   else if (CLAST_STMT_IS_A(stmt, stmt_block))
1519     codegen((const clast_block *)stmt);
1520   else if (CLAST_STMT_IS_A(stmt, stmt_for))
1521     codegen((const clast_for *)stmt);
1522   else if (CLAST_STMT_IS_A(stmt, stmt_guard))
1523     codegen((const clast_guard *)stmt);
1524 
1525   if (stmt->next)
1526     codegen(stmt->next);
1527 }
1528 
1529 void ClastStmtCodeGen::addParameters(const CloogNames *names) {
1530   SCEVExpander Rewriter(P->getAnalysis<ScalarEvolution>(), "polly");
1531 
1532   int i = 0;
1533   for (Scop::param_iterator PI = S->param_begin(), PE = S->param_end();
1534        PI != PE; ++PI) {
1535     assert(i < names->nb_parameters && "Not enough parameter names");
1536 
1537     const SCEV *Param = *PI;
1538     Type *Ty = Param->getType();
1539 
1540     Instruction *insertLocation = --(Builder.GetInsertBlock()->end());
1541     Value *V = Rewriter.expandCodeFor(Param, Ty, insertLocation);
1542     ClastVars[names->parameters[i]] = V;
1543 
1544     ++i;
1545   }
1546 }
1547 
// Entry point of clast code generation: make the Scop parameters available
// and generate the statement list following the root, if any.
void ClastStmtCodeGen::codegen(const clast_root *r) {
  addParameters(r->names);

  parallelCodeGeneration = false;

  const clast_stmt *stmt = (const clast_stmt*) r;
  if (stmt->next)
    codegen(stmt->next);
}
1557 
// ExpGen is wired to ClastVars, so expression code generation always sees
// the clast variable bindings current at the time of the call.
ClastStmtCodeGen::ClastStmtCodeGen(Scop *scop, IRBuilder<> &B, Pass *P) :
    S(scop), P(P), Builder(B), ExpGen(Builder, ClastVars) {}
1560 
1561 namespace {
1562 class CodeGeneration : public ScopPass {
1563   Region *region;
1564   Scop *S;
1565   DominatorTree *DT;
1566   RegionInfo *RI;
1567 
1568   std::vector<std::string> parallelLoops;
1569 
1570   public:
1571   static char ID;
1572 
1573   CodeGeneration() : ScopPass(ID) {}
1574 
1575   // Split the entry edge of the region and generate a new basic block on this
1576   // edge. This function also updates ScopInfo and RegionInfo.
1577   //
1578   // @param region The region where the entry edge will be splitted.
  BasicBlock *splitEdgeAdvanced(Region *region) {
    BasicBlock *newBlock;
    BasicBlock *splitBlock;

    newBlock = SplitEdge(region->getEnteringBlock(), region->getEntry(), this);

    if (DT->dominates(region->getEntry(), newBlock)) {
      // The old entry block itself was split: the Scop statements now live
      // in newBlock, so ScopInfo and RegionInfo must be redirected to it.
      BasicBlock *OldBlock = region->getEntry();
      std::string OldName = OldBlock->getName();

      // Update ScopInfo.
      for (Scop::iterator SI = S->begin(), SE = S->end(); SI != SE; ++SI)
        if ((*SI)->getBasicBlock() == OldBlock) {
          (*SI)->setBasicBlock(newBlock);
          break;
        }

      // Update RegionInfo. The names are swapped so newBlock keeps the
      // original entry's name.
      splitBlock = OldBlock;
      OldBlock->setName("polly.split");
      newBlock->setName(OldName);
      region->replaceEntry(newBlock);
      RI->setRegionFor(newBlock, region);
    } else {
      // The new block lies on the edge outside the region; it belongs to
      // the parent region.
      RI->setRegionFor(newBlock, region->getParent());
      splitBlock = newBlock;
    }

    return splitBlock;
  }
1609 
1610   // Create a split block that branches either to the old code or to a new basic
1611   // block where the new code can be inserted.
1612   //
1613   // @param Builder A builder that will be set to point to a basic block, where
1614   //                the new code can be generated.
1615   // @return The split basic block.
1616   BasicBlock *addSplitAndStartBlock(IRBuilder<> *Builder) {
1617     BasicBlock *StartBlock, *SplitBlock;
1618 
1619     SplitBlock = splitEdgeAdvanced(region);
1620     SplitBlock->setName("polly.split_new_and_old");
1621     Function *F = SplitBlock->getParent();
1622     StartBlock = BasicBlock::Create(F->getContext(), "polly.start", F);
1623     SplitBlock->getTerminator()->eraseFromParent();
1624     Builder->SetInsertPoint(SplitBlock);
1625     Builder->CreateCondBr(Builder->getTrue(), StartBlock, region->getEntry());
1626     DT->addNewBlock(StartBlock, SplitBlock);
1627     Builder->SetInsertPoint(StartBlock);
1628     return SplitBlock;
1629   }
1630 
1631   // Merge the control flow of the newly generated code with the existing code.
1632   //
1633   // @param SplitBlock The basic block where the control flow was split between
1634   //                   old and new version of the Scop.
1635   // @param Builder    An IRBuilder that points to the last instruction of the
1636   //                   newly generated code.
1637   void mergeControlFlow(BasicBlock *SplitBlock, IRBuilder<> *Builder) {
1638     BasicBlock *MergeBlock;
1639     Region *R = region;
1640 
1641     if (R->getExit()->getSinglePredecessor())
1642       // No splitEdge required.  A block with a single predecessor cannot have
1643       // PHI nodes that would complicate life.
1644       MergeBlock = R->getExit();
1645     else {
1646       MergeBlock = SplitEdge(R->getExitingBlock(), R->getExit(), this);
1647       // SplitEdge will never split R->getExit(), as R->getExit() has more than
1648       // one predecessor. Hence, mergeBlock is always a newly generated block.
1649       R->replaceExit(MergeBlock);
1650     }
1651 
1652     Builder->CreateBr(MergeBlock);
1653     MergeBlock->setName("polly.merge_new_and_old");
1654 
1655     if (DT->dominates(SplitBlock, MergeBlock))
1656       DT->changeImmediateDominator(MergeBlock, SplitBlock);
1657   }
1658 
1659   bool runOnScop(Scop &scop) {
1660     S = &scop;
1661     region = &S->getRegion();
1662     DT = &getAnalysis<DominatorTree>();
1663     RI = &getAnalysis<RegionInfo>();
1664 
1665     parallelLoops.clear();
1666 
1667     assert(region->isSimple() && "Only simple regions are supported");
1668 
1669     // In the CFG the optimized code of the SCoP is generated next to the
1670     // original code. Both the new and the original version of the code remain
1671     // in the CFG. A branch statement decides which version is executed.
1672     // For now, we always execute the new version (the old one is dead code
1673     // eliminated by the cleanup passes). In the future we may decide to execute
1674     // the new version only if certain run time checks succeed. This will be
1675     // useful to support constructs for which we cannot prove all assumptions at
1676     // compile time.
1677     //
1678     // Before transformation:
1679     //
1680     //                        bb0
1681     //                         |
1682     //                     orig_scop
1683     //                         |
1684     //                        bb1
1685     //
1686     // After transformation:
1687     //                        bb0
1688     //                         |
1689     //                  polly.splitBlock
1690     //                     /       \.
1691     //                     |     startBlock
1692     //                     |        |
1693     //               orig_scop   new_scop
1694     //                     \      /
1695     //                      \    /
1696     //                        bb1 (joinBlock)
1697     IRBuilder<> builder(region->getEntry());
1698 
1699     // The builder will be set to startBlock.
1700     BasicBlock *splitBlock = addSplitAndStartBlock(&builder);
1701     BasicBlock *StartBlock = builder.GetInsertBlock();
1702 
1703     mergeControlFlow(splitBlock, &builder);
1704     builder.SetInsertPoint(StartBlock->begin());
1705 
1706     ClastStmtCodeGen CodeGen(S, builder, this);
1707     CloogInfo &C = getAnalysis<CloogInfo>();
1708     CodeGen.codegen(C.getClast());
1709 
1710     parallelLoops.insert(parallelLoops.begin(),
1711                          CodeGen.getParallelLoops().begin(),
1712                          CodeGen.getParallelLoops().end());
1713 
1714     return true;
1715   }
1716 
1717   virtual void printScop(raw_ostream &OS) const {
1718     for (std::vector<std::string>::const_iterator PI = parallelLoops.begin(),
1719          PE = parallelLoops.end(); PI != PE; ++PI)
1720       OS << "Parallel loop with iterator '" << *PI << "' generated\n";
1721   }
1722 
  // Declare the analyses this pass needs as input and those it keeps valid.
  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
    // Analyses required before code generation can run.
    AU.addRequired<CloogInfo>();
    AU.addRequired<Dependences>();
    AU.addRequired<DominatorTree>();
    AU.addRequired<RegionInfo>();
    AU.addRequired<ScalarEvolution>();
    AU.addRequired<ScopDetection>();
    AU.addRequired<ScopInfo>();
    AU.addRequired<TargetData>();

    // Analyses that remain valid for the original (now dead) code.
    AU.addPreserved<CloogInfo>();
    AU.addPreserved<Dependences>();

    // FIXME: We do not create LoopInfo for the newly generated loops.
    AU.addPreserved<LoopInfo>();
    AU.addPreserved<DominatorTree>();
    AU.addPreserved<ScopDetection>();
    AU.addPreserved<ScalarEvolution>();

    // FIXME: We do not yet add regions for the newly generated code to the
    //        region tree.
    AU.addPreserved<RegionInfo>();
    AU.addPreserved<TempScopInfo>();
    AU.addPreserved<ScopInfo>();
    AU.addPreservedID(IndependentBlocksID);
  }
1749 };
1750 }
1751 
// Pass identification; LLVM uses the *address* of ID, not its value.
char CodeGeneration::ID = 1;
1753 
// Register the pass and the analyses it depends on with the pass registry.
// NOTE(review): getAnalysisUsage() also requires ScopInfo, which is missing
// from this dependency list — confirm whether it is initialized elsewhere.
INITIALIZE_PASS_BEGIN(CodeGeneration, "polly-codegen",
                      "Polly - Create LLVM-IR from SCoPs", false, false)
INITIALIZE_PASS_DEPENDENCY(CloogInfo)
INITIALIZE_PASS_DEPENDENCY(Dependences)
INITIALIZE_PASS_DEPENDENCY(DominatorTree)
INITIALIZE_PASS_DEPENDENCY(RegionInfo)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
INITIALIZE_PASS_DEPENDENCY(ScopDetection)
INITIALIZE_PASS_DEPENDENCY(TargetData)
INITIALIZE_PASS_END(CodeGeneration, "polly-codegen",
                      "Polly - Create LLVM-IR from SCoPs", false, false)
1765 
1766 Pass *polly::createCodeGenerationPass() {
1767   return new CodeGeneration();
1768 }
1769