//===--- BlockGenerators.cpp - Generate code for statements -----*- C++ -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements the BlockGenerator and VectorBlockGenerator classes,
// which generate sequential code and vectorized code for a polyhedral
// statement, respectively.
//
//===----------------------------------------------------------------------===//

#include "polly/CodeGen/BlockGenerators.h"
#include "polly/CodeGen/CodeGeneration.h"
#include "polly/CodeGen/IslExprBuilder.h"
#include "polly/CodeGen/RuntimeDebugBuilder.h"
#include "polly/Options.h"
#include "polly/ScopInfo.h"
#include "polly/Support/GICHelper.h"
#include "polly/Support/SCEVValidator.h"
#include "polly/Support/ScopHelper.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/RegionInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "isl/aff.h"
#include "isl/ast.h"
#include "isl/ast_build.h"
#include "isl/set.h"
#include <deque>

using namespace llvm;
using namespace polly;

static cl::opt<bool> Aligned("enable-polly-aligned",
                             cl::desc("Assume aligned memory accesses."),
                             cl::Hidden, cl::init(false), cl::ZeroOrMore,
                             cl::cat(PollyCategory));

bool PollyDebugPrinting;
static cl::opt<bool, true> DebugPrintingX(
    "polly-codegen-add-debug-printing",
    cl::desc("Add printf calls that show the values loaded/stored."),
    cl::location(PollyDebugPrinting), cl::Hidden, cl::init(false),
    cl::ZeroOrMore, cl::cat(PollyCategory));

BlockGenerator::BlockGenerator(
    PollyIRBuilder &B, LoopInfo &LI, ScalarEvolution &SE, DominatorTree &DT,
    AllocaMapTy &ScalarMap, EscapeUsersAllocaMapTy &EscapeMap,
    ValueMapT &GlobalMap, IslExprBuilder *ExprBuilder, BasicBlock *StartBlock)
    : Builder(B), LI(LI), SE(SE), ExprBuilder(ExprBuilder), DT(DT),
      EntryBB(nullptr), ScalarMap(ScalarMap), EscapeMap(EscapeMap),
      GlobalMap(GlobalMap), StartBlock(StartBlock) {}

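// Rather than copying an instruction such as an induction-variable
// computation, the block generator can often re-synthesize the value from its
// SCEV expression, with references to the original loops rewritten (via LTS)
// to the induction variables of the newly generated loops. Illustrative
// sketch (not taken from an actual test case): for '%add = add i64 %i, 1'
// with original induction variable %i and new loop counter %polly.indvar, the
// rewritten SCEV is expanded to '%p_add = add i64 %polly.indvar, 1' at the
// current insert point.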
Value *BlockGenerator::trySynthesizeNewValue(ScopStmt &Stmt, Value *Old,
                                             ValueMapT &BBMap,
                                             LoopToScevMapT &LTS,
                                             Loop *L) const {
  if (!SE.isSCEVable(Old->getType()))
    return nullptr;

  const SCEV *Scev = SE.getSCEVAtScope(Old, L);
  if (!Scev)
    return nullptr;

  if (isa<SCEVCouldNotCompute>(Scev))
    return nullptr;

  const SCEV *NewScev = SCEVLoopAddRecRewriter::rewrite(Scev, LTS, SE);
  ValueMapT VTV;
  VTV.insert(BBMap.begin(), BBMap.end());
  VTV.insert(GlobalMap.begin(), GlobalMap.end());

  Scop &S = *Stmt.getParent();
  const DataLayout &DL = S.getFunction().getParent()->getDataLayout();
  auto IP = Builder.GetInsertPoint();

  assert(IP != Builder.GetInsertBlock()->end() &&
         "Only instructions can be insert points for SCEVExpander");
  Value *Expanded =
      expandCodeFor(S, SE, DL, "polly", NewScev, Old->getType(), &*IP, &VTV,
                    StartBlock->getSinglePredecessor());

  BBMap[Old] = Expanded;
  return Expanded;
}

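// getNewValue resolves an old value in a fixed order: constants and inline
// asm are reused as-is, then GlobalMap and the block-local BBMap are
// consulted, then SCEV-based synthesis is attempted, and finally
// scop-constant globals, arguments, and instructions defined outside the
// scop are reused unchanged.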
Value *BlockGenerator::getNewValue(ScopStmt &Stmt, Value *Old, ValueMapT &BBMap,
                                   LoopToScevMapT &LTS, Loop *L) const {
  // Constants that do not reference any named value can always remain
  // unchanged. Handle them early to avoid expensive map lookups. We do not
  // take the fast path for external constants which are referenced through
  // globals, as these may need to be rewritten when distributing code across
  // different LLVM modules.
  if (isa<Constant>(Old) && !isa<GlobalValue>(Old))
    return Old;

  // Inline asm is like a constant to us.
  if (isa<InlineAsm>(Old))
    return Old;

  if (Value *New = GlobalMap.lookup(Old)) {
    if (Value *NewRemapped = GlobalMap.lookup(New))
      New = NewRemapped;
    if (Old->getType()->getScalarSizeInBits() <
        New->getType()->getScalarSizeInBits())
      New = Builder.CreateTruncOrBitCast(New, Old->getType());

    return New;
  }

  if (Value *New = BBMap.lookup(Old))
    return New;

  if (Value *New = trySynthesizeNewValue(Stmt, Old, BBMap, LTS, L))
    return New;

  // A scop-constant value defined by a global or a function parameter.
  if (isa<GlobalValue>(Old) || isa<Argument>(Old))
    return Old;

  // A scop-constant value defined by an instruction executed outside the scop.
  if (const Instruction *Inst = dyn_cast<Instruction>(Old))
    if (!Stmt.getParent()->contains(Inst->getParent()))
      return Old;

  // The scalar dependence is neither available nor SCEVCodegenable.
  llvm_unreachable("Unexpected scalar dependence in region!");
  return nullptr;
}

void BlockGenerator::copyInstScalar(ScopStmt &Stmt, Instruction *Inst,
                                    ValueMapT &BBMap, LoopToScevMapT &LTS) {
  // We do not generate debug intrinsics as we have not investigated how to
  // copy them correctly. At the moment they would just crash code generation,
  // as their metadata operands are not correctly copied.
  if (isa<DbgInfoIntrinsic>(Inst))
    return;

  Instruction *NewInst = Inst->clone();

  // Replace old operands with the new ones.
  for (Value *OldOperand : Inst->operands()) {
    Value *NewOperand =
        getNewValue(Stmt, OldOperand, BBMap, LTS, getLoopForStmt(Stmt));

    if (!NewOperand) {
      assert(!isa<StoreInst>(NewInst) &&
             "Store instructions are always needed!");
      delete NewInst;
      return;
    }

    NewInst->replaceUsesOfWith(OldOperand, NewOperand);
  }

  Builder.Insert(NewInst);
  BBMap[Inst] = NewInst;

  if (!NewInst->getType()->isVoidTy())
    NewInst->setName("p_" + Inst->getName());
}

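// For explicit (array) accesses, the access relation computed during
// scheduling is available as an isl_ast_expr in NewAccesses. In that case
// the address is generated from this expression; otherwise the original
// pointer operand is simply remapped with getNewValue.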
Value *
BlockGenerator::generateLocationAccessed(ScopStmt &Stmt, MemAccInst Inst,
                                         ValueMapT &BBMap, LoopToScevMapT &LTS,
                                         isl_id_to_ast_expr *NewAccesses) {
  const MemoryAccess &MA = Stmt.getArrayAccessFor(Inst);
  return generateLocationAccessed(
      Stmt, getLoopForStmt(Stmt),
      Inst.isNull() ? nullptr : Inst.getPointerOperand(), BBMap, LTS,
      NewAccesses, MA.getId(), MA.getAccessValue()->getType());
}

Value *BlockGenerator::generateLocationAccessed(
    ScopStmt &Stmt, Loop *L, Value *Pointer, ValueMapT &BBMap,
    LoopToScevMapT &LTS, isl_id_to_ast_expr *NewAccesses, __isl_take isl_id *Id,
    Type *ExpectedType) {
  isl_ast_expr *AccessExpr = isl_id_to_ast_expr_get(NewAccesses, Id);

  if (AccessExpr) {
    AccessExpr = isl_ast_expr_address_of(AccessExpr);
    auto Address = ExprBuilder->create(AccessExpr);

    // Cast the address of this memory access to a pointer type that has the
    // same element type as the original access, but uses the address space of
    // the newly generated pointer.
    auto OldPtrTy = ExpectedType->getPointerTo();
    auto NewPtrTy = Address->getType();
    OldPtrTy = PointerType::get(OldPtrTy->getElementType(),
                                NewPtrTy->getPointerAddressSpace());

    if (OldPtrTy != NewPtrTy)
      Address = Builder.CreateBitOrPointerCast(Address, OldPtrTy);
    return Address;
  }
  assert(
      Pointer &&
      "If expression was not generated, must use the original pointer value");
  return getNewValue(Stmt, Pointer, BBMap, LTS, L);
}

Value *
BlockGenerator::getImplicitAddress(MemoryAccess &Access, Loop *L,
                                   LoopToScevMapT &LTS, ValueMapT &BBMap,
                                   __isl_keep isl_id_to_ast_expr *NewAccesses) {
  if (Access.isLatestArrayKind())
    return generateLocationAccessed(*Access.getStatement(), L, nullptr, BBMap,
                                    LTS, NewAccesses, Access.getId(),
                                    Access.getAccessValue()->getType());

  return getOrCreateAlloca(Access);
}

Loop *BlockGenerator::getLoopForStmt(const ScopStmt &Stmt) const {
  auto *StmtBB = Stmt.getEntryBlock();
  return LI.getLoopFor(StmtBB);
}

Value *BlockGenerator::generateArrayLoad(ScopStmt &Stmt, LoadInst *Load,
                                         ValueMapT &BBMap, LoopToScevMapT &LTS,
                                         isl_id_to_ast_expr *NewAccesses) {
  if (Value *PreloadLoad = GlobalMap.lookup(Load))
    return PreloadLoad;

  Value *NewPointer =
      generateLocationAccessed(Stmt, Load, BBMap, LTS, NewAccesses);
  Value *ScalarLoad = Builder.CreateAlignedLoad(
      NewPointer, Load->getAlignment(), Load->getName() + "_p_scalar_");

  if (PollyDebugPrinting)
    RuntimeDebugBuilder::createCPUPrinter(Builder, "Load from ", NewPointer,
                                          ": ", ScalarLoad, "\n");

  return ScalarLoad;
}

void BlockGenerator::generateArrayStore(ScopStmt &Stmt, StoreInst *Store,
                                        ValueMapT &BBMap, LoopToScevMapT &LTS,
                                        isl_id_to_ast_expr *NewAccesses) {
  Value *NewPointer =
      generateLocationAccessed(Stmt, Store, BBMap, LTS, NewAccesses);
  Value *ValueOperand = getNewValue(Stmt, Store->getValueOperand(), BBMap, LTS,
                                    getLoopForStmt(Stmt));

  if (PollyDebugPrinting)
    RuntimeDebugBuilder::createCPUPrinter(Builder, "Store to  ", NewPointer,
                                          ": ", ValueOperand, "\n");

  Builder.CreateAlignedStore(ValueOperand, NewPointer, Store->getAlignment());
}

bool BlockGenerator::canSyntheziseInStmt(ScopStmt &Stmt, Instruction *Inst) {
  Loop *L = getLoopForStmt(Stmt);
  return (Stmt.isBlockStmt() || !Stmt.getRegion()->contains(L)) &&
         canSynthesize(Inst, *Stmt.getParent(), &SE, L);
}

void BlockGenerator::copyInstruction(ScopStmt &Stmt, Instruction *Inst,
                                     ValueMapT &BBMap, LoopToScevMapT &LTS,
                                     isl_id_to_ast_expr *NewAccesses) {
  // Terminator instructions direct the control flow. They are explicitly
  // expressed in the generated isl AST and do not need to be copied.
  if (Inst->isTerminator())
    return;

  // Synthesizable statements will be generated on-demand.
  if (canSyntheziseInStmt(Stmt, Inst))
    return;

  if (auto *Load = dyn_cast<LoadInst>(Inst)) {
    Value *NewLoad = generateArrayLoad(Stmt, Load, BBMap, LTS, NewAccesses);
    // Compute NewLoad before its insertion in BBMap to make the insertion
    // deterministic.
    BBMap[Load] = NewLoad;
    return;
  }

  if (auto *Store = dyn_cast<StoreInst>(Inst)) {
    // Identified as redundant by -polly-simplify.
    if (!Stmt.getArrayAccessOrNULLFor(Store))
      return;

    generateArrayStore(Stmt, Store, BBMap, LTS, NewAccesses);
    return;
  }

  if (auto *PHI = dyn_cast<PHINode>(Inst)) {
    copyPHIInstruction(Stmt, PHI, BBMap, LTS);
    return;
  }

  // Skip some special intrinsics for which we do not adjust the semantics to
  // the new schedule. All others are handled like every other instruction.
  if (isIgnoredIntrinsic(Inst))
    return;

  copyInstScalar(Stmt, Inst, BBMap, LTS);
}

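// Sweep the generated basic block backwards and erase instructions that have
// become trivially dead, e.g. loads whose only users were synthesized away.
// Erasing an instruction invalidates the reverse iterator, so the sweep
// restarts from the end after each removal; BBMap entries that point at the
// removed instruction are dropped as well.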
void BlockGenerator::removeDeadInstructions(BasicBlock *BB, ValueMapT &BBMap) {
  auto NewBB = Builder.GetInsertBlock();
  for (auto I = NewBB->rbegin(); I != NewBB->rend(); I++) {
    Instruction *NewInst = &*I;

    if (!isInstructionTriviallyDead(NewInst))
      continue;

    // Collect the keys first; erasing while iterating over the map would
    // invalidate its iterators.
    SmallVector<Value *, 4> DeadKeys;
    for (auto &Pair : BBMap)
      if (Pair.second == NewInst)
        DeadKeys.push_back(Pair.first);
    for (Value *Key : DeadKeys)
      BBMap.erase(Key);

    NewInst->eraseFromParent();
    I = NewBB->rbegin();
  }
}
324 
325 void BlockGenerator::copyStmt(ScopStmt &Stmt, LoopToScevMapT &LTS,
326                               isl_id_to_ast_expr *NewAccesses) {
327   assert(Stmt.isBlockStmt() &&
328          "Only block statements can be copied by the block generator");
329 
330   ValueMapT BBMap;
331 
332   BasicBlock *BB = Stmt.getBasicBlock();
333   copyBB(Stmt, BB, BBMap, LTS, NewAccesses);
334   removeDeadInstructions(BB, BBMap);
335 }
336 
337 BasicBlock *BlockGenerator::splitBB(BasicBlock *BB) {
338   BasicBlock *CopyBB = SplitBlock(Builder.GetInsertBlock(),
339                                   &*Builder.GetInsertPoint(), &DT, &LI);
340   CopyBB->setName("polly.stmt." + BB->getName());
341   return CopyBB;
342 }
343 
344 BasicBlock *BlockGenerator::copyBB(ScopStmt &Stmt, BasicBlock *BB,
345                                    ValueMapT &BBMap, LoopToScevMapT &LTS,
346                                    isl_id_to_ast_expr *NewAccesses) {
347   BasicBlock *CopyBB = splitBB(BB);
348   Builder.SetInsertPoint(&CopyBB->front());
349   generateScalarLoads(Stmt, LTS, BBMap, NewAccesses);
350 
351   copyBB(Stmt, BB, CopyBB, BBMap, LTS, NewAccesses);
352 
  // After the basic block has been copied, store all scalars that escape this
  // block in their allocas.
  generateScalarStores(Stmt, LTS, BBMap, NewAccesses);
  return CopyBB;
}

void BlockGenerator::copyBB(ScopStmt &Stmt, BasicBlock *BB, BasicBlock *CopyBB,
                            ValueMapT &BBMap, LoopToScevMapT &LTS,
                            isl_id_to_ast_expr *NewAccesses) {
  EntryBB = &CopyBB->getParent()->getEntryBlock();

  for (Instruction &Inst : *BB)
    copyInstruction(Stmt, &Inst, BBMap, LTS, NewAccesses);
}

Value *BlockGenerator::getOrCreateAlloca(const MemoryAccess &Access) {
  assert(!Access.isLatestArrayKind() && "Trying to get alloca for array kind");

  return getOrCreateAlloca(Access.getLatestScopArrayInfo());
}

Value *BlockGenerator::getOrCreateAlloca(const ScopArrayInfo *Array) {
  assert(!Array->isArrayKind() && "Trying to get alloca for array kind");

  auto &Addr = ScalarMap[Array];

  if (Addr) {
    // Allow allocas to be (temporarily) redirected once by adding a new
    // old-alloca-addr to new-addr mapping to GlobalMap. This functionality
    // is used for example by the OpenMP code generation where a first use
    // of a scalar while still in the host code allocates a normal alloca with
    // getOrCreateAlloca. When the values of this scalar are accessed during
    // the generation of the parallel subfunction, these values are copied over
    // to the parallel subfunction and each request for a scalar alloca slot
    // must be forwarded to the temporary in-subfunction slot. This mapping is
    // removed when the subfunction has been generated and again normal host
    // code is generated. Due to the following reasons it is not possible to
    // perform the GlobalMap lookup right after creating the alloca below, but
    // instead we need to check GlobalMap at each call to getOrCreateAlloca:
    //
    //   1) GlobalMap may be changed multiple times (for each parallel loop),
    //   2) The temporary mapping is commonly only known after the initial
    //      alloca has already been generated, and
    //   3) The original alloca value must be restored after leaving the
    //      sub-function.
    if (Value *NewAddr = GlobalMap.lookup(&*Addr))
      return NewAddr;
    return Addr;
  }

  Type *Ty = Array->getElementType();
  Value *ScalarBase = Array->getBasePtr();
  std::string NameExt;
  if (Array->isPHIKind())
    NameExt = ".phiops";
  else
    NameExt = ".s2a";

  Addr = new AllocaInst(Ty, ScalarBase->getName() + NameExt);
  EntryBB = &Builder.GetInsertBlock()->getParent()->getEntryBlock();
  Addr->insertBefore(&*EntryBB->getFirstInsertionPt());

  return Addr;
}

void BlockGenerator::handleOutsideUsers(const Scop &S, ScopArrayInfo *Array) {
  Instruction *Inst = cast<Instruction>(Array->getBasePtr());

  // If there are escape users, we get the alloca for this instruction and put
  // it in the EscapeMap for later finalization. If the instruction was copied
  // multiple times, we have already done this and can exit.
  if (EscapeMap.count(Inst))
    return;

  EscapeUserVectorTy EscapeUsers;
  for (User *U : Inst->users()) {

    // Non-instruction users will never escape.
    Instruction *UI = dyn_cast<Instruction>(U);
    if (!UI)
      continue;

    if (S.contains(UI))
      continue;

    EscapeUsers.push_back(UI);
  }

  // Exit if no escape uses were found.
  if (EscapeUsers.empty())
    return;

  // Get or create an escape alloca for this instruction.
  auto *ScalarAddr = getOrCreateAlloca(Array);

  // Remember that this instruction has escape uses and the escape alloca.
  EscapeMap[Inst] = std::make_pair(ScalarAddr, std::move(EscapeUsers));
}

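// Scalars that were demoted to memory (MemoryKind::Value or MemoryKind::PHI
// accesses) are made available inside the generated statement by reloading
// them, either from their alloca or from the location named by a
// corresponding new access expression, before any instruction of the
// statement is copied.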
void BlockGenerator::generateScalarLoads(
    ScopStmt &Stmt, LoopToScevMapT &LTS, ValueMapT &BBMap,
    __isl_keep isl_id_to_ast_expr *NewAccesses) {
  for (MemoryAccess *MA : Stmt) {
    if (MA->isOriginalArrayKind() || MA->isWrite())
      continue;

#ifndef NDEBUG
    auto *StmtDom = Stmt.getDomain();
    auto *AccDom = isl_map_domain(MA->getAccessRelation());
    assert(isl_set_is_subset(StmtDom, AccDom) &&
           "Scalar must be loaded in all statement instances");
    isl_set_free(StmtDom);
    isl_set_free(AccDom);
#endif

    auto *Address =
        getImplicitAddress(*MA, getLoopForStmt(Stmt), LTS, BBMap, NewAccesses);
    assert((!isa<Instruction>(Address) ||
            DT.dominates(cast<Instruction>(Address)->getParent(),
                         Builder.GetInsertBlock())) &&
           "Domination violation");
    BBMap[MA->getAccessValue()] =
        Builder.CreateLoad(Address, Address->getName() + ".reload");
  }
}

void BlockGenerator::generateScalarStores(
    ScopStmt &Stmt, LoopToScevMapT &LTS, ValueMapT &BBMap,
    __isl_keep isl_id_to_ast_expr *NewAccesses) {
  Loop *L = LI.getLoopFor(Stmt.getBasicBlock());

  assert(Stmt.isBlockStmt() &&
         "Region statements need to use the generateScalarStores() function in "
         "the RegionGenerator");

  for (MemoryAccess *MA : Stmt) {
    if (MA->isOriginalArrayKind() || MA->isRead())
      continue;

#ifndef NDEBUG
    auto *StmtDom = Stmt.getDomain();
    auto *AccDom = isl_map_domain(MA->getAccessRelation());
    assert(isl_set_is_subset(StmtDom, AccDom) &&
           "Scalar must be stored in all statement instances");
    isl_set_free(StmtDom);
    isl_set_free(AccDom);
#endif

    Value *Val = MA->getAccessValue();
    if (MA->isAnyPHIKind()) {
      assert(MA->getIncoming().size() >= 1 &&
             "Block statements have exactly one exiting block, or multiple "
             "with the same incoming block and value");
      assert(std::all_of(MA->getIncoming().begin(), MA->getIncoming().end(),
                         [&](std::pair<BasicBlock *, Value *> p) -> bool {
                           return p.first == Stmt.getBasicBlock();
                         }) &&
             "Incoming block must be statement's block");
      Val = MA->getIncoming()[0].second;
    }
    auto Address =
        getImplicitAddress(*MA, getLoopForStmt(Stmt), LTS, BBMap, NewAccesses);

    Val = getNewValue(Stmt, Val, BBMap, LTS, L);
    assert((!isa<Instruction>(Val) ||
            DT.dominates(cast<Instruction>(Val)->getParent(),
                         Builder.GetInsertBlock())) &&
           "Domination violation");
    assert((!isa<Instruction>(Address) ||
            DT.dominates(cast<Instruction>(Address)->getParent(),
                         Builder.GetInsertBlock())) &&
           "Domination violation");
    Builder.CreateStore(Val, Address);
  }
}

void BlockGenerator::createScalarInitialization(Scop &S) {
  BasicBlock *ExitBB = S.getExit();
  BasicBlock *PreEntryBB = S.getEnteringBlock();

  Builder.SetInsertPoint(&*StartBlock->begin());

  for (auto &Array : S.arrays()) {
    if (Array->getNumberOfDimensions() != 0)
      continue;
    if (Array->isPHIKind()) {
      // For PHI nodes, the only values we need to store are the ones that
      // reach the PHI node from outside the region. In general there should
      // only be one such incoming edge and this edge should enter through
      // 'PreEntryBB'.
      auto PHI = cast<PHINode>(Array->getBasePtr());

      for (auto BI = PHI->block_begin(), BE = PHI->block_end(); BI != BE; BI++)
        if (!S.contains(*BI) && *BI != PreEntryBB)
          llvm_unreachable("Incoming edges from outside the scop should always "
                           "come from PreEntryBB");

      int Idx = PHI->getBasicBlockIndex(PreEntryBB);
      if (Idx < 0)
        continue;

      Value *ScalarValue = PHI->getIncomingValue(Idx);

      Builder.CreateStore(ScalarValue, getOrCreateAlloca(Array));
      continue;
    }

    auto *Inst = dyn_cast<Instruction>(Array->getBasePtr());

    if (Inst && S.contains(Inst))
      continue;

    // PHI nodes that are not marked as such in their SAI object are either exit
    // PHI nodes we model as common scalars but without initialization, or
    // incoming phi nodes that need to be initialized. Check if the first is the
    // case for Inst and do not create and initialize memory if so.
    if (auto *PHI = dyn_cast_or_null<PHINode>(Inst))
      if (!S.hasSingleExitEdge() && PHI->getBasicBlockIndex(ExitBB) >= 0)
        continue;

    Builder.CreateStore(Array->getBasePtr(), getOrCreateAlloca(Array));
  }
}

void BlockGenerator::createScalarFinalization(Scop &S) {
  // The exit block of the __unoptimized__ region.
  BasicBlock *ExitBB = S.getExitingBlock();
  // The merge block __just after__ the region and the optimized region.
  BasicBlock *MergeBB = S.getExit();

  // The exit block of the __optimized__ region.
  BasicBlock *OptExitBB = *(pred_begin(MergeBB));
  if (OptExitBB == ExitBB)
    OptExitBB = *(++pred_begin(MergeBB));

  Builder.SetInsertPoint(OptExitBB->getTerminator());
  for (const auto &EscapeMapping : EscapeMap) {
    // Extract the escaping instruction and the escaping users as well as the
    // alloca the instruction was demoted to.
    Instruction *EscapeInst = EscapeMapping.first;
    const auto &EscapeMappingValue = EscapeMapping.second;
    const EscapeUserVectorTy &EscapeUsers = EscapeMappingValue.second;
    Value *ScalarAddr = EscapeMappingValue.first;

    // Reload the demoted instruction in the optimized version of the SCoP.
    Value *EscapeInstReload =
        Builder.CreateLoad(ScalarAddr, EscapeInst->getName() + ".final_reload");
    EscapeInstReload =
        Builder.CreateBitOrPointerCast(EscapeInstReload, EscapeInst->getType());

    // Create the merge PHI that merges the optimized and unoptimized version.
    PHINode *MergePHI = PHINode::Create(EscapeInst->getType(), 2,
                                        EscapeInst->getName() + ".merge");
    MergePHI->insertBefore(&*MergeBB->getFirstInsertionPt());

    // Add the respective values to the merge PHI.
    MergePHI->addIncoming(EscapeInstReload, OptExitBB);
    MergePHI->addIncoming(EscapeInst, ExitBB);

    // The scalar evolution information about the escaping instruction needs
    // to be invalidated so that the new merge PHI will be used.
    if (SE.isSCEVable(EscapeInst->getType()))
      SE.forgetValue(EscapeInst);

    // Replace all uses of the demoted instruction with the merge PHI.
    for (Instruction *EUser : EscapeUsers)
      EUser->replaceUsesOfWith(EscapeInst, MergePHI);
  }
}

void BlockGenerator::findOutsideUsers(Scop &S) {
  for (auto &Array : S.arrays()) {

    if (Array->getNumberOfDimensions() != 0)
      continue;

    if (Array->isPHIKind())
      continue;

    auto *Inst = dyn_cast<Instruction>(Array->getBasePtr());

    if (!Inst)
      continue;

    // Scop invariant hoisting moves some of the base pointers out of the scop.
    // We can ignore these, as the invariant load hoisting already registers the
    // relevant outside users.
    if (!S.contains(Inst))
      continue;

    handleOutsideUsers(S, Array);
  }
}

void BlockGenerator::createExitPHINodeMerges(Scop &S) {
  if (S.hasSingleExitEdge())
    return;

  auto *ExitBB = S.getExitingBlock();
  auto *MergeBB = S.getExit();
  auto *AfterMergeBB = MergeBB->getSingleSuccessor();
  BasicBlock *OptExitBB = *(pred_begin(MergeBB));
  if (OptExitBB == ExitBB)
    OptExitBB = *(++pred_begin(MergeBB));

  Builder.SetInsertPoint(OptExitBB->getTerminator());

  for (auto &SAI : S.arrays()) {
    auto *Val = SAI->getBasePtr();

    // Only Value-like scalars need a merge PHI. Exit block PHIs receive either
    // the original PHI's value or the reloaded incoming values from the
    // generated code. An llvm::Value is merged from either the original code's
    // value or the generated one.
    if (!SAI->isExitPHIKind())
      continue;

    PHINode *PHI = dyn_cast<PHINode>(Val);
    if (!PHI)
      continue;

    if (PHI->getParent() != AfterMergeBB)
      continue;

    std::string Name = PHI->getName();
    Value *ScalarAddr = getOrCreateAlloca(SAI);
    Value *Reload = Builder.CreateLoad(ScalarAddr, Name + ".ph.final_reload");
    Reload = Builder.CreateBitOrPointerCast(Reload, PHI->getType());
    Value *OriginalValue = PHI->getIncomingValueForBlock(MergeBB);
    assert((!isa<Instruction>(OriginalValue) ||
            cast<Instruction>(OriginalValue)->getParent() != MergeBB) &&
           "Original value must not be one we just generated.");
    auto *MergePHI = PHINode::Create(PHI->getType(), 2, Name + ".ph.merge");
    MergePHI->insertBefore(&*MergeBB->getFirstInsertionPt());
    MergePHI->addIncoming(Reload, OptExitBB);
    MergePHI->addIncoming(OriginalValue, ExitBB);
    int Idx = PHI->getBasicBlockIndex(MergeBB);
    PHI->setIncomingValue(Idx, MergePHI);
  }
}

void BlockGenerator::invalidateScalarEvolution(Scop &S) {
  for (auto &Stmt : S)
    if (Stmt.isCopyStmt())
      continue;
    else if (Stmt.isBlockStmt())
      for (auto &Inst : *Stmt.getBasicBlock())
        SE.forgetValue(&Inst);
    else if (Stmt.isRegionStmt())
      for (auto *BB : Stmt.getRegion()->blocks())
        for (auto &Inst : *BB)
          SE.forgetValue(&Inst);
    else
      llvm_unreachable("Unexpected statement type found");
}

void BlockGenerator::finalizeSCoP(Scop &S) {
  findOutsideUsers(S);
  createScalarInitialization(S);
  createExitPHINodeMerges(S);
  createScalarFinalization(S);
  invalidateScalarEvolution(S);
}

VectorBlockGenerator::VectorBlockGenerator(BlockGenerator &BlockGen,
                                           std::vector<LoopToScevMapT> &VLTS,
                                           isl_map *Schedule)
    : BlockGenerator(BlockGen), VLTS(VLTS), Schedule(Schedule) {
  assert(Schedule && "No statement schedule provided");
}

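// Build (or reuse) the vector version of an old scalar value by generating
// the per-lane scalar values and combining them with insertelement
// instructions. Illustrative sketch of the generated IR for width 4 (names
// invented for this example):
//
//   %v0 = insertelement <4 x i64> undef, i64 %lane0, i32 0
//   ...
//   %v3 = insertelement <4 x i64> %v2, i64 %lane3, i32 3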
Value *VectorBlockGenerator::getVectorValue(ScopStmt &Stmt, Value *Old,
                                            ValueMapT &VectorMap,
                                            VectorValueMapT &ScalarMaps,
                                            Loop *L) {
  if (Value *NewValue = VectorMap.lookup(Old))
    return NewValue;

  int Width = getVectorWidth();

  Value *Vector = UndefValue::get(VectorType::get(Old->getType(), Width));

  for (int Lane = 0; Lane < Width; Lane++)
    Vector = Builder.CreateInsertElement(
        Vector, getNewValue(Stmt, Old, ScalarMaps[Lane], VLTS[Lane], L),
        Builder.getInt32(Lane));

  VectorMap[Old] = Vector;

  return Vector;
}

Type *VectorBlockGenerator::getVectorPtrTy(const Value *Val, int Width) {
  PointerType *PointerTy = dyn_cast<PointerType>(Val->getType());
  assert(PointerTy && "PointerType expected");

  Type *ScalarType = PointerTy->getElementType();
  VectorType *VectorType = VectorType::get(ScalarType, Width);

  return PointerType::getUnqual(VectorType);
}

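// A stride-one access touches consecutive memory cells, so all lanes can be
// loaded at once with a single wide load through a pointer that is bitcast
// to the corresponding vector type. For a stride of -1 the wide load is
// performed at the address of the last lane and the result is reversed with
// a shufflevector.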
Value *VectorBlockGenerator::generateStrideOneLoad(
    ScopStmt &Stmt, LoadInst *Load, VectorValueMapT &ScalarMaps,
    __isl_keep isl_id_to_ast_expr *NewAccesses, bool NegativeStride = false) {
  unsigned VectorWidth = getVectorWidth();
  auto *Pointer = Load->getPointerOperand();
  Type *VectorPtrType = getVectorPtrTy(Pointer, VectorWidth);
  unsigned Offset = NegativeStride ? VectorWidth - 1 : 0;

  Value *NewPointer = generateLocationAccessed(Stmt, Load, ScalarMaps[Offset],
                                               VLTS[Offset], NewAccesses);
  Value *VectorPtr =
      Builder.CreateBitCast(NewPointer, VectorPtrType, "vector_ptr");
  LoadInst *VecLoad =
      Builder.CreateLoad(VectorPtr, Load->getName() + "_p_vec_full");
  if (!Aligned)
    VecLoad->setAlignment(8);

  if (NegativeStride) {
    SmallVector<Constant *, 16> Indices;
    for (int i = VectorWidth - 1; i >= 0; i--)
      Indices.push_back(ConstantInt::get(Builder.getInt32Ty(), i));
    Constant *SV = llvm::ConstantVector::get(Indices);
    Value *RevVecLoad = Builder.CreateShuffleVector(
        VecLoad, VecLoad, SV, Load->getName() + "_reverse");
    return RevVecLoad;
  }

  return VecLoad;
}

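// A stride-zero access reads the same memory cell in every lane, so the cell
// is loaded once through a <1 x Ty> pointer and then splat across all lanes
// with a shufflevector whose mask is all zeros.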
Value *VectorBlockGenerator::generateStrideZeroLoad(
    ScopStmt &Stmt, LoadInst *Load, ValueMapT &BBMap,
    __isl_keep isl_id_to_ast_expr *NewAccesses) {
  auto *Pointer = Load->getPointerOperand();
  Type *VectorPtrType = getVectorPtrTy(Pointer, 1);
  Value *NewPointer =
      generateLocationAccessed(Stmt, Load, BBMap, VLTS[0], NewAccesses);
  Value *VectorPtr = Builder.CreateBitCast(NewPointer, VectorPtrType,
                                           Load->getName() + "_p_vec_p");
  LoadInst *ScalarLoad =
      Builder.CreateLoad(VectorPtr, Load->getName() + "_p_splat_one");

  if (!Aligned)
    ScalarLoad->setAlignment(8);

  Constant *SplatVector = Constant::getNullValue(
      VectorType::get(Builder.getInt32Ty(), getVectorWidth()));

  Value *VectorLoad = Builder.CreateShuffleVector(
      ScalarLoad, ScalarLoad, SplatVector, Load->getName() + "_p_splat");
  return VectorLoad;
}

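// For an access with unknown stride no single wide load is possible; the
// lanes are gathered with one scalar load per lane and combined into a
// vector with insertelement instructions.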
Value *VectorBlockGenerator::generateUnknownStrideLoad(
    ScopStmt &Stmt, LoadInst *Load, VectorValueMapT &ScalarMaps,
    __isl_keep isl_id_to_ast_expr *NewAccesses) {
  int VectorWidth = getVectorWidth();
  auto *Pointer = Load->getPointerOperand();
  VectorType *VectorType = VectorType::get(
      dyn_cast<PointerType>(Pointer->getType())->getElementType(), VectorWidth);

  Value *Vector = UndefValue::get(VectorType);

  for (int i = 0; i < VectorWidth; i++) {
    Value *NewPointer = generateLocationAccessed(Stmt, Load, ScalarMaps[i],
                                                 VLTS[i], NewAccesses);
    Value *ScalarLoad =
        Builder.CreateLoad(NewPointer, Load->getName() + "_p_scalar_");
    Vector = Builder.CreateInsertElement(
        Vector, ScalarLoad, Builder.getInt32(i), Load->getName() + "_p_vec_");
  }

  return Vector;
}

void VectorBlockGenerator::generateLoad(
    ScopStmt &Stmt, LoadInst *Load, ValueMapT &VectorMap,
    VectorValueMapT &ScalarMaps, __isl_keep isl_id_to_ast_expr *NewAccesses) {
  if (Value *PreloadLoad = GlobalMap.lookup(Load)) {
    VectorMap[Load] = Builder.CreateVectorSplat(getVectorWidth(), PreloadLoad,
                                                Load->getName() + "_p");
    return;
  }

  if (!VectorType::isValidElementType(Load->getType())) {
    for (int i = 0; i < getVectorWidth(); i++)
      ScalarMaps[i][Load] =
          generateArrayLoad(Stmt, Load, ScalarMaps[i], VLTS[i], NewAccesses);
    return;
  }

  const MemoryAccess &Access = Stmt.getArrayAccessFor(Load);

  // Make sure we have scalar values available to access the pointer to
  // the data location.
  extractScalarValues(Load, VectorMap, ScalarMaps);

  Value *NewLoad;
  if (Access.isStrideZero(isl_map_copy(Schedule)))
    NewLoad = generateStrideZeroLoad(Stmt, Load, ScalarMaps[0], NewAccesses);
  else if (Access.isStrideOne(isl_map_copy(Schedule)))
    NewLoad = generateStrideOneLoad(Stmt, Load, ScalarMaps, NewAccesses);
  else if (Access.isStrideX(isl_map_copy(Schedule), -1))
    NewLoad = generateStrideOneLoad(Stmt, Load, ScalarMaps, NewAccesses, true);
  else
    NewLoad = generateUnknownStrideLoad(Stmt, Load, ScalarMaps, NewAccesses);

  VectorMap[Load] = NewLoad;
}

void VectorBlockGenerator::copyUnaryInst(ScopStmt &Stmt, UnaryInstruction *Inst,
                                         ValueMapT &VectorMap,
                                         VectorValueMapT &ScalarMaps) {
  int VectorWidth = getVectorWidth();
  Value *NewOperand = getVectorValue(Stmt, Inst->getOperand(0), VectorMap,
                                     ScalarMaps, getLoopForStmt(Stmt));

  assert(isa<CastInst>(Inst) && "Cannot generate vector code for instruction");

  const CastInst *Cast = dyn_cast<CastInst>(Inst);
  VectorType *DestType = VectorType::get(Inst->getType(), VectorWidth);
  VectorMap[Inst] = Builder.CreateCast(Cast->getOpcode(), NewOperand, DestType);
}

void VectorBlockGenerator::copyBinaryInst(ScopStmt &Stmt, BinaryOperator *Inst,
                                          ValueMapT &VectorMap,
                                          VectorValueMapT &ScalarMaps) {
  Loop *L = getLoopForStmt(Stmt);
  Value *OpZero = Inst->getOperand(0);
  Value *OpOne = Inst->getOperand(1);

  Value *NewOpZero, *NewOpOne;
  NewOpZero = getVectorValue(Stmt, OpZero, VectorMap, ScalarMaps, L);
  NewOpOne = getVectorValue(Stmt, OpOne, VectorMap, ScalarMaps, L);

  Value *NewInst = Builder.CreateBinOp(Inst->getOpcode(), NewOpZero, NewOpOne,
                                       Inst->getName() + "p_vec");
  VectorMap[Inst] = NewInst;
}

void VectorBlockGenerator::copyStore(
    ScopStmt &Stmt, StoreInst *Store, ValueMapT &VectorMap,
    VectorValueMapT &ScalarMaps, __isl_keep isl_id_to_ast_expr *NewAccesses) {
  const MemoryAccess &Access = Stmt.getArrayAccessFor(Store);

  auto *Pointer = Store->getPointerOperand();
  Value *Vector = getVectorValue(Stmt, Store->getValueOperand(), VectorMap,
                                 ScalarMaps, getLoopForStmt(Stmt));

  // Make sure we have scalar values available to access the pointer to
  // the data location.
  extractScalarValues(Store, VectorMap, ScalarMaps);

  if (Access.isStrideOne(isl_map_copy(Schedule))) {
    Type *VectorPtrType = getVectorPtrTy(Pointer, getVectorWidth());
    Value *NewPointer = generateLocationAccessed(Stmt, Store, ScalarMaps[0],
                                                 VLTS[0], NewAccesses);

    Value *VectorPtr =
        Builder.CreateBitCast(NewPointer, VectorPtrType, "vector_ptr");
    StoreInst *Store = Builder.CreateStore(Vector, VectorPtr);

    if (!Aligned)
      Store->setAlignment(8);
  } else {
    for (unsigned i = 0; i < ScalarMaps.size(); i++) {
      Value *Scalar = Builder.CreateExtractElement(Vector, Builder.getInt32(i));
      Value *NewPointer = generateLocationAccessed(Stmt, Store, ScalarMaps[i],
                                                   VLTS[i], NewAccesses);
      Builder.CreateStore(Scalar, NewPointer);
    }
  }
}

bool VectorBlockGenerator::hasVectorOperands(const Instruction *Inst,
                                             ValueMapT &VectorMap) {
  for (Value *Operand : Inst->operands())
    if (VectorMap.count(Operand))
      return true;
  return false;
}

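// Make the scalar per-lane values of all vector operands of Inst available
// in the lane-local scalar maps by emitting extractelement instructions.
// Returns true if at least one operand had a vector version.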
bool VectorBlockGenerator::extractScalarValues(const Instruction *Inst,
                                               ValueMapT &VectorMap,
                                               VectorValueMapT &ScalarMaps) {
  bool HasVectorOperand = false;
  int VectorWidth = getVectorWidth();

  for (Value *Operand : Inst->operands()) {
    ValueMapT::iterator VecOp = VectorMap.find(Operand);

    if (VecOp == VectorMap.end())
      continue;

    HasVectorOperand = true;
    Value *NewVector = VecOp->second;

    for (int i = 0; i < VectorWidth; ++i) {
      ValueMapT &SM = ScalarMaps[i];

      // If one scalar element has already been extracted, all of them have
      // been extracted by this code, so there is no need to check for the
      // existence of each one individually.
      if (SM.count(Operand))
        break;

      SM[Operand] =
          Builder.CreateExtractElement(NewVector, Builder.getInt32(i));
    }
  }

  return HasVectorOperand;
}

void VectorBlockGenerator::copyInstScalarized(
    ScopStmt &Stmt, Instruction *Inst, ValueMapT &VectorMap,
    VectorValueMapT &ScalarMaps, __isl_keep isl_id_to_ast_expr *NewAccesses) {
  bool HasVectorOperand;
  int VectorWidth = getVectorWidth();

  HasVectorOperand = extractScalarValues(Inst, VectorMap, ScalarMaps);

  for (int VectorLane = 0; VectorLane < getVectorWidth(); VectorLane++)
    BlockGenerator::copyInstruction(Stmt, Inst, ScalarMaps[VectorLane],
                                    VLTS[VectorLane], NewAccesses);

  if (!VectorType::isValidElementType(Inst->getType()) || !HasVectorOperand)
    return;

  // Make the result available as vector value.
  VectorType *VectorType = VectorType::get(Inst->getType(), VectorWidth);
  Value *Vector = UndefValue::get(VectorType);

  for (int i = 0; i < VectorWidth; i++)
    Vector = Builder.CreateInsertElement(Vector, ScalarMaps[i][Inst],
                                         Builder.getInt32(i));

  VectorMap[Inst] = Vector;
}

int VectorBlockGenerator::getVectorWidth() { return VLTS.size(); }

void VectorBlockGenerator::copyInstruction(
    ScopStmt &Stmt, Instruction *Inst, ValueMapT &VectorMap,
    VectorValueMapT &ScalarMaps, __isl_keep isl_id_to_ast_expr *NewAccesses) {
  // Terminator instructions direct the control flow. They are explicitly
  // expressed in the generated isl AST and do not need to be copied.
  if (Inst->isTerminator())
    return;

  if (canSyntheziseInStmt(Stmt, Inst))
    return;

  if (auto *Load = dyn_cast<LoadInst>(Inst)) {
    generateLoad(Stmt, Load, VectorMap, ScalarMaps, NewAccesses);
    return;
  }

  if (hasVectorOperands(Inst, VectorMap)) {
    if (auto *Store = dyn_cast<StoreInst>(Inst)) {
      // Identified as redundant by -polly-simplify.
      if (!Stmt.getArrayAccessOrNULLFor(Store))
        return;

      copyStore(Stmt, Store, VectorMap, ScalarMaps, NewAccesses);
      return;
    }

    if (auto *Unary = dyn_cast<UnaryInstruction>(Inst)) {
      copyUnaryInst(Stmt, Unary, VectorMap, ScalarMaps);
      return;
    }

    if (auto *Binary = dyn_cast<BinaryOperator>(Inst)) {
      copyBinaryInst(Stmt, Binary, VectorMap, ScalarMaps);
      return;
    }

    // Fallthrough: we generate scalar instructions if we do not know how to
    // generate vector code.
  }

  copyInstScalarized(Stmt, Inst, VectorMap, ScalarMaps, NewAccesses);
}

void VectorBlockGenerator::generateScalarVectorLoads(
    ScopStmt &Stmt, ValueMapT &VectorBlockMap) {
  for (MemoryAccess *MA : Stmt) {
    if (MA->isArrayKind() || MA->isWrite())
      continue;

    auto *Address = getOrCreateAlloca(*MA);
    Type *VectorPtrType = getVectorPtrTy(Address, 1);
    Value *VectorPtr = Builder.CreateBitCast(Address, VectorPtrType,
                                             Address->getName() + "_p_vec_p");
    auto *Val = Builder.CreateLoad(VectorPtr, Address->getName() + ".reload");
    Constant *SplatVector = Constant::getNullValue(
        VectorType::get(Builder.getInt32Ty(), getVectorWidth()));

    Value *VectorVal = Builder.CreateShuffleVector(
        Val, Val, SplatVector, Address->getName() + "_p_splat");
    VectorBlockMap[MA->getAccessValue()] = VectorVal;
  }
}

void VectorBlockGenerator::verifyNoScalarStores(ScopStmt &Stmt) {
  for (MemoryAccess *MA : Stmt) {
    if (MA->isArrayKind() || MA->isRead())
      continue;

    llvm_unreachable("Scalar stores not expected in vector loop");
  }
}

void VectorBlockGenerator::copyStmt(
    ScopStmt &Stmt, __isl_keep isl_id_to_ast_expr *NewAccesses) {
  assert(Stmt.isBlockStmt() &&
         "TODO: Only block statements can be copied by the vector block "
         "generator");

  BasicBlock *BB = Stmt.getBasicBlock();
  BasicBlock *CopyBB = SplitBlock(Builder.GetInsertBlock(),
                                  &*Builder.GetInsertPoint(), &DT, &LI);
  CopyBB->setName("polly.stmt." + BB->getName());
  Builder.SetInsertPoint(&CopyBB->front());

  // Create two maps that store the mapping from the original instructions of
  // the old basic block to their copies in the new basic block. Those maps
  // are basic block local.
  //
  // As vector code generation is supported there is one map for scalar values
  // and one for vector values.
  //
  // In case we just do scalar code generation, the vectorMap is not used and
  // the scalarMap has just one dimension, which contains the mapping.
  //
  // In case vector code generation is done, an instruction may either appear
  // once in the vector map (as it is calculating <vectorwidth> values at a
  // time) or, if the values are calculated using scalar operations, once in
  // every dimension of the scalarMap.
  VectorValueMapT ScalarBlockMap(getVectorWidth());
  ValueMapT VectorBlockMap;

  generateScalarVectorLoads(Stmt, VectorBlockMap);

  for (Instruction &Inst : *BB)
    copyInstruction(Stmt, &Inst, VectorBlockMap, ScalarBlockMap, NewAccesses);

  verifyNoScalarStores(Stmt);
}

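// Set the immediate dominator of a copied block to the copy of the original
// block's immediate dominator, if that copy has already been generated, and
// return it. Since blocks are visited breadth-first from the region entry,
// the dominator's copy is normally available by the time its dominatees are
// copied.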
BasicBlock *RegionGenerator::repairDominance(BasicBlock *BB,
                                             BasicBlock *BBCopy) {

  BasicBlock *BBIDom = DT.getNode(BB)->getIDom()->getBlock();
  BasicBlock *BBCopyIDom = BlockMap.lookup(BBIDom);

  if (BBCopyIDom)
    DT.changeImmediateDominator(BBCopy, BBCopyIDom);

  return BBCopyIDom;
}

// This is to determine whether an llvm::Value (defined in @p BB) is usable
// when leaving a subregion. The straightforward DT.dominates(BB,
// R->getExitBlock()) does not work in cases where the exit block has edges
// from outside the region. In that case the llvm::Value would never be usable
// in the exit block. The RegionGenerator however creates a new exit block
// ('ExitBBCopy') for the subregion's exiting edges only. We need to determine
// whether an llvm::Value is usable in there. We do this by checking whether
// it dominates all exiting blocks individually.
static bool isDominatingSubregionExit(const DominatorTree &DT, Region *R,
                                      BasicBlock *BB) {
  for (auto ExitingBB : predecessors(R->getExit())) {
    // Check for non-subregion incoming edges.
    if (!R->contains(ExitingBB))
      continue;

    if (!DT.dominates(BB, ExitingBB))
      return false;
  }

  return true;
}

// Find the direct dominator of the subregion's exit block if the subregion was
// simplified.
static BasicBlock *findExitDominator(DominatorTree &DT, Region *R) {
  BasicBlock *Common = nullptr;
  for (auto ExitingBB : predecessors(R->getExit())) {
    // Check for non-subregion incoming edges.
    if (!R->contains(ExitingBB))
      continue;

    // First exiting edge.
    if (!Common) {
      Common = ExitingBB;
      continue;
    }

    Common = DT.findNearestCommonDominator(Common, ExitingBB);
  }

  assert(Common && R->contains(Common));
  return Common;
}

void RegionGenerator::copyStmt(ScopStmt &Stmt, LoopToScevMapT &LTS,
                               isl_id_to_ast_expr *IdToAstExp) {
  assert(Stmt.isRegionStmt() &&
         "Only region statements can be copied by the region generator");

  // Forget all old mappings.
  BlockMap.clear();
  RegionMaps.clear();
  IncompletePHINodeMap.clear();

  // Collection of all values related to this subregion.
  ValueMapT ValueMap;

  // The region represented by the statement.
  Region *R = Stmt.getRegion();

  // Create a dedicated entry for the region where we can reload all demoted
  // inputs.
  BasicBlock *EntryBB = R->getEntry();
  BasicBlock *EntryBBCopy = SplitBlock(Builder.GetInsertBlock(),
                                       &*Builder.GetInsertPoint(), &DT, &LI);
  EntryBBCopy->setName("polly.stmt." + EntryBB->getName() + ".entry");
  Builder.SetInsertPoint(&EntryBBCopy->front());

  ValueMapT &EntryBBMap = RegionMaps[EntryBBCopy];
  generateScalarLoads(Stmt, LTS, EntryBBMap, IdToAstExp);

  for (auto PI = pred_begin(EntryBB), PE = pred_end(EntryBB); PI != PE; ++PI)
    if (!R->contains(*PI))
      BlockMap[*PI] = EntryBBCopy;

  // Iterate over all blocks in the region in a breadth-first search.
  std::deque<BasicBlock *> Blocks;
  SmallSetVector<BasicBlock *, 8> SeenBlocks;
  Blocks.push_back(EntryBB);
  SeenBlocks.insert(EntryBB);

  while (!Blocks.empty()) {
    BasicBlock *BB = Blocks.front();
    Blocks.pop_front();

    // First split the block and update dominance information.
    BasicBlock *BBCopy = splitBB(BB);
    BasicBlock *BBCopyIDom = repairDominance(BB, BBCopy);

    // Get the mapping for this block and initialize it with either the scalar
    // loads from the generated entering block (which dominates all blocks of
    // this subregion) or the maps of the immediate dominator, if part of the
    // subregion. The latter necessarily includes the former.
    ValueMapT *InitBBMap;
    if (BBCopyIDom) {
      assert(RegionMaps.count(BBCopyIDom));
      InitBBMap = &RegionMaps[BBCopyIDom];
    } else
      InitBBMap = &EntryBBMap;
    auto Inserted = RegionMaps.insert(std::make_pair(BBCopy, *InitBBMap));
    ValueMapT &RegionMap = Inserted.first->second;

    // Copy the block with the BlockGenerator.
    Builder.SetInsertPoint(&BBCopy->front());
    copyBB(Stmt, BB, BBCopy, RegionMap, LTS, IdToAstExp);

    // In order to remap PHI nodes we also store basic block mappings.
    BlockMap[BB] = BBCopy;

    // Add values to incomplete PHI nodes waiting for this block to be copied.
    for (const PHINodePairTy &PHINodePair : IncompletePHINodeMap[BB])
      addOperandToPHI(Stmt, PHINodePair.first, PHINodePair.second, BB, LTS);
    IncompletePHINodeMap[BB].clear();

    // And continue with new successors inside the region.
    for (auto SI = succ_begin(BB), SE = succ_end(BB); SI != SE; SI++)
      if (R->contains(*SI) && SeenBlocks.insert(*SI))
        Blocks.push_back(*SI);

    // Remember value in case it is visible after this subregion.
    if (isDominatingSubregionExit(DT, R, BB))
      ValueMap.insert(RegionMap.begin(), RegionMap.end());
  }

  // Now create a new dedicated region exit block and add it to the region map.
  BasicBlock *ExitBBCopy = SplitBlock(Builder.GetInsertBlock(),
                                      &*Builder.GetInsertPoint(), &DT, &LI);
  ExitBBCopy->setName("polly.stmt." + R->getExit()->getName() + ".exit");
  BlockMap[R->getExit()] = ExitBBCopy;

  BasicBlock *ExitDomBBCopy = BlockMap.lookup(findExitDominator(DT, R));
  assert(ExitDomBBCopy &&
         "Common exit dominator must be within region; at least the entry node "
         "must match");
  DT.changeImmediateDominator(ExitBBCopy, ExitDomBBCopy);

  // As the block generator doesn't handle control flow we need to add the
  // region control flow by hand after all blocks have been copied.
  for (BasicBlock *BB : SeenBlocks) {

    BasicBlock *BBCopy = BlockMap[BB];
    TerminatorInst *TI = BB->getTerminator();
    if (isa<UnreachableInst>(TI)) {
      while (!BBCopy->empty())
        BBCopy->begin()->eraseFromParent();
      new UnreachableInst(BBCopy->getContext(), BBCopy);
      continue;
    }

    Instruction *BICopy = BBCopy->getTerminator();

    ValueMapT &RegionMap = RegionMaps[BBCopy];
    RegionMap.insert(BlockMap.begin(), BlockMap.end());

    Builder.SetInsertPoint(BICopy);
    copyInstScalar(Stmt, TI, RegionMap, LTS);
    BICopy->eraseFromParent();
  }

  // Add counting PHI nodes to all loops in the region. They can be used as
  // replacements for SCEVs referring to the old loop.
  for (BasicBlock *BB : SeenBlocks) {
    Loop *L = LI.getLoopFor(BB);
    if (L == nullptr || L->getHeader() != BB || !R->contains(L))
      continue;

    BasicBlock *BBCopy = BlockMap[BB];
    Value *NullVal = Builder.getInt32(0);
    PHINode *LoopPHI =
        PHINode::Create(Builder.getInt32Ty(), 2, "polly.subregion.iv");
    Instruction *LoopPHIInc = BinaryOperator::CreateAdd(
        LoopPHI, Builder.getInt32(1), "polly.subregion.iv.inc");
    LoopPHI->insertBefore(&BBCopy->front());
    LoopPHIInc->insertBefore(BBCopy->getTerminator());

    for (auto *PredBB : make_range(pred_begin(BB), pred_end(BB))) {
      if (!R->contains(PredBB))
        continue;
      if (L->contains(PredBB))
        LoopPHI->addIncoming(LoopPHIInc, BlockMap[PredBB]);
      else
        LoopPHI->addIncoming(NullVal, BlockMap[PredBB]);
    }

    for (auto *PredBBCopy : make_range(pred_begin(BBCopy), pred_end(BBCopy)))
      if (LoopPHI->getBasicBlockIndex(PredBBCopy) < 0)
        LoopPHI->addIncoming(NullVal, PredBBCopy);

    LTS[L] = SE.getUnknown(LoopPHI);
  }

  // Continue generating code in the exit block.
  Builder.SetInsertPoint(&*ExitBBCopy->getFirstInsertionPt());

  // Write values visible to other statements.
  generateScalarStores(Stmt, LTS, ValueMap, IdToAstExp);
  BlockMap.clear();
  RegionMaps.clear();
  IncompletePHINodeMap.clear();
}

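// Rebuild a PHI node for a value leaving the subregion in the generated exit
// block, wiring each incoming value through the value map of the copied
// incoming block.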
PHINode *RegionGenerator::buildExitPHI(MemoryAccess *MA, LoopToScevMapT &LTS,
                                       ValueMapT &BBMap, Loop *L) {
  ScopStmt *Stmt = MA->getStatement();
  Region *SubR = Stmt->getRegion();
  auto Incoming = MA->getIncoming();

  PollyIRBuilder::InsertPointGuard IPGuard(Builder);
  PHINode *OrigPHI = cast<PHINode>(MA->getAccessInstruction());
  BasicBlock *NewSubregionExit = Builder.GetInsertBlock();

  // This can happen if the subregion is simplified after the ScopStmts
  // have been created; simplification happens as part of CodeGeneration.
  if (OrigPHI->getParent() != SubR->getExit()) {
    BasicBlock *FormerExit = SubR->getExitingBlock();
    if (FormerExit)
      NewSubregionExit = BlockMap.lookup(FormerExit);
  }

  PHINode *NewPHI = PHINode::Create(OrigPHI->getType(), Incoming.size(),
                                    "polly." + OrigPHI->getName(),
                                    NewSubregionExit->getFirstNonPHI());

  // Add the incoming values to the PHI.
  for (auto &Pair : Incoming) {
    BasicBlock *OrigIncomingBlock = Pair.first;
    BasicBlock *NewIncomingBlock = BlockMap.lookup(OrigIncomingBlock);
    Builder.SetInsertPoint(NewIncomingBlock->getTerminator());
    assert(RegionMaps.count(NewIncomingBlock));
    ValueMapT *LocalBBMap = &RegionMaps[NewIncomingBlock];

    Value *OrigIncomingValue = Pair.second;
    Value *NewIncomingValue =
        getNewValue(*Stmt, OrigIncomingValue, *LocalBBMap, LTS, L);
    NewPHI->addIncoming(NewIncomingValue, NewIncomingBlock);
  }

  return NewPHI;
}

Value *RegionGenerator::getExitScalar(MemoryAccess *MA, LoopToScevMapT &LTS,
                                      ValueMapT &BBMap) {
  ScopStmt *Stmt = MA->getStatement();

  // TODO: Add some test cases that ensure this is really the right choice.
  Loop *L = LI.getLoopFor(Stmt->getRegion()->getExit());

  if (MA->isAnyPHIKind()) {
    auto Incoming = MA->getIncoming();
    assert(!Incoming.empty() &&
           "PHI WRITEs must originate from at least one incoming block");

    // If there is only one incoming value, we do not need to create a PHI.
    if (Incoming.size() == 1) {
      Value *OldVal = Incoming[0].second;
      return getNewValue(*Stmt, OldVal, BBMap, LTS, L);
    }

    return buildExitPHI(MA, LTS, BBMap, L);
  }

  // MemoryKind::Value accesses leaving the subregion must dominate the exit
  // block; just pass the copied value.
  Value *OldVal = MA->getAccessValue();
  return getNewValue(*Stmt, OldVal, BBMap, LTS, L);
}

void RegionGenerator::generateScalarStores(
    ScopStmt &Stmt, LoopToScevMapT &LTS, ValueMapT &BBMap,
    __isl_keep isl_id_to_ast_expr *NewAccesses) {
  assert(Stmt.getRegion() &&
         "Block statements need to use the generateScalarStores() "
         "function in the BlockGenerator");

  for (MemoryAccess *MA : Stmt) {
    if (MA->isOriginalArrayKind() || MA->isRead())
      continue;

    Value *NewVal = getExitScalar(MA, LTS, BBMap);
    Value *Address =
        getImplicitAddress(*MA, getLoopForStmt(Stmt), LTS, BBMap, NewAccesses);
    assert((!isa<Instruction>(NewVal) ||
            DT.dominates(cast<Instruction>(NewVal)->getParent(),
                         Builder.GetInsertBlock())) &&
           "Domination violation");
    assert((!isa<Instruction>(Address) ||
            DT.dominates(cast<Instruction>(Address)->getParent(),
                         Builder.GetInsertBlock())) &&
           "Domination violation");
    Builder.CreateStore(NewVal, Address);
  }
}

void RegionGenerator::addOperandToPHI(ScopStmt &Stmt, PHINode *PHI,
                                      PHINode *PHICopy, BasicBlock *IncomingBB,
                                      LoopToScevMapT &LTS) {
  // If the incoming block has not yet been copied, mark this PHI as
  // incomplete. Once the block is copied, the incoming value will be added.
  BasicBlock *BBCopy = BlockMap[IncomingBB];
  if (!BBCopy) {
    assert(Stmt.contains(IncomingBB) &&
           "Bad incoming block for PHI in non-affine region");
    IncompletePHINodeMap[IncomingBB].push_back(std::make_pair(PHI, PHICopy));
    return;
  }

  assert(RegionMaps.count(BBCopy) && "Incoming PHI block did not have a BBMap");
  ValueMapT &BBCopyMap = RegionMaps[BBCopy];

  Value *OpCopy = nullptr;

  if (Stmt.contains(IncomingBB)) {
    Value *Op = PHI->getIncomingValueForBlock(IncomingBB);

    // If the current insert block is different from the PHI's incoming block,
    // change it; otherwise do not.
    auto IP = Builder.GetInsertPoint();
    if (IP->getParent() != BBCopy)
      Builder.SetInsertPoint(BBCopy->getTerminator());
    OpCopy = getNewValue(Stmt, Op, BBCopyMap, LTS, getLoopForStmt(Stmt));
    if (IP->getParent() != BBCopy)
      Builder.SetInsertPoint(&*IP);
  } else {
    // All edges from outside the non-affine region become a single edge
    // in the new copy of the non-affine region. Make sure to only add the
    // corresponding edge the first time we encounter a basic block from
    // outside the non-affine region.
    if (PHICopy->getBasicBlockIndex(BBCopy) >= 0)
      return;

    // Get the reloaded value.
    OpCopy = getNewValue(Stmt, PHI, BBCopyMap, LTS, getLoopForStmt(Stmt));
  }

  assert(OpCopy && "Incoming PHI value was not copied properly");
  assert(BBCopy && "Incoming PHI block was not copied properly");
  PHICopy->addIncoming(OpCopy, BBCopy);
}

void RegionGenerator::copyPHIInstruction(ScopStmt &Stmt, PHINode *PHI,
                                         ValueMapT &BBMap,
                                         LoopToScevMapT &LTS) {
  unsigned NumIncoming = PHI->getNumIncomingValues();
  PHINode *PHICopy =
      Builder.CreatePHI(PHI->getType(), NumIncoming, "polly." + PHI->getName());
  PHICopy->moveBefore(PHICopy->getParent()->getFirstNonPHI());
  BBMap[PHI] = PHICopy;

  for (BasicBlock *IncomingBB : PHI->blocks())
    addOperandToPHI(Stmt, PHI, PHICopy, IncomingBB, LTS);
}