1 //===- VPlan.cpp - Vectorizer Plan ----------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 ///
9 /// \file
10 /// This is the LLVM vectorization plan. It represents a candidate for
11 /// vectorization, allowing to plan and optimize how to vectorize a given loop
12 /// before generating LLVM-IR.
13 /// The vectorizer uses vectorization plans to estimate the costs of potential
14 /// candidates and if profitable to execute the desired plan, generating vector
15 /// LLVM-IR code.
16 ///
17 //===----------------------------------------------------------------------===//
18 
19 #include "VPlan.h"
20 #include "VPlanDominatorTree.h"
21 #include "llvm/ADT/DepthFirstIterator.h"
22 #include "llvm/ADT/PostOrderIterator.h"
23 #include "llvm/ADT/STLExtras.h"
24 #include "llvm/ADT/SmallVector.h"
25 #include "llvm/ADT/Twine.h"
26 #include "llvm/Analysis/LoopInfo.h"
27 #include "llvm/IR/BasicBlock.h"
28 #include "llvm/IR/CFG.h"
29 #include "llvm/IR/IRBuilder.h"
30 #include "llvm/IR/Instruction.h"
31 #include "llvm/IR/Instructions.h"
32 #include "llvm/IR/Type.h"
33 #include "llvm/IR/Value.h"
34 #include "llvm/Support/Casting.h"
35 #include "llvm/Support/CommandLine.h"
36 #include "llvm/Support/Debug.h"
37 #include "llvm/Support/GenericDomTreeConstruction.h"
38 #include "llvm/Support/GraphWriter.h"
39 #include "llvm/Support/raw_ostream.h"
40 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
41 #include "llvm/Transforms/Utils/LoopVersioning.h"
42 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
43 #include <cassert>
44 #include <string>
45 #include <vector>
46 
47 using namespace llvm;
48 extern cl::opt<bool> EnableVPlanNativePath;
49 
50 #define DEBUG_TYPE "vplan"
51 
52 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
53 raw_ostream &llvm::operator<<(raw_ostream &OS, const VPValue &V) {
54   const VPInstruction *Instr = dyn_cast<VPInstruction>(&V);
55   VPSlotTracker SlotTracker(
56       (Instr && Instr->getParent()) ? Instr->getParent()->getPlan() : nullptr);
57   V.print(OS, SlotTracker);
58   return OS;
59 }
60 #endif
61 
62 Value *VPLane::getAsRuntimeExpr(IRBuilderBase &Builder,
63                                 const ElementCount &VF) const {
64   switch (LaneKind) {
65   case VPLane::Kind::ScalableLast:
66     // Lane = RuntimeVF - VF.getKnownMinValue() + Lane
67     return Builder.CreateSub(getRuntimeVF(Builder, Builder.getInt32Ty(), VF),
68                              Builder.getInt32(VF.getKnownMinValue() - Lane));
69   case VPLane::Kind::First:
70     return Builder.getInt32(Lane);
71   }
72   llvm_unreachable("Unknown lane kind");
73 }
74 
75 VPValue::VPValue(const unsigned char SC, Value *UV, VPDef *Def)
76     : SubclassID(SC), UnderlyingVal(UV), Def(Def) {
77   if (Def)
78     Def->addDefinedValue(this);
79 }
80 
81 VPValue::~VPValue() {
82   assert(Users.empty() && "trying to delete a VPValue with remaining users");
83   if (Def)
84     Def->removeDefinedValue(this);
85 }
86 
87 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
88 void VPValue::print(raw_ostream &OS, VPSlotTracker &SlotTracker) const {
89   if (const VPRecipeBase *R = dyn_cast_or_null<VPRecipeBase>(Def))
90     R->print(OS, "", SlotTracker);
91   else
92     printAsOperand(OS, SlotTracker);
93 }
94 
95 void VPValue::dump() const {
96   const VPRecipeBase *Instr = dyn_cast_or_null<VPRecipeBase>(this->Def);
97   VPSlotTracker SlotTracker(
98       (Instr && Instr->getParent()) ? Instr->getParent()->getPlan() : nullptr);
99   print(dbgs(), SlotTracker);
100   dbgs() << "\n";
101 }
102 
103 void VPDef::dump() const {
104   const VPRecipeBase *Instr = dyn_cast_or_null<VPRecipeBase>(this);
105   VPSlotTracker SlotTracker(
106       (Instr && Instr->getParent()) ? Instr->getParent()->getPlan() : nullptr);
107   print(dbgs(), "", SlotTracker);
108   dbgs() << "\n";
109 }
110 #endif
111 
112 // Get the top-most entry block of \p Start. This is the entry block of the
113 // containing VPlan. This function is templated to support both const and non-const blocks
114 template <typename T> static T *getPlanEntry(T *Start) {
115   T *Next = Start;
116   T *Current = Start;
117   while ((Next = Next->getParent()))
118     Current = Next;
119 
120   SmallSetVector<T *, 8> WorkList;
121   WorkList.insert(Current);
122 
123   for (unsigned i = 0; i < WorkList.size(); i++) {
124     T *Current = WorkList[i];
125     if (Current->getNumPredecessors() == 0)
126       return Current;
127     auto &Predecessors = Current->getPredecessors();
128     WorkList.insert(Predecessors.begin(), Predecessors.end());
129   }
130 
131   llvm_unreachable("VPlan without any entry node without predecessors");
132 }
133 
134 VPlan *VPBlockBase::getPlan() { return getPlanEntry(this)->Plan; }
135 
136 const VPlan *VPBlockBase::getPlan() const { return getPlanEntry(this)->Plan; }
137 
138 /// \return the VPBasicBlock that is the entry of Block, possibly indirectly.
139 const VPBasicBlock *VPBlockBase::getEntryBasicBlock() const {
140   const VPBlockBase *Block = this;
141   while (const VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block))
142     Block = Region->getEntry();
143   return cast<VPBasicBlock>(Block);
144 }
145 
146 VPBasicBlock *VPBlockBase::getEntryBasicBlock() {
147   VPBlockBase *Block = this;
148   while (VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block))
149     Block = Region->getEntry();
150   return cast<VPBasicBlock>(Block);
151 }
152 
153 void VPBlockBase::setPlan(VPlan *ParentPlan) {
154   assert(ParentPlan->getEntry() == this &&
155          "Can only set plan on its entry block.");
156   Plan = ParentPlan;
157 }
158 
159 /// \return the VPBasicBlock that is the exit of Block, possibly indirectly.
160 const VPBasicBlock *VPBlockBase::getExitingBasicBlock() const {
161   const VPBlockBase *Block = this;
162   while (const VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block))
163     Block = Region->getExiting();
164   return cast<VPBasicBlock>(Block);
165 }
166 
167 VPBasicBlock *VPBlockBase::getExitingBasicBlock() {
168   VPBlockBase *Block = this;
169   while (VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block))
170     Block = Region->getExiting();
171   return cast<VPBasicBlock>(Block);
172 }
173 
174 VPBlockBase *VPBlockBase::getEnclosingBlockWithSuccessors() {
175   if (!Successors.empty() || !Parent)
176     return this;
177   assert(Parent->getExiting() == this &&
178          "Block w/o successors not the exiting block of its parent.");
179   return Parent->getEnclosingBlockWithSuccessors();
180 }
181 
182 VPBlockBase *VPBlockBase::getEnclosingBlockWithPredecessors() {
183   if (!Predecessors.empty() || !Parent)
184     return this;
185   assert(Parent->getEntry() == this &&
186          "Block w/o predecessors not the entry of its parent.");
187   return Parent->getEnclosingBlockWithPredecessors();
188 }
189 
190 void VPBlockBase::deleteCFG(VPBlockBase *Entry) {
191   SmallVector<VPBlockBase *, 8> Blocks(depth_first(Entry));
192 
193   for (VPBlockBase *Block : Blocks)
194     delete Block;
195 }
196 
197 VPBasicBlock::iterator VPBasicBlock::getFirstNonPhi() {
198   iterator It = begin();
199   while (It != end() && It->isPhi())
200     It++;
201   return It;
202 }
203 
204 Value *VPTransformState::get(VPValue *Def, const VPIteration &Instance) {
205   if (!Def->getDef())
206     return Def->getLiveInIRValue();
207 
208   if (hasScalarValue(Def, Instance)) {
209     return Data
210         .PerPartScalars[Def][Instance.Part][Instance.Lane.mapToCacheIndex(VF)];
211   }
212 
213   assert(hasVectorValue(Def, Instance.Part));
214   auto *VecPart = Data.PerPartOutput[Def][Instance.Part];
215   if (!VecPart->getType()->isVectorTy()) {
216     assert(Instance.Lane.isFirstLane() && "cannot get lane > 0 for scalar");
217     return VecPart;
218   }
219   // TODO: Cache created scalar values.
220   Value *Lane = Instance.Lane.getAsRuntimeExpr(Builder, VF);
221   auto *Extract = Builder.CreateExtractElement(VecPart, Lane);
222   // set(Def, Extract, Instance);
223   return Extract;
224 }
225 BasicBlock *VPTransformState::CFGState::getPreheaderBBFor(VPRecipeBase *R) {
226   VPRegionBlock *LoopRegion = R->getParent()->getEnclosingLoopRegion();
227   return VPBB2IRBB[LoopRegion->getPreheaderVPBB()];
228 }
229 
230 void VPTransformState::addNewMetadata(Instruction *To,
231                                       const Instruction *Orig) {
232   // If the loop was versioned with memchecks, add the corresponding no-alias
233   // metadata.
234   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
235     LVer->annotateInstWithNoAlias(To, Orig);
236 }
237 
238 void VPTransformState::addMetadata(Instruction *To, Instruction *From) {
239   propagateMetadata(To, From);
240   addNewMetadata(To, From);
241 }
242 
243 void VPTransformState::addMetadata(ArrayRef<Value *> To, Instruction *From) {
244   for (Value *V : To) {
245     if (Instruction *I = dyn_cast<Instruction>(V))
246       addMetadata(I, From);
247   }
248 }
249 
250 void VPTransformState::setDebugLocFromInst(const Value *V) {
251   const Instruction *Inst = dyn_cast<Instruction>(V);
252   if (!Inst) {
253     Builder.SetCurrentDebugLocation(DebugLoc());
254     return;
255   }
256 
257   const DILocation *DIL = Inst->getDebugLoc();
258   // When a FSDiscriminator is enabled, we don't need to add the multiply
259   // factors to the discriminators.
260   if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
261       !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) {
262     // FIXME: For scalable vectors, assume vscale=1.
263     auto NewDIL =
264         DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
265     if (NewDIL)
266       Builder.SetCurrentDebugLocation(*NewDIL);
267     else
268       LLVM_DEBUG(dbgs() << "Failed to create new discriminator: "
269                         << DIL->getFilename() << " Line: " << DIL->getLine());
270   } else
271     Builder.SetCurrentDebugLocation(DIL);
272 }
273 
274 BasicBlock *
275 VPBasicBlock::createEmptyBasicBlock(VPTransformState::CFGState &CFG) {
276   // BB stands for IR BasicBlocks. VPBB stands for VPlan VPBasicBlocks.
277   // Pred stands for Predessor. Prev stands for Previous - last visited/created.
278   BasicBlock *PrevBB = CFG.PrevBB;
279   BasicBlock *NewBB = BasicBlock::Create(PrevBB->getContext(), getName(),
280                                          PrevBB->getParent(), CFG.ExitBB);
281   LLVM_DEBUG(dbgs() << "LV: created " << NewBB->getName() << '\n');
282 
283   // Hook up the new basic block to its predecessors.
284   for (VPBlockBase *PredVPBlock : getHierarchicalPredecessors()) {
285     VPBasicBlock *PredVPBB = PredVPBlock->getExitingBasicBlock();
286     auto &PredVPSuccessors = PredVPBB->getHierarchicalSuccessors();
287     BasicBlock *PredBB = CFG.VPBB2IRBB[PredVPBB];
288 
289     assert(PredBB && "Predecessor basic-block not found building successor.");
290     auto *PredBBTerminator = PredBB->getTerminator();
291     LLVM_DEBUG(dbgs() << "LV: draw edge from" << PredBB->getName() << '\n');
292 
293     auto *TermBr = dyn_cast<BranchInst>(PredBBTerminator);
294     if (isa<UnreachableInst>(PredBBTerminator)) {
295       assert(PredVPSuccessors.size() == 1 &&
296              "Predecessor ending w/o branch must have single successor.");
297       DebugLoc DL = PredBBTerminator->getDebugLoc();
298       PredBBTerminator->eraseFromParent();
299       auto *Br = BranchInst::Create(NewBB, PredBB);
300       Br->setDebugLoc(DL);
301     } else if (TermBr && !TermBr->isConditional()) {
302       TermBr->setSuccessor(0, NewBB);
303     } else {
304       // Set each forward successor here when it is created, excluding
305       // backedges. A backward successor is set when the branch is created.
306       unsigned idx = PredVPSuccessors.front() == this ? 0 : 1;
307       assert(!TermBr->getSuccessor(idx) &&
308              "Trying to reset an existing successor block.");
309       TermBr->setSuccessor(idx, NewBB);
310     }
311   }
312   return NewBB;
313 }
314 
315 void VPBasicBlock::execute(VPTransformState *State) {
316   bool Replica = State->Instance && !State->Instance->isFirstIteration();
317   VPBasicBlock *PrevVPBB = State->CFG.PrevVPBB;
318   VPBlockBase *SingleHPred = nullptr;
319   BasicBlock *NewBB = State->CFG.PrevBB; // Reuse it if possible.
320 
321   auto IsLoopRegion = [](VPBlockBase *BB) {
322     auto *R = dyn_cast<VPRegionBlock>(BB);
323     return R && !R->isReplicator();
324   };
325 
326   // 1. Create an IR basic block, or reuse the last one or ExitBB if possible.
327   if (getPlan()->getVectorLoopRegion()->getSingleSuccessor() == this) {
328     // ExitBB can be re-used for the exit block of the Plan.
329     NewBB = State->CFG.ExitBB;
330     State->CFG.PrevBB = NewBB;
331 
332     // Update the branch instruction in the predecessor to branch to ExitBB.
333     VPBlockBase *PredVPB = getSingleHierarchicalPredecessor();
334     VPBasicBlock *ExitingVPBB = PredVPB->getExitingBasicBlock();
335     assert(PredVPB->getSingleSuccessor() == this &&
336            "predecessor must have the current block as only successor");
337     BasicBlock *ExitingBB = State->CFG.VPBB2IRBB[ExitingVPBB];
338     // The Exit block of a loop is always set to be successor 0 of the Exiting
339     // block.
340     cast<BranchInst>(ExitingBB->getTerminator())->setSuccessor(0, NewBB);
341   } else if (PrevVPBB && /* A */
342              !((SingleHPred = getSingleHierarchicalPredecessor()) &&
343                SingleHPred->getExitingBasicBlock() == PrevVPBB &&
344                PrevVPBB->getSingleHierarchicalSuccessor() &&
345                (SingleHPred->getParent() == getEnclosingLoopRegion() &&
346                 !IsLoopRegion(SingleHPred))) &&         /* B */
347              !(Replica && getPredecessors().empty())) { /* C */
348     // The last IR basic block is reused, as an optimization, in three cases:
349     // A. the first VPBB reuses the loop pre-header BB - when PrevVPBB is null;
350     // B. when the current VPBB has a single (hierarchical) predecessor which
351     //    is PrevVPBB and the latter has a single (hierarchical) successor which
352     //    both are in the same non-replicator region; and
353     // C. when the current VPBB is an entry of a region replica - where PrevVPBB
354     //    is the exiting VPBB of this region from a previous instance, or the
355     //    predecessor of this region.
356 
357     NewBB = createEmptyBasicBlock(State->CFG);
358     State->Builder.SetInsertPoint(NewBB);
359     // Temporarily terminate with unreachable until CFG is rewired.
360     UnreachableInst *Terminator = State->Builder.CreateUnreachable();
361     // Register NewBB in its loop. In innermost loops its the same for all
362     // BB's.
363     if (State->CurrentVectorLoop)
364       State->CurrentVectorLoop->addBasicBlockToLoop(NewBB, *State->LI);
365     State->Builder.SetInsertPoint(Terminator);
366     State->CFG.PrevBB = NewBB;
367   }
368 
369   // 2. Fill the IR basic block with IR instructions.
370   LLVM_DEBUG(dbgs() << "LV: vectorizing VPBB:" << getName()
371                     << " in BB:" << NewBB->getName() << '\n');
372 
373   State->CFG.VPBB2IRBB[this] = NewBB;
374   State->CFG.PrevVPBB = this;
375 
376   for (VPRecipeBase &Recipe : Recipes)
377     Recipe.execute(*State);
378 
379   LLVM_DEBUG(dbgs() << "LV: filled BB:" << *NewBB);
380 }
381 
382 void VPBasicBlock::dropAllReferences(VPValue *NewValue) {
383   for (VPRecipeBase &R : Recipes) {
384     for (auto *Def : R.definedValues())
385       Def->replaceAllUsesWith(NewValue);
386 
387     for (unsigned I = 0, E = R.getNumOperands(); I != E; I++)
388       R.setOperand(I, NewValue);
389   }
390 }
391 
392 VPBasicBlock *VPBasicBlock::splitAt(iterator SplitAt) {
393   assert((SplitAt == end() || SplitAt->getParent() == this) &&
394          "can only split at a position in the same block");
395 
396   SmallVector<VPBlockBase *, 2> Succs(successors());
397   // First, disconnect the current block from its successors.
398   for (VPBlockBase *Succ : Succs)
399     VPBlockUtils::disconnectBlocks(this, Succ);
400 
401   // Create new empty block after the block to split.
402   auto *SplitBlock = new VPBasicBlock(getName() + ".split");
403   VPBlockUtils::insertBlockAfter(SplitBlock, this);
404 
405   // Add successors for block to split to new block.
406   for (VPBlockBase *Succ : Succs)
407     VPBlockUtils::connectBlocks(SplitBlock, Succ);
408 
409   // Finally, move the recipes starting at SplitAt to new block.
410   for (VPRecipeBase &ToMove :
411        make_early_inc_range(make_range(SplitAt, this->end())))
412     ToMove.moveBefore(*SplitBlock, SplitBlock->end());
413 
414   return SplitBlock;
415 }
416 
417 VPRegionBlock *VPBasicBlock::getEnclosingLoopRegion() {
418   VPRegionBlock *P = getParent();
419   if (P && P->isReplicator()) {
420     P = P->getParent();
421     assert(!cast<VPRegionBlock>(P)->isReplicator() &&
422            "unexpected nested replicate regions");
423   }
424   return P;
425 }
426 
427 static bool hasConditionalTerminator(const VPBasicBlock *VPBB) {
428   if (VPBB->empty()) {
429     assert(
430         VPBB->getNumSuccessors() < 2 &&
431         "block with multiple successors doesn't have a recipe as terminator");
432     return false;
433   }
434 
435   const VPRecipeBase *R = &VPBB->back();
436   auto *VPI = dyn_cast<VPInstruction>(R);
437   bool IsCondBranch =
438       isa<VPBranchOnMaskRecipe>(R) ||
439       (VPI && (VPI->getOpcode() == VPInstruction::BranchOnCond ||
440                VPI->getOpcode() == VPInstruction::BranchOnCount));
441   (void)IsCondBranch;
442 
443   if (VPBB->getNumSuccessors() >= 2 || VPBB->isExiting()) {
444     assert(IsCondBranch && "block with multiple successors not terminated by "
445                            "conditional branch recipe");
446 
447     return true;
448   }
449 
450   assert(
451       !IsCondBranch &&
452       "block with 0 or 1 successors terminated by conditional branch recipe");
453   return false;
454 }
455 
456 VPRecipeBase *VPBasicBlock::getTerminator() {
457   if (hasConditionalTerminator(this))
458     return &back();
459   return nullptr;
460 }
461 
462 const VPRecipeBase *VPBasicBlock::getTerminator() const {
463   if (hasConditionalTerminator(this))
464     return &back();
465   return nullptr;
466 }
467 
468 bool VPBasicBlock::isExiting() const {
469   return getParent()->getExitingBasicBlock() == this;
470 }
471 
472 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
473 void VPBlockBase::printSuccessors(raw_ostream &O, const Twine &Indent) const {
474   if (getSuccessors().empty()) {
475     O << Indent << "No successors\n";
476   } else {
477     O << Indent << "Successor(s): ";
478     ListSeparator LS;
479     for (auto *Succ : getSuccessors())
480       O << LS << Succ->getName();
481     O << '\n';
482   }
483 }
484 
485 void VPBasicBlock::print(raw_ostream &O, const Twine &Indent,
486                          VPSlotTracker &SlotTracker) const {
487   O << Indent << getName() << ":\n";
488 
489   auto RecipeIndent = Indent + "  ";
490   for (const VPRecipeBase &Recipe : *this) {
491     Recipe.print(O, RecipeIndent, SlotTracker);
492     O << '\n';
493   }
494 
495   printSuccessors(O, Indent);
496 }
497 #endif
498 
499 void VPRegionBlock::dropAllReferences(VPValue *NewValue) {
500   for (VPBlockBase *Block : depth_first(Entry))
501     // Drop all references in VPBasicBlocks and replace all uses with
502     // DummyValue.
503     Block->dropAllReferences(NewValue);
504 }
505 
506 void VPRegionBlock::execute(VPTransformState *State) {
507   ReversePostOrderTraversal<VPBlockBase *> RPOT(Entry);
508 
509   if (!isReplicator()) {
510     // Create and register the new vector loop.
511     Loop *PrevLoop = State->CurrentVectorLoop;
512     State->CurrentVectorLoop = State->LI->AllocateLoop();
513     BasicBlock *VectorPH = State->CFG.VPBB2IRBB[getPreheaderVPBB()];
514     Loop *ParentLoop = State->LI->getLoopFor(VectorPH);
515 
516     // Insert the new loop into the loop nest and register the new basic blocks
517     // before calling any utilities such as SCEV that require valid LoopInfo.
518     if (ParentLoop)
519       ParentLoop->addChildLoop(State->CurrentVectorLoop);
520     else
521       State->LI->addTopLevelLoop(State->CurrentVectorLoop);
522 
523     // Visit the VPBlocks connected to "this", starting from it.
524     for (VPBlockBase *Block : RPOT) {
525       LLVM_DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n');
526       Block->execute(State);
527     }
528 
529     State->CurrentVectorLoop = PrevLoop;
530     return;
531   }
532 
533   assert(!State->Instance && "Replicating a Region with non-null instance.");
534 
535   // Enter replicating mode.
536   State->Instance = VPIteration(0, 0);
537 
538   for (unsigned Part = 0, UF = State->UF; Part < UF; ++Part) {
539     State->Instance->Part = Part;
540     assert(!State->VF.isScalable() && "VF is assumed to be non scalable.");
541     for (unsigned Lane = 0, VF = State->VF.getKnownMinValue(); Lane < VF;
542          ++Lane) {
543       State->Instance->Lane = VPLane(Lane, VPLane::Kind::First);
544       // Visit the VPBlocks connected to \p this, starting from it.
545       for (VPBlockBase *Block : RPOT) {
546         LLVM_DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n');
547         Block->execute(State);
548       }
549     }
550   }
551 
552   // Exit replicating mode.
553   State->Instance.reset();
554 }
555 
556 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
557 void VPRegionBlock::print(raw_ostream &O, const Twine &Indent,
558                           VPSlotTracker &SlotTracker) const {
559   O << Indent << (isReplicator() ? "<xVFxUF> " : "<x1> ") << getName() << ": {";
560   auto NewIndent = Indent + "  ";
561   for (auto *BlockBase : depth_first(Entry)) {
562     O << '\n';
563     BlockBase->print(O, NewIndent, SlotTracker);
564   }
565   O << Indent << "}\n";
566 
567   printSuccessors(O, Indent);
568 }
569 #endif
570 
571 void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
572                              Value *CanonicalIVStartValue,
573                              VPTransformState &State,
574                              bool IsEpilogueVectorization) {
575 
576   VPBasicBlock *ExitingVPBB = getVectorLoopRegion()->getExitingBasicBlock();
577   auto *Term = dyn_cast<VPInstruction>(&ExitingVPBB->back());
578   // Try to simplify BranchOnCount to 'BranchOnCond true' if TC <= VF * UF when
579   // preparing to execute the plan for the main vector loop.
580   if (!IsEpilogueVectorization && Term &&
581       Term->getOpcode() == VPInstruction::BranchOnCount &&
582       isa<ConstantInt>(TripCountV)) {
583     ConstantInt *C = cast<ConstantInt>(TripCountV);
584     uint64_t TCVal = C->getZExtValue();
585     if (TCVal && TCVal <= State.VF.getKnownMinValue() * State.UF) {
586       auto *BOC =
587           new VPInstruction(VPInstruction::BranchOnCond,
588                             {getOrAddExternalDef(State.Builder.getTrue())});
589       Term->eraseFromParent();
590       ExitingVPBB->appendRecipe(BOC);
591       // TODO: Further simplifications are possible
592       //      1. Replace inductions with constants.
593       //      2. Replace vector loop region with VPBasicBlock.
594     }
595   }
596 
597   // Check if the trip count is needed, and if so build it.
598   if (TripCount && TripCount->getNumUsers()) {
599     for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
600       State.set(TripCount, TripCountV, Part);
601   }
602 
603   // Check if the backedge taken count is needed, and if so build it.
604   if (BackedgeTakenCount && BackedgeTakenCount->getNumUsers()) {
605     IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
606     auto *TCMO = Builder.CreateSub(TripCountV,
607                                    ConstantInt::get(TripCountV->getType(), 1),
608                                    "trip.count.minus.1");
609     auto VF = State.VF;
610     Value *VTCMO =
611         VF.isScalar() ? TCMO : Builder.CreateVectorSplat(VF, TCMO, "broadcast");
612     for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
613       State.set(BackedgeTakenCount, VTCMO, Part);
614   }
615 
616   for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
617     State.set(&VectorTripCount, VectorTripCountV, Part);
618 
619   // When vectorizing the epilogue loop, the canonical induction start value
620   // needs to be changed from zero to the value after the main vector loop.
621   if (CanonicalIVStartValue) {
622     VPValue *VPV = getOrAddExternalDef(CanonicalIVStartValue);
623     auto *IV = getCanonicalIV();
624     assert(all_of(IV->users(),
625                   [](const VPUser *U) {
626                     if (isa<VPScalarIVStepsRecipe>(U))
627                       return true;
628                     auto *VPI = cast<VPInstruction>(U);
629                     return VPI->getOpcode() ==
630                                VPInstruction::CanonicalIVIncrement ||
631                            VPI->getOpcode() ==
632                                VPInstruction::CanonicalIVIncrementNUW;
633                   }) &&
634            "the canonical IV should only be used by its increments or "
635            "ScalarIVSteps when "
636            "resetting the start value");
637     IV->setOperand(0, VPV);
638   }
639 }
640 
641 /// Generate the code inside the preheader and body of the vectorized loop.
642 /// Assumes a single pre-header basic-block was created for this. Introduce
643 /// additional basic-blocks as needed, and fill them all.
644 void VPlan::execute(VPTransformState *State) {
645   // Set the reverse mapping from VPValues to Values for code generation.
646   for (auto &Entry : Value2VPValue)
647     State->VPValue2Value[Entry.second] = Entry.first;
648 
649   // Initialize CFG state.
650   State->CFG.PrevVPBB = nullptr;
651   State->CFG.ExitBB = State->CFG.PrevBB->getSingleSuccessor();
652   BasicBlock *VectorPreHeader = State->CFG.PrevBB;
653   State->Builder.SetInsertPoint(VectorPreHeader->getTerminator());
654 
655   // Generate code in the loop pre-header and body.
656   for (VPBlockBase *Block : depth_first(Entry))
657     Block->execute(State);
658 
659   VPBasicBlock *LatchVPBB = getVectorLoopRegion()->getExitingBasicBlock();
660   BasicBlock *VectorLatchBB = State->CFG.VPBB2IRBB[LatchVPBB];
661 
662   // Fix the latch value of canonical, reduction and first-order recurrences
663   // phis in the vector loop.
664   VPBasicBlock *Header = getVectorLoopRegion()->getEntryBasicBlock();
665   for (VPRecipeBase &R : Header->phis()) {
666     // Skip phi-like recipes that generate their backedege values themselves.
667     if (isa<VPWidenPHIRecipe>(&R))
668       continue;
669 
670     if (isa<VPWidenPointerInductionRecipe>(&R) ||
671         isa<VPWidenIntOrFpInductionRecipe>(&R)) {
672       PHINode *Phi = nullptr;
673       if (isa<VPWidenIntOrFpInductionRecipe>(&R)) {
674         Phi = cast<PHINode>(State->get(R.getVPSingleValue(), 0));
675       } else {
676         auto *WidenPhi = cast<VPWidenPointerInductionRecipe>(&R);
677         // TODO: Split off the case that all users of a pointer phi are scalar
678         // from the VPWidenPointerInductionRecipe.
679         if (WidenPhi->onlyScalarsGenerated(State->VF))
680           continue;
681 
682         auto *GEP = cast<GetElementPtrInst>(State->get(WidenPhi, 0));
683         Phi = cast<PHINode>(GEP->getPointerOperand());
684       }
685 
686       Phi->setIncomingBlock(1, VectorLatchBB);
687 
688       // Move the last step to the end of the latch block. This ensures
689       // consistent placement of all induction updates.
690       Instruction *Inc = cast<Instruction>(Phi->getIncomingValue(1));
691       Inc->moveBefore(VectorLatchBB->getTerminator()->getPrevNode());
692       continue;
693     }
694 
695     auto *PhiR = cast<VPHeaderPHIRecipe>(&R);
696     // For  canonical IV, first-order recurrences and in-order reduction phis,
697     // only a single part is generated, which provides the last part from the
698     // previous iteration. For non-ordered reductions all UF parts are
699     // generated.
700     bool SinglePartNeeded = isa<VPCanonicalIVPHIRecipe>(PhiR) ||
701                             isa<VPFirstOrderRecurrencePHIRecipe>(PhiR) ||
702                             cast<VPReductionPHIRecipe>(PhiR)->isOrdered();
703     unsigned LastPartForNewPhi = SinglePartNeeded ? 1 : State->UF;
704 
705     for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
706       Value *Phi = State->get(PhiR, Part);
707       Value *Val = State->get(PhiR->getBackedgeValue(),
708                               SinglePartNeeded ? State->UF - 1 : Part);
709       cast<PHINode>(Phi)->addIncoming(Val, VectorLatchBB);
710     }
711   }
712 
713   // We do not attempt to preserve DT for outer loop vectorization currently.
714   if (!EnableVPlanNativePath) {
715     BasicBlock *VectorHeaderBB = State->CFG.VPBB2IRBB[Header];
716     State->DT->addNewBlock(VectorHeaderBB, VectorPreHeader);
717     updateDominatorTree(State->DT, VectorHeaderBB, VectorLatchBB,
718                         State->CFG.ExitBB);
719   }
720 }
721 
722 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
723 LLVM_DUMP_METHOD
724 void VPlan::print(raw_ostream &O) const {
725   VPSlotTracker SlotTracker(this);
726 
727   O << "VPlan '" << Name << "' {";
728 
729   if (VectorTripCount.getNumUsers() > 0) {
730     O << "\nLive-in ";
731     VectorTripCount.printAsOperand(O, SlotTracker);
732     O << " = vector-trip-count\n";
733   }
734 
735   if (BackedgeTakenCount && BackedgeTakenCount->getNumUsers()) {
736     O << "\nLive-in ";
737     BackedgeTakenCount->printAsOperand(O, SlotTracker);
738     O << " = backedge-taken count\n";
739   }
740 
741   for (const VPBlockBase *Block : depth_first(getEntry())) {
742     O << '\n';
743     Block->print(O, "", SlotTracker);
744   }
745 
746   if (!LiveOuts.empty())
747     O << "\n";
748   for (auto &KV : LiveOuts) {
749     O << "Live-out ";
750     KV.second->getPhi()->printAsOperand(O);
751     O << " = ";
752     KV.second->getOperand(0)->printAsOperand(O, SlotTracker);
753     O << "\n";
754   }
755 
756   O << "}\n";
757 }
758 
759 LLVM_DUMP_METHOD
760 void VPlan::printDOT(raw_ostream &O) const {
761   VPlanPrinter Printer(O, *this);
762   Printer.dump();
763 }
764 
765 LLVM_DUMP_METHOD
766 void VPlan::dump() const { print(dbgs()); }
767 #endif
768 
769 void VPlan::addLiveOut(PHINode *PN, VPValue *V) {
770   assert(LiveOuts.count(PN) == 0 && "an exit value for PN already exists");
771   LiveOuts.insert({PN, new VPLiveOut(PN, V)});
772 }
773 
774 void VPlan::updateDominatorTree(DominatorTree *DT, BasicBlock *LoopHeaderBB,
775                                 BasicBlock *LoopLatchBB,
776                                 BasicBlock *LoopExitBB) {
777   // The vector body may be more than a single basic-block by this point.
778   // Update the dominator tree information inside the vector body by propagating
779   // it from header to latch, expecting only triangular control-flow, if any.
780   BasicBlock *PostDomSucc = nullptr;
781   for (auto *BB = LoopHeaderBB; BB != LoopLatchBB; BB = PostDomSucc) {
782     // Get the list of successors of this block.
783     std::vector<BasicBlock *> Succs(succ_begin(BB), succ_end(BB));
784     assert(Succs.size() <= 2 &&
785            "Basic block in vector loop has more than 2 successors.");
786     PostDomSucc = Succs[0];
787     if (Succs.size() == 1) {
788       assert(PostDomSucc->getSinglePredecessor() &&
789              "PostDom successor has more than one predecessor.");
790       DT->addNewBlock(PostDomSucc, BB);
791       continue;
792     }
793     BasicBlock *InterimSucc = Succs[1];
794     if (PostDomSucc->getSingleSuccessor() == InterimSucc) {
795       PostDomSucc = Succs[1];
796       InterimSucc = Succs[0];
797     }
798     assert(InterimSucc->getSingleSuccessor() == PostDomSucc &&
799            "One successor of a basic block does not lead to the other.");
800     assert(InterimSucc->getSinglePredecessor() &&
801            "Interim successor has more than one predecessor.");
802     assert(PostDomSucc->hasNPredecessors(2) &&
803            "PostDom successor has more than two predecessors.");
804     DT->addNewBlock(InterimSucc, BB);
805     DT->addNewBlock(PostDomSucc, BB);
806   }
807   // Latch block is a new dominator for the loop exit.
808   DT->changeImmediateDominator(LoopExitBB, LoopLatchBB);
809   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
810 }
811 
812 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
813 
814 Twine VPlanPrinter::getUID(const VPBlockBase *Block) {
815   return (isa<VPRegionBlock>(Block) ? "cluster_N" : "N") +
816          Twine(getOrCreateBID(Block));
817 }
818 
819 Twine VPlanPrinter::getOrCreateName(const VPBlockBase *Block) {
820   const std::string &Name = Block->getName();
821   if (!Name.empty())
822     return Name;
823   return "VPB" + Twine(getOrCreateBID(Block));
824 }
825 
826 void VPlanPrinter::dump() {
827   Depth = 1;
828   bumpIndent(0);
829   OS << "digraph VPlan {\n";
830   OS << "graph [labelloc=t, fontsize=30; label=\"Vectorization Plan";
831   if (!Plan.getName().empty())
832     OS << "\\n" << DOT::EscapeString(Plan.getName());
833   if (Plan.BackedgeTakenCount) {
834     OS << ", where:\\n";
835     Plan.BackedgeTakenCount->print(OS, SlotTracker);
836     OS << " := BackedgeTakenCount";
837   }
838   OS << "\"]\n";
839   OS << "node [shape=rect, fontname=Courier, fontsize=30]\n";
840   OS << "edge [fontname=Courier, fontsize=30]\n";
841   OS << "compound=true\n";
842 
843   for (const VPBlockBase *Block : depth_first(Plan.getEntry()))
844     dumpBlock(Block);
845 
846   OS << "}\n";
847 }
848 
849 void VPlanPrinter::dumpBlock(const VPBlockBase *Block) {
850   if (const VPBasicBlock *BasicBlock = dyn_cast<VPBasicBlock>(Block))
851     dumpBasicBlock(BasicBlock);
852   else if (const VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block))
853     dumpRegion(Region);
854   else
855     llvm_unreachable("Unsupported kind of VPBlock.");
856 }
857 
858 void VPlanPrinter::drawEdge(const VPBlockBase *From, const VPBlockBase *To,
859                             bool Hidden, const Twine &Label) {
860   // Due to "dot" we print an edge between two regions as an edge between the
861   // exiting basic block and the entry basic of the respective regions.
862   const VPBlockBase *Tail = From->getExitingBasicBlock();
863   const VPBlockBase *Head = To->getEntryBasicBlock();
864   OS << Indent << getUID(Tail) << " -> " << getUID(Head);
865   OS << " [ label=\"" << Label << '\"';
866   if (Tail != From)
867     OS << " ltail=" << getUID(From);
868   if (Head != To)
869     OS << " lhead=" << getUID(To);
870   if (Hidden)
871     OS << "; splines=none";
872   OS << "]\n";
873 }
874 
875 void VPlanPrinter::dumpEdges(const VPBlockBase *Block) {
876   auto &Successors = Block->getSuccessors();
877   if (Successors.size() == 1)
878     drawEdge(Block, Successors.front(), false, "");
879   else if (Successors.size() == 2) {
880     drawEdge(Block, Successors.front(), false, "T");
881     drawEdge(Block, Successors.back(), false, "F");
882   } else {
883     unsigned SuccessorNumber = 0;
884     for (auto *Successor : Successors)
885       drawEdge(Block, Successor, false, Twine(SuccessorNumber++));
886   }
887 }
888 
889 void VPlanPrinter::dumpBasicBlock(const VPBasicBlock *BasicBlock) {
890   // Implement dot-formatted dump by performing plain-text dump into the
891   // temporary storage followed by some post-processing.
892   OS << Indent << getUID(BasicBlock) << " [label =\n";
893   bumpIndent(1);
894   std::string Str;
895   raw_string_ostream SS(Str);
896   // Use no indentation as we need to wrap the lines into quotes ourselves.
897   BasicBlock->print(SS, "", SlotTracker);
898 
899   // We need to process each line of the output separately, so split
900   // single-string plain-text dump.
901   SmallVector<StringRef, 0> Lines;
902   StringRef(Str).rtrim('\n').split(Lines, "\n");
903 
904   auto EmitLine = [&](StringRef Line, StringRef Suffix) {
905     OS << Indent << '"' << DOT::EscapeString(Line.str()) << "\\l\"" << Suffix;
906   };
907 
908   // Don't need the "+" after the last line.
909   for (auto Line : make_range(Lines.begin(), Lines.end() - 1))
910     EmitLine(Line, " +\n");
911   EmitLine(Lines.back(), "\n");
912 
913   bumpIndent(-1);
914   OS << Indent << "]\n";
915 
916   dumpEdges(BasicBlock);
917 }
918 
919 void VPlanPrinter::dumpRegion(const VPRegionBlock *Region) {
920   OS << Indent << "subgraph " << getUID(Region) << " {\n";
921   bumpIndent(1);
922   OS << Indent << "fontname=Courier\n"
923      << Indent << "label=\""
924      << DOT::EscapeString(Region->isReplicator() ? "<xVFxUF> " : "<x1> ")
925      << DOT::EscapeString(Region->getName()) << "\"\n";
926   // Dump the blocks of the region.
927   assert(Region->getEntry() && "Region contains no inner blocks.");
928   for (const VPBlockBase *Block : depth_first(Region->getEntry()))
929     dumpBlock(Block);
930   bumpIndent(-1);
931   OS << Indent << "}\n";
932   dumpEdges(Region);
933 }
934 
935 void VPlanIngredient::print(raw_ostream &O) const {
936   if (auto *Inst = dyn_cast<Instruction>(V)) {
937     if (!Inst->getType()->isVoidTy()) {
938       Inst->printAsOperand(O, false);
939       O << " = ";
940     }
941     O << Inst->getOpcodeName() << " ";
942     unsigned E = Inst->getNumOperands();
943     if (E > 0) {
944       Inst->getOperand(0)->printAsOperand(O, false);
945       for (unsigned I = 1; I < E; ++I)
946         Inst->getOperand(I)->printAsOperand(O << ", ", false);
947     }
948   } else // !Inst
949     V->printAsOperand(O, false);
950 }
951 
952 #endif
953 
954 template void DomTreeBuilder::Calculate<VPDominatorTree>(VPDominatorTree &DT);
955 
956 void VPValue::replaceAllUsesWith(VPValue *New) {
957   for (unsigned J = 0; J < getNumUsers();) {
958     VPUser *User = Users[J];
959     unsigned NumUsers = getNumUsers();
960     for (unsigned I = 0, E = User->getNumOperands(); I < E; ++I)
961       if (User->getOperand(I) == this)
962         User->setOperand(I, New);
963     // If a user got removed after updating the current user, the next user to
964     // update will be moved to the current position, so we only need to
965     // increment the index if the number of users did not change.
966     if (NumUsers == getNumUsers())
967       J++;
968   }
969 }
970 
971 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
972 void VPValue::printAsOperand(raw_ostream &OS, VPSlotTracker &Tracker) const {
973   if (const Value *UV = getUnderlyingValue()) {
974     OS << "ir<";
975     UV->printAsOperand(OS, false);
976     OS << ">";
977     return;
978   }
979 
980   unsigned Slot = Tracker.getSlot(this);
981   if (Slot == unsigned(-1))
982     OS << "<badref>";
983   else
984     OS << "vp<%" << Tracker.getSlot(this) << ">";
985 }
986 
987 void VPUser::printOperands(raw_ostream &O, VPSlotTracker &SlotTracker) const {
988   interleaveComma(operands(), O, [&O, &SlotTracker](VPValue *Op) {
989     Op->printAsOperand(O, SlotTracker);
990   });
991 }
992 #endif
993 
994 void VPInterleavedAccessInfo::visitRegion(VPRegionBlock *Region,
995                                           Old2NewTy &Old2New,
996                                           InterleavedAccessInfo &IAI) {
997   ReversePostOrderTraversal<VPBlockBase *> RPOT(Region->getEntry());
998   for (VPBlockBase *Base : RPOT) {
999     visitBlock(Base, Old2New, IAI);
1000   }
1001 }
1002 
1003 void VPInterleavedAccessInfo::visitBlock(VPBlockBase *Block, Old2NewTy &Old2New,
1004                                          InterleavedAccessInfo &IAI) {
1005   if (VPBasicBlock *VPBB = dyn_cast<VPBasicBlock>(Block)) {
1006     for (VPRecipeBase &VPI : *VPBB) {
1007       if (isa<VPHeaderPHIRecipe>(&VPI))
1008         continue;
1009       assert(isa<VPInstruction>(&VPI) && "Can only handle VPInstructions");
1010       auto *VPInst = cast<VPInstruction>(&VPI);
1011 
1012       auto *Inst = dyn_cast_or_null<Instruction>(VPInst->getUnderlyingValue());
1013       if (!Inst)
1014         continue;
1015       auto *IG = IAI.getInterleaveGroup(Inst);
1016       if (!IG)
1017         continue;
1018 
1019       auto NewIGIter = Old2New.find(IG);
1020       if (NewIGIter == Old2New.end())
1021         Old2New[IG] = new InterleaveGroup<VPInstruction>(
1022             IG->getFactor(), IG->isReverse(), IG->getAlign());
1023 
1024       if (Inst == IG->getInsertPos())
1025         Old2New[IG]->setInsertPos(VPInst);
1026 
1027       InterleaveGroupMap[VPInst] = Old2New[IG];
1028       InterleaveGroupMap[VPInst]->insertMember(
1029           VPInst, IG->getIndex(Inst),
1030           Align(IG->isReverse() ? (-1) * int(IG->getFactor())
1031                                 : IG->getFactor()));
1032     }
1033   } else if (VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block))
1034     visitRegion(Region, Old2New, IAI);
1035   else
1036     llvm_unreachable("Unsupported kind of VPBlock.");
1037 }
1038 
1039 VPInterleavedAccessInfo::VPInterleavedAccessInfo(VPlan &Plan,
1040                                                  InterleavedAccessInfo &IAI) {
1041   Old2NewTy Old2New;
1042   visitRegion(Plan.getVectorLoopRegion(), Old2New, IAI);
1043 }
1044 
1045 void VPSlotTracker::assignSlot(const VPValue *V) {
1046   assert(Slots.find(V) == Slots.end() && "VPValue already has a slot!");
1047   Slots[V] = NextSlot++;
1048 }
1049 
1050 void VPSlotTracker::assignSlots(const VPlan &Plan) {
1051 
1052   for (const auto &P : Plan.VPExternalDefs)
1053     assignSlot(P.second);
1054 
1055   assignSlot(&Plan.VectorTripCount);
1056   if (Plan.BackedgeTakenCount)
1057     assignSlot(Plan.BackedgeTakenCount);
1058 
1059   ReversePostOrderTraversal<
1060       VPBlockRecursiveTraversalWrapper<const VPBlockBase *>>
1061       RPOT(VPBlockRecursiveTraversalWrapper<const VPBlockBase *>(
1062           Plan.getEntry()));
1063   for (const VPBasicBlock *VPBB :
1064        VPBlockUtils::blocksOnly<const VPBasicBlock>(RPOT))
1065     for (const VPRecipeBase &Recipe : *VPBB)
1066       for (VPValue *Def : Recipe.definedValues())
1067         assignSlot(Def);
1068 }
1069 
1070 bool vputils::onlyFirstLaneUsed(VPValue *Def) {
1071   return all_of(Def->users(),
1072                 [Def](VPUser *U) { return U->onlyFirstLaneUsed(Def); });
1073 }
1074 
1075 VPValue *vputils::getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr,
1076                                                 ScalarEvolution &SE) {
1077   if (auto *E = dyn_cast<SCEVConstant>(Expr))
1078     return Plan.getOrAddExternalDef(E->getValue());
1079   if (auto *E = dyn_cast<SCEVUnknown>(Expr))
1080     return Plan.getOrAddExternalDef(E->getValue());
1081 
1082   VPBasicBlock *Preheader = Plan.getEntry()->getEntryBasicBlock();
1083   VPValue *Step = new VPExpandSCEVRecipe(Expr, SE);
1084   Preheader->appendRecipe(cast<VPRecipeBase>(Step->getDef()));
1085   return Step;
1086 }
1087