1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
11 // and generates target-independent LLVM-IR.
12 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
13 // of instructions in order to estimate the profitability of vectorization.
14 //
15 // The loop vectorizer combines consecutive loop iterations into a single
16 // 'wide' iteration. After this transformation the index is incremented
17 // by the SIMD vector width, and not by one.
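//
// For illustration (a sketch, not taken from this file), a scalar loop such as
//
//   for (int i = 0; i < n; ++i)
//     C[i] = A[i] + B[i];
//
// is conceptually rewritten for a vectorization factor of 4 so that each wide
// iteration loads A[i..i+3] and B[i..i+3], adds them with vector instructions,
// stores the four results, and advances i by 4, with a scalar epilogue loop
// handling any remaining iterations.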
18 //
19 // This pass has four parts:
20 // 1. The main loop pass that drives the different parts.
21 // 2. LoopVectorizationLegality - A unit that checks for the legality
22 //    of the vectorization.
23 // 3. InnerLoopVectorizer - A unit that performs the actual
24 //    widening of instructions.
25 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
26 //    of vectorization. It decides on the optimal vector width, which
27 //    can be one, if vectorization is not profitable.
28 //
29 //===----------------------------------------------------------------------===//
30 //
31 // The reduction-variable vectorization is based on the paper:
32 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
33 //
34 // Variable uniformity checks are inspired by:
35 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
36 //
37 // The interleaved access vectorization is based on the paper:
38 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
39 //  Data for SIMD
40 //
41 // Other ideas/concepts are from:
42 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
43 //
44 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
45 //  Vectorizing Compilers.
46 //
47 //===----------------------------------------------------------------------===//
48 
49 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
50 #include "VPlan.h"
51 #include "llvm/ADT/APInt.h"
52 #include "llvm/ADT/ArrayRef.h"
53 #include "llvm/ADT/DenseMap.h"
54 #include "llvm/ADT/DenseMapInfo.h"
55 #include "llvm/ADT/Hashing.h"
56 #include "llvm/ADT/MapVector.h"
57 #include "llvm/ADT/None.h"
58 #include "llvm/ADT/Optional.h"
59 #include "llvm/ADT/SCCIterator.h"
60 #include "llvm/ADT/STLExtras.h"
61 #include "llvm/ADT/SetVector.h"
62 #include "llvm/ADT/SmallPtrSet.h"
63 #include "llvm/ADT/SmallSet.h"
64 #include "llvm/ADT/SmallVector.h"
65 #include "llvm/ADT/Statistic.h"
66 #include "llvm/ADT/StringRef.h"
67 #include "llvm/ADT/Twine.h"
68 #include "llvm/ADT/iterator_range.h"
69 #include "llvm/Analysis/AssumptionCache.h"
70 #include "llvm/Analysis/BasicAliasAnalysis.h"
71 #include "llvm/Analysis/BlockFrequencyInfo.h"
72 #include "llvm/Analysis/CodeMetrics.h"
73 #include "llvm/Analysis/DemandedBits.h"
74 #include "llvm/Analysis/GlobalsModRef.h"
75 #include "llvm/Analysis/LoopAccessAnalysis.h"
76 #include "llvm/Analysis/LoopAnalysisManager.h"
77 #include "llvm/Analysis/LoopInfo.h"
78 #include "llvm/Analysis/LoopIterator.h"
79 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
80 #include "llvm/Analysis/ScalarEvolution.h"
81 #include "llvm/Analysis/ScalarEvolutionExpander.h"
82 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
83 #include "llvm/Analysis/TargetLibraryInfo.h"
84 #include "llvm/Analysis/TargetTransformInfo.h"
85 #include "llvm/Analysis/VectorUtils.h"
86 #include "llvm/IR/Attributes.h"
87 #include "llvm/IR/BasicBlock.h"
88 #include "llvm/IR/CFG.h"
89 #include "llvm/IR/Constant.h"
90 #include "llvm/IR/Constants.h"
91 #include "llvm/IR/DataLayout.h"
92 #include "llvm/IR/DebugInfoMetadata.h"
93 #include "llvm/IR/DebugLoc.h"
94 #include "llvm/IR/DerivedTypes.h"
95 #include "llvm/IR/DiagnosticInfo.h"
96 #include "llvm/IR/Dominators.h"
97 #include "llvm/IR/Function.h"
98 #include "llvm/IR/IRBuilder.h"
99 #include "llvm/IR/InstrTypes.h"
100 #include "llvm/IR/Instruction.h"
101 #include "llvm/IR/Instructions.h"
102 #include "llvm/IR/IntrinsicInst.h"
103 #include "llvm/IR/Intrinsics.h"
104 #include "llvm/IR/LLVMContext.h"
105 #include "llvm/IR/Metadata.h"
106 #include "llvm/IR/Module.h"
107 #include "llvm/IR/Operator.h"
108 #include "llvm/IR/Type.h"
109 #include "llvm/IR/Use.h"
110 #include "llvm/IR/User.h"
111 #include "llvm/IR/Value.h"
112 #include "llvm/IR/ValueHandle.h"
113 #include "llvm/IR/Verifier.h"
114 #include "llvm/Pass.h"
115 #include "llvm/Support/Casting.h"
116 #include "llvm/Support/CommandLine.h"
117 #include "llvm/Support/Compiler.h"
118 #include "llvm/Support/Debug.h"
119 #include "llvm/Support/ErrorHandling.h"
120 #include "llvm/Support/MathExtras.h"
121 #include "llvm/Support/raw_ostream.h"
122 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
123 #include "llvm/Transforms/Utils/LoopSimplify.h"
124 #include "llvm/Transforms/Utils/LoopUtils.h"
125 #include "llvm/Transforms/Utils/LoopVersioning.h"
126 #include <algorithm>
127 #include <cassert>
128 #include <cstdint>
129 #include <cstdlib>
130 #include <functional>
131 #include <iterator>
132 #include <limits>
133 #include <memory>
134 #include <string>
135 #include <tuple>
136 #include <utility>
137 #include <vector>
138 
139 using namespace llvm;
140 
141 #define LV_NAME "loop-vectorize"
142 #define DEBUG_TYPE LV_NAME
143 
144 STATISTIC(LoopsVectorized, "Number of loops vectorized");
145 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
146 
147 static cl::opt<bool>
148     EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
149                        cl::desc("Enable if-conversion during vectorization."));
150 
151 /// Loops with a known constant trip count below this number are vectorized only
152 /// if no scalar iteration overheads are incurred.
153 static cl::opt<unsigned> TinyTripCountVectorThreshold(
154     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
155     cl::desc("Loops with a constant trip count that is smaller than this "
156              "value are vectorized only if no scalar iteration overheads "
157              "are incurred."));
158 
159 static cl::opt<bool> MaximizeBandwidth(
160     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
161     cl::desc("Maximize bandwidth when selecting vectorization factor which "
162              "will be determined by the smallest type in loop."));
163 
164 static cl::opt<bool> EnableInterleavedMemAccesses(
165     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
166     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
167 
168 /// Maximum factor for an interleaved memory access.
169 static cl::opt<unsigned> MaxInterleaveGroupFactor(
170     "max-interleave-group-factor", cl::Hidden,
171     cl::desc("Maximum factor for an interleaved access group (default = 8)"),
172     cl::init(8));
173 
174 /// We don't interleave loops with a known constant trip count below this
175 /// number.
176 static const unsigned TinyTripCountInterleaveThreshold = 128;
177 
178 static cl::opt<unsigned> ForceTargetNumScalarRegs(
179     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
180     cl::desc("A flag that overrides the target's number of scalar registers."));
181 
182 static cl::opt<unsigned> ForceTargetNumVectorRegs(
183     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
184     cl::desc("A flag that overrides the target's number of vector registers."));
185 
186 /// Maximum vectorization interleave count.
187 static const unsigned MaxInterleaveFactor = 16;
188 
189 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
190     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
191     cl::desc("A flag that overrides the target's max interleave factor for "
192              "scalar loops."));
193 
194 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
195     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
196     cl::desc("A flag that overrides the target's max interleave factor for "
197              "vectorized loops."));
198 
199 static cl::opt<unsigned> ForceTargetInstructionCost(
200     "force-target-instruction-cost", cl::init(0), cl::Hidden,
201     cl::desc("A flag that overrides the target's expected cost for "
202              "an instruction to a single constant value. Mostly "
203              "useful for getting consistent testing."));
204 
205 static cl::opt<unsigned> SmallLoopCost(
206     "small-loop-cost", cl::init(20), cl::Hidden,
207     cl::desc(
208         "The cost of a loop that is considered 'small' by the interleaver."));
209 
210 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
211     "loop-vectorize-with-block-frequency", cl::init(false), cl::Hidden,
212     cl::desc("Enable the use of the block frequency analysis to access PGO "
213              "heuristics minimizing code growth in cold regions and being more "
214              "aggressive in hot regions."));
215 
216 // Runtime interleave loops for load/store throughput.
217 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
218     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
219     cl::desc(
220         "Enable runtime interleaving until load/store ports are saturated"));
221 
222 /// The number of stores in a loop that are allowed to need predication.
223 static cl::opt<unsigned> NumberOfStoresToPredicate(
224     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
225     cl::desc("Max number of stores to be predicated behind an if."));
226 
227 static cl::opt<bool> EnableIndVarRegisterHeur(
228     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
229     cl::desc("Count the induction variable only once when interleaving"));
230 
231 static cl::opt<bool> EnableCondStoresVectorization(
232     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
233     cl::desc("Enable if predication of stores during vectorization."));
234 
235 static cl::opt<unsigned> MaxNestedScalarReductionIC(
236     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
237     cl::desc("The maximum interleave count to use when interleaving a scalar "
238              "reduction in a nested loop."));
239 
240 static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
241     "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
242     cl::desc("The maximum allowed number of runtime memory checks with a "
243              "vectorize(enable) pragma."));
244 
245 static cl::opt<unsigned> VectorizeSCEVCheckThreshold(
246     "vectorize-scev-check-threshold", cl::init(16), cl::Hidden,
247     cl::desc("The maximum number of SCEV checks allowed."));
248 
249 static cl::opt<unsigned> PragmaVectorizeSCEVCheckThreshold(
250     "pragma-vectorize-scev-check-threshold", cl::init(128), cl::Hidden,
251     cl::desc("The maximum number of SCEV checks allowed with a "
252              "vectorize(enable) pragma"));
253 
254 /// Create an analysis remark that explains why vectorization failed
255 ///
256 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
257 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
258 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
259 /// the location of the remark.  \return the remark object that can be
260 /// streamed to.
261 static OptimizationRemarkAnalysis
262 createMissedAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop,
263                      Instruction *I = nullptr) {
264   Value *CodeRegion = TheLoop->getHeader();
265   DebugLoc DL = TheLoop->getStartLoc();
266 
267   if (I) {
268     CodeRegion = I->getParent();
269     // If there is no debug location attached to the instruction, fall back to
270     // using the loop's start location.
271     if (I->getDebugLoc())
272       DL = I->getDebugLoc();
273   }
274 
275   OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
276   R << "loop not vectorized: ";
277   return R;
278 }
279 
280 namespace {
281 
282 class LoopVectorizationLegality;
283 class LoopVectorizationCostModel;
284 class LoopVectorizationRequirements;
285 class VPInterleaveRecipe;
286 class VPReplicateRecipe;
287 class VPWidenIntOrFpInductionRecipe;
288 
289 } // end anonymous namespace
290 
291 /// Returns true if the given loop body has a cycle, excluding the loop
292 /// itself.
293 static bool hasCyclesInLoopBody(const Loop &L) {
294   if (!L.empty())
295     return true;
296 
297   for (const auto &SCC :
298        make_range(scc_iterator<Loop, LoopBodyTraits>::begin(L),
299                   scc_iterator<Loop, LoopBodyTraits>::end(L))) {
300     if (SCC.size() > 1) {
301       DEBUG(dbgs() << "LVL: Detected a cycle in the loop body:\n");
302       DEBUG(L.dump());
303       return true;
304     }
305   }
306   return false;
307 }
308 
309 /// A helper function for converting scalar types to vector types.
310 /// If the incoming type is void, we return void. If the VF is 1, we return
311 /// the scalar type.
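/// For example, ToVectorTy(i32, 4) yields <4 x i32>, while ToVectorTy(i32, 1)
/// and ToVectorTy(void, 4) return the i32 and void types unchanged
/// (illustrative).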
312 static Type *ToVectorTy(Type *Scalar, unsigned VF) {
313   if (Scalar->isVoidTy() || VF == 1)
314     return Scalar;
315   return VectorType::get(Scalar, VF);
316 }
317 
318 // FIXME: The following helper functions have multiple implementations
319 // in the project. They can be effectively organized in a common Load/Store
320 // utilities unit.
321 
322 /// A helper function that returns the pointer operand of a load or store
323 /// instruction.
324 static Value *getPointerOperand(Value *I) {
325   if (auto *LI = dyn_cast<LoadInst>(I))
326     return LI->getPointerOperand();
327   if (auto *SI = dyn_cast<StoreInst>(I))
328     return SI->getPointerOperand();
329   return nullptr;
330 }
331 
332 /// A helper function that returns the type of the loaded or stored value.
333 static Type *getMemInstValueType(Value *I) {
334   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
335          "Expected Load or Store instruction");
336   if (auto *LI = dyn_cast<LoadInst>(I))
337     return LI->getType();
338   return cast<StoreInst>(I)->getValueOperand()->getType();
339 }
340 
341 /// A helper function that returns the alignment of a load or store instruction.
342 static unsigned getMemInstAlignment(Value *I) {
343   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
344          "Expected Load or Store instruction");
345   if (auto *LI = dyn_cast<LoadInst>(I))
346     return LI->getAlignment();
347   return cast<StoreInst>(I)->getAlignment();
348 }
349 
350 /// A helper function that returns the address space of the pointer operand of
351 /// a load or store instruction.
352 static unsigned getMemInstAddressSpace(Value *I) {
353   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
354          "Expected Load or Store instruction");
355   if (auto *LI = dyn_cast<LoadInst>(I))
356     return LI->getPointerAddressSpace();
357   return cast<StoreInst>(I)->getPointerAddressSpace();
358 }
359 
360 /// A helper function that returns true if the given type is irregular. The
361 /// type is irregular if its allocated size doesn't equal the store size of an
362 /// element of the corresponding vector type at the given vectorization factor.
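/// For example (illustrative), an i1 value occupies a full byte in memory, so
/// its allocation size (8 bits) differs from its type size (1 bit) and it is
/// considered irregular; an i32, whose allocation and store sizes match at any
/// VF on common targets, is regular.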
363 static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
364   // Determine if an array of VF elements of type Ty is "bitcast compatible"
365   // with a <VF x Ty> vector.
366   if (VF > 1) {
367     auto *VectorTy = VectorType::get(Ty, VF);
368     return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
369   }
370 
371   // If the vectorization factor is one, we just check if an array of type Ty
372   // requires padding between elements.
373   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
374 }
375 
376 /// A helper function that returns the reciprocal of the block probability of
377 /// predicated blocks. If we return X, we are assuming the predicated block
378 /// will execute once for every X iterations of the loop header.
379 ///
380 /// TODO: We should use actual block probability here, if available. Currently,
381 ///       we always assume predicated blocks have a 50% chance of executing.
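/// For example, the cost model divides the cost of instructions in a
/// predicated block by this value, so with the default of 2 such instructions
/// are charged at roughly half their unconditional cost (a sketch of how the
/// value is consumed).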
382 static unsigned getReciprocalPredBlockProb() { return 2; }
383 
384 /// A helper function that adds a 'fast' flag to floating-point operations.
385 static Value *addFastMathFlag(Value *V) {
386   if (isa<FPMathOperator>(V)) {
387     FastMathFlags Flags;
388     Flags.setFast();
389     cast<Instruction>(V)->setFastMathFlags(Flags);
390   }
391   return V;
392 }
393 
394 /// A helper function that returns an integer or floating-point constant with
395 /// value C.
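/// For example, getSignedIntOrFpConstant(Int32Ty, -1) produces the i32
/// constant -1, while for a float type it produces the constant -1.0
/// (illustrative).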
396 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
397   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
398                            : ConstantFP::get(Ty, C);
399 }
400 
401 namespace llvm {
402 
403 /// InnerLoopVectorizer vectorizes loops which contain only one basic
404 /// block to a specified vectorization factor (VF).
405 /// This class performs the widening of scalars into vectors, or multiple
406 /// scalars. This class also implements the following features:
407 /// * It inserts an epilogue loop for handling loops that don't have iteration
408 ///   counts that are known to be a multiple of the vectorization factor.
409 /// * It handles the code generation for reduction variables.
410 /// * Scalarization (implementation using scalars) of un-vectorizable
411 ///   instructions.
412 /// InnerLoopVectorizer does not perform any vectorization-legality
413 /// checks, and relies on the caller to check for the different legality
414 /// aspects. The InnerLoopVectorizer relies on the
415 /// LoopVectorizationLegality class to provide information about the induction
416 /// and reduction variables that were found for a given vectorization factor.
417 class InnerLoopVectorizer {
418 public:
419   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
420                       LoopInfo *LI, DominatorTree *DT,
421                       const TargetLibraryInfo *TLI,
422                       const TargetTransformInfo *TTI, AssumptionCache *AC,
423                       OptimizationRemarkEmitter *ORE, unsigned VecWidth,
424                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
425                       LoopVectorizationCostModel *CM)
426       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
427         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
428         Builder(PSE.getSE()->getContext()),
429         VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
430   virtual ~InnerLoopVectorizer() = default;
431 
432   /// Create a new empty loop. Unlink the old loop and connect the new one.
433   /// Return the pre-header block of the new loop.
434   BasicBlock *createVectorizedLoopSkeleton();
435 
436   /// Widen a single instruction within the innermost loop.
437   void widenInstruction(Instruction &I);
438 
439   /// Fix the vectorized code, taking care of header phis, live-outs, and more.
440   void fixVectorizedLoop();
441 
442   // Return true if any runtime check is added.
443   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
444 
445   /// A type for vectorized values in the new loop. Each value from the
446   /// original loop, when vectorized, is represented by UF vector values in the
447   /// new unrolled loop, where UF is the unroll factor.
448   using VectorParts = SmallVector<Value *, 2>;
449 
450   /// A helper function that computes the predicate of the block BB, assuming
451   /// that the header block of the loop is set to True. It returns the *entry*
452   /// mask for the block BB.
453   VectorParts createBlockInMask(BasicBlock *BB);
454 
455   /// Vectorize a single PHINode in a block. This method handles the induction
456   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
457   /// arbitrary length vectors.
458   void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
459 
460   /// A helper function to scalarize a single Instruction in the innermost
461   /// loop. Generates a scalar instance of \p Instr for the unroll part and
462   /// vector lane specified by \p Instance, predicating the result if
463   /// \p IfPredicateInstr is true.
464   void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
465                             bool IfPredicateInstr);
466 
467   /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
468   /// is provided, the integer induction variable will first be truncated to
469   /// the corresponding type.
470   void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
471 
472   /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
473   /// vector or scalar value on-demand if one is not yet available. When
474   /// vectorizing a loop, we visit the definition of an instruction before its
475   /// uses. When visiting the definition, we either vectorize or scalarize the
476   /// instruction, creating an entry for it in the corresponding map. (In some
477   /// cases, such as induction variables, we will create both vector and scalar
478   /// entries.) Then, as we encounter uses of the definition, we derive values
479   /// for each scalar or vector use unless such a value is already available.
480   /// For example, if we scalarize a definition and one of its uses is vector,
481   /// we build the required vector on-demand with an insertelement sequence
482   /// when visiting the use. Otherwise, if the use is scalar, we can use the
483   /// existing scalar definition.
484   ///
485   /// Return a value in the new loop corresponding to \p V from the original
486   /// loop at unroll index \p Part. If the value has already been vectorized,
487   /// the corresponding vector entry in VectorLoopValueMap is returned. If,
488   /// however, the value has a scalar entry in VectorLoopValueMap, we construct
489   /// a new vector value on-demand by inserting the scalar values into a vector
490   /// with an insertelement sequence. If the value has been neither vectorized
491   /// nor scalarized, it must be loop invariant, so we simply broadcast the
492   /// value into a vector.
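  /// As a sketch (names are illustrative), if a scalarized i32 definition %a
  /// has per-lane copies %a.0 and %a.1 for VF = 2 and a vector use is later
  /// encountered, the value is packed on demand roughly as:
  ///   %v.0 = insertelement <2 x i32> undef, i32 %a.0, i32 0
  ///   %v.1 = insertelement <2 x i32> %v.0, i32 %a.1, i32 1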
493   Value *getOrCreateVectorValue(Value *V, unsigned Part);
494 
495   /// Return a value in the new loop corresponding to \p V from the original
496   /// loop at unroll and vector indices \p Instance. If the value has been
497   /// vectorized but not scalarized, the necessary extractelement instruction
498   /// will be generated.
499   Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
500 
501   /// Construct the vector value of a scalarized value \p V one lane at a time.
502   void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
503 
504   /// Try to vectorize the interleaved access group that \p Instr belongs to.
505   void vectorizeInterleaveGroup(Instruction *Instr);
506 
507 protected:
508   friend class LoopVectorizationPlanner;
509 
510   /// A small list of PHINodes.
511   using PhiVector = SmallVector<PHINode *, 4>;
512 
513   /// A type for scalarized values in the new loop. Each value from the
514   /// original loop, when scalarized, is represented by UF x VF scalar values
515   /// in the new unrolled loop, where UF is the unroll factor and VF is the
516   /// vectorization factor.
517   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
518 
519   // When we if-convert we need to create edge masks. We have to cache values
520   // so that we don't end up with exponential recursion/IR.
521   using EdgeMaskCacheTy =
522       DenseMap<std::pair<BasicBlock *, BasicBlock *>, VectorParts>;
523   using BlockMaskCacheTy = DenseMap<BasicBlock *, VectorParts>;
524 
525   /// Set up the values of the IVs correctly when exiting the vector loop.
526   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
527                     Value *CountRoundDown, Value *EndValue,
528                     BasicBlock *MiddleBlock);
529 
530   /// Create a new induction variable inside L.
531   PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
532                                    Value *Step, Instruction *DL);
533 
534   /// Handle all cross-iteration phis in the header.
535   void fixCrossIterationPHIs();
536 
537   /// Fix a first-order recurrence. This is the second phase of vectorizing
538   /// this phi node.
539   void fixFirstOrderRecurrence(PHINode *Phi);
540 
541   /// Fix a reduction cross-iteration phi. This is the second phase of
542   /// vectorizing this phi node.
543   void fixReduction(PHINode *Phi);
544 
545   /// \brief The loop exit block may have single-value PHI nodes with some
546   /// incoming value. While vectorizing we only handle real values
547   /// that were defined inside the loop, and each such PHI should have one
548   /// value for each predecessor of its parent basic block. See PR14725.
549   void fixLCSSAPHIs();
550 
551   /// Iteratively sink the scalarized operands of a predicated instruction into
552   /// the block that was created for it.
553   void sinkScalarOperands(Instruction *PredInst);
554 
555   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
556   /// represented as.
557   void truncateToMinimalBitwidths();
558 
559   /// A helper function that computes the predicate of the edge between SRC
560   /// and DST.
561   VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst);
562 
563   /// Insert the new loop to the loop hierarchy and pass manager
564   /// and update the analysis passes.
565   void updateAnalysis();
566 
567   /// Vectorize Load and Store instructions.
568   virtual void vectorizeMemoryInstruction(Instruction *Instr);
569 
570   /// Create a broadcast instruction. This method generates a broadcast
571   /// instruction (shuffle) for loop invariant values and for the induction
572   /// value. If this is the induction variable then we extend it to N, N+1, ...
573   /// this is needed because each iteration in the loop corresponds to a SIMD
574   /// element.
575   virtual Value *getBroadcastInstrs(Value *V);
576 
577   /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
578   /// to each vector element of Val. The sequence starts at StartIdx.
579   /// \p Opcode is only relevant for FP induction variables.
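  /// For example (illustrative), with a splat Val of <0, 0, 0, 0>, StartIdx = 0
  /// and Step = 1, the result is the vector <0, 1, 2, 3>.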
580   virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
581                                Instruction::BinaryOps Opcode =
582                                Instruction::BinaryOpsEnd);
583 
584   /// Compute scalar induction steps. \p ScalarIV is the scalar induction
585   /// variable on which to base the steps, \p Step is the size of the step, and
586   /// \p EntryVal is the value from the original loop that maps to the steps.
587   /// Note that \p EntryVal doesn't have to be an induction variable (e.g., it
588   /// can be a truncate instruction).
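  /// For example (illustrative), for a scalar IV with value %iv, Step = 1,
  /// VF = 4 and UF = 1, the scalar steps for the lanes of part 0 are
  /// %iv + 0, %iv + 1, %iv + 2 and %iv + 3.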
589   void buildScalarSteps(Value *ScalarIV, Value *Step, Value *EntryVal,
590                         const InductionDescriptor &ID);
591 
592   /// Create a vector induction phi node based on an existing scalar one. \p
593   /// EntryVal is the value from the original loop that maps to the vector phi
594   /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
595   /// truncate instruction, instead of widening the original IV, we widen a
596   /// version of the IV truncated to \p EntryVal's type.
597   void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
598                                        Value *Step, Instruction *EntryVal);
599 
600   /// Returns true if an instruction \p I should be scalarized instead of
601   /// vectorized for the chosen vectorization factor.
602   bool shouldScalarizeInstruction(Instruction *I) const;
603 
604   /// Returns true if we should generate a scalar version of \p IV.
605   bool needsScalarInduction(Instruction *IV) const;
606 
607   /// Generate a shuffle sequence that will reverse the vector Vec.
608   virtual Value *reverseVector(Value *Vec);
609 
610   /// Returns (and creates if needed) the original loop trip count.
611   Value *getOrCreateTripCount(Loop *NewLoop);
612 
613   /// Returns (and creates if needed) the trip count of the widened loop.
614   Value *getOrCreateVectorTripCount(Loop *NewLoop);
615 
616   /// Returns a bitcasted value to the requested vector type.
617   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
618   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
619                                 const DataLayout &DL);
620 
621   /// Emit a bypass check to see if the vector trip count is zero, including if
622   /// it overflows.
623   void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
624 
625   /// Emit a bypass check to see if all of the SCEV assumptions we've
626   /// had to make are correct.
627   void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
628 
629   /// Emit bypass checks to check any memory assumptions we may have made.
630   void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
631 
632   /// Add additional metadata to \p To that was not present on \p Orig.
633   ///
634   /// Currently this is used to add the noalias annotations based on the
635   /// inserted memchecks.  Use this for instructions that are *cloned* into the
636   /// vector loop.
637   void addNewMetadata(Instruction *To, const Instruction *Orig);
638 
639   /// Add metadata from one instruction to another.
640   ///
641   /// This includes both the original MDs from \p From and additional ones (\see
642   /// addNewMetadata).  Use this for *newly created* instructions in the vector
643   /// loop.
644   void addMetadata(Instruction *To, Instruction *From);
645 
646   /// \brief Similar to the previous function but it adds the metadata to a
647   /// vector of instructions.
648   void addMetadata(ArrayRef<Value *> To, Instruction *From);
649 
650   /// \brief Set the debug location in the builder using the debug location in
651   /// the instruction.
652   void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
653 
654   /// The original loop.
655   Loop *OrigLoop;
656 
657   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
658   /// dynamic knowledge to simplify SCEV expressions and converts them to a
659   /// more usable form.
660   PredicatedScalarEvolution &PSE;
661 
662   /// Loop Info.
663   LoopInfo *LI;
664 
665   /// Dominator Tree.
666   DominatorTree *DT;
667 
668   /// Alias Analysis.
669   AliasAnalysis *AA;
670 
671   /// Target Library Info.
672   const TargetLibraryInfo *TLI;
673 
674   /// Target Transform Info.
675   const TargetTransformInfo *TTI;
676 
677   /// Assumption Cache.
678   AssumptionCache *AC;
679 
680   /// Interface to emit optimization remarks.
681   OptimizationRemarkEmitter *ORE;
682 
683   /// \brief LoopVersioning.  It's only set up (non-null) if memchecks were
684   /// used.
685   ///
686   /// This is currently only used to add no-alias metadata based on the
687   /// memchecks.  The actual versioning is performed manually.
688   std::unique_ptr<LoopVersioning> LVer;
689 
690   /// The vectorization SIMD factor to use. Each vector will have this many
691   /// vector elements.
692   unsigned VF;
693 
694   /// The vectorization unroll factor to use. Each scalar is vectorized to this
695   /// many different vector instructions.
696   unsigned UF;
697 
698   /// The IR builder that we use.
699   IRBuilder<> Builder;
700 
701   // --- Vectorization state ---
702 
703   /// The vector-loop preheader.
704   BasicBlock *LoopVectorPreHeader;
705 
706   /// The scalar-loop preheader.
707   BasicBlock *LoopScalarPreHeader;
708 
709   /// Middle Block between the vector and the scalar.
710   BasicBlock *LoopMiddleBlock;
711 
712   /// The ExitBlock of the scalar loop.
713   BasicBlock *LoopExitBlock;
714 
715   /// The vector loop body.
716   BasicBlock *LoopVectorBody;
717 
718   /// The scalar loop body.
719   BasicBlock *LoopScalarBody;
720 
721   /// A list of all bypass blocks. The first block is the entry of the loop.
722   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
723 
724   /// The new Induction variable which was added to the new block.
725   PHINode *Induction = nullptr;
726 
727   /// The induction variable of the old basic block.
728   PHINode *OldInduction = nullptr;
729 
730   /// Maps values from the original loop to their corresponding values in the
731   /// vectorized loop. A key value can map to either vector values, scalar
732   /// values or both kinds of values, depending on whether the key was
733   /// vectorized and scalarized.
734   VectorizerValueMap VectorLoopValueMap;
735 
736   /// Store instructions that were predicated.
737   SmallVector<Instruction *, 4> PredicatedInstructions;
738 
739   EdgeMaskCacheTy EdgeMaskCache;
740   BlockMaskCacheTy BlockMaskCache;
741 
742   /// Trip count of the original loop.
743   Value *TripCount = nullptr;
744 
745   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
746   Value *VectorTripCount = nullptr;
747 
748   /// The legality analysis.
749   LoopVectorizationLegality *Legal;
750 
751   /// The profitability analysis.
752   LoopVectorizationCostModel *Cost;
753 
754   // Record whether runtime checks are added.
755   bool AddedSafetyChecks = false;
756 
757   // Holds the end values for each induction variable. We save the end values
758   // so we can later fix-up the external users of the induction variables.
759   DenseMap<PHINode *, Value *> IVEndValues;
760 };
761 
762 class InnerLoopUnroller : public InnerLoopVectorizer {
763 public:
764   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
765                     LoopInfo *LI, DominatorTree *DT,
766                     const TargetLibraryInfo *TLI,
767                     const TargetTransformInfo *TTI, AssumptionCache *AC,
768                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
769                     LoopVectorizationLegality *LVL,
770                     LoopVectorizationCostModel *CM)
771       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
772                             UnrollFactor, LVL, CM) {}
773 
774 private:
775   Value *getBroadcastInstrs(Value *V) override;
776   Value *getStepVector(Value *Val, int StartIdx, Value *Step,
777                        Instruction::BinaryOps Opcode =
778                        Instruction::BinaryOpsEnd) override;
779   Value *reverseVector(Value *Vec) override;
780 };
781 
782 } // end namespace llvm
783 
784 /// \brief Look for a meaningful debug location on the instruction or its
785 /// operands.
786 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
787   if (!I)
788     return I;
789 
790   DebugLoc Empty;
791   if (I->getDebugLoc() != Empty)
792     return I;
793 
794   for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
795     if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
796       if (OpInst->getDebugLoc() != Empty)
797         return OpInst;
798   }
799 
800   return I;
801 }
802 
803 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
804   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
805     const DILocation *DIL = Inst->getDebugLoc();
806     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
807         !isa<DbgInfoIntrinsic>(Inst))
808       B.SetCurrentDebugLocation(DIL->cloneWithDuplicationFactor(UF * VF));
809     else
810       B.SetCurrentDebugLocation(DIL);
811   } else
812     B.SetCurrentDebugLocation(DebugLoc());
813 }
814 
815 #ifndef NDEBUG
816 /// \return string containing a file name and a line # for the given loop.
817 static std::string getDebugLocString(const Loop *L) {
818   std::string Result;
819   if (L) {
820     raw_string_ostream OS(Result);
821     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
822       LoopDbgLoc.print(OS);
823     else
824       // Just print the module name.
825       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
826     OS.flush();
827   }
828   return Result;
829 }
830 #endif
831 
832 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
833                                          const Instruction *Orig) {
834   // If the loop was versioned with memchecks, add the corresponding no-alias
835   // metadata.
836   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
837     LVer->annotateInstWithNoAlias(To, Orig);
838 }
839 
840 void InnerLoopVectorizer::addMetadata(Instruction *To,
841                                       Instruction *From) {
842   propagateMetadata(To, From);
843   addNewMetadata(To, From);
844 }
845 
846 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
847                                       Instruction *From) {
848   for (Value *V : To) {
849     if (Instruction *I = dyn_cast<Instruction>(V))
850       addMetadata(I, From);
851   }
852 }
853 
854 namespace {
855 
856 /// \brief The group of interleaved loads/stores sharing the same stride and
857 /// close to each other.
858 ///
859 /// Each member in this group has an index starting from 0, and the largest
860 /// index should be less than the interleave factor, which is equal to the absolute
861 /// value of the access's stride.
862 ///
863 /// E.g. An interleaved load group of factor 4:
864 ///        for (unsigned i = 0; i < 1024; i+=4) {
865 ///          a = A[i];                           // Member of index 0
866 ///          b = A[i+1];                         // Member of index 1
867 ///          d = A[i+3];                         // Member of index 3
868 ///          ...
869 ///        }
870 ///
871 ///      An interleaved store group of factor 4:
872 ///        for (unsigned i = 0; i < 1024; i+=4) {
873 ///          ...
874 ///          A[i]   = a;                         // Member of index 0
875 ///          A[i+1] = b;                         // Member of index 1
876 ///          A[i+2] = c;                         // Member of index 2
877 ///          A[i+3] = d;                         // Member of index 3
878 ///        }
879 ///
880 /// Note: the interleaved load group could have gaps (missing members), but
881 /// the interleaved store group doesn't allow gaps.
882 class InterleaveGroup {
883 public:
884   InterleaveGroup(Instruction *Instr, int Stride, unsigned Align)
885       : Align(Align), InsertPos(Instr) {
886     assert(Align && "The alignment should be non-zero");
887 
888     Factor = std::abs(Stride);
889     assert(Factor > 1 && "Invalid interleave factor");
890 
891     Reverse = Stride < 0;
892     Members[0] = Instr;
893   }
894 
895   bool isReverse() const { return Reverse; }
896   unsigned getFactor() const { return Factor; }
897   unsigned getAlignment() const { return Align; }
898   unsigned getNumMembers() const { return Members.size(); }
899 
900   /// \brief Try to insert a new member \p Instr with index \p Index and
901   /// alignment \p NewAlign. The index is relative to the leader and can be
902   /// negative if the new member becomes the new leader.
903   ///
904   /// \returns false if the instruction doesn't belong to the group.
905   bool insertMember(Instruction *Instr, int Index, unsigned NewAlign) {
906     assert(NewAlign && "The new member's alignment should be non-zero");
907 
908     int Key = Index + SmallestKey;
909 
910     // Skip if there is already a member with the same index.
911     if (Members.count(Key))
912       return false;
913 
914     if (Key > LargestKey) {
915       // The largest index is always less than the interleave factor.
916       if (Index >= static_cast<int>(Factor))
917         return false;
918 
919       LargestKey = Key;
920     } else if (Key < SmallestKey) {
921       // Make sure the distance between the largest and smallest keys still
922       // fits within the interleave factor.
922       if (LargestKey - Key >= static_cast<int>(Factor))
923         return false;
924 
925       SmallestKey = Key;
926     }
927 
928     // It's always safe to select the minimum alignment.
929     Align = std::min(Align, NewAlign);
930     Members[Key] = Instr;
931     return true;
932   }
933 
934   /// \brief Get the member with the given index \p Index
935   ///
936   /// \returns nullptr if the group contains no such member.
937   Instruction *getMember(unsigned Index) const {
938     int Key = SmallestKey + Index;
939     if (!Members.count(Key))
940       return nullptr;
941 
942     return Members.find(Key)->second;
943   }
944 
945   /// \brief Get the index for the given member. Unlike the key in the member
946   /// map, the index starts from 0.
947   unsigned getIndex(Instruction *Instr) const {
948     for (auto I : Members)
949       if (I.second == Instr)
950         return I.first - SmallestKey;
951 
952     llvm_unreachable("InterleaveGroup contains no such member");
953   }
954 
955   Instruction *getInsertPos() const { return InsertPos; }
956   void setInsertPos(Instruction *Inst) { InsertPos = Inst; }
957 
958 private:
959   unsigned Factor; // Interleave Factor.
960   bool Reverse;
961   unsigned Align;
962   DenseMap<int, Instruction *> Members;
963   int SmallestKey = 0;
964   int LargestKey = 0;
965 
966   // To avoid breaking dependences, vectorized instructions of an interleave
967   // group should be inserted at either the first load or the last store in
968   // program order.
969   //
970   // E.g. %even = load i32             // Insert Position
971   //      %add = add i32 %even         // Use of %even
972   //      %odd = load i32
973   //
974   //      store i32 %even
975   //      %odd = add i32               // Def of %odd
976   //      store i32 %odd               // Insert Position
977   Instruction *InsertPos;
978 };
979 
980 /// \brief Drive the analysis of interleaved memory accesses in the loop.
981 ///
982 /// Use this class to analyze interleaved accesses only when we can vectorize
983 /// a loop. Otherwise the analysis is not meaningful, since vectorizing
984 /// interleaved accesses in such a loop would be unsafe.
985 ///
986 /// The analysis collects interleave groups and records the relationships
987 /// between the member and the group in a map.
988 class InterleavedAccessInfo {
989 public:
990   InterleavedAccessInfo(PredicatedScalarEvolution &PSE, Loop *L,
991                         DominatorTree *DT, LoopInfo *LI)
992       : PSE(PSE), TheLoop(L), DT(DT), LI(LI) {}
993 
994   ~InterleavedAccessInfo() {
995     SmallSet<InterleaveGroup *, 4> DelSet;
996     // Avoid releasing a pointer twice.
997     for (auto &I : InterleaveGroupMap)
998       DelSet.insert(I.second);
999     for (auto *Ptr : DelSet)
1000       delete Ptr;
1001   }
1002 
1003   /// \brief Analyze the interleaved accesses and collect them in interleave
1004   /// groups. Substitute symbolic strides using \p Strides.
1005   void analyzeInterleaving(const ValueToValueMap &Strides);
1006 
1007   /// \brief Check if \p Instr belongs to any interleave group.
1008   bool isInterleaved(Instruction *Instr) const {
1009     return InterleaveGroupMap.count(Instr);
1010   }
1011 
1012   /// \brief Get the interleave group that \p Instr belongs to.
1013   ///
1014   /// \returns nullptr if \p Instr does not belong to any group.
1015   InterleaveGroup *getInterleaveGroup(Instruction *Instr) const {
1016     if (InterleaveGroupMap.count(Instr))
1017       return InterleaveGroupMap.find(Instr)->second;
1018     return nullptr;
1019   }
1020 
1021   /// \brief Returns true if an interleaved group that may access memory
1022   /// out-of-bounds requires a scalar epilogue iteration for correctness.
1023   bool requiresScalarEpilogue() const { return RequiresScalarEpilogue; }
1024 
1025   /// \brief Initialize the LoopAccessInfo used for dependence checking.
1026   void setLAI(const LoopAccessInfo *Info) { LAI = Info; }
1027 
1028 private:
1029   /// A wrapper around ScalarEvolution, used to add runtime SCEV checks.
1030   /// Simplifies SCEV expressions in the context of existing SCEV assumptions.
1031   /// The interleaved access analysis can also add new predicates (for example
1032   /// by versioning strides of pointers).
1033   PredicatedScalarEvolution &PSE;
1034 
1035   Loop *TheLoop;
1036   DominatorTree *DT;
1037   LoopInfo *LI;
1038   const LoopAccessInfo *LAI = nullptr;
1039 
1040   /// True if the loop may contain non-reversed interleaved groups with
1041   /// out-of-bounds accesses. We ensure we don't speculatively access memory
1042   /// out-of-bounds by executing at least one scalar epilogue iteration.
1043   bool RequiresScalarEpilogue = false;
1044 
1045   /// Holds the relationships between the members and the interleave group.
1046   DenseMap<Instruction *, InterleaveGroup *> InterleaveGroupMap;
1047 
1048   /// Holds dependences among the memory accesses in the loop. It maps a source
1049   /// access to a set of dependent sink accesses.
1050   DenseMap<Instruction *, SmallPtrSet<Instruction *, 2>> Dependences;
1051 
1052   /// \brief The descriptor for a strided memory access.
1053   struct StrideDescriptor {
1054     StrideDescriptor() = default;
1055     StrideDescriptor(int64_t Stride, const SCEV *Scev, uint64_t Size,
1056                      unsigned Align)
1057         : Stride(Stride), Scev(Scev), Size(Size), Align(Align) {}
1058 
1059     // The access's stride. It is negative for a reverse access.
1060     int64_t Stride = 0;
1061 
1062     // The scalar expression of this access.
1063     const SCEV *Scev = nullptr;
1064 
1065     // The size of the memory object.
1066     uint64_t Size = 0;
1067 
1068     // The alignment of this access.
1069     unsigned Align = 0;
1070   };
1071 
1072   /// \brief A type for holding instructions and their stride descriptors.
1073   using StrideEntry = std::pair<Instruction *, StrideDescriptor>;
1074 
1075   /// \brief Create a new interleave group with the given instruction \p Instr,
1076   /// stride \p Stride and alignment \p Align.
1077   ///
1078   /// \returns the newly created interleave group.
1079   InterleaveGroup *createInterleaveGroup(Instruction *Instr, int Stride,
1080                                          unsigned Align) {
1081     assert(!InterleaveGroupMap.count(Instr) &&
1082            "Already in an interleaved access group");
1083     InterleaveGroupMap[Instr] = new InterleaveGroup(Instr, Stride, Align);
1084     return InterleaveGroupMap[Instr];
1085   }
1086 
1087   /// \brief Release the group and remove all the relationships.
1088   void releaseGroup(InterleaveGroup *Group) {
1089     for (unsigned i = 0; i < Group->getFactor(); i++)
1090       if (Instruction *Member = Group->getMember(i))
1091         InterleaveGroupMap.erase(Member);
1092 
1093     delete Group;
1094   }
1095 
1096   /// \brief Collect all the accesses with a constant stride in program order.
1097   void collectConstStrideAccesses(
1098       MapVector<Instruction *, StrideDescriptor> &AccessStrideInfo,
1099       const ValueToValueMap &Strides);
1100 
1101   /// \brief Returns true if \p Stride is allowed in an interleaved group.
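  /// For example (illustrative), a stride of -3 corresponds to a factor of 3
  /// and is allowed as long as MaxInterleaveGroupFactor is at least 3; strides
  /// of 0 and 1 are not considered strided.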
1102   static bool isStrided(int Stride) {
1103     unsigned Factor = std::abs(Stride);
1104     return Factor >= 2 && Factor <= MaxInterleaveGroupFactor;
1105   }
1106 
1107   /// \brief Returns true if \p BB is a predicated block.
1108   bool isPredicated(BasicBlock *BB) const {
1109     return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
1110   }
1111 
1112   /// \brief Returns true if LoopAccessInfo can be used for dependence queries.
1113   bool areDependencesValid() const {
1114     return LAI && LAI->getDepChecker().getDependences();
1115   }
1116 
1117   /// \brief Returns true if memory accesses \p A and \p B can be reordered, if
1118   /// necessary, when constructing interleaved groups.
1119   ///
1120   /// \p A must precede \p B in program order. We return true if reordering is
1121   /// legal or unnecessary, and false if \p A and \p B may be dependent.
1122   bool canReorderMemAccessesForInterleavedGroups(StrideEntry *A,
1123                                                  StrideEntry *B) const {
1124     // Code motion for interleaved accesses can potentially hoist strided loads
1125     // and sink strided stores. The code below checks the legality of the
1126     // following two conditions:
1127     //
1128     // 1. Potentially moving a strided load (B) before any store (A) that
1129     //    precedes B, or
1130     //
1131     // 2. Potentially moving a strided store (A) after any load or store (B)
1132     //    that A precedes.
1133     //
1134     // It's legal to reorder A and B if we know there isn't a dependence from A
1135     // to B. Note that this determination is conservative since some
1136     // dependences could potentially be reordered safely.
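    // As a sketch (the accesses are illustrative), given
    //   A: store i32 %x, i32* %p    ; a strided store
    //   B: %y = load i32, i32* %q   ; a later load
    // sinking A below B is only allowed if the recorded dependences prove
    // there is no dependence from A to B.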
1137 
1138     // A is potentially the source of a dependence.
1139     auto *Src = A->first;
1140     auto SrcDes = A->second;
1141 
1142     // B is potentially the sink of a dependence.
1143     auto *Sink = B->first;
1144     auto SinkDes = B->second;
1145 
1146     // Code motion for interleaved accesses can't violate WAR dependences.
1147     // Thus, reordering is legal if the source isn't a write.
1148     if (!Src->mayWriteToMemory())
1149       return true;
1150 
1151     // At least one of the accesses must be strided.
1152     if (!isStrided(SrcDes.Stride) && !isStrided(SinkDes.Stride))
1153       return true;
1154 
1155     // If dependence information is not available from LoopAccessInfo,
1156     // conservatively assume the instructions can't be reordered.
1157     if (!areDependencesValid())
1158       return false;
1159 
1160     // If we know there is a dependence from source to sink, assume the
1161     // instructions can't be reordered. Otherwise, reordering is legal.
1162     return !Dependences.count(Src) || !Dependences.lookup(Src).count(Sink);
1163   }
1164 
1165   /// \brief Collect the dependences from LoopAccessInfo.
1166   ///
1167   /// We process the dependences once during the interleaved access analysis to
1168   /// enable constant-time dependence queries.
1169   void collectDependences() {
1170     if (!areDependencesValid())
1171       return;
1172     auto *Deps = LAI->getDepChecker().getDependences();
1173     for (auto Dep : *Deps)
1174       Dependences[Dep.getSource(*LAI)].insert(Dep.getDestination(*LAI));
1175   }
1176 };
1177 
1178 /// Utility class for getting and setting loop vectorizer hints in the form
1179 /// of loop metadata.
1180 /// This class keeps a number of loop annotations locally (as member variables)
1181 /// and can, upon request, write them back as metadata on the loop. It will
1182 /// initially scan the loop for existing metadata, and will update the local
1183 /// values based on information in the loop.
1184 /// We cannot write all values to metadata, as the mere presence of some info,
1185 /// for example 'force', means a decision has been made. So, we need to be
1186 /// careful NOT to add them unless the user has specifically asked for them.
1187 class LoopVectorizeHints {
1188   enum HintKind { HK_WIDTH, HK_UNROLL, HK_FORCE, HK_ISVECTORIZED };
1189 
1190   /// Hint - associates name and validation with the hint value.
1191   struct Hint {
1192     const char *Name;
1193     unsigned Value; // This may have to change for non-numeric values.
1194     HintKind Kind;
1195 
1196     Hint(const char *Name, unsigned Value, HintKind Kind)
1197         : Name(Name), Value(Value), Kind(Kind) {}
1198 
1199     bool validate(unsigned Val) {
1200       switch (Kind) {
1201       case HK_WIDTH:
1202         return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth;
1203       case HK_UNROLL:
1204         return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor;
1205       case HK_FORCE:
1206         return (Val <= 1);
1207       case HK_ISVECTORIZED:
1208         return (Val==0 || Val==1);
1209       }
1210       return false;
1211     }
1212   };
1213 
1214   /// Vectorization width.
1215   Hint Width;
1216 
1217   /// Vectorization interleave factor.
1218   Hint Interleave;
1219 
1220   /// Whether vectorization is forced.
1221   Hint Force;
1222 
1223   /// Already Vectorized
1224   Hint IsVectorized;
1225 
1226   /// Return the loop metadata prefix.
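  /// For example, a width hint appears in the loop metadata roughly as
  /// !{!"llvm.loop.vectorize.width", i32 4} (illustrative).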
1227   static StringRef Prefix() { return "llvm.loop."; }
1228 
1229   /// True if there is any unsafe math in the loop.
1230   bool PotentiallyUnsafe = false;
1231 
1232 public:
1233   enum ForceKind {
1234     FK_Undefined = -1, ///< Not selected.
1235     FK_Disabled = 0,   ///< Forcing disabled.
1236     FK_Enabled = 1,    ///< Forcing enabled.
1237   };
1238 
1239   LoopVectorizeHints(const Loop *L, bool DisableInterleaving,
1240                      OptimizationRemarkEmitter &ORE)
1241       : Width("vectorize.width", VectorizerParams::VectorizationFactor,
1242               HK_WIDTH),
1243         Interleave("interleave.count", DisableInterleaving, HK_UNROLL),
1244         Force("vectorize.enable", FK_Undefined, HK_FORCE),
1245         IsVectorized("isvectorized", 0, HK_ISVECTORIZED), TheLoop(L), ORE(ORE) {
1246     // Populate values with existing loop metadata.
1247     getHintsFromMetadata();
1248 
1249     // force-vector-interleave overrides DisableInterleaving.
1250     if (VectorizerParams::isInterleaveForced())
1251       Interleave.Value = VectorizerParams::VectorizationInterleave;
1252 
1253     if (IsVectorized.Value != 1)
1254       // If the vectorization width and interleaving count are both 1 then
1255       // consider the loop to have been already vectorized because there's
1256       // nothing more that we can do.
1257       IsVectorized.Value = Width.Value == 1 && Interleave.Value == 1;
1258     DEBUG(if (DisableInterleaving && Interleave.Value == 1) dbgs()
1259           << "LV: Interleaving disabled by the pass manager\n");
1260   }
1261 
1262   /// Mark the loop L as already vectorized by recording the 'isvectorized' hint.
1263   void setAlreadyVectorized() {
1264     IsVectorized.Value = 1;
1265     Hint Hints[] = {IsVectorized};
1266     writeHintsToMetadata(Hints);
1267   }
1268 
1269   bool allowVectorization(Function *F, Loop *L, bool AlwaysVectorize) const {
1270     if (getForce() == LoopVectorizeHints::FK_Disabled) {
1271       DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n");
1272       emitRemarkWithHints();
1273       return false;
1274     }
1275 
1276     if (!AlwaysVectorize && getForce() != LoopVectorizeHints::FK_Enabled) {
1277       DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n");
1278       emitRemarkWithHints();
1279       return false;
1280     }
1281 
1282     if (getIsVectorized() == 1) {
1283       DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
1284       // FIXME: Add interleave.disable metadata. This will allow
1285       // vectorize.disable to be used without disabling the pass and errors
1286       // to differentiate between disabled vectorization and a width of 1.
1287       ORE.emit([&]() {
1288         return OptimizationRemarkAnalysis(vectorizeAnalysisPassName(),
1289                                           "AllDisabled", L->getStartLoc(),
1290                                           L->getHeader())
1291                << "loop not vectorized: vectorization and interleaving are "
1292                   "explicitly disabled, or the loop has already been "
1293                   "vectorized";
1294       });
1295       return false;
1296     }
1297 
1298     return true;
1299   }
1300 
1301   /// Dumps all the hint information.
1302   void emitRemarkWithHints() const {
1303     using namespace ore;
1304 
1305     ORE.emit([&]() {
1306       if (Force.Value == LoopVectorizeHints::FK_Disabled)
1307         return OptimizationRemarkMissed(LV_NAME, "MissedExplicitlyDisabled",
1308                                         TheLoop->getStartLoc(),
1309                                         TheLoop->getHeader())
1310                << "loop not vectorized: vectorization is explicitly disabled";
1311       else {
1312         OptimizationRemarkMissed R(LV_NAME, "MissedDetails",
1313                                    TheLoop->getStartLoc(),
1314                                    TheLoop->getHeader());
1315         R << "loop not vectorized";
1316         if (Force.Value == LoopVectorizeHints::FK_Enabled) {
1317           R << " (Force=" << NV("Force", true);
1318           if (Width.Value != 0)
1319             R << ", Vector Width=" << NV("VectorWidth", Width.Value);
1320           if (Interleave.Value != 0)
1321             R << ", Interleave Count="
1322               << NV("InterleaveCount", Interleave.Value);
1323           R << ")";
1324         }
1325         return R;
1326       }
1327     });
1328   }
1329 
1330   unsigned getWidth() const { return Width.Value; }
1331   unsigned getInterleave() const { return Interleave.Value; }
1332   unsigned getIsVectorized() const { return IsVectorized.Value; }
1333   enum ForceKind getForce() const { return (ForceKind)Force.Value; }
1334 
1335   /// \brief If hints are provided that force vectorization, use the AlwaysPrint
1336   /// pass name to force the frontend to print the diagnostic.
1337   const char *vectorizeAnalysisPassName() const {
1338     if (getWidth() == 1)
1339       return LV_NAME;
1340     if (getForce() == LoopVectorizeHints::FK_Disabled)
1341       return LV_NAME;
1342     if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth() == 0)
1343       return LV_NAME;
1344     return OptimizationRemarkAnalysis::AlwaysPrint;
1345   }
1346 
1347   bool allowReordering() const {
    // When loop hints that enable vectorization are provided, we allow the
    // vectorizer to change the order of operations given by the scalar loop.
    // This is not enabled by default because it can be unsafe or inefficient.
    // For example,
1351     // reordering floating-point operations will change the way round-off
1352     // error accumulates in the loop.
1353     return getForce() == LoopVectorizeHints::FK_Enabled || getWidth() > 1;
1354   }
1355 
1356   bool isPotentiallyUnsafe() const {
1357     // Avoid FP vectorization if the target is unsure about proper support.
1358     // This may be related to the SIMD unit in the target not handling
1359     // IEEE 754 FP ops properly, or bad single-to-double promotions.
1360     // Otherwise, a sequence of vectorized loops, even without reduction,
1361     // could lead to different end results on the destination vectors.
1362     return getForce() != LoopVectorizeHints::FK_Enabled && PotentiallyUnsafe;
1363   }
1364 
1365   void setPotentiallyUnsafe() { PotentiallyUnsafe = true; }
1366 
1367 private:
1368   /// Find hints specified in the loop metadata and update local values.
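  /// As an illustrative sketch (the exact node layout may differ), the hints
  /// parsed here are attached to the loop as metadata of roughly this form:
  ///
  ///   !llvm.loop !0
  ///   !0 = distinct !{!0, !1, !2}
  ///   !1 = !{!"llvm.loop.vectorize.width", i32 4}
  ///   !2 = !{!"llvm.loop.interleave.count", i32 2}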
1369   void getHintsFromMetadata() {
1370     MDNode *LoopID = TheLoop->getLoopID();
1371     if (!LoopID)
1372       return;
1373 
1374     // First operand should refer to the loop id itself.
1375     assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
1376     assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
1377 
1378     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
1379       const MDString *S = nullptr;
1380       SmallVector<Metadata *, 4> Args;
1381 
      // The expected hint is either an MDString or an MDNode whose first
      // operand is an MDString.
1384       if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) {
1385         if (!MD || MD->getNumOperands() == 0)
1386           continue;
1387         S = dyn_cast<MDString>(MD->getOperand(0));
1388         for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i)
1389           Args.push_back(MD->getOperand(i));
1390       } else {
1391         S = dyn_cast<MDString>(LoopID->getOperand(i));
1392         assert(Args.size() == 0 && "too many arguments for MDString");
1393       }
1394 
1395       if (!S)
1396         continue;
1397 
1398       // Check if the hint starts with the loop metadata prefix.
1399       StringRef Name = S->getString();
1400       if (Args.size() == 1)
1401         setHint(Name, Args[0]);
1402     }
1403   }
1404 
  /// Checks a string hint with one operand and sets the value if valid.
1406   void setHint(StringRef Name, Metadata *Arg) {
1407     if (!Name.startswith(Prefix()))
1408       return;
1409     Name = Name.substr(Prefix().size(), StringRef::npos);
1410 
1411     const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg);
1412     if (!C)
1413       return;
1414     unsigned Val = C->getZExtValue();
1415 
1416     Hint *Hints[] = {&Width, &Interleave, &Force, &IsVectorized};
1417     for (auto H : Hints) {
1418       if (Name == H->Name) {
1419         if (H->validate(Val))
1420           H->Value = Val;
1421         else
1422           DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n");
1423         break;
1424       }
1425     }
1426   }
1427 
1428   /// Create a new hint from name / value pair.
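  /// For example (illustrative), createHintMetadata("llvm.loop.isvectorized", 1)
  /// produces a node of the form !{!"llvm.loop.isvectorized", i32 1}.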
1429   MDNode *createHintMetadata(StringRef Name, unsigned V) const {
1430     LLVMContext &Context = TheLoop->getHeader()->getContext();
1431     Metadata *MDs[] = {MDString::get(Context, Name),
1432                        ConstantAsMetadata::get(
1433                            ConstantInt::get(Type::getInt32Ty(Context), V))};
1434     return MDNode::get(Context, MDs);
1435   }
1436 
1437   /// Matches metadata with hint name.
1438   bool matchesHintMetadataName(MDNode *Node, ArrayRef<Hint> HintTypes) {
1439     MDString *Name = dyn_cast<MDString>(Node->getOperand(0));
1440     if (!Name)
1441       return false;
1442 
1443     for (auto H : HintTypes)
1444       if (Name->getString().endswith(H.Name))
1445         return true;
1446     return false;
1447   }
1448 
1449   /// Sets current hints into loop metadata, keeping other values intact.
1450   void writeHintsToMetadata(ArrayRef<Hint> HintTypes) {
1451     if (HintTypes.empty())
1452       return;
1453 
    // Reserve the first element for LoopID (see below).
1455     SmallVector<Metadata *, 4> MDs(1);
1456     // If the loop already has metadata, then ignore the existing operands.
1457     MDNode *LoopID = TheLoop->getLoopID();
1458     if (LoopID) {
1459       for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
1460         MDNode *Node = cast<MDNode>(LoopID->getOperand(i));
1461         // If node in update list, ignore old value.
1462         if (!matchesHintMetadataName(Node, HintTypes))
1463           MDs.push_back(Node);
1464       }
1465     }
1466 
1467     // Now, add the missing hints.
1468     for (auto H : HintTypes)
1469       MDs.push_back(createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value));
1470 
1471     // Replace current metadata node with new one.
1472     LLVMContext &Context = TheLoop->getHeader()->getContext();
1473     MDNode *NewLoopID = MDNode::get(Context, MDs);
1474     // Set operand 0 to refer to the loop id itself.
1475     NewLoopID->replaceOperandWith(0, NewLoopID);
1476 
1477     TheLoop->setLoopID(NewLoopID);
1478   }
1479 
1480   /// The loop these hints belong to.
1481   const Loop *TheLoop;
1482 
1483   /// Interface to emit optimization remarks.
1484   OptimizationRemarkEmitter &ORE;
1485 };
1486 
1487 } // end anonymous namespace
1488 
1489 static void emitMissedWarning(Function *F, Loop *L,
1490                               const LoopVectorizeHints &LH,
1491                               OptimizationRemarkEmitter *ORE) {
1492   LH.emitRemarkWithHints();
1493 
1494   if (LH.getForce() == LoopVectorizeHints::FK_Enabled) {
1495     if (LH.getWidth() != 1)
1496       ORE->emit(DiagnosticInfoOptimizationFailure(
1497                     DEBUG_TYPE, "FailedRequestedVectorization",
1498                     L->getStartLoc(), L->getHeader())
1499                 << "loop not vectorized: "
1500                 << "failed explicitly specified loop vectorization");
1501     else if (LH.getInterleave() != 1)
1502       ORE->emit(DiagnosticInfoOptimizationFailure(
1503                     DEBUG_TYPE, "FailedRequestedInterleaving", L->getStartLoc(),
1504                     L->getHeader())
1505                 << "loop not interleaved: "
1506                 << "failed explicitly specified loop interleaving");
1507   }
1508 }
1509 
1510 namespace {
1511 
1512 /// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
1513 /// to what vectorization factor.
1514 /// This class does not look at the profitability of vectorization, only the
1515 /// legality. This class has two main kinds of checks:
1516 /// * Memory checks - The code in canVectorizeMemory checks if vectorization
1517 ///   will change the order of memory accesses in a way that will change the
1518 ///   correctness of the program.
/// * Scalar checks - The code in canVectorizeInstrs and canVectorizeMemory
///   checks a number of different conditions, such as the availability of a
///   single induction variable and that all types are supported and
///   vectorizable. This code reflects the capabilities of InnerLoopVectorizer.
/// This class is also used by InnerLoopVectorizer for identifying the
/// induction variables and the different reduction variables.
1525 class LoopVectorizationLegality {
1526 public:
1527   LoopVectorizationLegality(
1528       Loop *L, PredicatedScalarEvolution &PSE, DominatorTree *DT,
1529       TargetLibraryInfo *TLI, AliasAnalysis *AA, Function *F,
1530       const TargetTransformInfo *TTI,
1531       std::function<const LoopAccessInfo &(Loop &)> *GetLAA, LoopInfo *LI,
1532       OptimizationRemarkEmitter *ORE, LoopVectorizationRequirements *R,
1533       LoopVectorizeHints *H)
1534       : TheLoop(L), PSE(PSE), TLI(TLI), TTI(TTI), DT(DT), GetLAA(GetLAA),
1535         ORE(ORE), InterleaveInfo(PSE, L, DT, LI), Requirements(R), Hints(H) {}
1536 
1537   /// ReductionList contains the reduction descriptors for all
1538   /// of the reductions that were found in the loop.
1539   using ReductionList = DenseMap<PHINode *, RecurrenceDescriptor>;
1540 
1541   /// InductionList saves induction variables and maps them to the
1542   /// induction descriptor.
1543   using InductionList = MapVector<PHINode *, InductionDescriptor>;
1544 
1545   /// RecurrenceSet contains the phi nodes that are recurrences other than
1546   /// inductions and reductions.
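  /// A first-order recurrence is a phi that carries a value from the previous
  /// iteration, e.g. (illustrative) the value of a[i - 1] in:
  ///
  ///   for (i = 1; i < n; ++i)
  ///     b[i] = a[i] + a[i - 1];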
1547   using RecurrenceSet = SmallPtrSet<const PHINode *, 8>;
1548 
1549   /// Returns true if it is legal to vectorize this loop.
1550   /// This does not mean that it is profitable to vectorize this
1551   /// loop, only that it is legal to do so.
1552   bool canVectorize();
1553 
1554   /// Returns the primary induction variable.
1555   PHINode *getPrimaryInduction() { return PrimaryInduction; }
1556 
1557   /// Returns the reduction variables found in the loop.
1558   ReductionList *getReductionVars() { return &Reductions; }
1559 
1560   /// Returns the induction variables found in the loop.
1561   InductionList *getInductionVars() { return &Inductions; }
1562 
1563   /// Return the first-order recurrences found in the loop.
1564   RecurrenceSet *getFirstOrderRecurrences() { return &FirstOrderRecurrences; }
1565 
1566   /// Return the set of instructions to sink to handle first-order recurrences.
1567   DenseMap<Instruction *, Instruction *> &getSinkAfter() { return SinkAfter; }
1568 
1569   /// Returns the widest induction type.
1570   Type *getWidestInductionType() { return WidestIndTy; }
1571 
1572   /// Returns True if V is an induction variable in this loop.
1573   bool isInductionVariable(const Value *V);
1574 
1575   /// Returns True if PN is a reduction variable in this loop.
1576   bool isReductionVariable(PHINode *PN) { return Reductions.count(PN); }
1577 
1578   /// Returns True if Phi is a first-order recurrence in this loop.
1579   bool isFirstOrderRecurrence(const PHINode *Phi);
1580 
1581   /// Return true if the block BB needs to be predicated in order for the loop
1582   /// to be vectorized.
1583   bool blockNeedsPredication(BasicBlock *BB);
1584 
1585   /// Check if this pointer is consecutive when vectorizing. This happens
  /// when the last index of the GEP is the induction variable, or when the
  /// pointer itself is an induction variable.
1588   /// This check allows us to vectorize A[idx] into a wide load/store.
1589   /// Returns:
1590   /// 0 - Stride is unknown or non-consecutive.
1591   /// 1 - Address is consecutive.
1592   /// -1 - Address is consecutive, and decreasing.
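  /// For example (illustrative), with induction variable i, a pointer such as
  /// &A[i] yields 1, &A[N - i] yields -1, and &A[2 * i] yields 0.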
1593   int isConsecutivePtr(Value *Ptr);
1594 
1595   /// Returns true if the value V is uniform within the loop.
1596   bool isUniform(Value *V);
1597 
  /// Returns the information that we collected about runtime memory checks.
1599   const RuntimePointerChecking *getRuntimePointerChecking() const {
1600     return LAI->getRuntimePointerChecking();
1601   }
1602 
1603   const LoopAccessInfo *getLAI() const { return LAI; }
1604 
1605   /// \brief Check if \p Instr belongs to any interleaved access group.
1606   bool isAccessInterleaved(Instruction *Instr) {
1607     return InterleaveInfo.isInterleaved(Instr);
1608   }
1609 
1610   /// \brief Get the interleaved access group that \p Instr belongs to.
1611   const InterleaveGroup *getInterleavedAccessGroup(Instruction *Instr) {
1612     return InterleaveInfo.getInterleaveGroup(Instr);
1613   }
1614 
1615   /// \brief Returns true if an interleaved group requires a scalar iteration
1616   /// to handle accesses with gaps.
1617   bool requiresScalarEpilogue() const {
1618     return InterleaveInfo.requiresScalarEpilogue();
1619   }
1620 
1621   unsigned getMaxSafeDepDistBytes() { return LAI->getMaxSafeDepDistBytes(); }
1622 
1623   uint64_t getMaxSafeRegisterWidth() const {
    return LAI->getDepChecker().getMaxSafeRegisterWidth();
1625   }
1626 
1627   bool hasStride(Value *V) { return LAI->hasStride(V); }
1628 
1629   /// Returns true if the target machine supports masked store operation
1630   /// for the given \p DataType and kind of access to \p Ptr.
1631   bool isLegalMaskedStore(Type *DataType, Value *Ptr) {
1632     return isConsecutivePtr(Ptr) && TTI->isLegalMaskedStore(DataType);
1633   }
1634 
1635   /// Returns true if the target machine supports masked load operation
1636   /// for the given \p DataType and kind of access to \p Ptr.
1637   bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {
1638     return isConsecutivePtr(Ptr) && TTI->isLegalMaskedLoad(DataType);
1639   }
1640 
1641   /// Returns true if the target machine supports masked scatter operation
1642   /// for the given \p DataType.
1643   bool isLegalMaskedScatter(Type *DataType) {
1644     return TTI->isLegalMaskedScatter(DataType);
1645   }
1646 
1647   /// Returns true if the target machine supports masked gather operation
1648   /// for the given \p DataType.
1649   bool isLegalMaskedGather(Type *DataType) {
1650     return TTI->isLegalMaskedGather(DataType);
1651   }
1652 
1653   /// Returns true if the target machine can represent \p V as a masked gather
1654   /// or scatter operation.
1655   bool isLegalGatherOrScatter(Value *V) {
1656     auto *LI = dyn_cast<LoadInst>(V);
1657     auto *SI = dyn_cast<StoreInst>(V);
1658     if (!LI && !SI)
1659       return false;
1660     auto *Ptr = getPointerOperand(V);
1661     auto *Ty = cast<PointerType>(Ptr->getType())->getElementType();
1662     return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty));
1663   }
1664 
1665   /// Returns true if vector representation of the instruction \p I
1666   /// requires mask.
1667   bool isMaskRequired(const Instruction *I) { return (MaskedOp.count(I) != 0); }
1668 
1669   unsigned getNumStores() const { return LAI->getNumStores(); }
1670   unsigned getNumLoads() const { return LAI->getNumLoads(); }
1671   unsigned getNumPredStores() const { return NumPredStores; }
1672 
1673   /// Returns true if \p I is an instruction that will be scalarized with
1674   /// predication. Such instructions include conditional stores and
1675   /// instructions that may divide by zero.
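  /// For example (illustrative), both the division and the store below must be
  /// scalarized and predicated, since they may only execute when c[i] != 0:
  ///
  ///   if (c[i] != 0)
  ///     a[i] = b[i] / c[i];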
1676   bool isScalarWithPredication(Instruction *I);
1677 
1678   /// Returns true if \p I is a memory instruction with consecutive memory
1679   /// access that can be widened.
1680   bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
1681 
  /// Returns true if the NoNaN attribute is set on the function.
1683   bool hasFunNoNaNAttr() const { return HasFunNoNaNAttr; }
1684 
1685 private:
1686   /// Check if a single basic block loop is vectorizable.
1687   /// At this point we know that this is a loop with a constant trip count
1688   /// and we only need to check individual instructions.
1689   bool canVectorizeInstrs();
1690 
1691   /// When we vectorize loops we may change the order in which
1692   /// we read and write from memory. This method checks if it is
  /// legal to vectorize the code, considering only memory constraints.
  /// Returns true if the loop is vectorizable.
1695   bool canVectorizeMemory();
1696 
1697   /// Return true if we can vectorize this loop using the IF-conversion
1698   /// transformation.
1699   bool canVectorizeWithIfConvert();
1700 
1701   /// Return true if all of the instructions in the block can be speculatively
1702   /// executed. \p SafePtrs is a list of addresses that are known to be legal
1703   /// and we know that we can read from them without segfault.
1704   bool blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs);
1705 
1706   /// Updates the vectorization state by adding \p Phi to the inductions list.
1707   /// This can set \p Phi as the main induction of the loop if \p Phi is a
1708   /// better choice for the main induction than the existing one.
1709   void addInductionPhi(PHINode *Phi, const InductionDescriptor &ID,
1710                        SmallPtrSetImpl<Value *> &AllowedExit);
1711 
1712   /// Create an analysis remark that explains why vectorization failed
1713   ///
1714   /// \p RemarkName is the identifier for the remark.  If \p I is passed it is
1715   /// an instruction that prevents vectorization.  Otherwise the loop is used
1716   /// for the location of the remark.  \return the remark object that can be
1717   /// streamed to.
1718   OptimizationRemarkAnalysis
1719   createMissedAnalysis(StringRef RemarkName, Instruction *I = nullptr) const {
1720     return ::createMissedAnalysis(Hints->vectorizeAnalysisPassName(),
1721                                   RemarkName, TheLoop, I);
1722   }
1723 
  /// \brief If an access has a symbolic stride, this maps the pointer value to
1725   /// the stride symbol.
1726   const ValueToValueMap *getSymbolicStrides() {
1727     // FIXME: Currently, the set of symbolic strides is sometimes queried before
1728     // it's collected.  This happens from canVectorizeWithIfConvert, when the
1729     // pointer is checked to reference consecutive elements suitable for a
1730     // masked access.
1731     return LAI ? &LAI->getSymbolicStrides() : nullptr;
1732   }
1733 
1734   unsigned NumPredStores = 0;
1735 
1736   /// The loop that we evaluate.
1737   Loop *TheLoop;
1738 
1739   /// A wrapper around ScalarEvolution used to add runtime SCEV checks.
1740   /// Applies dynamic knowledge to simplify SCEV expressions in the context
1741   /// of existing SCEV assumptions. The analysis will also add a minimal set
1742   /// of new predicates if this is required to enable vectorization and
1743   /// unrolling.
1744   PredicatedScalarEvolution &PSE;
1745 
1746   /// Target Library Info.
1747   TargetLibraryInfo *TLI;
1748 
1749   /// Target Transform Info
1750   const TargetTransformInfo *TTI;
1751 
1752   /// Dominator Tree.
1753   DominatorTree *DT;
1754 
1755   // LoopAccess analysis.
1756   std::function<const LoopAccessInfo &(Loop &)> *GetLAA;
1757 
1758   // And the loop-accesses info corresponding to this loop.  This pointer is
1759   // null until canVectorizeMemory sets it up.
1760   const LoopAccessInfo *LAI = nullptr;
1761 
1762   /// Interface to emit optimization remarks.
1763   OptimizationRemarkEmitter *ORE;
1764 
1765   /// The interleave access information contains groups of interleaved accesses
1766   /// with the same stride and close to each other.
1767   InterleavedAccessInfo InterleaveInfo;
1768 
1769   //  ---  vectorization state --- //
1770 
1771   /// Holds the primary induction variable. This is the counter of the
1772   /// loop.
1773   PHINode *PrimaryInduction = nullptr;
1774 
1775   /// Holds the reduction variables.
1776   ReductionList Reductions;
1777 
1778   /// Holds all of the induction variables that we found in the loop.
1779   /// Notice that inductions don't need to start at zero and that induction
1780   /// variables can be pointers.
1781   InductionList Inductions;
1782 
1783   /// Holds the phi nodes that are first-order recurrences.
1784   RecurrenceSet FirstOrderRecurrences;
1785 
1786   /// Holds instructions that need to sink past other instructions to handle
1787   /// first-order recurrences.
1788   DenseMap<Instruction *, Instruction *> SinkAfter;
1789 
1790   /// Holds the widest induction type encountered.
1791   Type *WidestIndTy = nullptr;
1792 
1793   /// Allowed outside users. This holds the induction and reduction
1794   /// vars which can be accessed from outside the loop.
1795   SmallPtrSet<Value *, 4> AllowedExit;
1796 
1797   /// Can we assume the absence of NaNs.
1798   bool HasFunNoNaNAttr = false;
1799 
1800   /// Vectorization requirements that will go through late-evaluation.
1801   LoopVectorizationRequirements *Requirements;
1802 
1803   /// Used to emit an analysis of any legality issues.
1804   LoopVectorizeHints *Hints;
1805 
1806   /// While vectorizing these instructions we have to generate a
1807   /// call to the appropriate masked intrinsic
1808   SmallPtrSet<const Instruction *, 8> MaskedOp;
1809 };
1810 
1811 /// LoopVectorizationCostModel - estimates the expected speedups due to
1812 /// vectorization.
1813 /// In many cases vectorization is not profitable. This can happen because of
1814 /// a number of reasons. In this class we mainly attempt to predict the
1815 /// expected speedup/slowdowns due to the supported instruction set. We use the
1816 /// TargetTransformInfo to query the different backends for the cost of
1817 /// different operations.
1818 class LoopVectorizationCostModel {
1819 public:
1820   LoopVectorizationCostModel(Loop *L, PredicatedScalarEvolution &PSE,
1821                              LoopInfo *LI, LoopVectorizationLegality *Legal,
1822                              const TargetTransformInfo &TTI,
1823                              const TargetLibraryInfo *TLI, DemandedBits *DB,
1824                              AssumptionCache *AC,
1825                              OptimizationRemarkEmitter *ORE, const Function *F,
1826                              const LoopVectorizeHints *Hints)
1827       : TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB),
1828         AC(AC), ORE(ORE), TheFunction(F), Hints(Hints) {}
1829 
1830   /// \return An upper bound for the vectorization factor, or None if
1831   /// vectorization should be avoided up front.
1832   Optional<unsigned> computeMaxVF(bool OptForSize);
1833 
1834   /// Information about vectorization costs
1835   struct VectorizationFactor {
1836     // Vector width with best cost
1837     unsigned Width;
1838 
1839     // Cost of the loop with that width
1840     unsigned Cost;
1841   };
1842 
1843   /// \return The most profitable vectorization factor and the cost of that VF.
1844   /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
1845   /// then this vectorization factor will be selected if vectorization is
1846   /// possible.
1847   VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
1848 
1849   /// Setup cost-based decisions for user vectorization factor.
1850   void selectUserVectorizationFactor(unsigned UserVF) {
1851     collectUniformsAndScalars(UserVF);
1852     collectInstsToScalarize(UserVF);
1853   }
1854 
1855   /// \return The size (in bits) of the smallest and widest types in the code
1856   /// that needs to be vectorized. We ignore values that remain scalar such as
1857   /// 64 bit loop indices.
1858   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1859 
1860   /// \return The desired interleave count.
1861   /// If interleave count has been specified by metadata it will be returned.
1862   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1863   /// are the selected vectorization factor and the cost of the selected VF.
1864   unsigned selectInterleaveCount(bool OptForSize, unsigned VF,
1865                                  unsigned LoopCost);
1866 
  /// A memory access instruction may be vectorized in more than one way.
  /// The form of the instruction after vectorization depends on its cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
1874   void setCostBasedWideningDecision(unsigned VF);
1875 
1876   /// \brief A struct that represents some properties of the register usage
1877   /// of a loop.
1878   struct RegisterUsage {
1879     /// Holds the number of loop invariant values that are used in the loop.
1880     unsigned LoopInvariantRegs;
1881 
1882     /// Holds the maximum number of concurrent live intervals in the loop.
1883     unsigned MaxLocalUsers;
1884 
1885     /// Holds the number of instructions in the loop.
1886     unsigned NumInstructions;
1887   };
1888 
1889   /// \return Returns information about the register usages of the loop for the
1890   /// given vectorization factors.
1891   SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);
1892 
1893   /// Collect values we want to ignore in the cost model.
1894   void collectValuesToIgnore();
1895 
1896   /// \returns The smallest bitwidth each instruction can be represented with.
1897   /// The vector equivalents of these instructions should be truncated to this
1898   /// type.
1899   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1900     return MinBWs;
1901   }
1902 
1903   /// \returns True if it is more profitable to scalarize instruction \p I for
1904   /// vectorization factor \p VF.
1905   bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
1906     assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");
1907     auto Scalars = InstsToScalarize.find(VF);
1908     assert(Scalars != InstsToScalarize.end() &&
1909            "VF not yet analyzed for scalarization profitability");
1910     return Scalars->second.count(I);
1911   }
1912 
1913   /// Returns true if \p I is known to be uniform after vectorization.
1914   bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
1915     if (VF == 1)
1916       return true;
1917     assert(Uniforms.count(VF) && "VF not yet analyzed for uniformity");
1918     auto UniformsPerVF = Uniforms.find(VF);
1919     return UniformsPerVF->second.count(I);
1920   }
1921 
1922   /// Returns true if \p I is known to be scalar after vectorization.
1923   bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
1924     if (VF == 1)
1925       return true;
1926     assert(Scalars.count(VF) && "Scalar values are not calculated for VF");
1927     auto ScalarsPerVF = Scalars.find(VF);
1928     return ScalarsPerVF->second.count(I);
1929   }
1930 
1931   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1932   /// for vectorization factor \p VF.
1933   bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
1934     return VF > 1 && MinBWs.count(I) && !isProfitableToScalarize(I, VF) &&
1935            !isScalarAfterVectorization(I, VF);
1936   }
1937 
  /// The decision that was taken during cost calculation for a memory
  /// instruction.
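  /// As a rough, illustrative guide: a consecutive access is typically widened
  /// (CM_Widen), a member of an interleaved group is emitted as part of a wide
  /// load/store plus shuffles (CM_Interleave), a non-consecutive access on a
  /// target with masked gathers/scatters may become CM_GatherScatter, and
  /// anything else is scalarized (CM_Scalarize); the actual choice is made by
  /// comparing costs.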
1939   enum InstWidening {
1940     CM_Unknown,
1941     CM_Widen,
1942     CM_Interleave,
1943     CM_GatherScatter,
1944     CM_Scalarize
1945   };
1946 
1947   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1948   /// instruction \p I and vector width \p VF.
1949   void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
1950                            unsigned Cost) {
1951     assert(VF >= 2 && "Expected VF >=2");
1952     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1953   }
1954 
1955   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1956   /// interleaving group \p Grp and vector width \p VF.
1957   void setWideningDecision(const InterleaveGroup *Grp, unsigned VF,
1958                            InstWidening W, unsigned Cost) {
1959     assert(VF >= 2 && "Expected VF >=2");
    /// Broadcast this decision to all instructions inside the group.
1961     /// But the cost will be assigned to one instruction only.
1962     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1963       if (auto *I = Grp->getMember(i)) {
1964         if (Grp->getInsertPos() == I)
1965           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1966         else
1967           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1968       }
1969     }
1970   }
1971 
1972   /// Return the cost model decision for the given instruction \p I and vector
1973   /// width \p VF. Return CM_Unknown if this instruction did not pass
1974   /// through the cost modeling.
1975   InstWidening getWideningDecision(Instruction *I, unsigned VF) {
1976     assert(VF >= 2 && "Expected VF >=2");
1977     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1978     auto Itr = WideningDecisions.find(InstOnVF);
1979     if (Itr == WideningDecisions.end())
1980       return CM_Unknown;
1981     return Itr->second.first;
1982   }
1983 
1984   /// Return the vectorization cost for the given instruction \p I and vector
1985   /// width \p VF.
1986   unsigned getWideningCost(Instruction *I, unsigned VF) {
1987     assert(VF >= 2 && "Expected VF >=2");
1988     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1989     assert(WideningDecisions.count(InstOnVF) && "The cost is not calculated");
1990     return WideningDecisions[InstOnVF].second;
1991   }
1992 
1993   /// Return True if instruction \p I is an optimizable truncate whose operand
1994   /// is an induction variable. Such a truncate will be removed by adding a new
1995   /// induction variable with the destination type.
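  /// For example (illustrative), a trunc of an i64 induction variable to i32
  /// can be removed by introducing a new i32 induction variable with the same
  /// start and step, making the truncate free in the vector loop.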
1996   bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
1997     // If the instruction is not a truncate, return false.
1998     auto *Trunc = dyn_cast<TruncInst>(I);
1999     if (!Trunc)
2000       return false;
2001 
2002     // Get the source and destination types of the truncate.
2003     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
2004     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
2005 
2006     // If the truncate is free for the given types, return false. Replacing a
2007     // free truncate with an induction variable would add an induction variable
2008     // update instruction to each iteration of the loop. We exclude from this
2009     // check the primary induction variable since it will need an update
2010     // instruction regardless.
2011     Value *Op = Trunc->getOperand(0);
2012     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
2013       return false;
2014 
2015     // If the truncated value is not an induction variable, return false.
2016     return Legal->isInductionVariable(Op);
2017   }
2018 
2019   /// Collects the instructions to scalarize for each predicated instruction in
2020   /// the loop.
2021   void collectInstsToScalarize(unsigned VF);
2022 
2023   /// Collect Uniform and Scalar values for the given \p VF.
2024   /// The sets depend on CM decision for Load/Store instructions
2025   /// that may be vectorized as interleave, gather-scatter or scalarized.
2026   void collectUniformsAndScalars(unsigned VF) {
2027     // Do the analysis once.
2028     if (VF == 1 || Uniforms.count(VF))
2029       return;
2030     setCostBasedWideningDecision(VF);
2031     collectLoopUniforms(VF);
2032     collectLoopScalars(VF);
2033   }
2034 
2035 private:
2036   /// \return An upper bound for the vectorization factor, larger than zero.
2037   /// One is returned if vectorization should best be avoided due to cost.
2038   unsigned computeFeasibleMaxVF(bool OptForSize, unsigned ConstTripCount);
2039 
  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
2047   using VectorizationCostTy = std::pair<unsigned, bool>;
2048 
2049   /// Returns the expected execution cost. The unit of the cost does
2050   /// not matter because we use the 'cost' units to compare different
2051   /// vector widths. The cost that is returned is *not* normalized by
2052   /// the factor width.
2053   VectorizationCostTy expectedCost(unsigned VF);
2054 
2055   /// Returns the execution time cost of an instruction for a given vector
2056   /// width. Vector width of one means scalar.
2057   VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
2058 
2059   /// The cost-computation logic from getInstructionCost which provides
2060   /// the vector type as an output parameter.
2061   unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
2062 
2063   /// Calculate vectorization cost of memory instruction \p I.
2064   unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);
2065 
2066   /// The cost computation for scalarized memory instruction.
2067   unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
2068 
2069   /// The cost computation for interleaving group of memory instructions.
2070   unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
2071 
2072   /// The cost computation for Gather/Scatter instruction.
2073   unsigned getGatherScatterCost(Instruction *I, unsigned VF);
2074 
2075   /// The cost computation for widening instruction \p I with consecutive
2076   /// memory access.
2077   unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
2078 
2079   /// The cost calculation for Load instruction \p I with uniform pointer -
2080   /// scalar load + broadcast.
2081   unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
2082 
  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
2085   bool isConsecutiveLoadOrStore(Instruction *I);
2086 
2087   /// Create an analysis remark that explains why vectorization failed
2088   ///
2089   /// \p RemarkName is the identifier for the remark.  \return the remark object
2090   /// that can be streamed to.
2091   OptimizationRemarkAnalysis createMissedAnalysis(StringRef RemarkName) {
2092     return ::createMissedAnalysis(Hints->vectorizeAnalysisPassName(),
2093                                   RemarkName, TheLoop);
2094   }
2095 
2096   /// Map of scalar integer values to the smallest bitwidth they can be legally
2097   /// represented as. The vector equivalents of these values should be truncated
2098   /// to this type.
2099   MapVector<Instruction *, uint64_t> MinBWs;
2100 
2101   /// A type representing the costs for instructions if they were to be
2102   /// scalarized rather than vectorized. The entries are Instruction-Cost
2103   /// pairs.
2104   using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
2105 
  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
2108   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
2109 
2110   /// A map holding scalar costs for different vectorization factors. The
2111   /// presence of a cost for an instruction in the mapping indicates that the
2112   /// instruction will be scalarized when vectorizing with the associated
2113   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
2114   DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
2115 
2116   /// Holds the instructions known to be uniform after vectorization.
2117   /// The data is collected per VF.
2118   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;
2119 
2120   /// Holds the instructions known to be scalar after vectorization.
2121   /// The data is collected per VF.
2122   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;
2123 
2124   /// Holds the instructions (address computations) that are forced to be
2125   /// scalarized.
2126   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars;
2127 
2128   /// Returns the expected difference in cost from scalarizing the expression
2129   /// feeding a predicated instruction \p PredInst. The instructions to
2130   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
2131   /// non-negative return value implies the expression will be scalarized.
2132   /// Currently, only single-use chains are considered for scalarization.
2133   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
2134                               unsigned VF);
2135 
2136   /// Collect the instructions that are uniform after vectorization. An
2137   /// instruction is uniform if we represent it with a single scalar value in
2138   /// the vectorized loop corresponding to each vector iteration. Examples of
2139   /// uniform instructions include pointer operands of consecutive or
2140   /// interleaved memory accesses. Note that although uniformity implies an
2141   /// instruction will be scalar, the reverse is not true. In general, a
2142   /// scalarized instruction will be represented by VF scalar values in the
2143   /// vectorized loop, each corresponding to an iteration of the original
2144   /// scalar loop.
2145   void collectLoopUniforms(unsigned VF);
2146 
2147   /// Collect the instructions that are scalar after vectorization. An
2148   /// instruction is scalar if it is known to be uniform or will be scalarized
2149   /// during vectorization. Non-uniform scalarized instructions will be
2150   /// represented by VF values in the vectorized loop, each corresponding to an
2151   /// iteration of the original scalar loop.
2152   void collectLoopScalars(unsigned VF);
2153 
2154   /// Keeps cost model vectorization decision and cost for instructions.
2155   /// Right now it is used for memory instructions only.
2156   using DecisionList = DenseMap<std::pair<Instruction *, unsigned>,
2157                                 std::pair<InstWidening, unsigned>>;
2158 
2159   DecisionList WideningDecisions;
2160 
2161 public:
2162   /// The loop that we evaluate.
2163   Loop *TheLoop;
2164 
2165   /// Predicated scalar evolution analysis.
2166   PredicatedScalarEvolution &PSE;
2167 
2168   /// Loop Info analysis.
2169   LoopInfo *LI;
2170 
2171   /// Vectorization legality.
2172   LoopVectorizationLegality *Legal;
2173 
2174   /// Vector target information.
2175   const TargetTransformInfo &TTI;
2176 
2177   /// Target Library Info.
2178   const TargetLibraryInfo *TLI;
2179 
2180   /// Demanded bits analysis.
2181   DemandedBits *DB;
2182 
2183   /// Assumption cache.
2184   AssumptionCache *AC;
2185 
2186   /// Interface to emit optimization remarks.
2187   OptimizationRemarkEmitter *ORE;
2188 
2189   const Function *TheFunction;
2190 
2191   /// Loop Vectorize Hint.
2192   const LoopVectorizeHints *Hints;
2193 
2194   /// Values to ignore in the cost model.
2195   SmallPtrSet<const Value *, 16> ValuesToIgnore;
2196 
2197   /// Values to ignore in the cost model when VF > 1.
2198   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
2199 };
2200 
2201 } // end anonymous namespace
2202 
2203 namespace llvm {
2204 
/// LoopVectorizationPlanner - drives the vectorization process after having
/// passed Legality checks.
/// The planner builds and optimizes the Vectorization Plans, which record the
/// decisions on how to vectorize the given loop. In particular, they represent
/// the control-flow of the vectorized version, the replication of instructions
/// that are to be scalarized, and the interleaved access groups.
2212 class LoopVectorizationPlanner {
2213   /// The loop that we evaluate.
2214   Loop *OrigLoop;
2215 
2216   /// Loop Info analysis.
2217   LoopInfo *LI;
2218 
2219   /// Target Library Info.
2220   const TargetLibraryInfo *TLI;
2221 
2222   /// Target Transform Info.
2223   const TargetTransformInfo *TTI;
2224 
2225   /// The legality analysis.
2226   LoopVectorizationLegality *Legal;
2227 
2228   /// The profitablity analysis.
2229   LoopVectorizationCostModel &CM;
2230 
2231   SmallVector<std::unique_ptr<VPlan>, 4> VPlans;
2232 
2233   unsigned BestVF = 0;
2234   unsigned BestUF = 0;
2235 
2236 public:
2237   LoopVectorizationPlanner(Loop *L, LoopInfo *LI, const TargetLibraryInfo *TLI,
2238                            const TargetTransformInfo *TTI,
2239                            LoopVectorizationLegality *Legal,
2240                            LoopVectorizationCostModel &CM)
2241       : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM) {}
2242 
2243   /// Plan how to best vectorize, return the best VF and its cost.
2244   LoopVectorizationCostModel::VectorizationFactor plan(bool OptForSize,
2245                                                        unsigned UserVF);
2246 
2247   /// Finalize the best decision and dispose of all other VPlans.
2248   void setBestPlan(unsigned VF, unsigned UF);
2249 
2250   /// Generate the IR code for the body of the vectorized loop according to the
2251   /// best selected VPlan.
2252   void executePlan(InnerLoopVectorizer &LB, DominatorTree *DT);
2253 
2254   void printPlans(raw_ostream &O) {
2255     for (const auto &Plan : VPlans)
2256       O << *Plan;
2257   }
2258 
2259 protected:
2260   /// Collect the instructions from the original loop that would be trivially
2261   /// dead in the vectorized loop if generated.
2262   void collectTriviallyDeadInstructions(
2263       SmallPtrSetImpl<Instruction *> &DeadInstructions);
2264 
2265   /// A range of powers-of-2 vectorization factors with fixed start and
2266   /// adjustable end. The range includes start and excludes end, e.g.,:
2267   /// [1, 9) = {1, 2, 4, 8}
2268   struct VFRange {
2269     // A power of 2.
2270     const unsigned Start;
2271 
    // Need not be a power of 2. If End <= Start, the range is empty.
2273     unsigned End;
2274   };
2275 
2276   /// Test a \p Predicate on a \p Range of VF's. Return the value of applying
2277   /// \p Predicate on Range.Start, possibly decreasing Range.End such that the
2278   /// returned value holds for the entire \p Range.
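  /// For example (illustrative), with Range = [1, 9) and a Predicate that
  /// holds for VF = 1 and 2 but not for VF = 4, this returns the value for
  /// VF = 1 and clamps Range.End down to 4, so the decision holds over [1, 4).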
2279   bool getDecisionAndClampRange(const std::function<bool(unsigned)> &Predicate,
2280                                 VFRange &Range);
2281 
2282   /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
2283   /// according to the information gathered by Legal when it checked if it is
2284   /// legal to vectorize the loop.
2285   void buildVPlans(unsigned MinVF, unsigned MaxVF);
2286 
2287 private:
  /// Check if \p I belongs to an Interleave Group within the given VF
  /// \p Range, \return true in the first returned value if so and false
  /// otherwise. Build a new VPInterleaveGroup Recipe if \p I is the primary
  /// member of an IG for \p Range.Start, and provide it as the second returned
  /// value. Note that if \p I is an adjunct member of an IG for
  /// \p Range.Start, the \return value is <true, nullptr>, as it is handled by
  /// another recipe.
2294   /// \p Range.End may be decreased to ensure same decision from \p Range.Start
2295   /// to \p Range.End.
2296   VPInterleaveRecipe *tryToInterleaveMemory(Instruction *I, VFRange &Range);
2297 
  /// Check if an induction recipe should be constructed for \p I within the
  /// given VF \p Range. If so build and return it. If not, return null.
  /// \p Range.End may be decreased to ensure same decision from \p Range.Start
  /// to \p Range.End.
2302   VPWidenIntOrFpInductionRecipe *tryToOptimizeInduction(Instruction *I,
2303                                                         VFRange &Range);
2304 
2305   /// Check if \p I can be widened within the given VF \p Range. If \p I can be
2306   /// widened for \p Range.Start, check if the last recipe of \p VPBB can be
2307   /// extended to include \p I or else build a new VPWidenRecipe for it and
2308   /// append it to \p VPBB. Return true if \p I can be widened for Range.Start,
2309   /// false otherwise. Range.End may be decreased to ensure same decision from
2310   /// \p Range.Start to \p Range.End.
2311   bool tryToWiden(Instruction *I, VPBasicBlock *VPBB, VFRange &Range);
2312 
2313   /// Build a VPReplicationRecipe for \p I and enclose it within a Region if it
2314   /// is predicated. \return \p VPBB augmented with this new recipe if \p I is
2315   /// not predicated, otherwise \return a new VPBasicBlock that succeeds the new
2316   /// Region. Update the packing decision of predicated instructions if they
2317   /// feed \p I. Range.End may be decreased to ensure same recipe behavior from
2318   /// \p Range.Start to \p Range.End.
2319   VPBasicBlock *handleReplication(
2320       Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
2321       DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe);
2322 
2323   /// Create a replicating region for instruction \p I that requires
2324   /// predication. \p PredRecipe is a VPReplicateRecipe holding \p I.
2325   VPRegionBlock *createReplicateRegion(Instruction *I,
2326                                        VPRecipeBase *PredRecipe);
2327 
2328   /// Build a VPlan according to the information gathered by Legal. \return a
2329   /// VPlan for vectorization factors \p Range.Start and up to \p Range.End
2330   /// exclusive, possibly decreasing \p Range.End.
2331   std::unique_ptr<VPlan> buildVPlan(VFRange &Range);
2332 };
2333 
2334 } // end namespace llvm
2335 
2336 namespace {
2337 
2338 /// \brief This holds vectorization requirements that must be verified late in
/// the process. The requirements are set by the legality and cost-model
/// analyses. Once vectorization has been determined to be possible and
/// profitable, the requirements can be verified by looking for metadata or
/// compiler options.
2342 /// For example, some loops require FP commutativity which is only allowed if
2343 /// vectorization is explicitly specified or if the fast-math compiler option
2344 /// has been provided.
2345 /// Late evaluation of these requirements allows helpful diagnostics to be
/// composed that tell the user what needs to be done to vectorize the loop,
/// for example, by specifying #pragma clang loop vectorize or -ffast-math.
/// Late evaluation should be used only when diagnostics can be generated that
/// can be followed by a non-expert user.
2350 class LoopVectorizationRequirements {
2351 public:
2352   LoopVectorizationRequirements(OptimizationRemarkEmitter &ORE) : ORE(ORE) {}
2353 
2354   void addUnsafeAlgebraInst(Instruction *I) {
2355     // First unsafe algebra instruction.
2356     if (!UnsafeAlgebraInst)
2357       UnsafeAlgebraInst = I;
2358   }
2359 
2360   void addRuntimePointerChecks(unsigned Num) { NumRuntimePointerChecks = Num; }
2361 
2362   bool doesNotMeet(Function *F, Loop *L, const LoopVectorizeHints &Hints) {
2363     const char *PassName = Hints.vectorizeAnalysisPassName();
2364     bool Failed = false;
2365     if (UnsafeAlgebraInst && !Hints.allowReordering()) {
2366       ORE.emit([&]() {
2367         return OptimizationRemarkAnalysisFPCommute(
2368                    PassName, "CantReorderFPOps",
2369                    UnsafeAlgebraInst->getDebugLoc(),
2370                    UnsafeAlgebraInst->getParent())
2371                << "loop not vectorized: cannot prove it is safe to reorder "
2372                   "floating-point operations";
2373       });
2374       Failed = true;
2375     }
2376 
2377     // Test if runtime memcheck thresholds are exceeded.
2378     bool PragmaThresholdReached =
2379         NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
2380     bool ThresholdReached =
2381         NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
2382     if ((ThresholdReached && !Hints.allowReordering()) ||
2383         PragmaThresholdReached) {
2384       ORE.emit([&]() {
2385         return OptimizationRemarkAnalysisAliasing(PassName, "CantReorderMemOps",
2386                                                   L->getStartLoc(),
2387                                                   L->getHeader())
2388                << "loop not vectorized: cannot prove it is safe to reorder "
2389                   "memory operations";
2390       });
2391       DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
2392       Failed = true;
2393     }
2394 
2395     return Failed;
2396   }
2397 
2398 private:
2399   unsigned NumRuntimePointerChecks = 0;
2400   Instruction *UnsafeAlgebraInst = nullptr;
2401 
2402   /// Interface to emit optimization remarks.
2403   OptimizationRemarkEmitter &ORE;
2404 };
2405 
2406 } // end anonymous namespace
2407 
2408 static void addAcyclicInnerLoop(Loop &L, SmallVectorImpl<Loop *> &V) {
2409   if (L.empty()) {
2410     if (!hasCyclesInLoopBody(L))
2411       V.push_back(&L);
2412     return;
2413   }
2414   for (Loop *InnerL : L)
2415     addAcyclicInnerLoop(*InnerL, V);
2416 }
2417 
2418 namespace {
2419 
2420 /// The LoopVectorize Pass.
2421 struct LoopVectorize : public FunctionPass {
2422   /// Pass identification, replacement for typeid
2423   static char ID;
2424 
2425   LoopVectorizePass Impl;
2426 
2427   explicit LoopVectorize(bool NoUnrolling = false, bool AlwaysVectorize = true)
2428       : FunctionPass(ID) {
2429     Impl.DisableUnrolling = NoUnrolling;
2430     Impl.AlwaysVectorize = AlwaysVectorize;
2431     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
2432   }
2433 
2434   bool runOnFunction(Function &F) override {
2435     if (skipFunction(F))
2436       return false;
2437 
2438     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
2439     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
2440     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
2441     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
2442     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
2443     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
2444     auto *TLI = TLIP ? &TLIP->getTLI() : nullptr;
2445     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2446     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2447     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
2448     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
2449     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
2450 
2451     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
2452         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
2453 
2454     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
2455                         GetLAA, *ORE);
2456   }
2457 
2458   void getAnalysisUsage(AnalysisUsage &AU) const override {
2459     AU.addRequired<AssumptionCacheTracker>();
2460     AU.addRequired<BlockFrequencyInfoWrapperPass>();
2461     AU.addRequired<DominatorTreeWrapperPass>();
2462     AU.addRequired<LoopInfoWrapperPass>();
2463     AU.addRequired<ScalarEvolutionWrapperPass>();
2464     AU.addRequired<TargetTransformInfoWrapperPass>();
2465     AU.addRequired<AAResultsWrapperPass>();
2466     AU.addRequired<LoopAccessLegacyAnalysis>();
2467     AU.addRequired<DemandedBitsWrapperPass>();
2468     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
2469     AU.addPreserved<LoopInfoWrapperPass>();
2470     AU.addPreserved<DominatorTreeWrapperPass>();
2471     AU.addPreserved<BasicAAWrapperPass>();
2472     AU.addPreserved<GlobalsAAWrapperPass>();
2473   }
2474 };
2475 
2476 } // end anonymous namespace
2477 
2478 //===----------------------------------------------------------------------===//
2479 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2480 // LoopVectorizationCostModel and LoopVectorizationPlanner.
2481 //===----------------------------------------------------------------------===//
2482 
2483 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
2484   // We need to place the broadcast of invariant variables outside the loop.
2485   Instruction *Instr = dyn_cast<Instruction>(V);
2486   bool NewInstr = (Instr && Instr->getParent() == LoopVectorBody);
2487   bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr;
2488 
2489   // Place the code for broadcasting invariant variables in the new preheader.
2490   IRBuilder<>::InsertPointGuard Guard(Builder);
2491   if (Invariant)
2492     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2493 
2494   // Broadcast the scalar into all locations in the vector.
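  // For a scalar %x and VF = 4, the splat typically expands to IR of roughly
  // this form (illustrative):
  //   %ins = insertelement <4 x i32> undef, i32 %x, i32 0
  //   %broadcast = shufflevector <4 x i32> %ins, <4 x i32> undef,
  //                              <4 x i32> zeroinitializer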
2495   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
2496 
2497   return Shuf;
2498 }
2499 
2500 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
2501     const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
2502   Value *Start = II.getStartValue();
2503 
2504   // Construct the initial value of the vector IV in the vector loop preheader
2505   auto CurrIP = Builder.saveIP();
2506   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2507   if (isa<TruncInst>(EntryVal)) {
2508     assert(Start->getType()->isIntegerTy() &&
2509            "Truncation requires an integer type");
2510     auto *TruncType = cast<IntegerType>(EntryVal->getType());
2511     Step = Builder.CreateTrunc(Step, TruncType);
2512     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
2513   }
2514   Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
2515   Value *SteppedStart =
2516       getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
2517 
2518   // We create vector phi nodes for both integer and floating-point induction
2519   // variables. Here, we determine the kind of arithmetic we will perform.
2520   Instruction::BinaryOps AddOp;
2521   Instruction::BinaryOps MulOp;
2522   if (Step->getType()->isIntegerTy()) {
2523     AddOp = Instruction::Add;
2524     MulOp = Instruction::Mul;
2525   } else {
2526     AddOp = II.getInductionOpcode();
2527     MulOp = Instruction::FMul;
2528   }
2529 
2530   // Multiply the vectorization factor by the step using integer or
2531   // floating-point arithmetic as appropriate.
2532   Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
2533   Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
2534 
2535   // Create a vector splat to use in the induction update.
2536   //
2537   // FIXME: If the step is non-constant, we create the vector splat with
2538   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
2539   //        handle a constant vector splat.
2540   Value *SplatVF = isa<Constant>(Mul)
2541                        ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
2542                        : Builder.CreateVectorSplat(VF, Mul);
2543   Builder.restoreIP(CurrIP);
2544 
2545   // We may need to add the step a number of times, depending on the unroll
2546   // factor. The last of those goes into the PHI.
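  // For example (illustrative), with VF = 4, UF = 2 and an integer step of 1,
  // part 0 is <i, i+1, i+2, i+3>, part 1 is part 0 plus <4, 4, 4, 4>, and the
  // value fed back into the PHI advances the induction by 8 each iteration.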
2547   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
2548                                     &*LoopVectorBody->getFirstInsertionPt());
2549   Instruction *LastInduction = VecInd;
2550   for (unsigned Part = 0; Part < UF; ++Part) {
2551     VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
2552     if (isa<TruncInst>(EntryVal))
2553       addMetadata(LastInduction, EntryVal);
2554     LastInduction = cast<Instruction>(addFastMathFlag(
2555         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
2556   }
2557 
2558   // Move the last step to the end of the latch block. This ensures consistent
2559   // placement of all induction updates.
2560   auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
2561   auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
2562   auto *ICmp = cast<Instruction>(Br->getCondition());
2563   LastInduction->moveBefore(ICmp);
2564   LastInduction->setName("vec.ind.next");
2565 
2566   VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
2567   VecInd->addIncoming(LastInduction, LoopVectorLatch);
2568 }
2569 
2570 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
2571   return Cost->isScalarAfterVectorization(I, VF) ||
2572          Cost->isProfitableToScalarize(I, VF);
2573 }
2574 
2575 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
2576   if (shouldScalarizeInstruction(IV))
2577     return true;
2578   auto isScalarInst = [&](User *U) -> bool {
2579     auto *I = cast<Instruction>(U);
2580     return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
2581   };
2582   return llvm::any_of(IV->users(), isScalarInst);
2583 }
2584 
2585 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
2586   assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
2587          "Primary induction variable must have an integer type");
2588 
2589   auto II = Legal->getInductionVars()->find(IV);
2590   assert(II != Legal->getInductionVars()->end() && "IV is not an induction");
2591 
2592   auto ID = II->second;
2593   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
2594 
2595   // The scalar value to broadcast. This will be derived from the canonical
2596   // induction variable.
2597   Value *ScalarIV = nullptr;
2598 
2599   // The value from the original loop to which we are mapping the new induction
2600   // variable.
2601   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
2602 
2603   // True if we have vectorized the induction variable.
2604   auto VectorizedIV = false;
2605 
2606   // Determine if we want a scalar version of the induction variable. This is
2607   // true if the induction variable itself is not widened, or if it has at
2608   // least one user in the loop that is not widened.
2609   auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal);
2610 
2611   // Generate code for the induction step. Note that induction steps are
2612   // required to be loop-invariant.
2613   assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) &&
2614          "Induction step should be loop invariant");
2615   auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
2616   Value *Step = nullptr;
2617   if (PSE.getSE()->isSCEVable(IV->getType())) {
2618     SCEVExpander Exp(*PSE.getSE(), DL, "induction");
2619     Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
2620                              LoopVectorPreHeader->getTerminator());
2621   } else {
2622     Step = cast<SCEVUnknown>(ID.getStep())->getValue();
2623   }
2624 
2625   // Try to create a new independent vector induction variable. If we can't
2626   // create the phi node, we will splat the scalar induction variable in each
2627   // loop iteration.
2628   if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) {
2629     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
2630     VectorizedIV = true;
2631   }
2632 
2633   // If we haven't yet vectorized the induction variable, or if we will create
2634   // a scalar one, we need to define the scalar induction variable and step
2635   // values. If we were given a truncation type, truncate the canonical
2636   // induction variable and step. Otherwise, derive these values from the
2637   // induction descriptor.
2638   if (!VectorizedIV || NeedsScalarIV) {
2639     ScalarIV = Induction;
2640     if (IV != OldInduction) {
2641       ScalarIV = IV->getType()->isIntegerTy()
2642                      ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
2643                      : Builder.CreateCast(Instruction::SIToFP, Induction,
2644                                           IV->getType());
2645       ScalarIV = ID.transform(Builder, ScalarIV, PSE.getSE(), DL);
2646       ScalarIV->setName("offset.idx");
2647     }
2648     if (Trunc) {
2649       auto *TruncType = cast<IntegerType>(Trunc->getType());
2650       assert(Step->getType()->isIntegerTy() &&
2651              "Truncation requires an integer step");
2652       ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
2653       Step = Builder.CreateTrunc(Step, TruncType);
2654     }
2655   }
2656 
2657   // If we haven't yet vectorized the induction variable, splat the scalar
2658   // induction variable, and build the necessary step vectors.
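       // E.g., for an integer induction with UF = 2 and VF = 4, part 0 becomes
       // splat(ScalarIV) + <0, 1, 2, 3> * Step and part 1 becomes
       // splat(ScalarIV) + <4, 5, 6, 7> * Step.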
2659   if (!VectorizedIV) {
2660     Value *Broadcasted = getBroadcastInstrs(ScalarIV);
2661     for (unsigned Part = 0; Part < UF; ++Part) {
2662       Value *EntryPart =
2663           getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
2664       VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
2665       if (Trunc)
2666         addMetadata(EntryPart, Trunc);
2667     }
2668   }
2669 
2670   // If an induction variable is only used for counting loop iterations or
2671   // calculating addresses, it doesn't need to be widened. Create scalar steps
2672   // that can be used by instructions we will later scalarize. Note that the
2673   // addition of the scalar steps will not increase the number of instructions
2674   // in the loop in the common case prior to InstCombine. We will be trading
2675   // one vector extract for each scalar step.
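       // E.g., for VF = 4 and UF = 1, a scalarized address computation can use the
       // four scalar values ScalarIV + 0 * Step, ..., ScalarIV + 3 * Step directly,
       // instead of extracting lanes from the widened induction.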
2676   if (NeedsScalarIV)
2677     buildScalarSteps(ScalarIV, Step, EntryVal, ID);
2678 }
2679 
2680 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
2681                                           Instruction::BinaryOps BinOp) {
2682   // Create and check the types.
2683   assert(Val->getType()->isVectorTy() && "Must be a vector");
2684   int VLen = Val->getType()->getVectorNumElements();
2685 
2686   Type *STy = Val->getType()->getScalarType();
2687   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2688          "Induction Step must be an integer or FP");
2689   assert(Step->getType() == STy && "Step has wrong type");
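       // E.g., when called for part 1 of VF = 4 (StartIdx = 4) with an integer step
       // S, the integer path below produces Val + <4, 5, 6, 7> * splat(S).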
2690 
2691   SmallVector<Constant *, 8> Indices;
2692 
2693   if (STy->isIntegerTy()) {
2694     // Create a vector of consecutive numbers starting at StartIdx.
2695     for (int i = 0; i < VLen; ++i)
2696       Indices.push_back(ConstantInt::get(STy, StartIdx + i));
2697 
2698     // Add the consecutive indices to the vector value.
2699     Constant *Cv = ConstantVector::get(Indices);
2700     assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
2701     Step = Builder.CreateVectorSplat(VLen, Step);
2702     assert(Step->getType() == Val->getType() && "Invalid step vec");
2703     // FIXME: The newly created binary instructions should contain nsw/nuw flags,
2704     // which can be found from the original scalar operations.
2705     Step = Builder.CreateMul(Cv, Step);
2706     return Builder.CreateAdd(Val, Step, "induction");
2707   }
2708 
2709   // Floating point induction.
2710   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2711          "Binary Opcode should be specified for FP induction");
2712   // Create a vector of consecutive numbers starting at StartIdx.
2713   for (int i = 0; i < VLen; ++i)
2714     Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
2715 
2716   // Add the consecutive indices to the vector value.
2717   Constant *Cv = ConstantVector::get(Indices);
2718 
2719   Step = Builder.CreateVectorSplat(VLen, Step);
2720 
2721   // Floating point operations had to be 'fast' to enable the induction.
2722   FastMathFlags Flags;
2723   Flags.setFast();
2724 
2725   Value *MulOp = Builder.CreateFMul(Cv, Step);
2726   if (isa<Instruction>(MulOp))
2727     // MulOp may be folded to a constant, so check before setting the flags.
2728     cast<Instruction>(MulOp)->setFastMathFlags(Flags);
2729 
2730   Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2731   if (isa<Instruction>(BOp))
2732     cast<Instruction>(BOp)->setFastMathFlags(Flags);
2733   return BOp;
2734 }
2735 
2736 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
2737                                            Value *EntryVal,
2738                                            const InductionDescriptor &ID) {
2739   // We shouldn't have to build scalar steps if we aren't vectorizing.
2740   assert(VF > 1 && "VF should be greater than one");
2741 
2742   // Get the value type and ensure it and the step have the same integer type.
2743   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2744   assert(ScalarIVTy == Step->getType() &&
2745          "Val and Step should have the same type");
2746 
2747   // We build scalar steps for both integer and floating-point induction
2748   // variables. Here, we determine the kind of arithmetic we will perform.
2749   Instruction::BinaryOps AddOp;
2750   Instruction::BinaryOps MulOp;
2751   if (ScalarIVTy->isIntegerTy()) {
2752     AddOp = Instruction::Add;
2753     MulOp = Instruction::Mul;
2754   } else {
2755     AddOp = ID.getInductionOpcode();
2756     MulOp = Instruction::FMul;
2757   }
2758 
2759   // Determine the number of scalars we need to generate for each unroll
2760   // iteration. If EntryVal is uniform, we only need to generate the first
2761   // lane. Otherwise, we generate all VF values.
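       // E.g., with UF = 2 and VF = 4, a non-uniform EntryVal gets eight scalar
       // values ScalarIV + (4 * Part + Lane) * Step, while a uniform one only gets
       // the two lane-zero values ScalarIV + 4 * Part * Step.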
2762   unsigned Lanes =
2763       Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1
2764                                                                          : VF;
2765   // Compute the scalar steps and save the results in VectorLoopValueMap.
2766   for (unsigned Part = 0; Part < UF; ++Part) {
2767     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2768       auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
2769       auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
2770       auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
2771       VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
2772     }
2773   }
2774 }
2775 
2776 int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
2777   const ValueToValueMap &Strides =
2778       getSymbolicStrides() ? *getSymbolicStrides() : ValueToValueMap();
2779 
2780   int Stride = getPtrStride(PSE, Ptr, TheLoop, Strides, true, false);
2781   if (Stride == 1 || Stride == -1)
2782     return Stride;
2783   return 0;
2784 }
2785 
2786 bool LoopVectorizationLegality::isUniform(Value *V) {
2787   return LAI->isUniform(V);
2788 }
2789 
2790 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
2791   assert(V != Induction && "The new induction variable should not be used.");
2792   assert(!V->getType()->isVectorTy() && "Can't widen a vector");
2793   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2794 
2795   // If we have a stride that is replaced by one, do it here.
2796   if (Legal->hasStride(V))
2797     V = ConstantInt::get(V->getType(), 1);
2798 
2799   // If we have a vector mapped to this value, return it.
2800   if (VectorLoopValueMap.hasVectorValue(V, Part))
2801     return VectorLoopValueMap.getVectorValue(V, Part);
2802 
2803   // If the value has not been vectorized, check if it has been scalarized
2804   // instead. If it has been scalarized, and we actually need the value in
2805   // vector form, we will construct the vector values on demand.
2806   if (VectorLoopValueMap.hasAnyScalarValue(V)) {
2807     Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
2808 
2809     // If we've scalarized a value, that value should be an instruction.
2810     auto *I = cast<Instruction>(V);
2811 
2812     // If we aren't vectorizing, we can just copy the scalar map values over to
2813     // the vector map.
2814     if (VF == 1) {
2815       VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
2816       return ScalarValue;
2817     }
2818 
2819     // Get the last scalar instruction we generated for V and Part. If the value
2820     // is known to be uniform after vectorization, this corresponds to lane zero
2821     // of the Part unroll iteration. Otherwise, the last instruction is the one
2822     // we created for the last vector lane of the Part unroll iteration.
2823     unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
2824     auto *LastInst = cast<Instruction>(
2825         VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
2826 
2827     // Set the insert point after the last scalarized instruction. This ensures
2828     // the insertelement sequence will directly follow the scalar definitions.
2829     auto OldIP = Builder.saveIP();
2830     auto NewIP = std::next(BasicBlock::iterator(LastInst));
2831     Builder.SetInsertPoint(&*NewIP);
2832 
2833     // However, if we are vectorizing, we need to construct the vector values.
2834     // If the value is known to be uniform after vectorization, we can just
2835     // broadcast the scalar value corresponding to lane zero for each unroll
2836     // iteration. Otherwise, we construct the vector values using insertelement
2837     // instructions. Since the resulting vectors are stored in
2838     // VectorLoopValueMap, we will only generate the insertelements once.
2839     Value *VectorValue = nullptr;
2840     if (Cost->isUniformAfterVectorization(I, VF)) {
2841       VectorValue = getBroadcastInstrs(ScalarValue);
2842       VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
2843     } else {
2844       // Initialize packing with insertelements to start from undef.
2845       Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
2846       VectorLoopValueMap.setVectorValue(V, Part, Undef);
2847       for (unsigned Lane = 0; Lane < VF; ++Lane)
2848         packScalarIntoVectorValue(V, {Part, Lane});
2849       VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
2850     }
2851     Builder.restoreIP(OldIP);
2852     return VectorValue;
2853   }
2854 
2855   // If this scalar is unknown, assume that it is a constant or that it is
2856   // loop invariant. Broadcast V and save the value for future uses.
2857   Value *B = getBroadcastInstrs(V);
2858   VectorLoopValueMap.setVectorValue(V, Part, B);
2859   return B;
2860 }
2861 
2862 Value *
2863 InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
2864                                             const VPIteration &Instance) {
2865   // If the value is not an instruction contained in the loop, it should
2866   // already be scalar.
2867   if (OrigLoop->isLoopInvariant(V))
2868     return V;
2869 
2870   assert((Instance.Lane == 0 ||
2871           !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)) &&
2872          "Uniform values only have lane zero");
2873 
2874   // If the value from the original loop has not been vectorized, it is
2875   // represented by UF x VF scalar values in the new loop. Return the requested
2876   // scalar value.
2877   if (VectorLoopValueMap.hasScalarValue(V, Instance))
2878     return VectorLoopValueMap.getScalarValue(V, Instance);
2879 
2880   // If the value has not been scalarized, get its entry in VectorLoopValueMap
2881   // for the given unroll part. If this entry is not a vector type (i.e., the
2882   // vectorization factor is one), there is no need to generate an
2883   // extractelement instruction.
2884   auto *U = getOrCreateVectorValue(V, Instance.Part);
2885   if (!U->getType()->isVectorTy()) {
2886     assert(VF == 1 && "Value not scalarized has non-vector type");
2887     return U;
2888   }
2889 
2890   // Otherwise, the value from the original loop has been vectorized and is
2891   // represented by UF vector values. Extract and return the requested scalar
2892   // value from the appropriate vector lane.
2893   return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
2894 }
2895 
2896 void InnerLoopVectorizer::packScalarIntoVectorValue(
2897     Value *V, const VPIteration &Instance) {
2898   assert(V != Induction && "The new induction variable should not be used.");
2899   assert(!V->getType()->isVectorTy() && "Can't pack a vector");
2900   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2901 
2902   Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
2903   Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
2904   VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
2905                                             Builder.getInt32(Instance.Lane));
2906   VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
2907 }
2908 
2909 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2910   assert(Vec->getType()->isVectorTy() && "Invalid type");
2911   SmallVector<Constant *, 8> ShuffleMask;
2912   for (unsigned i = 0; i < VF; ++i)
2913     ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
2914 
2915   return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
2916                                      ConstantVector::get(ShuffleMask),
2917                                      "reverse");
2918 }
2919 
2920 // Try to vectorize the interleave group that \p Instr belongs to.
2921 //
2922 // E.g. Translate the following interleaved load group (factor = 3):
2923 //   for (i = 0; i < N; i+=3) {
2924 //     R = Pic[i];             // Member of index 0
2925 //     G = Pic[i+1];           // Member of index 1
2926 //     B = Pic[i+2];           // Member of index 2
2927 //     ... // do something to R, G, B
2928 //   }
2929 // To:
2930 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2931 //   %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9>   ; R elements
2932 //   %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10>  ; G elements
2933 //   %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11>  ; B elements
2934 //
2935 // Or translate the following interleaved store group (factor = 3):
2936 //   for (i = 0; i < N; i+=3) {
2937 //     ... do something to R, G, B
2938 //     Pic[i]   = R;           // Member of index 0
2939 //     Pic[i+1] = G;           // Member of index 1
2940 //     Pic[i+2] = B;           // Member of index 2
2941 //   }
2942 // To:
2943 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2944 //   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2945 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2946 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2947 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2948 void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
2949   const InterleaveGroup *Group = Legal->getInterleavedAccessGroup(Instr);
2950   assert(Group && "Fail to get an interleaved access group.");
2951 
2952   // Skip if the current instruction is not the insert position.
2953   if (Instr != Group->getInsertPos())
2954     return;
2955 
2956   const DataLayout &DL = Instr->getModule()->getDataLayout();
2957   Value *Ptr = getPointerOperand(Instr);
2958 
2959   // Prepare for the vector type of the interleaved load/store.
2960   Type *ScalarTy = getMemInstValueType(Instr);
2961   unsigned InterleaveFactor = Group->getFactor();
2962   Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
2963   Type *PtrTy = VecTy->getPointerTo(getMemInstAddressSpace(Instr));
2964 
2965   // Prepare for the new pointers.
2966   setDebugLocFromInst(Builder, Ptr);
2967   SmallVector<Value *, 2> NewPtrs;
2968   unsigned Index = Group->getIndex(Instr);
2969 
2970   // If the group is reverse, adjust the index to refer to the last vector lane
2971   // instead of the first. We adjust the index from the first vector lane,
2972   // rather than directly getting the pointer for lane VF - 1, because the
2973   // pointer operand of the interleaved access is supposed to be uniform. For
2974   // uniform instructions, we're only required to generate a value for the
2975   // first vector lane in each unroll iteration.
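       // E.g., for a reverse group with factor 3 and VF = 4, a member at index 1
       // becomes Index = 1 + 3 * 3 = 10, and the GEP below then steps back 10
       // elements from the lane-zero pointer.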
2976   if (Group->isReverse())
2977     Index += (VF - 1) * Group->getFactor();
2978 
2979   for (unsigned Part = 0; Part < UF; Part++) {
2980     Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0});
2981 
2982     // Note that the current instruction may be at any member index, so the
2983     // address must be adjusted back to the member of index 0.
2984     //
2985     // E.g.  a = A[i+1];     // Member of index 1 (current instruction)
2986     //       b = A[i];       // Member of index 0
2987     // The current pointer points to A[i+1]; adjust it to A[i].
2988     //
2989     // E.g.  A[i+1] = a;     // Member of index 1
2990     //       A[i]   = b;     // Member of index 0
2991     //       A[i+2] = c;     // Member of index 2 (current instruction)
2992     // The current pointer points to A[i+2]; adjust it to A[i].
2993     NewPtr = Builder.CreateGEP(NewPtr, Builder.getInt32(-Index));
2994 
2995     // Cast to the vector pointer type.
2996     NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy));
2997   }
2998 
2999   setDebugLocFromInst(Builder, Instr);
3000   Value *UndefVec = UndefValue::get(VecTy);
3001 
3002   // Vectorize the interleaved load group.
3003   if (isa<LoadInst>(Instr)) {
3004     // For each unroll part, create a wide load for the group.
3005     SmallVector<Value *, 2> NewLoads;
3006     for (unsigned Part = 0; Part < UF; Part++) {
3007       auto *NewLoad = Builder.CreateAlignedLoad(
3008           NewPtrs[Part], Group->getAlignment(), "wide.vec");
3009       addMetadata(NewLoad, Instr);
3010       NewLoads.push_back(NewLoad);
3011     }
3012 
3013     // For each member in the group, shuffle out the appropriate data from the
3014     // wide loads.
3015     for (unsigned I = 0; I < InterleaveFactor; ++I) {
3016       Instruction *Member = Group->getMember(I);
3017 
3018       // Skip the gaps in the group.
3019       if (!Member)
3020         continue;
3021 
3022       Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF);
3023       for (unsigned Part = 0; Part < UF; Part++) {
3024         Value *StridedVec = Builder.CreateShuffleVector(
3025             NewLoads[Part], UndefVec, StrideMask, "strided.vec");
3026 
3027         // If this member has a different type, cast the result to it.
3028         if (Member->getType() != ScalarTy) {
3029           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
3030           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
3031         }
3032 
3033         if (Group->isReverse())
3034           StridedVec = reverseVector(StridedVec);
3035 
3036         VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
3037       }
3038     }
3039     return;
3040   }
3041 
3042   // The subvector type for the current instruction.
3043   VectorType *SubVT = VectorType::get(ScalarTy, VF);
3044 
3045   // Vectorize the interleaved store group.
3046   for (unsigned Part = 0; Part < UF; Part++) {
3047     // Collect the stored vector from each member.
3048     SmallVector<Value *, 4> StoredVecs;
3049     for (unsigned i = 0; i < InterleaveFactor; i++) {
3050       // An interleaved store group doesn't allow gaps, so each index has a member.
3051       Instruction *Member = Group->getMember(i);
3052       assert(Member && "Fail to get a member from an interleaved store group");
3053 
3054       Value *StoredVec = getOrCreateVectorValue(
3055           cast<StoreInst>(Member)->getValueOperand(), Part);
3056       if (Group->isReverse())
3057         StoredVec = reverseVector(StoredVec);
3058 
3059       // If this member has a different type, cast it to a unified type.
3061       if (StoredVec->getType() != SubVT)
3062         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
3063 
3064       StoredVecs.push_back(StoredVec);
3065     }
3066 
3067     // Concatenate all vectors into a wide vector.
3068     Value *WideVec = concatenateVectors(Builder, StoredVecs);
3069 
3070     // Interleave the elements in the wide vector.
3071     Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor);
3072     Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
3073                                               "interleaved.vec");
3074 
3075     Instruction *NewStoreInstr =
3076         Builder.CreateAlignedStore(IVec, NewPtrs[Part], Group->getAlignment());
3077     addMetadata(NewStoreInstr, Instr);
3078   }
3079 }
3080 
3081 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
3082   // Attempt to issue a wide load.
3083   LoadInst *LI = dyn_cast<LoadInst>(Instr);
3084   StoreInst *SI = dyn_cast<StoreInst>(Instr);
3085 
3086   assert((LI || SI) && "Invalid Load/Store instruction");
3087 
3088   LoopVectorizationCostModel::InstWidening Decision =
3089       Cost->getWideningDecision(Instr, VF);
3090   assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
3091          "CM decision should be taken at this point");
3092   if (Decision == LoopVectorizationCostModel::CM_Interleave)
3093     return vectorizeInterleaveGroup(Instr);
3094 
3095   Type *ScalarDataTy = getMemInstValueType(Instr);
3096   Type *DataTy = VectorType::get(ScalarDataTy, VF);
3097   Value *Ptr = getPointerOperand(Instr);
3098   unsigned Alignment = getMemInstAlignment(Instr);
3099   // An alignment of 0 means target ABI alignment. We need to use the scalar's
3100   // target ABI alignment in such a case.
3101   const DataLayout &DL = Instr->getModule()->getDataLayout();
3102   if (!Alignment)
3103     Alignment = DL.getABITypeAlignment(ScalarDataTy);
3104   unsigned AddressSpace = getMemInstAddressSpace(Instr);
3105 
3106   // Determine if the pointer operand of the access is either consecutive or
3107   // reverse consecutive.
3108   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
3109   bool Reverse = ConsecutiveStride < 0;
3110   bool CreateGatherScatter =
3111       (Decision == LoopVectorizationCostModel::CM_GatherScatter);
3112 
3113   // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
3114   // gather/scatter. Otherwise Decision should have been to Scalarize.
3115   assert((ConsecutiveStride || CreateGatherScatter) &&
3116          "The instruction should be scalarized");
3117 
3118   // Handle consecutive loads/stores.
3119   if (ConsecutiveStride)
3120     Ptr = getOrCreateScalarValue(Ptr, {0, 0});
3121 
3122   VectorParts Mask = createBlockInMask(Instr->getParent());
3123   // Handle Stores:
3124   if (SI) {
3125     assert(!Legal->isUniform(SI->getPointerOperand()) &&
3126            "We do not allow storing to uniform addresses");
3127     setDebugLocFromInst(Builder, SI);
3128 
3129     for (unsigned Part = 0; Part < UF; ++Part) {
3130       Instruction *NewSI = nullptr;
3131       Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part);
3132       if (CreateGatherScatter) {
3133         Value *MaskPart = Legal->isMaskRequired(SI) ? Mask[Part] : nullptr;
3134         Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
3135         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
3136                                             MaskPart);
3137       } else {
3138         // Calculate the pointer for the specific unroll-part.
3139         Value *PartPtr =
3140             Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF));
3141 
3142         if (Reverse) {
3143           // If we store to reverse consecutive memory locations, then we need
3144           // to reverse the order of elements in the stored value.
3145           StoredVal = reverseVector(StoredVal);
3146           // We don't want to update the value in the map as it might be used in
3147           // another expression. So don't call resetVectorValue(StoredVal).
3148 
3149           // If the address is consecutive but reversed, then the
3150           // wide store needs to start at the last vector element.
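               // E.g., for VF = 4, part 0 then covers [Ptr - 3, Ptr] and part 1
               // covers [Ptr - 7, Ptr - 4], matching the reversed scalar access
               // order.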
3151           PartPtr =
3152               Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF));
3153           PartPtr =
3154               Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF));
3155           if (Mask[Part]) // The reverse of a null all-one mask is a null mask.
3156             Mask[Part] = reverseVector(Mask[Part]);
3157         }
3158 
3159         Value *VecPtr =
3160             Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
3161 
3162         if (Legal->isMaskRequired(SI) && Mask[Part])
3163           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
3164                                             Mask[Part]);
3165         else
3166           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
3167       }
3168       addMetadata(NewSI, SI);
3169     }
3170     return;
3171   }
3172 
3173   // Handle loads.
3174   assert(LI && "Must have a load instruction");
3175   setDebugLocFromInst(Builder, LI);
3176   for (unsigned Part = 0; Part < UF; ++Part) {
3177     Value *NewLI;
3178     if (CreateGatherScatter) {
3179       Value *MaskPart = Legal->isMaskRequired(LI) ? Mask[Part] : nullptr;
3180       Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
3181       NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
3182                                          nullptr, "wide.masked.gather");
3183       addMetadata(NewLI, LI);
3184     } else {
3185       // Calculate the pointer for the specific unroll-part.
3186       Value *PartPtr =
3187           Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF));
3188 
3189       if (Reverse) {
3190         // If the address is consecutive but reversed, then the
3191         // wide load needs to start at the last vector element.
3192         PartPtr = Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF));
3193         PartPtr = Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF));
3194         if (Mask[Part]) // The reverse of a null all-one mask is a null mask.
3195           Mask[Part] = reverseVector(Mask[Part]);
3196       }
3197 
3198       Value *VecPtr =
3199           Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
3200       if (Legal->isMaskRequired(LI) && Mask[Part])
3201         NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part],
3202                                          UndefValue::get(DataTy),
3203                                          "wide.masked.load");
3204       else
3205         NewLI = Builder.CreateAlignedLoad(VecPtr, Alignment, "wide.load");
3206 
3207       // Add metadata to the load, but setVectorValue to the reverse shuffle.
3208       addMetadata(NewLI, LI);
3209       if (Reverse)
3210         NewLI = reverseVector(NewLI);
3211     }
3212     VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
3213   }
3214 }
3215 
3216 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
3217                                                const VPIteration &Instance,
3218                                                bool IfPredicateInstr) {
3219   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
3220 
3221   setDebugLocFromInst(Builder, Instr);
3222 
3223   // Does this instruction return a value?
3224   bool IsVoidRetTy = Instr->getType()->isVoidTy();
3225 
3226   Instruction *Cloned = Instr->clone();
3227   if (!IsVoidRetTy)
3228     Cloned->setName(Instr->getName() + ".cloned");
3229 
3230   // Replace the operands of the cloned instructions with their scalar
3231   // equivalents in the new loop.
3232   for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
3233     auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance);
3234     Cloned->setOperand(op, NewOp);
3235   }
3236   addNewMetadata(Cloned, Instr);
3237 
3238   // Place the cloned scalar in the new loop.
3239   Builder.Insert(Cloned);
3240 
3241   // Add the cloned scalar to the scalar map entry.
3242   VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
3243 
3244   // If we just cloned a new assumption, add it the assumption cache.
3245   if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
3246     if (II->getIntrinsicID() == Intrinsic::assume)
3247       AC->registerAssumption(II);
3248 
3249   // End if-block.
3250   if (IfPredicateInstr)
3251     PredicatedInstructions.push_back(Cloned);
3252 }
3253 
3254 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
3255                                                       Value *End, Value *Step,
3256                                                       Instruction *DL) {
3257   BasicBlock *Header = L->getHeader();
3258   BasicBlock *Latch = L->getLoopLatch();
3259   // As we're just creating this loop, it's possible no latch exists
3260   // yet. If so, use the header as this will be a single block loop.
3261   if (!Latch)
3262     Latch = Header;
3263 
3264   IRBuilder<> Builder(&*Header->getFirstInsertionPt());
3265   Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
3266   setDebugLocFromInst(Builder, OldInst);
3267   auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
3268 
3269   Builder.SetInsertPoint(Latch->getTerminator());
3270   setDebugLocFromInst(Builder, OldInst);
3271 
3272   // Create i+1 and fill the PHINode.
3273   Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
3274   Induction->addIncoming(Start, L->getLoopPreheader());
3275   Induction->addIncoming(Next, Latch);
3276   // Create the compare.
3277   Value *ICmp = Builder.CreateICmpEQ(Next, End);
3278   Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
3279 
3280   // Now we have two terminators. Remove the old one from the block.
3281   Latch->getTerminator()->eraseFromParent();
3282 
3283   return Induction;
3284 }
3285 
3286 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
3287   if (TripCount)
3288     return TripCount;
3289 
3290   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
3291   // Find the loop boundaries.
3292   ScalarEvolution *SE = PSE.getSE();
3293   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
3294   assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
3295          "Invalid loop count");
3296 
3297   Type *IdxTy = Legal->getWidestInductionType();
3298 
3299   // The exit count might have type i64 while the phi is i32. This can happen
3300   // if we have an induction variable that is sign-extended before the compare.
3301   // The only way we can get a backedge-taken count in that case is if the
3302   // induction variable was signed, and as such it will not overflow, so the
3303   // truncation is legal.
3304   if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() >
3305       IdxTy->getPrimitiveSizeInBits())
3306     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
3307   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
3308 
3309   // Get the total trip count from the count by adding 1.
3310   const SCEV *ExitCount = SE->getAddExpr(
3311       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
3312 
3313   const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
3314 
3315   // Expand the trip count and place the new instructions in the preheader.
3316   // Notice that the pre-header does not change, only the loop body.
3317   SCEVExpander Exp(*SE, DL, "induction");
3318 
3319   // Count holds the overall loop count (N).
3320   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
3321                                 L->getLoopPreheader()->getTerminator());
3322 
3323   if (TripCount->getType()->isPointerTy())
3324     TripCount =
3325         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
3326                                     L->getLoopPreheader()->getTerminator());
3327 
3328   return TripCount;
3329 }
3330 
3331 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
3332   if (VectorTripCount)
3333     return VectorTripCount;
3334 
3335   Value *TC = getOrCreateTripCount(L);
3336   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
3337 
3338   // Now we need to generate the expression for the part of the loop that the
3339   // vectorized body will execute. This is equal to N - (N % Step) if scalar
3340   // iterations are not required for correctness, or N - Step, otherwise. Step
3341   // is equal to the vectorization factor (number of SIMD elements) times the
3342   // unroll factor (number of SIMD instructions).
3343   Constant *Step = ConstantInt::get(TC->getType(), VF * UF);
3344   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
3345 
3346   // If there is a non-reversed interleaved group that may speculatively access
3347   // memory out-of-bounds, we need to ensure that there will be at least one
3348   // iteration of the scalar epilogue loop. Thus, if the step evenly divides
3349   // the trip count, we set the remainder to be equal to the step. If the step
3350   // does not evenly divide the trip count, no adjustment is necessary since
3351   // there will already be scalar iterations. Note that the minimum iterations
3352   // check ensures that N >= Step.
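       // E.g., for a trip count of 37 and VF * UF = 8, the remainder is 5 and the
       // vector trip count is 32. With a required scalar epilogue and a trip count
       // of 40, the remainder is bumped from 0 to 8, again giving 32.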
3353   if (VF > 1 && Legal->requiresScalarEpilogue()) {
3354     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
3355     R = Builder.CreateSelect(IsZero, Step, R);
3356   }
3357 
3358   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
3359 
3360   return VectorTripCount;
3361 }
3362 
3363 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
3364                                                    const DataLayout &DL) {
3365   // Verify that V is a vector type with same number of elements as DstVTy.
3366   unsigned VF = DstVTy->getNumElements();
3367   VectorType *SrcVecTy = cast<VectorType>(V->getType());
3368   assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
3369   Type *SrcElemTy = SrcVecTy->getElementType();
3370   Type *DstElemTy = DstVTy->getElementType();
3371   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
3372          "Vector elements must have same size");
3373 
3374   // Do a direct cast if element types are castable.
3375   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
3376     return Builder.CreateBitOrPointerCast(V, DstVTy);
3377   }
3378   // V cannot be directly casted to desired vector type.
3379   // May happen when V is a floating point vector but DstVTy is a vector of
3380   // pointers or vice-versa. Handle this using a two-step bitcast using an
3381   // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
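       // E.g., on a target with 64-bit pointers, casting <4 x double> to <4 x i8*>
       // goes through <4 x i64>: a bitcast followed by an inttoptr.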
3382   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
3383          "Only one type should be a pointer type");
3384   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
3385          "Only one type should be a floating point type");
3386   Type *IntTy =
3387       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
3388   VectorType *VecIntTy = VectorType::get(IntTy, VF);
3389   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
3390   return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
3391 }
3392 
3393 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
3394                                                          BasicBlock *Bypass) {
3395   Value *Count = getOrCreateTripCount(L);
3396   BasicBlock *BB = L->getLoopPreheader();
3397   IRBuilder<> Builder(BB->getTerminator());
3398 
3399   // Generate code to check if the loop's trip count is less than VF * UF, or
3400   // equal to it in case a scalar epilogue is required; this implies that the
3401   // vector trip count is zero. This check also covers the case where adding one
3402   // to the backedge-taken count overflowed leading to an incorrect trip count
3403   // of zero. In this case we will also jump to the scalar loop.
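       // E.g., for VF = 4 and UF = 2 the check below is Count < 8 (or Count <= 8
       // when a scalar epilogue is required), and if it holds we branch to the
       // scalar loop.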
3404   auto P = Legal->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
3405                                            : ICmpInst::ICMP_ULT;
3406   Value *CheckMinIters = Builder.CreateICmp(
3407       P, Count, ConstantInt::get(Count->getType(), VF * UF), "min.iters.check");
3408 
3409   BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
3410   // Update dominator tree immediately if the generated block is a
3411   // LoopBypassBlock because SCEV expansions to generate loop bypass
3412   // checks may query it before the current function is finished.
3413   DT->addNewBlock(NewBB, BB);
3414   if (L->getParentLoop())
3415     L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
3416   ReplaceInstWithInst(BB->getTerminator(),
3417                       BranchInst::Create(Bypass, NewBB, CheckMinIters));
3418   LoopBypassBlocks.push_back(BB);
3419 }
3420 
3421 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
3422   BasicBlock *BB = L->getLoopPreheader();
3423 
3424   // Generate the code to check the SCEV assumptions that we made.
3425   // We want the new basic block to start at the first instruction in a
3426   // sequence of instructions that form a check.
3427   SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
3428                    "scev.check");
3429   Value *SCEVCheck =
3430       Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator());
3431 
3432   if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
3433     if (C->isZero())
3434       return;
3435 
3436   // Create a new block containing the SCEV check.
3437   BB->setName("vector.scevcheck");
3438   auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
3439   // Update dominator tree immediately if the generated block is a
3440   // LoopBypassBlock because SCEV expansions to generate loop bypass
3441   // checks may query it before the current function is finished.
3442   DT->addNewBlock(NewBB, BB);
3443   if (L->getParentLoop())
3444     L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
3445   ReplaceInstWithInst(BB->getTerminator(),
3446                       BranchInst::Create(Bypass, NewBB, SCEVCheck));
3447   LoopBypassBlocks.push_back(BB);
3448   AddedSafetyChecks = true;
3449 }
3450 
3451 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
3452   BasicBlock *BB = L->getLoopPreheader();
3453 
3454   // Generate the code that checks in runtime if arrays overlap. We put the
3455   // checks into a separate block to make the more common case of few elements
3456   // faster.
3457   Instruction *FirstCheckInst;
3458   Instruction *MemRuntimeCheck;
3459   std::tie(FirstCheckInst, MemRuntimeCheck) =
3460       Legal->getLAI()->addRuntimeChecks(BB->getTerminator());
3461   if (!MemRuntimeCheck)
3462     return;
3463 
3464   // Create a new block containing the memory check.
3465   BB->setName("vector.memcheck");
3466   auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
3467   // Update dominator tree immediately if the generated block is a
3468   // LoopBypassBlock because SCEV expansions to generate loop bypass
3469   // checks may query it before the current function is finished.
3470   DT->addNewBlock(NewBB, BB);
3471   if (L->getParentLoop())
3472     L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
3473   ReplaceInstWithInst(BB->getTerminator(),
3474                       BranchInst::Create(Bypass, NewBB, MemRuntimeCheck));
3475   LoopBypassBlocks.push_back(BB);
3476   AddedSafetyChecks = true;
3477 
3478   // We currently don't use LoopVersioning for the actual loop cloning but we
3479   // still use it to add the noalias metadata.
3480   LVer = llvm::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
3481                                            PSE.getSE());
3482   LVer->prepareNoAliasMetadata();
3483 }
3484 
3485 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3486   /*
3487    In this function we generate a new loop. The new loop will contain
3488    the vectorized instructions while the old loop will continue to run the
3489    scalar remainder.
3490 
3491        [ ] <-- loop iteration number check.
3492     /   |
3493    /    v
3494   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
3495   |  /  |
3496   | /   v
3497   ||   [ ]     <-- vector pre header.
3498   |/    |
3499   |     v
3500   |    [  ] \
3501   |    [  ]_|   <-- vector loop.
3502   |     |
3503   |     v
3504   |   -[ ]   <--- middle-block.
3505   |  /  |
3506   | /   v
3507   -|- >[ ]     <--- new preheader.
3508    |    |
3509    |    v
3510    |   [ ] \
3511    |   [ ]_|   <-- old scalar loop to handle remainder.
3512     \   |
3513      \  v
3514       >[ ]     <-- exit block.
3515    ...
3516    */
3517 
3518   BasicBlock *OldBasicBlock = OrigLoop->getHeader();
3519   BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
3520   BasicBlock *ExitBlock = OrigLoop->getExitBlock();
3521   assert(VectorPH && "Invalid loop structure");
3522   assert(ExitBlock && "Must have an exit block");
3523 
3524   // Some loops have a single integer induction variable, while other loops
3525   // don't. One example is C++ iterators, which often have multiple pointer
3526   // induction variables. In the code below we also support a case where we
3527   // don't have a single induction variable.
3528   //
3529   // We try to obtain an induction variable from the original loop as hard
3530   // as possible. However if we don't find one that:
3531   //   - is an integer
3532   //   - counts from zero, stepping by one
3533   //   - is the size of the widest induction variable type
3534   // then we create a new one.
3535   OldInduction = Legal->getPrimaryInduction();
3536   Type *IdxTy = Legal->getWidestInductionType();
3537 
3538   // Split the single block loop into the two loop structure described above.
3539   BasicBlock *VecBody =
3540       VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
3541   BasicBlock *MiddleBlock =
3542       VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
3543   BasicBlock *ScalarPH =
3544       MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
3545 
3546   // Create and register the new vector loop.
3547   Loop *Lp = LI->AllocateLoop();
3548   Loop *ParentLoop = OrigLoop->getParentLoop();
3549 
3550   // Insert the new loop into the loop nest and register the new basic blocks
3551   // before calling any utilities such as SCEV that require valid LoopInfo.
3552   if (ParentLoop) {
3553     ParentLoop->addChildLoop(Lp);
3554     ParentLoop->addBasicBlockToLoop(ScalarPH, *LI);
3555     ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI);
3556   } else {
3557     LI->addTopLevelLoop(Lp);
3558   }
3559   Lp->addBasicBlockToLoop(VecBody, *LI);
3560 
3561   // Find the loop boundaries.
3562   Value *Count = getOrCreateTripCount(Lp);
3563 
3564   Value *StartIdx = ConstantInt::get(IdxTy, 0);
3565 
3566   // Now, compare the new count to zero. If it is zero skip the vector loop and
3567   // jump to the scalar loop. This check also covers the case where the
3568   // backedge-taken count is uint##_max: adding one to it will overflow leading
3569   // to an incorrect trip count of zero. In this (rare) case we will also jump
3570   // to the scalar loop.
3571   emitMinimumIterationCountCheck(Lp, ScalarPH);
3572 
3573   // Generate the code to check any assumptions that we've made for SCEV
3574   // expressions.
3575   emitSCEVChecks(Lp, ScalarPH);
3576 
3577   // Generate the code that checks in runtime if arrays overlap. We put the
3578   // checks into a separate block to make the more common case of few elements
3579   // faster.
3580   emitMemRuntimeChecks(Lp, ScalarPH);
3581 
3582   // Generate the induction variable.
3583   // The loop step is equal to the vectorization factor (num of SIMD elements)
3584   // times the unroll factor (num of SIMD instructions).
3585   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
3586   Constant *Step = ConstantInt::get(IdxTy, VF * UF);
3587   Induction =
3588       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
3589                               getDebugLocFromInstOrOperands(OldInduction));
3590 
3591   // We are going to resume the execution of the scalar loop.
3592   // Go over all of the induction variables that we found and fix the
3593   // PHIs that are left in the scalar version of the loop.
3594   // The starting values of PHI nodes depend on the counter of the last
3595   // iteration in the vectorized loop.
3596   // If we come from a bypass edge then we need to start from the original
3597   // start value.
3598 
3599   // This variable saves the new starting index for the scalar loop. It is used
3600   // to test if there are any tail iterations left once the vector loop has
3601   // completed.
3602   LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
3603   for (auto &InductionEntry : *List) {
3604     PHINode *OrigPhi = InductionEntry.first;
3605     InductionDescriptor II = InductionEntry.second;
3606 
3607     // Create phi nodes to merge from the backedge-taken check block.
3608     PHINode *BCResumeVal = PHINode::Create(
3609         OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator());
3610     Value *&EndValue = IVEndValues[OrigPhi];
3611     if (OrigPhi == OldInduction) {
3612       // We know what the end value is.
3613       EndValue = CountRoundDown;
3614     } else {
3615       IRBuilder<> B(Lp->getLoopPreheader()->getTerminator());
3616       Type *StepType = II.getStep()->getType();
3617       Instruction::CastOps CastOp =
3618         CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
3619       Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
3620       const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
3621       EndValue = II.transform(B, CRD, PSE.getSE(), DL);
3622       EndValue->setName("ind.end");
3623     }
3624 
3625     // The new PHI merges the original incoming value, in case of a bypass,
3626     // or the value at the end of the vectorized loop.
3627     BCResumeVal->addIncoming(EndValue, MiddleBlock);
3628 
3629     // Fix the scalar body counter (PHI node).
3630     unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH);
3631 
3632     // The old induction's phi node in the scalar body needs the truncated
3633     // value.
3634     for (BasicBlock *BB : LoopBypassBlocks)
3635       BCResumeVal->addIncoming(II.getStartValue(), BB);
3636     OrigPhi->setIncomingValue(BlockIdx, BCResumeVal);
3637   }
3638 
3639   // Add a check in the middle block to see if we have completed
3640   // all of the iterations in the first vector loop.
3641   // If (N - N%VF) == N, then we *don't* need to run the remainder.
3642   Value *CmpN =
3643       CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
3644                       CountRoundDown, "cmp.n", MiddleBlock->getTerminator());
3645   ReplaceInstWithInst(MiddleBlock->getTerminator(),
3646                       BranchInst::Create(ExitBlock, ScalarPH, CmpN));
3647 
3648   // Get ready to start creating new instructions into the vectorized body.
3649   Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt());
3650 
3651   // Save the state.
3652   LoopVectorPreHeader = Lp->getLoopPreheader();
3653   LoopScalarPreHeader = ScalarPH;
3654   LoopMiddleBlock = MiddleBlock;
3655   LoopExitBlock = ExitBlock;
3656   LoopVectorBody = VecBody;
3657   LoopScalarBody = OldBasicBlock;
3658 
3659   // Keep all loop hints from the original loop on the vector loop (we'll
3660   // replace the vectorizer-specific hints below).
3661   if (MDNode *LID = OrigLoop->getLoopID())
3662     Lp->setLoopID(LID);
3663 
3664   LoopVectorizeHints Hints(Lp, true, *ORE);
3665   Hints.setAlreadyVectorized();
3666 
3667   return LoopVectorPreHeader;
3668 }
3669 
3670 // Fix up external users of the induction variable. At this point, we are
3671 // in LCSSA form, with all external PHIs that use the IV having one input value,
3672 // coming from the remainder loop. We need those PHIs to also have a correct
3673 // value for the IV when arriving directly from the middle block.
3674 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3675                                        const InductionDescriptor &II,
3676                                        Value *CountRoundDown, Value *EndValue,
3677                                        BasicBlock *MiddleBlock) {
3678   // There are two kinds of external IV usages - those that use the value
3679   // computed in the last iteration (the PHI) and those that use the penultimate
3680   // value (the value that feeds into the phi from the loop latch).
3681   // We allow both, but they, obviously, have different values.
3682 
3683   assert(OrigLoop->getExitBlock() && "Expected a single exit block");
3684 
3685   DenseMap<Value *, Value *> MissingVals;
3686 
3687   // An external user of the last iteration's value should see the value that
3688   // the remainder loop uses to initialize its own IV.
3689   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3690   for (User *U : PostInc->users()) {
3691     Instruction *UI = cast<Instruction>(U);
3692     if (!OrigLoop->contains(UI)) {
3693       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3694       MissingVals[UI] = EndValue;
3695     }
3696   }
3697 
3698   // An external user of the penultimate value needs to see EndValue - Step.
3699   // The simplest way to get this is to recompute it from the constituent SCEVs,
3700   // that is Start + (Step * (CRD - 1)).
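       // E.g., for an induction that starts at 0 and steps by 1, with
       // CountRoundDown = 32, the escaping penultimate value is 0 + 1 * (32 - 1) = 31.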
3701   for (User *U : OrigPhi->users()) {
3702     auto *UI = cast<Instruction>(U);
3703     if (!OrigLoop->contains(UI)) {
3704       const DataLayout &DL =
3705           OrigLoop->getHeader()->getModule()->getDataLayout();
3706       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3707 
3708       IRBuilder<> B(MiddleBlock->getTerminator());
3709       Value *CountMinusOne = B.CreateSub(
3710           CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3711       Value *CMO =
3712           !II.getStep()->getType()->isIntegerTy()
3713               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3714                              II.getStep()->getType())
3715               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3716       CMO->setName("cast.cmo");
3717       Value *Escape = II.transform(B, CMO, PSE.getSE(), DL);
3718       Escape->setName("ind.escape");
3719       MissingVals[UI] = Escape;
3720     }
3721   }
3722 
3723   for (auto &I : MissingVals) {
3724     PHINode *PHI = cast<PHINode>(I.first);
3725     // One corner case we have to handle is two IVs "chasing" each other,
3726     // that is %IV2 = phi [...], [ %IV1, %latch ]
3727     // In this case, if IV1 has an external use, we need to avoid adding both
3728     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3729     // don't already have an incoming value for the middle block.
3730     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3731       PHI->addIncoming(I.second, MiddleBlock);
3732   }
3733 }
3734 
3735 namespace {
3736 
3737 struct CSEDenseMapInfo {
3738   static bool canHandle(const Instruction *I) {
3739     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3740            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3741   }
3742 
3743   static inline Instruction *getEmptyKey() {
3744     return DenseMapInfo<Instruction *>::getEmptyKey();
3745   }
3746 
3747   static inline Instruction *getTombstoneKey() {
3748     return DenseMapInfo<Instruction *>::getTombstoneKey();
3749   }
3750 
3751   static unsigned getHashValue(const Instruction *I) {
3752     assert(canHandle(I) && "Unknown instruction!");
3753     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3754                                                            I->value_op_end()));
3755   }
3756 
3757   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3758     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3759         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3760       return LHS == RHS;
3761     return LHS->isIdenticalTo(RHS);
3762   }
3763 };
3764 
3765 } // end anonymous namespace
3766 
3767 /// \brief Perform CSE of induction variable instructions.
3768 static void cse(BasicBlock *BB) {
3769   // Perform simple CSE.
3770   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3771   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3772     Instruction *In = &*I++;
3773 
3774     if (!CSEDenseMapInfo::canHandle(In))
3775       continue;
3776 
3777     // Check if we can replace this instruction with any of the
3778     // visited instructions.
3779     if (Instruction *V = CSEMap.lookup(In)) {
3780       In->replaceAllUsesWith(V);
3781       In->eraseFromParent();
3782       continue;
3783     }
3784 
3785     CSEMap[In] = In;
3786   }
3787 }
3788 
3789 /// \brief Estimate the overhead of scalarizing an instruction. This is a
3790 /// convenience wrapper for the type-based getScalarizationOverhead API.
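/// For example (illustrative): scalarizing an instruction that produces a
/// <4 x i32> result accounts for inserting the 4 scalar results back into a
/// vector, plus extracting the needed scalar operands from their vector
/// values, unless the target handles these element accesses efficiently.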
3791 static unsigned getScalarizationOverhead(Instruction *I, unsigned VF,
3792                                          const TargetTransformInfo &TTI) {
3793   if (VF == 1)
3794     return 0;
3795 
3796   unsigned Cost = 0;
3797   Type *RetTy = ToVectorTy(I->getType(), VF);
3798   if (!RetTy->isVoidTy() &&
3799       (!isa<LoadInst>(I) ||
3800        !TTI.supportsEfficientVectorElementLoadStore()))
3801     Cost += TTI.getScalarizationOverhead(RetTy, true, false);
3802 
3803   if (CallInst *CI = dyn_cast<CallInst>(I)) {
3804     SmallVector<const Value *, 4> Operands(CI->arg_operands());
3805     Cost += TTI.getOperandsScalarizationOverhead(Operands, VF);
3806   }
3807   else if (!isa<StoreInst>(I) ||
3808            !TTI.supportsEfficientVectorElementLoadStore()) {
3809     SmallVector<const Value *, 4> Operands(I->operand_values());
3810     Cost += TTI.getOperandsScalarizationOverhead(Operands, VF);
3811   }
3812 
3813   return Cost;
3814 }
3815 
3816 // Estimate cost of a call instruction CI if it were vectorized with factor VF.
3817 // Return the cost of the instruction, including scalarization overhead if it's
// needed. The flag NeedToScalarize shows if the call needs to be scalarized,
// i.e. either a vector version isn't available or it is too expensive.
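// For example (illustrative numbers only): with VF = 4, a scalar call cost of
// 10 and a scalarization overhead of 8 give a scalarized cost of 4 * 10 + 8 =
// 48; a TLI-provided vector variant with a call cost of 20 would then be
// preferred and NeedToScalarize is cleared.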
3820 static unsigned getVectorCallCost(CallInst *CI, unsigned VF,
3821                                   const TargetTransformInfo &TTI,
3822                                   const TargetLibraryInfo *TLI,
3823                                   bool &NeedToScalarize) {
3824   Function *F = CI->getCalledFunction();
3825   StringRef FnName = CI->getCalledFunction()->getName();
3826   Type *ScalarRetTy = CI->getType();
3827   SmallVector<Type *, 4> Tys, ScalarTys;
3828   for (auto &ArgOp : CI->arg_operands())
3829     ScalarTys.push_back(ArgOp->getType());
3830 
3831   // Estimate cost of scalarized vector call. The source operands are assumed
3832   // to be vectors, so we need to extract individual elements from there,
3833   // execute VF scalar calls, and then gather the result into the vector return
3834   // value.
3835   unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
3836   if (VF == 1)
3837     return ScalarCallCost;
3838 
3839   // Compute corresponding vector type for return value and arguments.
3840   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3841   for (Type *ScalarTy : ScalarTys)
3842     Tys.push_back(ToVectorTy(ScalarTy, VF));
3843 
3844   // Compute costs of unpacking argument values for the scalar calls and
3845   // packing the return values to a vector.
3846   unsigned ScalarizationCost = getScalarizationOverhead(CI, VF, TTI);
3847 
3848   unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
3849 
3850   // If we can't emit a vector call for this function, then the currently found
3851   // cost is the cost we need to return.
3852   NeedToScalarize = true;
3853   if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin())
3854     return Cost;
3855 
3856   // If the corresponding vector cost is cheaper, return its cost.
3857   unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
3858   if (VectorCallCost < Cost) {
3859     NeedToScalarize = false;
3860     return VectorCallCost;
3861   }
3862   return Cost;
3863 }
3864 
3865 // Estimate cost of an intrinsic call instruction CI if it were vectorized with
3866 // factor VF.  Return the cost of the instruction, including scalarization
3867 // overhead if it's needed.
3868 static unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF,
3869                                        const TargetTransformInfo &TTI,
3870                                        const TargetLibraryInfo *TLI) {
3871   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3872   assert(ID && "Expected intrinsic call!");
3873 
3874   FastMathFlags FMF;
3875   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3876     FMF = FPMO->getFastMathFlags();
3877 
3878   SmallVector<Value *, 4> Operands(CI->arg_operands());
3879   return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF);
3880 }
3881 
3882 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3883   auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3884   auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3885   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3886 }
3887 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3888   auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3889   auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3890   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3891 }
3892 
3893 void InnerLoopVectorizer::truncateToMinimalBitwidths() {
3894   // For every instruction `I` in MinBWs, truncate the operands, create a
3895   // truncated version of `I` and reextend its result. InstCombine runs
3896   // later and will remove any ext/trunc pairs.
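  // For example (illustrative shorthand): an i32 add known to need only 8
  // bits becomes
  //   %a8 = trunc %a to <VF x i8>
  //   %b8 = trunc %b to <VF x i8>
  //   %r8 = add <VF x i8> %a8, %b8
  //   %r  = zext %r8 to <VF x i32>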
3897   SmallPtrSet<Value *, 4> Erased;
3898   for (const auto &KV : Cost->getMinimalBitwidths()) {
3899     // If the value wasn't vectorized, we must maintain the original scalar
3900     // type. The absence of the value from VectorLoopValueMap indicates that it
3901     // wasn't vectorized.
3902     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3903       continue;
3904     for (unsigned Part = 0; Part < UF; ++Part) {
3905       Value *I = getOrCreateVectorValue(KV.first, Part);
3906       if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3907         continue;
3908       Type *OriginalTy = I->getType();
3909       Type *ScalarTruncatedTy =
3910           IntegerType::get(OriginalTy->getContext(), KV.second);
3911       Type *TruncatedTy = VectorType::get(ScalarTruncatedTy,
3912                                           OriginalTy->getVectorNumElements());
3913       if (TruncatedTy == OriginalTy)
3914         continue;
3915 
3916       IRBuilder<> B(cast<Instruction>(I));
3917       auto ShrinkOperand = [&](Value *V) -> Value * {
3918         if (auto *ZI = dyn_cast<ZExtInst>(V))
3919           if (ZI->getSrcTy() == TruncatedTy)
3920             return ZI->getOperand(0);
3921         return B.CreateZExtOrTrunc(V, TruncatedTy);
3922       };
3923 
3924       // The actual instruction modification depends on the instruction type,
3925       // unfortunately.
3926       Value *NewI = nullptr;
3927       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3928         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3929                              ShrinkOperand(BO->getOperand(1)));
3930 
3931         // Any wrapping introduced by shrinking this operation shouldn't be
3932         // considered undefined behavior. So, we can't unconditionally copy
3933         // arithmetic wrapping flags to NewI.
3934         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3935       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3936         NewI =
3937             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3938                          ShrinkOperand(CI->getOperand(1)));
3939       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3940         NewI = B.CreateSelect(SI->getCondition(),
3941                               ShrinkOperand(SI->getTrueValue()),
3942                               ShrinkOperand(SI->getFalseValue()));
3943       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3944         switch (CI->getOpcode()) {
3945         default:
3946           llvm_unreachable("Unhandled cast!");
3947         case Instruction::Trunc:
3948           NewI = ShrinkOperand(CI->getOperand(0));
3949           break;
3950         case Instruction::SExt:
3951           NewI = B.CreateSExtOrTrunc(
3952               CI->getOperand(0),
3953               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3954           break;
3955         case Instruction::ZExt:
3956           NewI = B.CreateZExtOrTrunc(
3957               CI->getOperand(0),
3958               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3959           break;
3960         }
3961       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3962         auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements();
3963         auto *O0 = B.CreateZExtOrTrunc(
3964             SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3965         auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements();
3966         auto *O1 = B.CreateZExtOrTrunc(
3967             SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3968 
3969         NewI = B.CreateShuffleVector(O0, O1, SI->getMask());
3970       } else if (isa<LoadInst>(I)) {
3971         // Don't do anything with the operands, just extend the result.
3972         continue;
3973       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3974         auto Elements = IE->getOperand(0)->getType()->getVectorNumElements();
3975         auto *O0 = B.CreateZExtOrTrunc(
3976             IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3977         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3978         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3979       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3980         auto Elements = EE->getOperand(0)->getType()->getVectorNumElements();
3981         auto *O0 = B.CreateZExtOrTrunc(
3982             EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3983         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3984       } else {
3985         llvm_unreachable("Unhandled instruction type!");
3986       }
3987 
3988       // Lastly, extend the result.
3989       NewI->takeName(cast<Instruction>(I));
3990       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3991       I->replaceAllUsesWith(Res);
3992       cast<Instruction>(I)->eraseFromParent();
3993       Erased.insert(I);
3994       VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3995     }
3996   }
3997 
  // We'll have created a bunch of ZExts that are now dead. Clean them up.
3999   for (const auto &KV : Cost->getMinimalBitwidths()) {
4000     // If the value wasn't vectorized, we must maintain the original scalar
4001     // type. The absence of the value from VectorLoopValueMap indicates that it
4002     // wasn't vectorized.
4003     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
4004       continue;
4005     for (unsigned Part = 0; Part < UF; ++Part) {
4006       Value *I = getOrCreateVectorValue(KV.first, Part);
4007       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
4008       if (Inst && Inst->use_empty()) {
4009         Value *NewI = Inst->getOperand(0);
4010         Inst->eraseFromParent();
4011         VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
4012       }
4013     }
4014   }
4015 }
4016 
4017 void InnerLoopVectorizer::fixVectorizedLoop() {
4018   // Insert truncates and extends for any truncated instructions as hints to
4019   // InstCombine.
4020   if (VF > 1)
4021     truncateToMinimalBitwidths();
4022 
4023   // At this point every instruction in the original loop is widened to a
4024   // vector form. Now we need to fix the recurrences in the loop. These PHI
4025   // nodes are currently empty because we did not want to introduce cycles.
4026   // This is the second stage of vectorizing recurrences.
4027   fixCrossIterationPHIs();
4028 
4029   // Update the dominator tree.
4030   //
4031   // FIXME: After creating the structure of the new loop, the dominator tree is
4032   //        no longer up-to-date, and it remains that way until we update it
4033   //        here. An out-of-date dominator tree is problematic for SCEV,
4034   //        because SCEVExpander uses it to guide code generation. The
4035   //        vectorizer use SCEVExpanders in several places. Instead, we should
4036   //        keep the dominator tree up-to-date as we go.
4037   updateAnalysis();
4038 
4039   // Fix-up external users of the induction variables.
4040   for (auto &Entry : *Legal->getInductionVars())
4041     fixupIVUsers(Entry.first, Entry.second,
4042                  getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
4043                  IVEndValues[Entry.first], LoopMiddleBlock);
4044 
4045   fixLCSSAPHIs();
4046   for (Instruction *PI : PredicatedInstructions)
4047     sinkScalarOperands(&*PI);
4048 
4049   // Remove redundant induction instructions.
4050   cse(LoopVectorBody);
4051 }
4052 
4053 void InnerLoopVectorizer::fixCrossIterationPHIs() {
4054   // In order to support recurrences we need to be able to vectorize Phi nodes.
4055   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4056   // stage #2: We now need to fix the recurrences by adding incoming edges to
4057   // the currently empty PHI nodes. At this point every instruction in the
4058   // original loop is widened to a vector form so we can use them to construct
4059   // the incoming edges.
4060   for (Instruction &I : *OrigLoop->getHeader()) {
4061     PHINode *Phi = dyn_cast<PHINode>(&I);
4062     if (!Phi)
4063       break;
4064     // Handle first-order recurrences and reductions that need to be fixed.
4065     if (Legal->isFirstOrderRecurrence(Phi))
4066       fixFirstOrderRecurrence(Phi);
4067     else if (Legal->isReductionVariable(Phi))
4068       fixReduction(Phi);
4069   }
4070 }
4071 
4072 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
4073   // This is the second phase of vectorizing first-order recurrences. An
4074   // overview of the transformation is described below. Suppose we have the
4075   // following loop.
4076   //
4077   //   for (int i = 0; i < n; ++i)
4078   //     b[i] = a[i] - a[i - 1];
4079   //
4080   // There is a first-order recurrence on "a". For this loop, the shorthand
4081   // scalar IR looks like:
4082   //
4083   //   scalar.ph:
4084   //     s_init = a[-1]
4085   //     br scalar.body
4086   //
4087   //   scalar.body:
4088   //     i = phi [0, scalar.ph], [i+1, scalar.body]
4089   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
4090   //     s2 = a[i]
4091   //     b[i] = s2 - s1
4092   //     br cond, scalar.body, ...
4093   //
  // In this example, s1 is a recurrence because its value depends on the
4095   // previous iteration. In the first phase of vectorization, we created a
4096   // temporary value for s1. We now complete the vectorization and produce the
4097   // shorthand vector IR shown below (for VF = 4, UF = 1).
4098   //
4099   //   vector.ph:
4100   //     v_init = vector(..., ..., ..., a[-1])
4101   //     br vector.body
4102   //
4103   //   vector.body
4104   //     i = phi [0, vector.ph], [i+4, vector.body]
4105   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
4106   //     v2 = a[i, i+1, i+2, i+3];
4107   //     v3 = vector(v1(3), v2(0, 1, 2))
4108   //     b[i, i+1, i+2, i+3] = v2 - v3
4109   //     br cond, vector.body, middle.block
4110   //
4111   //   middle.block:
4112   //     x = v2(3)
4113   //     br scalar.ph
4114   //
4115   //   scalar.ph:
4116   //     s_init = phi [x, middle.block], [a[-1], otherwise]
4117   //     br scalar.body
4118   //
  // After the vector loop finishes executing, we extract the next value of
4120   // the recurrence (x) to use as the initial value in the scalar loop.
4121 
4122   // Get the original loop preheader and single loop latch.
4123   auto *Preheader = OrigLoop->getLoopPreheader();
4124   auto *Latch = OrigLoop->getLoopLatch();
4125 
4126   // Get the initial and previous values of the scalar recurrence.
4127   auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
4128   auto *Previous = Phi->getIncomingValueForBlock(Latch);
4129 
4130   // Create a vector from the initial value.
4131   auto *VectorInit = ScalarInit;
4132   if (VF > 1) {
4133     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4134     VectorInit = Builder.CreateInsertElement(
4135         UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
4136         Builder.getInt32(VF - 1), "vector.recur.init");
4137   }
4138 
4139   // We constructed a temporary phi node in the first phase of vectorization.
4140   // This phi node will eventually be deleted.
4141   Builder.SetInsertPoint(
4142       cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
4143 
4144   // Create a phi node for the new recurrence. The current value will either be
4145   // the initial value inserted into a vector or loop-varying vector value.
4146   auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
4147   VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
4148 
4149   // Get the vectorized previous value of the last part UF - 1. It appears last
4150   // among all unrolled iterations, due to the order of their construction.
4151   Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
4152 
4153   // Set the insertion point after the previous value if it is an instruction.
4154   // Note that the previous value may have been constant-folded so it is not
4155   // guaranteed to be an instruction in the vector loop. Also, if the previous
4156   // value is a phi node, we should insert after all the phi nodes to avoid
4157   // breaking basic block verification.
4158   if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart) ||
4159       isa<PHINode>(PreviousLastPart))
4160     Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
4161   else
4162     Builder.SetInsertPoint(
4163         &*++BasicBlock::iterator(cast<Instruction>(PreviousLastPart)));
4164 
4165   // We will construct a vector for the recurrence by combining the values for
4166   // the current and previous iterations. This is the required shuffle mask.
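  // For example, with VF = 4 the mask is <3, 4, 5, 6>: lane 3 of the first
  // vector followed by lanes 0, 1 and 2 of the second, i.e. v1(3), v2(0, 1, 2)
  // in the shorthand example above.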
4167   SmallVector<Constant *, 8> ShuffleMask(VF);
4168   ShuffleMask[0] = Builder.getInt32(VF - 1);
4169   for (unsigned I = 1; I < VF; ++I)
4170     ShuffleMask[I] = Builder.getInt32(I + VF - 1);
4171 
4172   // The vector from which to take the initial value for the current iteration
4173   // (actual or unrolled). Initially, this is the vector phi node.
4174   Value *Incoming = VecPhi;
4175 
4176   // Shuffle the current and previous vector and update the vector parts.
4177   for (unsigned Part = 0; Part < UF; ++Part) {
4178     Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
4179     Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
4180     auto *Shuffle =
4181         VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
4182                                              ConstantVector::get(ShuffleMask))
4183                : Incoming;
4184     PhiPart->replaceAllUsesWith(Shuffle);
4185     cast<Instruction>(PhiPart)->eraseFromParent();
4186     VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
4187     Incoming = PreviousPart;
4188   }
4189 
4190   // Fix the latch value of the new recurrence in the vector loop.
4191   VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4192 
4193   // Extract the last vector element in the middle block. This will be the
4194   // initial value for the recurrence when jumping to the scalar loop.
4195   auto *ExtractForScalar = Incoming;
4196   if (VF > 1) {
4197     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4198     ExtractForScalar = Builder.CreateExtractElement(
4199         ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
4200   }
  // Extract the second-to-last element in the middle block if the
4202   // Phi is used outside the loop. We need to extract the phi itself
4203   // and not the last element (the phi update in the current iteration). This
4204   // will be the value when jumping to the exit block from the LoopMiddleBlock,
4205   // when the scalar loop is not run at all.
4206   Value *ExtractForPhiUsedOutsideLoop = nullptr;
4207   if (VF > 1)
4208     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
4209         Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
  // When the loop is unrolled without vectorizing, initialize
  // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled
  // value of `Incoming`. This is analogous to the vectorized case above:
  // extracting the second-to-last element when VF > 1.
4214   else if (UF > 1)
4215     ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
4216 
4217   // Fix the initial value of the original recurrence in the scalar loop.
4218   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
4219   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
4220   for (auto *BB : predecessors(LoopScalarPreHeader)) {
4221     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
4222     Start->addIncoming(Incoming, BB);
4223   }
4224 
4225   Phi->setIncomingValue(Phi->getBasicBlockIndex(LoopScalarPreHeader), Start);
4226   Phi->setName("scalar.recur");
4227 
4228   // Finally, fix users of the recurrence outside the loop. The users will need
4229   // either the last value of the scalar recurrence or the last value of the
4230   // vector recurrence we extracted in the middle block. Since the loop is in
4231   // LCSSA form, we just need to find the phi node for the original scalar
4232   // recurrence in the exit block, and then add an edge for the middle block.
4233   for (auto &I : *LoopExitBlock) {
4234     auto *LCSSAPhi = dyn_cast<PHINode>(&I);
4235     if (!LCSSAPhi)
4236       break;
4237     if (LCSSAPhi->getIncomingValue(0) == Phi) {
4238       LCSSAPhi->addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
4239       break;
4240     }
4241   }
4242 }
4243 
4244 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
4245   Constant *Zero = Builder.getInt32(0);
4246 
  // Get its reduction variable descriptor.
4248   assert(Legal->isReductionVariable(Phi) &&
4249          "Unable to find the reduction variable");
4250   RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi];
4251 
4252   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
4253   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
4254   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
4255   RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
4256     RdxDesc.getMinMaxRecurrenceKind();
4257   setDebugLocFromInst(Builder, ReductionStartValue);
4258 
4259   // We need to generate a reduction vector from the incoming scalar.
4260   // To do so, we need to generate the 'identity' vector and override
4261   // one of the elements with the incoming scalar reduction. We need
4262   // to do it in the vector-loop preheader.
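  // For example (illustrative): for an integer add reduction with start value
  // %s and VF = 4, the vector start value is <%s, 0, 0, 0> and the identity
  // vector used for the remaining unroll parts is <0, 0, 0, 0>.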
4263   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4264 
4265   // This is the vector-clone of the value that leaves the loop.
4266   Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
4267 
  // Find the reduction identity value: zero for addition, or and xor; one for
  // multiplication; -1 for and.
4270   Value *Identity;
4271   Value *VectorStart;
4272   if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
4273       RK == RecurrenceDescriptor::RK_FloatMinMax) {
    // MinMax reductions have the start value as their identity.
4275     if (VF == 1) {
4276       VectorStart = Identity = ReductionStartValue;
4277     } else {
4278       VectorStart = Identity =
4279         Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
4280     }
4281   } else {
4282     // Handle other reduction kinds:
4283     Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
4284         RK, VecTy->getScalarType());
4285     if (VF == 1) {
4286       Identity = Iden;
4287       // This vector is the Identity vector where the first element is the
4288       // incoming scalar reduction.
4289       VectorStart = ReductionStartValue;
4290     } else {
4291       Identity = ConstantVector::getSplat(VF, Iden);
4292 
4293       // This vector is the Identity vector where the first element is the
4294       // incoming scalar reduction.
4295       VectorStart =
4296         Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
4297     }
4298   }
4299 
4300   // Fix the vector-loop phi.
4301 
4302   // Reductions do not have to start at zero. They can start with
4303   // any loop invariant values.
4304   BasicBlock *Latch = OrigLoop->getLoopLatch();
4305   Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
4306   for (unsigned Part = 0; Part < UF; ++Part) {
4307     Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
4308     Value *Val = getOrCreateVectorValue(LoopVal, Part);
    // Make sure to add the reduction start value only to the
4310     // first unroll part.
4311     Value *StartVal = (Part == 0) ? VectorStart : Identity;
4312     cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
4313     cast<PHINode>(VecRdxPhi)
4314       ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4315   }
4316 
4317   // Before each round, move the insertion point right between
4318   // the PHIs and the values we are going to write.
4319   // This allows us to write both PHINodes and the extractelement
4320   // instructions.
4321   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4322 
4323   setDebugLocFromInst(Builder, LoopExitInst);
4324 
4325   // If the vector reduction can be performed in a smaller type, we truncate
4326   // then extend the loop exit value to enable InstCombine to evaluate the
4327   // entire expression in the smaller type.
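  // For example (illustrative): an i32 add reduction whose values are known to
  // fit in i8 is truncated to <VF x i8> and immediately extended back, leaving
  // trunc/ext pairs that InstCombine can use to shrink the whole chain to i8.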
4328   if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
4329     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
4330     Builder.SetInsertPoint(
4331         LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
4332     VectorParts RdxParts(UF);
4333     for (unsigned Part = 0; Part < UF; ++Part) {
4334       RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
4335       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4336       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
4337                                         : Builder.CreateZExt(Trunc, VecTy);
4338       for (Value::user_iterator UI = RdxParts[Part]->user_begin();
4339            UI != RdxParts[Part]->user_end();)
4340         if (*UI != Trunc) {
4341           (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
4342           RdxParts[Part] = Extnd;
4343         } else {
4344           ++UI;
4345         }
4346     }
4347     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4348     for (unsigned Part = 0; Part < UF; ++Part) {
4349       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4350       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
4351     }
4352   }
4353 
4354   // Reduce all of the unrolled parts into a single vector.
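  // For example, with UF = 2 an add reduction first computes
  //   bin.rdx = rdx.part0 + rdx.part1
  // and the lanes of bin.rdx are then reduced horizontally below.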
4355   Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
4356   unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
4357   setDebugLocFromInst(Builder, ReducedPartRdx);
4358   for (unsigned Part = 1; Part < UF; ++Part) {
4359     Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
4360     if (Op != Instruction::ICmp && Op != Instruction::FCmp)
4361       // Floating point operations had to be 'fast' to enable the reduction.
4362       ReducedPartRdx = addFastMathFlag(
4363           Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
4364                               ReducedPartRdx, "bin.rdx"));
4365     else
4366       ReducedPartRdx = RecurrenceDescriptor::createMinMaxOp(
4367           Builder, MinMaxKind, ReducedPartRdx, RdxPart);
4368   }
4369 
4370   if (VF > 1) {
4371     bool NoNaN = Legal->hasFunNoNaNAttr();
4372     ReducedPartRdx =
4373         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
4374     // If the reduction can be performed in a smaller type, we need to extend
4375     // the reduction to the wider type before we branch to the original loop.
4376     if (Phi->getType() != RdxDesc.getRecurrenceType())
4377       ReducedPartRdx =
4378         RdxDesc.isSigned()
4379         ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
4380         : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
4381   }
4382 
4383   // Create a phi node that merges control-flow from the backedge-taken check
4384   // block and the middle block.
4385   PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
4386                                         LoopScalarPreHeader->getTerminator());
4387   for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
4388     BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
4389   BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
4390 
4391   // Now, we need to fix the users of the reduction variable
4392   // inside and outside of the scalar remainder loop.
4393   // We know that the loop is in LCSSA form. We need to update the
4394   // PHI nodes in the exit blocks.
4395   for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
4396          LEE = LoopExitBlock->end();
4397        LEI != LEE; ++LEI) {
4398     PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
4399     if (!LCSSAPhi)
4400       break;
4401 
4402     // All PHINodes need to have a single entry edge, or two if
4403     // we already fixed them.
4404     assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
4405 
4406     // We found a reduction value exit-PHI. Update it with the
4407     // incoming bypass edge.
4408     if (LCSSAPhi->getIncomingValue(0) == LoopExitInst)
4409       LCSSAPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
4410   } // end of the LCSSA phi scan.
4411 
  // Fix the scalar loop reduction variable with the incoming reduction sum
  // from the vector body and from the backedge value.
4414   int IncomingEdgeBlockIdx =
4415     Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4416   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4417   // Pick the other block.
4418   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4419   Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4420   Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4421 }
4422 
4423 void InnerLoopVectorizer::fixLCSSAPHIs() {
4424   for (Instruction &LEI : *LoopExitBlock) {
4425     auto *LCSSAPhi = dyn_cast<PHINode>(&LEI);
4426     if (!LCSSAPhi)
4427       break;
4428     if (LCSSAPhi->getNumIncomingValues() == 1) {
4429       assert(OrigLoop->isLoopInvariant(LCSSAPhi->getIncomingValue(0)) &&
4430              "Incoming value isn't loop invariant");
4431       LCSSAPhi->addIncoming(LCSSAPhi->getIncomingValue(0), LoopMiddleBlock);
4432     }
4433   }
4434 }
4435 
4436 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4437   // The basic block and loop containing the predicated instruction.
4438   auto *PredBB = PredInst->getParent();
4439   auto *VectorLoop = LI->getLoopFor(PredBB);
4440 
4441   // Initialize a worklist with the operands of the predicated instruction.
4442   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4443 
4444   // Holds instructions that we need to analyze again. An instruction may be
4445   // reanalyzed if we don't yet know if we can sink it or not.
4446   SmallVector<Instruction *, 8> InstsToReanalyze;
4447 
4448   // Returns true if a given use occurs in the predicated block. Phi nodes use
4449   // their operands in their corresponding predecessor blocks.
4450   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4451     auto *I = cast<Instruction>(U.getUser());
4452     BasicBlock *BB = I->getParent();
4453     if (auto *Phi = dyn_cast<PHINode>(I))
4454       BB = Phi->getIncomingBlock(
4455           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4456     return BB == PredBB;
4457   };
4458 
4459   // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, its
  // operands are then added to the worklist. The algorithm ends when one pass
  // through the worklist doesn't sink a single instruction.
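  // For example (illustrative): the scalarized address computation feeding a
  // predicated store can be moved into the predicated block when the store is
  // its only user, so it is not executed for masked-off lanes.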
4463   bool Changed;
4464   do {
4465     // Add the instructions that need to be reanalyzed to the worklist, and
4466     // reset the changed indicator.
4467     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4468     InstsToReanalyze.clear();
4469     Changed = false;
4470 
4471     while (!Worklist.empty()) {
4472       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4473 
4474       // We can't sink an instruction if it is a phi node, is already in the
4475       // predicated block, is not in the loop, or may have side effects.
4476       if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
4477           !VectorLoop->contains(I) || I->mayHaveSideEffects())
4478         continue;
4479 
4480       // It's legal to sink the instruction if all its uses occur in the
4481       // predicated block. Otherwise, there's nothing to do yet, and we may
4482       // need to reanalyze the instruction.
4483       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4484         InstsToReanalyze.push_back(I);
4485         continue;
4486       }
4487 
4488       // Move the instruction to the beginning of the predicated block, and add
      // its operands to the worklist.
4490       I->moveBefore(&*PredBB->getFirstInsertionPt());
4491       Worklist.insert(I->op_begin(), I->op_end());
4492 
4493       // The sinking may have enabled other instructions to be sunk, so we will
4494       // need to iterate.
4495       Changed = true;
4496     }
4497   } while (Changed);
4498 }
4499 
4500 InnerLoopVectorizer::VectorParts
4501 InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
4502   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
4503 
4504   // Look for cached value.
4505   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
4506   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
4507   if (ECEntryIt != EdgeMaskCache.end())
4508     return ECEntryIt->second;
4509 
4510   VectorParts SrcMask = createBlockInMask(Src);
4511 
4512   // The terminator has to be a branch inst!
4513   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
4514   assert(BI && "Unexpected terminator found");
4515 
4516   if (!BI->isConditional())
4517     return EdgeMaskCache[Edge] = SrcMask;
4518 
4519   VectorParts EdgeMask(UF);
4520   for (unsigned Part = 0; Part < UF; ++Part) {
4521     auto *EdgeMaskPart = getOrCreateVectorValue(BI->getCondition(), Part);
4522     if (BI->getSuccessor(0) != Dst)
4523       EdgeMaskPart = Builder.CreateNot(EdgeMaskPart);
4524 
4525     if (SrcMask[Part]) // Otherwise block in-mask is all-one, no need to AND.
4526       EdgeMaskPart = Builder.CreateAnd(EdgeMaskPart, SrcMask[Part]);
4527 
4528     EdgeMask[Part] = EdgeMaskPart;
4529   }
4530 
4531   return EdgeMaskCache[Edge] = EdgeMask;
4532 }
4533 
4534 InnerLoopVectorizer::VectorParts
4535 InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) {
4536   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
4537 
4538   // Look for cached value.
4539   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
4540   if (BCEntryIt != BlockMaskCache.end())
4541     return BCEntryIt->second;
4542 
4543   // All-one mask is modelled as no-mask following the convention for masked
4544   // load/store/gather/scatter. Initialize BlockMask to no-mask.
4545   VectorParts BlockMask(UF);
4546   for (unsigned Part = 0; Part < UF; ++Part)
4547     BlockMask[Part] = nullptr;
4548 
4549   // Loop incoming mask is all-one.
4550   if (OrigLoop->getHeader() == BB)
4551     return BlockMaskCache[BB] = BlockMask;
4552 
4553   // This is the block mask. We OR all incoming edges.
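  // For example, for a diamond where BB joins the 'then' and 'else' successors
  // of a branch on %c, the mask is conceptually
  //   (HeaderMask & %c) | (HeaderMask & !%c).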
4554   for (auto *Predecessor : predecessors(BB)) {
4555     VectorParts EdgeMask = createEdgeMask(Predecessor, BB);
4556     if (!EdgeMask[0]) // Mask of predecessor is all-one so mask of block is too.
4557       return BlockMaskCache[BB] = EdgeMask;
4558 
4559     if (!BlockMask[0]) { // BlockMask has its initialized nullptr value.
4560       BlockMask = EdgeMask;
4561       continue;
4562     }
4563 
4564     for (unsigned Part = 0; Part < UF; ++Part)
4565       BlockMask[Part] = Builder.CreateOr(BlockMask[Part], EdgeMask[Part]);
4566   }
4567 
4568   return BlockMaskCache[BB] = BlockMask;
4569 }
4570 
4571 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
4572                                               unsigned VF) {
4573   PHINode *P = cast<PHINode>(PN);
4574   // In order to support recurrences we need to be able to vectorize Phi nodes.
4575   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4576   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4577   // this value when we vectorize all of the instructions that use the PHI.
4578   if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
4579     for (unsigned Part = 0; Part < UF; ++Part) {
4580       // This is phase one of vectorizing PHIs.
4581       Type *VecTy =
4582           (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
4583       Value *EntryPart = PHINode::Create(
4584           VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4585       VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
4586     }
4587     return;
4588   }
4589 
4590   setDebugLocFromInst(Builder, P);
4591   // Check for PHI nodes that are lowered to vector selects.
4592   if (P->getParent() != OrigLoop->getHeader()) {
4593     // We know that all PHIs in non-header blocks are converted into
4594     // selects, so we don't have to worry about the insertion order and we
4595     // can just use the builder.
4596     // At this point we generate the predication tree. There may be
4597     // duplications since this is a simple recursive scan, but future
4598     // optimizations will clean it up.
4599 
4600     unsigned NumIncoming = P->getNumIncomingValues();
4601 
4602     // Generate a sequence of selects of the form:
4603     // SELECT(Mask3, In3,
4604     //      SELECT(Mask2, In2,
4605     //                   ( ...)))
4606     VectorParts Entry(UF);
4607     for (unsigned In = 0; In < NumIncoming; In++) {
4608       VectorParts Cond =
4609           createEdgeMask(P->getIncomingBlock(In), P->getParent());
4610 
4611       for (unsigned Part = 0; Part < UF; ++Part) {
4612         Value *In0 = getOrCreateVectorValue(P->getIncomingValue(In), Part);
4613         assert((Cond[Part] || NumIncoming == 1) &&
4614                "Multiple predecessors with one predecessor having a full mask");
4615         if (In == 0)
4616           Entry[Part] = In0; // Initialize with the first incoming value.
4617         else
4618           // Select between the current value and the previous incoming edge
4619           // based on the incoming mask.
4620           Entry[Part] = Builder.CreateSelect(Cond[Part], In0, Entry[Part],
4621                                              "predphi");
4622       }
4623     }
4624     for (unsigned Part = 0; Part < UF; ++Part)
4625       VectorLoopValueMap.setVectorValue(P, Part, Entry[Part]);
4626     return;
4627   }
4628 
4629   // This PHINode must be an induction variable.
4630   // Make sure that we know about it.
4631   assert(Legal->getInductionVars()->count(P) && "Not an induction variable");
4632 
4633   InductionDescriptor II = Legal->getInductionVars()->lookup(P);
4634   const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4635 
4636   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4637   // which can be found from the original scalar operations.
4638   switch (II.getKind()) {
4639   case InductionDescriptor::IK_NoInduction:
4640     llvm_unreachable("Unknown induction");
4641   case InductionDescriptor::IK_IntInduction:
4642   case InductionDescriptor::IK_FpInduction:
4643     llvm_unreachable("Integer/fp induction is handled elsewhere.");
4644   case InductionDescriptor::IK_PtrInduction: {
4645     // Handle the pointer induction variable case.
4646     assert(P->getType()->isPointerTy() && "Unexpected type.");
4647     // This is the normalized GEP that starts counting at zero.
4648     Value *PtrInd = Induction;
4649     PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
4650     // Determine the number of scalars we need to generate for each unroll
4651     // iteration. If the instruction is uniform, we only need to generate the
4652     // first lane. Otherwise, we generate all VF values.
4653     unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
4654     // These are the scalar results. Notice that we don't generate vector GEPs
4655     // because scalar GEPs result in better code.
4656     for (unsigned Part = 0; Part < UF; ++Part) {
4657       for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4658         Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
4659         Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4660         Value *SclrGep = II.transform(Builder, GlobalIdx, PSE.getSE(), DL);
4661         SclrGep->setName("next.gep");
4662         VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
4663       }
4664     }
4665     return;
4666   }
4667   }
4668 }
4669 
4670 /// A helper function for checking whether an integer division-related
4671 /// instruction may divide by zero (in which case it must be predicated if
4672 /// executed conditionally in the scalar code).
4673 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
4674 /// Non-zero divisors that are non compile-time constants will not be
4675 /// converted into multiplication, so we will still end up scalarizing
4676 /// the division, but can do so w/o predication.
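/// For example, 'a[i] / b[i]' executed under a condition must be predicated
/// because 'b[i]' may be zero, whereas 'a[i] / 7' can be scalarized without
/// predication.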
4677 static bool mayDivideByZero(Instruction &I) {
4678   assert((I.getOpcode() == Instruction::UDiv ||
4679           I.getOpcode() == Instruction::SDiv ||
4680           I.getOpcode() == Instruction::URem ||
4681           I.getOpcode() == Instruction::SRem) &&
4682          "Unexpected instruction");
4683   Value *Divisor = I.getOperand(1);
4684   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4685   return !CInt || CInt->isZero();
4686 }
4687 
4688 void InnerLoopVectorizer::widenInstruction(Instruction &I) {
4689   switch (I.getOpcode()) {
4690   case Instruction::Br:
4691   case Instruction::PHI:
4692     llvm_unreachable("This instruction is handled by a different recipe.");
4693   case Instruction::GetElementPtr: {
4694     // Construct a vector GEP by widening the operands of the scalar GEP as
4695     // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4696     // results in a vector of pointers when at least one operand of the GEP
4697     // is vector-typed. Thus, to keep the representation compact, we only use
4698     // vector-typed operands for loop-varying values.
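    // For example (illustrative shorthand): 'getelementptr %base, i64 %i' with
    // a loop-varying index %i keeps the loop-invariant %base scalar and uses a
    // vector of indices, producing one pointer per lane.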
4699     auto *GEP = cast<GetElementPtrInst>(&I);
4700 
4701     if (VF > 1 && OrigLoop->hasLoopInvariantOperands(GEP)) {
4702       // If we are vectorizing, but the GEP has only loop-invariant operands,
4703       // the GEP we build (by only using vector-typed operands for
4704       // loop-varying values) would be a scalar pointer. Thus, to ensure we
4705       // produce a vector of pointers, we need to either arbitrarily pick an
4706       // operand to broadcast, or broadcast a clone of the original GEP.
4707       // Here, we broadcast a clone of the original.
4708       //
4709       // TODO: If at some point we decide to scalarize instructions having
4710       //       loop-invariant operands, this special case will no longer be
4711       //       required. We would add the scalarization decision to
4712       //       collectLoopScalars() and teach getVectorValue() to broadcast
4713       //       the lane-zero scalar value.
4714       auto *Clone = Builder.Insert(GEP->clone());
4715       for (unsigned Part = 0; Part < UF; ++Part) {
4716         Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4717         VectorLoopValueMap.setVectorValue(&I, Part, EntryPart);
4718         addMetadata(EntryPart, GEP);
4719       }
4720     } else {
4721       // If the GEP has at least one loop-varying operand, we are sure to
4722       // produce a vector of pointers. But if we are only unrolling, we want
4723       // to produce a scalar GEP for each unroll part. Thus, the GEP we
4724       // produce with the code below will be scalar (if VF == 1) or vector
4725       // (otherwise). Note that for the unroll-only case, we still maintain
4726       // values in the vector mapping with initVector, as we do for other
4727       // instructions.
4728       for (unsigned Part = 0; Part < UF; ++Part) {
4729         // The pointer operand of the new GEP. If it's loop-invariant, we
4730         // won't broadcast it.
4731         auto *Ptr =
4732             OrigLoop->isLoopInvariant(GEP->getPointerOperand())
4733                 ? GEP->getPointerOperand()
4734                 : getOrCreateVectorValue(GEP->getPointerOperand(), Part);
4735 
4736         // Collect all the indices for the new GEP. If any index is
4737         // loop-invariant, we won't broadcast it.
4738         SmallVector<Value *, 4> Indices;
4739         for (auto &U : make_range(GEP->idx_begin(), GEP->idx_end())) {
4740           if (OrigLoop->isLoopInvariant(U.get()))
4741             Indices.push_back(U.get());
4742           else
4743             Indices.push_back(getOrCreateVectorValue(U.get(), Part));
4744         }
4745 
4746         // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4747         // but it should be a vector, otherwise.
4748         auto *NewGEP = GEP->isInBounds()
4749                            ? Builder.CreateInBoundsGEP(Ptr, Indices)
4750                            : Builder.CreateGEP(Ptr, Indices);
4751         assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
4752                "NewGEP is not a pointer vector");
4753         VectorLoopValueMap.setVectorValue(&I, Part, NewGEP);
4754         addMetadata(NewGEP, GEP);
4755       }
4756     }
4757 
4758     break;
4759   }
4760   case Instruction::UDiv:
4761   case Instruction::SDiv:
4762   case Instruction::SRem:
4763   case Instruction::URem:
4764   case Instruction::Add:
4765   case Instruction::FAdd:
4766   case Instruction::Sub:
4767   case Instruction::FSub:
4768   case Instruction::Mul:
4769   case Instruction::FMul:
4770   case Instruction::FDiv:
4771   case Instruction::FRem:
4772   case Instruction::Shl:
4773   case Instruction::LShr:
4774   case Instruction::AShr:
4775   case Instruction::And:
4776   case Instruction::Or:
4777   case Instruction::Xor: {
4778     // Just widen binops.
4779     auto *BinOp = cast<BinaryOperator>(&I);
4780     setDebugLocFromInst(Builder, BinOp);
4781 
4782     for (unsigned Part = 0; Part < UF; ++Part) {
4783       Value *A = getOrCreateVectorValue(BinOp->getOperand(0), Part);
4784       Value *B = getOrCreateVectorValue(BinOp->getOperand(1), Part);
4785       Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A, B);
4786 
4787       if (BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V))
4788         VecOp->copyIRFlags(BinOp);
4789 
4790       // Use this vector value for all users of the original instruction.
4791       VectorLoopValueMap.setVectorValue(&I, Part, V);
4792       addMetadata(V, BinOp);
4793     }
4794 
4795     break;
4796   }
4797   case Instruction::Select: {
4798     // Widen selects.
4799     // If the selector is loop invariant we can create a select
4800     // instruction with a scalar condition. Otherwise, use vector-select.
4801     auto *SE = PSE.getSE();
4802     bool InvariantCond =
4803         SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop);
4804     setDebugLocFromInst(Builder, &I);
4805 
    // The condition can be loop invariant but still defined inside the
4807     // loop. This means that we can't just use the original 'cond' value.
4808     // We have to take the 'vectorized' value and pick the first lane.
4809     // Instcombine will make this a no-op.
4810 
4811     auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0});
4812 
4813     for (unsigned Part = 0; Part < UF; ++Part) {
4814       Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part);
4815       Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part);
4816       Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part);
4817       Value *Sel =
4818           Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1);
4819       VectorLoopValueMap.setVectorValue(&I, Part, Sel);
4820       addMetadata(Sel, &I);
4821     }
4822 
4823     break;
4824   }
4825 
4826   case Instruction::ICmp:
4827   case Instruction::FCmp: {
4828     // Widen compares. Generate vector compares.
4829     bool FCmp = (I.getOpcode() == Instruction::FCmp);
4830     auto *Cmp = dyn_cast<CmpInst>(&I);
4831     setDebugLocFromInst(Builder, Cmp);
4832     for (unsigned Part = 0; Part < UF; ++Part) {
4833       Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part);
4834       Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part);
4835       Value *C = nullptr;
4836       if (FCmp) {
4837         // Propagate fast math flags.
4838         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4839         Builder.setFastMathFlags(Cmp->getFastMathFlags());
4840         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4841       } else {
4842         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4843       }
4844       VectorLoopValueMap.setVectorValue(&I, Part, C);
4845       addMetadata(C, &I);
4846     }
4847 
4848     break;
4849   }
4850 
4851   case Instruction::Store:
4852   case Instruction::Load:
4853     vectorizeMemoryInstruction(&I);
4854     break;
4855   case Instruction::ZExt:
4856   case Instruction::SExt:
4857   case Instruction::FPToUI:
4858   case Instruction::FPToSI:
4859   case Instruction::FPExt:
4860   case Instruction::PtrToInt:
4861   case Instruction::IntToPtr:
4862   case Instruction::SIToFP:
4863   case Instruction::UIToFP:
4864   case Instruction::Trunc:
4865   case Instruction::FPTrunc:
4866   case Instruction::BitCast: {
4867     auto *CI = dyn_cast<CastInst>(&I);
4868     setDebugLocFromInst(Builder, CI);
4869 
4870     /// Vectorize casts.
4871     Type *DestTy =
4872         (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
4873 
4874     for (unsigned Part = 0; Part < UF; ++Part) {
4875       Value *A = getOrCreateVectorValue(CI->getOperand(0), Part);
4876       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4877       VectorLoopValueMap.setVectorValue(&I, Part, Cast);
4878       addMetadata(Cast, &I);
4879     }
4880     break;
4881   }
4882 
4883   case Instruction::Call: {
4884     // Ignore dbg intrinsics.
4885     if (isa<DbgInfoIntrinsic>(I))
4886       break;
4887     setDebugLocFromInst(Builder, &I);
4888 
4889     Module *M = I.getParent()->getParent()->getParent();
4890     auto *CI = cast<CallInst>(&I);
4891 
4892     StringRef FnName = CI->getCalledFunction()->getName();
4893     Function *F = CI->getCalledFunction();
4894     Type *RetTy = ToVectorTy(CI->getType(), VF);
4895     SmallVector<Type *, 4> Tys;
4896     for (Value *ArgOperand : CI->arg_operands())
4897       Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
4898 
4899     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4900 
    // The flag shows whether we use an intrinsic or a regular call for the
    // vectorized version of the instruction, i.e. whether it is beneficial to
    // emit the intrinsic call rather than the library call.
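    // For example (illustrative): a call to 'sqrtf' may be widened either to
    // the 'llvm.sqrt' intrinsic on a <VF x float> or to a vector math routine
    // provided by TLI, whichever the cost comparison below prefers.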
4904     bool NeedToScalarize;
4905     unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize);
4906     bool UseVectorIntrinsic =
4907         ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost;
4908     assert((UseVectorIntrinsic || !NeedToScalarize) &&
4909            "Instruction should be scalarized elsewhere.");
4910 
4911     for (unsigned Part = 0; Part < UF; ++Part) {
4912       SmallVector<Value *, 4> Args;
4913       for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
4914         Value *Arg = CI->getArgOperand(i);
4915         // Some intrinsics have a scalar argument - don't replace it with a
4916         // vector.
4917         if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i))
4918           Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part);
4919         Args.push_back(Arg);
4920       }
4921 
4922       Function *VectorF;
4923       if (UseVectorIntrinsic) {
4924         // Use vector version of the intrinsic.
4925         Type *TysForDecl[] = {CI->getType()};
4926         if (VF > 1)
4927           TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4928         VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4929       } else {
4930         // Use vector version of the library call.
4931         StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
4932         assert(!VFnName.empty() && "Vector function name is empty.");
4933         VectorF = M->getFunction(VFnName);
4934         if (!VectorF) {
4935           // Generate a declaration
4936           FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
4937           VectorF =
4938               Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
4939           VectorF->copyAttributesFrom(F);
4940         }
4941       }
4942       assert(VectorF && "Can't create vector function.");
4943 
4944       SmallVector<OperandBundleDef, 1> OpBundles;
4945       CI->getOperandBundlesAsDefs(OpBundles);
4946       CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
4947 
4948       if (isa<FPMathOperator>(V))
4949         V->copyFastMathFlags(CI);
4950 
4951       VectorLoopValueMap.setVectorValue(&I, Part, V);
4952       addMetadata(V, &I);
4953     }
4954 
4955     break;
4956   }
4957 
4958   default:
4959     // All other instructions are scalarized.
4960     DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4961     llvm_unreachable("Unhandled instruction!");
4962   } // end of switch.
4963 }
4964 
4965 void InnerLoopVectorizer::updateAnalysis() {
4966   // Forget the original basic block.
4967   PSE.getSE()->forgetLoop(OrigLoop);
4968 
4969   // Update the dominator tree information.
4970   assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
4971          "Entry does not dominate exit.");
4972 
4973   DT->addNewBlock(LoopMiddleBlock,
4974                   LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4975   DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
4976   DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
4977   DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
4978   DEBUG(DT->verifyDomTree());
4979 }
4980 
4981 /// \brief Check whether it is safe to if-convert this phi node.
4982 ///
4983 /// Phi nodes with constant expressions that can trap are not safe to if
4984 /// convert.
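/// For example (illustrative), an incoming constant expression containing a
/// division whose divisor may be zero could trap if the phi were turned into
/// a select.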
4985 static bool canIfConvertPHINodes(BasicBlock *BB) {
4986   for (Instruction &I : *BB) {
4987     auto *Phi = dyn_cast<PHINode>(&I);
4988     if (!Phi)
4989       return true;
4990     for (Value *V : Phi->incoming_values())
4991       if (auto *C = dyn_cast<Constant>(V))
4992         if (C->canTrap())
4993           return false;
4994   }
4995   return true;
4996 }
4997 
4998 bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
4999   if (!EnableIfConversion) {
5000     ORE->emit(createMissedAnalysis("IfConversionDisabled")
5001               << "if-conversion is disabled");
5002     return false;
5003   }
5004 
5005   assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");
5006 
5007   // A list of pointers that we can safely read and write to.
  SmallPtrSet<Value *, 8> SafePointers;
5009 
5010   // Collect safe addresses.
5011   for (BasicBlock *BB : TheLoop->blocks()) {
5012     if (blockNeedsPredication(BB))
5013       continue;
5014 
5015     for (Instruction &I : *BB)
5016       if (auto *Ptr = getPointerOperand(&I))
        SafePointers.insert(Ptr);
5018   }
5019 
5020   // Collect the blocks that need predication.
5021   BasicBlock *Header = TheLoop->getHeader();
5022   for (BasicBlock *BB : TheLoop->blocks()) {
5023     // We don't support switch statements inside loops.
5024     if (!isa<BranchInst>(BB->getTerminator())) {
5025       ORE->emit(createMissedAnalysis("LoopContainsSwitch", BB->getTerminator())
5026                 << "loop contains a switch statement");
5027       return false;
5028     }
5029 
5030     // We must be able to predicate all blocks that need to be predicated.
5031     if (blockNeedsPredication(BB)) {
      if (!blockCanBePredicated(BB, SafePointers)) {
5033         ORE->emit(createMissedAnalysis("NoCFGForSelect", BB->getTerminator())
5034                   << "control flow cannot be substituted for a select");
5035         return false;
5036       }
5037     } else if (BB != Header && !canIfConvertPHINodes(BB)) {
5038       ORE->emit(createMissedAnalysis("NoCFGForSelect", BB->getTerminator())
5039                 << "control flow cannot be substituted for a select");
5040       return false;
5041     }
5042   }
5043 
5044   // We can if-convert this loop.
5045   return true;
5046 }
5047 
5048 bool LoopVectorizationLegality::canVectorize() {
5049   // Store the result and return it at the end instead of exiting early, in case
5050   // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
5051   bool Result = true;
5052 
  bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
  // We must have a loop in canonical form. Loops with indirectbr in them cannot
  // be canonicalized.
  if (!TheLoop->getLoopPreheader()) {
    ORE->emit(createMissedAnalysis("CFGNotUnderstood")
              << "loop control flow is not understood by vectorizer");
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }
5065 
  // FIXME: This code is currently dead, since any loop that gets sent to
  // LoopVectorizationLegality is already an innermost loop.
5068   //
5069   // We can only vectorize innermost loops.
5070   if (!TheLoop->empty()) {
5071     ORE->emit(createMissedAnalysis("NotInnermostLoop")
5072               << "loop is not the innermost loop");
5073     if (DoExtraAnalysis)
5074       Result = false;
5075     else
5076       return false;
5077   }
5078 
5079   // We must have a single backedge.
5080   if (TheLoop->getNumBackEdges() != 1) {
5081     ORE->emit(createMissedAnalysis("CFGNotUnderstood")
5082               << "loop control flow is not understood by vectorizer");
5083     if (DoExtraAnalysis)
5084       Result = false;
5085     else
5086       return false;
5087   }
5088 
5089   // We must have a single exiting block.
5090   if (!TheLoop->getExitingBlock()) {
5091     ORE->emit(createMissedAnalysis("CFGNotUnderstood")
5092               << "loop control flow is not understood by vectorizer");
5093     if (DoExtraAnalysis)
5094       Result = false;
5095     else
5096       return false;
5097   }
5098 
  // We only handle bottom-tested loops, i.e., loops in which the condition is
5100   // checked at the end of each iteration. With that we can assume that all
5101   // instructions in the loop are executed the same number of times.
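  // (Loop rotation typically produces such loops, with the exit test in the
  // latch block.)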
5102   if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
5103     ORE->emit(createMissedAnalysis("CFGNotUnderstood")
5104               << "loop control flow is not understood by vectorizer");
5105     if (DoExtraAnalysis)
5106       Result = false;
5107     else
5108       return false;
5109   }
5110 
5111   // We need to have a loop header.
5112   DEBUG(dbgs() << "LV: Found a loop: " << TheLoop->getHeader()->getName()
5113                << '\n');
5114 
5115   // Check if we can if-convert non-single-bb loops.
5116   unsigned NumBlocks = TheLoop->getNumBlocks();
5117   if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
5118     DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
5119     if (DoExtraAnalysis)
5120       Result = false;
5121     else
5122       return false;
5123   }
5124 
5125   // Check if we can vectorize the instructions and CFG in this loop.
5126   if (!canVectorizeInstrs()) {
5127     DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n");
5128     if (DoExtraAnalysis)
5129       Result = false;
5130     else
5131       return false;
5132   }
5133 
5134   // Go over each instruction and look at memory deps.
5135   if (!canVectorizeMemory()) {
5136     DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n");
5137     if (DoExtraAnalysis)
5138       Result = false;
5139     else
5140       return false;
5141   }
5142 
5143   DEBUG(dbgs() << "LV: We can vectorize this loop"
5144                << (LAI->getRuntimePointerChecking()->Need
5145                        ? " (with a runtime bound check)"
5146                        : "")
5147                << "!\n");
5148 
5149   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
5150 
5151   // If an override option has been passed in for interleaved accesses, use it.
5152   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
5153     UseInterleaved = EnableInterleavedMemAccesses;
5154 
5155   // Analyze interleaved memory accesses.
5156   if (UseInterleaved)
5157     InterleaveInfo.analyzeInterleaving(*getSymbolicStrides());
5158 
5159   unsigned SCEVThreshold = VectorizeSCEVCheckThreshold;
5160   if (Hints->getForce() == LoopVectorizeHints::FK_Enabled)
5161     SCEVThreshold = PragmaVectorizeSCEVCheckThreshold;
5162 
5163   if (PSE.getUnionPredicate().getComplexity() > SCEVThreshold) {
5164     ORE->emit(createMissedAnalysis("TooManySCEVRunTimeChecks")
5165               << "Too many SCEV assumptions need to be made and checked "
5166               << "at runtime");
5167     DEBUG(dbgs() << "LV: Too many SCEV checks needed.\n");
5168     if (DoExtraAnalysis)
5169       Result = false;
5170     else
5171       return false;
5172   }
5173 
5174   // Okay! We've done all the tests. If any have failed, return false. Otherwise
5175   // we can vectorize, and at this point we don't have any other mem analysis
5176   // which may limit our maximum vectorization factor, so just return true with
5177   // no restrictions.
5178   return Result;
5179 }
5180 
5181 static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) {
5182   if (Ty->isPointerTy())
5183     return DL.getIntPtrType(Ty);
5184 
  // It is possible that chars or shorts overflow when we ask for the loop's
  // trip count; work around this by changing the type size.
5187   if (Ty->getScalarSizeInBits() < 32)
5188     return Type::getInt32Ty(Ty->getContext());
5189 
5190   return Ty;
5191 }
5192 
5193 static Type *getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) {
5194   Ty0 = convertPointerToIntegerType(DL, Ty0);
5195   Ty1 = convertPointerToIntegerType(DL, Ty1);
5196   if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits())
5197     return Ty0;
5198   return Ty1;
5199 }
5200 
5201 /// \brief Check that the instruction has outside loop users and is not an
5202 /// identified reduction variable.
5203 static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,
5204                                SmallPtrSetImpl<Value *> &AllowedExit) {
5205   // Reduction and Induction instructions are allowed to have exit users. All
5206   // other instructions must not have external users.
5207   if (!AllowedExit.count(Inst))
5208     // Check that all of the users of the loop are inside the BB.
5209     for (User *U : Inst->users()) {
5210       Instruction *UI = cast<Instruction>(U);
5211       // This user may be a reduction exit value.
5212       if (!TheLoop->contains(UI)) {
        DEBUG(dbgs() << "LV: Found an outside user for: " << *UI << '\n');
5214         return true;
5215       }
5216     }
5217   return false;
5218 }
5219 
5220 void LoopVectorizationLegality::addInductionPhi(
5221     PHINode *Phi, const InductionDescriptor &ID,
5222     SmallPtrSetImpl<Value *> &AllowedExit) {
5223   Inductions[Phi] = ID;
5224   Type *PhiTy = Phi->getType();
5225   const DataLayout &DL = Phi->getModule()->getDataLayout();
5226 
5227   // Get the widest type.
5228   if (!PhiTy->isFloatingPointTy()) {
5229     if (!WidestIndTy)
5230       WidestIndTy = convertPointerToIntegerType(DL, PhiTy);
5231     else
5232       WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy);
5233   }
5234 
5235   // Int inductions are special because we only allow one IV.
5236   if (ID.getKind() == InductionDescriptor::IK_IntInduction &&
5237       ID.getConstIntStepValue() &&
5238       ID.getConstIntStepValue()->isOne() &&
5239       isa<Constant>(ID.getStartValue()) &&
5240       cast<Constant>(ID.getStartValue())->isNullValue()) {
5241 
5242     // Use the phi node with the widest type as induction. Use the last
5243     // one if there are multiple (no good reason for doing this other
5244     // than it is expedient). We've checked that it begins at zero and
5245     // steps by one, so this is a canonical induction variable.
5246     if (!PrimaryInduction || PhiTy == WidestIndTy)
5247       PrimaryInduction = Phi;
5248   }
5249 
5250   // Both the PHI node itself, and the "post-increment" value feeding
5251   // back into the PHI node may have external users.
5252   // We can allow those uses, except if the SCEVs we have for them rely
5253   // on predicates that only hold within the loop, since allowing the exit
5254   // currently means re-using this SCEV outside the loop.
5255   if (PSE.getUnionPredicate().isAlwaysTrue()) {
5256     AllowedExit.insert(Phi);
5257     AllowedExit.insert(Phi->getIncomingValueForBlock(TheLoop->getLoopLatch()));
5258   }
5259 
5260   DEBUG(dbgs() << "LV: Found an induction variable.\n");
5261 }
5262 
5263 bool LoopVectorizationLegality::canVectorizeInstrs() {
5264   BasicBlock *Header = TheLoop->getHeader();
5265 
5266   // Look for the attribute signaling the absence of NaNs.
5267   Function &F = *Header->getParent();
5268   HasFunNoNaNAttr =
5269       F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true";
5270 
5271   // For each block in the loop.
5272   for (BasicBlock *BB : TheLoop->blocks()) {
5273     // Scan the instructions in the block and look for hazards.
5274     for (Instruction &I : *BB) {
5275       if (auto *Phi = dyn_cast<PHINode>(&I)) {
5276         Type *PhiTy = Phi->getType();
5277         // Check that this PHI type is allowed.
5278         if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() &&
5279             !PhiTy->isPointerTy()) {
5280           ORE->emit(createMissedAnalysis("CFGNotUnderstood", Phi)
5281                     << "loop control flow is not understood by vectorizer");
          DEBUG(dbgs() << "LV: Found a non-int non-pointer PHI.\n");
5283           return false;
5284         }
5285 
5286         // If this PHINode is not in the header block, then we know that we
5287         // can convert it to select during if-conversion. No need to check if
5288         // the PHIs in this block are induction or reduction variables.
5289         if (BB != Header) {
5290           // Check that this instruction has no outside users or is an
5291           // identified reduction value with an outside user.
5292           if (!hasOutsideLoopUser(TheLoop, Phi, AllowedExit))
5293             continue;
5294           ORE->emit(createMissedAnalysis("NeitherInductionNorReduction", Phi)
5295                     << "value could not be identified as "
5296                        "an induction or reduction variable");
5297           return false;
5298         }
5299 
5300         // We only allow if-converted PHIs with exactly two incoming values.
5301         if (Phi->getNumIncomingValues() != 2) {
5302           ORE->emit(createMissedAnalysis("CFGNotUnderstood", Phi)
5303                     << "control flow not understood by vectorizer");
5304           DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
5305           return false;
5306         }
5307 
5308         RecurrenceDescriptor RedDes;
5309         if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes)) {
5310           if (RedDes.hasUnsafeAlgebra())
5311             Requirements->addUnsafeAlgebraInst(RedDes.getUnsafeAlgebraInst());
5312           AllowedExit.insert(RedDes.getLoopExitInstr());
5313           Reductions[Phi] = RedDes;
5314           continue;
5315         }
5316 
5317         InductionDescriptor ID;
5318         if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID)) {
5319           addInductionPhi(Phi, ID, AllowedExit);
5320           if (ID.hasUnsafeAlgebra() && !HasFunNoNaNAttr)
5321             Requirements->addUnsafeAlgebraInst(ID.getUnsafeAlgebraInst());
5322           continue;
5323         }
5324 
5325         if (RecurrenceDescriptor::isFirstOrderRecurrence(Phi, TheLoop,
5326                                                          SinkAfter, DT)) {
5327           FirstOrderRecurrences.insert(Phi);
5328           continue;
5329         }
5330 
        // As a last resort, coerce the PHI to an AddRec expression
        // and re-try classifying it as an induction PHI.
5333         if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID, true)) {
5334           addInductionPhi(Phi, ID, AllowedExit);
5335           continue;
5336         }
5337 
5338         ORE->emit(createMissedAnalysis("NonReductionValueUsedOutsideLoop", Phi)
5339                   << "value that could not be identified as "
5340                      "reduction is used outside the loop");
        DEBUG(dbgs() << "LV: Found an unidentified PHI: " << *Phi << "\n");
5342         return false;
5343       } // end of PHI handling
5344 
5345       // We handle calls that:
5346       //   * Are debug info intrinsics.
5347       //   * Have a mapping to an IR intrinsic.
5348       //   * Have a vector version available.
5349       auto *CI = dyn_cast<CallInst>(&I);
5350       if (CI && !getVectorIntrinsicIDForCall(CI, TLI) &&
5351           !isa<DbgInfoIntrinsic>(CI) &&
5352           !(CI->getCalledFunction() && TLI &&
5353             TLI->isFunctionVectorizable(CI->getCalledFunction()->getName()))) {
5354         ORE->emit(createMissedAnalysis("CantVectorizeCall", CI)
5355                   << "call instruction cannot be vectorized");
5356         DEBUG(dbgs() << "LV: Found a non-intrinsic, non-libfunc callsite.\n");
5357         return false;
5358       }
5359 
      // Intrinsics such as powi, cttz and ctlz are legal to vectorize if the
      // second argument is the same (i.e., loop invariant).
5362       if (CI && hasVectorInstrinsicScalarOpd(
5363                     getVectorIntrinsicIDForCall(CI, TLI), 1)) {
5364         auto *SE = PSE.getSE();
5365         if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(1)), TheLoop)) {
5366           ORE->emit(createMissedAnalysis("CantVectorizeIntrinsic", CI)
5367                     << "intrinsic instruction cannot be vectorized");
5368           DEBUG(dbgs() << "LV: Found unvectorizable intrinsic " << *CI << "\n");
5369           return false;
5370         }
5371       }
5372 
5373       // Check that the instruction return type is vectorizable.
5374       // Also, we can't vectorize extractelement instructions.
5375       if ((!VectorType::isValidElementType(I.getType()) &&
5376            !I.getType()->isVoidTy()) ||
5377           isa<ExtractElementInst>(I)) {
5378         ORE->emit(createMissedAnalysis("CantVectorizeInstructionReturnType", &I)
5379                   << "instruction return type cannot be vectorized");
5380         DEBUG(dbgs() << "LV: Found unvectorizable type.\n");
5381         return false;
5382       }
5383 
5384       // Check that the stored type is vectorizable.
5385       if (auto *ST = dyn_cast<StoreInst>(&I)) {
5386         Type *T = ST->getValueOperand()->getType();
5387         if (!VectorType::isValidElementType(T)) {
5388           ORE->emit(createMissedAnalysis("CantVectorizeStore", ST)
5389                     << "store instruction cannot be vectorized");
5390           return false;
5391         }
5392 
5393         // FP instructions can allow unsafe algebra, thus vectorizable by
5394         // non-IEEE-754 compliant SIMD units.
5395         // This applies to floating-point math operations and calls, not memory
5396         // operations, shuffles, or casts, as they don't change precision or
5397         // semantics.
5398       } else if (I.getType()->isFloatingPointTy() && (CI || I.isBinaryOp()) &&
5399                  !I.isFast()) {
5400         DEBUG(dbgs() << "LV: Found FP op with unsafe algebra.\n");
5401         Hints->setPotentiallyUnsafe();
5402       }
5403 
5404       // Reduction instructions are allowed to have exit users.
5405       // All other instructions must not have external users.
5406       if (hasOutsideLoopUser(TheLoop, &I, AllowedExit)) {
5407         ORE->emit(createMissedAnalysis("ValueUsedOutsideLoop", &I)
5408                   << "value cannot be used outside the loop");
5409         return false;
5410       }
5411     } // next instr.
5412   }
5413 
5414   if (!PrimaryInduction) {
5415     DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
5416     if (Inductions.empty()) {
5417       ORE->emit(createMissedAnalysis("NoInductionVariable")
5418                 << "loop induction variable could not be identified");
5419       return false;
5420     }
5421   }
5422 
5423   // Now we know the widest induction type, check if our found induction
5424   // is the same size. If it's not, unset it here and InnerLoopVectorizer
5425   // will create another.
5426   if (PrimaryInduction && WidestIndTy != PrimaryInduction->getType())
5427     PrimaryInduction = nullptr;
5428 
5429   return true;
5430 }
5431 
5432 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
5433   // We should not collect Scalars more than once per VF. Right now, this
5434   // function is called from collectUniformsAndScalars(), which already does
5435   // this check. Collecting Scalars for VF=1 does not make any sense.
5436   assert(VF >= 2 && !Scalars.count(VF) &&
5437          "This function should not be visited twice for the same VF");
5438 
5439   SmallSetVector<Instruction *, 8> Worklist;
5440 
5441   // These sets are used to seed the analysis with pointers used by memory
5442   // accesses that will remain scalar.
5443   SmallSetVector<Instruction *, 8> ScalarPtrs;
5444   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
5445 
5446   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
5447   // The pointer operands of loads and stores will be scalar as long as the
5448   // memory access is not a gather or scatter operation. The value operand of a
5449   // store will remain scalar if the store is scalarized.
5450   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
5451     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
5452     assert(WideningDecision != CM_Unknown &&
5453            "Widening decision should be ready at this moment");
5454     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
5455       if (Ptr == Store->getValueOperand())
5456         return WideningDecision == CM_Scalarize;
5457     assert(Ptr == getPointerOperand(MemAccess) &&
           "Ptr is neither a value nor a pointer operand");
5459     return WideningDecision != CM_GatherScatter;
5460   };
5461 
5462   // A helper that returns true if the given value is a bitcast or
5463   // getelementptr instruction contained in the loop.
5464   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
5465     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
5466             isa<GetElementPtrInst>(V)) &&
5467            !TheLoop->isLoopInvariant(V);
5468   };
5469 
5470   // A helper that evaluates a memory access's use of a pointer. If the use
5471   // will be a scalar use, and the pointer is only used by memory accesses, we
5472   // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
5473   // PossibleNonScalarPtrs.
5474   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
5475     // We only care about bitcast and getelementptr instructions contained in
5476     // the loop.
5477     if (!isLoopVaryingBitCastOrGEP(Ptr))
5478       return;
5479 
5480     // If the pointer has already been identified as scalar (e.g., if it was
5481     // also identified as uniform), there's nothing to do.
5482     auto *I = cast<Instruction>(Ptr);
5483     if (Worklist.count(I))
5484       return;
5485 
5486     // If the use of the pointer will be a scalar use, and all users of the
5487     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
5488     // place the pointer in PossibleNonScalarPtrs.
5489     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
5490           return isa<LoadInst>(U) || isa<StoreInst>(U);
5491         }))
5492       ScalarPtrs.insert(I);
5493     else
5494       PossibleNonScalarPtrs.insert(I);
5495   };
5496 
5497   // We seed the scalars analysis with three classes of instructions: (1)
5498   // instructions marked uniform-after-vectorization, (2) bitcast and
5499   // getelementptr instructions used by memory accesses requiring a scalar use,
5500   // and (3) pointer induction variables and their update instructions (we
5501   // currently only scalarize these).
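  //
  // For example (illustrative), a getelementptr used only as the address of a
  // store that will be scalarized is seeded via class (2), since each
  // scalarized store needs its own scalar address computation.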
5502   //
5503   // (1) Add to the worklist all instructions that have been identified as
5504   // uniform-after-vectorization.
5505   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
5506 
5507   // (2) Add to the worklist all bitcast and getelementptr instructions used by
5508   // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory access is not a gather or
5510   // scatter operation. The value operand of a store will remain scalar if the
5511   // store is scalarized.
5512   for (auto *BB : TheLoop->blocks())
5513     for (auto &I : *BB) {
5514       if (auto *Load = dyn_cast<LoadInst>(&I)) {
5515         evaluatePtrUse(Load, Load->getPointerOperand());
5516       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
5517         evaluatePtrUse(Store, Store->getPointerOperand());
5518         evaluatePtrUse(Store, Store->getValueOperand());
5519       }
5520     }
5521   for (auto *I : ScalarPtrs)
5522     if (!PossibleNonScalarPtrs.count(I)) {
5523       DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
5524       Worklist.insert(I);
5525     }
5526 
5527   // (3) Add to the worklist all pointer induction variables and their update
5528   // instructions.
5529   //
5530   // TODO: Once we are able to vectorize pointer induction variables we should
5531   //       no longer insert them into the worklist here.
5532   auto *Latch = TheLoop->getLoopLatch();
5533   for (auto &Induction : *Legal->getInductionVars()) {
5534     auto *Ind = Induction.first;
5535     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5536     if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
5537       continue;
5538     Worklist.insert(Ind);
5539     Worklist.insert(IndUpdate);
5540     DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
5541     DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate << "\n");
5542   }
5543 
5544   // Insert the forced scalars.
5545   // FIXME: Currently widenPHIInstruction() often creates a dead vector
5546   // induction variable when the PHI user is scalarized.
5547   if (ForcedScalars.count(VF))
5548     for (auto *I : ForcedScalars.find(VF)->second)
5549       Worklist.insert(I);
5550 
5551   // Expand the worklist by looking through any bitcasts and getelementptr
5552   // instructions we've already identified as scalar. This is similar to the
5553   // expansion step in collectLoopUniforms(); however, here we're only
5554   // expanding to include additional bitcasts and getelementptr instructions.
5555   unsigned Idx = 0;
5556   while (Idx != Worklist.size()) {
5557     Instruction *Dst = Worklist[Idx++];
5558     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
5559       continue;
5560     auto *Src = cast<Instruction>(Dst->getOperand(0));
5561     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
5562           auto *J = cast<Instruction>(U);
5563           return !TheLoop->contains(J) || Worklist.count(J) ||
5564                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
5565                   isScalarUse(J, Src));
5566         })) {
5567       Worklist.insert(Src);
5568       DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
5569     }
5570   }
5571 
5572   // An induction variable will remain scalar if all users of the induction
5573   // variable and induction variable update remain scalar.
5574   for (auto &Induction : *Legal->getInductionVars()) {
5575     auto *Ind = Induction.first;
5576     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5577 
5578     // We already considered pointer induction variables, so there's no reason
5579     // to look at their users again.
5580     //
5581     // TODO: Once we are able to vectorize pointer induction variables we
5582     //       should no longer skip over them here.
5583     if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
5584       continue;
5585 
5586     // Determine if all users of the induction variable are scalar after
5587     // vectorization.
5588     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
5589       auto *I = cast<Instruction>(U);
5590       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
5591     });
5592     if (!ScalarInd)
5593       continue;
5594 
5595     // Determine if all users of the induction variable update instruction are
5596     // scalar after vectorization.
5597     auto ScalarIndUpdate =
5598         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5599           auto *I = cast<Instruction>(U);
5600           return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
5601         });
5602     if (!ScalarIndUpdate)
5603       continue;
5604 
5605     // The induction variable and its update instruction will remain scalar.
5606     Worklist.insert(Ind);
5607     Worklist.insert(IndUpdate);
5608     DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
5609     DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate << "\n");
5610   }
5611 
5612   Scalars[VF].insert(Worklist.begin(), Worklist.end());
5613 }
5614 
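// Illustrative example: a udiv or srem in a predicated block must be
// scalarized and predicated, since speculatively executing it for inactive
// lanes could divide by zero; a predicated store is scalarized with
// predication unless it will be emitted as a masked store or scatter (see
// blockCanBePredicated).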
5615 bool LoopVectorizationLegality::isScalarWithPredication(Instruction *I) {
5616   if (!blockNeedsPredication(I->getParent()))
5617     return false;
5618   switch(I->getOpcode()) {
5619   default:
5620     break;
5621   case Instruction::Store:
5622     return !isMaskRequired(I);
5623   case Instruction::UDiv:
5624   case Instruction::SDiv:
5625   case Instruction::SRem:
5626   case Instruction::URem:
5627     return mayDivideByZero(*I);
5628   }
5629   return false;
5630 }
5631 
5632 bool LoopVectorizationLegality::memoryInstructionCanBeWidened(Instruction *I,
5633                                                               unsigned VF) {
5634   // Get and ensure we have a valid memory instruction.
5635   LoadInst *LI = dyn_cast<LoadInst>(I);
5636   StoreInst *SI = dyn_cast<StoreInst>(I);
5637   assert((LI || SI) && "Invalid memory instruction");
5638 
5639   auto *Ptr = getPointerOperand(I);
5640 
5641   // In order to be widened, the pointer should be consecutive, first of all.
5642   if (!isConsecutivePtr(Ptr))
5643     return false;
5644 
5645   // If the instruction is a store located in a predicated block, it will be
5646   // scalarized.
5647   if (isScalarWithPredication(I))
5648     return false;
5649 
  // If the instruction's allocated size doesn't equal its type size, it
5651   // requires padding and will be scalarized.
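  // For example (illustrative), an i24 value has a type size of 3 bytes but an
  // allocated size of 4 bytes, so consecutive i24 elements cannot be combined
  // into a single wide access without padding between them.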
5652   auto &DL = I->getModule()->getDataLayout();
5653   auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
5654   if (hasIrregularType(ScalarTy, DL, VF))
5655     return false;
5656 
5657   return true;
5658 }
5659 
5660 void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
5661   // We should not collect Uniforms more than once per VF. Right now,
5662   // this function is called from collectUniformsAndScalars(), which
5663   // already does this check. Collecting Uniforms for VF=1 does not make any
5664   // sense.
5665 
5666   assert(VF >= 2 && !Uniforms.count(VF) &&
5667          "This function should not be visited twice for the same VF");
5668 
  // Initialize the entry for this VF. Even if we find no uniform values,
  // Uniforms.count(VF) will return 1, so we will not analyze this VF again.
5671   Uniforms[VF].clear();
5672 
5673   // We now know that the loop is vectorizable!
5674   // Collect instructions inside the loop that will remain uniform after
5675   // vectorization.
5676 
  // Global values, parameters, and instructions outside of the current loop
  // are out of scope.
5679   auto isOutOfScope = [&](Value *V) -> bool {
5680     Instruction *I = dyn_cast<Instruction>(V);
5681     return (!I || !TheLoop->contains(I));
5682   };
5683 
5684   SetVector<Instruction *> Worklist;
5685   BasicBlock *Latch = TheLoop->getLoopLatch();
5686 
5687   // Start with the conditional branch. If the branch condition is an
5688   // instruction contained in the loop that is only used by the branch, it is
5689   // uniform.
5690   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
5691   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) {
5692     Worklist.insert(Cmp);
5693     DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n");
5694   }
5695 
5696   // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
5697   // are pointers that are treated like consecutive pointers during
5698   // vectorization. The pointer operands of interleaved accesses are an
5699   // example.
5700   SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;
5701 
5702   // Holds pointer operands of instructions that are possibly non-uniform.
5703   SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;
5704 
5705   auto isUniformDecision = [&](Instruction *I, unsigned VF) {
5706     InstWidening WideningDecision = getWideningDecision(I, VF);
5707     assert(WideningDecision != CM_Unknown &&
5708            "Widening decision should be ready at this moment");
5709 
5710     return (WideningDecision == CM_Widen ||
5711             WideningDecision == CM_Interleave);
5712   };
5713   // Iterate over the instructions in the loop, and collect all
5714   // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
5715   // that a consecutive-like pointer operand will be scalarized, we collect it
5716   // in PossibleNonUniformPtrs instead. We use two sets here because a single
5717   // getelementptr instruction can be used by both vectorized and scalarized
5718   // memory instructions. For example, if a loop loads and stores from the same
5719   // location, but the store is conditional, the store will be scalarized, and
5720   // the getelementptr won't remain uniform.
5721   for (auto *BB : TheLoop->blocks())
5722     for (auto &I : *BB) {
5723       // If there's no pointer operand, there's nothing to do.
5724       auto *Ptr = dyn_cast_or_null<Instruction>(getPointerOperand(&I));
5725       if (!Ptr)
5726         continue;
5727 
5728       // True if all users of Ptr are memory accesses that have Ptr as their
5729       // pointer operand.
5730       auto UsersAreMemAccesses =
5731           llvm::all_of(Ptr->users(), [&](User *U) -> bool {
5732             return getPointerOperand(U) == Ptr;
5733           });
5734 
5735       // Ensure the memory instruction will not be scalarized or used by
5736       // gather/scatter, making its pointer operand non-uniform. If the pointer
5737       // operand is used by any instruction other than a memory access, we
5738       // conservatively assume the pointer operand may be non-uniform.
5739       if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
5740         PossibleNonUniformPtrs.insert(Ptr);
5741 
5742       // If the memory instruction will be vectorized and its pointer operand
5743       // is consecutive-like, or interleaving - the pointer operand should
5744       // remain uniform.
5745       else
5746         ConsecutiveLikePtrs.insert(Ptr);
5747     }
5748 
5749   // Add to the Worklist all consecutive and consecutive-like pointers that
5750   // aren't also identified as possibly non-uniform.
5751   for (auto *V : ConsecutiveLikePtrs)
5752     if (!PossibleNonUniformPtrs.count(V)) {
5753       DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n");
5754       Worklist.insert(V);
5755     }
5756 
  // Expand Worklist in topological order: whenever a new instruction
  // is added, its users should be either already inside Worklist, or
  // out of scope. This ensures a uniform instruction will only be used
  // by uniform instructions or out-of-scope instructions.
5761   unsigned idx = 0;
5762   while (idx != Worklist.size()) {
5763     Instruction *I = Worklist[idx++];
5764 
5765     for (auto OV : I->operand_values()) {
5766       if (isOutOfScope(OV))
5767         continue;
5768       auto *OI = cast<Instruction>(OV);
5769       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
5770             auto *J = cast<Instruction>(U);
5771             return !TheLoop->contains(J) || Worklist.count(J) ||
5772                    (OI == getPointerOperand(J) && isUniformDecision(J, VF));
5773           })) {
5774         Worklist.insert(OI);
5775         DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n");
5776       }
5777     }
5778   }
5779 
5780   // Returns true if Ptr is the pointer operand of a memory access instruction
5781   // I, and I is known to not require scalarization.
5782   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
5783     return getPointerOperand(I) == Ptr && isUniformDecision(I, VF);
5784   };
5785 
5786   // For an instruction to be added into Worklist above, all its users inside
5787   // the loop should also be in Worklist. However, this condition cannot be
5788   // true for phi nodes that form a cyclic dependence. We must process phi
5789   // nodes separately. An induction variable will remain uniform if all users
5790   // of the induction variable and induction variable update remain uniform.
5791   // The code below handles both pointer and non-pointer induction variables.
5792   for (auto &Induction : *Legal->getInductionVars()) {
5793     auto *Ind = Induction.first;
5794     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5795 
5796     // Determine if all users of the induction variable are uniform after
5797     // vectorization.
5798     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
5799       auto *I = cast<Instruction>(U);
5800       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
5801              isVectorizedMemAccessUse(I, Ind);
5802     });
5803     if (!UniformInd)
5804       continue;
5805 
5806     // Determine if all users of the induction variable update instruction are
5807     // uniform after vectorization.
5808     auto UniformIndUpdate =
5809         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5810           auto *I = cast<Instruction>(U);
5811           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
5812                  isVectorizedMemAccessUse(I, IndUpdate);
5813         });
5814     if (!UniformIndUpdate)
5815       continue;
5816 
5817     // The induction variable and its update instruction will remain uniform.
5818     Worklist.insert(Ind);
5819     Worklist.insert(IndUpdate);
5820     DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ind << "\n");
5821     DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate << "\n");
5822   }
5823 
5824   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
5825 }
5826 
5827 bool LoopVectorizationLegality::canVectorizeMemory() {
5828   LAI = &(*GetLAA)(*TheLoop);
5829   InterleaveInfo.setLAI(LAI);
5830   const OptimizationRemarkAnalysis *LAR = LAI->getReport();
5831   if (LAR) {
5832     ORE->emit([&]() {
5833       return OptimizationRemarkAnalysis(Hints->vectorizeAnalysisPassName(),
5834                                         "loop not vectorized: ", *LAR);
5835     });
5836   }
5837   if (!LAI->canVectorizeMemory())
5838     return false;
5839 
5840   if (LAI->hasStoreToLoopInvariantAddress()) {
5841     ORE->emit(createMissedAnalysis("CantVectorizeStoreToLoopInvariantAddress")
5842               << "write to a loop invariant address could not be vectorized");
5843     DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n");
5844     return false;
5845   }
5846 
5847   Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks());
5848   PSE.addPredicate(LAI->getPSE().getUnionPredicate());
5849 
5850   return true;
5851 }
5852 
5853 bool LoopVectorizationLegality::isInductionVariable(const Value *V) {
5854   Value *In0 = const_cast<Value *>(V);
5855   PHINode *PN = dyn_cast_or_null<PHINode>(In0);
5856   if (!PN)
5857     return false;
5858 
5859   return Inductions.count(PN);
5860 }
5861 
5862 bool LoopVectorizationLegality::isFirstOrderRecurrence(const PHINode *Phi) {
5863   return FirstOrderRecurrences.count(Phi);
5864 }
5865 
5866 bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) {
5867   return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
5868 }
5869 
5870 bool LoopVectorizationLegality::blockCanBePredicated(
5871     BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs) {
5872   const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();
5873 
5874   for (Instruction &I : *BB) {
5875     // Check that we don't have a constant expression that can trap as operand.
5876     for (Value *Operand : I.operands()) {
5877       if (auto *C = dyn_cast<Constant>(Operand))
5878         if (C->canTrap())
5879           return false;
5880     }
5881     // We might be able to hoist the load.
5882     if (I.mayReadFromMemory()) {
5883       auto *LI = dyn_cast<LoadInst>(&I);
5884       if (!LI)
5885         return false;
5886       if (!SafePtrs.count(LI->getPointerOperand())) {
5887         if (isLegalMaskedLoad(LI->getType(), LI->getPointerOperand()) ||
5888             isLegalMaskedGather(LI->getType())) {
5889           MaskedOp.insert(LI);
5890           continue;
5891         }
5892         // !llvm.mem.parallel_loop_access implies if-conversion safety.
5893         if (IsAnnotatedParallel)
5894           continue;
5895         return false;
5896       }
5897     }
5898 
5899     if (I.mayWriteToMemory()) {
5900       auto *SI = dyn_cast<StoreInst>(&I);
5901       // We only support predication of stores in basic blocks with one
5902       // predecessor.
5903       if (!SI)
5904         return false;
5905 
5906       // Build a masked store if it is legal for the target.
5907       if (isLegalMaskedStore(SI->getValueOperand()->getType(),
5908                              SI->getPointerOperand()) ||
5909           isLegalMaskedScatter(SI->getValueOperand()->getType())) {
5910         MaskedOp.insert(SI);
5911         continue;
5912       }
5913 
5914       bool isSafePtr = (SafePtrs.count(SI->getPointerOperand()) != 0);
5915       bool isSinglePredecessor = SI->getParent()->getSinglePredecessor();
5916 
5917       if (++NumPredStores > NumberOfStoresToPredicate || !isSafePtr ||
5918           !isSinglePredecessor)
5919         return false;
5920     }
5921     if (I.mayThrow())
5922       return false;
5923   }
5924 
5925   return true;
5926 }
5927 
5928 void InterleavedAccessInfo::collectConstStrideAccesses(
5929     MapVector<Instruction *, StrideDescriptor> &AccessStrideInfo,
5930     const ValueToValueMap &Strides) {
5931   auto &DL = TheLoop->getHeader()->getModule()->getDataLayout();
5932 
5933   // Since it's desired that the load/store instructions be maintained in
5934   // "program order" for the interleaved access analysis, we have to visit the
5935   // blocks in the loop in reverse postorder (i.e., in a topological order).
5936   // Such an ordering will ensure that any load/store that may be executed
5937   // before a second load/store will precede the second load/store in
5938   // AccessStrideInfo.
5939   LoopBlocksDFS DFS(TheLoop);
5940   DFS.perform(LI);
5941   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO()))
5942     for (auto &I : *BB) {
5943       auto *LI = dyn_cast<LoadInst>(&I);
5944       auto *SI = dyn_cast<StoreInst>(&I);
5945       if (!LI && !SI)
5946         continue;
5947 
5948       Value *Ptr = getPointerOperand(&I);
5949       // We don't check wrapping here because we don't know yet if Ptr will be
5950       // part of a full group or a group with gaps. Checking wrapping for all
5951       // pointers (even those that end up in groups with no gaps) will be overly
5952       // conservative. For full groups, wrapping should be ok since if we would
5953       // wrap around the address space we would do a memory access at nullptr
5954       // even without the transformation. The wrapping checks are therefore
5955       // deferred until after we've formed the interleaved groups.
5956       int64_t Stride = getPtrStride(PSE, Ptr, TheLoop, Strides,
5957                                     /*Assume=*/true, /*ShouldCheckWrap=*/false);
5958 
5959       const SCEV *Scev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr);
5960       PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
5961       uint64_t Size = DL.getTypeAllocSize(PtrTy->getElementType());
5962 
5963       // An alignment of 0 means target ABI alignment.
5964       unsigned Align = getMemInstAlignment(&I);
5965       if (!Align)
5966         Align = DL.getABITypeAlignment(PtrTy->getElementType());
5967 
5968       AccessStrideInfo[&I] = StrideDescriptor(Stride, Scev, Size, Align);
5969     }
5970 }
5971 
5972 // Analyze interleaved accesses and collect them into interleaved load and
5973 // store groups.
5974 //
5975 // When generating code for an interleaved load group, we effectively hoist all
5976 // loads in the group to the location of the first load in program order. When
5977 // generating code for an interleaved store group, we sink all stores to the
5978 // location of the last store. This code motion can change the order of load
5979 // and store instructions and may break dependences.
5980 //
5981 // The code generation strategy mentioned above ensures that we won't violate
5982 // any write-after-read (WAR) dependences.
5983 //
5984 // E.g., for the WAR dependence:  a = A[i];      // (1)
5985 //                                A[i] = b;      // (2)
5986 //
5987 // The store group of (2) is always inserted at or below (2), and the load
5988 // group of (1) is always inserted at or above (1). Thus, the instructions will
5989 // never be reordered. All other dependences are checked to ensure the
5990 // correctness of the instruction reordering.
5991 //
5992 // The algorithm visits all memory accesses in the loop in bottom-up program
5993 // order. Program order is established by traversing the blocks in the loop in
5994 // reverse postorder when collecting the accesses.
5995 //
5996 // We visit the memory accesses in bottom-up order because it can simplify the
5997 // construction of store groups in the presence of write-after-write (WAW)
5998 // dependences.
5999 //
6000 // E.g., for the WAW dependence:  A[i] = a;      // (1)
6001 //                                A[i] = b;      // (2)
6002 //                                A[i + 1] = c;  // (3)
6003 //
6004 // We will first create a store group with (3) and (2). (1) can't be added to
6005 // this group because it and (2) are dependent. However, (1) can be grouped
6006 // with other accesses that may precede it in program order. Note that a
6007 // bottom-up order does not imply that WAW dependences should not be checked.
6008 void InterleavedAccessInfo::analyzeInterleaving(
6009     const ValueToValueMap &Strides) {
6010   DEBUG(dbgs() << "LV: Analyzing interleaved accesses...\n");
6011 
6012   // Holds all accesses with a constant stride.
6013   MapVector<Instruction *, StrideDescriptor> AccessStrideInfo;
6014   collectConstStrideAccesses(AccessStrideInfo, Strides);
6015 
6016   if (AccessStrideInfo.empty())
6017     return;
6018 
6019   // Collect the dependences in the loop.
6020   collectDependences();
6021 
6022   // Holds all interleaved store groups temporarily.
6023   SmallSetVector<InterleaveGroup *, 4> StoreGroups;
6024   // Holds all interleaved load groups temporarily.
6025   SmallSetVector<InterleaveGroup *, 4> LoadGroups;
6026 
6027   // Search in bottom-up program order for pairs of accesses (A and B) that can
6028   // form interleaved load or store groups. In the algorithm below, access A
6029   // precedes access B in program order. We initialize a group for B in the
6030   // outer loop of the algorithm, and then in the inner loop, we attempt to
6031   // insert each A into B's group if:
6032   //
6033   //  1. A and B have the same stride,
6034   //  2. A and B have the same memory object size, and
6035   //  3. A belongs in B's group according to its distance from B.
6036   //
6037   // Special care is taken to ensure group formation will not break any
6038   // dependences.
6039   for (auto BI = AccessStrideInfo.rbegin(), E = AccessStrideInfo.rend();
6040        BI != E; ++BI) {
6041     Instruction *B = BI->first;
6042     StrideDescriptor DesB = BI->second;
6043 
6044     // Initialize a group for B if it has an allowable stride. Even if we don't
6045     // create a group for B, we continue with the bottom-up algorithm to ensure
6046     // we don't break any of B's dependences.
6047     InterleaveGroup *Group = nullptr;
6048     if (isStrided(DesB.Stride)) {
6049       Group = getInterleaveGroup(B);
6050       if (!Group) {
6051         DEBUG(dbgs() << "LV: Creating an interleave group with:" << *B << '\n');
6052         Group = createInterleaveGroup(B, DesB.Stride, DesB.Align);
6053       }
6054       if (B->mayWriteToMemory())
6055         StoreGroups.insert(Group);
6056       else
6057         LoadGroups.insert(Group);
6058     }
6059 
6060     for (auto AI = std::next(BI); AI != E; ++AI) {
6061       Instruction *A = AI->first;
6062       StrideDescriptor DesA = AI->second;
6063 
6064       // Our code motion strategy implies that we can't have dependences
6065       // between accesses in an interleaved group and other accesses located
6066       // between the first and last member of the group. Note that this also
6067       // means that a group can't have more than one member at a given offset.
6068       // The accesses in a group can have dependences with other accesses, but
6069       // we must ensure we don't extend the boundaries of the group such that
6070       // we encompass those dependent accesses.
6071       //
6072       // For example, assume we have the sequence of accesses shown below in a
6073       // stride-2 loop:
6074       //
6075       //  (1, 2) is a group | A[i]   = a;  // (1)
6076       //                    | A[i-1] = b;  // (2) |
6077       //                      A[i-3] = c;  // (3)
6078       //                      A[i]   = d;  // (4) | (2, 4) is not a group
6079       //
6080       // Because accesses (2) and (3) are dependent, we can group (2) with (1)
6081       // but not with (4). If we did, the dependent access (3) would be within
6082       // the boundaries of the (2, 4) group.
6083       if (!canReorderMemAccessesForInterleavedGroups(&*AI, &*BI)) {
6084         // If a dependence exists and A is already in a group, we know that A
6085         // must be a store since A precedes B and WAR dependences are allowed.
6086         // Thus, A would be sunk below B. We release A's group to prevent this
6087         // illegal code motion. A will then be free to form another group with
6088         // instructions that precede it.
6089         if (isInterleaved(A)) {
6090           InterleaveGroup *StoreGroup = getInterleaveGroup(A);
6091           StoreGroups.remove(StoreGroup);
6092           releaseGroup(StoreGroup);
6093         }
6094 
6095         // If a dependence exists and A is not already in a group (or it was
6096         // and we just released it), B might be hoisted above A (if B is a
6097         // load) or another store might be sunk below A (if B is a store). In
6098         // either case, we can't add additional instructions to B's group. B
6099         // will only form a group with instructions that it precedes.
6100         break;
6101       }
6102 
6103       // At this point, we've checked for illegal code motion. If either A or B
6104       // isn't strided, there's nothing left to do.
6105       if (!isStrided(DesA.Stride) || !isStrided(DesB.Stride))
6106         continue;
6107 
6108       // Ignore A if it's already in a group or isn't the same kind of memory
6109       // operation as B.
6110       if (isInterleaved(A) || A->mayReadFromMemory() != B->mayReadFromMemory())
6111         continue;
6112 
6113       // Check rules 1 and 2. Ignore A if its stride or size is different from
6114       // that of B.
6115       if (DesA.Stride != DesB.Stride || DesA.Size != DesB.Size)
6116         continue;
6117 
      // Ignore A if the memory objects of A and B don't belong to the same
      // address space.
6120       if (getMemInstAddressSpace(A) != getMemInstAddressSpace(B))
6121         continue;
6122 
6123       // Calculate the distance from A to B.
6124       const SCEVConstant *DistToB = dyn_cast<SCEVConstant>(
6125           PSE.getSE()->getMinusSCEV(DesA.Scev, DesB.Scev));
6126       if (!DistToB)
6127         continue;
6128       int64_t DistanceToB = DistToB->getAPInt().getSExtValue();
6129 
6130       // Check rule 3. Ignore A if its distance to B is not a multiple of the
6131       // size.
6132       if (DistanceToB % static_cast<int64_t>(DesB.Size))
6133         continue;
6134 
6135       // Ignore A if either A or B is in a predicated block. Although we
6136       // currently prevent group formation for predicated accesses, we may be
6137       // able to relax this limitation in the future once we handle more
6138       // complicated blocks.
6139       if (isPredicated(A->getParent()) || isPredicated(B->getParent()))
6140         continue;
6141 
6142       // The index of A is the index of B plus A's distance to B in multiples
6143       // of the size.
6144       int IndexA =
6145           Group->getIndex(B) + DistanceToB / static_cast<int64_t>(DesB.Size);
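      // For example (illustrative), if B accesses A[i] and A accesses A[i+1],
      // both i32, then DistanceToB is 4 bytes and A's index is one past B's.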
6146 
6147       // Try to insert A into B's group.
6148       if (Group->insertMember(A, IndexA, DesA.Align)) {
6149         DEBUG(dbgs() << "LV: Inserted:" << *A << '\n'
6150                      << "    into the interleave group with" << *B << '\n');
6151         InterleaveGroupMap[A] = Group;
6152 
6153         // Set the first load in program order as the insert position.
6154         if (A->mayReadFromMemory())
6155           Group->setInsertPos(A);
6156       }
6157     } // Iteration over A accesses.
6158   } // Iteration over B accesses.
6159 
6160   // Remove interleaved store groups with gaps.
6161   for (InterleaveGroup *Group : StoreGroups)
6162     if (Group->getNumMembers() != Group->getFactor()) {
6163       DEBUG(dbgs() << "LV: Invalidate candidate interleaved store group due "
6164                       "to gaps.\n");
6165       releaseGroup(Group);
6166     }
6167   // Remove interleaved groups with gaps (currently only loads) whose memory
6168   // accesses may wrap around. We have to revisit the getPtrStride analysis,
6169   // this time with ShouldCheckWrap=true, since collectConstStrideAccesses does
6170   // not check wrapping (see documentation there).
6171   // FORNOW we use Assume=false;
6172   // TODO: Change to Assume=true but making sure we don't exceed the threshold
6173   // of runtime SCEV assumptions checks (thereby potentially failing to
6174   // vectorize altogether).
6175   // Additional optional optimizations:
6176   // TODO: If we are peeling the loop and we know that the first pointer doesn't
6177   // wrap then we can deduce that all pointers in the group don't wrap.
6178   // This means that we can forcefully peel the loop in order to only have to
6179   // check the first pointer for no-wrap. When we'll change to use Assume=true
6180   // we'll only need at most one runtime check per interleaved group.
6181   for (InterleaveGroup *Group : LoadGroups) {
    // Case 1: A full group. We can skip the checks; for full groups, if the wide
6183     // load would wrap around the address space we would do a memory access at
6184     // nullptr even without the transformation.
6185     if (Group->getNumMembers() == Group->getFactor())
6186       continue;
6187 
    // Case 2: If the first and last members of the group don't wrap, this
    // implies that all the pointers in the group don't wrap.
    // So we check only group member 0 (which is always guaranteed to exist)
    // and group member Factor - 1; if the latter doesn't exist we rely on
    // peeling (if it is a non-reversed access -- see Case 3).
6193     Value *FirstMemberPtr = getPointerOperand(Group->getMember(0));
6194     if (!getPtrStride(PSE, FirstMemberPtr, TheLoop, Strides, /*Assume=*/false,
6195                       /*ShouldCheckWrap=*/true)) {
6196       DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to "
6197                       "first group member potentially pointer-wrapping.\n");
6198       releaseGroup(Group);
6199       continue;
6200     }
6201     Instruction *LastMember = Group->getMember(Group->getFactor() - 1);
6202     if (LastMember) {
6203       Value *LastMemberPtr = getPointerOperand(LastMember);
6204       if (!getPtrStride(PSE, LastMemberPtr, TheLoop, Strides, /*Assume=*/false,
6205                         /*ShouldCheckWrap=*/true)) {
6206         DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to "
6207                         "last group member potentially pointer-wrapping.\n");
6208         releaseGroup(Group);
6209       }
6210     } else {
6211       // Case 3: A non-reversed interleaved load group with gaps: We need
6212       // to execute at least one scalar epilogue iteration. This will ensure
6213       // we don't speculatively access memory out-of-bounds. We only need
6214       // to look for a member at index factor - 1, since every group must have
6215       // a member at index zero.
6216       if (Group->isReverse()) {
6217         DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to "
6218                         "a reverse access with gaps.\n");
6219         releaseGroup(Group);
6220         continue;
6221       }
6222       DEBUG(dbgs() << "LV: Interleaved group requires epilogue iteration.\n");
6223       RequiresScalarEpilogue = true;
6224     }
6225   }
6226 }
6227 
6228 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
6229   if (!EnableCondStoresVectorization && Legal->getNumPredStores()) {
6230     ORE->emit(createMissedAnalysis("ConditionalStore")
6231               << "store that is conditionally executed prevents vectorization");
6232     DEBUG(dbgs() << "LV: No vectorization. There are conditional stores.\n");
6233     return None;
6234   }
6235 
6236   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do this anyway, since the check is still
    // likely to be dynamically uniform if the target can skip it.
    DEBUG(dbgs() << "LV: Not inserting runtime ptr check for divergent target\n");
6240 
6241     ORE->emit(
6242       createMissedAnalysis("CantVersionLoopWithDivergentTarget")
6243       << "runtime pointer checks needed. Not enabled for divergent target");
6244 
6245     return None;
6246   }
6247 
6248   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
6249   if (!OptForSize) // Remaining checks deal with scalar loop when OptForSize.
6250     return computeFeasibleMaxVF(OptForSize, TC);
6251 
6252   if (Legal->getRuntimePointerChecking()->Need) {
6253     ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize")
6254               << "runtime pointer checks needed. Enable vectorization of this "
6255                  "loop with '#pragma clang loop vectorize(enable)' when "
6256                  "compiling with -Os/-Oz");
6257     DEBUG(dbgs()
6258           << "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n");
6259     return None;
6260   }
6261 
6262   // If we optimize the program for size, avoid creating the tail loop.
6263   DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
6264 
6265   // If the trip count is unknown or smaller than two, don't try to vectorize.
6266   if (TC < 2) {
6267     ORE->emit(
6268         createMissedAnalysis("UnknownLoopCountComplexCFG")
6269         << "unable to calculate the loop count due to complex control flow");
6270     DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
6271     return None;
6272   }
6273 
6274   unsigned MaxVF = computeFeasibleMaxVF(OptForSize, TC);
6275 
6276   if (TC % MaxVF != 0) {
6277     // If the trip count that we found modulo the vectorization factor is not
6278     // zero then we require a tail.
6279     // FIXME: look for a smaller MaxVF that does divide TC rather than give up.
6280     // FIXME: return None if loop requiresScalarEpilog(<MaxVF>), or look for a
6281     //        smaller MaxVF that does not require a scalar epilog.
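    // For example (illustrative numbers only): with TC == 17 and MaxVF == 4,
    // 17 % 4 == 1, so a scalar tail iteration would be needed; under -Os/-Oz
    // we refuse to create that tail and give up instead.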
6282 
6283     ORE->emit(createMissedAnalysis("NoTailLoopWithOptForSize")
6284               << "cannot optimize for size and vectorize at the "
6285                  "same time. Enable vectorization of this loop "
6286                  "with '#pragma clang loop vectorize(enable)' "
6287                  "when compiling with -Os/-Oz");
6288     DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
6289     return None;
6290   }
6291 
6292   return MaxVF;
6293 }
6294 
6295 unsigned
6296 LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize,
6297                                                  unsigned ConstTripCount) {
6298   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
6299   unsigned SmallestType, WidestType;
6300   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
6301   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
6302 
6303   // Get the maximum safe dependence distance in bits computed by LAA.
6304   // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
6305   // the memory access that is most restrictive (involved in the smallest
6306   // dependence distance).
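  // For example (illustrative numbers only): if the smallest dependence
  // distance allows at most 8 concurrent i32 elements, then
  // MaxSafeRegisterWidth = 8 * 4 * 8 = 256 bits, and a wider target register
  // would be clamped down to 256 bits below.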
6307   unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();
6308 
6309   WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
6310 
6311   unsigned MaxVectorSize = WidestRegister / WidestType;
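  // E.g. (illustrative numbers only): a 256-bit safe register width with a
  // widest type of i32 gives MaxVectorSize = 256 / 32 = 8 lanes.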
6312 
6313   DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType << " / "
6314                << WidestType << " bits.\n");
6315   DEBUG(dbgs() << "LV: The Widest register safe to use is: " << WidestRegister
6316                << " bits.\n");
6317 
6318   assert(MaxVectorSize <= 64 && "Did not expect to pack so many elements"
6319                                 " into one vector!");
6320   if (MaxVectorSize == 0) {
6321     DEBUG(dbgs() << "LV: The target has no vector registers.\n");
6322     MaxVectorSize = 1;
6323     return MaxVectorSize;
6324   } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
6325              isPowerOf2_32(ConstTripCount)) {
6326     // We need to clamp the VF to be the ConstTripCount. There is no point in
6327     // choosing a higher viable VF as done in the loop below.
6328     DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
6329                  << ConstTripCount << "\n");
6330     MaxVectorSize = ConstTripCount;
6331     return MaxVectorSize;
6332   }
6333 
6334   unsigned MaxVF = MaxVectorSize;
6335   if (MaximizeBandwidth && !OptForSize) {
6336     // Collect all viable vectorization factors larger than the default MaxVF
6337     // (i.e. MaxVectorSize).
6338     SmallVector<unsigned, 8> VFs;
6339     unsigned NewMaxVectorSize = WidestRegister / SmallestType;
6340     for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
6341       VFs.push_back(VS);
6342 
6343     // For each VF calculate its register usage.
6344     auto RUs = calculateRegisterUsage(VFs);
6345 
6346     // Select the largest VF which doesn't require more registers than the
6347     // target provides.
6348     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true);
6349     for (int i = RUs.size() - 1; i >= 0; --i) {
6350       if (RUs[i].MaxLocalUsers <= TargetNumRegisters) {
6351         MaxVF = VFs[i];
6352         break;
6353       }
6354     }
6355   }
6356   return MaxVF;
6357 }
6358 
6359 LoopVectorizationCostModel::VectorizationFactor
6360 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
6361   float Cost = expectedCost(1).first;
6362 #ifndef NDEBUG
6363   const float ScalarCost = Cost;
6364 #endif /* NDEBUG */
6365   unsigned Width = 1;
6366   DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
6367 
6368   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
6369   // Ignore scalar width, because the user explicitly wants vectorization.
6370   if (ForceVectorization && MaxVF > 1) {
6371     Width = 2;
6372     Cost = expectedCost(Width).first / (float)Width;
6373   }
6374 
6375   for (unsigned i = 2; i <= MaxVF; i *= 2) {
6376     // Notice that the vector loop needs to be executed fewer times, so
6377     // we need to divide the cost of the vector loop by the width of
6378     // the vector elements.
6379     VectorizationCostTy C = expectedCost(i);
6380     float VectorCost = C.first / (float)i;
6381     DEBUG(dbgs() << "LV: Vector loop of width " << i
6382                  << " costs: " << (int)VectorCost << ".\n");
6383     if (!C.second && !ForceVectorization) {
6384       DEBUG(
6385           dbgs() << "LV: Not considering vector loop of width " << i
6386                  << " because it will not generate any vector instructions.\n");
6387       continue;
6388     }
6389     if (VectorCost < Cost) {
6390       Cost = VectorCost;
6391       Width = i;
6392     }
6393   }
6394 
6395   DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
6396         << "LV: Vectorization seems to be not beneficial, "
6397         << "but was forced by a user.\n");
6398   DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
6399   VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)};
6400   return Factor;
6401 }
6402 
6403 std::pair<unsigned, unsigned>
6404 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
6405   unsigned MinWidth = -1U;
6406   unsigned MaxWidth = 8;
6407   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
6408 
6409   // For each block.
6410   for (BasicBlock *BB : TheLoop->blocks()) {
6411     // For each instruction in the loop.
6412     for (Instruction &I : *BB) {
6413       Type *T = I.getType();
6414 
6415       // Skip ignored values.
6416       if (ValuesToIgnore.count(&I))
6417         continue;
6418 
6419       // Only examine Loads, Stores and PHINodes.
6420       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
6421         continue;
6422 
6423       // Examine PHI nodes that are reduction variables. Update the type to
6424       // account for the recurrence type.
6425       if (auto *PN = dyn_cast<PHINode>(&I)) {
6426         if (!Legal->isReductionVariable(PN))
6427           continue;
6428         RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN];
6429         T = RdxDesc.getRecurrenceType();
6430       }
6431 
6432       // Examine the stored values.
6433       if (auto *ST = dyn_cast<StoreInst>(&I))
6434         T = ST->getValueOperand()->getType();
6435 
6436       // Ignore loaded pointer types and stored pointer types that are not
6437       // vectorizable.
6438       //
6439       // FIXME: The check here attempts to predict whether a load or store will
6440       //        be vectorized. We only know this for certain after a VF has
6441       //        been selected. Here, we assume that if an access can be
6442       //        vectorized, it will be. We should also look at extending this
6443       //        optimization to non-pointer types.
6444       //
6445       if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
6446           !Legal->isAccessInterleaved(&I) && !Legal->isLegalGatherOrScatter(&I))
6447         continue;
6448 
6449       MinWidth = std::min(MinWidth,
6450                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
6451       MaxWidth = std::max(MaxWidth,
6452                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
6453     }
6454   }
6455 
6456   return {MinWidth, MaxWidth};
6457 }
6458 
6459 unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
6460                                                            unsigned VF,
6461                                                            unsigned LoopCost) {
6462   // -- The interleave heuristics --
6463   // We interleave the loop in order to expose ILP and reduce the loop overhead.
6464   // There are many micro-architectural considerations that we can't predict
6465   // at this level. For example, frontend pressure (on decode or fetch) due to
6466   // code size, or the number and capabilities of the execution ports.
6467   //
6468   // We use the following heuristics to select the interleave count:
6469   // 1. If the code has reductions, then we interleave to break the cross
6470   // iteration dependency.
6471   // 2. If the loop is really small, then we interleave to reduce the loop
6472   // overhead.
6473   // 3. We don't interleave if we think that we will spill registers to memory
6474   // due to the increased register pressure.
6475 
6476   // When we optimize for size, we don't interleave.
6477   if (OptForSize)
6478     return 1;
6479 
6480   // A known maximum safe dependence distance already limits us; don't interleave.
6481   if (Legal->getMaxSafeDepDistBytes() != -1U)
6482     return 1;
6483 
6484   // Do not interleave loops with a relatively small trip count.
6485   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
6486   if (TC > 1 && TC < TinyTripCountInterleaveThreshold)
6487     return 1;
6488 
6489   unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1);
6490   DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
6491                << " registers\n");
6492 
6493   if (VF == 1) {
6494     if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
6495       TargetNumRegisters = ForceTargetNumScalarRegs;
6496   } else {
6497     if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
6498       TargetNumRegisters = ForceTargetNumVectorRegs;
6499   }
6500 
6501   RegisterUsage R = calculateRegisterUsage({VF})[0];
6502   // These values are used as divisors below, so assume that we have at least
6503   // one instruction that uses at least one register.
6504   R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);
6505   R.NumInstructions = std::max(R.NumInstructions, 1U);
6506 
6507   // We calculate the interleave count using the following formula.
6508   // Subtract the number of loop invariants from the number of available
6509   // registers. These registers are used by all of the interleaved instances.
6510   // Next, divide the remaining registers by the number of registers that is
6511   // required by the loop, in order to estimate how many parallel instances
6512   // fit without causing spills. The result is rounded down, if necessary, to a
6513   // power of two. We want a power-of-two interleave count to simplify any
6514   // addressing operations or alignment considerations.
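  // For example (illustrative numbers only): with 16 target registers, 2 of
  // which hold loop-invariant values, and at most 3 registers live at once
  // inside the loop, IC = PowerOf2Floor((16 - 2) / 3) = PowerOf2Floor(4) = 4.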
6515   unsigned IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) /
6516                               R.MaxLocalUsers);
6517 
6518   // Don't count the induction variable as interleaved.
6519   if (EnableIndVarRegisterHeur)
6520     IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) /
6521                        std::max(1U, (R.MaxLocalUsers - 1)));
6522 
6523   // Clamp the interleave ranges to reasonable counts.
6524   unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
6525 
6526   // Check if the user has overridden the max.
6527   if (VF == 1) {
6528     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
6529       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
6530   } else {
6531     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
6532       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
6533   }
6534 
6535   // If we did not calculate the cost for VF (because the user selected the VF)
6536   // then we calculate the cost of VF here.
6537   if (LoopCost == 0)
6538     LoopCost = expectedCost(VF).first;
6539 
6540   // Clamp the calculated IC to be between 1 and the max interleave count
6541   // that the target allows.
6542   if (IC > MaxInterleaveCount)
6543     IC = MaxInterleaveCount;
6544   else if (IC < 1)
6545     IC = 1;
6546 
6547   // Interleave if we vectorized this loop and there is a reduction that could
6548   // benefit from interleaving.
6549   if (VF > 1 && !Legal->getReductionVars()->empty()) {
6550     DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
6551     return IC;
6552   }
6553 
6554   // Note that if we've already vectorized the loop we will have done the
6555   // runtime check and so interleaving won't require further checks.
6556   bool InterleavingRequiresRuntimePointerCheck =
6557       (VF == 1 && Legal->getRuntimePointerChecking()->Need);
6558 
6559   // We want to interleave small loops in order to reduce the loop overhead and
6560   // potentially expose ILP opportunities.
6561   DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
6562   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
6563     // We assume that the per-iteration loop overhead is 1, use the cost model's
6564     // estimate of the loop body, and interleave until the loop overhead amounts
6565     // to roughly 5% of the cost of the loop.
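    // For example (illustrative numbers only): if SmallLoopCost is 20 and the
    // estimated LoopCost is 4, then SmallIC = min(IC, PowerOf2Floor(20 / 4)) =
    // min(IC, 4).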
6566     unsigned SmallIC =
6567         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
6568 
6569     // Interleave until store/load ports (estimated by max interleave count) are
6570     // saturated.
6571     unsigned NumStores = Legal->getNumStores();
6572     unsigned NumLoads = Legal->getNumLoads();
6573     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
6574     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
6575 
6576     // If we have a scalar reduction (vector reductions are already dealt with
6577     // by this point), we can increase the critical path length if the loop
6578     // we're interleaving is inside another loop. Limit the count, by default to
6579     // 2, so the critical path only gets increased by one reduction operation.
6580     if (!Legal->getReductionVars()->empty() && TheLoop->getLoopDepth() > 1) {
6581       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
6582       SmallIC = std::min(SmallIC, F);
6583       StoresIC = std::min(StoresIC, F);
6584       LoadsIC = std::min(LoadsIC, F);
6585     }
6586 
6587     if (EnableLoadStoreRuntimeInterleave &&
6588         std::max(StoresIC, LoadsIC) > SmallIC) {
6589       DEBUG(dbgs() << "LV: Interleaving to saturate store or load ports.\n");
6590       return std::max(StoresIC, LoadsIC);
6591     }
6592 
6593     DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
6594     return SmallIC;
6595   }
6596 
6597   // Interleave if this is a large loop (small loops are already dealt with by
6598   // this point) that could benefit from interleaving.
6599   bool HasReductions = !Legal->getReductionVars()->empty();
6600   if (TTI.enableAggressiveInterleaving(HasReductions)) {
6601     DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
6602     return IC;
6603   }
6604 
6605   DEBUG(dbgs() << "LV: Not Interleaving.\n");
6606   return 1;
6607 }
6608 
6609 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
6610 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
6611   // This function calculates the register usage by measuring the highest number
6612   // of values that are alive at a single location. Obviously, this is a very
6613   // rough estimate. We scan the loop in topological order and assign a number
6614   // to each instruction. We use RPO to ensure that defs are encountered before
6615   // their users. We assume that each instruction that has in-loop
6616   // users starts an interval. We record every time that an in-loop value is
6617   // used, so we have a list of the first and last occurrences of each
6618   // instruction. Next, we transpose this data structure into a multi map that
6619   // holds the list of intervals that *end* at a specific location. This multi
6620   // map allows us to perform a linear search. We scan the instructions linearly
6621   // and record each time that a new interval starts, by placing it in a set.
6622   // If we find this value in the multi-map then we remove it from the set.
6623   // The max register usage is the maximum size of the set.
6624   // We also search for instructions that are defined outside the loop, but are
6625   // used inside the loop. We need this number separately from the max-interval
6626   // usage number because, when we unroll, loop-invariant values do not take
6627   // more registers.
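  // As a small illustrative example: for a body "a = ...; b = f(a);
  // c = g(a, b); d = h(c);", the intervals of 'a' and 'b' are both still open
  // when 'c' is visited, so the estimated usage at that point is two values
  // (for VF == 1).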
6628   LoopBlocksDFS DFS(TheLoop);
6629   DFS.perform(LI);
6630 
6631   RegisterUsage RU;
6632   RU.NumInstructions = 0;
6633 
6634   // Each 'key' in the map opens a new interval. The values
6635   // of the map are the index of the 'last seen' usage of the
6636   // instruction that is the key.
6637   using IntervalMap = DenseMap<Instruction *, unsigned>;
6638 
6639   // Maps instruction to its index.
6640   DenseMap<unsigned, Instruction *> IdxToInstr;
6641   // Marks the end of each interval.
6642   IntervalMap EndPoint;
6643   // Saves the set of instructions that are used inside the loop.
6644   SmallSet<Instruction *, 8> Ends;
6645   // Saves the list of values that are used in the loop but are
6646   // defined outside the loop, such as arguments and constants.
6647   SmallPtrSet<Value *, 8> LoopInvariants;
6648 
6649   unsigned Index = 0;
6650   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
6651     RU.NumInstructions += BB->size();
6652     for (Instruction &I : *BB) {
6653       IdxToInstr[Index++] = &I;
6654 
6655       // Save the end location of each USE.
6656       for (Value *U : I.operands()) {
6657         auto *Instr = dyn_cast<Instruction>(U);
6658 
6659         // Ignore non-instruction values such as arguments, constants, etc.
6660         if (!Instr)
6661           continue;
6662 
6663         // If this instruction is outside the loop then record it and continue.
6664         if (!TheLoop->contains(Instr)) {
6665           LoopInvariants.insert(Instr);
6666           continue;
6667         }
6668 
6669         // Overwrite previous end points.
6670         EndPoint[Instr] = Index;
6671         Ends.insert(Instr);
6672       }
6673     }
6674   }
6675 
6676   // Saves the list of intervals that end with the index in 'key'.
6677   using InstrList = SmallVector<Instruction *, 2>;
6678   DenseMap<unsigned, InstrList> TransposeEnds;
6679 
6680   // Transpose the EndPoints to a list of values that end at each index.
6681   for (auto &Interval : EndPoint)
6682     TransposeEnds[Interval.second].push_back(Interval.first);
6683 
6684   SmallSet<Instruction *, 8> OpenIntervals;
6685 
6686   // Get the size of the widest register.
6687   unsigned MaxSafeDepDist = -1U;
6688   if (Legal->getMaxSafeDepDistBytes() != -1U)
6689     MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
6690   unsigned WidestRegister =
6691       std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
6692   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
6693 
6694   SmallVector<RegisterUsage, 8> RUs(VFs.size());
6695   SmallVector<unsigned, 8> MaxUsages(VFs.size(), 0);
6696 
6697   DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
6698 
6699   // A lambda that gets the register usage for the given type and VF.
6700   auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
6701     if (Ty->isTokenTy())
6702       return 0U;
6703     unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
6704     return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
6705   };
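  // For example (illustrative numbers only): with a 128-bit widest register,
  // the lambda above charges an i64 value at VF == 4 as
  // max(1, 4 * 64 / 128) = 2 registers, while an i8 value at VF == 4 still
  // counts as max(1, 4 * 8 / 128) = 1 register.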
6706 
6707   for (unsigned int i = 0; i < Index; ++i) {
6708     Instruction *I = IdxToInstr[i];
6709 
6710     // Remove all of the instructions that end at this location.
6711     InstrList &List = TransposeEnds[i];
6712     for (Instruction *ToRemove : List)
6713       OpenIntervals.erase(ToRemove);
6714 
6715     // Ignore instructions that are never used within the loop.
6716     if (!Ends.count(I))
6717       continue;
6718 
6719     // Skip ignored values.
6720     if (ValuesToIgnore.count(I))
6721       continue;
6722 
6723     // For each VF find the maximum usage of registers.
6724     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
6725       if (VFs[j] == 1) {
6726         MaxUsages[j] = std::max(MaxUsages[j], OpenIntervals.size());
6727         continue;
6728       }
6729       collectUniformsAndScalars(VFs[j]);
6730       // Count the number of live intervals.
6731       unsigned RegUsage = 0;
6732       for (auto Inst : OpenIntervals) {
6733         // Skip ignored values for VF > 1.
6734         if (VecValuesToIgnore.count(Inst) ||
6735             isScalarAfterVectorization(Inst, VFs[j]))
6736           continue;
6737         RegUsage += GetRegUsage(Inst->getType(), VFs[j]);
6738       }
6739       MaxUsages[j] = std::max(MaxUsages[j], RegUsage);
6740     }
6741 
6742     DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
6743                  << OpenIntervals.size() << '\n');
6744 
6745     // Add the current instruction to the list of open intervals.
6746     OpenIntervals.insert(I);
6747   }
6748 
6749   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
6750     unsigned Invariant = 0;
6751     if (VFs[i] == 1)
6752       Invariant = LoopInvariants.size();
6753     else {
6754       for (auto Inst : LoopInvariants)
6755         Invariant += GetRegUsage(Inst->getType(), VFs[i]);
6756     }
6757 
6758     DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n');
6759     DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n');
6760     DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n');
6761     DEBUG(dbgs() << "LV(REG): LoopSize: " << RU.NumInstructions << '\n');
6762 
6763     RU.LoopInvariantRegs = Invariant;
6764     RU.MaxLocalUsers = MaxUsages[i];
6765     RUs[i] = RU;
6766   }
6767 
6768   return RUs;
6769 }
6770 
6771 void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
6772   // If we aren't vectorizing the loop, or if we've already collected the
6773   // instructions to scalarize, there's nothing to do. Collection may already
6774   // have occurred if we have a user-selected VF and are now computing the
6775   // expected cost for interleaving.
6776   if (VF < 2 || InstsToScalarize.count(VF))
6777     return;
6778 
6779   // Initialize a mapping for VF in InstsToScalarize. If we find that it's
6780   // not profitable to scalarize any instructions, the presence of VF in the
6781   // map will indicate that we've analyzed it already.
6782   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
6783 
6784   // Find all the instructions that are scalar with predication in the loop and
6785   // determine if it would be better to not if-convert the blocks they are in.
6786   // If so, we also record the instructions to scalarize.
6787   for (BasicBlock *BB : TheLoop->blocks()) {
6788     if (!Legal->blockNeedsPredication(BB))
6789       continue;
6790     for (Instruction &I : *BB)
6791       if (Legal->isScalarWithPredication(&I)) {
6792         ScalarCostsTy ScalarCosts;
6793         if (computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
6794           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
6795 
6796         // Remember that BB will remain after vectorization.
6797         PredicatedBBsAfterVectorization.insert(BB);
6798       }
6799   }
6800 }
6801 
6802 int LoopVectorizationCostModel::computePredInstDiscount(
6803     Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
6804     unsigned VF) {
6805   assert(!isUniformAfterVectorization(PredInst, VF) &&
6806          "Instruction marked uniform-after-vectorization will be predicated");
6807 
6808   // Initialize the discount to zero, meaning that the scalar version and the
6809   // vector version cost the same.
6810   int Discount = 0;
6811 
6812   // Holds instructions to analyze. The instructions we visit are mapped in
6813   // ScalarCosts. Those instructions are the ones that would be scalarized if
6814   // we find that the scalar version costs less.
6815   SmallVector<Instruction *, 8> Worklist;
6816 
6817   // Returns true if the given instruction can be scalarized.
6818   auto canBeScalarized = [&](Instruction *I) -> bool {
6819     // We only attempt to scalarize instructions forming a single-use chain
6820     // from the original predicated block that would otherwise be vectorized.
6821     // Although not strictly necessary, we give up on instructions we know will
6822     // already be scalar to avoid traversing chains that are unlikely to be
6823     // beneficial.
6824     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
6825         isScalarAfterVectorization(I, VF))
6826       return false;
6827 
6828     // If the instruction is scalar with predication, it will be analyzed
6829     // separately. We ignore it within the context of PredInst.
6830     if (Legal->isScalarWithPredication(I))
6831       return false;
6832 
6833     // If any of the instruction's operands are uniform after vectorization,
6834     // the instruction cannot be scalarized. This prevents, for example, a
6835     // masked load from being scalarized.
6836     //
6837     // We assume we will only emit a value for lane zero of an instruction
6838     // marked uniform after vectorization, rather than VF identical values.
6839     // Thus, if we scalarize an instruction that uses a uniform, we would
6840     // create uses of values corresponding to the lanes we aren't emitting code
6841     // for. This behavior can be changed by allowing getScalarValue to clone
6842     // the lane zero values for uniforms rather than asserting.
6843     for (Use &U : I->operands())
6844       if (auto *J = dyn_cast<Instruction>(U.get()))
6845         if (isUniformAfterVectorization(J, VF))
6846           return false;
6847 
6848     // Otherwise, we can scalarize the instruction.
6849     return true;
6850   };
6851 
6852   // Returns true if an operand that cannot be scalarized must be extracted
6853   // from a vector. We will account for this scalarization overhead below. Note
6854   // that the non-void predicated instructions are placed in their own blocks,
6855   // and their return values are inserted into vectors. Thus, an extract would
6856   // still be required.
6857   auto needsExtract = [&](Instruction *I) -> bool {
6858     return TheLoop->contains(I) && !isScalarAfterVectorization(I, VF);
6859   };
6860 
6861   // Compute the expected cost discount from scalarizing the entire expression
6862   // feeding the predicated instruction. We currently only consider expressions
6863   // that are single-use instruction chains.
6864   Worklist.push_back(PredInst);
6865   while (!Worklist.empty()) {
6866     Instruction *I = Worklist.pop_back_val();
6867 
6868     // If we've already analyzed the instruction, there's nothing to do.
6869     if (ScalarCosts.count(I))
6870       continue;
6871 
6872     // Compute the cost of the vector instruction. Note that this cost already
6873     // includes the scalarization overhead of the predicated instruction.
6874     unsigned VectorCost = getInstructionCost(I, VF).first;
6875 
6876     // Compute the cost of the scalarized instruction. This cost is the cost of
6877     // the instruction as if it wasn't if-converted and instead remained in the
6878     // predicated block. We will scale this cost by block probability after
6879     // computing the scalarization overhead.
6880     unsigned ScalarCost = VF * getInstructionCost(I, 1).first;
6881 
6882     // Compute the scalarization overhead of needed insertelement instructions
6883     // and phi nodes.
6884     if (Legal->isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
6885       ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF),
6886                                                  true, false);
6887       ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
6888     }
6889 
6890     // Compute the scalarization overhead of needed extractelement
6891     // instructions. For each of the instruction's operands, if the operand can
6892     // be scalarized, add it to the worklist; otherwise, account for the
6893     // overhead.
6894     for (Use &U : I->operands())
6895       if (auto *J = dyn_cast<Instruction>(U.get())) {
6896         assert(VectorType::isValidElementType(J->getType()) &&
6897                "Instruction has non-scalar type");
6898         if (canBeScalarized(J))
6899           Worklist.push_back(J);
6900         else if (needsExtract(J))
6901           ScalarCost += TTI.getScalarizationOverhead(
6902                               ToVectorTy(J->getType(),VF), false, true);
6903       }
6904 
6905     // Scale the total scalar cost by block probability.
6906     ScalarCost /= getReciprocalPredBlockProb();
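    // For example (illustrative numbers only): if the reciprocal block
    // probability is 2 (i.e. the predicated block is assumed to execute about
    // half the time) and VF == 4, a scalar cost of 4 * C is scaled down to
    // 2 * C by the division above before being compared against VectorCost.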
6907 
6908     // Compute the discount. A positive discount means the vector version of
6909     // the instruction costs more, so scalarizing it would be beneficial.
6910     Discount += VectorCost - ScalarCost;
6911     ScalarCosts[I] = ScalarCost;
6912   }
6913 
6914   return Discount;
6915 }
6916 
6917 LoopVectorizationCostModel::VectorizationCostTy
6918 LoopVectorizationCostModel::expectedCost(unsigned VF) {
6919   VectorizationCostTy Cost;
6920 
6921   // For each block.
6922   for (BasicBlock *BB : TheLoop->blocks()) {
6923     VectorizationCostTy BlockCost;
6924 
6925     // For each instruction in the old loop.
6926     for (Instruction &I : *BB) {
6927       // Skip dbg intrinsics.
6928       if (isa<DbgInfoIntrinsic>(I))
6929         continue;
6930 
6931       // Skip ignored values.
6932       if (ValuesToIgnore.count(&I))
6933         continue;
6934 
6935       VectorizationCostTy C = getInstructionCost(&I, VF);
6936 
6937       // Check if we should override the cost.
6938       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
6939         C.first = ForceTargetInstructionCost;
6940 
6941       BlockCost.first += C.first;
6942       BlockCost.second |= C.second;
6943       DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first << " for VF "
6944                    << VF << " For instruction: " << I << '\n');
6945     }
6946 
6947     // If we are vectorizing a predicated block, it will have been
6948     // if-converted. This means that the block's instructions (aside from
6949     // stores and instructions that may divide by zero) will now be
6950     // unconditionally executed. For the scalar case, we may not always execute
6951     // the predicated block. Thus, scale the block's cost by the probability of
6952     // executing it.
6953     if (VF == 1 && Legal->blockNeedsPredication(BB))
6954       BlockCost.first /= getReciprocalPredBlockProb();
6955 
6956     Cost.first += BlockCost.first;
6957     Cost.second |= BlockCost.second;
6958   }
6959 
6960   return Cost;
6961 }
6962 
6963 /// \brief Gets Address Access SCEV after verifying that the access pattern
6964 /// is loop invariant except the induction variable dependence.
6965 ///
6966 /// This SCEV can be sent to the Target in order to estimate the address
6967 /// calculation cost.
6968 static const SCEV *getAddressAccessSCEV(
6969               Value *Ptr,
6970               LoopVectorizationLegality *Legal,
6971               ScalarEvolution *SE,
6972               const Loop *TheLoop) {
6973   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
6974   if (!Gep)
6975     return nullptr;
6976 
6977   // We are looking for a gep with all loop invariant indices except for one
6978   // which should be an induction variable.
6979   unsigned NumOperands = Gep->getNumOperands();
6980   for (unsigned i = 1; i < NumOperands; ++i) {
6981     Value *Opd = Gep->getOperand(i);
6982     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
6983         !Legal->isInductionVariable(Opd))
6984       return nullptr;
6985   }
6986 
6987   // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
6988   return SE->getSCEV(Ptr);
6989 }
6990 
6991 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
6992   return Legal->hasStride(I->getOperand(0)) ||
6993          Legal->hasStride(I->getOperand(1));
6994 }
6995 
6996 unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
6997                                                                  unsigned VF) {
6998   Type *ValTy = getMemInstValueType(I);
6999   auto SE = PSE.getSE();
7000 
7001   unsigned Alignment = getMemInstAlignment(I);
7002   unsigned AS = getMemInstAddressSpace(I);
7003   Value *Ptr = getPointerOperand(I);
7004   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
7005 
7006   // Figure out whether the access is strided and get the stride value
7007   // if it is known at compile time.
7008   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, SE, TheLoop);
7009 
7010   // Get the cost of the scalar memory instruction and address computation.
7011   unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
7012 
7013   Cost += VF *
7014           TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
7015                               AS, I);
7016 
7017   // Get the overhead of the extractelement and insertelement instructions
7018   // we might create due to scalarization.
7019   Cost += getScalarizationOverhead(I, VF, TTI);
7020 
7021   // If we have a predicated store, it may not be executed for each vector
7022   // lane. Scale the cost by the probability of executing the predicated
7023   // block.
7024   if (Legal->isScalarWithPredication(I))
7025     Cost /= getReciprocalPredBlockProb();
7026 
7027   return Cost;
7028 }
7029 
7030 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
7031                                                              unsigned VF) {
7032   Type *ValTy = getMemInstValueType(I);
7033   Type *VectorTy = ToVectorTy(ValTy, VF);
7034   unsigned Alignment = getMemInstAlignment(I);
7035   Value *Ptr = getPointerOperand(I);
7036   unsigned AS = getMemInstAddressSpace(I);
7037   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
7038 
7039   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
7040          "Stride should be 1 or -1 for consecutive memory access");
7041   unsigned Cost = 0;
7042   if (Legal->isMaskRequired(I))
7043     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
7044   else
7045     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I);
7046 
7047   bool Reverse = ConsecutiveStride < 0;
7048   if (Reverse)
7049     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
7050   return Cost;
7051 }
7052 
7053 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
7054                                                          unsigned VF) {
7055   LoadInst *LI = cast<LoadInst>(I);
7056   Type *ValTy = LI->getType();
7057   Type *VectorTy = ToVectorTy(ValTy, VF);
7058   unsigned Alignment = LI->getAlignment();
7059   unsigned AS = LI->getPointerAddressSpace();
7060 
7061   return TTI.getAddressComputationCost(ValTy) +
7062          TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) +
7063          TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
7064 }
7065 
7066 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
7067                                                           unsigned VF) {
7068   Type *ValTy = getMemInstValueType(I);
7069   Type *VectorTy = ToVectorTy(ValTy, VF);
7070   unsigned Alignment = getMemInstAlignment(I);
7071   Value *Ptr = getPointerOperand(I);
7072 
7073   return TTI.getAddressComputationCost(VectorTy) +
7074          TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
7075                                     Legal->isMaskRequired(I), Alignment);
7076 }
7077 
7078 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
7079                                                             unsigned VF) {
7080   Type *ValTy = getMemInstValueType(I);
7081   Type *VectorTy = ToVectorTy(ValTy, VF);
7082   unsigned AS = getMemInstAddressSpace(I);
7083 
7084   auto Group = Legal->getInterleavedAccessGroup(I);
7085   assert(Group && "Fail to get an interleaved access group.");
7086 
7087   unsigned InterleaveFactor = Group->getFactor();
7088   Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
7089 
7090   // Holds the indices of existing members in an interleaved load group.
7091   // An interleaved store group doesn't need this as it doesn't allow gaps.
7092   SmallVector<unsigned, 4> Indices;
7093   if (isa<LoadInst>(I)) {
7094     for (unsigned i = 0; i < InterleaveFactor; i++)
7095       if (Group->getMember(i))
7096         Indices.push_back(i);
7097   }
7098 
7099   // Calculate the cost of the whole interleaved group.
7100   unsigned Cost = TTI.getInterleavedMemoryOpCost(I->getOpcode(), WideVecTy,
7101                                                  Group->getFactor(), Indices,
7102                                                  Group->getAlignment(), AS);
7103 
7104   if (Group->isReverse())
7105     Cost += Group->getNumMembers() *
7106             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
7107   return Cost;
7108 }
7109 
7110 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
7111                                                               unsigned VF) {
7112   // Calculate the scalar cost only. The vectorization cost should already be
7113   // available at this point.
7114   if (VF == 1) {
7115     Type *ValTy = getMemInstValueType(I);
7116     unsigned Alignment = getMemInstAlignment(I);
7117     unsigned AS = getMemInstAddressSpace(I);
7118 
7119     return TTI.getAddressComputationCost(ValTy) +
7120            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I);
7121   }
7122   return getWideningCost(I, VF);
7123 }
7124 
7125 LoopVectorizationCostModel::VectorizationCostTy
7126 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
7127   // If we know that this instruction will remain uniform, check the cost of
7128   // the scalar version.
7129   if (isUniformAfterVectorization(I, VF))
7130     VF = 1;
7131 
7132   if (VF > 1 && isProfitableToScalarize(I, VF))
7133     return VectorizationCostTy(InstsToScalarize[VF][I], false);
7134 
7135   // Forced scalars do not have any scalarization overhead.
7136   if (VF > 1 && ForcedScalars.count(VF) &&
7137       ForcedScalars.find(VF)->second.count(I))
7138     return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false);
7139 
7140   Type *VectorTy;
7141   unsigned C = getInstructionCost(I, VF, VectorTy);
7142 
7143   bool TypeNotScalarized =
7144       VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF;
7145   return VectorizationCostTy(C, TypeNotScalarized);
7146 }
7147 
7148 void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
7149   if (VF == 1)
7150     return;
7151   for (BasicBlock *BB : TheLoop->blocks()) {
7152     // For each instruction in the old loop.
7153     for (Instruction &I : *BB) {
7154       Value *Ptr = getPointerOperand(&I);
7155       if (!Ptr)
7156         continue;
7157 
7158       if (isa<LoadInst>(&I) && Legal->isUniform(Ptr)) {
7159         // Scalar load + broadcast
7160         unsigned Cost = getUniformMemOpCost(&I, VF);
7161         setWideningDecision(&I, VF, CM_Scalarize, Cost);
7162         continue;
7163       }
7164 
7165       // We assume that widening is the best solution when possible.
7166       if (Legal->memoryInstructionCanBeWidened(&I, VF)) {
7167         unsigned Cost = getConsecutiveMemOpCost(&I, VF);
7168         setWideningDecision(&I, VF, CM_Widen, Cost);
7169         continue;
7170       }
7171 
7172       // Choose between Interleaving, Gather/Scatter or Scalarization.
7173       unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
7174       unsigned NumAccesses = 1;
7175       if (Legal->isAccessInterleaved(&I)) {
7176         auto Group = Legal->getInterleavedAccessGroup(&I);
7177         assert(Group && "Fail to get an interleaved access group.");
7178 
7179         // Make one decision for the whole group.
7180         if (getWideningDecision(&I, VF) != CM_Unknown)
7181           continue;
7182 
7183         NumAccesses = Group->getNumMembers();
7184         InterleaveCost = getInterleaveGroupCost(&I, VF);
7185       }
7186 
7187       unsigned GatherScatterCost =
7188           Legal->isLegalGatherOrScatter(&I)
7189               ? getGatherScatterCost(&I, VF) * NumAccesses
7190               : std::numeric_limits<unsigned>::max();
7191 
7192       unsigned ScalarizationCost =
7193           getMemInstScalarizationCost(&I, VF) * NumAccesses;
7194 
7195       // Choose better solution for the current VF,
7196       // write down this decision and use it during vectorization.
7197       unsigned Cost;
7198       InstWidening Decision;
7199       if (InterleaveCost <= GatherScatterCost &&
7200           InterleaveCost < ScalarizationCost) {
7201         Decision = CM_Interleave;
7202         Cost = InterleaveCost;
7203       } else if (GatherScatterCost < ScalarizationCost) {
7204         Decision = CM_GatherScatter;
7205         Cost = GatherScatterCost;
7206       } else {
7207         Decision = CM_Scalarize;
7208         Cost = ScalarizationCost;
7209       }
7210       // If the instruction belongs to an interleave group, the whole group
7211       // receives the same decision. The whole group receives the cost, but
7212       // the cost will actually be assigned to one instruction.
7213       if (auto Group = Legal->getInterleavedAccessGroup(&I))
7214         setWideningDecision(Group, VF, Decision, Cost);
7215       else
7216         setWideningDecision(&I, VF, Decision, Cost);
7217     }
7218   }
7219 
7220   // Make sure that any load of an address and any other address computation
7221   // remain scalar unless there is gather/scatter support. This avoids
7222   // inevitable extracts into address registers, and also has the benefit of
7223   // activating LSR more, since that pass can't optimize vectorized
7224   // addresses.
7225   if (TTI.prefersVectorizedAddressing())
7226     return;
7227 
7228   // Start with all scalar pointer uses.
7229   SmallPtrSet<Instruction *, 8> AddrDefs;
7230   for (BasicBlock *BB : TheLoop->blocks())
7231     for (Instruction &I : *BB) {
7232       Instruction *PtrDef =
7233         dyn_cast_or_null<Instruction>(getPointerOperand(&I));
7234       if (PtrDef && TheLoop->contains(PtrDef) &&
7235           getWideningDecision(&I, VF) != CM_GatherScatter)
7236         AddrDefs.insert(PtrDef);
7237     }
7238 
7239   // Add all instructions used to generate the addresses.
7240   SmallVector<Instruction *, 4> Worklist;
7241   for (auto *I : AddrDefs)
7242     Worklist.push_back(I);
7243   while (!Worklist.empty()) {
7244     Instruction *I = Worklist.pop_back_val();
7245     for (auto &Op : I->operands())
7246       if (auto *InstOp = dyn_cast<Instruction>(Op))
7247         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
7248             AddrDefs.insert(InstOp).second)
7249           Worklist.push_back(InstOp);
7250   }
7251 
7252   for (auto *I : AddrDefs) {
7253     if (isa<LoadInst>(I)) {
7254       // Setting the desired widening decision should ideally be handled by
7255       // the cost functions, but since this involves determining whether the
7256       // loaded value is involved in an address computation, the decision is
7257       // instead changed here once we know this is the case.
7258       if (getWideningDecision(I, VF) == CM_Widen)
7259         // Scalarize a widened load of address.
7260         setWideningDecision(I, VF, CM_Scalarize,
7261                             (VF * getMemoryInstructionCost(I, 1)));
7262       else if (auto Group = Legal->getInterleavedAccessGroup(I)) {
7263         // Scalarize an interleave group of address loads.
7264         for (unsigned I = 0; I < Group->getFactor(); ++I) {
7265           if (Instruction *Member = Group->getMember(I))
7266             setWideningDecision(Member, VF, CM_Scalarize,
7267                                 (VF * getMemoryInstructionCost(Member, 1)));
7268         }
7269       }
7270     } else
7271       // Make sure I gets scalarized and a cost estimate without
7272       // scalarization overhead.
7273       ForcedScalars[VF].insert(I);
7274   }
7275 }
7276 
7277 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
7278                                                         unsigned VF,
7279                                                         Type *&VectorTy) {
7280   Type *RetTy = I->getType();
7281   if (canTruncateToMinimalBitwidth(I, VF))
7282     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
7283   VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
7284   auto SE = PSE.getSE();
7285 
7286   // TODO: We need to estimate the cost of intrinsic calls.
7287   switch (I->getOpcode()) {
7288   case Instruction::GetElementPtr:
7289     // We mark this instruction as zero-cost because the cost of GEPs in
7290     // vectorized code depends on whether the corresponding memory instruction
7291     // is scalarized or not. Therefore, we handle GEPs with the memory
7292     // instruction cost.
7293     return 0;
7294   case Instruction::Br: {
7295     // In cases of scalarized and predicated instructions, there will be VF
7296     // predicated blocks in the vectorized loop. Each branch around these
7297     // blocks also requires an extract of its vector compare i1 element.
7298     bool ScalarPredicatedBB = false;
7299     BranchInst *BI = cast<BranchInst>(I);
7300     if (VF > 1 && BI->isConditional() &&
7301         (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
7302          PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
7303       ScalarPredicatedBB = true;
7304 
7305     if (ScalarPredicatedBB) {
7306       // Return cost for branches around scalarized and predicated blocks.
7307       Type *Vec_i1Ty =
7308           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
7309       return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) +
7310               (TTI.getCFInstrCost(Instruction::Br) * VF));
7311     } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1)
7312       // The back-edge branch will remain, as will all scalar branches.
7313       return TTI.getCFInstrCost(Instruction::Br);
7314     else
7315       // This branch will be eliminated by if-conversion.
7316       return 0;
7317     // Note: We currently assume zero cost for an unconditional branch inside
7318     // a predicated block since it will become a fall-through, although we
7319     // may decide in the future to call TTI for all branches.
7320   }
7321   case Instruction::PHI: {
7322     auto *Phi = cast<PHINode>(I);
7323 
7324     // First-order recurrences are replaced by vector shuffles inside the loop.
7325     if (VF > 1 && Legal->isFirstOrderRecurrence(Phi))
7326       return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
7327                                 VectorTy, VF - 1, VectorTy);
7328 
7329     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
7330     // converted into select instructions. We require N - 1 selects per phi
7331     // node, where N is the number of incoming values.
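    // For example, a phi merging three incoming values is lowered to two
    // chained selects, so its cost is twice the vector select cost.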
7332     if (VF > 1 && Phi->getParent() != TheLoop->getHeader())
7333       return (Phi->getNumIncomingValues() - 1) *
7334              TTI.getCmpSelInstrCost(
7335                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
7336                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF));
7337 
7338     return TTI.getCFInstrCost(Instruction::PHI);
7339   }
7340   case Instruction::UDiv:
7341   case Instruction::SDiv:
7342   case Instruction::URem:
7343   case Instruction::SRem:
7344     // If we have a predicated instruction, it may not be executed for each
7345     // vector lane. Get the scalarization cost and scale this amount by the
7346     // probability of executing the predicated block. If the instruction is not
7347     // predicated, we fall through to the next case.
7348     if (VF > 1 && Legal->isScalarWithPredication(I)) {
7349       unsigned Cost = 0;
7350 
7351       // These instructions have a non-void type, so account for the phi nodes
7352       // that we will create. This cost is likely to be zero. The phi node
7353       // cost, if any, should be scaled by the block probability because it
7354       // models a copy at the end of each predicated block.
7355       Cost += VF * TTI.getCFInstrCost(Instruction::PHI);
7356 
7357       // The cost of the non-predicated instruction.
7358       Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy);
7359 
7360       // The cost of insertelement and extractelement instructions needed for
7361       // scalarization.
7362       Cost += getScalarizationOverhead(I, VF, TTI);
7363 
7364       // Scale the cost by the probability of executing the predicated blocks.
7365       // This assumes the predicated block for each vector lane is equally
7366       // likely.
7367       return Cost / getReciprocalPredBlockProb();
7368     }
7369     LLVM_FALLTHROUGH;
7370   case Instruction::Add:
7371   case Instruction::FAdd:
7372   case Instruction::Sub:
7373   case Instruction::FSub:
7374   case Instruction::Mul:
7375   case Instruction::FMul:
7376   case Instruction::FDiv:
7377   case Instruction::FRem:
7378   case Instruction::Shl:
7379   case Instruction::LShr:
7380   case Instruction::AShr:
7381   case Instruction::And:
7382   case Instruction::Or:
7383   case Instruction::Xor: {
7384     // Since we will replace the stride by 1, the multiplication should go away.
7385     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
7386       return 0;
7387     // Certain instructions can be cheaper to vectorize if they have a constant
7388     // second vector operand. One example of this are shifts on x86.
7389     TargetTransformInfo::OperandValueKind Op1VK =
7390         TargetTransformInfo::OK_AnyValue;
7391     TargetTransformInfo::OperandValueKind Op2VK =
7392         TargetTransformInfo::OK_AnyValue;
7393     TargetTransformInfo::OperandValueProperties Op1VP =
7394         TargetTransformInfo::OP_None;
7395     TargetTransformInfo::OperandValueProperties Op2VP =
7396         TargetTransformInfo::OP_None;
7397     Value *Op2 = I->getOperand(1);
7398 
7399     // Check for a splat or for a non-uniform vector of constants.
7400     if (isa<ConstantInt>(Op2)) {
7401       ConstantInt *CInt = cast<ConstantInt>(Op2);
7402       if (CInt && CInt->getValue().isPowerOf2())
7403         Op2VP = TargetTransformInfo::OP_PowerOf2;
7404       Op2VK = TargetTransformInfo::OK_UniformConstantValue;
7405     } else if (isa<ConstantVector>(Op2) || isa<ConstantDataVector>(Op2)) {
7406       Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
7407       Constant *SplatValue = cast<Constant>(Op2)->getSplatValue();
7408       if (SplatValue) {
7409         ConstantInt *CInt = dyn_cast<ConstantInt>(SplatValue);
7410         if (CInt && CInt->getValue().isPowerOf2())
7411           Op2VP = TargetTransformInfo::OP_PowerOf2;
7412         Op2VK = TargetTransformInfo::OK_UniformConstantValue;
7413       }
7414     } else if (Legal->isUniform(Op2)) {
7415       Op2VK = TargetTransformInfo::OK_UniformValue;
7416     }
7417     SmallVector<const Value *, 4> Operands(I->operand_values());
7418     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
7419     return N * TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK,
7420                                           Op2VK, Op1VP, Op2VP, Operands);
7421   }
7422   case Instruction::Select: {
7423     SelectInst *SI = cast<SelectInst>(I);
7424     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
7425     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
7426     Type *CondTy = SI->getCondition()->getType();
7427     if (!ScalarCond)
7428       CondTy = VectorType::get(CondTy, VF);
7429 
7430     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I);
7431   }
7432   case Instruction::ICmp:
7433   case Instruction::FCmp: {
7434     Type *ValTy = I->getOperand(0)->getType();
7435     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
7436     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
7437       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
7438     VectorTy = ToVectorTy(ValTy, VF);
7439     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I);
7440   }
7441   case Instruction::Store:
7442   case Instruction::Load: {
7443     unsigned Width = VF;
7444     if (Width > 1) {
7445       InstWidening Decision = getWideningDecision(I, Width);
7446       assert(Decision != CM_Unknown &&
7447              "CM decision should be taken at this point");
7448       if (Decision == CM_Scalarize)
7449         Width = 1;
7450     }
7451     VectorTy = ToVectorTy(getMemInstValueType(I), Width);
7452     return getMemoryInstructionCost(I, VF);
7453   }
7454   case Instruction::ZExt:
7455   case Instruction::SExt:
7456   case Instruction::FPToUI:
7457   case Instruction::FPToSI:
7458   case Instruction::FPExt:
7459   case Instruction::PtrToInt:
7460   case Instruction::IntToPtr:
7461   case Instruction::SIToFP:
7462   case Instruction::UIToFP:
7463   case Instruction::Trunc:
7464   case Instruction::FPTrunc:
7465   case Instruction::BitCast: {
7466     // We optimize the truncation of induction variables having constant
7467     // integer steps. The cost of these truncations is the same as the scalar
7468     // operation.
7469     if (isOptimizableIVTruncate(I, VF)) {
7470       auto *Trunc = cast<TruncInst>(I);
7471       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
7472                                   Trunc->getSrcTy(), Trunc);
7473     }
7474 
7475     Type *SrcScalarTy = I->getOperand(0)->getType();
7476     Type *SrcVecTy =
7477         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
7478     if (canTruncateToMinimalBitwidth(I, VF)) {
7479       // This cast is going to be shrunk. This may remove the cast or it might
7480       // turn it into a slightly different cast. For example, if MinBW == 16,
7481       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
7482       //
7483       // Calculate the modified src and dest types.
7484       Type *MinVecTy = VectorTy;
7485       if (I->getOpcode() == Instruction::Trunc) {
7486         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
7487         VectorTy =
7488             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7489       } else if (I->getOpcode() == Instruction::ZExt ||
7490                  I->getOpcode() == Instruction::SExt) {
7491         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
7492         VectorTy =
7493             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7494       }
7495     }
7496 
7497     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
7498     return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I);
7499   }
7500   case Instruction::Call: {
7501     bool NeedToScalarize;
7502     CallInst *CI = cast<CallInst>(I);
7503     unsigned CallCost = getVectorCallCost(CI, VF, TTI, TLI, NeedToScalarize);
7504     if (getVectorIntrinsicIDForCall(CI, TLI))
7505       return std::min(CallCost, getVectorIntrinsicCost(CI, VF, TTI, TLI));
7506     return CallCost;
7507   }
7508   default:
7509     // The cost of executing VF copies of the scalar instruction. This opcode
7510     // is unknown. Assume that it is the same as 'mul'.
7511     return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) +
7512            getScalarizationOverhead(I, VF, TTI);
7513   } // end of switch.
7514 }
7515 
7516 char LoopVectorize::ID = 0;
7517 
7518 static const char lv_name[] = "Loop Vectorization";
7519 
7520 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
7521 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
7522 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
7523 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
7524 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
7525 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
7526 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
7527 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
7528 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
7529 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
7530 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
7531 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
7532 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
7533 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
7534 
7535 namespace llvm {
7536 
7537 Pass *createLoopVectorizePass(bool NoUnrolling, bool AlwaysVectorize) {
7538   return new LoopVectorize(NoUnrolling, AlwaysVectorize);
7539 }
7540 
7541 } // end namespace llvm
7542 
7543 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
7544   // Check if the pointer operand of a load or store instruction is
7545   // consecutive.
7546   if (auto *Ptr = getPointerOperand(Inst))
7547     return Legal->isConsecutivePtr(Ptr);
7548   return false;
7549 }
7550 
7551 void LoopVectorizationCostModel::collectValuesToIgnore() {
7552   // Ignore ephemeral values.
7553   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
7554 
7555   // Ignore type-promoting instructions we identified during reduction
7556   // detection.
7557   for (auto &Reduction : *Legal->getReductionVars()) {
7558     RecurrenceDescriptor &RedDes = Reduction.second;
7559     SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
7560     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7561   }
7562 }
7563 
7564 LoopVectorizationCostModel::VectorizationFactor
7565 LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) {
  // Width 1 means no vectorization, cost 0 means uncomputed cost.
7567   const LoopVectorizationCostModel::VectorizationFactor NoVectorization = {1U,
7568                                                                            0U};
7569   Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(OptForSize);
7570   if (!MaybeMaxVF.hasValue()) // Cases considered too costly to vectorize.
7571     return NoVectorization;
7572 
7573   if (UserVF) {
7574     DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7575     assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
7576     // Collect the instructions (and their associated costs) that will be more
7577     // profitable to scalarize.
7578     CM.selectUserVectorizationFactor(UserVF);
7579     buildVPlans(UserVF, UserVF);
7580     DEBUG(printPlans(dbgs()));
7581     return {UserVF, 0};
7582   }
7583 
7584   unsigned MaxVF = MaybeMaxVF.getValue();
7585   assert(MaxVF != 0 && "MaxVF is zero.");
7586 
7587   for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
7588     // Collect Uniform and Scalar instructions after vectorization with VF.
7589     CM.collectUniformsAndScalars(VF);
7590 
7591     // Collect the instructions (and their associated costs) that will be more
7592     // profitable to scalarize.
7593     if (VF > 1)
7594       CM.collectInstsToScalarize(VF);
7595   }
7596 
7597   buildVPlans(1, MaxVF);
7598   DEBUG(printPlans(dbgs()));
7599   if (MaxVF == 1)
7600     return NoVectorization;
7601 
7602   // Select the optimal vectorization factor.
7603   return CM.selectVectorizationFactor(MaxVF);
7604 }
7605 
7606 void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) {
7607   DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF << '\n');
7608   BestVF = VF;
7609   BestUF = UF;
7610 
7611   erase_if(VPlans, [VF](const std::unique_ptr<VPlan> &Plan) {
7612     return !Plan->hasVF(VF);
7613   });
  assert(VPlans.size() == 1 && "Best VF does not have a single VPlan.");
7615 }
7616 
7617 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
7618                                            DominatorTree *DT) {
7619   // Perform the actual loop transformation.
7620 
7621   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
7622   VPTransformState State{
7623       BestVF, BestUF, LI, DT, ILV.Builder, ILV.VectorLoopValueMap, &ILV};
7624   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
7625 
7626   //===------------------------------------------------===//
7627   //
  // Notice: any optimization or new instruction that goes
  // into the code below should also be implemented in
  // the cost-model.
7631   //
7632   //===------------------------------------------------===//
7633 
7634   // 2. Copy and widen instructions from the old loop into the new loop.
7635   assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
7636   VPlans.front()->execute(&State);
7637 
7638   // 3. Fix the vectorized code: take care of header phi's, live-outs,
7639   //    predication, updating analyses.
7640   ILV.fixVectorizedLoop();
7641 }
7642 
7643 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
7644     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
7645   BasicBlock *Latch = OrigLoop->getLoopLatch();
7646 
7647   // We create new control-flow for the vectorized loop, so the original
7648   // condition will be dead after vectorization if it's only used by the
7649   // branch.
7650   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
7651   if (Cmp && Cmp->hasOneUse())
7652     DeadInstructions.insert(Cmp);
7653 
7654   // We create new "steps" for induction variable updates to which the original
7655   // induction variables map. An original update instruction will be dead if
7656   // all its users except the induction variable are dead.
7657   for (auto &Induction : *Legal->getInductionVars()) {
7658     PHINode *Ind = Induction.first;
7659     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
7660     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
7661           return U == Ind || DeadInstructions.count(cast<Instruction>(U));
7662         }))
7663       DeadInstructions.insert(IndUpdate);
7664   }
7665 }
7666 
7667 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
7668 
7669 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
7670 
7671 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
7672                                         Instruction::BinaryOps BinOp) {
7673   // When unrolling and the VF is 1, we only need to add a simple scalar.
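  // For example (illustrative), for part StartIdx = 2 with step %s, the value
  // produced for %val is simply %val + 2 * %s.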
7674   Type *Ty = Val->getType();
7675   assert(!Ty->isVectorTy() && "Val must be a scalar");
7676 
7677   if (Ty->isFloatingPointTy()) {
7678     Constant *C = ConstantFP::get(Ty, (double)StartIdx);
7679 
7680     // Floating point operations had to be 'fast' to enable the unrolling.
7681     Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
7682     return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
7683   }
7684   Constant *C = ConstantInt::get(Ty, StartIdx);
7685   return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
7686 }
7687 
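// Mark the loop L with "llvm.loop.unroll.runtime.disable" unless loop-unroll
// disable metadata is already present. The resulting loop ID looks roughly
// like (illustrative only):
//   !0 = distinct !{!0, ..., !1}
//   !1 = !{!"llvm.loop.unroll.runtime.disable"}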
7688 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
7689   SmallVector<Metadata *, 4> MDs;
7690   // Reserve first location for self reference to the LoopID metadata node.
7691   MDs.push_back(nullptr);
7692   bool IsUnrollMetadata = false;
7693   MDNode *LoopID = L->getLoopID();
7694   if (LoopID) {
7695     // First find existing loop unrolling disable metadata.
7696     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
7697       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
7698       if (MD) {
7699         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7700         IsUnrollMetadata =
7701             S && S->getString().startswith("llvm.loop.unroll.disable");
7702       }
7703       MDs.push_back(LoopID->getOperand(i));
7704     }
7705   }
7706 
7707   if (!IsUnrollMetadata) {
7708     // Add runtime unroll disable metadata.
7709     LLVMContext &Context = L->getHeader()->getContext();
7710     SmallVector<Metadata *, 1> DisableOperands;
7711     DisableOperands.push_back(
7712         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7713     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7714     MDs.push_back(DisableNode);
7715     MDNode *NewLoopID = MDNode::get(Context, MDs);
7716     // Set operand 0 to refer to the loop id itself.
7717     NewLoopID->replaceOperandWith(0, NewLoopID);
7718     L->setLoopID(NewLoopID);
7719   }
7720 }
7721 
7722 namespace {
7723 
7724 /// VPWidenRecipe is a recipe for producing a copy of vector type for each
7725 /// Instruction in its ingredients independently, in order. This recipe covers
7726 /// most of the traditional vectorization cases where each ingredient transforms
7727 /// into a vectorized version of itself.
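/// For example (a sketch, assuming VF = 4 and illustrative value names), the
/// scalar ingredient
///   %a = add i32 %x, %y
/// is widened into a single
///   %a.vec = add <4 x i32> %x.vec, %y.vec
/// using the already-widened operands.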
7728 class VPWidenRecipe : public VPRecipeBase {
7729 private:
7730   /// Hold the ingredients by pointing to their original BasicBlock location.
7731   BasicBlock::iterator Begin;
7732   BasicBlock::iterator End;
7733 
7734 public:
7735   VPWidenRecipe(Instruction *I) : VPRecipeBase(VPWidenSC) {
7736     End = I->getIterator();
7737     Begin = End++;
7738   }
7739 
7740   ~VPWidenRecipe() override = default;
7741 
7742   /// Method to support type inquiry through isa, cast, and dyn_cast.
7743   static inline bool classof(const VPRecipeBase *V) {
7744     return V->getVPRecipeID() == VPRecipeBase::VPWidenSC;
7745   }
7746 
7747   /// Produce widened copies of all Ingredients.
7748   void execute(VPTransformState &State) override {
7749     for (auto &Instr : make_range(Begin, End))
7750       State.ILV->widenInstruction(Instr);
7751   }
7752 
7753   /// Augment the recipe to include Instr, if it lies at its End.
7754   bool appendInstruction(Instruction *Instr) {
7755     if (End != Instr->getIterator())
7756       return false;
7757     End++;
7758     return true;
7759   }
7760 
7761   /// Print the recipe.
7762   void print(raw_ostream &O, const Twine &Indent) const override {
7763     O << " +\n" << Indent << "\"WIDEN\\l\"";
7764     for (auto &Instr : make_range(Begin, End))
7765       O << " +\n" << Indent << "\"  " << VPlanIngredient(&Instr) << "\\l\"";
7766   }
7767 };
7768 
7769 /// A recipe for handling phi nodes of integer and floating-point inductions,
7770 /// producing their vector and scalar values.
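/// For example (illustrative), an i32 induction starting at 0 with step 1 and
/// VF = 4 yields the vector <0, 1, 2, 3> for the first vector iteration,
/// stepped by 4 on each subsequent iteration, with scalar steps generated as
/// needed for scalar users.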
7771 class VPWidenIntOrFpInductionRecipe : public VPRecipeBase {
7772 private:
7773   PHINode *IV;
7774   TruncInst *Trunc;
7775 
7776 public:
7777   VPWidenIntOrFpInductionRecipe(PHINode *IV, TruncInst *Trunc = nullptr)
7778       : VPRecipeBase(VPWidenIntOrFpInductionSC), IV(IV), Trunc(Trunc) {}
7779   ~VPWidenIntOrFpInductionRecipe() override = default;
7780 
7781   /// Method to support type inquiry through isa, cast, and dyn_cast.
7782   static inline bool classof(const VPRecipeBase *V) {
7783     return V->getVPRecipeID() == VPRecipeBase::VPWidenIntOrFpInductionSC;
7784   }
7785 
7786   /// Generate the vectorized and scalarized versions of the phi node as
7787   /// needed by their users.
7788   void execute(VPTransformState &State) override {
7789     assert(!State.Instance && "Int or FP induction being replicated.");
7790     State.ILV->widenIntOrFpInduction(IV, Trunc);
7791   }
7792 
7793   /// Print the recipe.
7794   void print(raw_ostream &O, const Twine &Indent) const override {
7795     O << " +\n" << Indent << "\"WIDEN-INDUCTION";
7796     if (Trunc) {
7797       O << "\\l\"";
7798       O << " +\n" << Indent << "\"  " << VPlanIngredient(IV) << "\\l\"";
7799       O << " +\n" << Indent << "\"  " << VPlanIngredient(Trunc) << "\\l\"";
7800     } else
7801       O << " " << VPlanIngredient(IV) << "\\l\"";
7802   }
7803 };
7804 
7805 /// A recipe for handling all phi nodes except for integer and FP inductions.
7806 class VPWidenPHIRecipe : public VPRecipeBase {
7807 private:
7808   PHINode *Phi;
7809 
7810 public:
7811   VPWidenPHIRecipe(PHINode *Phi) : VPRecipeBase(VPWidenPHISC), Phi(Phi) {}
7812   ~VPWidenPHIRecipe() override = default;
7813 
7814   /// Method to support type inquiry through isa, cast, and dyn_cast.
7815   static inline bool classof(const VPRecipeBase *V) {
7816     return V->getVPRecipeID() == VPRecipeBase::VPWidenPHISC;
7817   }
7818 
7819   /// Generate the phi/select nodes.
7820   void execute(VPTransformState &State) override {
7821     State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
7822   }
7823 
7824   /// Print the recipe.
7825   void print(raw_ostream &O, const Twine &Indent) const override {
7826     O << " +\n" << Indent << "\"WIDEN-PHI " << VPlanIngredient(Phi) << "\\l\"";
7827   }
7828 };
7829 
7830 /// VPInterleaveRecipe is a recipe for transforming an interleave group of load
7831 /// or stores into one wide load/store and shuffles.
7832 class VPInterleaveRecipe : public VPRecipeBase {
7833 private:
7834   const InterleaveGroup *IG;
7835 
7836 public:
7837   VPInterleaveRecipe(const InterleaveGroup *IG)
7838       : VPRecipeBase(VPInterleaveSC), IG(IG) {}
7839   ~VPInterleaveRecipe() override = default;
7840 
7841   /// Method to support type inquiry through isa, cast, and dyn_cast.
7842   static inline bool classof(const VPRecipeBase *V) {
7843     return V->getVPRecipeID() == VPRecipeBase::VPInterleaveSC;
7844   }
7845 
7846   /// Generate the wide load or store, and shuffles.
7847   void execute(VPTransformState &State) override {
7848     assert(!State.Instance && "Interleave group being replicated.");
7849     State.ILV->vectorizeInterleaveGroup(IG->getInsertPos());
7850   }
7851 
7852   /// Print the recipe.
7853   void print(raw_ostream &O, const Twine &Indent) const override;
7854 
7855   const InterleaveGroup *getInterleaveGroup() { return IG; }
7856 };
7857 
/// VPReplicateRecipe replicates a given instruction producing multiple scalar
/// copies of the original scalar type, one per lane, instead of producing a
/// single copy of widened type for all lanes. If the instruction is known to
/// be uniform, only one copy, for lane zero of each part, is generated.
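/// For example (illustrative), with VF = 4 and UF = 2 a non-uniform ingredient
/// is replicated into 8 scalar copies, whereas a uniform one is replicated
/// only twice, once per unrolled part.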
7862 class VPReplicateRecipe : public VPRecipeBase {
7863 private:
7864   /// The instruction being replicated.
7865   Instruction *Ingredient;
7866 
  /// Indicator if only a single replica per part is needed (lane zero only).
7868   bool IsUniform;
7869 
7870   /// Indicator if the replicas are also predicated.
7871   bool IsPredicated;
7872 
7873   /// Indicator if the scalar values should also be packed into a vector.
7874   bool AlsoPack;
7875 
7876 public:
7877   VPReplicateRecipe(Instruction *I, bool IsUniform, bool IsPredicated = false)
7878       : VPRecipeBase(VPReplicateSC), Ingredient(I), IsUniform(IsUniform),
7879         IsPredicated(IsPredicated) {
7880     // Retain the previous behavior of predicateInstructions(), where an
7881     // insert-element of a predicated instruction got hoisted into the
7882     // predicated basic block iff it was its only user. This is achieved by
7883     // having predicated instructions also pack their values into a vector by
7884     // default unless they have a replicated user which uses their scalar value.
7885     AlsoPack = IsPredicated && !I->use_empty();
7886   }
7887 
7888   ~VPReplicateRecipe() override = default;
7889 
7890   /// Method to support type inquiry through isa, cast, and dyn_cast.
7891   static inline bool classof(const VPRecipeBase *V) {
7892     return V->getVPRecipeID() == VPRecipeBase::VPReplicateSC;
7893   }
7894 
7895   /// Generate replicas of the desired Ingredient. Replicas will be generated
7896   /// for all parts and lanes unless a specific part and lane are specified in
7897   /// the \p State.
7898   void execute(VPTransformState &State) override;
7899 
7900   void setAlsoPack(bool Pack) { AlsoPack = Pack; }
7901 
7902   /// Print the recipe.
7903   void print(raw_ostream &O, const Twine &Indent) const override {
7904     O << " +\n"
7905       << Indent << "\"" << (IsUniform ? "CLONE " : "REPLICATE ")
7906       << VPlanIngredient(Ingredient);
7907     if (AlsoPack)
7908       O << " (S->V)";
7909     O << "\\l\"";
7910   }
7911 };
7912 
7913 /// A recipe for generating conditional branches on the bits of a mask.
7914 class VPBranchOnMaskRecipe : public VPRecipeBase {
7915 private:
7916   /// The input IR basic block used to obtain the mask providing the condition
7917   /// bits for the branch.
7918   BasicBlock *MaskedBasicBlock;
7919 
7920 public:
7921   VPBranchOnMaskRecipe(BasicBlock *BB)
7922       : VPRecipeBase(VPBranchOnMaskSC), MaskedBasicBlock(BB) {}
7923 
7924   /// Method to support type inquiry through isa, cast, and dyn_cast.
7925   static inline bool classof(const VPRecipeBase *V) {
7926     return V->getVPRecipeID() == VPRecipeBase::VPBranchOnMaskSC;
7927   }
7928 
7929   /// Generate the extraction of the appropriate bit from the block mask and the
7930   /// conditional branch.
7931   void execute(VPTransformState &State) override;
7932 
7933   /// Print the recipe.
7934   void print(raw_ostream &O, const Twine &Indent) const override {
7935     O << " +\n"
7936       << Indent << "\"BRANCH-ON-MASK-OF " << MaskedBasicBlock->getName()
7937       << "\\l\"";
7938   }
7939 };
7940 
7941 /// VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when
7942 /// control converges back from a Branch-on-Mask. The phi nodes are needed in
7943 /// order to merge values that are set under such a branch and feed their uses.
7944 /// The phi nodes can be scalar or vector depending on the users of the value.
7945 /// This recipe works in concert with VPBranchOnMaskRecipe.
7946 class VPPredInstPHIRecipe : public VPRecipeBase {
7947 private:
7948   Instruction *PredInst;
7949 
7950 public:
  /// Construct a VPPredInstPHIRecipe given \p PredInst whose value needs a phi
  /// node after merging back from a Branch-on-Mask.
7953   VPPredInstPHIRecipe(Instruction *PredInst)
7954       : VPRecipeBase(VPPredInstPHISC), PredInst(PredInst) {}
7955   ~VPPredInstPHIRecipe() override = default;
7956 
7957   /// Method to support type inquiry through isa, cast, and dyn_cast.
7958   static inline bool classof(const VPRecipeBase *V) {
7959     return V->getVPRecipeID() == VPRecipeBase::VPPredInstPHISC;
7960   }
7961 
7962   /// Generates phi nodes for live-outs as needed to retain SSA form.
7963   void execute(VPTransformState &State) override;
7964 
7965   /// Print the recipe.
7966   void print(raw_ostream &O, const Twine &Indent) const override {
7967     O << " +\n"
7968       << Indent << "\"PHI-PREDICATED-INSTRUCTION " << VPlanIngredient(PredInst)
7969       << "\\l\"";
7970   }
7971 };
7972 
7973 } // end anonymous namespace
7974 
7975 bool LoopVectorizationPlanner::getDecisionAndClampRange(
7976     const std::function<bool(unsigned)> &Predicate, VFRange &Range) {
7977   assert(Range.End > Range.Start && "Trying to test an empty VF range.");
7978   bool PredicateAtRangeStart = Predicate(Range.Start);
7979 
7980   for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2)
7981     if (Predicate(TmpVF) != PredicateAtRangeStart) {
7982       Range.End = TmpVF;
7983       break;
7984     }
7985 
7986   return PredicateAtRangeStart;
7987 }
7988 
7989 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
7990 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
7991 /// of VF's starting at a given VF and extending it as much as possible. Each
7992 /// vectorization decision can potentially shorten this sub-range during
7993 /// buildVPlan().
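/// For example (illustrative), with \p MinVF = 1 and \p MaxVF = 8 this may
/// produce one VPlan covering VF's {1,2} and another covering {4,8}, if some
/// widening decision differs between VF = 2 and VF = 4.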
7994 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {
7995   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
7996     VFRange SubRange = {VF, MaxVF + 1};
7997     VPlans.push_back(buildVPlan(SubRange));
7998     VF = SubRange.End;
7999   }
8000 }
8001 
8002 VPInterleaveRecipe *
8003 LoopVectorizationPlanner::tryToInterleaveMemory(Instruction *I,
8004                                                 VFRange &Range) {
8005   const InterleaveGroup *IG = Legal->getInterleavedAccessGroup(I);
8006   if (!IG)
8007     return nullptr;
8008 
8009   // Now check if IG is relevant for VF's in the given range.
8010   auto isIGMember = [&](Instruction *I) -> std::function<bool(unsigned)> {
8011     return [=](unsigned VF) -> bool {
8012       return (VF >= 2 && // Query is illegal for VF == 1
8013               CM.getWideningDecision(I, VF) ==
8014                   LoopVectorizationCostModel::CM_Interleave);
8015     };
8016   };
8017   if (!getDecisionAndClampRange(isIGMember(I), Range))
8018     return nullptr;
8019 
  // I is a member of an InterleaveGroup for VF's in the (possibly trimmed)
  // range. If it's the primary member of the IG, construct a
  // VPInterleaveRecipe. Otherwise, it's an adjunct member of the IG; do not
  // construct any Recipe for it.
8023   assert(I == IG->getInsertPos() &&
8024          "Generating a recipe for an adjunct member of an interleave group");
8025 
8026   return new VPInterleaveRecipe(IG);
8027 }
8028 
8029 VPWidenIntOrFpInductionRecipe *
8030 LoopVectorizationPlanner::tryToOptimizeInduction(Instruction *I,
8031                                                  VFRange &Range) {
8032   if (PHINode *Phi = dyn_cast<PHINode>(I)) {
8033     // Check if this is an integer or fp induction. If so, build the recipe that
8034     // produces its scalar and vector values.
8035     InductionDescriptor II = Legal->getInductionVars()->lookup(Phi);
8036     if (II.getKind() == InductionDescriptor::IK_IntInduction ||
8037         II.getKind() == InductionDescriptor::IK_FpInduction)
8038       return new VPWidenIntOrFpInductionRecipe(Phi);
8039 
8040     return nullptr;
8041   }
8042 
8043   // Optimize the special case where the source is a constant integer
8044   // induction variable. Notice that we can only optimize the 'trunc' case
8045   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8046   // (c) other casts depend on pointer size.
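  // For example (illustrative), 'trunc i64 %iv to i32' where %iv is an
  // induction with a constant step can be widened directly as a truncated
  // induction, avoiding a wide trunc.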
8047 
8048   // Determine whether \p K is a truncation based on an induction variable that
8049   // can be optimized.
8050   auto isOptimizableIVTruncate =
8051       [&](Instruction *K) -> std::function<bool(unsigned)> {
8052     return
8053         [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); };
8054   };
8055 
8056   if (isa<TruncInst>(I) &&
8057       getDecisionAndClampRange(isOptimizableIVTruncate(I), Range))
8058     return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
8059                                              cast<TruncInst>(I));
8060   return nullptr;
8061 }
8062 
8063 bool LoopVectorizationPlanner::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
8064                                           VFRange &Range) {
8065   if (Legal->isScalarWithPredication(I))
8066     return false;
8067 
8068   auto IsVectorizableOpcode = [](unsigned Opcode) {
8069     switch (Opcode) {
8070     case Instruction::Add:
8071     case Instruction::And:
8072     case Instruction::AShr:
8073     case Instruction::BitCast:
8074     case Instruction::Br:
8075     case Instruction::Call:
8076     case Instruction::FAdd:
8077     case Instruction::FCmp:
8078     case Instruction::FDiv:
8079     case Instruction::FMul:
8080     case Instruction::FPExt:
8081     case Instruction::FPToSI:
8082     case Instruction::FPToUI:
8083     case Instruction::FPTrunc:
8084     case Instruction::FRem:
8085     case Instruction::FSub:
8086     case Instruction::GetElementPtr:
8087     case Instruction::ICmp:
8088     case Instruction::IntToPtr:
8089     case Instruction::Load:
8090     case Instruction::LShr:
8091     case Instruction::Mul:
8092     case Instruction::Or:
8093     case Instruction::PHI:
8094     case Instruction::PtrToInt:
8095     case Instruction::SDiv:
8096     case Instruction::Select:
8097     case Instruction::SExt:
8098     case Instruction::Shl:
8099     case Instruction::SIToFP:
8100     case Instruction::SRem:
8101     case Instruction::Store:
8102     case Instruction::Sub:
8103     case Instruction::Trunc:
8104     case Instruction::UDiv:
8105     case Instruction::UIToFP:
8106     case Instruction::URem:
8107     case Instruction::Xor:
8108     case Instruction::ZExt:
8109       return true;
8110     }
8111     return false;
8112   };
8113 
8114   if (!IsVectorizableOpcode(I->getOpcode()))
8115     return false;
8116 
8117   if (CallInst *CI = dyn_cast<CallInst>(I)) {
8118     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8119     if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8120                ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
8121       return false;
8122   }
8123 
8124   auto willWiden = [&](unsigned VF) -> bool {
8125     if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) ||
8126                              CM.isProfitableToScalarize(I, VF)))
8127       return false;
8128     if (CallInst *CI = dyn_cast<CallInst>(I)) {
8129       Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
      // The following case may be scalarized depending on the VF.
      // The flag indicates whether to use an intrinsic or a plain call for the
      // vectorized version of the instruction, i.e., whether calling the
      // intrinsic is cheaper than calling the library function.
8134       bool NeedToScalarize;
8135       unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize);
8136       bool UseVectorIntrinsic =
8137           ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost;
8138       return UseVectorIntrinsic || !NeedToScalarize;
8139     }
8140     if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
8141       LoopVectorizationCostModel::InstWidening Decision =
8142           CM.getWideningDecision(I, VF);
8143       assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
8144              "CM decision should be taken at this point.");
8145       assert(Decision != LoopVectorizationCostModel::CM_Interleave &&
8146              "Interleave memory opportunity should be caught earlier.");
8147       return Decision != LoopVectorizationCostModel::CM_Scalarize;
8148     }
8149     return true;
8150   };
8151 
8152   if (!getDecisionAndClampRange(willWiden, Range))
8153     return false;
8154 
8155   // Success: widen this instruction. We optimize the common case where
8156   // consecutive instructions can be represented by a single recipe.
8157   if (!VPBB->empty()) {
8158     VPWidenRecipe *LastWidenRecipe = dyn_cast<VPWidenRecipe>(&VPBB->back());
8159     if (LastWidenRecipe && LastWidenRecipe->appendInstruction(I))
8160       return true;
8161   }
8162 
8163   VPBB->appendRecipe(new VPWidenRecipe(I));
8164   return true;
8165 }
8166 
8167 VPBasicBlock *LoopVectorizationPlanner::handleReplication(
8168     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
8169     DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe) {
8170   bool IsUniform = getDecisionAndClampRange(
8171       [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); },
8172       Range);
8173 
8174   bool IsPredicated = Legal->isScalarWithPredication(I);
8175   auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated);
8176 
8177   // Find if I uses a predicated instruction. If so, it will use its scalar
8178   // value. Avoid hoisting the insert-element which packs the scalar value into
8179   // a vector value, as that happens iff all users use the vector value.
8180   for (auto &Op : I->operands())
8181     if (auto *PredInst = dyn_cast<Instruction>(Op))
8182       if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
8183         PredInst2Recipe[PredInst]->setAlsoPack(false);
8184 
8185   // Finalize the recipe for Instr, first if it is not predicated.
8186   if (!IsPredicated) {
8187     DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8188     VPBB->appendRecipe(Recipe);
8189     return VPBB;
8190   }
8191   DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8192   assert(VPBB->getSuccessors().empty() &&
8193          "VPBB has successors when handling predicated replication.");
8194   // Record predicated instructions for above packing optimizations.
8195   PredInst2Recipe[I] = Recipe;
8196   VPBlockBase *Region = VPBB->setOneSuccessor(createReplicateRegion(I, Recipe));
8197   return cast<VPBasicBlock>(Region->setOneSuccessor(new VPBasicBlock()));
8198 }
8199 
8200 VPRegionBlock *
8201 LoopVectorizationPlanner::createReplicateRegion(Instruction *Instr,
8202                                                 VPRecipeBase *PredRecipe) {
8203   // Instructions marked for predication are replicated and placed under an
8204   // if-then construct to prevent side-effects.
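  // For a predicated store, for example, the region built below is laid out as
  // (block names illustrative):
  //   pred.store.entry -> pred.store.if -> pred.store.continue
  // where the entry block branches on the mask bit, the "if" block holds the
  // replicated instruction, and the continue block merges control flow (and,
  // for non-void instructions, carries the phi added by VPPredInstPHIRecipe).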
8205 
8206   // Build the triangular if-then region.
8207   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
8208   assert(Instr->getParent() && "Predicated instruction not in any basic block");
8209   auto *BOMRecipe = new VPBranchOnMaskRecipe(Instr->getParent());
8210   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
8211   auto *PHIRecipe =
8212       Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr);
8213   auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
8214   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
8215   VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
8216 
8217   // Note: first set Entry as region entry and then connect successors starting
8218   // from it in order, to propagate the "parent" of each VPBasicBlock.
8219   Entry->setTwoSuccessors(Pred, Exit);
8220   Pred->setOneSuccessor(Exit);
8221 
8222   return Region;
8223 }
8224 
8225 std::unique_ptr<VPlan> LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
8226   DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
8227   DenseMap<Instruction *, Instruction *> SinkAfterInverse;
8228 
8229   // Collect instructions from the original loop that will become trivially dead
8230   // in the vectorized loop. We don't need to vectorize these instructions. For
8231   // example, original induction update instructions can become dead because we
8232   // separately emit induction "steps" when generating code for the new loop.
8233   // Similarly, we create a new latch condition when setting up the structure
8234   // of the new loop, so the old one can become dead.
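  // For example (illustrative), an update '%iv.next = add i64 %iv, 1' becomes
  // dead here when its only remaining user is the induction phi itself.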
8235   SmallPtrSet<Instruction *, 4> DeadInstructions;
8236   collectTriviallyDeadInstructions(DeadInstructions);
8237 
8238   // Hold a mapping from predicated instructions to their recipes, in order to
8239   // fix their AlsoPack behavior if a user is determined to replicate and use a
8240   // scalar instead of vector value.
8241   DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
8242 
8243   // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
8244   VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
8245   auto Plan = llvm::make_unique<VPlan>(VPBB);
8246 
8247   // Scan the body of the loop in a topological order to visit each basic block
8248   // after having visited its predecessor basic blocks.
8249   LoopBlocksDFS DFS(OrigLoop);
8250   DFS.perform(LI);
8251 
8252   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
8253     // Relevant instructions from basic block BB will be grouped into VPRecipe
8254     // ingredients and fill a new VPBasicBlock.
8255     unsigned VPBBsForBB = 0;
8256     auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
8257     VPBB->setOneSuccessor(FirstVPBBForBB);
8258     VPBB = FirstVPBBForBB;
8259 
8260     std::vector<Instruction *> Ingredients;
8261 
8262     // Organize the ingredients to vectorize from current basic block in the
8263     // right order.
8264     for (Instruction &I : *BB) {
8265       Instruction *Instr = &I;
8266 
8267       // First filter out irrelevant instructions, to ensure no recipes are
8268       // built for them.
8269       if (isa<BranchInst>(Instr) || isa<DbgInfoIntrinsic>(Instr) ||
8270           DeadInstructions.count(Instr))
8271         continue;
8272 
8273       // I is a member of an InterleaveGroup for Range.Start. If it's an adjunct
8274       // member of the IG, do not construct any Recipe for it.
8275       const InterleaveGroup *IG = Legal->getInterleavedAccessGroup(Instr);
8276       if (IG && Instr != IG->getInsertPos() &&
8277           Range.Start >= 2 && // Query is illegal for VF == 1
8278           CM.getWideningDecision(Instr, Range.Start) ==
8279               LoopVectorizationCostModel::CM_Interleave) {
8280         if (SinkAfterInverse.count(Instr))
8281           Ingredients.push_back(SinkAfterInverse.find(Instr)->second);
8282         continue;
8283       }
8284 
8285       // Move instructions to handle first-order recurrences, step 1: avoid
8286       // handling this instruction until after we've handled the instruction it
8287       // should follow.
8288       auto SAIt = SinkAfter.find(Instr);
8289       if (SAIt != SinkAfter.end()) {
8290         DEBUG(dbgs() << "Sinking" << *SAIt->first << " after" << *SAIt->second
8291                      << " to vectorize a 1st order recurrence.\n");
8292         SinkAfterInverse[SAIt->second] = Instr;
8293         continue;
8294       }
8295 
8296       Ingredients.push_back(Instr);
8297 
8298       // Move instructions to handle first-order recurrences, step 2: push the
8299       // instruction to be sunk at its insertion point.
8300       auto SAInvIt = SinkAfterInverse.find(Instr);
8301       if (SAInvIt != SinkAfterInverse.end())
8302         Ingredients.push_back(SAInvIt->second);
8303     }
8304 
8305     // Introduce each ingredient into VPlan.
8306     for (Instruction *Instr : Ingredients) {
8307       VPRecipeBase *Recipe = nullptr;
8308 
8309       // Check if Instr should belong to an interleave memory recipe, or already
8310       // does. In the latter case Instr is irrelevant.
8311       if ((Recipe = tryToInterleaveMemory(Instr, Range))) {
8312         VPBB->appendRecipe(Recipe);
8313         continue;
8314       }
8315 
8316       // Check if Instr should form some PHI recipe.
8317       if ((Recipe = tryToOptimizeInduction(Instr, Range))) {
8318         VPBB->appendRecipe(Recipe);
8319         continue;
8320       }
8321       if (PHINode *Phi = dyn_cast<PHINode>(Instr)) {
8322         VPBB->appendRecipe(new VPWidenPHIRecipe(Phi));
8323         continue;
8324       }
8325 
8326       // Check if Instr is to be widened by a general VPWidenRecipe, after
8327       // having first checked for specific widening recipes that deal with
8328       // Interleave Groups, Inductions and Phi nodes.
8329       if (tryToWiden(Instr, VPBB, Range))
8330         continue;
8331 
8332       // Otherwise, if all widening options failed, Instruction is to be
8333       // replicated. This may create a successor for VPBB.
8334       VPBasicBlock *NextVPBB =
8335           handleReplication(Instr, Range, VPBB, PredInst2Recipe);
8336       if (NextVPBB != VPBB) {
8337         VPBB = NextVPBB;
8338         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
8339                                     : "");
8340       }
8341     }
8342   }
8343 
  // Discard the empty dummy pre-entry VPBasicBlock. Note that other
  // VPBasicBlocks may also be empty, such as the last one (VPBB), reflecting
  // original basic blocks with no recipes.
8347   VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
8348   assert(PreEntry->empty() && "Expecting empty pre-entry block.");
8349   VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
8350   PreEntry->disconnectSuccessor(Entry);
8351   delete PreEntry;
8352 
8353   std::string PlanName;
8354   raw_string_ostream RSO(PlanName);
8355   unsigned VF = Range.Start;
8356   Plan->addVF(VF);
8357   RSO << "Initial VPlan for VF={" << VF;
8358   for (VF *= 2; VF < Range.End; VF *= 2) {
8359     Plan->addVF(VF);
8360     RSO << "," << VF;
8361   }
8362   RSO << "},UF>=1";
8363   RSO.flush();
8364   Plan->setName(PlanName);
8365 
8366   return Plan;
8367 }
8368 
8369 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const {
8370   O << " +\n"
8371     << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
8372   IG->getInsertPos()->printAsOperand(O, false);
8373   O << "\\l\"";
8374   for (unsigned i = 0; i < IG->getFactor(); ++i)
8375     if (Instruction *I = IG->getMember(i))
8376       O << " +\n"
8377         << Indent << "\"  " << VPlanIngredient(I) << " " << i << "\\l\"";
8378 }
8379 
8380 void VPReplicateRecipe::execute(VPTransformState &State) {
8381   if (State.Instance) { // Generate a single instance.
8382     State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated);
8383     // Insert scalar instance packing it into a vector.
8384     if (AlsoPack && State.VF > 1) {
8385       // If we're constructing lane 0, initialize to start from undef.
8386       if (State.Instance->Lane == 0) {
8387         Value *Undef =
8388             UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
8389         State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
8390       }
8391       State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
8392     }
8393     return;
8394   }
8395 
  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for
  // each of the UF parts.
8399   unsigned EndLane = IsUniform ? 1 : State.VF;
8400   for (unsigned Part = 0; Part < State.UF; ++Part)
8401     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
8402       State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated);
8403 }
8404 
8405 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
8406   assert(State.Instance && "Branch on Mask works only on single instance.");
8407 
8408   unsigned Part = State.Instance->Part;
8409   unsigned Lane = State.Instance->Lane;
8410 
8411   auto Cond = State.ILV->createBlockInMask(MaskedBasicBlock);
8412 
8413   Value *ConditionBit = Cond[Part];
8414   if (!ConditionBit) // Block in mask is all-one.
8415     ConditionBit = State.Builder.getTrue();
8416   else if (ConditionBit->getType()->isVectorTy())
8417     ConditionBit = State.Builder.CreateExtractElement(
8418         ConditionBit, State.Builder.getInt32(Lane));
8419 
8420   // Replace the temporary unreachable terminator with a new conditional branch,
8421   // whose two destinations will be set later when they are created.
8422   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
8423   assert(isa<UnreachableInst>(CurrentTerminator) &&
8424          "Expected to replace unreachable terminator with conditional branch.");
8425   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
8426   CondBr->setSuccessor(0, nullptr);
8427   ReplaceInstWithInst(CurrentTerminator, CondBr);
8428 
8429   DEBUG(dbgs() << "\nLV: vectorizing BranchOnMask recipe "
8430                << MaskedBasicBlock->getName());
8431 }
8432 
8433 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
8434   assert(State.Instance && "Predicated instruction PHI works per instance.");
8435   Instruction *ScalarPredInst = cast<Instruction>(
8436       State.ValueMap.getScalarValue(PredInst, *State.Instance));
8437   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
8438   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
8439   assert(PredicatingBB && "Predicated block has no single predecessor.");
8440 
8441   // By current pack/unpack logic we need to generate only a single phi node: if
8442   // a vector value for the predicated instruction exists at this point it means
8443   // the instruction has vector users only, and a phi for the vector value is
8444   // needed. In this case the recipe of the predicated instruction is marked to
8445   // also do that packing, thereby "hoisting" the insert-element sequence.
8446   // Otherwise, a phi node for the scalar value is needed.
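  // For example (names illustrative), for a predicated scalar %d the scalar
  // case below produces:
  //   %d.phi = phi i32 [ undef, %pred.udiv.entry ], [ %d, %pred.udiv.if ]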
8447   unsigned Part = State.Instance->Part;
8448   if (State.ValueMap.hasVectorValue(PredInst, Part)) {
8449     Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
8450     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
8451     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
8452     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
8453     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
8454     State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
8455   } else {
8456     Type *PredInstType = PredInst->getType();
8457     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
8458     Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
8459     Phi->addIncoming(ScalarPredInst, PredicatedBB);
8460     State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
8461   }
8462 }
8463 
8464 bool LoopVectorizePass::processLoop(Loop *L) {
8465   assert(L->empty() && "Only process inner loops.");
8466 
8467 #ifndef NDEBUG
8468   const std::string DebugLocStr = getDebugLocString(L);
8469 #endif /* NDEBUG */
8470 
8471   DEBUG(dbgs() << "\nLV: Checking a loop in \""
8472                << L->getHeader()->getParent()->getName() << "\" from "
8473                << DebugLocStr << "\n");
8474 
8475   LoopVectorizeHints Hints(L, DisableUnrolling, *ORE);
8476 
8477   DEBUG(dbgs() << "LV: Loop hints:"
8478                << " force="
8479                << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
8480                        ? "disabled"
8481                        : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
8482                               ? "enabled"
8483                               : "?"))
8484                << " width=" << Hints.getWidth()
8485                << " unroll=" << Hints.getInterleave() << "\n");
8486 
  // The function containing the loop.
8488   Function *F = L->getHeader()->getParent();
8489 
  // Looking at the diagnostic output is the only way to determine if a loop
  // was vectorized (other than looking at the IR or machine code), so it
  // is important to generate an optimization remark for each loop. Most of
  // these messages are generated as OptimizationRemarkAnalysis. Remarks
  // generated as OptimizationRemark and OptimizationRemarkMissed are less
  // verbose and report, respectively, vectorized loops and unvectorized loops
  // that may benefit from vectorization.
8497 
8498   if (!Hints.allowVectorization(F, L, AlwaysVectorize)) {
8499     DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
8500     return false;
8501   }
8502 
8503   PredicatedScalarEvolution PSE(*SE, *L);
8504 
8505   // Check if it is legal to vectorize the loop.
8506   LoopVectorizationRequirements Requirements(*ORE);
8507   LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, TTI, GetLAA, LI, ORE,
8508                                 &Requirements, &Hints);
8509   if (!LVL.canVectorize()) {
8510     DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
8511     emitMissedWarning(F, L, Hints, ORE);
8512     return false;
8513   }
8514 
8515   // Check the function attributes to find out if this function should be
8516   // optimized for size.
8517   bool OptForSize =
8518       Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->optForSize();
8519 
8520   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
8521   // count by optimizing for size, to minimize overheads.
8522   unsigned ExpectedTC = SE->getSmallConstantMaxTripCount(L);
8523   bool HasExpectedTC = (ExpectedTC > 0);
8524 
8525   if (!HasExpectedTC && LoopVectorizeWithBlockFrequency) {
8526     auto EstimatedTC = getLoopEstimatedTripCount(L);
8527     if (EstimatedTC) {
8528       ExpectedTC = *EstimatedTC;
8529       HasExpectedTC = true;
8530     }
8531   }
8532 
8533   if (HasExpectedTC && ExpectedTC < TinyTripCountVectorThreshold) {
8534     DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
8535                  << "This loop is worth vectorizing only if no scalar "
8536                  << "iteration overheads are incurred.");
8537     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
8538       DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
8539     else {
8540       DEBUG(dbgs() << "\n");
8541       // Loops with a very small trip count are considered for vectorization
8542       // under OptForSize, thereby making sure the cost of their loop body is
8543       // dominant, free of runtime guards and scalar iteration overheads.
8544       OptForSize = true;
8545     }
8546   }
8547 
8548   // Check the function attributes to see if implicit floats are allowed.
8549   // FIXME: This check doesn't seem possibly correct -- what if the loop is
8550   // an integer loop and the vector instructions selected are purely integer
8551   // vector instructions?
8552   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
    DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat"
                    " attribute is used.\n");
8555     ORE->emit(createMissedAnalysis(Hints.vectorizeAnalysisPassName(),
8556                                    "NoImplicitFloat", L)
8557               << "loop not vectorized due to NoImplicitFloat attribute");
8558     emitMissedWarning(F, L, Hints, ORE);
8559     return false;
8560   }
8561 
8562   // Check if the target supports potentially unsafe FP vectorization.
8563   // FIXME: Add a check for the type of safety issue (denormal, signaling)
8564   // for the target we're vectorizing for, to make sure none of the
8565   // additional fp-math flags can help.
8566   if (Hints.isPotentiallyUnsafe() &&
8567       TTI->isFPVectorizationPotentiallyUnsafe()) {
8568     DEBUG(dbgs() << "LV: Potentially unsafe FP op prevents vectorization.\n");
8569     ORE->emit(
8570         createMissedAnalysis(Hints.vectorizeAnalysisPassName(), "UnsafeFP", L)
8571         << "loop not vectorized due to unsafe FP support.");
8572     emitMissedWarning(F, L, Hints, ORE);
8573     return false;
8574   }
8575 
8576   // Use the cost model.
8577   LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, F,
8578                                 &Hints);
8579   CM.collectValuesToIgnore();
8580 
8581   // Use the planner for vectorization.
8582   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM);
8583 
8584   // Get user vectorization factor.
8585   unsigned UserVF = Hints.getWidth();
8586 
8587   // Plan how to best vectorize, return the best VF and its cost.
8588   LoopVectorizationCostModel::VectorizationFactor VF =
8589       LVP.plan(OptForSize, UserVF);
8590 
8591   // Select the interleave count.
8592   unsigned IC = CM.selectInterleaveCount(OptForSize, VF.Width, VF.Cost);
8593 
8594   // Get user interleave count.
8595   unsigned UserIC = Hints.getInterleave();
8596 
8597   // Identify the diagnostic messages that should be produced.
8598   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
8599   bool VectorizeLoop = true, InterleaveLoop = true;
8600   if (Requirements.doesNotMeet(F, L, Hints)) {
8601     DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
8602                     "requirements.\n");
8603     emitMissedWarning(F, L, Hints, ORE);
8604     return false;
8605   }
8606 
8607   if (VF.Width == 1) {
8608     DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
8609     VecDiagMsg = std::make_pair(
8610         "VectorizationNotBeneficial",
8611         "the cost-model indicates that vectorization is not beneficial");
8612     VectorizeLoop = false;
8613   }
8614 
8615   if (IC == 1 && UserIC <= 1) {
8616     // Tell the user interleaving is not beneficial.
8617     DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
8618     IntDiagMsg = std::make_pair(
8619         "InterleavingNotBeneficial",
8620         "the cost-model indicates that interleaving is not beneficial");
8621     InterleaveLoop = false;
8622     if (UserIC == 1) {
8623       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
8624       IntDiagMsg.second +=
8625           " and is explicitly disabled or interleave count is set to 1";
8626     }
8627   } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
8629     DEBUG(dbgs()
8630           << "LV: Interleaving is beneficial but is explicitly disabled.");
8631     IntDiagMsg = std::make_pair(
8632         "InterleavingBeneficialButDisabled",
8633         "the cost-model indicates that interleaving is beneficial "
8634         "but is explicitly disabled or interleave count is set to 1");
8635     InterleaveLoop = false;
8636   }
8637 
8638   // Override IC if user provided an interleave count.
8639   IC = UserIC > 0 ? UserIC : IC;
8640 
8641   // Emit diagnostic messages, if any.
8642   const char *VAPassName = Hints.vectorizeAnalysisPassName();
8643   if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
8645     ORE->emit([&]() {
8646       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
8647                                       L->getStartLoc(), L->getHeader())
8648              << VecDiagMsg.second;
8649     });
8650     ORE->emit([&]() {
8651       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
8652                                       L->getStartLoc(), L->getHeader())
8653              << IntDiagMsg.second;
8654     });
8655     return false;
8656   } else if (!VectorizeLoop && InterleaveLoop) {
8657     DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
8658     ORE->emit([&]() {
8659       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
8660                                         L->getStartLoc(), L->getHeader())
8661              << VecDiagMsg.second;
8662     });
8663   } else if (VectorizeLoop && !InterleaveLoop) {
8664     DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in "
8665                  << DebugLocStr << '\n');
8666     ORE->emit([&]() {
8667       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
8668                                         L->getStartLoc(), L->getHeader())
8669              << IntDiagMsg.second;
8670     });
8671   } else if (VectorizeLoop && InterleaveLoop) {
8672     DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in "
8673                  << DebugLocStr << '\n');
8674     DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
8675   }
8676 
8677   LVP.setBestPlan(VF.Width, IC);
8678 
8679   using namespace ore;
8680 
8681   if (!VectorizeLoop) {
8682     assert(IC > 1 && "interleave count should not be 1 or 0");
8683     // If we decided that it is not legal to vectorize the loop, then
8684     // interleave it.
8685     InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
8686                                &CM);
8687     LVP.executePlan(Unroller, DT);
8688 
8689     ORE->emit([&]() {
8690       return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
8691                                 L->getHeader())
8692              << "interleaved loop (interleaved count: "
8693              << NV("InterleaveCount", IC) << ")";
8694     });
8695   } else {
8696     // If we decided that it is *legal* to vectorize the loop, then do it.
8697     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
8698                            &LVL, &CM);
8699     LVP.executePlan(LB, DT);
8700     ++LoopsVectorized;
8701 
8702     // Add metadata to disable runtime unrolling a scalar loop when there are
8703     // no runtime checks about strides and memory. A scalar loop that is
8704     // rarely used is not worth unrolling.
8705     if (!LB.areSafetyChecksAdded())
8706       AddRuntimeUnrollDisableMetaData(L);
8707 
8708     // Report the vectorization decision.
8709     ORE->emit([&]() {
8710       return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
8711                                 L->getHeader())
8712              << "vectorized loop (vectorization width: "
8713              << NV("VectorizationFactor", VF.Width)
8714              << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
8715     });
8716   }
8717 
8718   // Mark the loop as already vectorized to avoid vectorizing again.
8719   Hints.setAlreadyVectorized();
8720 
8721   DEBUG(verifyFunction(*L->getHeader()->getParent()));
8722   return true;
8723 }
8724 
8725 bool LoopVectorizePass::runImpl(
8726     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
8727     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
8728     DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
8729     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
8730     OptimizationRemarkEmitter &ORE_) {
8731   SE = &SE_;
8732   LI = &LI_;
8733   TTI = &TTI_;
8734   DT = &DT_;
8735   BFI = &BFI_;
8736   TLI = TLI_;
8737   AA = &AA_;
8738   AC = &AC_;
8739   GetLAA = &GetLAA_;
8740   DB = &DB_;
8741   ORE = &ORE_;
8742 
8743   // Don't attempt if
8744   // 1. the target claims to have no vector registers, and
8745   // 2. interleaving won't help ILP.
8746   //
8747   // The second condition is necessary because, even if the target has no
8748   // vector registers, loop vectorization may still enable scalar
8749   // interleaving.
8750   if (!TTI->getNumberOfRegisters(true) && TTI->getMaxInterleaveFactor(1) < 2)
8751     return false;
8752 
8753   bool Changed = false;
8754 
8755   // The vectorizer requires loops to be in simplified form.
8756   // Since simplification may add new inner loops, it has to run before the
8757   // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
8759   // vectorized.
8760   for (auto &L : *LI)
8761     Changed |= simplifyLoop(L, DT, LI, SE, AC, false /* PreserveLCSSA */);
8762 
8763   // Build up a worklist of inner-loops to vectorize. This is necessary as
8764   // the act of vectorizing or partially unrolling a loop creates new loops
8765   // and can invalidate iterators across the loops.
8766   SmallVector<Loop *, 8> Worklist;
8767 
8768   for (Loop *L : *LI)
8769     addAcyclicInnerLoop(*L, Worklist);
8770 
8771   LoopsAnalyzed += Worklist.size();
8772 
8773   // Now walk the identified inner loops.
8774   while (!Worklist.empty()) {
8775     Loop *L = Worklist.pop_back_val();
8776 
8777     // For the inner loops we actually process, form LCSSA to simplify the
8778     // transform.
8779     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
8780 
8781     Changed |= processLoop(L);
8782   }
8783 
8784   // Process each loop nest in the function.
8785   return Changed;
8786 }
8787 
8788 PreservedAnalyses LoopVectorizePass::run(Function &F,
8789                                          FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  bool Changed =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE);
  if (!Changed)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;
  PA.preserve<LoopAnalysis>();
  PA.preserve<DominatorTreeAnalysis>();
  PA.preserve<BasicAA>();
  PA.preserve<GlobalsAA>();
  return PA;
}
8818