1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
11 // and generates target-independent LLVM-IR.
12 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
13 // of instructions in order to estimate the profitability of vectorization.
14 //
15 // The loop vectorizer combines consecutive loop iterations into a single
16 // 'wide' iteration. After this transformation the index is incremented
17 // by the SIMD vector width, and not by one.
18 //
// This pass has four parts:
20 // 1. The main loop pass that drives the different parts.
21 // 2. LoopVectorizationLegality - A unit that checks for the legality
22 //    of the vectorization.
23 // 3. InnerLoopVectorizer - A unit that performs the actual
24 //    widening of instructions.
25 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
26 //    of vectorization. It decides on the optimal vector width, which
27 //    can be one, if vectorization is not profitable.
28 //
29 //===----------------------------------------------------------------------===//
30 //
31 // The reduction-variable vectorization is based on the paper:
32 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
33 //
34 // Variable uniformity checks are inspired by:
35 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
36 //
37 // The interleaved access vectorization is based on the paper:
38 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
39 //  Data for SIMD
40 //
41 // Other ideas/concepts are from:
42 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
43 //
44 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
45 //  Vectorizing Compilers.
46 //
47 //===----------------------------------------------------------------------===//
48 
49 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
50 #include "llvm/ADT/DenseMap.h"
51 #include "llvm/ADT/Hashing.h"
52 #include "llvm/ADT/MapVector.h"
53 #include "llvm/ADT/Optional.h"
54 #include "llvm/ADT/SCCIterator.h"
55 #include "llvm/ADT/SetVector.h"
56 #include "llvm/ADT/SmallPtrSet.h"
57 #include "llvm/ADT/SmallSet.h"
58 #include "llvm/ADT/SmallVector.h"
59 #include "llvm/ADT/Statistic.h"
60 #include "llvm/ADT/StringExtras.h"
61 #include "llvm/Analysis/CodeMetrics.h"
62 #include "llvm/Analysis/GlobalsModRef.h"
63 #include "llvm/Analysis/LoopInfo.h"
64 #include "llvm/Analysis/LoopIterator.h"
65 #include "llvm/Analysis/LoopPass.h"
66 #include "llvm/Analysis/ScalarEvolutionExpander.h"
67 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
68 #include "llvm/Analysis/ValueTracking.h"
69 #include "llvm/Analysis/VectorUtils.h"
70 #include "llvm/IR/Constants.h"
71 #include "llvm/IR/DataLayout.h"
72 #include "llvm/IR/DebugInfo.h"
73 #include "llvm/IR/DerivedTypes.h"
74 #include "llvm/IR/DiagnosticInfo.h"
75 #include "llvm/IR/Dominators.h"
76 #include "llvm/IR/Function.h"
77 #include "llvm/IR/IRBuilder.h"
78 #include "llvm/IR/Instructions.h"
79 #include "llvm/IR/IntrinsicInst.h"
80 #include "llvm/IR/LLVMContext.h"
81 #include "llvm/IR/Module.h"
82 #include "llvm/IR/PatternMatch.h"
83 #include "llvm/IR/Type.h"
84 #include "llvm/IR/User.h"
85 #include "llvm/IR/Value.h"
86 #include "llvm/IR/ValueHandle.h"
87 #include "llvm/IR/Verifier.h"
88 #include "llvm/Pass.h"
89 #include "llvm/Support/BranchProbability.h"
90 #include "llvm/Support/CommandLine.h"
91 #include "llvm/Support/Debug.h"
92 #include "llvm/Support/raw_ostream.h"
93 #include "llvm/Transforms/Scalar.h"
94 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
95 #include "llvm/Transforms/Utils/Local.h"
96 #include "llvm/Transforms/Utils/LoopSimplify.h"
97 #include "llvm/Transforms/Utils/LoopUtils.h"
98 #include "llvm/Transforms/Utils/LoopVersioning.h"
99 #include "llvm/Transforms/Vectorize.h"
100 #include <algorithm>
101 #include <map>
102 #include <tuple>
103 
104 using namespace llvm;
105 using namespace llvm::PatternMatch;
106 
107 #define LV_NAME "loop-vectorize"
108 #define DEBUG_TYPE LV_NAME
109 
STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");

/// Allow the vectorizer to flatten (if-convert) control flow inside a loop so
/// the body becomes straight-line code that can be widened.
static cl::opt<bool>
    EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
                       cl::desc("Enable if-conversion during vectorization."));

/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

/// When enabled, consider vectorization factors sized by the smallest type in
/// the loop rather than only by the widest, maximizing register bandwidth.
static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

/// Allow vectorization of loops containing interleaved memory accesses
/// (e.g. accesses to A[2*i] and A[2*i+1] forming one group).
static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// Maximum factor for an interleaved memory access.
static cl::opt<unsigned> MaxInterleaveGroupFactor(
    "max-interleave-group-factor", cl::Hidden,
    cl::desc("Maximum factor for an interleaved access group (default = 8)"),
    cl::init(8));

/// We don't interleave loops with a known constant trip count below this
/// number.
static const unsigned TinyTripCountInterleaveThreshold = 128;

/// Testing knob: pretend the target has this many scalar registers
/// (0 means "use the target's real value").
static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

/// Testing knob: pretend the target has this many vector registers
/// (0 means "use the target's real value").
static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

/// Maximum vectorization interleave count.
static const unsigned MaxInterleaveFactor = 16;

/// Testing knob: cap on the interleave factor used for scalar (VF = 1) loops.
static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

/// Testing knob: cap on the interleave factor used for vectorized loops.
static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

/// Testing knob: make the cost model charge this constant for every
/// instruction, so test results don't depend on target cost tables.
static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

/// Loops whose estimated cost is below this threshold are treated as "small"
/// when deciding how aggressively to interleave.
static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

/// Use profile (block frequency) data to tune vectorization decisions per
/// region temperature.
static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(false), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

/// When interleaving, count the induction variable's registers only once
/// instead of once per interleaved copy.
static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

/// Allow stores that execute conditionally to be vectorized behind a
/// predicate.
static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

/// Cap on the interleave count applied when the only win is a scalar
/// reduction in a nested loop.
static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

/// Runtime memory-check budget when the user forced vectorization with a
/// pragma (more generous than the default).
static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
    "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
    cl::desc("The maximum allowed number of runtime memory checks with a "
             "vectorize(enable) pragma."));

/// Budget for runtime SCEV predicate checks emitted before the vector loop.
static cl::opt<unsigned> VectorizeSCEVCheckThreshold(
    "vectorize-scev-check-threshold", cl::init(16), cl::Hidden,
    cl::desc("The maximum number of SCEV checks allowed."));

/// SCEV-check budget when the user forced vectorization with a pragma.
static cl::opt<unsigned> PragmaVectorizeSCEVCheckThreshold(
    "pragma-vectorize-scev-check-threshold", cl::init(128), cl::Hidden,
    cl::desc("The maximum number of SCEV checks allowed with a "
             "vectorize(enable) pragma"));
219 
220 /// Create an analysis remark that explains why vectorization failed
221 ///
222 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
223 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
224 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
225 /// the location of the remark.  \return the remark object that can be
226 /// streamed to.
227 static OptimizationRemarkAnalysis
228 createMissedAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop,
229                      Instruction *I = nullptr) {
230   Value *CodeRegion = TheLoop->getHeader();
231   DebugLoc DL = TheLoop->getStartLoc();
232 
233   if (I) {
234     CodeRegion = I->getParent();
235     // If there is no debug location attached to the instruction, revert back to
236     // using the loop's.
237     if (I->getDebugLoc())
238       DL = I->getDebugLoc();
239   }
240 
241   OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
242   R << "loop not vectorized: ";
243   return R;
244 }
245 
246 namespace {
247 
248 // Forward declarations.
249 class LoopVectorizeHints;
250 class LoopVectorizationLegality;
251 class LoopVectorizationCostModel;
252 class LoopVectorizationRequirements;
253 
254 /// Returns true if the given loop body has a cycle, excluding the loop
255 /// itself.
256 static bool hasCyclesInLoopBody(const Loop &L) {
257   if (!L.empty())
258     return true;
259 
260   for (const auto &SCC :
261        make_range(scc_iterator<Loop, LoopBodyTraits>::begin(L),
262                   scc_iterator<Loop, LoopBodyTraits>::end(L))) {
263     if (SCC.size() > 1) {
264       DEBUG(dbgs() << "LVL: Detected a cycle in the loop body:\n");
265       DEBUG(L.dump());
266       return true;
267     }
268   }
269   return false;
270 }
271 
272 /// A helper function for converting Scalar types to vector types.
273 /// If the incoming type is void, we return void. If the VF is 1, we return
274 /// the scalar type.
275 static Type *ToVectorTy(Type *Scalar, unsigned VF) {
276   if (Scalar->isVoidTy() || VF == 1)
277     return Scalar;
278   return VectorType::get(Scalar, VF);
279 }
280 
281 // FIXME: The following helper functions have multiple implementations
282 // in the project. They can be effectively organized in a common Load/Store
283 // utilities unit.
284 
285 /// A helper function that returns the pointer operand of a load or store
286 /// instruction.
287 static Value *getPointerOperand(Value *I) {
288   if (auto *LI = dyn_cast<LoadInst>(I))
289     return LI->getPointerOperand();
290   if (auto *SI = dyn_cast<StoreInst>(I))
291     return SI->getPointerOperand();
292   return nullptr;
293 }
294 
295 /// A helper function that returns the type of loaded or stored value.
296 static Type *getMemInstValueType(Value *I) {
297   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
298          "Expected Load or Store instruction");
299   if (auto *LI = dyn_cast<LoadInst>(I))
300     return LI->getType();
301   return cast<StoreInst>(I)->getValueOperand()->getType();
302 }
303 
304 /// A helper function that returns the alignment of load or store instruction.
305 static unsigned getMemInstAlignment(Value *I) {
306   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
307          "Expected Load or Store instruction");
308   if (auto *LI = dyn_cast<LoadInst>(I))
309     return LI->getAlignment();
310   return cast<StoreInst>(I)->getAlignment();
311 }
312 
313 /// A helper function that returns the address space of the pointer operand of
314 /// load or store instruction.
315 static unsigned getMemInstAddressSpace(Value *I) {
316   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
317          "Expected Load or Store instruction");
318   if (auto *LI = dyn_cast<LoadInst>(I))
319     return LI->getPointerAddressSpace();
320   return cast<StoreInst>(I)->getPointerAddressSpace();
321 }
322 
323 /// A helper function that returns true if the given type is irregular. The
324 /// type is irregular if its allocated size doesn't equal the store size of an
325 /// element of the corresponding vector type at the given vectorization factor.
326 static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
327 
328   // Determine if an array of VF elements of type Ty is "bitcast compatible"
329   // with a <VF x Ty> vector.
330   if (VF > 1) {
331     auto *VectorTy = VectorType::get(Ty, VF);
332     return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
333   }
334 
335   // If the vectorization factor is one, we just check if an array of type Ty
336   // requires padding between elements.
337   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
338 }
339 
/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
///       we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }
347 
348 /// A helper function that adds a 'fast' flag to floating-point operations.
349 static Value *addFastMathFlag(Value *V) {
350   if (isa<FPMathOperator>(V)) {
351     FastMathFlags Flags;
352     Flags.setUnsafeAlgebra();
353     cast<Instruction>(V)->setFastMathFlags(Flags);
354   }
355   return V;
356 }
357 
358 /// A helper function that returns an integer or floating-point constant with
359 /// value C.
360 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
361   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
362                            : ConstantFP::get(Ty, C);
363 }
364 
365 /// InnerLoopVectorizer vectorizes loops which contain only one basic
366 /// block to a specified vectorization factor (VF).
367 /// This class performs the widening of scalars into vectors, or multiple
368 /// scalars. This class also implements the following features:
369 /// * It inserts an epilogue loop for handling loops that don't have iteration
370 ///   counts that are known to be a multiple of the vectorization factor.
371 /// * It handles the code generation for reduction variables.
372 /// * Scalarization (implementation using scalars) of un-vectorizable
373 ///   instructions.
374 /// InnerLoopVectorizer does not perform any vectorization-legality
375 /// checks, and relies on the caller to check for the different legality
376 /// aspects. The InnerLoopVectorizer relies on the
377 /// LoopVectorizationLegality class to provide information about the induction
378 /// and reduction variables that were found to a given vectorization factor.
379 class InnerLoopVectorizer {
380 public:
  /// Construct an InnerLoopVectorizer for \p OrigLoop with vectorization
  /// factor \p VecWidth and unroll (interleave) factor \p UnrollFactor.
  /// \p LVL provides the legality analysis and \p CM the cost model; this
  /// class performs no legality checks itself (see the class comment).
  /// The induction and trip-count members start null and are populated
  /// during skeleton creation.
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, unsigned VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()), Induction(nullptr),
        OldInduction(nullptr), VectorLoopValueMap(UnrollFactor, VecWidth),
        TripCount(nullptr), VectorTripCount(nullptr), Legal(LVL), Cost(CM),
        AddedSafetyChecks(false) {}
394 
395   /// Create a new empty loop. Unlink the old loop and connect the new one.
396   void createVectorizedLoopSkeleton();
397 
398   /// Vectorize a single instruction within the innermost loop.
399   void vectorizeInstruction(Instruction &I);
400 
401   /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
402   void fixVectorizedLoop();
403 
404   // Return true if any runtime check is added.
405   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
406 
407   virtual ~InnerLoopVectorizer() {}
408 
409 protected:
410   /// A small list of PHINodes.
411   typedef SmallVector<PHINode *, 4> PhiVector;
412 
413   /// A type for vectorized values in the new loop. Each value from the
414   /// original loop, when vectorized, is represented by UF vector values in the
415   /// new unrolled loop, where UF is the unroll factor.
416   typedef SmallVector<Value *, 2> VectorParts;
417 
418   /// A type for scalarized values in the new loop. Each value from the
419   /// original loop, when scalarized, is represented by UF x VF scalar values
420   /// in the new unrolled loop, where UF is the unroll factor and VF is the
421   /// vectorization factor.
422   typedef SmallVector<SmallVector<Value *, 4>, 2> ScalarParts;
423 
424   // When we if-convert we need to create edge masks. We have to cache values
425   // so that we don't end up with exponential recursion/IR.
426   typedef DenseMap<std::pair<BasicBlock *, BasicBlock *>, VectorParts>
427       EdgeMaskCacheTy;
428   typedef DenseMap<BasicBlock *, VectorParts> BlockMaskCacheTy;
429 
430   /// Set up the values of the IVs correctly when exiting the vector loop.
431   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
432                     Value *CountRoundDown, Value *EndValue,
433                     BasicBlock *MiddleBlock);
434 
435   /// Create a new induction variable inside L.
436   PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
437                                    Value *Step, Instruction *DL);
438 
439   /// Handle all cross-iteration phis in the header.
440   void fixCrossIterationPHIs();
441 
442   /// Fix a first-order recurrence. This is the second phase of vectorizing
443   /// this phi node.
444   void fixFirstOrderRecurrence(PHINode *Phi);
445 
446   /// Fix a reduction cross-iteration phi. This is the second phase of
447   /// vectorizing this phi node.
448   void fixReduction(PHINode *Phi);
449 
450   /// \brief The Loop exit block may have single value PHI nodes with some
451   /// incoming value. While vectorizing we only handled real values
452   /// that were defined inside the loop and we should have one value for
453   /// each predecessor of its parent basic block. See PR14725.
454   void fixLCSSAPHIs();
455 
456   /// Iteratively sink the scalarized operands of a predicated instruction into
457   /// the block that was created for it.
458   void sinkScalarOperands(Instruction *PredInst);
459 
460   /// Predicate conditional instructions that require predication on their
461   /// respective conditions.
462   void predicateInstructions();
463 
464   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
465   /// represented as.
466   void truncateToMinimalBitwidths();
467 
468   /// A helper function that computes the predicate of the block BB, assuming
469   /// that the header block of the loop is set to True. It returns the *entry*
470   /// mask for the block BB.
471   VectorParts createBlockInMask(BasicBlock *BB);
472   /// A helper function that computes the predicate of the edge between SRC
473   /// and DST.
474   VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst);
475 
476   /// Vectorize a single PHINode in a block. This method handles the induction
477   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
478   /// arbitrary length vectors.
479   void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
480 
481   /// Insert the new loop to the loop hierarchy and pass manager
482   /// and update the analysis passes.
483   void updateAnalysis();
484 
485   /// This instruction is un-vectorizable. Implement it as a sequence
486   /// of scalars. If \p IfPredicateInstr is true we need to 'hide' each
487   /// scalarized instruction behind an if block predicated on the control
488   /// dependence of the instruction.
489   void scalarizeInstruction(Instruction *Instr, bool IfPredicateInstr = false);
490 
491   /// Vectorize Load and Store instructions,
492   virtual void vectorizeMemoryInstruction(Instruction *Instr);
493 
494   /// Create a broadcast instruction. This method generates a broadcast
495   /// instruction (shuffle) for loop invariant values and for the induction
496   /// value. If this is the induction variable then we extend it to N, N+1, ...
497   /// this is needed because each iteration in the loop corresponds to a SIMD
498   /// element.
499   virtual Value *getBroadcastInstrs(Value *V);
500 
501   /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
502   /// to each vector element of Val. The sequence starts at StartIndex.
503   /// \p Opcode is relevant for FP induction variable.
504   virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
505                                Instruction::BinaryOps Opcode =
506                                Instruction::BinaryOpsEnd);
507 
508   /// Compute scalar induction steps. \p ScalarIV is the scalar induction
509   /// variable on which to base the steps, \p Step is the size of the step, and
510   /// \p EntryVal is the value from the original loop that maps to the steps.
511   /// Note that \p EntryVal doesn't have to be an induction variable (e.g., it
512   /// can be a truncate instruction).
513   void buildScalarSteps(Value *ScalarIV, Value *Step, Value *EntryVal,
514                         const InductionDescriptor &ID);
515 
516   /// Create a vector induction phi node based on an existing scalar one. \p
517   /// EntryVal is the value from the original loop that maps to the vector phi
518   /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
519   /// truncate instruction, instead of widening the original IV, we widen a
520   /// version of the IV truncated to \p EntryVal's type.
521   void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
522                                        Value *Step, Instruction *EntryVal);
523 
524   /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
525   /// is provided, the integer induction variable will first be truncated to
526   /// the corresponding type.
527   void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
528 
529   /// Returns true if an instruction \p I should be scalarized instead of
530   /// vectorized for the chosen vectorization factor.
531   bool shouldScalarizeInstruction(Instruction *I) const;
532 
533   /// Returns true if we should generate a scalar version of \p IV.
534   bool needsScalarInduction(Instruction *IV) const;
535 
536   /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
537   /// vector or scalar value on-demand if one is not yet available. When
538   /// vectorizing a loop, we visit the definition of an instruction before its
539   /// uses. When visiting the definition, we either vectorize or scalarize the
540   /// instruction, creating an entry for it in the corresponding map. (In some
541   /// cases, such as induction variables, we will create both vector and scalar
542   /// entries.) Then, as we encounter uses of the definition, we derive values
543   /// for each scalar or vector use unless such a value is already available.
544   /// For example, if we scalarize a definition and one of its uses is vector,
545   /// we build the required vector on-demand with an insertelement sequence
546   /// when visiting the use. Otherwise, if the use is scalar, we can use the
547   /// existing scalar definition.
548   ///
549   /// Return a value in the new loop corresponding to \p V from the original
550   /// loop at unroll index \p Part. If the value has already been vectorized,
551   /// the corresponding vector entry in VectorLoopValueMap is returned. If,
552   /// however, the value has a scalar entry in VectorLoopValueMap, we construct
553   /// a new vector value on-demand by inserting the scalar values into a vector
554   /// with an insertelement sequence. If the value has been neither vectorized
555   /// nor scalarized, it must be loop invariant, so we simply broadcast the
556   /// value into a vector.
557   Value *getOrCreateVectorValue(Value *V, unsigned Part);
558 
559   /// Return a value in the new loop corresponding to \p V from the original
560   /// loop at unroll index \p Part and vector index \p Lane. If the value has
561   /// been vectorized but not scalarized, the necessary extractelement
562   /// instruction will be generated.
563   Value *getOrCreateScalarValue(Value *V, unsigned Part, unsigned Lane);
564 
565   /// Try to vectorize the interleaved access group that \p Instr belongs to.
566   void vectorizeInterleaveGroup(Instruction *Instr);
567 
568   /// Generate a shuffle sequence that will reverse the vector Vec.
569   virtual Value *reverseVector(Value *Vec);
570 
571   /// Returns (and creates if needed) the original loop trip count.
572   Value *getOrCreateTripCount(Loop *NewLoop);
573 
574   /// Returns (and creates if needed) the trip count of the widened loop.
575   Value *getOrCreateVectorTripCount(Loop *NewLoop);
576 
577   /// Emit a bypass check to see if the trip count would overflow, or we
578   /// wouldn't have enough iterations to execute one vector loop.
579   void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
580   /// Emit a bypass check to see if the vector trip count is nonzero.
581   void emitVectorLoopEnteredCheck(Loop *L, BasicBlock *Bypass);
582   /// Emit a bypass check to see if all of the SCEV assumptions we've
583   /// had to make are correct.
584   void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
585   /// Emit bypass checks to check any memory assumptions we may have made.
586   void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
587 
588   /// Add additional metadata to \p To that was not present on \p Orig.
589   ///
590   /// Currently this is used to add the noalias annotations based on the
591   /// inserted memchecks.  Use this for instructions that are *cloned* into the
592   /// vector loop.
593   void addNewMetadata(Instruction *To, const Instruction *Orig);
594 
595   /// Add metadata from one instruction to another.
596   ///
597   /// This includes both the original MDs from \p From and additional ones (\see
598   /// addNewMetadata).  Use this for *newly created* instructions in the vector
599   /// loop.
600   void addMetadata(Instruction *To, Instruction *From);
601 
602   /// \brief Similar to the previous function but it adds the metadata to a
603   /// vector of instructions.
604   void addMetadata(ArrayRef<Value *> To, Instruction *From);
605 
606   /// \brief Set the debug location in the builder using the debug location in
607   /// the instruction.
608   void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
609 
610   /// This is a helper class for maintaining vectorization state. It's used for
611   /// mapping values from the original loop to their corresponding values in
612   /// the new loop. Two mappings are maintained: one for vectorized values and
613   /// one for scalarized values. Vectorized values are represented with UF
614   /// vector values in the new loop, and scalarized values are represented with
615   /// UF x VF scalar values in the new loop. UF and VF are the unroll and
616   /// vectorization factors, respectively.
617   ///
618   /// Entries can be added to either map with setVectorValue and setScalarValue,
619   /// which assert that an entry was not already added before. If an entry is to
620   /// replace an existing one, call resetVectorValue. This is currently needed
621   /// to modify the mapped values during "fix-up" operations that occur once the
622   /// first phase of widening is complete. These operations include type
623   /// truncation and the second phase of recurrence widening.
624   ///
625   /// Entries from either map can be retrieved using the getVectorValue and
626   /// getScalarValue functions, which assert that the desired value exists.
627 
  struct ValueMap {

    /// Construct an empty map with the given unroll and vectorization factors.
    ValueMap(unsigned UF, unsigned VF) : UF(UF), VF(VF) {}

    /// \return True if the map has any vector entry for \p Key.
    bool hasAnyVectorValue(Value *Key) const {
      return VectorMapStorage.count(Key);
    }

    /// \return True if the map has a vector entry for \p Key and \p Part.
    bool hasVectorValue(Value *Key, unsigned Part) const {
      assert(Part < UF && "Queried Vector Part is too large.");
      if (!hasAnyVectorValue(Key))
        return false;
      const VectorParts &Entry = VectorMapStorage.find(Key)->second;
      assert(Entry.size() == UF && "VectorParts has wrong dimensions.");
      // A null slot means this part has not been generated yet.
      return Entry[Part] != nullptr;
    }

    /// \return True if the map has any scalar entry for \p Key.
    bool hasAnyScalarValue(Value *Key) const {
      return ScalarMapStorage.count(Key);
    }

    /// \return True if the map has a scalar entry for \p Key, \p Part and
    /// \p Lane.
    bool hasScalarValue(Value *Key, unsigned Part, unsigned Lane) const {
      assert(Part < UF && "Queried Scalar Part is too large.");
      assert(Lane < VF && "Queried Scalar Lane is too large.");
      if (!hasAnyScalarValue(Key))
        return false;
      const ScalarParts &Entry = ScalarMapStorage.find(Key)->second;
      assert(Entry.size() == UF && "ScalarParts has wrong dimensions.");
      assert(Entry[Part].size() == VF && "ScalarParts has wrong dimensions.");
      // A null slot means this lane has not been generated yet.
      return Entry[Part][Lane] != nullptr;
    }

    /// Retrieve the existing vector value that corresponds to \p Key and
    /// \p Part.
    Value *getVectorValue(Value *Key, unsigned Part) {
      assert(hasVectorValue(Key, Part) && "Getting non-existent value.");
      return VectorMapStorage[Key][Part];
    }

    /// Retrieve the existing scalar value that corresponds to \p Key, \p Part
    /// and \p Lane.
    Value *getScalarValue(Value *Key, unsigned Part, unsigned Lane) {
      assert(hasScalarValue(Key, Part, Lane) && "Getting non-existent value.");
      return ScalarMapStorage[Key][Part][Lane];
    }

    /// Set a vector value associated with \p Key and \p Part. Assumes such a
    /// value is not already set. If it is, use resetVectorValue() instead.
    void setVectorValue(Value *Key, unsigned Part, Value *Vector) {
      assert(!hasVectorValue(Key, Part) && "Vector value already set for part");
      // Lazily create the per-key entry with one null slot per unroll part.
      if (!VectorMapStorage.count(Key)) {
        VectorParts Entry(UF);
        VectorMapStorage[Key] = Entry;
      }
      VectorMapStorage[Key][Part] = Vector;
    }

    /// Set a scalar value associated with \p Key for \p Part and \p Lane.
    /// Assumes such a value is not already set.
    void setScalarValue(Value *Key, unsigned Part, unsigned Lane,
                        Value *Scalar) {
      assert(!hasScalarValue(Key, Part, Lane) && "Scalar value already set");
      // Lazily create the UF x VF grid of null slots for this key.
      // TODO: Consider storing uniform values only per-part, as they occupy
      //       lane 0 only, keeping the other VF-1 redundant entries null.
      if (!ScalarMapStorage.count(Key)) {
        ScalarParts Entry(UF);
        for (unsigned Part = 0; Part < UF; ++Part)
          Entry[Part].resize(VF, nullptr);
        ScalarMapStorage[Key] = Entry;
      }
      ScalarMapStorage[Key][Part][Lane] = Scalar;
    }

    /// Reset the vector value associated with \p Key for the given \p Part.
    /// This function can be used to update values that have already been
    /// vectorized. This is the case for "fix-up" operations including type
    /// truncation and the second phase of recurrence vectorization.
    void resetVectorValue(Value *Key, unsigned Part, Value *Vector) {
      assert(hasVectorValue(Key, Part) && "Vector value not set for part");
      VectorMapStorage[Key][Part] = Vector;
    }

  private:
    /// The unroll factor. Each entry in the vector map contains UF vector
    /// values.
    unsigned UF;

    /// The vectorization factor. Each entry in the scalar map contains UF x VF
    /// scalar values.
    unsigned VF;

    /// The vector and scalar map storage. We use std::map and not DenseMap
    /// because insertions to DenseMap invalidate its iterators.
    std::map<Value *, VectorParts> VectorMapStorage;
    std::map<Value *, ScalarParts> ScalarMapStorage;
  };
730 
  /// The original loop.
  Loop *OrigLoop;
  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;
  /// Loop Info.
  LoopInfo *LI;
  /// Dominator Tree.
  DominatorTree *DT;
  /// Alias Analysis.
  AliasAnalysis *AA;
  /// Target Library Info.
  const TargetLibraryInfo *TLI;
  /// Target Transform Info.
  const TargetTransformInfo *TTI;
  /// Assumption Cache.
  AssumptionCache *AC;
  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// \brief LoopVersioning.  It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks.  The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  unsigned VF;

protected:
  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use to construct the widened IR.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;
  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;
  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;
  /// The ExitBlock of the scalar loop.
  BasicBlock *LoopExitBlock;
  /// The vector loop body.
  BasicBlock *LoopVectorBody;
  /// The scalar loop body.
  BasicBlock *LoopScalarBody;
  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction;
  /// The induction variable of the old basic block.
  PHINode *OldInduction;

  /// Maps values from the original loop to their corresponding values in the
  /// vectorized loop. A key value can map to either vector values, scalar
  /// values or both kinds of values, depending on whether the key was
  /// vectorized and scalarized.
  ValueMap VectorLoopValueMap;

  /// Store instructions that should be predicated, as a pair
  ///   <StoreInst, Predicate>
  SmallVector<std::pair<Instruction *, Value *>, 4> PredicatedInstructions;
  EdgeMaskCacheTy EdgeMaskCache;
  BlockMaskCacheTy BlockMaskCache;
  /// Trip count of the original loop.
  Value *TripCount;
  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;
};
822 
/// \brief A specialization of InnerLoopVectorizer that forwards a
/// vectorization factor of 1 to the base class, so that the transformed loop
/// is unrolled by UnrollFactor rather than widened into vector instructions.
class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM)
      // The hard-coded 1 is the vectorization factor (VF).
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
                            UnrollFactor, LVL, CM) {}

private:
  // Overrides of the base-class widening hooks for the VF == 1 case.
  void vectorizeMemoryInstruction(Instruction *Instr) override;
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                       Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};
843 
844 /// \brief Look for a meaningful debug location on the instruction or it's
845 /// operands.
846 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
847   if (!I)
848     return I;
849 
850   DebugLoc Empty;
851   if (I->getDebugLoc() != Empty)
852     return I;
853 
854   for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
855     if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
856       if (OpInst->getDebugLoc() != Empty)
857         return OpInst;
858   }
859 
860   return I;
861 }
862 
863 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
864   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
865     const DILocation *DIL = Inst->getDebugLoc();
866     if (DIL && Inst->getFunction()->isDebugInfoForProfiling())
867       B.SetCurrentDebugLocation(DIL->cloneWithDuplicationFactor(UF * VF));
868     else
869       B.SetCurrentDebugLocation(DIL);
870   } else
871     B.SetCurrentDebugLocation(DebugLoc());
872 }
873 
874 #ifndef NDEBUG
875 /// \return string containing a file name and a line # for the given loop.
876 static std::string getDebugLocString(const Loop *L) {
877   std::string Result;
878   if (L) {
879     raw_string_ostream OS(Result);
880     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
881       LoopDbgLoc.print(OS);
882     else
883       // Just print the module name.
884       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
885     OS.flush();
886   }
887   return Result;
888 }
889 #endif
890 
void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata. LVer is non-null only when memchecks were used, and only memory
  // instructions (loads and stores) receive the annotation.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}
898 
void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  // Carry over \p From's metadata to \p To, then attach any versioning
  // no-alias metadata (see addNewMetadata).
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}
904 
void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  // Apply the single-instruction overload to each instruction in the list;
  // values that are not instructions are skipped.
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}
912 
913 /// \brief The group of interleaved loads/stores sharing the same stride and
914 /// close to each other.
915 ///
916 /// Each member in this group has an index starting from 0, and the largest
917 /// index should be less than interleaved factor, which is equal to the absolute
918 /// value of the access's stride.
919 ///
920 /// E.g. An interleaved load group of factor 4:
921 ///        for (unsigned i = 0; i < 1024; i+=4) {
922 ///          a = A[i];                           // Member of index 0
923 ///          b = A[i+1];                         // Member of index 1
924 ///          d = A[i+3];                         // Member of index 3
925 ///          ...
926 ///        }
927 ///
928 ///      An interleaved store group of factor 4:
929 ///        for (unsigned i = 0; i < 1024; i+=4) {
930 ///          ...
931 ///          A[i]   = a;                         // Member of index 0
932 ///          A[i+1] = b;                         // Member of index 1
933 ///          A[i+2] = c;                         // Member of index 2
934 ///          A[i+3] = d;                         // Member of index 3
935 ///        }
936 ///
937 /// Note: the interleaved load group could have gaps (missing members), but
938 /// the interleaved store group doesn't allow gaps.
939 class InterleaveGroup {
940 public:
941   InterleaveGroup(Instruction *Instr, int Stride, unsigned Align)
942       : Align(Align), SmallestKey(0), LargestKey(0), InsertPos(Instr) {
943     assert(Align && "The alignment should be non-zero");
944 
945     Factor = std::abs(Stride);
946     assert(Factor > 1 && "Invalid interleave factor");
947 
948     Reverse = Stride < 0;
949     Members[0] = Instr;
950   }
951 
952   bool isReverse() const { return Reverse; }
953   unsigned getFactor() const { return Factor; }
954   unsigned getAlignment() const { return Align; }
955   unsigned getNumMembers() const { return Members.size(); }
956 
957   /// \brief Try to insert a new member \p Instr with index \p Index and
958   /// alignment \p NewAlign. The index is related to the leader and it could be
959   /// negative if it is the new leader.
960   ///
961   /// \returns false if the instruction doesn't belong to the group.
962   bool insertMember(Instruction *Instr, int Index, unsigned NewAlign) {
963     assert(NewAlign && "The new member's alignment should be non-zero");
964 
965     int Key = Index + SmallestKey;
966 
967     // Skip if there is already a member with the same index.
968     if (Members.count(Key))
969       return false;
970 
971     if (Key > LargestKey) {
972       // The largest index is always less than the interleave factor.
973       if (Index >= static_cast<int>(Factor))
974         return false;
975 
976       LargestKey = Key;
977     } else if (Key < SmallestKey) {
978       // The largest index is always less than the interleave factor.
979       if (LargestKey - Key >= static_cast<int>(Factor))
980         return false;
981 
982       SmallestKey = Key;
983     }
984 
985     // It's always safe to select the minimum alignment.
986     Align = std::min(Align, NewAlign);
987     Members[Key] = Instr;
988     return true;
989   }
990 
991   /// \brief Get the member with the given index \p Index
992   ///
993   /// \returns nullptr if contains no such member.
994   Instruction *getMember(unsigned Index) const {
995     int Key = SmallestKey + Index;
996     if (!Members.count(Key))
997       return nullptr;
998 
999     return Members.find(Key)->second;
1000   }
1001 
1002   /// \brief Get the index for the given member. Unlike the key in the member
1003   /// map, the index starts from 0.
1004   unsigned getIndex(Instruction *Instr) const {
1005     for (auto I : Members)
1006       if (I.second == Instr)
1007         return I.first - SmallestKey;
1008 
1009     llvm_unreachable("InterleaveGroup contains no such member");
1010   }
1011 
1012   Instruction *getInsertPos() const { return InsertPos; }
1013   void setInsertPos(Instruction *Inst) { InsertPos = Inst; }
1014 
1015 private:
1016   unsigned Factor; // Interleave Factor.
1017   bool Reverse;
1018   unsigned Align;
1019   DenseMap<int, Instruction *> Members;
1020   int SmallestKey;
1021   int LargestKey;
1022 
1023   // To avoid breaking dependences, vectorized instructions of an interleave
1024   // group should be inserted at either the first load or the last store in
1025   // program order.
1026   //
1027   // E.g. %even = load i32             // Insert Position
1028   //      %add = add i32 %even         // Use of %even
1029   //      %odd = load i32
1030   //
1031   //      store i32 %even
1032   //      %odd = add i32               // Def of %odd
1033   //      store i32 %odd               // Insert Position
1034   Instruction *InsertPos;
1035 };
1036 
1037 /// \brief Drive the analysis of interleaved memory accesses in the loop.
1038 ///
1039 /// Use this class to analyze interleaved accesses only when we can vectorize
1040 /// a loop. Otherwise it's meaningless to do analysis as the vectorization
1041 /// on interleaved accesses is unsafe.
1042 ///
1043 /// The analysis collects interleave groups and records the relationships
1044 /// between the member and the group in a map.
1045 class InterleavedAccessInfo {
1046 public:
1047   InterleavedAccessInfo(PredicatedScalarEvolution &PSE, Loop *L,
1048                         DominatorTree *DT, LoopInfo *LI)
1049       : PSE(PSE), TheLoop(L), DT(DT), LI(LI), LAI(nullptr),
1050         RequiresScalarEpilogue(false) {}
1051 
1052   ~InterleavedAccessInfo() {
1053     SmallSet<InterleaveGroup *, 4> DelSet;
1054     // Avoid releasing a pointer twice.
1055     for (auto &I : InterleaveGroupMap)
1056       DelSet.insert(I.second);
1057     for (auto *Ptr : DelSet)
1058       delete Ptr;
1059   }
1060 
1061   /// \brief Analyze the interleaved accesses and collect them in interleave
1062   /// groups. Substitute symbolic strides using \p Strides.
1063   void analyzeInterleaving(const ValueToValueMap &Strides);
1064 
1065   /// \brief Check if \p Instr belongs to any interleave group.
1066   bool isInterleaved(Instruction *Instr) const {
1067     return InterleaveGroupMap.count(Instr);
1068   }
1069 
1070   /// \brief Return the maximum interleave factor of all interleaved groups.
1071   unsigned getMaxInterleaveFactor() const {
1072     unsigned MaxFactor = 1;
1073     for (auto &Entry : InterleaveGroupMap)
1074       MaxFactor = std::max(MaxFactor, Entry.second->getFactor());
1075     return MaxFactor;
1076   }
1077 
1078   /// \brief Get the interleave group that \p Instr belongs to.
1079   ///
1080   /// \returns nullptr if doesn't have such group.
1081   InterleaveGroup *getInterleaveGroup(Instruction *Instr) const {
1082     if (InterleaveGroupMap.count(Instr))
1083       return InterleaveGroupMap.find(Instr)->second;
1084     return nullptr;
1085   }
1086 
1087   /// \brief Returns true if an interleaved group that may access memory
1088   /// out-of-bounds requires a scalar epilogue iteration for correctness.
1089   bool requiresScalarEpilogue() const { return RequiresScalarEpilogue; }
1090 
1091   /// \brief Initialize the LoopAccessInfo used for dependence checking.
1092   void setLAI(const LoopAccessInfo *Info) { LAI = Info; }
1093 
1094 private:
1095   /// A wrapper around ScalarEvolution, used to add runtime SCEV checks.
1096   /// Simplifies SCEV expressions in the context of existing SCEV assumptions.
1097   /// The interleaved access analysis can also add new predicates (for example
1098   /// by versioning strides of pointers).
1099   PredicatedScalarEvolution &PSE;
1100   Loop *TheLoop;
1101   DominatorTree *DT;
1102   LoopInfo *LI;
1103   const LoopAccessInfo *LAI;
1104 
1105   /// True if the loop may contain non-reversed interleaved groups with
1106   /// out-of-bounds accesses. We ensure we don't speculatively access memory
1107   /// out-of-bounds by executing at least one scalar epilogue iteration.
1108   bool RequiresScalarEpilogue;
1109 
1110   /// Holds the relationships between the members and the interleave group.
1111   DenseMap<Instruction *, InterleaveGroup *> InterleaveGroupMap;
1112 
1113   /// Holds dependences among the memory accesses in the loop. It maps a source
1114   /// access to a set of dependent sink accesses.
1115   DenseMap<Instruction *, SmallPtrSet<Instruction *, 2>> Dependences;
1116 
1117   /// \brief The descriptor for a strided memory access.
1118   struct StrideDescriptor {
1119     StrideDescriptor(int64_t Stride, const SCEV *Scev, uint64_t Size,
1120                      unsigned Align)
1121         : Stride(Stride), Scev(Scev), Size(Size), Align(Align) {}
1122 
1123     StrideDescriptor() = default;
1124 
1125     // The access's stride. It is negative for a reverse access.
1126     int64_t Stride = 0;
1127     const SCEV *Scev = nullptr; // The scalar expression of this access
1128     uint64_t Size = 0;          // The size of the memory object.
1129     unsigned Align = 0;         // The alignment of this access.
1130   };
1131 
1132   /// \brief A type for holding instructions and their stride descriptors.
1133   typedef std::pair<Instruction *, StrideDescriptor> StrideEntry;
1134 
1135   /// \brief Create a new interleave group with the given instruction \p Instr,
1136   /// stride \p Stride and alignment \p Align.
1137   ///
1138   /// \returns the newly created interleave group.
1139   InterleaveGroup *createInterleaveGroup(Instruction *Instr, int Stride,
1140                                          unsigned Align) {
1141     assert(!InterleaveGroupMap.count(Instr) &&
1142            "Already in an interleaved access group");
1143     InterleaveGroupMap[Instr] = new InterleaveGroup(Instr, Stride, Align);
1144     return InterleaveGroupMap[Instr];
1145   }
1146 
1147   /// \brief Release the group and remove all the relationships.
1148   void releaseGroup(InterleaveGroup *Group) {
1149     for (unsigned i = 0; i < Group->getFactor(); i++)
1150       if (Instruction *Member = Group->getMember(i))
1151         InterleaveGroupMap.erase(Member);
1152 
1153     delete Group;
1154   }
1155 
1156   /// \brief Collect all the accesses with a constant stride in program order.
1157   void collectConstStrideAccesses(
1158       MapVector<Instruction *, StrideDescriptor> &AccessStrideInfo,
1159       const ValueToValueMap &Strides);
1160 
1161   /// \brief Returns true if \p Stride is allowed in an interleaved group.
1162   static bool isStrided(int Stride) {
1163     unsigned Factor = std::abs(Stride);
1164     return Factor >= 2 && Factor <= MaxInterleaveGroupFactor;
1165   }
1166 
1167   /// \brief Returns true if \p BB is a predicated block.
1168   bool isPredicated(BasicBlock *BB) const {
1169     return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
1170   }
1171 
1172   /// \brief Returns true if LoopAccessInfo can be used for dependence queries.
1173   bool areDependencesValid() const {
1174     return LAI && LAI->getDepChecker().getDependences();
1175   }
1176 
1177   /// \brief Returns true if memory accesses \p A and \p B can be reordered, if
1178   /// necessary, when constructing interleaved groups.
1179   ///
1180   /// \p A must precede \p B in program order. We return false if reordering is
1181   /// not necessary or is prevented because \p A and \p B may be dependent.
1182   bool canReorderMemAccessesForInterleavedGroups(StrideEntry *A,
1183                                                  StrideEntry *B) const {
1184 
1185     // Code motion for interleaved accesses can potentially hoist strided loads
1186     // and sink strided stores. The code below checks the legality of the
1187     // following two conditions:
1188     //
1189     // 1. Potentially moving a strided load (B) before any store (A) that
1190     //    precedes B, or
1191     //
1192     // 2. Potentially moving a strided store (A) after any load or store (B)
1193     //    that A precedes.
1194     //
1195     // It's legal to reorder A and B if we know there isn't a dependence from A
1196     // to B. Note that this determination is conservative since some
1197     // dependences could potentially be reordered safely.
1198 
1199     // A is potentially the source of a dependence.
1200     auto *Src = A->first;
1201     auto SrcDes = A->second;
1202 
1203     // B is potentially the sink of a dependence.
1204     auto *Sink = B->first;
1205     auto SinkDes = B->second;
1206 
1207     // Code motion for interleaved accesses can't violate WAR dependences.
1208     // Thus, reordering is legal if the source isn't a write.
1209     if (!Src->mayWriteToMemory())
1210       return true;
1211 
1212     // At least one of the accesses must be strided.
1213     if (!isStrided(SrcDes.Stride) && !isStrided(SinkDes.Stride))
1214       return true;
1215 
1216     // If dependence information is not available from LoopAccessInfo,
1217     // conservatively assume the instructions can't be reordered.
1218     if (!areDependencesValid())
1219       return false;
1220 
1221     // If we know there is a dependence from source to sink, assume the
1222     // instructions can't be reordered. Otherwise, reordering is legal.
1223     return !Dependences.count(Src) || !Dependences.lookup(Src).count(Sink);
1224   }
1225 
1226   /// \brief Collect the dependences from LoopAccessInfo.
1227   ///
1228   /// We process the dependences once during the interleaved access analysis to
1229   /// enable constant-time dependence queries.
1230   void collectDependences() {
1231     if (!areDependencesValid())
1232       return;
1233     auto *Deps = LAI->getDepChecker().getDependences();
1234     for (auto Dep : *Deps)
1235       Dependences[Dep.getSource(*LAI)].insert(Dep.getDestination(*LAI));
1236   }
1237 };
1238 
1239 /// Utility class for getting and setting loop vectorizer hints in the form
1240 /// of loop metadata.
1241 /// This class keeps a number of loop annotations locally (as member variables)
1242 /// and can, upon request, write them back as metadata on the loop. It will
1243 /// initially scan the loop for existing metadata, and will update the local
1244 /// values based on information in the loop.
1245 /// We cannot write all values to metadata, as the mere presence of some info,
1246 /// for example 'force', means a decision has been made. So, we need to be
1247 /// careful NOT to add them if the user hasn't specifically asked so.
1248 class LoopVectorizeHints {
1249   enum HintKind { HK_WIDTH, HK_UNROLL, HK_FORCE };
1250 
  /// Hint - associates name and validation with the hint value.
  struct Hint {
    const char *Name;
    unsigned Value; // This may have to change for non-numeric values.
    HintKind Kind;

    Hint(const char *Name, unsigned Value, HintKind Kind)
        : Name(Name), Value(Value), Kind(Kind) {}

    /// \return true if \p Val is a legal value for this kind of hint.
    bool validate(unsigned Val) {
      switch (Kind) {
      case HK_WIDTH:
        // Widths must be powers of two, bounded by the maximum vector width.
        return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth;
      case HK_UNROLL:
        return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor;
      case HK_FORCE:
        // Force is stored as 0 (disabled) or 1 (enabled); see ForceKind.
        return (Val <= 1);
      }
      // All kinds are handled above; this quiets "missing return" warnings.
      return false;
    }
  };
1272 
  /// Vectorization width.
  Hint Width;
  /// Vectorization interleave factor.
  Hint Interleave;
  /// Vectorization forced.
  Hint Force;

  /// Return the loop metadata prefix.
  static StringRef Prefix() { return "llvm.loop."; }

  /// True if there is any unsafe math in the loop.
  bool PotentiallyUnsafe;

public:
  /// The tri-state value of the "vectorize.enable" hint.
  enum ForceKind {
    FK_Undefined = -1, ///< Not selected.
    FK_Disabled = 0,   ///< Forcing disabled.
    FK_Enabled = 1,    ///< Forcing enabled.
  };
1292 
  /// Initialize the hints from command-line defaults, then override them with
  /// any hints found in \p L's metadata. \p DisableInterleaving seeds the
  /// initial interleave count (it converts to 1 when interleaving is
  /// disabled by the pass manager).
  LoopVectorizeHints(const Loop *L, bool DisableInterleaving,
                     OptimizationRemarkEmitter &ORE)
      : Width("vectorize.width", VectorizerParams::VectorizationFactor,
              HK_WIDTH),
        Interleave("interleave.count", DisableInterleaving, HK_UNROLL),
        Force("vectorize.enable", FK_Undefined, HK_FORCE),
        PotentiallyUnsafe(false), TheLoop(L), ORE(ORE) {
    // Populate values with existing loop metadata.
    getHintsFromMetadata();

    // force-vector-interleave overrides DisableInterleaving.
    if (VectorizerParams::isInterleaveForced())
      Interleave.Value = VectorizerParams::VectorizationInterleave;

    DEBUG(if (DisableInterleaving && Interleave.Value == 1) dbgs()
          << "LV: Interleaving disabled by the pass manager\n");
  }
1310 
  /// Mark the loop L as already vectorized by setting the width to 1.
  /// The updated width/interleave hints are written back as loop metadata so
  /// a later run of the vectorizer will skip the loop.
  void setAlreadyVectorized() {
    Width.Value = Interleave.Value = 1;
    Hint Hints[] = {Width, Interleave};
    writeHintsToMetadata(Hints);
  }
1317 
  /// \return true if loop \p L should be considered for vectorization given
  /// the current hints; emits a remark explaining the decision otherwise.
  /// (\p F is currently unused in this check -- NOTE(review): confirm whether
  /// callers rely on passing it.)
  bool allowVectorization(Function *F, Loop *L, bool AlwaysVectorize) const {
    if (getForce() == LoopVectorizeHints::FK_Disabled) {
      DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n");
      emitRemarkWithHints();
      return false;
    }

    if (!AlwaysVectorize && getForce() != LoopVectorizeHints::FK_Enabled) {
      DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n");
      emitRemarkWithHints();
      return false;
    }

    if (getWidth() == 1 && getInterleave() == 1) {
      // FIXME: Add a separate metadata to indicate when the loop has already
      // been vectorized instead of setting width and count to 1.
      DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
      // FIXME: Add interleave.disable metadata. This will allow
      // vectorize.disable to be used without disabling the pass and errors
      // to differentiate between disabled vectorization and a width of 1.
      ORE.emit(OptimizationRemarkAnalysis(vectorizeAnalysisPassName(),
                                          "AllDisabled", L->getStartLoc(),
                                          L->getHeader())
               << "loop not vectorized: vectorization and interleaving are "
                  "explicitly disabled, or vectorize width and interleave "
                  "count are both set to 1");
      return false;
    }

    return true;
  }
1349 
  /// Dumps all the hint information as an optimization-missed remark.
  void emitRemarkWithHints() const {
    using namespace ore;
    // "Explicitly disabled" gets a dedicated remark; every other missed case
    // reports the forced width/interleave values when they are set.
    if (Force.Value == LoopVectorizeHints::FK_Disabled)
      ORE.emit(OptimizationRemarkMissed(LV_NAME, "MissedExplicitlyDisabled",
                                        TheLoop->getStartLoc(),
                                        TheLoop->getHeader())
               << "loop not vectorized: vectorization is explicitly disabled");
    else {
      OptimizationRemarkMissed R(LV_NAME, "MissedDetails",
                                 TheLoop->getStartLoc(), TheLoop->getHeader());
      R << "loop not vectorized";
      if (Force.Value == LoopVectorizeHints::FK_Enabled) {
        R << " (Force=" << NV("Force", true);
        if (Width.Value != 0)
          R << ", Vector Width=" << NV("VectorWidth", Width.Value);
        if (Interleave.Value != 0)
          R << ", Interleave Count=" << NV("InterleaveCount", Interleave.Value);
        R << ")";
      }
      ORE.emit(R);
    }
  }
1373 
  /// \return the current value of the vectorization-width hint.
  unsigned getWidth() const { return Width.Value; }
  /// \return the current value of the interleave-count hint.
  unsigned getInterleave() const { return Interleave.Value; }
  /// \return the current value of the force-vectorization hint.
  enum ForceKind getForce() const { return (ForceKind)Force.Value; }
1377 
1378   /// \brief If hints are provided that force vectorization, use the AlwaysPrint
1379   /// pass name to force the frontend to print the diagnostic.
1380   const char *vectorizeAnalysisPassName() const {
1381     if (getWidth() == 1)
1382       return LV_NAME;
1383     if (getForce() == LoopVectorizeHints::FK_Disabled)
1384       return LV_NAME;
1385     if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth() == 0)
1386       return LV_NAME;
1387     return OptimizationRemarkAnalysis::AlwaysPrint;
1388   }
1389 
  /// \return true if the vectorizer may reorder the operations of the loop.
  bool allowReordering() const {
    // When enabling loop hints are provided we allow the vectorizer to change
    // the order of operations that is given by the scalar loop. This is not
    // enabled by default because it can be unsafe or inefficient. For example,
    // reordering floating-point operations will change the way round-off
    // error accumulates in the loop.
    return getForce() == LoopVectorizeHints::FK_Enabled || getWidth() > 1;
  }
1398 
  /// \return true if potentially-unsafe operations were recorded and the user
  /// did not explicitly force vectorization.
  bool isPotentiallyUnsafe() const {
    // Avoid FP vectorization if the target is unsure about proper support.
    // This may be related to the SIMD unit in the target not handling
    // IEEE 754 FP ops properly, or bad single-to-double promotions.
    // Otherwise, a sequence of vectorized loops, even without reduction,
    // could lead to different end results on the destination vectors.
    return getForce() != LoopVectorizeHints::FK_Enabled && PotentiallyUnsafe;
  }

  /// Record that the loop contains potentially unsafe operations.
  void setPotentiallyUnsafe() { PotentiallyUnsafe = true; }
1409 
1410 private:
1411   /// Find hints specified in the loop metadata and update local values.
1412   void getHintsFromMetadata() {
1413     MDNode *LoopID = TheLoop->getLoopID();
1414     if (!LoopID)
1415       return;
1416 
1417     // First operand should refer to the loop id itself.
1418     assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
1419     assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
1420 
1421     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
1422       const MDString *S = nullptr;
1423       SmallVector<Metadata *, 4> Args;
1424 
1425       // The expected hint is either a MDString or a MDNode with the first
1426       // operand a MDString.
1427       if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) {
1428         if (!MD || MD->getNumOperands() == 0)
1429           continue;
1430         S = dyn_cast<MDString>(MD->getOperand(0));
1431         for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i)
1432           Args.push_back(MD->getOperand(i));
1433       } else {
1434         S = dyn_cast<MDString>(LoopID->getOperand(i));
1435         assert(Args.size() == 0 && "too many arguments for MDString");
1436       }
1437 
1438       if (!S)
1439         continue;
1440 
1441       // Check if the hint starts with the loop metadata prefix.
1442       StringRef Name = S->getString();
1443       if (Args.size() == 1)
1444         setHint(Name, Args[0]);
1445     }
1446   }
1447 
1448   /// Checks string hint with one operand and set value if valid.
1449   void setHint(StringRef Name, Metadata *Arg) {
1450     if (!Name.startswith(Prefix()))
1451       return;
1452     Name = Name.substr(Prefix().size(), StringRef::npos);
1453 
1454     const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg);
1455     if (!C)
1456       return;
1457     unsigned Val = C->getZExtValue();
1458 
1459     Hint *Hints[] = {&Width, &Interleave, &Force};
1460     for (auto H : Hints) {
1461       if (Name == H->Name) {
1462         if (H->validate(Val))
1463           H->Value = Val;
1464         else
1465           DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n");
1466         break;
1467       }
1468     }
1469   }
1470 
1471   /// Create a new hint from name / value pair.
1472   MDNode *createHintMetadata(StringRef Name, unsigned V) const {
1473     LLVMContext &Context = TheLoop->getHeader()->getContext();
1474     Metadata *MDs[] = {MDString::get(Context, Name),
1475                        ConstantAsMetadata::get(
1476                            ConstantInt::get(Type::getInt32Ty(Context), V))};
1477     return MDNode::get(Context, MDs);
1478   }
1479 
1480   /// Matches metadata with hint name.
1481   bool matchesHintMetadataName(MDNode *Node, ArrayRef<Hint> HintTypes) {
1482     MDString *Name = dyn_cast<MDString>(Node->getOperand(0));
1483     if (!Name)
1484       return false;
1485 
1486     for (auto H : HintTypes)
1487       if (Name->getString().endswith(H.Name))
1488         return true;
1489     return false;
1490   }
1491 
1492   /// Sets current hints into loop metadata, keeping other values intact.
1493   void writeHintsToMetadata(ArrayRef<Hint> HintTypes) {
1494     if (HintTypes.size() == 0)
1495       return;
1496 
1497     // Reserve the first element to LoopID (see below).
1498     SmallVector<Metadata *, 4> MDs(1);
1499     // If the loop already has metadata, then ignore the existing operands.
1500     MDNode *LoopID = TheLoop->getLoopID();
1501     if (LoopID) {
1502       for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
1503         MDNode *Node = cast<MDNode>(LoopID->getOperand(i));
1504         // If node in update list, ignore old value.
1505         if (!matchesHintMetadataName(Node, HintTypes))
1506           MDs.push_back(Node);
1507       }
1508     }
1509 
1510     // Now, add the missing hints.
1511     for (auto H : HintTypes)
1512       MDs.push_back(createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value));
1513 
1514     // Replace current metadata node with new one.
1515     LLVMContext &Context = TheLoop->getHeader()->getContext();
1516     MDNode *NewLoopID = MDNode::get(Context, MDs);
1517     // Set operand 0 to refer to the loop id itself.
1518     NewLoopID->replaceOperandWith(0, NewLoopID);
1519 
1520     TheLoop->setLoopID(NewLoopID);
1521   }
1522 
1523   /// The loop these hints belong to.
1524   const Loop *TheLoop;
1525 
1526   /// Interface to emit optimization remarks.
1527   OptimizationRemarkEmitter &ORE;
1528 };
1529 
1530 static void emitMissedWarning(Function *F, Loop *L,
1531                               const LoopVectorizeHints &LH,
1532                               OptimizationRemarkEmitter *ORE) {
1533   LH.emitRemarkWithHints();
1534 
1535   if (LH.getForce() == LoopVectorizeHints::FK_Enabled) {
1536     if (LH.getWidth() != 1)
1537       ORE->emit(DiagnosticInfoOptimizationFailure(
1538                     DEBUG_TYPE, "FailedRequestedVectorization",
1539                     L->getStartLoc(), L->getHeader())
1540                 << "loop not vectorized: "
1541                 << "failed explicitly specified loop vectorization");
1542     else if (LH.getInterleave() != 1)
1543       ORE->emit(DiagnosticInfoOptimizationFailure(
1544                     DEBUG_TYPE, "FailedRequestedInterleaving", L->getStartLoc(),
1545                     L->getHeader())
1546                 << "loop not interleaved: "
1547                 << "failed explicitly specified loop interleaving");
1548   }
1549 }
1550 
1551 /// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
1552 /// to what vectorization factor.
1553 /// This class does not look at the profitability of vectorization, only the
1554 /// legality. This class has two main kinds of checks:
1555 /// * Memory checks - The code in canVectorizeMemory checks if vectorization
1556 ///   will change the order of memory accesses in a way that will change the
1557 ///   correctness of the program.
1558 /// * Scalars checks - The code in canVectorizeInstrs and canVectorizeMemory
1559 /// checks for a number of different conditions, such as the availability of a
1560 /// single induction variable, that all types are supported and vectorize-able,
1561 /// etc. This code reflects the capabilities of InnerLoopVectorizer.
1562 /// This class is also used by InnerLoopVectorizer for identifying
1563 /// induction variable and the different reduction variables.
class LoopVectorizationLegality {
public:
  /// Construct the legality analysis for loop \p L from the given analyses.
  LoopVectorizationLegality(
      Loop *L, PredicatedScalarEvolution &PSE, DominatorTree *DT,
      TargetLibraryInfo *TLI, AliasAnalysis *AA, Function *F,
      const TargetTransformInfo *TTI,
      std::function<const LoopAccessInfo &(Loop &)> *GetLAA, LoopInfo *LI,
      OptimizationRemarkEmitter *ORE, LoopVectorizationRequirements *R,
      LoopVectorizeHints *H)
      : NumPredStores(0), TheLoop(L), PSE(PSE), TLI(TLI), TTI(TTI), DT(DT),
        GetLAA(GetLAA), LAI(nullptr), ORE(ORE), InterleaveInfo(PSE, L, DT, LI),
        PrimaryInduction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false),
        Requirements(R), Hints(H) {}

  /// ReductionList contains the reduction descriptors for all
  /// of the reductions that were found in the loop.
  typedef DenseMap<PHINode *, RecurrenceDescriptor> ReductionList;

  /// InductionList saves induction variables and maps them to the
  /// induction descriptor.
  typedef MapVector<PHINode *, InductionDescriptor> InductionList;

  /// RecurrenceSet contains the phi nodes that are recurrences other than
  /// inductions and reductions.
  typedef SmallPtrSet<const PHINode *, 8> RecurrenceSet;

  /// Returns true if it is legal to vectorize this loop.
  /// This does not mean that it is profitable to vectorize this
  /// loop, only that it is legal to do so.
  bool canVectorize();

  /// Returns the primary induction variable.
  PHINode *getPrimaryInduction() { return PrimaryInduction; }

  /// Returns the reduction variables found in the loop.
  ReductionList *getReductionVars() { return &Reductions; }

  /// Returns the induction variables found in the loop.
  InductionList *getInductionVars() { return &Inductions; }

  /// Return the first-order recurrences found in the loop.
  RecurrenceSet *getFirstOrderRecurrences() { return &FirstOrderRecurrences; }

  /// Returns the widest induction type.
  Type *getWidestInductionType() { return WidestIndTy; }

  /// Returns True if V is an induction variable in this loop.
  bool isInductionVariable(const Value *V);

  /// Returns True if PN is a reduction variable in this loop.
  bool isReductionVariable(PHINode *PN) { return Reductions.count(PN); }

  /// Returns True if Phi is a first-order recurrence in this loop.
  bool isFirstOrderRecurrence(const PHINode *Phi);

  /// Return true if the block BB needs to be predicated in order for the loop
  /// to be vectorized.
  bool blockNeedsPredication(BasicBlock *BB);

  /// Check if this pointer is consecutive when vectorizing. This happens
  /// when the last index of the GEP is the induction variable, or that the
  /// pointer itself is an induction variable.
  /// This check allows us to vectorize A[idx] into a wide load/store.
  /// Returns:
  /// 0 - Stride is unknown or non-consecutive.
  /// 1 - Address is consecutive.
  /// -1 - Address is consecutive, and decreasing.
  int isConsecutivePtr(Value *Ptr);

  /// Returns true if the value V is uniform within the loop.
  bool isUniform(Value *V);

  /// Returns the information that we collected about runtime memory check.
  const RuntimePointerChecking *getRuntimePointerChecking() const {
    return LAI->getRuntimePointerChecking();
  }

  /// Returns the loop-access info. Note that this is null until
  /// canVectorizeMemory sets it up (see the LAI member below).
  const LoopAccessInfo *getLAI() const { return LAI; }

  /// \brief Check if \p Instr belongs to any interleaved access group.
  bool isAccessInterleaved(Instruction *Instr) {
    return InterleaveInfo.isInterleaved(Instr);
  }

  /// \brief Return the maximum interleave factor of all interleaved groups.
  unsigned getMaxInterleaveFactor() const {
    return InterleaveInfo.getMaxInterleaveFactor();
  }

  /// \brief Get the interleaved access group that \p Instr belongs to.
  const InterleaveGroup *getInterleavedAccessGroup(Instruction *Instr) {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }

  /// \brief Returns true if an interleaved group requires a scalar iteration
  /// to handle accesses with gaps.
  bool requiresScalarEpilogue() const {
    return InterleaveInfo.requiresScalarEpilogue();
  }

  /// Forwards to LoopAccessInfo::getMaxSafeDepDistBytes().
  unsigned getMaxSafeDepDistBytes() { return LAI->getMaxSafeDepDistBytes(); }

  /// Forwards to LoopAccessInfo::hasStride().
  bool hasStride(Value *V) { return LAI->hasStride(V); }

  /// Returns true if the target machine supports masked store operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr) {
    return isConsecutivePtr(Ptr) && TTI->isLegalMaskedStore(DataType);
  }
  /// Returns true if the target machine supports masked load operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {
    return isConsecutivePtr(Ptr) && TTI->isLegalMaskedLoad(DataType);
  }
  /// Returns true if the target machine supports masked scatter operation
  /// for the given \p DataType.
  bool isLegalMaskedScatter(Type *DataType) {
    return TTI->isLegalMaskedScatter(DataType);
  }
  /// Returns true if the target machine supports masked gather operation
  /// for the given \p DataType.
  bool isLegalMaskedGather(Type *DataType) {
    return TTI->isLegalMaskedGather(DataType);
  }
  /// Returns true if the target machine can represent \p V as a masked gather
  /// or scatter operation.
  bool isLegalGatherOrScatter(Value *V) {
    auto *LI = dyn_cast<LoadInst>(V);
    auto *SI = dyn_cast<StoreInst>(V);
    if (!LI && !SI)
      return false;
    auto *Ptr = getPointerOperand(V);
    auto *Ty = cast<PointerType>(Ptr->getType())->getElementType();
    return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty));
  }

  /// Returns true if vector representation of the instruction \p I
  /// requires mask.
  bool isMaskRequired(const Instruction *I) { return (MaskedOp.count(I) != 0); }
  unsigned getNumStores() const { return LAI->getNumStores(); }
  unsigned getNumLoads() const { return LAI->getNumLoads(); }
  unsigned getNumPredStores() const { return NumPredStores; }

  /// Returns true if \p I is an instruction that will be scalarized with
  /// predication. Such instructions include conditional stores and
  /// instructions that may divide by zero.
  bool isScalarWithPredication(Instruction *I);

  /// Returns true if \p I is a memory instruction with consecutive memory
  /// access that can be widened.
  bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);

  // Returns true if the NoNaN attribute is set on the function.
  bool hasFunNoNaNAttr() const { return HasFunNoNaNAttr; }

private:
  /// Check if a single basic block loop is vectorizable.
  /// At this point we know that this is a loop with a constant trip count
  /// and we only need to check individual instructions.
  bool canVectorizeInstrs();

  /// When we vectorize loops we may change the order in which
  /// we read and write from memory. This method checks if it is
  /// legal to vectorize the code, considering only memory constrains.
  /// Returns true if the loop is vectorizable
  bool canVectorizeMemory();

  /// Return true if we can vectorize this loop using the IF-conversion
  /// transformation.
  bool canVectorizeWithIfConvert();

  /// Return true if all of the instructions in the block can be speculatively
  /// executed. \p SafePtrs is a list of addresses that are known to be legal
  /// and we know that we can read from them without segfault.
  bool blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs);

  /// Updates the vectorization state by adding \p Phi to the inductions list.
  /// This can set \p Phi as the main induction of the loop if \p Phi is a
  /// better choice for the main induction than the existing one.
  void addInductionPhi(PHINode *Phi, const InductionDescriptor &ID,
                       SmallPtrSetImpl<Value *> &AllowedExit);

  /// Create an analysis remark that explains why vectorization failed
  ///
  /// \p RemarkName is the identifier for the remark.  If \p I is passed it is
  /// an instruction that prevents vectorization.  Otherwise the loop is used
  /// for the location of the remark.  \return the remark object that can be
  /// streamed to.
  OptimizationRemarkAnalysis
  createMissedAnalysis(StringRef RemarkName, Instruction *I = nullptr) const {
    return ::createMissedAnalysis(Hints->vectorizeAnalysisPassName(),
                                  RemarkName, TheLoop, I);
  }

  /// \brief If an access has a symbolic strides, this maps the pointer value to
  /// the stride symbol.
  const ValueToValueMap *getSymbolicStrides() {
    // FIXME: Currently, the set of symbolic strides is sometimes queried before
    // it's collected.  This happens from canVectorizeWithIfConvert, when the
    // pointer is checked to reference consecutive elements suitable for a
    // masked access.
    return LAI ? &LAI->getSymbolicStrides() : nullptr;
  }

  /// The number of stores that need predication; exposed via
  /// getNumPredStores().
  unsigned NumPredStores;

  /// The loop that we evaluate.
  Loop *TheLoop;
  /// A wrapper around ScalarEvolution used to add runtime SCEV checks.
  /// Applies dynamic knowledge to simplify SCEV expressions in the context
  /// of existing SCEV assumptions. The analysis will also add a minimal set
  /// of new predicates if this is required to enable vectorization and
  /// unrolling.
  PredicatedScalarEvolution &PSE;
  /// Target Library Info.
  TargetLibraryInfo *TLI;
  /// Target Transform Info
  const TargetTransformInfo *TTI;
  /// Dominator Tree.
  DominatorTree *DT;
  // LoopAccess analysis.
  std::function<const LoopAccessInfo &(Loop &)> *GetLAA;
  // And the loop-accesses info corresponding to this loop.  This pointer is
  // null until canVectorizeMemory sets it up.
  const LoopAccessInfo *LAI;
  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// The interleave access information contains groups of interleaved accesses
  /// with the same stride and close to each other.
  InterleavedAccessInfo InterleaveInfo;

  //  ---  vectorization state --- //

  /// Holds the primary induction variable. This is the counter of the
  /// loop.
  PHINode *PrimaryInduction;
  /// Holds the reduction variables.
  ReductionList Reductions;
  /// Holds all of the induction variables that we found in the loop.
  /// Notice that inductions don't need to start at zero and that induction
  /// variables can be pointers.
  InductionList Inductions;
  /// Holds the phi nodes that are first-order recurrences.
  RecurrenceSet FirstOrderRecurrences;
  /// Holds the widest induction type encountered.
  Type *WidestIndTy;

  /// Allowed outside users. This holds the induction and reduction
  /// vars which can be accessed from outside the loop.
  SmallPtrSet<Value *, 4> AllowedExit;

  /// Can we assume the absence of NaNs.
  bool HasFunNoNaNAttr;

  /// Vectorization requirements that will go through late-evaluation.
  LoopVectorizationRequirements *Requirements;

  /// Used to emit an analysis of any legality issues.
  LoopVectorizeHints *Hints;

  /// While vectorizing these instructions we have to generate a
  /// call to the appropriate masked intrinsic
  SmallPtrSet<const Instruction *, 8> MaskedOp;
};
1829 
1830 /// LoopVectorizationCostModel - estimates the expected speedups due to
1831 /// vectorization.
1832 /// In many cases vectorization is not profitable. This can happen because of
1833 /// a number of reasons. In this class we mainly attempt to predict the
1834 /// expected speedup/slowdowns due to the supported instruction set. We use the
1835 /// TargetTransformInfo to query the different backends for the cost of
1836 /// different operations.
1837 class LoopVectorizationCostModel {
1838 public:
  /// Construct the cost model for loop \p L from the given analyses and the
  /// user-provided loop hints.
  LoopVectorizationCostModel(Loop *L, PredicatedScalarEvolution &PSE,
                             LoopInfo *LI, LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints)
      : TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB),
        AC(AC), ORE(ORE), TheFunction(F), Hints(Hints) {}
1848 
1849   /// \return An upper bound for the vectorization factor, or None if
1850   /// vectorization should be avoided up front.
1851   Optional<unsigned> computeMaxVF(bool OptForSize);
1852 
1853   /// Information about vectorization costs
1854   struct VectorizationFactor {
1855     unsigned Width; // Vector width with best cost
1856     unsigned Cost;  // Cost of the loop with that width
1857   };
1858   /// \return The most profitable vectorization factor and the cost of that VF.
1859   /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
1860   /// then this vectorization factor will be selected if vectorization is
1861   /// possible.
1862   VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
1863 
  /// Setup cost-based decisions for user vectorization factor.
  /// Populates the per-VF uniform/scalar value sets and the scalarization
  /// cost map for the user-requested factor \p UserVF.
  void selectUserVectorizationFactor(unsigned UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
  }
1869 
1870   /// \return The size (in bits) of the smallest and widest types in the code
1871   /// that needs to be vectorized. We ignore values that remain scalar such as
1872   /// 64 bit loop indices.
1873   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1874 
1875   /// \return The desired interleave count.
1876   /// If interleave count has been specified by metadata it will be returned.
1877   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1878   /// are the selected vectorization factor and the cost of the selected VF.
1879   unsigned selectInterleaveCount(bool OptForSize, unsigned VF,
1880                                  unsigned LoopCost);
1881 
1882   /// Memory access instruction may be vectorized in more than one way.
1883   /// Form of instruction after vectorization depends on cost.
1884   /// This function takes cost-based decisions for Load/Store instructions
1885   /// and collects them in a map. This decisions map is used for building
1886   /// the lists of loop-uniform and loop-scalar instructions.
1887   /// The calculated cost is saved with widening decision in order to
1888   /// avoid redundant calculations.
1889   void setCostBasedWideningDecision(unsigned VF);
1890 
1891   /// \brief A struct that represents some properties of the register usage
1892   /// of a loop.
1893   struct RegisterUsage {
1894     /// Holds the number of loop invariant values that are used in the loop.
1895     unsigned LoopInvariantRegs;
1896     /// Holds the maximum number of concurrent live intervals in the loop.
1897     unsigned MaxLocalUsers;
1898     /// Holds the number of instructions in the loop.
1899     unsigned NumInstructions;
1900   };
1901 
1902   /// \return Returns information about the register usages of the loop for the
1903   /// given vectorization factors.
1904   SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);
1905 
1906   /// Collect values we want to ignore in the cost model.
1907   void collectValuesToIgnore();
1908 
1909   /// \returns The smallest bitwidth each instruction can be represented with.
1910   /// The vector equivalents of these instructions should be truncated to this
1911   /// type.
1912   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1913     return MinBWs;
1914   }
1915 
1916   /// \returns True if it is more profitable to scalarize instruction \p I for
1917   /// vectorization factor \p VF.
1918   bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
1919     auto Scalars = InstsToScalarize.find(VF);
1920     assert(Scalars != InstsToScalarize.end() &&
1921            "VF not yet analyzed for scalarization profitability");
1922     return Scalars->second.count(I);
1923   }
1924 
1925   /// Returns true if \p I is known to be uniform after vectorization.
1926   bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
1927     if (VF == 1)
1928       return true;
1929     assert(Uniforms.count(VF) && "VF not yet analyzed for uniformity");
1930     auto UniformsPerVF = Uniforms.find(VF);
1931     return UniformsPerVF->second.count(I);
1932   }
1933 
1934   /// Returns true if \p I is known to be scalar after vectorization.
1935   bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
1936     if (VF == 1)
1937       return true;
1938     assert(Scalars.count(VF) && "Scalar values are not calculated for VF");
1939     auto ScalarsPerVF = Scalars.find(VF);
1940     return ScalarsPerVF->second.count(I);
1941   }
1942 
  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
    // Truncation only applies to genuinely vectorized instructions (VF > 1)
    // that have a computed minimal bitwidth and will not be scalarized.
    return VF > 1 && MinBWs.count(I) && !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }
1949 
1950   /// Decision that was taken during cost calculation for memory instruction.
1951   enum InstWidening {
1952     CM_Unknown,
1953     CM_Widen,
1954     CM_Interleave,
1955     CM_GatherScatter,
1956     CM_Scalarize
1957   };
1958 
  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
                           unsigned Cost) {
    // Decisions are only recorded for real vector widths.
    assert(VF >= 2 && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }
1966 
  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup *Grp, unsigned VF,
                           InstWidening W, unsigned Cost) {
    assert(VF >= 2 && "Expected VF >=2");
    // Broadcast this decision to all instructions inside the group.
    // But the cost will be assigned to one instruction only.
    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
      if (auto *I = Grp->getMember(i)) {
        if (Grp->getInsertPos() == I)
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
        else
          // Members other than the insert position carry the decision with a
          // zero cost so the group's cost is not counted more than once.
          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
      }
    }
  }
1983 
1984   /// Return the cost model decision for the given instruction \p I and vector
1985   /// width \p VF. Return CM_Unknown if this instruction did not pass
1986   /// through the cost modeling.
1987   InstWidening getWideningDecision(Instruction *I, unsigned VF) {
1988     assert(VF >= 2 && "Expected VF >=2");
1989     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1990     auto Itr = WideningDecisions.find(InstOnVF);
1991     if (Itr == WideningDecisions.end())
1992       return CM_Unknown;
1993     return Itr->second.first;
1994   }
1995 
1996   /// Return the vectorization cost for the given instruction \p I and vector
1997   /// width \p VF.
1998   unsigned getWideningCost(Instruction *I, unsigned VF) {
1999     assert(VF >= 2 && "Expected VF >=2");
2000     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
2001     assert(WideningDecisions.count(InstOnVF) && "The cost is not calculated");
2002     return WideningDecisions[InstOnVF].second;
2003   }
2004 
2005   /// Return True if instruction \p I is an optimizable truncate whose operand
2006   /// is an induction variable. Such a truncate will be removed by adding a new
2007   /// induction variable with the destination type.
2008   bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
2009 
2010     // If the instruction is not a truncate, return false.
2011     auto *Trunc = dyn_cast<TruncInst>(I);
2012     if (!Trunc)
2013       return false;
2014 
2015     // Get the source and destination types of the truncate.
2016     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
2017     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
2018 
2019     // If the truncate is free for the given types, return false. Replacing a
2020     // free truncate with an induction variable would add an induction variable
2021     // update instruction to each iteration of the loop. We exclude from this
2022     // check the primary induction variable since it will need an update
2023     // instruction regardless.
2024     Value *Op = Trunc->getOperand(0);
2025     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
2026       return false;
2027 
2028     // If the truncated value is not an induction variable, return false.
2029     return Legal->isInductionVariable(Op);
2030   }
2031 
2032 private:
2033   /// \return An upper bound for the vectorization factor, larger than zero.
2034   /// One is returned if vectorization should best be avoided due to cost.
2035   unsigned computeFeasibleMaxVF(bool OptForSize);
2036 
2037   /// The vectorization cost is a combination of the cost itself and a boolean
2038   /// indicating whether any of the contributing operations will actually
2039   /// operate on
2040   /// vector values after type legalization in the backend. If this latter value
2041   /// is
2042   /// false, then all operations will be scalarized (i.e. no vectorization has
2043   /// actually taken place).
2044   typedef std::pair<unsigned, bool> VectorizationCostTy;
2045 
2046   /// Returns the expected execution cost. The unit of the cost does
2047   /// not matter because we use the 'cost' units to compare different
2048   /// vector widths. The cost that is returned is *not* normalized by
2049   /// the factor width.
2050   VectorizationCostTy expectedCost(unsigned VF);
2051 
2052   /// Returns the execution time cost of an instruction for a given vector
2053   /// width. Vector width of one means scalar.
2054   VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
2055 
2056   /// The cost-computation logic from getInstructionCost which provides
2057   /// the vector type as an output parameter.
2058   unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
2059 
2060   /// Calculate vectorization cost of memory instruction \p I.
2061   unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);
2062 
2063   /// The cost computation for scalarized memory instruction.
2064   unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
2065 
2066   /// The cost computation for interleaving group of memory instructions.
2067   unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
2068 
2069   /// The cost computation for Gather/Scatter instruction.
2070   unsigned getGatherScatterCost(Instruction *I, unsigned VF);
2071 
2072   /// The cost computation for widening instruction \p I with consecutive
2073   /// memory access.
2074   unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
2075 
2076   /// The cost calculation for Load instruction \p I with uniform pointer -
2077   /// scalar load + broadcast.
2078   unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
2079 
2080   /// Returns whether the instruction is a load or store and will be a emitted
2081   /// as a vector operation.
2082   bool isConsecutiveLoadOrStore(Instruction *I);
2083 
2084   /// Create an analysis remark that explains why vectorization failed
2085   ///
2086   /// \p RemarkName is the identifier for the remark.  \return the remark object
2087   /// that can be streamed to.
2088   OptimizationRemarkAnalysis createMissedAnalysis(StringRef RemarkName) {
2089     return ::createMissedAnalysis(Hints->vectorizeAnalysisPassName(),
2090                                   RemarkName, TheLoop);
2091   }
2092 
2093   /// Map of scalar integer values to the smallest bitwidth they can be legally
2094   /// represented as. The vector equivalents of these values should be truncated
2095   /// to this type.
2096   MapVector<Instruction *, uint64_t> MinBWs;
2097 
2098   /// A type representing the costs for instructions if they were to be
2099   /// scalarized rather than vectorized. The entries are Instruction-Cost
2100   /// pairs.
2101   typedef DenseMap<Instruction *, unsigned> ScalarCostsTy;
2102 
2103   /// A set containing all BasicBlocks that are known to present after
2104   /// vectorization as a predicated block.
2105   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
2106 
2107   /// A map holding scalar costs for different vectorization factors. The
2108   /// presence of a cost for an instruction in the mapping indicates that the
2109   /// instruction will be scalarized when vectorizing with the associated
2110   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
2111   DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
2112 
2113   /// Holds the instructions known to be uniform after vectorization.
2114   /// The data is collected per VF.
2115   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;
2116 
2117   /// Holds the instructions known to be scalar after vectorization.
2118   /// The data is collected per VF.
2119   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;
2120 
2121   /// Holds the instructions (address computations) that are forced to be
2122   /// scalarized.
2123   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars;
2124 
2125   /// Returns the expected difference in cost from scalarizing the expression
2126   /// feeding a predicated instruction \p PredInst. The instructions to
2127   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
2128   /// non-negative return value implies the expression will be scalarized.
2129   /// Currently, only single-use chains are considered for scalarization.
2130   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
2131                               unsigned VF);
2132 
2133   /// Collects the instructions to scalarize for each predicated instruction in
2134   /// the loop.
2135   void collectInstsToScalarize(unsigned VF);
2136 
2137   /// Collect the instructions that are uniform after vectorization. An
2138   /// instruction is uniform if we represent it with a single scalar value in
2139   /// the vectorized loop corresponding to each vector iteration. Examples of
2140   /// uniform instructions include pointer operands of consecutive or
2141   /// interleaved memory accesses. Note that although uniformity implies an
2142   /// instruction will be scalar, the reverse is not true. In general, a
2143   /// scalarized instruction will be represented by VF scalar values in the
2144   /// vectorized loop, each corresponding to an iteration of the original
2145   /// scalar loop.
2146   void collectLoopUniforms(unsigned VF);
2147 
2148   /// Collect the instructions that are scalar after vectorization. An
2149   /// instruction is scalar if it is known to be uniform or will be scalarized
2150   /// during vectorization. Non-uniform scalarized instructions will be
2151   /// represented by VF values in the vectorized loop, each corresponding to an
2152   /// iteration of the original scalar loop.
2153   void collectLoopScalars(unsigned VF);
2154 
2155   /// Collect Uniform and Scalar values for the given \p VF.
2156   /// The sets depend on CM decision for Load/Store instructions
2157   /// that may be vectorized as interleave, gather-scatter or scalarized.
2158   void collectUniformsAndScalars(unsigned VF) {
2159     // Do the analysis once.
2160     if (VF == 1 || Uniforms.count(VF))
2161       return;
2162     setCostBasedWideningDecision(VF);
2163     collectLoopUniforms(VF);
2164     collectLoopScalars(VF);
2165   }
2166 
2167   /// Keeps cost model vectorization decision and cost for instructions.
2168   /// Right now it is used for memory instructions only.
2169   typedef DenseMap<std::pair<Instruction *, unsigned>,
2170                    std::pair<InstWidening, unsigned>>
2171       DecisionList;
2172 
2173   DecisionList WideningDecisions;
2174 
2175 public:
2176   /// The loop that we evaluate.
2177   Loop *TheLoop;
2178   /// Predicated scalar evolution analysis.
2179   PredicatedScalarEvolution &PSE;
2180   /// Loop Info analysis.
2181   LoopInfo *LI;
2182   /// Vectorization legality.
2183   LoopVectorizationLegality *Legal;
2184   /// Vector target information.
2185   const TargetTransformInfo &TTI;
2186   /// Target Library Info.
2187   const TargetLibraryInfo *TLI;
2188   /// Demanded bits analysis.
2189   DemandedBits *DB;
2190   /// Assumption cache.
2191   AssumptionCache *AC;
2192   /// Interface to emit optimization remarks.
2193   OptimizationRemarkEmitter *ORE;
2194 
2195   const Function *TheFunction;
2196   /// Loop Vectorize Hint.
2197   const LoopVectorizeHints *Hints;
2198   /// Values to ignore in the cost model.
2199   SmallPtrSet<const Value *, 16> ValuesToIgnore;
2200   /// Values to ignore in the cost model when VF > 1.
2201   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
2202 };
2203 
2204 /// LoopVectorizationPlanner - drives the vectorization process after having
2205 /// passed Legality checks.
2206 class LoopVectorizationPlanner {
2207 public:
2208   LoopVectorizationPlanner(Loop *OrigLoop, LoopInfo *LI,
2209                            LoopVectorizationLegality *Legal,
2210                            LoopVectorizationCostModel &CM)
2211       : OrigLoop(OrigLoop), LI(LI), Legal(Legal), CM(CM) {}
2212 
2213   ~LoopVectorizationPlanner() {}
2214 
2215   /// Plan how to best vectorize, return the best VF and its cost.
2216   LoopVectorizationCostModel::VectorizationFactor plan(bool OptForSize,
2217                                                        unsigned UserVF);
2218 
2219   /// Generate the IR code for the vectorized loop.
2220   void executePlan(InnerLoopVectorizer &ILV);
2221 
2222 protected:
2223   /// Collect the instructions from the original loop that would be trivially
2224   /// dead in the vectorized loop if generated.
2225   void collectTriviallyDeadInstructions(
2226       SmallPtrSetImpl<Instruction *> &DeadInstructions);
2227 
2228 private:
2229   /// The loop that we evaluate.
2230   Loop *OrigLoop;
2231 
2232   /// Loop Info analysis.
2233   LoopInfo *LI;
2234 
2235   /// The legality analysis.
2236   LoopVectorizationLegality *Legal;
2237 
2238   /// The profitablity analysis.
2239   LoopVectorizationCostModel &CM;
2240 };
2241 
2242 /// \brief This holds vectorization requirements that must be verified late in
2243 /// the process. The requirements are set by legalize and costmodel. Once
2244 /// vectorization has been determined to be possible and profitable the
2245 /// requirements can be verified by looking for metadata or compiler options.
2246 /// For example, some loops require FP commutativity which is only allowed if
2247 /// vectorization is explicitly specified or if the fast-math compiler option
2248 /// has been provided.
2249 /// Late evaluation of these requirements allows helpful diagnostics to be
2250 /// composed that tells the user what need to be done to vectorize the loop. For
2251 /// example, by specifying #pragma clang loop vectorize or -ffast-math. Late
2252 /// evaluation should be used only when diagnostics can generated that can be
2253 /// followed by a non-expert user.
2254 class LoopVectorizationRequirements {
2255 public:
2256   LoopVectorizationRequirements(OptimizationRemarkEmitter &ORE)
2257       : NumRuntimePointerChecks(0), UnsafeAlgebraInst(nullptr), ORE(ORE) {}
2258 
2259   void addUnsafeAlgebraInst(Instruction *I) {
2260     // First unsafe algebra instruction.
2261     if (!UnsafeAlgebraInst)
2262       UnsafeAlgebraInst = I;
2263   }
2264 
2265   void addRuntimePointerChecks(unsigned Num) { NumRuntimePointerChecks = Num; }
2266 
2267   bool doesNotMeet(Function *F, Loop *L, const LoopVectorizeHints &Hints) {
2268     const char *PassName = Hints.vectorizeAnalysisPassName();
2269     bool Failed = false;
2270     if (UnsafeAlgebraInst && !Hints.allowReordering()) {
2271       ORE.emit(
2272           OptimizationRemarkAnalysisFPCommute(PassName, "CantReorderFPOps",
2273                                               UnsafeAlgebraInst->getDebugLoc(),
2274                                               UnsafeAlgebraInst->getParent())
2275           << "loop not vectorized: cannot prove it is safe to reorder "
2276              "floating-point operations");
2277       Failed = true;
2278     }
2279 
2280     // Test if runtime memcheck thresholds are exceeded.
2281     bool PragmaThresholdReached =
2282         NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
2283     bool ThresholdReached =
2284         NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
2285     if ((ThresholdReached && !Hints.allowReordering()) ||
2286         PragmaThresholdReached) {
2287       ORE.emit(OptimizationRemarkAnalysisAliasing(PassName, "CantReorderMemOps",
2288                                                   L->getStartLoc(),
2289                                                   L->getHeader())
2290                << "loop not vectorized: cannot prove it is safe to reorder "
2291                   "memory operations");
2292       DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
2293       Failed = true;
2294     }
2295 
2296     return Failed;
2297   }
2298 
2299 private:
2300   unsigned NumRuntimePointerChecks;
2301   Instruction *UnsafeAlgebraInst;
2302 
2303   /// Interface to emit optimization remarks.
2304   OptimizationRemarkEmitter &ORE;
2305 };
2306 
2307 static void addAcyclicInnerLoop(Loop &L, SmallVectorImpl<Loop *> &V) {
2308   if (L.empty()) {
2309     if (!hasCyclesInLoopBody(L))
2310       V.push_back(&L);
2311     return;
2312   }
2313   for (Loop *InnerL : L)
2314     addAcyclicInnerLoop(*InnerL, V);
2315 }
2316 
/// The LoopVectorize Pass.
///
/// Legacy pass-manager wrapper: it only collects the analyses and forwards
/// the real work to LoopVectorizePass::runImpl.
struct LoopVectorize : public FunctionPass {
  /// Pass identification, replacement for typeid
  static char ID;

  explicit LoopVectorize(bool NoUnrolling = false, bool AlwaysVectorize = true)
      : FunctionPass(ID) {
    // Forward the pass-construction options to the shared implementation.
    Impl.DisableUnrolling = NoUnrolling;
    Impl.AlwaysVectorize = AlwaysVectorize;
    initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
  }

  /// The shared pass implementation this wrapper delegates to.
  LoopVectorizePass Impl;

  bool runOnFunction(Function &F) override {
    if (skipFunction(F))
      return false;

    // Gather every analysis runImpl depends on.
    auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
    auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
    auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
    // Target library info is optional; pass null when unavailable.
    auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
    auto *TLI = TLIP ? &TLIP->getTLI() : nullptr;
    auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
    auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
    auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
    auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
    auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();

    // Loop-access info is computed lazily, per loop, through this callback.
    std::function<const LoopAccessInfo &(Loop &)> GetLAA =
        [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };

    return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
                        GetLAA, *ORE);
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // Analyses required by runOnFunction above.
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<BlockFrequencyInfoWrapperPass>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<LoopInfoWrapperPass>();
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.addRequired<AAResultsWrapperPass>();
    AU.addRequired<LoopAccessLegacyAnalysis>();
    AU.addRequired<DemandedBitsWrapperPass>();
    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
    // Analyses kept up to date by the transformation.
    AU.addPreserved<LoopInfoWrapperPass>();
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addPreserved<BasicAAWrapperPass>();
    AU.addPreserved<GlobalsAAWrapperPass>();
  }
};
2372 
2373 } // end anonymous namespace
2374 
2375 //===----------------------------------------------------------------------===//
2376 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2377 // LoopVectorizationCostModel and LoopVectorizationPlanner.
2378 //===----------------------------------------------------------------------===//
2379 
2380 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
2381   // We need to place the broadcast of invariant variables outside the loop.
2382   Instruction *Instr = dyn_cast<Instruction>(V);
2383   bool NewInstr = (Instr && Instr->getParent() == LoopVectorBody);
2384   bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr;
2385 
2386   // Place the code for broadcasting invariant variables in the new preheader.
2387   IRBuilder<>::InsertPointGuard Guard(Builder);
2388   if (Invariant)
2389     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2390 
2391   // Broadcast the scalar into all locations in the vector.
2392   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
2393 
2394   return Shuf;
2395 }
2396 
// Create a widened (vector) PHI for an integer or floating-point induction
// described by \p II, with loop-invariant step \p Step. \p EntryVal is the
// original-loop value being replaced (the IV itself, or a truncate of it).
void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
    const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
  Value *Start = II.getStartValue();

  // Construct the initial value of the vector IV in the vector loop preheader
  auto CurrIP = Builder.saveIP();
  Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
  // If the IV is used through a truncate, narrow both the start value and the
  // step to the truncated type before widening.
  if (isa<TruncInst>(EntryVal)) {
    assert(Start->getType()->isIntegerTy() &&
           "Truncation requires an integer type");
    auto *TruncType = cast<IntegerType>(EntryVal->getType());
    Step = Builder.CreateTrunc(Step, TruncType);
    Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
  }
  // Initial vector IV value: splat of the start plus <0,1,...,VF-1> steps.
  Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
  Value *SteppedStart =
      getStepVector(SplatStart, 0, Step, II.getInductionOpcode());

  // We create vector phi nodes for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (Step->getType()->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = II.getInductionOpcode();
    MulOp = Instruction::FMul;
  }

  // Multiply the vectorization factor by the step using integer or
  // floating-point arithmetic as appropriate.
  Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
  Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));

  // Create a vector splat to use in the induction update.
  //
  // FIXME: If the step is non-constant, we create the vector splat with
  //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
  //        handle a constant vector splat.
  Value *SplatVF = isa<Constant>(Mul)
                       ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
                       : Builder.CreateVectorSplat(VF, Mul);
  Builder.restoreIP(CurrIP);

  // We may need to add the step a number of times, depending on the unroll
  // factor. The last of those goes into the PHI.
  PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
                                    &*LoopVectorBody->getFirstInsertionPt());
  Instruction *LastInduction = VecInd;
  // Each unroll part gets its own copy, offset by VF*Step from the previous.
  for (unsigned Part = 0; Part < UF; ++Part) {
    VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
    if (isa<TruncInst>(EntryVal))
      addMetadata(LastInduction, EntryVal);
    LastInduction = cast<Instruction>(addFastMathFlag(
        Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
  }

  // Move the last step to the end of the latch block. This ensures consistent
  // placement of all induction updates.
  auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
  auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
  auto *ICmp = cast<Instruction>(Br->getCondition());
  LastInduction->moveBefore(ICmp);
  LastInduction->setName("vec.ind.next");

  // Wire up the PHI: start value from the preheader, update from the latch.
  VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
  VecInd->addIncoming(LastInduction, LoopVectorLatch);
}
2466 
2467 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
2468   return Cost->isScalarAfterVectorization(I, VF) ||
2469          Cost->isProfitableToScalarize(I, VF);
2470 }
2471 
2472 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
2473   if (shouldScalarizeInstruction(IV))
2474     return true;
2475   auto isScalarInst = [&](User *U) -> bool {
2476     auto *I = cast<Instruction>(U);
2477     return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
2478   };
2479   return any_of(IV->users(), isScalarInst);
2480 }
2481 
// Widen the integer or floating-point induction variable \p IV. If \p Trunc
// is non-null, the induction is used through that truncate and is widened at
// the truncated width instead.
void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {

  assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
         "Primary induction variable must have an integer type");

  auto II = Legal->getInductionVars()->find(IV);
  assert(II != Legal->getInductionVars()->end() && "IV is not an induction");

  auto ID = II->second;
  assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");

  // The scalar value to broadcast. This will be derived from the canonical
  // induction variable.
  Value *ScalarIV = nullptr;

  // The value from the original loop to which we are mapping the new induction
  // variable.
  Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;

  // True if we have vectorized the induction variable.
  auto VectorizedIV = false;

  // Determine if we want a scalar version of the induction variable. This is
  // true if the induction variable itself is not widened, or if it has at
  // least one user in the loop that is not widened.
  auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal);

  // Generate code for the induction step. Note that induction steps are
  // required to be loop-invariant
  assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) &&
         "Induction step should be loop invariant");
  auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
  Value *Step = nullptr;
  // Expand the step via SCEV when possible; otherwise fall back to the raw
  // value wrapped in the SCEVUnknown.
  if (PSE.getSE()->isSCEVable(IV->getType())) {
    SCEVExpander Exp(*PSE.getSE(), DL, "induction");
    Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
                             LoopVectorPreHeader->getTerminator());
  } else {
    Step = cast<SCEVUnknown>(ID.getStep())->getValue();
  }

  // Try to create a new independent vector induction variable. If we can't
  // create the phi node, we will splat the scalar induction variable in each
  // loop iteration.
  if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) {
    createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
    VectorizedIV = true;
  }

  // If we haven't yet vectorized the induction variable, or if we will create
  // a scalar one, we need to define the scalar induction variable and step
  // values. If we were given a truncation type, truncate the canonical
  // induction variable and step. Otherwise, derive these values from the
  // induction descriptor.
  if (!VectorizedIV || NeedsScalarIV) {
    ScalarIV = Induction;
    if (IV != OldInduction) {
      // Derive this induction from the canonical one via the descriptor's
      // transform, converting the canonical IV to the right type first.
      ScalarIV = IV->getType()->isIntegerTy()
                     ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
                     : Builder.CreateCast(Instruction::SIToFP, Induction,
                                          IV->getType());
      ScalarIV = ID.transform(Builder, ScalarIV, PSE.getSE(), DL);
      ScalarIV->setName("offset.idx");
    }
    if (Trunc) {
      auto *TruncType = cast<IntegerType>(Trunc->getType());
      assert(Step->getType()->isIntegerTy() &&
             "Truncation requires an integer step");
      ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
      Step = Builder.CreateTrunc(Step, TruncType);
    }
  }

  // If we haven't yet vectorized the induction variable, splat the scalar
  // induction variable, and build the necessary step vectors.
  if (!VectorizedIV) {
    Value *Broadcasted = getBroadcastInstrs(ScalarIV);
    for (unsigned Part = 0; Part < UF; ++Part) {
      // Each unroll part is offset by VF * Part steps.
      Value *EntryPart =
          getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
      VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
      if (Trunc)
        addMetadata(EntryPart, Trunc);
    }
  }

  // If an induction variable is only used for counting loop iterations or
  // calculating addresses, it doesn't need to be widened. Create scalar steps
  // that can be used by instructions we will later scalarize. Note that the
  // addition of the scalar steps will not increase the number of instructions
  // in the loop in the common case prior to InstCombine. We will be trading
  // one vector extract for each scalar step.
  if (NeedsScalarIV)
    buildScalarSteps(ScalarIV, Step, EntryVal, ID);
}
2577 
2578 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
2579                                           Instruction::BinaryOps BinOp) {
2580   // Create and check the types.
2581   assert(Val->getType()->isVectorTy() && "Must be a vector");
2582   int VLen = Val->getType()->getVectorNumElements();
2583 
2584   Type *STy = Val->getType()->getScalarType();
2585   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2586          "Induction Step must be an integer or FP");
2587   assert(Step->getType() == STy && "Step has wrong type");
2588 
2589   SmallVector<Constant *, 8> Indices;
2590 
2591   if (STy->isIntegerTy()) {
2592     // Create a vector of consecutive numbers from zero to VF.
2593     for (int i = 0; i < VLen; ++i)
2594       Indices.push_back(ConstantInt::get(STy, StartIdx + i));
2595 
2596     // Add the consecutive indices to the vector value.
2597     Constant *Cv = ConstantVector::get(Indices);
2598     assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
2599     Step = Builder.CreateVectorSplat(VLen, Step);
2600     assert(Step->getType() == Val->getType() && "Invalid step vec");
2601     // FIXME: The newly created binary instructions should contain nsw/nuw flags,
2602     // which can be found from the original scalar operations.
2603     Step = Builder.CreateMul(Cv, Step);
2604     return Builder.CreateAdd(Val, Step, "induction");
2605   }
2606 
2607   // Floating point induction.
2608   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2609          "Binary Opcode should be specified for FP induction");
2610   // Create a vector of consecutive numbers from zero to VF.
2611   for (int i = 0; i < VLen; ++i)
2612     Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
2613 
2614   // Add the consecutive indices to the vector value.
2615   Constant *Cv = ConstantVector::get(Indices);
2616 
2617   Step = Builder.CreateVectorSplat(VLen, Step);
2618 
2619   // Floating point operations had to be 'fast' to enable the induction.
2620   FastMathFlags Flags;
2621   Flags.setUnsafeAlgebra();
2622 
2623   Value *MulOp = Builder.CreateFMul(Cv, Step);
2624   if (isa<Instruction>(MulOp))
2625     // Have to check, MulOp may be a constant
2626     cast<Instruction>(MulOp)->setFastMathFlags(Flags);
2627 
2628   Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2629   if (isa<Instruction>(BOp))
2630     cast<Instruction>(BOp)->setFastMathFlags(Flags);
2631   return BOp;
2632 }
2633 
2634 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
2635                                            Value *EntryVal,
2636                                            const InductionDescriptor &ID) {
2637 
2638   // We shouldn't have to build scalar steps if we aren't vectorizing.
2639   assert(VF > 1 && "VF should be greater than one");
2640 
2641   // Get the value type and ensure it and the step have the same integer type.
2642   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2643   assert(ScalarIVTy == Step->getType() &&
2644          "Val and Step should have the same type");
2645 
2646   // We build scalar steps for both integer and floating-point induction
2647   // variables. Here, we determine the kind of arithmetic we will perform.
2648   Instruction::BinaryOps AddOp;
2649   Instruction::BinaryOps MulOp;
2650   if (ScalarIVTy->isIntegerTy()) {
2651     AddOp = Instruction::Add;
2652     MulOp = Instruction::Mul;
2653   } else {
2654     AddOp = ID.getInductionOpcode();
2655     MulOp = Instruction::FMul;
2656   }
2657 
2658   // Determine the number of scalars we need to generate for each unroll
2659   // iteration. If EntryVal is uniform, we only need to generate the first
2660   // lane. Otherwise, we generate all VF values.
2661   unsigned Lanes =
2662     Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1 : VF;
2663 
2664   // Compute the scalar steps and save the results in VectorLoopValueMap.
2665   for (unsigned Part = 0; Part < UF; ++Part) {
2666     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2667       auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
2668       auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
2669       auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
2670       VectorLoopValueMap.setScalarValue(EntryVal, Part, Lane, Add);
2671     }
2672   }
2673 }
2674 
2675 int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
2676 
2677   const ValueToValueMap &Strides = getSymbolicStrides() ? *getSymbolicStrides() :
2678     ValueToValueMap();
2679 
2680   int Stride = getPtrStride(PSE, Ptr, TheLoop, Strides, true, false);
2681   if (Stride == 1 || Stride == -1)
2682     return Stride;
2683   return 0;
2684 }
2685 
// Forward the uniformity query to loop-access analysis.
bool LoopVectorizationLegality::isUniform(Value *V) {
  return LAI->isUniform(V);
}
2689 
// Return the vector value corresponding to \p V for unroll iteration \p Part,
// creating it on demand from scalarized values or by broadcasting.
Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
  assert(V != Induction && "The new induction variable should not be used.");
  assert(!V->getType()->isVectorTy() && "Can't widen a vector");
  assert(!V->getType()->isVoidTy() && "Type does not produce a value");

  // If we have a stride that is replaced by one, do it here.
  if (Legal->hasStride(V))
    V = ConstantInt::get(V->getType(), 1);

  // If we have a vector mapped to this value, return it.
  if (VectorLoopValueMap.hasVectorValue(V, Part))
    return VectorLoopValueMap.getVectorValue(V, Part);

  // If the value has not been vectorized, check if it has been scalarized
  // instead. If it has been scalarized, and we actually need the value in
  // vector form, we will construct the vector values on demand.
  if (VectorLoopValueMap.hasAnyScalarValue(V)) {

    Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, Part, 0);

    // If we've scalarized a value, that value should be an instruction.
    auto *I = cast<Instruction>(V);

    // If we aren't vectorizing, we can just copy the scalar map values over to
    // the vector map.
    if (VF == 1) {
      VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
      return ScalarValue;
    }

    // Get the last scalar instruction we generated for V and Part. If the value
    // is known to be uniform after vectorization, this corresponds to lane zero
    // of the Part unroll iteration. Otherwise, the last instruction is the one
    // we created for the last vector lane of the Part unroll iteration.
    unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
    auto *LastInst =
        cast<Instruction>(VectorLoopValueMap.getScalarValue(V, Part, LastLane));

    // Set the insert point after the last scalarized instruction. This ensures
    // the insertelement sequence will directly follow the scalar definitions.
    auto OldIP = Builder.saveIP();
    auto NewIP = std::next(BasicBlock::iterator(LastInst));
    Builder.SetInsertPoint(&*NewIP);

    // However, if we are vectorizing, we need to construct the vector values.
    // If the value is known to be uniform after vectorization, we can just
    // broadcast the scalar value corresponding to lane zero for each unroll
    // iteration. Otherwise, we construct the vector values using insertelement
    // instructions. Since the resulting vectors are stored in
    // VectorLoopValueMap, we will only generate the insertelements once.
    Value *VectorValue = nullptr;
    if (Cost->isUniformAfterVectorization(I, VF)) {
      VectorValue = getBroadcastInstrs(ScalarValue);
    } else {
      // Insert the VF scalar lanes into an undef vector one by one.
      VectorValue = UndefValue::get(VectorType::get(V->getType(), VF));
      for (unsigned Lane = 0; Lane < VF; ++Lane)
        VectorValue = Builder.CreateInsertElement(
            VectorValue, getOrCreateScalarValue(V, Part, Lane),
            Builder.getInt32(Lane));
    }
    VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
    // Restore the caller's insertion point before returning.
    Builder.restoreIP(OldIP);
    return VectorValue;
  }

  // If this scalar is unknown, assume that it is a constant or that it is
  // loop invariant. Broadcast V and save the value for future uses.
  Value *B = getBroadcastInstrs(V);
  VectorLoopValueMap.setVectorValue(V, Part, B);
  return B;
}
2761 
2762 Value *InnerLoopVectorizer::getOrCreateScalarValue(Value *V, unsigned Part,
2763                                                    unsigned Lane) {
2764 
2765   // If the value is not an instruction contained in the loop, it should
2766   // already be scalar.
2767   if (OrigLoop->isLoopInvariant(V))
2768     return V;
2769 
2770   assert(Lane > 0 ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
2771                   : true && "Uniform values only have lane zero");
2772 
2773   // If the value from the original loop has not been vectorized, it is
2774   // represented by UF x VF scalar values in the new loop. Return the requested
2775   // scalar value.
2776   if (VectorLoopValueMap.hasScalarValue(V, Part, Lane))
2777     return VectorLoopValueMap.getScalarValue(V, Part, Lane);
2778 
2779   // If the value has not been scalarized, get its entry in VectorLoopValueMap
2780   // for the given unroll part. If this entry is not a vector type (i.e., the
2781   // vectorization factor is one), there is no need to generate an
2782   // extractelement instruction.
2783   auto *U = getOrCreateVectorValue(V, Part);
2784   if (!U->getType()->isVectorTy()) {
2785     assert(VF == 1 && "Value not scalarized has non-vector type");
2786     return U;
2787   }
2788 
2789   // Otherwise, the value from the original loop has been vectorized and is
2790   // represented by UF vector values. Extract and return the requested scalar
2791   // value from the appropriate vector lane.
2792   return Builder.CreateExtractElement(U, Builder.getInt32(Lane));
2793 }
2794 
2795 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2796   assert(Vec->getType()->isVectorTy() && "Invalid type");
2797   SmallVector<Constant *, 8> ShuffleMask;
2798   for (unsigned i = 0; i < VF; ++i)
2799     ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
2800 
2801   return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
2802                                      ConstantVector::get(ShuffleMask),
2803                                      "reverse");
2804 }
2805 
// Try to vectorize the interleave group that \p Instr belongs to.
//
// E.g. Translate following interleaved load group (factor = 3):
//   for (i = 0; i < N; i+=3) {
//     R = Pic[i];             // Member of index 0
//     G = Pic[i+1];           // Member of index 1
//     B = Pic[i+2];           // Member of index 2
//     ... // do something to R, G, B
//   }
// To:
//   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
//   %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9>   ; R elements
//   %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10>  ; G elements
//   %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11>  ; B elements
//
// Or translate following interleaved store group (factor = 3):
//   for (i = 0; i < N; i+=3) {
//     ... do something to R, G, B
//     Pic[i]   = R;           // Member of index 0
//     Pic[i+1] = G;           // Member of index 1
//     Pic[i+2] = B;           // Member of index 2
//   }
// To:
//   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
//   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
//   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
//        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
//   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
  const InterleaveGroup *Group = Legal->getInterleavedAccessGroup(Instr);
  assert(Group && "Fail to get an interleaved access group.");

  // Skip if current instruction is not the insert position. The whole group
  // is emitted exactly once, when the designated insert-position member is
  // reached; other members will find their values in VectorLoopValueMap.
  if (Instr != Group->getInsertPos())
    return;

  Value *Ptr = getPointerOperand(Instr);

  // Prepare for the vector type of the interleaved load/store: one wide
  // vector of InterleaveFactor * VF elements of the member scalar type.
  Type *ScalarTy = getMemInstValueType(Instr);
  unsigned InterleaveFactor = Group->getFactor();
  Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
  Type *PtrTy = VecTy->getPointerTo(getMemInstAddressSpace(Instr));

  // Prepare for the new pointers, one per unroll part.
  setDebugLocFromInst(Builder, Ptr);
  SmallVector<Value *, 2> NewPtrs;
  unsigned Index = Group->getIndex(Instr);

  // If the group is reverse, adjust the index to refer to the last vector lane
  // instead of the first. We adjust the index from the first vector lane,
  // rather than directly getting the pointer for lane VF - 1, because the
  // pointer operand of the interleaved access is supposed to be uniform. For
  // uniform instructions, we're only required to generate a value for the
  // first vector lane in each unroll iteration.
  if (Group->isReverse())
    Index += (VF - 1) * Group->getFactor();

  for (unsigned Part = 0; Part < UF; Part++) {
    Value *NewPtr = getOrCreateScalarValue(Ptr, Part, 0);

    // Notice current instruction could be any index. Need to adjust the address
    // to the member of index 0.
    //
    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
    //       b = A[i];       // Member of index 0
    // Current pointer is pointed to A[i+1], adjust it to A[i].
    //
    // E.g.  A[i+1] = a;     // Member of index 1
    //       A[i]   = b;     // Member of index 0
    //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // Current pointer is pointed to A[i+2], adjust it to A[i].
    NewPtr = Builder.CreateGEP(NewPtr, Builder.getInt32(-Index));

    // Cast to the vector pointer type.
    NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy));
  }

  setDebugLocFromInst(Builder, Instr);
  Value *UndefVec = UndefValue::get(VecTy);

  // Vectorize the interleaved load group.
  if (isa<LoadInst>(Instr)) {

    // For each unroll part, create a wide load for the group.
    SmallVector<Value *, 2> NewLoads;
    for (unsigned Part = 0; Part < UF; Part++) {
      auto *NewLoad = Builder.CreateAlignedLoad(
          NewPtrs[Part], Group->getAlignment(), "wide.vec");
      addMetadata(NewLoad, Instr);
      NewLoads.push_back(NewLoad);
    }

    // For each member in the group, shuffle out the appropriate data from the
    // wide loads.
    for (unsigned I = 0; I < InterleaveFactor; ++I) {
      Instruction *Member = Group->getMember(I);

      // Skip the gaps in the group.
      if (!Member)
        continue;

      Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF);
      for (unsigned Part = 0; Part < UF; Part++) {
        Value *StridedVec = Builder.CreateShuffleVector(
            NewLoads[Part], UndefVec, StrideMask, "strided.vec");

        // If this member has different type, cast the result type.
        if (Member->getType() != ScalarTy) {
          VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
          StridedVec = Builder.CreateBitOrPointerCast(StridedVec, OtherVTy);
        }

        if (Group->isReverse())
          StridedVec = reverseVector(StridedVec);

        // Record the member's vector value so its users find it in the map.
        VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
      }
    }
    return;
  }

  // The sub vector type for current instruction.
  VectorType *SubVT = VectorType::get(ScalarTy, VF);

  // Vectorize the interleaved store group.
  for (unsigned Part = 0; Part < UF; Part++) {
    // Collect the stored vector from each member.
    SmallVector<Value *, 4> StoredVecs;
    for (unsigned i = 0; i < InterleaveFactor; i++) {
      // Interleaved store group doesn't allow a gap, so each index has a member
      Instruction *Member = Group->getMember(i);
      assert(Member && "Fail to get a member from an interleaved store group");

      Value *StoredVec = getOrCreateVectorValue(
          cast<StoreInst>(Member)->getValueOperand(), Part);
      if (Group->isReverse())
        StoredVec = reverseVector(StoredVec);

      // If this member has different type, cast it to an unified type.
      if (StoredVec->getType() != SubVT)
        StoredVec = Builder.CreateBitOrPointerCast(StoredVec, SubVT);

      StoredVecs.push_back(StoredVec);
    }

    // Concatenate all vectors into a wide vector.
    Value *WideVec = concatenateVectors(Builder, StoredVecs);

    // Interleave the elements in the wide vector.
    Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor);
    Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
                                              "interleaved.vec");

    Instruction *NewStoreInstr =
        Builder.CreateAlignedStore(IVec, NewPtrs[Part], Group->getAlignment());
    addMetadata(NewStoreInstr, Instr);
  }
}
2965 
// Widen the memory access \p Instr according to the cost model's widening
// decision: delegate to the interleave-group lowering, scalarize it, emit a
// masked gather/scatter, or emit a (possibly masked and/or reversed) wide
// load or store.
void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
  // Attempt to issue a wide load.
  LoadInst *LI = dyn_cast<LoadInst>(Instr);
  StoreInst *SI = dyn_cast<StoreInst>(Instr);

  assert((LI || SI) && "Invalid Load/Store instruction");

  LoopVectorizationCostModel::InstWidening Decision =
      Cost->getWideningDecision(Instr, VF);
  assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
         "CM decision should be taken at this point");
  if (Decision == LoopVectorizationCostModel::CM_Interleave)
    return vectorizeInterleaveGroup(Instr);

  Type *ScalarDataTy = getMemInstValueType(Instr);
  Type *DataTy = VectorType::get(ScalarDataTy, VF);
  Value *Ptr = getPointerOperand(Instr);
  unsigned Alignment = getMemInstAlignment(Instr);
  // An alignment of 0 means target abi alignment. We need to use the scalar's
  // target abi alignment in such a case.
  const DataLayout &DL = Instr->getModule()->getDataLayout();
  if (!Alignment)
    Alignment = DL.getABITypeAlignment(ScalarDataTy);
  unsigned AddressSpace = getMemInstAddressSpace(Instr);

  // Scalarize the memory instruction if necessary.
  if (Decision == LoopVectorizationCostModel::CM_Scalarize)
    return scalarizeInstruction(Instr, Legal->isScalarWithPredication(Instr));

  // Determine if the pointer operand of the access is either consecutive or
  // reverse consecutive (negative stride).
  int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
  bool Reverse = ConsecutiveStride < 0;
  bool CreateGatherScatter =
      (Decision == LoopVectorizationCostModel::CM_GatherScatter);

  // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
  // gather/scatter. Otherwise Decision should have been to Scalarize.
  assert((ConsecutiveStride || CreateGatherScatter) &&
         "The instruction should be scalarized");

  // Handle consecutive loads/stores: the scalar pointer of lane zero of part
  // zero serves as the base address for all unroll parts.
  if (ConsecutiveStride)
    Ptr = getOrCreateScalarValue(Ptr, 0, 0);

  VectorParts Mask = createBlockInMask(Instr->getParent());
  // Handle Stores:
  if (SI) {
    assert(!Legal->isUniform(SI->getPointerOperand()) &&
           "We do not allow storing to uniform addresses");
    setDebugLocFromInst(Builder, SI);

    for (unsigned Part = 0; Part < UF; ++Part) {
      Instruction *NewSI = nullptr;
      Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part);
      if (CreateGatherScatter) {
        Value *MaskPart = Legal->isMaskRequired(SI) ? Mask[Part] : nullptr;
        Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
        NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
                                            MaskPart);
      } else {
        // Calculate the pointer for the specific unroll-part.
        Value *PartPtr =
            Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF));

        if (Reverse) {
          // If we store to reverse consecutive memory locations, then we need
          // to reverse the order of elements in the stored value.
          StoredVal = reverseVector(StoredVal);
          // We don't want to update the value in the map as it might be used in
          // another expression. So don't call resetVectorValue(StoredVal).

          // If the address is consecutive but reversed, then the
          // wide store needs to start at the last vector element.
          PartPtr =
              Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF));
          PartPtr =
              Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF));
          Mask[Part] = reverseVector(Mask[Part]);
        }

        Value *VecPtr =
            Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));

        if (Legal->isMaskRequired(SI))
          NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
                                            Mask[Part]);
        else
          NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
      }
      addMetadata(NewSI, SI);
    }
    return;
  }

  // Handle loads.
  assert(LI && "Must have a load instruction");
  setDebugLocFromInst(Builder, LI);
  for (unsigned Part = 0; Part < UF; ++Part) {
    Value *NewLI;
    if (CreateGatherScatter) {
      Value *MaskPart = Legal->isMaskRequired(LI) ? Mask[Part] : nullptr;
      Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
      NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
                                         nullptr, "wide.masked.gather");
      addMetadata(NewLI, LI);
    } else {
      // Calculate the pointer for the specific unroll-part.
      Value *PartPtr =
          Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF));

      if (Reverse) {
        // If the address is consecutive but reversed, then the
        // wide load needs to start at the last vector element.
        PartPtr = Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF));
        PartPtr = Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF));
        Mask[Part] = reverseVector(Mask[Part]);
      }

      Value *VecPtr =
          Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
      if (Legal->isMaskRequired(LI))
        NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part],
                                         UndefValue::get(DataTy),
                                         "wide.masked.load");
      else
        NewLI = Builder.CreateAlignedLoad(VecPtr, Alignment, "wide.load");

      // Add metadata to the load, but setVectorValue to the reverse shuffle.
      addMetadata(NewLI, LI);
      if (Reverse)
        NewLI = reverseVector(NewLI);
    }
    VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
  }
}
3102 
3103 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
3104                                                bool IfPredicateInstr) {
3105   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
3106   DEBUG(dbgs() << "LV: Scalarizing"
3107                << (IfPredicateInstr ? " and predicating:" : ":") << *Instr
3108                << '\n');
3109   // Holds vector parameters or scalars, in case of uniform vals.
3110   SmallVector<VectorParts, 4> Params;
3111 
3112   setDebugLocFromInst(Builder, Instr);
3113 
3114   // Does this instruction return a value ?
3115   bool IsVoidRetTy = Instr->getType()->isVoidTy();
3116 
3117   VectorParts Cond;
3118   if (IfPredicateInstr)
3119     Cond = createBlockInMask(Instr->getParent());
3120 
3121   // Determine the number of scalars we need to generate for each unroll
3122   // iteration. If the instruction is uniform, we only need to generate the
3123   // first lane. Otherwise, we generate all VF values.
3124   unsigned Lanes = Cost->isUniformAfterVectorization(Instr, VF) ? 1 : VF;
3125 
3126   // For each vector unroll 'part':
3127   for (unsigned Part = 0; Part < UF; ++Part) {
3128     // For each scalar that we create:
3129     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
3130 
3131       // Start if-block.
3132       Value *Cmp = nullptr;
3133       if (IfPredicateInstr) {
3134         Cmp = Cond[Part];
3135         if (Cmp->getType()->isVectorTy())
3136           Cmp = Builder.CreateExtractElement(Cmp, Builder.getInt32(Lane));
3137         Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp,
3138                                  ConstantInt::get(Cmp->getType(), 1));
3139       }
3140 
3141       Instruction *Cloned = Instr->clone();
3142       if (!IsVoidRetTy)
3143         Cloned->setName(Instr->getName() + ".cloned");
3144 
3145       // Replace the operands of the cloned instructions with their scalar
3146       // equivalents in the new loop.
3147       for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
3148         auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Part, Lane);
3149         Cloned->setOperand(op, NewOp);
3150       }
3151       addNewMetadata(Cloned, Instr);
3152 
3153       // Place the cloned scalar in the new loop.
3154       Builder.Insert(Cloned);
3155 
3156       // Add the cloned scalar to the scalar map entry.
3157       VectorLoopValueMap.setScalarValue(Instr, Part, Lane, Cloned);
3158 
3159       // If we just cloned a new assumption, add it the assumption cache.
3160       if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
3161         if (II->getIntrinsicID() == Intrinsic::assume)
3162           AC->registerAssumption(II);
3163 
3164       // End if-block.
3165       if (IfPredicateInstr)
3166         PredicatedInstructions.push_back(std::make_pair(Cloned, Cmp));
3167     }
3168   }
3169 }
3170 
// Create the canonical induction variable for the new loop \p L: a phi node
// in the header that starts at \p Start, is incremented by \p Step in the
// latch, and exits the loop when the incremented value equals \p End.
// NOTE(review): the \p DL parameter is not referenced below; the debug
// location is recomputed from OldInduction instead — confirm whether \p DL
// was intended to be used here.
PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
                                                      Value *End, Value *Step,
                                                      Instruction *DL) {
  BasicBlock *Header = L->getHeader();
  BasicBlock *Latch = L->getLoopLatch();
  // As we're just creating this loop, it's possible no latch exists
  // yet. If so, use the header as this will be a single block loop.
  if (!Latch)
    Latch = Header;

  // Place the phi at the top of the header, carrying the debug location of
  // the original induction (or its operands) when one is available.
  IRBuilder<> Builder(&*Header->getFirstInsertionPt());
  Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
  setDebugLocFromInst(Builder, OldInst);
  auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");

  // The increment and exit compare are emitted just before the latch branch.
  Builder.SetInsertPoint(Latch->getTerminator());
  setDebugLocFromInst(Builder, OldInst);

  // Create i+1 and fill the PHINode.
  Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
  Induction->addIncoming(Start, L->getLoopPreheader());
  Induction->addIncoming(Next, Latch);
  // Create the compare.
  Value *ICmp = Builder.CreateICmpEQ(Next, End);
  Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);

  // Now we have two terminators. Remove the old one from the block.
  Latch->getTerminator()->eraseFromParent();

  return Induction;
}
3202 
3203 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
3204   if (TripCount)
3205     return TripCount;
3206 
3207   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
3208   // Find the loop boundaries.
3209   ScalarEvolution *SE = PSE.getSE();
3210   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
3211   assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
3212          "Invalid loop count");
3213 
3214   Type *IdxTy = Legal->getWidestInductionType();
3215 
3216   // The exit count might have the type of i64 while the phi is i32. This can
3217   // happen if we have an induction variable that is sign extended before the
3218   // compare. The only way that we get a backedge taken count is that the
3219   // induction variable was signed and as such will not overflow. In such a case
3220   // truncation is legal.
3221   if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() >
3222       IdxTy->getPrimitiveSizeInBits())
3223     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
3224   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
3225 
3226   // Get the total trip count from the count by adding 1.
3227   const SCEV *ExitCount = SE->getAddExpr(
3228       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
3229 
3230   const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
3231 
3232   // Expand the trip count and place the new instructions in the preheader.
3233   // Notice that the pre-header does not change, only the loop body.
3234   SCEVExpander Exp(*SE, DL, "induction");
3235 
3236   // Count holds the overall loop count (N).
3237   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
3238                                 L->getLoopPreheader()->getTerminator());
3239 
3240   if (TripCount->getType()->isPointerTy())
3241     TripCount =
3242         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
3243                                     L->getLoopPreheader()->getTerminator());
3244 
3245   return TripCount;
3246 }
3247 
3248 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
3249   if (VectorTripCount)
3250     return VectorTripCount;
3251 
3252   Value *TC = getOrCreateTripCount(L);
3253   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
3254 
3255   // Now we need to generate the expression for the part of the loop that the
3256   // vectorized body will execute. This is equal to N - (N % Step) if scalar
3257   // iterations are not required for correctness, or N - Step, otherwise. Step
3258   // is equal to the vectorization factor (number of SIMD elements) times the
3259   // unroll factor (number of SIMD instructions).
3260   Constant *Step = ConstantInt::get(TC->getType(), VF * UF);
3261   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
3262 
3263   // If there is a non-reversed interleaved group that may speculatively access
3264   // memory out-of-bounds, we need to ensure that there will be at least one
3265   // iteration of the scalar epilogue loop. Thus, if the step evenly divides
3266   // the trip count, we set the remainder to be equal to the step. If the step
3267   // does not evenly divide the trip count, no adjustment is necessary since
3268   // there will already be scalar iterations. Note that the minimum iterations
3269   // check ensures that N >= Step.
3270   if (VF > 1 && Legal->requiresScalarEpilogue()) {
3271     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
3272     R = Builder.CreateSelect(IsZero, Step, R);
3273   }
3274 
3275   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
3276 
3277   return VectorTripCount;
3278 }
3279 
3280 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
3281                                                          BasicBlock *Bypass) {
3282   Value *Count = getOrCreateTripCount(L);
3283   BasicBlock *BB = L->getLoopPreheader();
3284   IRBuilder<> Builder(BB->getTerminator());
3285 
3286   // Generate code to check that the loop's trip count that we computed by
3287   // adding one to the backedge-taken count will not overflow.
3288   Value *CheckMinIters = Builder.CreateICmpULT(
3289       Count, ConstantInt::get(Count->getType(), VF * UF), "min.iters.check");
3290 
3291   BasicBlock *NewBB =
3292       BB->splitBasicBlock(BB->getTerminator(), "min.iters.checked");
3293   // Update dominator tree immediately if the generated block is a
3294   // LoopBypassBlock because SCEV expansions to generate loop bypass
3295   // checks may query it before the current function is finished.
3296   DT->addNewBlock(NewBB, BB);
3297   if (L->getParentLoop())
3298     L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
3299   ReplaceInstWithInst(BB->getTerminator(),
3300                       BranchInst::Create(Bypass, NewBB, CheckMinIters));
3301   LoopBypassBlocks.push_back(BB);
3302 }
3303 
3304 void InnerLoopVectorizer::emitVectorLoopEnteredCheck(Loop *L,
3305                                                      BasicBlock *Bypass) {
3306   Value *TC = getOrCreateVectorTripCount(L);
3307   BasicBlock *BB = L->getLoopPreheader();
3308   IRBuilder<> Builder(BB->getTerminator());
3309 
3310   // Now, compare the new count to zero. If it is zero skip the vector loop and
3311   // jump to the scalar loop.
3312   Value *Cmp = Builder.CreateICmpEQ(TC, Constant::getNullValue(TC->getType()),
3313                                     "cmp.zero");
3314 
3315   // Generate code to check that the loop's trip count that we computed by
3316   // adding one to the backedge-taken count will not overflow.
3317   BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
3318   // Update dominator tree immediately if the generated block is a
3319   // LoopBypassBlock because SCEV expansions to generate loop bypass
3320   // checks may query it before the current function is finished.
3321   DT->addNewBlock(NewBB, BB);
3322   if (L->getParentLoop())
3323     L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
3324   ReplaceInstWithInst(BB->getTerminator(),
3325                       BranchInst::Create(Bypass, NewBB, Cmp));
3326   LoopBypassBlocks.push_back(BB);
3327 }
3328 
3329 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
3330   BasicBlock *BB = L->getLoopPreheader();
3331 
3332   // Generate the code to check that the SCEV assumptions that we made.
3333   // We want the new basic block to start at the first instruction in a
3334   // sequence of instructions that form a check.
3335   SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
3336                    "scev.check");
3337   Value *SCEVCheck =
3338       Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator());
3339 
3340   if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
3341     if (C->isZero())
3342       return;
3343 
3344   // Create a new block containing the stride check.
3345   BB->setName("vector.scevcheck");
3346   auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
3347   // Update dominator tree immediately if the generated block is a
3348   // LoopBypassBlock because SCEV expansions to generate loop bypass
3349   // checks may query it before the current function is finished.
3350   DT->addNewBlock(NewBB, BB);
3351   if (L->getParentLoop())
3352     L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
3353   ReplaceInstWithInst(BB->getTerminator(),
3354                       BranchInst::Create(Bypass, NewBB, SCEVCheck));
3355   LoopBypassBlocks.push_back(BB);
3356   AddedSafetyChecks = true;
3357 }
3358 
3359 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
3360   BasicBlock *BB = L->getLoopPreheader();
3361 
3362   // Generate the code that checks in runtime if arrays overlap. We put the
3363   // checks into a separate block to make the more common case of few elements
3364   // faster.
3365   Instruction *FirstCheckInst;
3366   Instruction *MemRuntimeCheck;
3367   std::tie(FirstCheckInst, MemRuntimeCheck) =
3368       Legal->getLAI()->addRuntimeChecks(BB->getTerminator());
3369   if (!MemRuntimeCheck)
3370     return;
3371 
3372   // Create a new block containing the memory check.
3373   BB->setName("vector.memcheck");
3374   auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
3375   // Update dominator tree immediately if the generated block is a
3376   // LoopBypassBlock because SCEV expansions to generate loop bypass
3377   // checks may query it before the current function is finished.
3378   DT->addNewBlock(NewBB, BB);
3379   if (L->getParentLoop())
3380     L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
3381   ReplaceInstWithInst(BB->getTerminator(),
3382                       BranchInst::Create(Bypass, NewBB, MemRuntimeCheck));
3383   LoopBypassBlocks.push_back(BB);
3384   AddedSafetyChecks = true;
3385 
3386   // We currently don't use LoopVersioning for the actual loop cloning but we
3387   // still use it to add the noalias metadata.
3388   LVer = llvm::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
3389                                            PSE.getSE());
3390   LVer->prepareNoAliasMetadata();
3391 }
3392 
3393 void InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3394   /*
3395    In this function we generate a new loop. The new loop will contain
3396    the vectorized instructions while the old loop will continue to run the
3397    scalar remainder.
3398 
3399        [ ] <-- loop iteration number check.
3400     /   |
3401    /    v
3402   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
3403   |  /  |
3404   | /   v
3405   ||   [ ]     <-- vector pre header.
3406   |/    |
3407   |     v
3408   |    [  ] \
3409   |    [  ]_|   <-- vector loop.
3410   |     |
3411   |     v
3412   |   -[ ]   <--- middle-block.
3413   |  /  |
3414   | /   v
3415   -|- >[ ]     <--- new preheader.
3416    |    |
3417    |    v
3418    |   [ ] \
3419    |   [ ]_|   <-- old scalar loop to handle remainder.
3420     \   |
3421      \  v
3422       >[ ]     <-- exit block.
3423    ...
3424    */
3425 
3426   BasicBlock *OldBasicBlock = OrigLoop->getHeader();
3427   BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
3428   BasicBlock *ExitBlock = OrigLoop->getExitBlock();
3429   assert(VectorPH && "Invalid loop structure");
3430   assert(ExitBlock && "Must have an exit block");
3431 
3432   // Some loops have a single integer induction variable, while other loops
3433   // don't. One example is c++ iterators that often have multiple pointer
3434   // induction variables. In the code below we also support a case where we
3435   // don't have a single induction variable.
3436   //
3437   // We try to obtain an induction variable from the original loop as hard
3438   // as possible. However if we don't find one that:
3439   //   - is an integer
3440   //   - counts from zero, stepping by one
3441   //   - is the size of the widest induction variable type
3442   // then we create a new one.
3443   OldInduction = Legal->getPrimaryInduction();
3444   Type *IdxTy = Legal->getWidestInductionType();
3445 
3446   // Split the single block loop into the two loop structure described above.
3447   BasicBlock *VecBody =
3448       VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
3449   BasicBlock *MiddleBlock =
3450       VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
3451   BasicBlock *ScalarPH =
3452       MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
3453 
3454   // Create and register the new vector loop.
3455   Loop *Lp = new Loop();
3456   Loop *ParentLoop = OrigLoop->getParentLoop();
3457 
3458   // Insert the new loop into the loop nest and register the new basic blocks
3459   // before calling any utilities such as SCEV that require valid LoopInfo.
3460   if (ParentLoop) {
3461     ParentLoop->addChildLoop(Lp);
3462     ParentLoop->addBasicBlockToLoop(ScalarPH, *LI);
3463     ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI);
3464   } else {
3465     LI->addTopLevelLoop(Lp);
3466   }
3467   Lp->addBasicBlockToLoop(VecBody, *LI);
3468 
3469   // Find the loop boundaries.
3470   Value *Count = getOrCreateTripCount(Lp);
3471 
3472   Value *StartIdx = ConstantInt::get(IdxTy, 0);
3473 
3474   // We need to test whether the backedge-taken count is uint##_max. Adding one
3475   // to it will cause overflow and an incorrect loop trip count in the vector
3476   // body. In case of overflow we want to directly jump to the scalar remainder
3477   // loop.
3478   emitMinimumIterationCountCheck(Lp, ScalarPH);
3479   // Now, compare the new count to zero. If it is zero skip the vector loop and
3480   // jump to the scalar loop.
3481   emitVectorLoopEnteredCheck(Lp, ScalarPH);
3482   // Generate the code to check any assumptions that we've made for SCEV
3483   // expressions.
3484   emitSCEVChecks(Lp, ScalarPH);
3485 
3486   // Generate the code that checks in runtime if arrays overlap. We put the
3487   // checks into a separate block to make the more common case of few elements
3488   // faster.
3489   emitMemRuntimeChecks(Lp, ScalarPH);
3490 
3491   // Generate the induction variable.
3492   // The loop step is equal to the vectorization factor (num of SIMD elements)
3493   // times the unroll factor (num of SIMD instructions).
3494   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
3495   Constant *Step = ConstantInt::get(IdxTy, VF * UF);
3496   Induction =
3497       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
3498                               getDebugLocFromInstOrOperands(OldInduction));
3499 
3500   // We are going to resume the execution of the scalar loop.
3501   // Go over all of the induction variables that we found and fix the
3502   // PHIs that are left in the scalar version of the loop.
3503   // The starting values of PHI nodes depend on the counter of the last
3504   // iteration in the vectorized loop.
3505   // If we come from a bypass edge then we need to start from the original
3506   // start value.
3507 
3508   // This variable saves the new starting index for the scalar loop. It is used
3509   // to test if there are any tail iterations left once the vector loop has
3510   // completed.
3511   LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
3512   for (auto &InductionEntry : *List) {
3513     PHINode *OrigPhi = InductionEntry.first;
3514     InductionDescriptor II = InductionEntry.second;
3515 
3516     // Create phi nodes to merge from the  backedge-taken check block.
3517     PHINode *BCResumeVal = PHINode::Create(
3518         OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator());
3519     Value *&EndValue = IVEndValues[OrigPhi];
3520     if (OrigPhi == OldInduction) {
3521       // We know what the end value is.
3522       EndValue = CountRoundDown;
3523     } else {
3524       IRBuilder<> B(LoopBypassBlocks.back()->getTerminator());
3525       Type *StepType = II.getStep()->getType();
3526       Instruction::CastOps CastOp =
3527         CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
3528       Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
3529       const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
3530       EndValue = II.transform(B, CRD, PSE.getSE(), DL);
3531       EndValue->setName("ind.end");
3532     }
3533 
3534     // The new PHI merges the original incoming value, in case of a bypass,
3535     // or the value at the end of the vectorized loop.
3536     BCResumeVal->addIncoming(EndValue, MiddleBlock);
3537 
3538     // Fix the scalar body counter (PHI node).
3539     unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH);
3540 
3541     // The old induction's phi node in the scalar body needs the truncated
3542     // value.
3543     for (BasicBlock *BB : LoopBypassBlocks)
3544       BCResumeVal->addIncoming(II.getStartValue(), BB);
3545     OrigPhi->setIncomingValue(BlockIdx, BCResumeVal);
3546   }
3547 
3548   // Add a check in the middle block to see if we have completed
3549   // all of the iterations in the first vector loop.
3550   // If (N - N%VF) == N, then we *don't* need to run the remainder.
3551   Value *CmpN =
3552       CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
3553                       CountRoundDown, "cmp.n", MiddleBlock->getTerminator());
3554   ReplaceInstWithInst(MiddleBlock->getTerminator(),
3555                       BranchInst::Create(ExitBlock, ScalarPH, CmpN));
3556 
3557   // Get ready to start creating new instructions into the vectorized body.
3558   Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt());
3559 
3560   // Save the state.
3561   LoopVectorPreHeader = Lp->getLoopPreheader();
3562   LoopScalarPreHeader = ScalarPH;
3563   LoopMiddleBlock = MiddleBlock;
3564   LoopExitBlock = ExitBlock;
3565   LoopVectorBody = VecBody;
3566   LoopScalarBody = OldBasicBlock;
3567 
3568   // Keep all loop hints from the original loop on the vector loop (we'll
3569   // replace the vectorizer-specific hints below).
3570   if (MDNode *LID = OrigLoop->getLoopID())
3571     Lp->setLoopID(LID);
3572 
3573   LoopVectorizeHints Hints(Lp, true, *ORE);
3574   Hints.setAlreadyVectorized();
3575 }
3576 
3577 // Fix up external users of the induction variable. At this point, we are
3578 // in LCSSA form, with all external PHIs that use the IV having one input value,
3579 // coming from the remainder loop. We need those PHIs to also have a correct
3580 // value for the IV when arriving directly from the middle block.
3581 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3582                                        const InductionDescriptor &II,
3583                                        Value *CountRoundDown, Value *EndValue,
3584                                        BasicBlock *MiddleBlock) {
3585   // There are two kinds of external IV usages - those that use the value
3586   // computed in the last iteration (the PHI) and those that use the penultimate
3587   // value (the value that feeds into the phi from the loop latch).
3588   // We allow both, but they, obviously, have different values.
3589 
3590   assert(OrigLoop->getExitBlock() && "Expected a single exit block");
3591 
3592   DenseMap<Value *, Value *> MissingVals;
3593 
3594   // An external user of the last iteration's value should see the value that
3595   // the remainder loop uses to initialize its own IV.
3596   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3597   for (User *U : PostInc->users()) {
3598     Instruction *UI = cast<Instruction>(U);
3599     if (!OrigLoop->contains(UI)) {
3600       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3601       MissingVals[UI] = EndValue;
3602     }
3603   }
3604 
3605   // An external user of the penultimate value need to see EndValue - Step.
3606   // The simplest way to get this is to recompute it from the constituent SCEVs,
3607   // that is Start + (Step * (CRD - 1)).
3608   for (User *U : OrigPhi->users()) {
3609     auto *UI = cast<Instruction>(U);
3610     if (!OrigLoop->contains(UI)) {
3611       const DataLayout &DL =
3612           OrigLoop->getHeader()->getModule()->getDataLayout();
3613       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3614 
3615       IRBuilder<> B(MiddleBlock->getTerminator());
3616       Value *CountMinusOne = B.CreateSub(
3617           CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3618       Value *CMO =
3619           !II.getStep()->getType()->isIntegerTy()
3620               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3621                              II.getStep()->getType())
3622               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3623       CMO->setName("cast.cmo");
3624       Value *Escape = II.transform(B, CMO, PSE.getSE(), DL);
3625       Escape->setName("ind.escape");
3626       MissingVals[UI] = Escape;
3627     }
3628   }
3629 
3630   for (auto &I : MissingVals) {
3631     PHINode *PHI = cast<PHINode>(I.first);
3632     // One corner case we have to handle is two IVs "chasing" each-other,
3633     // that is %IV2 = phi [...], [ %IV1, %latch ]
3634     // In this case, if IV1 has an external use, we need to avoid adding both
3635     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3636     // don't already have an incoming value for the middle block.
3637     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3638       PHI->addIncoming(I.second, MiddleBlock);
3639   }
3640 }
3641 
3642 namespace {
3643 struct CSEDenseMapInfo {
3644   static bool canHandle(const Instruction *I) {
3645     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3646            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3647   }
3648   static inline Instruction *getEmptyKey() {
3649     return DenseMapInfo<Instruction *>::getEmptyKey();
3650   }
3651   static inline Instruction *getTombstoneKey() {
3652     return DenseMapInfo<Instruction *>::getTombstoneKey();
3653   }
3654   static unsigned getHashValue(const Instruction *I) {
3655     assert(canHandle(I) && "Unknown instruction!");
3656     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3657                                                            I->value_op_end()));
3658   }
3659   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3660     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3661         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3662       return LHS == RHS;
3663     return LHS->isIdenticalTo(RHS);
3664   }
3665 };
3666 }
3667 
3668 ///\brief Perform cse of induction variable instructions.
3669 static void cse(BasicBlock *BB) {
3670   // Perform simple cse.
3671   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3672   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3673     Instruction *In = &*I++;
3674 
3675     if (!CSEDenseMapInfo::canHandle(In))
3676       continue;
3677 
3678     // Check if we can replace this instruction with any of the
3679     // visited instructions.
3680     if (Instruction *V = CSEMap.lookup(In)) {
3681       In->replaceAllUsesWith(V);
3682       In->eraseFromParent();
3683       continue;
3684     }
3685 
3686     CSEMap[In] = In;
3687   }
3688 }
3689 
3690 /// \brief Estimate the overhead of scalarizing an instruction. This is a
3691 /// convenience wrapper for the type-based getScalarizationOverhead API.
3692 static unsigned getScalarizationOverhead(Instruction *I, unsigned VF,
3693                                          const TargetTransformInfo &TTI) {
3694   if (VF == 1)
3695     return 0;
3696 
3697   unsigned Cost = 0;
3698   Type *RetTy = ToVectorTy(I->getType(), VF);
3699   if (!RetTy->isVoidTy() &&
3700       (!isa<LoadInst>(I) ||
3701        !TTI.supportsEfficientVectorElementLoadStore()))
3702     Cost += TTI.getScalarizationOverhead(RetTy, true, false);
3703 
3704   if (CallInst *CI = dyn_cast<CallInst>(I)) {
3705     SmallVector<const Value *, 4> Operands(CI->arg_operands());
3706     Cost += TTI.getOperandsScalarizationOverhead(Operands, VF);
3707   }
3708   else if (!isa<StoreInst>(I) ||
3709            !TTI.supportsEfficientVectorElementLoadStore()) {
3710     SmallVector<const Value *, 4> Operands(I->operand_values());
3711     Cost += TTI.getOperandsScalarizationOverhead(Operands, VF);
3712   }
3713 
3714   return Cost;
3715 }
3716 
3717 // Estimate cost of a call instruction CI if it were vectorized with factor VF.
3718 // Return the cost of the instruction, including scalarization overhead if it's
3719 // needed. The flag NeedToScalarize shows if the call needs to be scalarized -
3720 // i.e. either vector version isn't available, or is too expensive.
3721 static unsigned getVectorCallCost(CallInst *CI, unsigned VF,
3722                                   const TargetTransformInfo &TTI,
3723                                   const TargetLibraryInfo *TLI,
3724                                   bool &NeedToScalarize) {
3725   Function *F = CI->getCalledFunction();
3726   StringRef FnName = CI->getCalledFunction()->getName();
3727   Type *ScalarRetTy = CI->getType();
3728   SmallVector<Type *, 4> Tys, ScalarTys;
3729   for (auto &ArgOp : CI->arg_operands())
3730     ScalarTys.push_back(ArgOp->getType());
3731 
3732   // Estimate cost of scalarized vector call. The source operands are assumed
3733   // to be vectors, so we need to extract individual elements from there,
3734   // execute VF scalar calls, and then gather the result into the vector return
3735   // value.
3736   unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
3737   if (VF == 1)
3738     return ScalarCallCost;
3739 
3740   // Compute corresponding vector type for return value and arguments.
3741   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3742   for (Type *ScalarTy : ScalarTys)
3743     Tys.push_back(ToVectorTy(ScalarTy, VF));
3744 
3745   // Compute costs of unpacking argument values for the scalar calls and
3746   // packing the return values to a vector.
3747   unsigned ScalarizationCost = getScalarizationOverhead(CI, VF, TTI);
3748 
3749   unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
3750 
3751   // If we can't emit a vector call for this function, then the currently found
3752   // cost is the cost we need to return.
3753   NeedToScalarize = true;
3754   if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin())
3755     return Cost;
3756 
3757   // If the corresponding vector cost is cheaper, return its cost.
3758   unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
3759   if (VectorCallCost < Cost) {
3760     NeedToScalarize = false;
3761     return VectorCallCost;
3762   }
3763   return Cost;
3764 }
3765 
3766 // Estimate cost of an intrinsic call instruction CI if it were vectorized with
3767 // factor VF.  Return the cost of the instruction, including scalarization
3768 // overhead if it's needed.
3769 static unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF,
3770                                        const TargetTransformInfo &TTI,
3771                                        const TargetLibraryInfo *TLI) {
3772   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3773   assert(ID && "Expected intrinsic call!");
3774 
3775   FastMathFlags FMF;
3776   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3777     FMF = FPMO->getFastMathFlags();
3778 
3779   SmallVector<Value *, 4> Operands(CI->arg_operands());
3780   return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF);
3781 }
3782 
3783 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3784   auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3785   auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3786   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3787 }
3788 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3789   auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3790   auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3791   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3792 }
3793 
3794 void InnerLoopVectorizer::truncateToMinimalBitwidths() {
3795   // For every instruction `I` in MinBWs, truncate the operands, create a
3796   // truncated version of `I` and reextend its result. InstCombine runs
3797   // later and will remove any ext/trunc pairs.
3798   //
3799   SmallPtrSet<Value *, 4> Erased;
3800   for (const auto &KV : Cost->getMinimalBitwidths()) {
3801     // If the value wasn't vectorized, we must maintain the original scalar
3802     // type. The absence of the value from VectorLoopValueMap indicates that it
3803     // wasn't vectorized.
3804     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3805       continue;
3806     for (unsigned Part = 0; Part < UF; ++Part) {
3807       Value *I = getOrCreateVectorValue(KV.first, Part);
3808       if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3809         continue;
3810       Type *OriginalTy = I->getType();
3811       Type *ScalarTruncatedTy =
3812           IntegerType::get(OriginalTy->getContext(), KV.second);
3813       Type *TruncatedTy = VectorType::get(ScalarTruncatedTy,
3814                                           OriginalTy->getVectorNumElements());
3815       if (TruncatedTy == OriginalTy)
3816         continue;
3817 
3818       IRBuilder<> B(cast<Instruction>(I));
3819       auto ShrinkOperand = [&](Value *V) -> Value * {
3820         if (auto *ZI = dyn_cast<ZExtInst>(V))
3821           if (ZI->getSrcTy() == TruncatedTy)
3822             return ZI->getOperand(0);
3823         return B.CreateZExtOrTrunc(V, TruncatedTy);
3824       };
3825 
3826       // The actual instruction modification depends on the instruction type,
3827       // unfortunately.
3828       Value *NewI = nullptr;
3829       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3830         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3831                              ShrinkOperand(BO->getOperand(1)));
3832 
3833         // Any wrapping introduced by shrinking this operation shouldn't be
3834         // considered undefined behavior. So, we can't unconditionally copy
3835         // arithmetic wrapping flags to NewI.
3836         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3837       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3838         NewI =
3839             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3840                          ShrinkOperand(CI->getOperand(1)));
3841       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3842         NewI = B.CreateSelect(SI->getCondition(),
3843                               ShrinkOperand(SI->getTrueValue()),
3844                               ShrinkOperand(SI->getFalseValue()));
3845       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3846         switch (CI->getOpcode()) {
3847         default:
3848           llvm_unreachable("Unhandled cast!");
3849         case Instruction::Trunc:
3850           NewI = ShrinkOperand(CI->getOperand(0));
3851           break;
3852         case Instruction::SExt:
3853           NewI = B.CreateSExtOrTrunc(
3854               CI->getOperand(0),
3855               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3856           break;
3857         case Instruction::ZExt:
3858           NewI = B.CreateZExtOrTrunc(
3859               CI->getOperand(0),
3860               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3861           break;
3862         }
3863       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3864         auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements();
3865         auto *O0 = B.CreateZExtOrTrunc(
3866             SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3867         auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements();
3868         auto *O1 = B.CreateZExtOrTrunc(
3869             SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3870 
3871         NewI = B.CreateShuffleVector(O0, O1, SI->getMask());
3872       } else if (isa<LoadInst>(I)) {
3873         // Don't do anything with the operands, just extend the result.
3874         continue;
3875       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3876         auto Elements = IE->getOperand(0)->getType()->getVectorNumElements();
3877         auto *O0 = B.CreateZExtOrTrunc(
3878             IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3879         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3880         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3881       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3882         auto Elements = EE->getOperand(0)->getType()->getVectorNumElements();
3883         auto *O0 = B.CreateZExtOrTrunc(
3884             EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3885         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3886       } else {
3887         llvm_unreachable("Unhandled instruction type!");
3888       }
3889 
3890       // Lastly, extend the result.
3891       NewI->takeName(cast<Instruction>(I));
3892       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3893       I->replaceAllUsesWith(Res);
3894       cast<Instruction>(I)->eraseFromParent();
3895       Erased.insert(I);
3896       VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3897     }
3898   }
3899 
3900   // We'll have created a bunch of ZExts that are now parentless. Clean up.
3901   for (const auto &KV : Cost->getMinimalBitwidths()) {
3902     // If the value wasn't vectorized, we must maintain the original scalar
3903     // type. The absence of the value from VectorLoopValueMap indicates that it
3904     // wasn't vectorized.
3905     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3906       continue;
3907     for (unsigned Part = 0; Part < UF; ++Part) {
3908       Value *I = getOrCreateVectorValue(KV.first, Part);
3909       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3910       if (Inst && Inst->use_empty()) {
3911         Value *NewI = Inst->getOperand(0);
3912         Inst->eraseFromParent();
3913         VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3914       }
3915     }
3916   }
3917 }
3918 
3919 void InnerLoopVectorizer::fixVectorizedLoop() {
3920   // Insert truncates and extends for any truncated instructions as hints to
3921   // InstCombine.
3922   if (VF > 1)
3923     truncateToMinimalBitwidths();
3924 
3925   // At this point every instruction in the original loop is widened to a
3926   // vector form. Now we need to fix the recurrences in the loop. These PHI
3927   // nodes are currently empty because we did not want to introduce cycles.
3928   // This is the second stage of vectorizing recurrences.
3929   fixCrossIterationPHIs();
3930 
3931   // Update the dominator tree.
3932   //
3933   // FIXME: After creating the structure of the new loop, the dominator tree is
3934   //        no longer up-to-date, and it remains that way until we update it
3935   //        here. An out-of-date dominator tree is problematic for SCEV,
3936   //        because SCEVExpander uses it to guide code generation. The
3937   //        vectorizer use SCEVExpanders in several places. Instead, we should
3938   //        keep the dominator tree up-to-date as we go.
3939   updateAnalysis();
3940 
3941   // Fix-up external users of the induction variables.
3942   for (auto &Entry : *Legal->getInductionVars())
3943     fixupIVUsers(Entry.first, Entry.second,
3944                  getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3945                  IVEndValues[Entry.first], LoopMiddleBlock);
3946 
3947   fixLCSSAPHIs();
3948   predicateInstructions();
3949 
3950   // Remove redundant induction instructions.
3951   cse(LoopVectorBody);
3952 }
3953 
3954 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3955   // In order to support recurrences we need to be able to vectorize Phi nodes.
3956   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3957   // stage #2: We now need to fix the recurrences by adding incoming edges to
3958   // the currently empty PHI nodes. At this point every instruction in the
3959   // original loop is widened to a vector form so we can use them to construct
3960   // the incoming edges.
3961   for (Instruction &I : *OrigLoop->getHeader()) {
3962     PHINode *Phi = dyn_cast<PHINode>(&I);
3963     if (!Phi)
3964       break;
3965     // Handle first-order recurrences and reductions that need to be fixed.
3966     if (Legal->isFirstOrderRecurrence(Phi))
3967       fixFirstOrderRecurrence(Phi);
3968     else if (Legal->isReductionVariable(Phi))
3969       fixReduction(Phi);
3970   }
3971 }
3972 
// Complete the vectorization of a first-order recurrence phi: wire up the
// incoming values of the widened phi, extract the scalar values needed by the
// remainder loop and by any LCSSA users, and redirect the original scalar phi.
void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {

  // This is the second phase of vectorizing first-order recurrences. An
  // overview of the transformation is described below. Suppose we have the
  // following loop.
  //
  //   for (int i = 0; i < n; ++i)
  //     b[i] = a[i] - a[i - 1];
  //
  // There is a first-order recurrence on "a". For this loop, the shorthand
  // scalar IR looks like:
  //
  //   scalar.ph:
  //     s_init = a[-1]
  //     br scalar.body
  //
  //   scalar.body:
  //     i = phi [0, scalar.ph], [i+1, scalar.body]
  //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
  //     s2 = a[i]
  //     b[i] = s2 - s1
  //     br cond, scalar.body, ...
  //
  // In this example, s1 is a recurrence because its value depends on the
  // previous iteration. In the first phase of vectorization, we created a
  // temporary value for s1. We now complete the vectorization and produce the
  // shorthand vector IR shown below (for VF = 4, UF = 1).
  //
  //   vector.ph:
  //     v_init = vector(..., ..., ..., a[-1])
  //     br vector.body
  //
  //   vector.body
  //     i = phi [0, vector.ph], [i+4, vector.body]
  //     v1 = phi [v_init, vector.ph], [v2, vector.body]
  //     v2 = a[i, i+1, i+2, i+3];
  //     v3 = vector(v1(3), v2(0, 1, 2))
  //     b[i, i+1, i+2, i+3] = v2 - v3
  //     br cond, vector.body, middle.block
  //
  //   middle.block:
  //     x = v2(3)
  //     br scalar.ph
  //
  //   scalar.ph:
  //     s_init = phi [x, middle.block], [a[-1], otherwise]
  //     br scalar.body
  //
  // After execution completes the vector loop, we extract the next value of
  // the recurrence (x) to use as the initial value in the scalar loop.

  // Get the original loop preheader and single loop latch.
  auto *Preheader = OrigLoop->getLoopPreheader();
  auto *Latch = OrigLoop->getLoopLatch();

  // Get the initial and previous values of the scalar recurrence.
  auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
  auto *Previous = Phi->getIncomingValueForBlock(Latch);

  // Create a vector from the initial value: the scalar init goes into the
  // last lane so the shuffle below rotates it into lane 0 of the first
  // iteration's value.
  auto *VectorInit = ScalarInit;
  if (VF > 1) {
    Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
    VectorInit = Builder.CreateInsertElement(
        UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
        Builder.getInt32(VF - 1), "vector.recur.init");
  }

  // We constructed a temporary phi node in the first phase of vectorization.
  // This phi node will eventually be deleted.
  Builder.SetInsertPoint(
      cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));

  // Create a phi node for the new recurrence. The current value will either be
  // the initial value inserted into a vector or loop-varying vector value.
  auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
  VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);

  // Get the vectorized previous value of the last part UF - 1. It appears last
  // among all unrolled iterations, due to the order of their construction.
  Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);

  // Set the insertion point after the previous value if it is an instruction.
  // Note that the previous value may have been constant-folded so it is not
  // guaranteed to be an instruction in the vector loop. Also, if the previous
  // value is a phi node, we should insert after all the phi nodes to avoid
  // breaking basic block verification.
  if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart) ||
      isa<PHINode>(PreviousLastPart))
    Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
  else
    Builder.SetInsertPoint(
        &*++BasicBlock::iterator(cast<Instruction>(PreviousLastPart)));

  // We will construct a vector for the recurrence by combining the values for
  // the current and previous iterations. This is the required shuffle mask:
  // take the last lane of the first vector, then the first VF-1 lanes of the
  // second.
  SmallVector<Constant *, 8> ShuffleMask(VF);
  ShuffleMask[0] = Builder.getInt32(VF - 1);
  for (unsigned I = 1; I < VF; ++I)
    ShuffleMask[I] = Builder.getInt32(I + VF - 1);

  // The vector from which to take the initial value for the current iteration
  // (actual or unrolled). Initially, this is the vector phi node.
  Value *Incoming = VecPhi;

  // Shuffle the current and previous vector and update the vector parts.
  for (unsigned Part = 0; Part < UF; ++Part) {
    Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
    Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
    auto *Shuffle =
        VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
                                             ConstantVector::get(ShuffleMask))
               : Incoming;
    // Replace and delete the temporary phi from phase one.
    PhiPart->replaceAllUsesWith(Shuffle);
    cast<Instruction>(PhiPart)->eraseFromParent();
    VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
    Incoming = PreviousPart;
  }

  // Fix the latch value of the new recurrence in the vector loop.
  VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());

  // Extract the last vector element in the middle block. This will be the
  // initial value for the recurrence when jumping to the scalar loop.
  auto *ExtractForScalar = Incoming;
  if (VF > 1) {
    Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
    ExtractForScalar = Builder.CreateExtractElement(
        ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
  }
  // Extract the second last element in the middle block if the
  // Phi is used outside the loop. We need to extract the phi itself
  // and not the last element (the phi update in the current iteration). This
  // will be the value when jumping to the exit block from the LoopMiddleBlock,
  // when the scalar loop is not run at all.
  Value *ExtractForPhiUsedOutsideLoop = nullptr;
  if (VF > 1)
    ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
        Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
  // When loop is unrolled without vectorizing, initialize
  // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of
  // `Incoming`. This is analogous to the vectorized case above: extracting the
  // second last element when VF > 1.
  else if (UF > 1)
    ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);

  // Fix the initial value of the original recurrence in the scalar loop:
  // merge the extracted value (coming from the middle block) with the
  // original scalar init (coming from any bypass edge).
  Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
  auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
  for (auto *BB : predecessors(LoopScalarPreHeader)) {
    auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
    Start->addIncoming(Incoming, BB);
  }

  // Point the scalar recurrence phi's preheader input at the merge phi
  // created above.
  Phi->setIncomingValue(Phi->getBasicBlockIndex(LoopScalarPreHeader), Start);
  Phi->setName("scalar.recur");

  // Finally, fix users of the recurrence outside the loop. The users will need
  // either the last value of the scalar recurrence or the last value of the
  // vector recurrence we extracted in the middle block. Since the loop is in
  // LCSSA form, we just need to find the phi node for the original scalar
  // recurrence in the exit block, and then add an edge for the middle block.
  for (auto &I : *LoopExitBlock) {
    auto *LCSSAPhi = dyn_cast<PHINode>(&I);
    if (!LCSSAPhi)
      break;
    if (LCSSAPhi->getIncomingValue(0) == Phi) {
      LCSSAPhi->addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
      break;
    }
  }
}
4145 
/// Fix up the vector loop-header phi created for the reduction rooted at
/// \p Phi, reduce the unrolled/vectorized parts to a single scalar after the
/// vector loop, and wire that result into the scalar remainder loop and the
/// exit block's LCSSA phis.
void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
  Constant *Zero = Builder.getInt32(0);

  // Get the reduction variable descriptor.
  assert(Legal->isReductionVariable(Phi) &&
         "Unable to find the reduction variable");
  RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi];

  RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
  TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
  Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
  RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
    RdxDesc.getMinMaxRecurrenceKind();
  setDebugLocFromInst(Builder, ReductionStartValue);

  // We need to generate a reduction vector from the incoming scalar.
  // To do so, we need to generate the 'identity' vector and override
  // one of the elements with the incoming scalar reduction. We need
  // to do it in the vector-loop preheader.
  Builder.SetInsertPoint(LoopBypassBlocks[1]->getTerminator());

  // This is the vector-clone of the value that leaves the loop.
  Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();

  // Find the reduction identity variable. Zero for addition, or, xor,
  // one for multiplication, -1 for And.
  Value *Identity;
  Value *VectorStart;
  if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
      RK == RecurrenceDescriptor::RK_FloatMinMax) {
    // MinMax reductions have the start value as their identity.
    if (VF == 1) {
      VectorStart = Identity = ReductionStartValue;
    } else {
      VectorStart = Identity =
        Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
    }
  } else {
    // Handle other reduction kinds:
    Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
        RK, VecTy->getScalarType());
    if (VF == 1) {
      Identity = Iden;
      // This vector is the Identity vector where the first element is the
      // incoming scalar reduction.
      VectorStart = ReductionStartValue;
    } else {
      Identity = ConstantVector::getSplat(VF, Iden);

      // This vector is the Identity vector where the first element is the
      // incoming scalar reduction.
      VectorStart =
        Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
    }
  }

  // Fix the vector-loop phi.

  // Reductions do not have to start at zero. They can start with
  // any loop invariant values.
  BasicBlock *Latch = OrigLoop->getLoopLatch();
  Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
  for (unsigned Part = 0; Part < UF; ++Part) {
    Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
    Value *Val = getOrCreateVectorValue(LoopVal, Part);
    // Make sure to add the reduction start value only to the
    // first unroll part; the remaining parts start at the identity.
    Value *StartVal = (Part == 0) ? VectorStart : Identity;
    cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
    cast<PHINode>(VecRdxPhi)
      ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
  }

  // Before each round, move the insertion point right between
  // the PHIs and the values we are going to write.
  // This allows us to write both PHINodes and the extractelement
  // instructions.
  Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());

  setDebugLocFromInst(Builder, LoopExitInst);

  // If the vector reduction can be performed in a smaller type, we truncate
  // then extend the loop exit value to enable InstCombine to evaluate the
  // entire expression in the smaller type.
  if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
    Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
    Builder.SetInsertPoint(LoopVectorBody->getTerminator());
    VectorParts RdxParts(UF);
    for (unsigned Part = 0; Part < UF; ++Part) {
      RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
      Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
      Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
                                        : Builder.CreateZExt(Trunc, VecTy);
      // Redirect all users of the part (except the trunc we just made) to
      // the extended value, so the trunc/extend pair wraps the expression.
      // Note the manual iterator advance: replaceUsesOfWith mutates the
      // use-list we are walking.
      for (Value::user_iterator UI = RdxParts[Part]->user_begin();
           UI != RdxParts[Part]->user_end();)
        if (*UI != Trunc) {
          (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
          RdxParts[Part] = Extnd;
        } else {
          ++UI;
        }
    }
    Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
    for (unsigned Part = 0; Part < UF; ++Part) {
      RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
      VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
    }
  }

  // Reduce all of the unrolled parts into a single vector.
  Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
  unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
  setDebugLocFromInst(Builder, ReducedPartRdx);
  for (unsigned Part = 1; Part < UF; ++Part) {
    Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
    if (Op != Instruction::ICmp && Op != Instruction::FCmp)
      // Floating point operations had to be 'fast' to enable the reduction.
      ReducedPartRdx = addFastMathFlag(
          Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
                              ReducedPartRdx, "bin.rdx"));
    else
      ReducedPartRdx = RecurrenceDescriptor::createMinMaxOp(
          Builder, MinMaxKind, ReducedPartRdx, RdxPart);
  }

  if (VF > 1) {
    bool NoNaN = Legal->hasFunNoNaNAttr();
    ReducedPartRdx =
        createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
    // If the reduction can be performed in a smaller type, we need to extend
    // the reduction to the wider type before we branch to the original loop.
    if (Phi->getType() != RdxDesc.getRecurrenceType())
      ReducedPartRdx =
        RdxDesc.isSigned()
        ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
        : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
  }

  // Create a phi node that merges control-flow from the backedge-taken check
  // block and the middle block.
  PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
                                        LoopScalarPreHeader->getTerminator());
  for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
    BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
  BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);

  // Now, we need to fix the users of the reduction variable
  // inside and outside of the scalar remainder loop.
  // We know that the loop is in LCSSA form. We need to update the
  // PHI nodes in the exit blocks.
  for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
         LEE = LoopExitBlock->end();
       LEI != LEE; ++LEI) {
    PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
    if (!LCSSAPhi)
      break;

    // All PHINodes need to have a single entry edge, or two if
    // we already fixed them.
    assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI");

    // We found a reduction value exit-PHI. Update it with the
    // incoming bypass edge.
    if (LCSSAPhi->getIncomingValue(0) == LoopExitInst)
      LCSSAPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
  } // end of the LCSSA phi scan.

  // Fix the scalar loop reduction variable with the incoming reduction sum
  // from the vector body and from the backedge value.
  int IncomingEdgeBlockIdx =
    Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
  assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
  // Pick the other block.
  int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
  Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
  Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
}
4323 
4324 void InnerLoopVectorizer::fixLCSSAPHIs() {
4325   for (Instruction &LEI : *LoopExitBlock) {
4326     auto *LCSSAPhi = dyn_cast<PHINode>(&LEI);
4327     if (!LCSSAPhi)
4328       break;
4329     if (LCSSAPhi->getNumIncomingValues() == 1) {
4330       assert(OrigLoop->isLoopInvariant(LCSSAPhi->getIncomingValue(0)) &&
4331              "Incoming value isn't loop invariant");
4332       LCSSAPhi->addIncoming(LCSSAPhi->getIncomingValue(0), LoopMiddleBlock);
4333     }
4334   }
4335 }
4336 
4337 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4338 
4339   // The basic block and loop containing the predicated instruction.
4340   auto *PredBB = PredInst->getParent();
4341   auto *VectorLoop = LI->getLoopFor(PredBB);
4342 
4343   // Initialize a worklist with the operands of the predicated instruction.
4344   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4345 
4346   // Holds instructions that we need to analyze again. An instruction may be
4347   // reanalyzed if we don't yet know if we can sink it or not.
4348   SmallVector<Instruction *, 8> InstsToReanalyze;
4349 
4350   // Returns true if a given use occurs in the predicated block. Phi nodes use
4351   // their operands in their corresponding predecessor blocks.
4352   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4353     auto *I = cast<Instruction>(U.getUser());
4354     BasicBlock *BB = I->getParent();
4355     if (auto *Phi = dyn_cast<PHINode>(I))
4356       BB = Phi->getIncomingBlock(
4357           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4358     return BB == PredBB;
4359   };
4360 
4361   // Iteratively sink the scalarized operands of the predicated instruction
4362   // into the block we created for it. When an instruction is sunk, it's
4363   // operands are then added to the worklist. The algorithm ends after one pass
4364   // through the worklist doesn't sink a single instruction.
4365   bool Changed;
4366   do {
4367 
4368     // Add the instructions that need to be reanalyzed to the worklist, and
4369     // reset the changed indicator.
4370     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4371     InstsToReanalyze.clear();
4372     Changed = false;
4373 
4374     while (!Worklist.empty()) {
4375       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4376 
4377       // We can't sink an instruction if it is a phi node, is already in the
4378       // predicated block, is not in the loop, or may have side effects.
4379       if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
4380           !VectorLoop->contains(I) || I->mayHaveSideEffects())
4381         continue;
4382 
4383       // It's legal to sink the instruction if all its uses occur in the
4384       // predicated block. Otherwise, there's nothing to do yet, and we may
4385       // need to reanalyze the instruction.
4386       if (!all_of(I->uses(), isBlockOfUsePredicated)) {
4387         InstsToReanalyze.push_back(I);
4388         continue;
4389       }
4390 
4391       // Move the instruction to the beginning of the predicated block, and add
4392       // it's operands to the worklist.
4393       I->moveBefore(&*PredBB->getFirstInsertionPt());
4394       Worklist.insert(I->op_begin(), I->op_end());
4395 
4396       // The sinking may have enabled other instructions to be sunk, so we will
4397       // need to iterate.
4398       Changed = true;
4399     }
4400   } while (Changed);
4401 }
4402 
void InnerLoopVectorizer::predicateInstructions() {

  // For each instruction I marked for predication on value C, split I into its
  // own basic block to form an if-then construct over C. Since I may be fed by
  // an extractelement instruction or other scalar operand, we try to
  // iteratively sink its scalar operands into the predicated block. If I feeds
  // an insertelement instruction, we try to move this instruction into the
  // predicated block as well. For non-void types, a phi node will be created
  // for the resulting value (either vector or scalar).
  //
  // So for some predicated instruction, e.g. the conditional sdiv in:
  //
  // for.body:
  //  ...
  //  %add = add nsw i32 %mul, %0
  //  %cmp5 = icmp sgt i32 %2, 7
  //  br i1 %cmp5, label %if.then, label %if.end
  //
  // if.then:
  //  %div = sdiv i32 %0, %1
  //  br label %if.end
  //
  // if.end:
  //  %x.0 = phi i32 [ %div, %if.then ], [ %add, %for.body ]
  //
  // the sdiv at this point is scalarized and if-converted using a select.
  // The inactive elements in the vector are not used, but the predicated
  // instruction is still executed for all vector elements, essentially:
  //
  // vector.body:
  //  ...
  //  %17 = add nsw <2 x i32> %16, %wide.load
  //  %29 = extractelement <2 x i32> %wide.load, i32 0
  //  %30 = extractelement <2 x i32> %wide.load51, i32 0
  //  %31 = sdiv i32 %29, %30
  //  %32 = insertelement <2 x i32> undef, i32 %31, i32 0
  //  %35 = extractelement <2 x i32> %wide.load, i32 1
  //  %36 = extractelement <2 x i32> %wide.load51, i32 1
  //  %37 = sdiv i32 %35, %36
  //  %38 = insertelement <2 x i32> %32, i32 %37, i32 1
  //  %predphi = select <2 x i1> %26, <2 x i32> %38, <2 x i32> %17
  //
  // Predication will now re-introduce the original control flow to avoid false
  // side-effects by the sdiv instructions on the inactive elements, yielding
  // (after cleanup):
  //
  // vector.body:
  //  ...
  //  %5 = add nsw <2 x i32> %4, %wide.load
  //  %8 = icmp sgt <2 x i32> %wide.load52, <i32 7, i32 7>
  //  %9 = extractelement <2 x i1> %8, i32 0
  //  br i1 %9, label %pred.sdiv.if, label %pred.sdiv.continue
  //
  // pred.sdiv.if:
  //  %10 = extractelement <2 x i32> %wide.load, i32 0
  //  %11 = extractelement <2 x i32> %wide.load51, i32 0
  //  %12 = sdiv i32 %10, %11
  //  %13 = insertelement <2 x i32> undef, i32 %12, i32 0
  //  br label %pred.sdiv.continue
  //
  // pred.sdiv.continue:
  //  %14 = phi <2 x i32> [ undef, %vector.body ], [ %13, %pred.sdiv.if ]
  //  %15 = extractelement <2 x i1> %8, i32 1
  //  br i1 %15, label %pred.sdiv.if54, label %pred.sdiv.continue55
  //
  // pred.sdiv.if54:
  //  %16 = extractelement <2 x i32> %wide.load, i32 1
  //  %17 = extractelement <2 x i32> %wide.load51, i32 1
  //  %18 = sdiv i32 %16, %17
  //  %19 = insertelement <2 x i32> %14, i32 %18, i32 1
  //  br label %pred.sdiv.continue55
  //
  // pred.sdiv.continue55:
  //  %20 = phi <2 x i32> [ %14, %pred.sdiv.continue ], [ %19, %pred.sdiv.if54 ]
  //  %predphi = select <2 x i1> %8, <2 x i32> %20, <2 x i32> %5

  // KV.first is the instruction to predicate; KV.second is the i1 condition
  // guarding its execution.
  for (auto KV : PredicatedInstructions) {
    BasicBlock::iterator I(KV.first);
    BasicBlock *Head = I->getParent();
    // Split Head at I, inserting an if-then over the condition. T is the
    // terminator of the new (empty) 'then' block; move I into that block.
    auto *T = SplitBlockAndInsertIfThen(KV.second, &*I, /*Unreachable=*/false,
                                        /*BranchWeights=*/nullptr, DT, LI);
    I->moveBefore(T);
    sinkScalarOperands(&*I);

    // Name the new blocks after the predicated opcode, e.g. "pred.sdiv.if".
    BasicBlock *PredicatedBlock = I->getParent();
    Twine BBNamePrefix = Twine("pred.") + I->getOpcodeName();
    PredicatedBlock->setName(BBNamePrefix + ".if");
    PredicatedBlock->getSingleSuccessor()->setName(BBNamePrefix + ".continue");

    // If the instruction is non-void create a Phi node at reconvergence point.
    if (!I->getType()->isVoidTy()) {
      Value *IncomingTrue = nullptr;
      Value *IncomingFalse = nullptr;

      if (I->hasOneUse() && isa<InsertElementInst>(*I->user_begin())) {
        // If the predicated instruction is feeding an insert-element, move it
        // into the Then block; Phi node will be created for the vector.
        InsertElementInst *IEI = cast<InsertElementInst>(*I->user_begin());
        IEI->moveBefore(T);
        IncomingTrue = IEI; // the new vector with the inserted element.
        IncomingFalse = IEI->getOperand(0); // the unmodified vector
      } else {
        // Phi node will be created for the scalar predicated instruction.
        IncomingTrue = &*I;
        IncomingFalse = UndefValue::get(I->getType());
      }

      // Merge the two incoming values at the continue block and redirect all
      // existing users of the true value to the phi. Note the RAUW happens
      // before addIncoming, so the phi's own operand is not rewritten.
      BasicBlock *PostDom = I->getParent()->getSingleSuccessor();
      assert(PostDom && "Then block has multiple successors");
      PHINode *Phi =
          PHINode::Create(IncomingTrue->getType(), 2, "", &PostDom->front());
      IncomingTrue->replaceAllUsesWith(Phi);
      Phi->addIncoming(IncomingFalse, Head);
      Phi->addIncoming(IncomingTrue, I->getParent());
    }
  }

  DEBUG(DT->verifyDomTree());
}
4522 
4523 InnerLoopVectorizer::VectorParts
4524 InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
4525   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
4526 
4527   // Look for cached value.
4528   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
4529   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
4530   if (ECEntryIt != EdgeMaskCache.end())
4531     return ECEntryIt->second;
4532 
4533   VectorParts SrcMask = createBlockInMask(Src);
4534 
4535   // The terminator has to be a branch inst!
4536   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
4537   assert(BI && "Unexpected terminator found");
4538 
4539   if (BI->isConditional()) {
4540 
4541     VectorParts EdgeMask(UF);
4542     for (unsigned Part = 0; Part < UF; ++Part) {
4543       auto *EdgeMaskPart = getOrCreateVectorValue(BI->getCondition(), Part);
4544       if (BI->getSuccessor(0) != Dst)
4545         EdgeMaskPart = Builder.CreateNot(EdgeMaskPart);
4546 
4547       EdgeMaskPart = Builder.CreateAnd(EdgeMaskPart, SrcMask[Part]);
4548       EdgeMask[Part] = EdgeMaskPart;
4549     }
4550 
4551     EdgeMaskCache[Edge] = EdgeMask;
4552     return EdgeMask;
4553   }
4554 
4555   EdgeMaskCache[Edge] = SrcMask;
4556   return SrcMask;
4557 }
4558 
4559 InnerLoopVectorizer::VectorParts
4560 InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) {
4561   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
4562 
4563   // Look for cached value.
4564   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
4565   if (BCEntryIt != BlockMaskCache.end())
4566     return BCEntryIt->second;
4567 
4568   VectorParts BlockMask(UF);
4569 
4570   // Loop incoming mask is all-one.
4571   if (OrigLoop->getHeader() == BB) {
4572     Value *C = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 1);
4573     for (unsigned Part = 0; Part < UF; ++Part)
4574       BlockMask[Part] = getOrCreateVectorValue(C, Part);
4575     BlockMaskCache[BB] = BlockMask;
4576     return BlockMask;
4577   }
4578 
4579   // This is the block mask. We OR all incoming edges, and with zero.
4580   Value *Zero = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 0);
4581   for (unsigned Part = 0; Part < UF; ++Part)
4582     BlockMask[Part] = getOrCreateVectorValue(Zero, Part);
4583 
4584   // For each pred:
4585   for (pred_iterator It = pred_begin(BB), E = pred_end(BB); It != E; ++It) {
4586     VectorParts EM = createEdgeMask(*It, BB);
4587     for (unsigned Part = 0; Part < UF; ++Part)
4588       BlockMask[Part] = Builder.CreateOr(BlockMask[Part], EM[Part]);
4589   }
4590 
4591   BlockMaskCache[BB] = BlockMask;
4592   return BlockMask;
4593 }
4594 
// Widen the phi \p PN into UF vector (or scalar, if VF == 1) values.
// NOTE(review): the UF and VF parameters shadow the class members of the same
// name; all uses below refer to the parameters.
void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
                                              unsigned VF) {
  PHINode *P = cast<PHINode>(PN);
  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #1: We create a new vector PHI node with no incoming edges. We'll use
  // this value when we vectorize all of the instructions that use the PHI.
  if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
    for (unsigned Part = 0; Part < UF; ++Part) {
      // This is phase one of vectorizing PHIs.
      Type *VecTy =
          (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
      // The incoming edges are filled in later (e.g. by fixReduction).
      Value *EntryPart = PHINode::Create(
          VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
      VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
    }
    return;
  }

  setDebugLocFromInst(Builder, P);
  // Check for PHI nodes that are lowered to vector selects.
  if (P->getParent() != OrigLoop->getHeader()) {
    // We know that all PHIs in non-header blocks are converted into
    // selects, so we don't have to worry about the insertion order and we
    // can just use the builder.
    // At this point we generate the predication tree. There may be
    // duplications since this is a simple recursive scan, but future
    // optimizations will clean it up.

    unsigned NumIncoming = P->getNumIncomingValues();

    // Generate a sequence of selects of the form:
    // SELECT(Mask3, In3,
    //      SELECT(Mask2, In2,
    //                   ( ...)))
    VectorParts Entry(UF);
    for (unsigned In = 0; In < NumIncoming; In++) {
      VectorParts Cond =
          createEdgeMask(P->getIncomingBlock(In), P->getParent());

      for (unsigned Part = 0; Part < UF; ++Part) {
        Value *In0 = getOrCreateVectorValue(P->getIncomingValue(In), Part);
        // We might have single edge PHIs (blocks) - use an identity
        // 'select' for the first PHI operand.
        if (In == 0)
          Entry[Part] = Builder.CreateSelect(Cond[Part], In0, In0);
        else
          // Select between the current value and the previous incoming edge
          // based on the incoming mask.
          Entry[Part] = Builder.CreateSelect(Cond[Part], In0, Entry[Part],
                                             "predphi");
      }
    }
    for (unsigned Part = 0; Part < UF; ++Part)
      VectorLoopValueMap.setVectorValue(P, Part, Entry[Part]);
    return;
  }

  // This PHINode must be an induction variable.
  // Make sure that we know about it.
  assert(Legal->getInductionVars()->count(P) && "Not an induction variable");

  InductionDescriptor II = Legal->getInductionVars()->lookup(P);
  const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();

  // FIXME: The newly created binary instructions should contain nsw/nuw flags,
  // which can be found from the original scalar operations.
  switch (II.getKind()) {
  case InductionDescriptor::IK_NoInduction:
    llvm_unreachable("Unknown induction");
  case InductionDescriptor::IK_IntInduction:
  case InductionDescriptor::IK_FpInduction:
    // Integer and FP inductions share a dedicated widening path.
    return widenIntOrFpInduction(P);
  case InductionDescriptor::IK_PtrInduction: {
    // Handle the pointer induction variable case.
    assert(P->getType()->isPointerTy() && "Unexpected type.");
    // This is the normalized GEP that starts counting at zero.
    Value *PtrInd = Induction;
    PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
    // Determine the number of scalars we need to generate for each unroll
    // iteration. If the instruction is uniform, we only need to generate the
    // first lane. Otherwise, we generate all VF values.
    unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
    // These are the scalar results. Notice that we don't generate vector GEPs
    // because scalar GEPs result in better code.
    for (unsigned Part = 0; Part < UF; ++Part) {
      for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
        // Lane's absolute index within the unrolled vector iteration.
        Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
        Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
        Value *SclrGep = II.transform(Builder, GlobalIdx, PSE.getSE(), DL);
        SclrGep->setName("next.gep");
        VectorLoopValueMap.setScalarValue(P, Part, Lane, SclrGep);
      }
    }
    return;
  }
  }
}
4693 
4694 /// A helper function for checking whether an integer division-related
4695 /// instruction may divide by zero (in which case it must be predicated if
4696 /// executed conditionally in the scalar code).
4697 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
/// Non-zero divisors that are not compile-time constants will not be
4699 /// converted into multiplication, so we will still end up scalarizing
4700 /// the division, but can do so w/o predication.
4701 static bool mayDivideByZero(Instruction &I) {
4702   assert((I.getOpcode() == Instruction::UDiv ||
4703           I.getOpcode() == Instruction::SDiv ||
4704           I.getOpcode() == Instruction::URem ||
4705           I.getOpcode() == Instruction::SRem) &&
4706          "Unexpected instruction");
4707   Value *Divisor = I.getOperand(1);
4708   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4709   return !CInt || CInt->isZero();
4710 }
4711 
4712 void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) {
4713   // Scalarize instructions that should remain scalar after vectorization.
4714   if (VF > 1 &&
4715       !(isa<BranchInst>(&I) || isa<PHINode>(&I) || isa<DbgInfoIntrinsic>(&I)) &&
4716       shouldScalarizeInstruction(&I)) {
4717     scalarizeInstruction(&I, Legal->isScalarWithPredication(&I));
4718     return;
4719   }
4720 
4721   switch (I.getOpcode()) {
4722   case Instruction::Br:
4723     // Nothing to do for PHIs and BR, since we already took care of the
4724     // loop control flow instructions.
4725     break;
4726   case Instruction::PHI: {
4727     // Vectorize PHINodes.
4728     widenPHIInstruction(&I, UF, VF);
4729     break;
4730   } // End of PHI.
4731   case Instruction::GetElementPtr: {
4732     // Construct a vector GEP by widening the operands of the scalar GEP as
4733     // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4734     // results in a vector of pointers when at least one operand of the GEP
4735     // is vector-typed. Thus, to keep the representation compact, we only use
4736     // vector-typed operands for loop-varying values.
4737     auto *GEP = cast<GetElementPtrInst>(&I);
4738 
4739     if (VF > 1 && OrigLoop->hasLoopInvariantOperands(GEP)) {
4740       // If we are vectorizing, but the GEP has only loop-invariant operands,
4741       // the GEP we build (by only using vector-typed operands for
4742       // loop-varying values) would be a scalar pointer. Thus, to ensure we
4743       // produce a vector of pointers, we need to either arbitrarily pick an
4744       // operand to broadcast, or broadcast a clone of the original GEP.
4745       // Here, we broadcast a clone of the original.
4746       //
4747       // TODO: If at some point we decide to scalarize instructions having
4748       //       loop-invariant operands, this special case will no longer be
4749       //       required. We would add the scalarization decision to
4750       //       collectLoopScalars() and teach getVectorValue() to broadcast
4751       //       the lane-zero scalar value.
4752       auto *Clone = Builder.Insert(GEP->clone());
4753       for (unsigned Part = 0; Part < UF; ++Part) {
4754         Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4755         VectorLoopValueMap.setVectorValue(&I, Part, EntryPart);
4756         addMetadata(EntryPart, GEP);
4757       }
4758     } else {
4759       // If the GEP has at least one loop-varying operand, we are sure to
4760       // produce a vector of pointers. But if we are only unrolling, we want
4761       // to produce a scalar GEP for each unroll part. Thus, the GEP we
4762       // produce with the code below will be scalar (if VF == 1) or vector
4763       // (otherwise). Note that for the unroll-only case, we still maintain
4764       // values in the vector mapping with initVector, as we do for other
4765       // instructions.
4766       for (unsigned Part = 0; Part < UF; ++Part) {
4767 
4768         // The pointer operand of the new GEP. If it's loop-invariant, we
4769         // won't broadcast it.
4770         auto *Ptr =
4771             OrigLoop->isLoopInvariant(GEP->getPointerOperand())
4772                 ? GEP->getPointerOperand()
4773                 : getOrCreateVectorValue(GEP->getPointerOperand(), Part);
4774 
4775         // Collect all the indices for the new GEP. If any index is
4776         // loop-invariant, we won't broadcast it.
4777         SmallVector<Value *, 4> Indices;
4778         for (auto &U : make_range(GEP->idx_begin(), GEP->idx_end())) {
4779           if (OrigLoop->isLoopInvariant(U.get()))
4780             Indices.push_back(U.get());
4781           else
4782             Indices.push_back(getOrCreateVectorValue(U.get(), Part));
4783         }
4784 
4785         // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4786         // but it should be a vector, otherwise.
4787         auto *NewGEP = GEP->isInBounds()
4788                            ? Builder.CreateInBoundsGEP(Ptr, Indices)
4789                            : Builder.CreateGEP(Ptr, Indices);
4790         assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
4791                "NewGEP is not a pointer vector");
4792         VectorLoopValueMap.setVectorValue(&I, Part, NewGEP);
4793         addMetadata(NewGEP, GEP);
4794       }
4795     }
4796 
4797     break;
4798   }
4799   case Instruction::UDiv:
4800   case Instruction::SDiv:
4801   case Instruction::SRem:
4802   case Instruction::URem:
4803     // Scalarize with predication if this instruction may divide by zero and
4804     // block execution is conditional, otherwise fallthrough.
4805     if (Legal->isScalarWithPredication(&I)) {
4806       scalarizeInstruction(&I, true);
4807       break;
4808     }
4809     LLVM_FALLTHROUGH;
4810   case Instruction::Add:
4811   case Instruction::FAdd:
4812   case Instruction::Sub:
4813   case Instruction::FSub:
4814   case Instruction::Mul:
4815   case Instruction::FMul:
4816   case Instruction::FDiv:
4817   case Instruction::FRem:
4818   case Instruction::Shl:
4819   case Instruction::LShr:
4820   case Instruction::AShr:
4821   case Instruction::And:
4822   case Instruction::Or:
4823   case Instruction::Xor: {
4824     // Just widen binops.
4825     auto *BinOp = cast<BinaryOperator>(&I);
4826     setDebugLocFromInst(Builder, BinOp);
4827 
4828     for (unsigned Part = 0; Part < UF; ++Part) {
4829       Value *A = getOrCreateVectorValue(BinOp->getOperand(0), Part);
4830       Value *B = getOrCreateVectorValue(BinOp->getOperand(1), Part);
4831       Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A, B);
4832 
4833       if (BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V))
4834         VecOp->copyIRFlags(BinOp);
4835 
4836       // Use this vector value for all users of the original instruction.
4837       VectorLoopValueMap.setVectorValue(&I, Part, V);
4838       addMetadata(V, BinOp);
4839     }
4840 
4841     break;
4842   }
4843   case Instruction::Select: {
4844     // Widen selects.
4845     // If the selector is loop invariant we can create a select
4846     // instruction with a scalar condition. Otherwise, use vector-select.
4847     auto *SE = PSE.getSE();
4848     bool InvariantCond =
4849         SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop);
4850     setDebugLocFromInst(Builder, &I);
4851 
4852     // The condition can be loop invariant  but still defined inside the
4853     // loop. This means that we can't just use the original 'cond' value.
4854     // We have to take the 'vectorized' value and pick the first lane.
4855     // Instcombine will make this a no-op.
4856 
4857     auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), 0, 0);
4858 
4859     for (unsigned Part = 0; Part < UF; ++Part) {
4860       Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part);
4861       Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part);
4862       Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part);
4863       Value *Sel =
4864           Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1);
4865       VectorLoopValueMap.setVectorValue(&I, Part, Sel);
4866       addMetadata(Sel, &I);
4867     }
4868 
4869     break;
4870   }
4871 
4872   case Instruction::ICmp:
4873   case Instruction::FCmp: {
4874     // Widen compares. Generate vector compares.
4875     bool FCmp = (I.getOpcode() == Instruction::FCmp);
4876     auto *Cmp = dyn_cast<CmpInst>(&I);
4877     setDebugLocFromInst(Builder, Cmp);
4878     for (unsigned Part = 0; Part < UF; ++Part) {
4879       Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part);
4880       Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part);
4881       Value *C = nullptr;
4882       if (FCmp) {
4883         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4884         cast<FCmpInst>(C)->copyFastMathFlags(Cmp);
4885       } else {
4886         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4887       }
4888       VectorLoopValueMap.setVectorValue(&I, Part, C);
4889       addMetadata(C, &I);
4890     }
4891 
4892     break;
4893   }
4894 
4895   case Instruction::Store:
4896   case Instruction::Load:
4897     vectorizeMemoryInstruction(&I);
4898     break;
4899   case Instruction::ZExt:
4900   case Instruction::SExt:
4901   case Instruction::FPToUI:
4902   case Instruction::FPToSI:
4903   case Instruction::FPExt:
4904   case Instruction::PtrToInt:
4905   case Instruction::IntToPtr:
4906   case Instruction::SIToFP:
4907   case Instruction::UIToFP:
4908   case Instruction::Trunc:
4909   case Instruction::FPTrunc:
4910   case Instruction::BitCast: {
4911     auto *CI = dyn_cast<CastInst>(&I);
4912     setDebugLocFromInst(Builder, CI);
4913 
4914     // Optimize the special case where the source is a constant integer
4915     // induction variable. Notice that we can only optimize the 'trunc' case
4916     // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
4917     // (c) other casts depend on pointer size.
4918     if (Cost->isOptimizableIVTruncate(CI, VF)) {
4919       widenIntOrFpInduction(cast<PHINode>(CI->getOperand(0)),
4920                             cast<TruncInst>(CI));
4921       break;
4922     }
4923 
4924     /// Vectorize casts.
4925     Type *DestTy =
4926         (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
4927 
4928     for (unsigned Part = 0; Part < UF; ++Part) {
4929       Value *A = getOrCreateVectorValue(CI->getOperand(0), Part);
4930       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4931       VectorLoopValueMap.setVectorValue(&I, Part, Cast);
4932       addMetadata(Cast, &I);
4933     }
4934     break;
4935   }
4936 
4937   case Instruction::Call: {
4938     // Ignore dbg intrinsics.
4939     if (isa<DbgInfoIntrinsic>(I))
4940       break;
4941     setDebugLocFromInst(Builder, &I);
4942 
4943     Module *M = I.getParent()->getParent()->getParent();
4944     auto *CI = cast<CallInst>(&I);
4945 
4946     StringRef FnName = CI->getCalledFunction()->getName();
4947     Function *F = CI->getCalledFunction();
4948     Type *RetTy = ToVectorTy(CI->getType(), VF);
4949     SmallVector<Type *, 4> Tys;
4950     for (Value *ArgOperand : CI->arg_operands())
4951       Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
4952 
4953     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4954     if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
4955                ID == Intrinsic::lifetime_start)) {
4956       scalarizeInstruction(&I);
4957       break;
4958     }
4959     // The flag shows whether we use Intrinsic or a usual Call for vectorized
4960     // version of the instruction.
4961     // Is it beneficial to perform intrinsic call compared to lib call?
4962     bool NeedToScalarize;
4963     unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize);
4964     bool UseVectorIntrinsic =
4965         ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost;
4966     if (!UseVectorIntrinsic && NeedToScalarize) {
4967       scalarizeInstruction(&I);
4968       break;
4969     }
4970 
4971     for (unsigned Part = 0; Part < UF; ++Part) {
4972       SmallVector<Value *, 4> Args;
4973       for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
4974         Value *Arg = CI->getArgOperand(i);
4975         // Some intrinsics have a scalar argument - don't replace it with a
4976         // vector.
4977         if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i))
4978           Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part);
4979         Args.push_back(Arg);
4980       }
4981 
4982       Function *VectorF;
4983       if (UseVectorIntrinsic) {
4984         // Use vector version of the intrinsic.
4985         Type *TysForDecl[] = {CI->getType()};
4986         if (VF > 1)
4987           TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4988         VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4989       } else {
4990         // Use vector version of the library call.
4991         StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
4992         assert(!VFnName.empty() && "Vector function name is empty.");
4993         VectorF = M->getFunction(VFnName);
4994         if (!VectorF) {
4995           // Generate a declaration
4996           FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
4997           VectorF =
4998               Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
4999           VectorF->copyAttributesFrom(F);
5000         }
5001       }
5002       assert(VectorF && "Can't create vector function.");
5003 
5004       SmallVector<OperandBundleDef, 1> OpBundles;
5005       CI->getOperandBundlesAsDefs(OpBundles);
5006       CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
5007 
5008       if (isa<FPMathOperator>(V))
5009         V->copyFastMathFlags(CI);
5010 
5011       VectorLoopValueMap.setVectorValue(&I, Part, V);
5012       addMetadata(V, &I);
5013     }
5014 
5015     break;
5016   }
5017 
5018   default:
5019     // All other instructions are unsupported. Scalarize them.
5020     scalarizeInstruction(&I);
5021     break;
5022   } // end of switch.
5023 }
5024 
5025 void InnerLoopVectorizer::updateAnalysis() {
5026   // Forget the original basic block.
5027   PSE.getSE()->forgetLoop(OrigLoop);
5028 
5029   // Update the dominator tree information.
5030   assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
5031          "Entry does not dominate exit.");
5032 
5033   DT->addNewBlock(LI->getLoopFor(LoopVectorBody)->getHeader(),
5034                   LoopVectorPreHeader);
5035   DT->addNewBlock(LoopMiddleBlock,
5036                   LI->getLoopFor(LoopVectorBody)->getLoopLatch());
5037   DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
5038   DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
5039   DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
5040 
5041   DEBUG(DT->verifyDomTree());
5042 }
5043 
5044 /// \brief Check whether it is safe to if-convert this phi node.
5045 ///
5046 /// Phi nodes with constant expressions that can trap are not safe to if
5047 /// convert.
5048 static bool canIfConvertPHINodes(BasicBlock *BB) {
5049   for (Instruction &I : *BB) {
5050     auto *Phi = dyn_cast<PHINode>(&I);
5051     if (!Phi)
5052       return true;
5053     for (Value *V : Phi->incoming_values())
5054       if (auto *C = dyn_cast<Constant>(V))
5055         if (C->canTrap())
5056           return false;
5057   }
5058   return true;
5059 }
5060 
5061 bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
5062   if (!EnableIfConversion) {
5063     ORE->emit(createMissedAnalysis("IfConversionDisabled")
5064               << "if-conversion is disabled");
5065     return false;
5066   }
5067 
5068   assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");
5069 
5070   // A list of pointers that we can safely read and write to.
5071   SmallPtrSet<Value *, 8> SafePointes;
5072 
5073   // Collect safe addresses.
5074   for (BasicBlock *BB : TheLoop->blocks()) {
5075     if (blockNeedsPredication(BB))
5076       continue;
5077 
5078     for (Instruction &I : *BB)
5079       if (auto *Ptr = getPointerOperand(&I))
5080         SafePointes.insert(Ptr);
5081   }
5082 
5083   // Collect the blocks that need predication.
5084   BasicBlock *Header = TheLoop->getHeader();
5085   for (BasicBlock *BB : TheLoop->blocks()) {
5086     // We don't support switch statements inside loops.
5087     if (!isa<BranchInst>(BB->getTerminator())) {
5088       ORE->emit(createMissedAnalysis("LoopContainsSwitch", BB->getTerminator())
5089                 << "loop contains a switch statement");
5090       return false;
5091     }
5092 
5093     // We must be able to predicate all blocks that need to be predicated.
5094     if (blockNeedsPredication(BB)) {
5095       if (!blockCanBePredicated(BB, SafePointes)) {
5096         ORE->emit(createMissedAnalysis("NoCFGForSelect", BB->getTerminator())
5097                   << "control flow cannot be substituted for a select");
5098         return false;
5099       }
5100     } else if (BB != Header && !canIfConvertPHINodes(BB)) {
5101       ORE->emit(createMissedAnalysis("NoCFGForSelect", BB->getTerminator())
5102                 << "control flow cannot be substituted for a select");
5103       return false;
5104     }
5105   }
5106 
5107   // We can if-convert this loop.
5108   return true;
5109 }
5110 
5111 bool LoopVectorizationLegality::canVectorize() {
5112   // Store the result and return it at the end instead of exiting early, in case
5113   // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
5114   bool Result = true;
5115   // We must have a loop in canonical form. Loops with indirectbr in them cannot
5116   // be canonicalized.
5117   if (!TheLoop->getLoopPreheader()) {
5118     ORE->emit(createMissedAnalysis("CFGNotUnderstood")
5119               << "loop control flow is not understood by vectorizer");
5120     if (ORE->allowExtraAnalysis())
5121       Result = false;
5122     else
5123       return false;
5124   }
5125 
5126   // FIXME: The code is currently dead, since the loop gets sent to
5127   // LoopVectorizationLegality is already an innermost loop.
5128   //
5129   // We can only vectorize innermost loops.
5130   if (!TheLoop->empty()) {
5131     ORE->emit(createMissedAnalysis("NotInnermostLoop")
5132               << "loop is not the innermost loop");
5133     if (ORE->allowExtraAnalysis())
5134       Result = false;
5135     else
5136       return false;
5137   }
5138 
5139   // We must have a single backedge.
5140   if (TheLoop->getNumBackEdges() != 1) {
5141     ORE->emit(createMissedAnalysis("CFGNotUnderstood")
5142               << "loop control flow is not understood by vectorizer");
5143     if (ORE->allowExtraAnalysis())
5144       Result = false;
5145     else
5146       return false;
5147   }
5148 
5149   // We must have a single exiting block.
5150   if (!TheLoop->getExitingBlock()) {
5151     ORE->emit(createMissedAnalysis("CFGNotUnderstood")
5152               << "loop control flow is not understood by vectorizer");
5153     if (ORE->allowExtraAnalysis())
5154       Result = false;
5155     else
5156       return false;
5157   }
5158 
5159   // We only handle bottom-tested loops, i.e. loop in which the condition is
5160   // checked at the end of each iteration. With that we can assume that all
5161   // instructions in the loop are executed the same number of times.
5162   if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
5163     ORE->emit(createMissedAnalysis("CFGNotUnderstood")
5164               << "loop control flow is not understood by vectorizer");
5165     if (ORE->allowExtraAnalysis())
5166       Result = false;
5167     else
5168       return false;
5169   }
5170 
5171   // We need to have a loop header.
5172   DEBUG(dbgs() << "LV: Found a loop: " << TheLoop->getHeader()->getName()
5173                << '\n');
5174 
5175   // Check if we can if-convert non-single-bb loops.
5176   unsigned NumBlocks = TheLoop->getNumBlocks();
5177   if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
5178     DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
5179     if (ORE->allowExtraAnalysis())
5180       Result = false;
5181     else
5182       return false;
5183   }
5184 
5185   // Check if we can vectorize the instructions and CFG in this loop.
5186   if (!canVectorizeInstrs()) {
5187     DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n");
5188     if (ORE->allowExtraAnalysis())
5189       Result = false;
5190     else
5191       return false;
5192   }
5193 
5194   // Go over each instruction and look at memory deps.
5195   if (!canVectorizeMemory()) {
5196     DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n");
5197     if (ORE->allowExtraAnalysis())
5198       Result = false;
5199     else
5200       return false;
5201   }
5202 
5203   DEBUG(dbgs() << "LV: We can vectorize this loop"
5204                << (LAI->getRuntimePointerChecking()->Need
5205                        ? " (with a runtime bound check)"
5206                        : "")
5207                << "!\n");
5208 
5209   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
5210 
5211   // If an override option has been passed in for interleaved accesses, use it.
5212   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
5213     UseInterleaved = EnableInterleavedMemAccesses;
5214 
5215   // Analyze interleaved memory accesses.
5216   if (UseInterleaved)
5217     InterleaveInfo.analyzeInterleaving(*getSymbolicStrides());
5218 
5219   unsigned SCEVThreshold = VectorizeSCEVCheckThreshold;
5220   if (Hints->getForce() == LoopVectorizeHints::FK_Enabled)
5221     SCEVThreshold = PragmaVectorizeSCEVCheckThreshold;
5222 
5223   if (PSE.getUnionPredicate().getComplexity() > SCEVThreshold) {
5224     ORE->emit(createMissedAnalysis("TooManySCEVRunTimeChecks")
5225               << "Too many SCEV assumptions need to be made and checked "
5226               << "at runtime");
5227     DEBUG(dbgs() << "LV: Too many SCEV checks needed.\n");
5228     if (ORE->allowExtraAnalysis())
5229       Result = false;
5230     else
5231       return false;
5232   }
5233 
5234   // Okay! We've done all the tests. If any have failed, return false. Otherwise
5235   // we can vectorize, and at this point we don't have any other mem analysis
5236   // which may limit our maximum vectorization factor, so just return true with
5237   // no restrictions.
5238   return Result;
5239 }
5240 
5241 static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) {
5242   if (Ty->isPointerTy())
5243     return DL.getIntPtrType(Ty);
5244 
5245   // It is possible that char's or short's overflow when we ask for the loop's
5246   // trip count, work around this by changing the type size.
5247   if (Ty->getScalarSizeInBits() < 32)
5248     return Type::getInt32Ty(Ty->getContext());
5249 
5250   return Ty;
5251 }
5252 
5253 static Type *getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) {
5254   Ty0 = convertPointerToIntegerType(DL, Ty0);
5255   Ty1 = convertPointerToIntegerType(DL, Ty1);
5256   if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits())
5257     return Ty0;
5258   return Ty1;
5259 }
5260 
5261 /// \brief Check that the instruction has outside loop users and is not an
5262 /// identified reduction variable.
5263 static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,
5264                                SmallPtrSetImpl<Value *> &AllowedExit) {
5265   // Reduction and Induction instructions are allowed to have exit users. All
5266   // other instructions must not have external users.
5267   if (!AllowedExit.count(Inst))
5268     // Check that all of the users of the loop are inside the BB.
5269     for (User *U : Inst->users()) {
5270       Instruction *UI = cast<Instruction>(U);
5271       // This user may be a reduction exit value.
5272       if (!TheLoop->contains(UI)) {
5273         DEBUG(dbgs() << "LV: Found an outside user for : " << *UI << '\n');
5274         return true;
5275       }
5276     }
5277   return false;
5278 }
5279 
5280 void LoopVectorizationLegality::addInductionPhi(
5281     PHINode *Phi, const InductionDescriptor &ID,
5282     SmallPtrSetImpl<Value *> &AllowedExit) {
5283   Inductions[Phi] = ID;
5284   Type *PhiTy = Phi->getType();
5285   const DataLayout &DL = Phi->getModule()->getDataLayout();
5286 
5287   // Get the widest type.
5288   if (!PhiTy->isFloatingPointTy()) {
5289     if (!WidestIndTy)
5290       WidestIndTy = convertPointerToIntegerType(DL, PhiTy);
5291     else
5292       WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy);
5293   }
5294 
5295   // Int inductions are special because we only allow one IV.
5296   if (ID.getKind() == InductionDescriptor::IK_IntInduction &&
5297       ID.getConstIntStepValue() &&
5298       ID.getConstIntStepValue()->isOne() &&
5299       isa<Constant>(ID.getStartValue()) &&
5300       cast<Constant>(ID.getStartValue())->isNullValue()) {
5301 
5302     // Use the phi node with the widest type as induction. Use the last
5303     // one if there are multiple (no good reason for doing this other
5304     // than it is expedient). We've checked that it begins at zero and
5305     // steps by one, so this is a canonical induction variable.
5306     if (!PrimaryInduction || PhiTy == WidestIndTy)
5307       PrimaryInduction = Phi;
5308   }
5309 
5310   // Both the PHI node itself, and the "post-increment" value feeding
5311   // back into the PHI node may have external users.
5312   AllowedExit.insert(Phi);
5313   AllowedExit.insert(Phi->getIncomingValueForBlock(TheLoop->getLoopLatch()));
5314 
5315   DEBUG(dbgs() << "LV: Found an induction variable.\n");
5316   return;
5317 }
5318 
// Walk every instruction in the loop and verify that each one is something the
// vectorizer can handle: PHIs must be classifiable as inductions, reductions,
// or first-order recurrences (or be if-convertible non-header PHIs); calls
// must map to an intrinsic or a vectorizable library function; all types must
// be valid vector element types; and no value may be used outside the loop
// unless it is a recognized induction/reduction exit value.
bool LoopVectorizationLegality::canVectorizeInstrs() {
  BasicBlock *Header = TheLoop->getHeader();

  // Look for the attribute signaling the absence of NaNs.
  Function &F = *Header->getParent();
  HasFunNoNaNAttr =
      F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true";

  // For each block in the loop.
  for (BasicBlock *BB : TheLoop->blocks()) {
    // Scan the instructions in the block and look for hazards.
    for (Instruction &I : *BB) {
      if (auto *Phi = dyn_cast<PHINode>(&I)) {
        Type *PhiTy = Phi->getType();
        // Check that this PHI type is allowed.
        if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() &&
            !PhiTy->isPointerTy()) {
          ORE->emit(createMissedAnalysis("CFGNotUnderstood", Phi)
                    << "loop control flow is not understood by vectorizer");
          DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n");
          return false;
        }

        // If this PHINode is not in the header block, then we know that we
        // can convert it to select during if-conversion. No need to check if
        // the PHIs in this block are induction or reduction variables.
        if (BB != Header) {
          // Check that this instruction has no outside users or is an
          // identified reduction value with an outside user.
          if (!hasOutsideLoopUser(TheLoop, Phi, AllowedExit))
            continue;
          ORE->emit(createMissedAnalysis("NeitherInductionNorReduction", Phi)
                    << "value could not be identified as "
                       "an induction or reduction variable");
          return false;
        }

        // We only allow if-converted PHIs with exactly two incoming values.
        if (Phi->getNumIncomingValues() != 2) {
          ORE->emit(createMissedAnalysis("CFGNotUnderstood", Phi)
                    << "control flow not understood by vectorizer");
          DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
          return false;
        }

        // Is this header PHI a recognized reduction (sum, product, min/max,
        // etc.)? If so, its exit instruction is allowed to escape the loop.
        RecurrenceDescriptor RedDes;
        if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes)) {
          if (RedDes.hasUnsafeAlgebra())
            Requirements->addUnsafeAlgebraInst(RedDes.getUnsafeAlgebraInst());
          AllowedExit.insert(RedDes.getLoopExitInstr());
          Reductions[Phi] = RedDes;
          continue;
        }

        // Is it an induction variable (integer, pointer, or FP)?
        InductionDescriptor ID;
        if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID)) {
          addInductionPhi(Phi, ID, AllowedExit);
          if (ID.hasUnsafeAlgebra() && !HasFunNoNaNAttr)
            Requirements->addUnsafeAlgebraInst(ID.getUnsafeAlgebraInst());
          continue;
        }

        // Is it a first-order recurrence (value from the previous iteration)?
        if (RecurrenceDescriptor::isFirstOrderRecurrence(Phi, TheLoop, DT)) {
          FirstOrderRecurrences.insert(Phi);
          continue;
        }

        // As a last resort, coerce the PHI to a AddRec expression
        // and re-try classifying it a an induction PHI.
        if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID, true)) {
          addInductionPhi(Phi, ID, AllowedExit);
          continue;
        }

        ORE->emit(createMissedAnalysis("NonReductionValueUsedOutsideLoop", Phi)
                  << "value that could not be identified as "
                     "reduction is used outside the loop");
        DEBUG(dbgs() << "LV: Found an unidentified PHI." << *Phi << "\n");
        return false;
      } // end of PHI handling

      // We handle calls that:
      //   * Are debug info intrinsics.
      //   * Have a mapping to an IR intrinsic.
      //   * Have a vector version available.
      auto *CI = dyn_cast<CallInst>(&I);
      if (CI && !getVectorIntrinsicIDForCall(CI, TLI) &&
          !isa<DbgInfoIntrinsic>(CI) &&
          !(CI->getCalledFunction() && TLI &&
            TLI->isFunctionVectorizable(CI->getCalledFunction()->getName()))) {
        ORE->emit(createMissedAnalysis("CantVectorizeCall", CI)
                  << "call instruction cannot be vectorized");
        DEBUG(dbgs() << "LV: Found a non-intrinsic, non-libfunc callsite.\n");
        return false;
      }

      // Intrinsics such as powi,cttz and ctlz are legal to vectorize if the
      // second argument is the same (i.e. loop invariant)
      if (CI && hasVectorInstrinsicScalarOpd(
                    getVectorIntrinsicIDForCall(CI, TLI), 1)) {
        auto *SE = PSE.getSE();
        if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(1)), TheLoop)) {
          ORE->emit(createMissedAnalysis("CantVectorizeIntrinsic", CI)
                    << "intrinsic instruction cannot be vectorized");
          DEBUG(dbgs() << "LV: Found unvectorizable intrinsic " << *CI << "\n");
          return false;
        }
      }

      // Check that the instruction return type is vectorizable.
      // Also, we can't vectorize extractelement instructions.
      if ((!VectorType::isValidElementType(I.getType()) &&
           !I.getType()->isVoidTy()) ||
          isa<ExtractElementInst>(I)) {
        ORE->emit(createMissedAnalysis("CantVectorizeInstructionReturnType", &I)
                  << "instruction return type cannot be vectorized");
        DEBUG(dbgs() << "LV: Found unvectorizable type.\n");
        return false;
      }

      // Check that the stored type is vectorizable.
      if (auto *ST = dyn_cast<StoreInst>(&I)) {
        Type *T = ST->getValueOperand()->getType();
        if (!VectorType::isValidElementType(T)) {
          ORE->emit(createMissedAnalysis("CantVectorizeStore", ST)
                    << "store instruction cannot be vectorized");
          return false;
        }

        // FP instructions can allow unsafe algebra, thus vectorizable by
        // non-IEEE-754 compliant SIMD units.
        // This applies to floating-point math operations and calls, not memory
        // operations, shuffles, or casts, as they don't change precision or
        // semantics.
      } else if (I.getType()->isFloatingPointTy() && (CI || I.isBinaryOp()) &&
                 !I.hasUnsafeAlgebra()) {
        // Not a failure: just record that the loop may be unsafe to vectorize
        // on non-IEEE-754-compliant targets.
        DEBUG(dbgs() << "LV: Found FP op with unsafe algebra.\n");
        Hints->setPotentiallyUnsafe();
      }

      // Reduction instructions are allowed to have exit users.
      // All other instructions must not have external users.
      if (hasOutsideLoopUser(TheLoop, &I, AllowedExit)) {
        ORE->emit(createMissedAnalysis("ValueUsedOutsideLoop", &I)
                  << "value cannot be used outside the loop");
        return false;
      }

    } // next instr.
  }

  if (!PrimaryInduction) {
    DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
    // Having no primary induction is acceptable as long as at least one
    // induction variable was identified; otherwise we cannot vectorize.
    if (Inductions.empty()) {
      ORE->emit(createMissedAnalysis("NoInductionVariable")
                << "loop induction variable could not be identified");
      return false;
    }
  }

  // Now we know the widest induction type, check if our found induction
  // is the same size. If it's not, unset it here and InnerLoopVectorizer
  // will create another.
  if (PrimaryInduction && WidestIndTy != PrimaryInduction->getType())
    PrimaryInduction = nullptr;

  return true;
}
5487 
5488 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
5489 
5490   // We should not collect Scalars more than once per VF. Right now, this
5491   // function is called from collectUniformsAndScalars(), which already does
5492   // this check. Collecting Scalars for VF=1 does not make any sense.
5493   assert(VF >= 2 && !Scalars.count(VF) &&
5494          "This function should not be visited twice for the same VF");
5495 
5496   SmallSetVector<Instruction *, 8> Worklist;
5497 
5498   // These sets are used to seed the analysis with pointers used by memory
5499   // accesses that will remain scalar.
5500   SmallSetVector<Instruction *, 8> ScalarPtrs;
5501   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
5502 
5503   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
5504   // The pointer operands of loads and stores will be scalar as long as the
5505   // memory access is not a gather or scatter operation. The value operand of a
5506   // store will remain scalar if the store is scalarized.
5507   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
5508     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
5509     assert(WideningDecision != CM_Unknown &&
5510            "Widening decision should be ready at this moment");
5511     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
5512       if (Ptr == Store->getValueOperand())
5513         return WideningDecision == CM_Scalarize;
5514     assert(Ptr == getPointerOperand(MemAccess) &&
5515            "Ptr is neither a value or pointer operand");
5516     return WideningDecision != CM_GatherScatter;
5517   };
5518 
5519   // A helper that returns true if the given value is a bitcast or
5520   // getelementptr instruction contained in the loop.
5521   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
5522     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
5523             isa<GetElementPtrInst>(V)) &&
5524            !TheLoop->isLoopInvariant(V);
5525   };
5526 
5527   // A helper that evaluates a memory access's use of a pointer. If the use
5528   // will be a scalar use, and the pointer is only used by memory accesses, we
5529   // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
5530   // PossibleNonScalarPtrs.
5531   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
5532 
5533     // We only care about bitcast and getelementptr instructions contained in
5534     // the loop.
5535     if (!isLoopVaryingBitCastOrGEP(Ptr))
5536       return;
5537 
5538     // If the pointer has already been identified as scalar (e.g., if it was
5539     // also identified as uniform), there's nothing to do.
5540     auto *I = cast<Instruction>(Ptr);
5541     if (Worklist.count(I))
5542       return;
5543 
5544     // If the use of the pointer will be a scalar use, and all users of the
5545     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
5546     // place the pointer in PossibleNonScalarPtrs.
5547     if (isScalarUse(MemAccess, Ptr) && all_of(I->users(), [&](User *U) {
5548           return isa<LoadInst>(U) || isa<StoreInst>(U);
5549         }))
5550       ScalarPtrs.insert(I);
5551     else
5552       PossibleNonScalarPtrs.insert(I);
5553   };
5554 
5555   // We seed the scalars analysis with three classes of instructions: (1)
5556   // instructions marked uniform-after-vectorization, (2) bitcast and
5557   // getelementptr instructions used by memory accesses requiring a scalar use,
5558   // and (3) pointer induction variables and their update instructions (we
5559   // currently only scalarize these).
5560   //
5561   // (1) Add to the worklist all instructions that have been identified as
5562   // uniform-after-vectorization.
5563   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
5564 
5565   // (2) Add to the worklist all bitcast and getelementptr instructions used by
5566   // memory accesses requiring a scalar use. The pointer operands of loads and
5567   // stores will be scalar as long as the memory accesses is not a gather or
5568   // scatter operation. The value operand of a store will remain scalar if the
5569   // store is scalarized.
5570   for (auto *BB : TheLoop->blocks())
5571     for (auto &I : *BB) {
5572       if (auto *Load = dyn_cast<LoadInst>(&I)) {
5573         evaluatePtrUse(Load, Load->getPointerOperand());
5574       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
5575         evaluatePtrUse(Store, Store->getPointerOperand());
5576         evaluatePtrUse(Store, Store->getValueOperand());
5577       }
5578     }
5579   for (auto *I : ScalarPtrs)
5580     if (!PossibleNonScalarPtrs.count(I)) {
5581       DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
5582       Worklist.insert(I);
5583     }
5584 
5585   // (3) Add to the worklist all pointer induction variables and their update
5586   // instructions.
5587   //
5588   // TODO: Once we are able to vectorize pointer induction variables we should
5589   //       no longer insert them into the worklist here.
5590   auto *Latch = TheLoop->getLoopLatch();
5591   for (auto &Induction : *Legal->getInductionVars()) {
5592     auto *Ind = Induction.first;
5593     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5594     if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
5595       continue;
5596     Worklist.insert(Ind);
5597     Worklist.insert(IndUpdate);
5598     DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
5599     DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate << "\n");
5600   }
5601 
5602   // Insert the forced scalars.
5603   // FIXME: Currently widenPHIInstruction() often creates a dead vector
5604   // induction variable when the PHI user is scalarized.
5605   if (ForcedScalars.count(VF))
5606     for (auto *I : ForcedScalars.find(VF)->second)
5607       Worklist.insert(I);
5608 
5609   // Expand the worklist by looking through any bitcasts and getelementptr
5610   // instructions we've already identified as scalar. This is similar to the
5611   // expansion step in collectLoopUniforms(); however, here we're only
5612   // expanding to include additional bitcasts and getelementptr instructions.
5613   unsigned Idx = 0;
5614   while (Idx != Worklist.size()) {
5615     Instruction *Dst = Worklist[Idx++];
5616     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
5617       continue;
5618     auto *Src = cast<Instruction>(Dst->getOperand(0));
5619     if (all_of(Src->users(), [&](User *U) -> bool {
5620           auto *J = cast<Instruction>(U);
5621           return !TheLoop->contains(J) || Worklist.count(J) ||
5622                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
5623                   isScalarUse(J, Src));
5624         })) {
5625       Worklist.insert(Src);
5626       DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
5627     }
5628   }
5629 
5630   // An induction variable will remain scalar if all users of the induction
5631   // variable and induction variable update remain scalar.
5632   for (auto &Induction : *Legal->getInductionVars()) {
5633     auto *Ind = Induction.first;
5634     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5635 
5636     // We already considered pointer induction variables, so there's no reason
5637     // to look at their users again.
5638     //
5639     // TODO: Once we are able to vectorize pointer induction variables we
5640     //       should no longer skip over them here.
5641     if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
5642       continue;
5643 
5644     // Determine if all users of the induction variable are scalar after
5645     // vectorization.
5646     auto ScalarInd = all_of(Ind->users(), [&](User *U) -> bool {
5647       auto *I = cast<Instruction>(U);
5648       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
5649     });
5650     if (!ScalarInd)
5651       continue;
5652 
5653     // Determine if all users of the induction variable update instruction are
5654     // scalar after vectorization.
5655     auto ScalarIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
5656       auto *I = cast<Instruction>(U);
5657       return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
5658     });
5659     if (!ScalarIndUpdate)
5660       continue;
5661 
5662     // The induction variable and its update instruction will remain scalar.
5663     Worklist.insert(Ind);
5664     Worklist.insert(IndUpdate);
5665     DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
5666     DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate << "\n");
5667   }
5668 
5669   Scalars[VF].insert(Worklist.begin(), Worklist.end());
5670 }
5671 
5672 bool LoopVectorizationLegality::isScalarWithPredication(Instruction *I) {
5673   if (!blockNeedsPredication(I->getParent()))
5674     return false;
5675   switch(I->getOpcode()) {
5676   default:
5677     break;
5678   case Instruction::Store:
5679     return !isMaskRequired(I);
5680   case Instruction::UDiv:
5681   case Instruction::SDiv:
5682   case Instruction::SRem:
5683   case Instruction::URem:
5684     return mayDivideByZero(*I);
5685   }
5686   return false;
5687 }
5688 
5689 bool LoopVectorizationLegality::memoryInstructionCanBeWidened(Instruction *I,
5690                                                               unsigned VF) {
5691   // Get and ensure we have a valid memory instruction.
5692   LoadInst *LI = dyn_cast<LoadInst>(I);
5693   StoreInst *SI = dyn_cast<StoreInst>(I);
5694   assert((LI || SI) && "Invalid memory instruction");
5695 
5696   auto *Ptr = getPointerOperand(I);
5697 
5698   // In order to be widened, the pointer should be consecutive, first of all.
5699   if (!isConsecutivePtr(Ptr))
5700     return false;
5701 
5702   // If the instruction is a store located in a predicated block, it will be
5703   // scalarized.
5704   if (isScalarWithPredication(I))
5705     return false;
5706 
5707   // If the instruction's allocated size doesn't equal it's type size, it
5708   // requires padding and will be scalarized.
5709   auto &DL = I->getModule()->getDataLayout();
5710   auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
5711   if (hasIrregularType(ScalarTy, DL, VF))
5712     return false;
5713 
5714   return true;
5715 }
5716 
// Collects, for vectorization factor VF, the instructions that will remain
// uniform after vectorization (produce the same value for every vector lane)
// and records them in Uniforms[VF]. Uniform instructions include the latch
// branch condition, pointer operands of widened/interleaved memory accesses,
// their transitively-uniform operands, and induction variables whose users
// all remain uniform.
void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {

  // We should not collect Uniforms more than once per VF. Right now,
  // this function is called from collectUniformsAndScalars(), which
  // already does this check. Collecting Uniforms for VF=1 does not make any
  // sense.

  assert(VF >= 2 && !Uniforms.count(VF) &&
         "This function should not be visited twice for the same VF");

  // Visit the list of Uniforms. If we'll not find any uniform value, we'll
  // not analyze again.  Uniforms.count(VF) will return 1.
  Uniforms[VF].clear();

  // We now know that the loop is vectorizable!
  // Collect instructions inside the loop that will remain uniform after
  // vectorization.

  // Global values, params and instructions outside of current loop are out of
  // scope.
  auto isOutOfScope = [&](Value *V) -> bool {
    Instruction *I = dyn_cast<Instruction>(V);
    return (!I || !TheLoop->contains(I));
  };

  SetVector<Instruction *> Worklist;
  BasicBlock *Latch = TheLoop->getLoopLatch();

  // Start with the conditional branch. If the branch condition is an
  // instruction contained in the loop that is only used by the branch, it is
  // uniform.
  auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
  if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) {
    Worklist.insert(Cmp);
    DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n");
  }

  // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
  // are pointers that are treated like consecutive pointers during
  // vectorization. The pointer operands of interleaved accesses are an
  // example.
  SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;

  // Holds pointer operands of instructions that are possibly non-uniform.
  SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;

  // Returns true if the widening decision already computed for I at this VF
  // keeps I's pointer operand uniform, i.e., I will be widened or
  // interleaved rather than scalarized or turned into a gather/scatter.
  auto isUniformDecision = [&](Instruction *I, unsigned VF) {
    InstWidening WideningDecision = getWideningDecision(I, VF);
    assert(WideningDecision != CM_Unknown &&
           "Widening decision should be ready at this moment");

    return (WideningDecision == CM_Widen ||
            WideningDecision == CM_Interleave);
  };
  // Iterate over the instructions in the loop, and collect all
  // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
  // that a consecutive-like pointer operand will be scalarized, we collect it
  // in PossibleNonUniformPtrs instead. We use two sets here because a single
  // getelementptr instruction can be used by both vectorized and scalarized
  // memory instructions. For example, if a loop loads and stores from the same
  // location, but the store is conditional, the store will be scalarized, and
  // the getelementptr won't remain uniform.
  for (auto *BB : TheLoop->blocks())
    for (auto &I : *BB) {

      // If there's no pointer operand, there's nothing to do.
      auto *Ptr = dyn_cast_or_null<Instruction>(getPointerOperand(&I));
      if (!Ptr)
        continue;

      // True if all users of Ptr are memory accesses that have Ptr as their
      // pointer operand.
      auto UsersAreMemAccesses = all_of(Ptr->users(), [&](User *U) -> bool {
        return getPointerOperand(U) == Ptr;
      });

      // Ensure the memory instruction will not be scalarized or used by
      // gather/scatter, making its pointer operand non-uniform. If the pointer
      // operand is used by any instruction other than a memory access, we
      // conservatively assume the pointer operand may be non-uniform.
      if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
        PossibleNonUniformPtrs.insert(Ptr);

      // If the memory instruction will be vectorized and its pointer operand
      // is consecutive-like, or interleaving - the pointer operand should
      // remain uniform.
      else
        ConsecutiveLikePtrs.insert(Ptr);
    }

  // Add to the Worklist all consecutive and consecutive-like pointers that
  // aren't also identified as possibly non-uniform.
  for (auto *V : ConsecutiveLikePtrs)
    if (!PossibleNonUniformPtrs.count(V)) {
      DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n");
      Worklist.insert(V);
    }

  // Expand Worklist in topological order: whenever a new instruction
  // is added , its users should be either already inside Worklist, or
  // out of scope. It ensures a uniform instruction will only be used
  // by uniform instructions or out of scope instructions.
  unsigned idx = 0;
  while (idx != Worklist.size()) {
    Instruction *I = Worklist[idx++];

    // An operand becomes uniform if every in-loop user is already known to
    // be uniform, or is a widened/interleaved memory access using it as the
    // pointer operand.
    for (auto OV : I->operand_values()) {
      if (isOutOfScope(OV))
        continue;
      auto *OI = cast<Instruction>(OV);
      if (all_of(OI->users(), [&](User *U) -> bool {
            auto *J = cast<Instruction>(U);
            return !TheLoop->contains(J) || Worklist.count(J) ||
                   (OI == getPointerOperand(J) && isUniformDecision(J, VF));
          })) {
        Worklist.insert(OI);
        DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n");
      }
    }
  }

  // Returns true if Ptr is the pointer operand of a memory access instruction
  // I, and I is known to not require scalarization.
  auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
    return getPointerOperand(I) == Ptr && isUniformDecision(I, VF);
  };

  // For an instruction to be added into Worklist above, all its users inside
  // the loop should also be in Worklist. However, this condition cannot be
  // true for phi nodes that form a cyclic dependence. We must process phi
  // nodes separately. An induction variable will remain uniform if all users
  // of the induction variable and induction variable update remain uniform.
  // The code below handles both pointer and non-pointer induction variables.
  for (auto &Induction : *Legal->getInductionVars()) {
    auto *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));

    // Determine if all users of the induction variable are uniform after
    // vectorization.
    auto UniformInd = all_of(Ind->users(), [&](User *U) -> bool {
      auto *I = cast<Instruction>(U);
      return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
             isVectorizedMemAccessUse(I, Ind);
    });
    if (!UniformInd)
      continue;

    // Determine if all users of the induction variable update instruction are
    // uniform after vectorization.
    auto UniformIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
      auto *I = cast<Instruction>(U);
      return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
             isVectorizedMemAccessUse(I, IndUpdate);
    });
    if (!UniformIndUpdate)
      continue;

    // The induction variable and its update instruction will remain uniform.
    Worklist.insert(Ind);
    Worklist.insert(IndUpdate);
    DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ind << "\n");
    DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate << "\n");
  }

  Uniforms[VF].insert(Worklist.begin(), Worklist.end());
}
5883 
5884 bool LoopVectorizationLegality::canVectorizeMemory() {
5885   LAI = &(*GetLAA)(*TheLoop);
5886   InterleaveInfo.setLAI(LAI);
5887   const OptimizationRemarkAnalysis *LAR = LAI->getReport();
5888   if (LAR) {
5889     OptimizationRemarkAnalysis VR(Hints->vectorizeAnalysisPassName(),
5890                                   "loop not vectorized: ", *LAR);
5891     ORE->emit(VR);
5892   }
5893   if (!LAI->canVectorizeMemory())
5894     return false;
5895 
5896   if (LAI->hasStoreToLoopInvariantAddress()) {
5897     ORE->emit(createMissedAnalysis("CantVectorizeStoreToLoopInvariantAddress")
5898               << "write to a loop invariant address could not be vectorized");
5899     DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n");
5900     return false;
5901   }
5902 
5903   Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks());
5904   PSE.addPredicate(LAI->getPSE().getUnionPredicate());
5905 
5906   return true;
5907 }
5908 
5909 bool LoopVectorizationLegality::isInductionVariable(const Value *V) {
5910   Value *In0 = const_cast<Value *>(V);
5911   PHINode *PN = dyn_cast_or_null<PHINode>(In0);
5912   if (!PN)
5913     return false;
5914 
5915   return Inductions.count(PN);
5916 }
5917 
5918 bool LoopVectorizationLegality::isFirstOrderRecurrence(const PHINode *Phi) {
5919   return FirstOrderRecurrences.count(Phi);
5920 }
5921 
// Returns true if block BB requires predication when the loop is
// if-converted for vectorization. Delegates to the shared
// LoopAccessInfo::blockNeedsPredication helper using this loop and its
// dominator tree.
bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) {
  return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
}
5925 
// Returns true if every instruction in BB can safely be predicated during
// if-conversion. SafePtrs holds pointers known to be dereferenceable without
// a guard. Side effects: loads/stores that will become masked operations are
// added to MaskedOp, and NumPredStores is incremented for each store that
// cannot be turned into a masked store (even when this function ultimately
// returns false).
bool LoopVectorizationLegality::blockCanBePredicated(
    BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs) {
  const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();

  for (Instruction &I : *BB) {
    // Check that we don't have a constant expression that can trap as operand.
    for (Value *Operand : I.operands()) {
      if (auto *C = dyn_cast<Constant>(Operand))
        if (C->canTrap())
          return false;
    }
    // We might be able to hoist the load.
    if (I.mayReadFromMemory()) {
      // Only plain loads are supported; any other reading instruction
      // (e.g. an arbitrary call) cannot be predicated.
      auto *LI = dyn_cast<LoadInst>(&I);
      if (!LI)
        return false;
      if (!SafePtrs.count(LI->getPointerOperand())) {
        // The pointer is not known-safe; fall back to a masked load or
        // gather if the target supports one.
        if (isLegalMaskedLoad(LI->getType(), LI->getPointerOperand()) ||
            isLegalMaskedGather(LI->getType())) {
          MaskedOp.insert(LI);
          continue;
        }
        // !llvm.mem.parallel_loop_access implies if-conversion safety.
        if (IsAnnotatedParallel)
          continue;
        return false;
      }
    }

    if (I.mayWriteToMemory()) {
      auto *SI = dyn_cast<StoreInst>(&I);
      // We only support predication of stores in basic blocks with one
      // predecessor.
      if (!SI)
        return false;

      // Build a masked store if it is legal for the target.
      if (isLegalMaskedStore(SI->getValueOperand()->getType(),
                             SI->getPointerOperand()) ||
          isLegalMaskedScatter(SI->getValueOperand()->getType())) {
        MaskedOp.insert(SI);
        continue;
      }

      // Otherwise the store must be scalarized and predicated: require a
      // safe pointer, a single-predecessor block, and stay under the
      // predicated-store budget. Note the counter is bumped here regardless
      // of the outcome.
      bool isSafePtr = (SafePtrs.count(SI->getPointerOperand()) != 0);
      bool isSinglePredecessor = SI->getParent()->getSinglePredecessor();

      if (++NumPredStores > NumberOfStoresToPredicate || !isSafePtr ||
          !isSinglePredecessor)
        return false;
    }
    // An instruction that may throw cannot be executed speculatively.
    if (I.mayThrow())
      return false;
  }

  return true;
}
5983 
5984 void InterleavedAccessInfo::collectConstStrideAccesses(
5985     MapVector<Instruction *, StrideDescriptor> &AccessStrideInfo,
5986     const ValueToValueMap &Strides) {
5987 
5988   auto &DL = TheLoop->getHeader()->getModule()->getDataLayout();
5989 
5990   // Since it's desired that the load/store instructions be maintained in
5991   // "program order" for the interleaved access analysis, we have to visit the
5992   // blocks in the loop in reverse postorder (i.e., in a topological order).
5993   // Such an ordering will ensure that any load/store that may be executed
5994   // before a second load/store will precede the second load/store in
5995   // AccessStrideInfo.
5996   LoopBlocksDFS DFS(TheLoop);
5997   DFS.perform(LI);
5998   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO()))
5999     for (auto &I : *BB) {
6000       auto *LI = dyn_cast<LoadInst>(&I);
6001       auto *SI = dyn_cast<StoreInst>(&I);
6002       if (!LI && !SI)
6003         continue;
6004 
6005       Value *Ptr = getPointerOperand(&I);
6006       // We don't check wrapping here because we don't know yet if Ptr will be
6007       // part of a full group or a group with gaps. Checking wrapping for all
6008       // pointers (even those that end up in groups with no gaps) will be overly
6009       // conservative. For full groups, wrapping should be ok since if we would
6010       // wrap around the address space we would do a memory access at nullptr
6011       // even without the transformation. The wrapping checks are therefore
6012       // deferred until after we've formed the interleaved groups.
6013       int64_t Stride = getPtrStride(PSE, Ptr, TheLoop, Strides,
6014                                     /*Assume=*/true, /*ShouldCheckWrap=*/false);
6015 
6016       const SCEV *Scev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr);
6017       PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
6018       uint64_t Size = DL.getTypeAllocSize(PtrTy->getElementType());
6019 
6020       // An alignment of 0 means target ABI alignment.
6021       unsigned Align = getMemInstAlignment(&I);
6022       if (!Align)
6023         Align = DL.getABITypeAlignment(PtrTy->getElementType());
6024 
6025       AccessStrideInfo[&I] = StrideDescriptor(Stride, Scev, Size, Align);
6026     }
6027 }
6028 
6029 // Analyze interleaved accesses and collect them into interleaved load and
6030 // store groups.
6031 //
6032 // When generating code for an interleaved load group, we effectively hoist all
6033 // loads in the group to the location of the first load in program order. When
6034 // generating code for an interleaved store group, we sink all stores to the
6035 // location of the last store. This code motion can change the order of load
6036 // and store instructions and may break dependences.
6037 //
6038 // The code generation strategy mentioned above ensures that we won't violate
6039 // any write-after-read (WAR) dependences.
6040 //
6041 // E.g., for the WAR dependence:  a = A[i];      // (1)
6042 //                                A[i] = b;      // (2)
6043 //
6044 // The store group of (2) is always inserted at or below (2), and the load
6045 // group of (1) is always inserted at or above (1). Thus, the instructions will
6046 // never be reordered. All other dependences are checked to ensure the
6047 // correctness of the instruction reordering.
6048 //
6049 // The algorithm visits all memory accesses in the loop in bottom-up program
6050 // order. Program order is established by traversing the blocks in the loop in
6051 // reverse postorder when collecting the accesses.
6052 //
6053 // We visit the memory accesses in bottom-up order because it can simplify the
6054 // construction of store groups in the presence of write-after-write (WAW)
6055 // dependences.
6056 //
6057 // E.g., for the WAW dependence:  A[i] = a;      // (1)
6058 //                                A[i] = b;      // (2)
6059 //                                A[i + 1] = c;  // (3)
6060 //
6061 // We will first create a store group with (3) and (2). (1) can't be added to
6062 // this group because it and (2) are dependent. However, (1) can be grouped
6063 // with other accesses that may precede it in program order. Note that a
6064 // bottom-up order does not imply that WAW dependences should not be checked.
void InterleavedAccessInfo::analyzeInterleaving(
    const ValueToValueMap &Strides) {
  DEBUG(dbgs() << "LV: Analyzing interleaved accesses...\n");

  // Holds all accesses with a constant stride.
  MapVector<Instruction *, StrideDescriptor> AccessStrideInfo;
  collectConstStrideAccesses(AccessStrideInfo, Strides);

  if (AccessStrideInfo.empty())
    return;

  // Collect the dependences in the loop.
  collectDependences();

  // Holds all interleaved store groups temporarily.
  SmallSetVector<InterleaveGroup *, 4> StoreGroups;
  // Holds all interleaved load groups temporarily.
  SmallSetVector<InterleaveGroup *, 4> LoadGroups;

  // Search in bottom-up program order for pairs of accesses (A and B) that can
  // form interleaved load or store groups. In the algorithm below, access A
  // precedes access B in program order. We initialize a group for B in the
  // outer loop of the algorithm, and then in the inner loop, we attempt to
  // insert each A into B's group if:
  //
  //  1. A and B have the same stride,
  //  2. A and B have the same memory object size, and
  //  3. A belongs in B's group according to its distance from B.
  //
  // Special care is taken to ensure group formation will not break any
  // dependences.
  for (auto BI = AccessStrideInfo.rbegin(), E = AccessStrideInfo.rend();
       BI != E; ++BI) {
    Instruction *B = BI->first;
    StrideDescriptor DesB = BI->second;

    // Initialize a group for B if it has an allowable stride. Even if we don't
    // create a group for B, we continue with the bottom-up algorithm to ensure
    // we don't break any of B's dependences.
    InterleaveGroup *Group = nullptr;
    if (isStrided(DesB.Stride)) {
      Group = getInterleaveGroup(B);
      if (!Group) {
        DEBUG(dbgs() << "LV: Creating an interleave group with:" << *B << '\n');
        Group = createInterleaveGroup(B, DesB.Stride, DesB.Align);
      }
      if (B->mayWriteToMemory())
        StoreGroups.insert(Group);
      else
        LoadGroups.insert(Group);
    }

    for (auto AI = std::next(BI); AI != E; ++AI) {
      Instruction *A = AI->first;
      StrideDescriptor DesA = AI->second;

      // Our code motion strategy implies that we can't have dependences
      // between accesses in an interleaved group and other accesses located
      // between the first and last member of the group. Note that this also
      // means that a group can't have more than one member at a given offset.
      // The accesses in a group can have dependences with other accesses, but
      // we must ensure we don't extend the boundaries of the group such that
      // we encompass those dependent accesses.
      //
      // For example, assume we have the sequence of accesses shown below in a
      // stride-2 loop:
      //
      //  (1, 2) is a group | A[i]   = a;  // (1)
      //                    | A[i-1] = b;  // (2) |
      //                      A[i-3] = c;  // (3)
      //                      A[i]   = d;  // (4) | (2, 4) is not a group
      //
      // Because accesses (2) and (3) are dependent, we can group (2) with (1)
      // but not with (4). If we did, the dependent access (3) would be within
      // the boundaries of the (2, 4) group.
      if (!canReorderMemAccessesForInterleavedGroups(&*AI, &*BI)) {

        // If a dependence exists and A is already in a group, we know that A
        // must be a store since A precedes B and WAR dependences are allowed.
        // Thus, A would be sunk below B. We release A's group to prevent this
        // illegal code motion. A will then be free to form another group with
        // instructions that precede it.
        if (isInterleaved(A)) {
          InterleaveGroup *StoreGroup = getInterleaveGroup(A);
          StoreGroups.remove(StoreGroup);
          releaseGroup(StoreGroup);
        }

        // If a dependence exists and A is not already in a group (or it was
        // and we just released it), B might be hoisted above A (if B is a
        // load) or another store might be sunk below A (if B is a store). In
        // either case, we can't add additional instructions to B's group. B
        // will only form a group with instructions that it precedes.
        break;
      }

      // At this point, we've checked for illegal code motion. If either A or B
      // isn't strided, there's nothing left to do.
      if (!isStrided(DesA.Stride) || !isStrided(DesB.Stride))
        continue;

      // Ignore A if it's already in a group or isn't the same kind of memory
      // operation as B.
      if (isInterleaved(A) || A->mayReadFromMemory() != B->mayReadFromMemory())
        continue;

      // Check rules 1 and 2. Ignore A if its stride or size is different from
      // that of B.
      if (DesA.Stride != DesB.Stride || DesA.Size != DesB.Size)
        continue;

      // Ignore A if the memory object of A and B don't belong to the same
      // address space
      if (getMemInstAddressSpace(A) != getMemInstAddressSpace(B))
        continue;

      // Calculate the distance from A to B.
      const SCEVConstant *DistToB = dyn_cast<SCEVConstant>(
          PSE.getSE()->getMinusSCEV(DesA.Scev, DesB.Scev));
      if (!DistToB)
        continue;
      int64_t DistanceToB = DistToB->getAPInt().getSExtValue();

      // Check rule 3. Ignore A if its distance to B is not a multiple of the
      // size.
      if (DistanceToB % static_cast<int64_t>(DesB.Size))
        continue;

      // Ignore A if either A or B is in a predicated block. Although we
      // currently prevent group formation for predicated accesses, we may be
      // able to relax this limitation in the future once we handle more
      // complicated blocks.
      if (isPredicated(A->getParent()) || isPredicated(B->getParent()))
        continue;

      // The index of A is the index of B plus A's distance to B in multiples
      // of the size.
      int IndexA =
          Group->getIndex(B) + DistanceToB / static_cast<int64_t>(DesB.Size);

      // Try to insert A into B's group.
      if (Group->insertMember(A, IndexA, DesA.Align)) {
        DEBUG(dbgs() << "LV: Inserted:" << *A << '\n'
                     << "    into the interleave group with" << *B << '\n');
        InterleaveGroupMap[A] = Group;

        // Set the first load in program order as the insert position.
        if (A->mayReadFromMemory())
          Group->setInsertPos(A);
      }
    } // Iteration over A accesses.
  } // Iteration over B accesses.

  // Remove interleaved store groups with gaps: only full store groups are
  // supported, since a partial wide store would have to materialize values
  // for the missing members.
  for (InterleaveGroup *Group : StoreGroups)
    if (Group->getNumMembers() != Group->getFactor())
      releaseGroup(Group);

  // Remove interleaved groups with gaps (currently only loads) whose memory
  // accesses may wrap around. We have to revisit the getPtrStride analysis,
  // this time with ShouldCheckWrap=true, since collectConstStrideAccesses does
  // not check wrapping (see documentation there).
  // FORNOW we use Assume=false;
  // TODO: Change to Assume=true but making sure we don't exceed the threshold
  // of runtime SCEV assumptions checks (thereby potentially failing to
  // vectorize altogether).
  // Additional optional optimizations:
  // TODO: If we are peeling the loop and we know that the first pointer doesn't
  // wrap then we can deduce that all pointers in the group don't wrap.
  // This means that we can forcefully peel the loop in order to only have to
  // check the first pointer for no-wrap. When we'll change to use Assume=true
  // we'll only need at most one runtime check per interleaved group.
  //
  for (InterleaveGroup *Group : LoadGroups) {

    // Case 1: A full group. Can Skip the checks; For full groups, if the wide
    // load would wrap around the address space we would do a memory access at
    // nullptr even without the transformation.
    if (Group->getNumMembers() == Group->getFactor())
      continue;

    // Case 2: If first and last members of the group don't wrap this implies
    // that all the pointers in the group don't wrap.
    // So we check only group member 0 (which is always guaranteed to exist),
    // and group member Factor - 1; If the latter doesn't exist we rely on
    // peeling (if it is a non-reversed access -- see Case 3).
    Value *FirstMemberPtr = getPointerOperand(Group->getMember(0));
    if (!getPtrStride(PSE, FirstMemberPtr, TheLoop, Strides, /*Assume=*/false,
                      /*ShouldCheckWrap=*/true)) {
      DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to "
                      "first group member potentially pointer-wrapping.\n");
      releaseGroup(Group);
      continue;
    }
    Instruction *LastMember = Group->getMember(Group->getFactor() - 1);
    if (LastMember) {
      Value *LastMemberPtr = getPointerOperand(LastMember);
      if (!getPtrStride(PSE, LastMemberPtr, TheLoop, Strides, /*Assume=*/false,
                        /*ShouldCheckWrap=*/true)) {
        DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to "
                        "last group member potentially pointer-wrapping.\n");
        releaseGroup(Group);
      }
    } else {
      // Case 3: A non-reversed interleaved load group with gaps: We need
      // to execute at least one scalar epilogue iteration. This will ensure
      // we don't speculatively access memory out-of-bounds. We only need
      // to look for a member at index factor - 1, since every group must have
      // a member at index zero.
      if (Group->isReverse()) {
        releaseGroup(Group);
        continue;
      }
      DEBUG(dbgs() << "LV: Interleaved group requires epilogue iteration.\n");
      RequiresScalarEpilogue = true;
    }
  }
}
6283 
// Computes the maximum vectorization factor for this loop, or None if
// vectorization must be aborted. When optimizing for size (OptForSize),
// additional restrictions apply: no runtime pointer checks are allowed, the
// trip count must be a known constant, and it must divide the chosen VF
// evenly so that no tail (scalar epilogue) loop is needed.
Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
  // Conditional stores require predication; bail out if their vectorization
  // is disabled.
  if (!EnableCondStoresVectorization && Legal->getNumPredStores()) {
    ORE->emit(createMissedAnalysis("ConditionalStore")
              << "store that is conditionally executed prevents vectorization");
    DEBUG(dbgs() << "LV: No vectorization. There are conditional stores.\n");
    return None;
  }

  if (!OptForSize) // Remaining checks deal with scalar loop when OptForSize.
    return computeFeasibleMaxVF(OptForSize);

  // Runtime pointer checks would add code; not allowed under -Os/-Oz.
  if (Legal->getRuntimePointerChecking()->Need) {
    ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize")
              << "runtime pointer checks needed. Enable vectorization of this "
                 "loop with '#pragma clang loop vectorize(enable)' when "
                 "compiling with -Os/-Oz");
    DEBUG(dbgs()
          << "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n");
    return None;
  }

  // If we optimize the program for size, avoid creating the tail loop.
  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
  DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');

  // If we don't know the precise trip count, don't try to vectorize.
  // TC == 0 means the trip count could not be computed; TC == 1 cannot be
  // vectorized either. NOTE(review): the remark text below mentions only the
  // unknown-count case even though TC == 1 also lands here — confirm whether
  // a separate remark is warranted.
  if (TC < 2) {
    ORE->emit(
        createMissedAnalysis("UnknownLoopCountComplexCFG")
        << "unable to calculate the loop count due to complex control flow");
    DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
    return None;
  }

  unsigned MaxVF = computeFeasibleMaxVF(OptForSize);

  if (TC % MaxVF != 0) {
    // If the trip count that we found modulo the vectorization factor is not
    // zero then we require a tail.
    // FIXME: look for a smaller MaxVF that does divide TC rather than give up.
    // FIXME: return None if loop requiresScalarEpilog(<MaxVF>), or look for a
    //        smaller MaxVF that does not require a scalar epilog.

    ORE->emit(createMissedAnalysis("NoTailLoopWithOptForSize")
              << "cannot optimize for size and vectorize at the "
                 "same time. Enable vectorization of this loop "
                 "with '#pragma clang loop vectorize(enable)' "
                 "when compiling with -Os/-Oz");
    DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
    return None;
  }

  return MaxVF;
}
6338 
6339 unsigned LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize) {
6340   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
6341   unsigned SmallestType, WidestType;
6342   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
6343   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
6344   unsigned MaxSafeDepDist = -1U;
6345 
6346   // Get the maximum safe dependence distance in bits computed by LAA. If the
6347   // loop contains any interleaved accesses, we divide the dependence distance
6348   // by the maximum interleave factor of all interleaved groups. Note that
6349   // although the division ensures correctness, this is a fairly conservative
6350   // computation because the maximum distance computed by LAA may not involve
6351   // any of the interleaved accesses.
6352   if (Legal->getMaxSafeDepDistBytes() != -1U)
6353     MaxSafeDepDist =
6354         Legal->getMaxSafeDepDistBytes() * 8 / Legal->getMaxInterleaveFactor();
6355 
6356   WidestRegister =
6357       ((WidestRegister < MaxSafeDepDist) ? WidestRegister : MaxSafeDepDist);
6358   unsigned MaxVectorSize = WidestRegister / WidestType;
6359 
6360   DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType << " / "
6361                << WidestType << " bits.\n");
6362   DEBUG(dbgs() << "LV: The Widest register is: " << WidestRegister
6363                << " bits.\n");
6364 
6365   if (MaxVectorSize == 0) {
6366     DEBUG(dbgs() << "LV: The target has no vector registers.\n");
6367     MaxVectorSize = 1;
6368   }
6369 
6370   assert(MaxVectorSize <= 64 && "Did not expect to pack so many elements"
6371                                 " into one vector!");
6372 
6373   unsigned MaxVF = MaxVectorSize;
6374   if (MaximizeBandwidth && !OptForSize) {
6375     // Collect all viable vectorization factors.
6376     SmallVector<unsigned, 8> VFs;
6377     unsigned NewMaxVectorSize = WidestRegister / SmallestType;
6378     for (unsigned VS = MaxVectorSize; VS <= NewMaxVectorSize; VS *= 2)
6379       VFs.push_back(VS);
6380 
6381     // For each VF calculate its register usage.
6382     auto RUs = calculateRegisterUsage(VFs);
6383 
6384     // Select the largest VF which doesn't require more registers than existing
6385     // ones.
6386     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true);
6387     for (int i = RUs.size() - 1; i >= 0; --i) {
6388       if (RUs[i].MaxLocalUsers <= TargetNumRegisters) {
6389         MaxVF = VFs[i];
6390         break;
6391       }
6392     }
6393   }
6394   return MaxVF;
6395 }
6396 
// Selects the most profitable vectorization factor in [1, MaxVF]. Each
// candidate VF is scored as the expected cost of one vector iteration divided
// by VF; the scalar loop (VF = 1) is the baseline unless vectorization was
// explicitly forced by the user.
LoopVectorizationCostModel::VectorizationFactor
LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
  float Cost = expectedCost(1).first;
#ifndef NDEBUG
  // ScalarCost exists only in asserts builds; this is safe because every use
  // below sits inside DEBUG(), which also compiles away under NDEBUG.
  const float ScalarCost = Cost;
#endif /* NDEBUG */
  unsigned Width = 1;
  DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");

  bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
  // Ignore scalar width, because the user explicitly wants vectorization.
  // Seeding the search at Width = 2 guarantees the scalar plan cannot win.
  if (ForceVectorization && MaxVF > 1) {
    Width = 2;
    Cost = expectedCost(Width).first / (float)Width;
  }

  // Evaluate every power-of-two width up to MaxVF and keep the cheapest.
  for (unsigned i = 2; i <= MaxVF; i *= 2) {
    // Notice that the vector loop needs to be executed less times, so
    // we need to divide the cost of the vector loops by the width of
    // the vector elements.
    VectorizationCostTy C = expectedCost(i);
    float VectorCost = C.first / (float)i;
    DEBUG(dbgs() << "LV: Vector loop of width " << i
                 << " costs: " << (int)VectorCost << ".\n");
    // C.second is set when at least one instruction would actually be
    // vectorized at this width; without that, the plan is pointless unless
    // the user forced vectorization.
    if (!C.second && !ForceVectorization) {
      DEBUG(
          dbgs() << "LV: Not considering vector loop of width " << i
                 << " because it will not generate any vector instructions.\n");
      continue;
    }
    if (VectorCost < Cost) {
      Cost = VectorCost;
      Width = i;
    }
  }

  DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
        << "LV: Vectorization seems to be not beneficial, "
        << "but was forced by a user.\n");
  DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
  // Report the chosen width together with its (re-scaled) total cost.
  VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)};
  return Factor;
}
6440 
6441 std::pair<unsigned, unsigned>
6442 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
6443   unsigned MinWidth = -1U;
6444   unsigned MaxWidth = 8;
6445   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
6446 
6447   // For each block.
6448   for (BasicBlock *BB : TheLoop->blocks()) {
6449     // For each instruction in the loop.
6450     for (Instruction &I : *BB) {
6451       Type *T = I.getType();
6452 
6453       // Skip ignored values.
6454       if (ValuesToIgnore.count(&I))
6455         continue;
6456 
6457       // Only examine Loads, Stores and PHINodes.
6458       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
6459         continue;
6460 
6461       // Examine PHI nodes that are reduction variables. Update the type to
6462       // account for the recurrence type.
6463       if (auto *PN = dyn_cast<PHINode>(&I)) {
6464         if (!Legal->isReductionVariable(PN))
6465           continue;
6466         RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN];
6467         T = RdxDesc.getRecurrenceType();
6468       }
6469 
6470       // Examine the stored values.
6471       if (auto *ST = dyn_cast<StoreInst>(&I))
6472         T = ST->getValueOperand()->getType();
6473 
6474       // Ignore loaded pointer types and stored pointer types that are not
6475       // vectorizable.
6476       //
6477       // FIXME: The check here attempts to predict whether a load or store will
6478       //        be vectorized. We only know this for certain after a VF has
6479       //        been selected. Here, we assume that if an access can be
6480       //        vectorized, it will be. We should also look at extending this
6481       //        optimization to non-pointer types.
6482       //
6483       if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
6484           !Legal->isAccessInterleaved(&I) && !Legal->isLegalGatherOrScatter(&I))
6485         continue;
6486 
6487       MinWidth = std::min(MinWidth,
6488                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
6489       MaxWidth = std::max(MaxWidth,
6490                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
6491     }
6492   }
6493 
6494   return {MinWidth, MaxWidth};
6495 }
6496 
// Chooses the interleave (unroll) count for the loop, given the selected VF
// and the loop's estimated cost. Returns 1 when interleaving is not
// worthwhile; otherwise an IC clamped by register pressure and the target's
// maximum interleave factor.
unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
                                                           unsigned VF,
                                                           unsigned LoopCost) {

  // -- The interleave heuristics --
  // We interleave the loop in order to expose ILP and reduce the loop overhead.
  // There are many micro-architectural considerations that we can't predict
  // at this level. For example, frontend pressure (on decode or fetch) due to
  // code size, or the number and capabilities of the execution ports.
  //
  // We use the following heuristics to select the interleave count:
  // 1. If the code has reductions, then we interleave to break the cross
  // iteration dependency.
  // 2. If the loop is really small, then we interleave to reduce the loop
  // overhead.
  // 3. We don't interleave if we think that we will spill registers to memory
  // due to the increased register pressure.

  // When we optimize for size, we don't interleave.
  if (OptForSize)
    return 1;

  // We used the distance for the interleave count.
  if (Legal->getMaxSafeDepDistBytes() != -1U)
    return 1;

  // Do not interleave loops with a relatively small trip count.
  // (TC is zero when the trip count is not a compile-time constant.)
  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
  if (TC > 1 && TC < TinyTripCountInterleaveThreshold)
    return 1;

  // VF > 1 selects the vector register file; VF == 1 the scalar one.
  unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1);
  DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
               << " registers\n");

  // Command-line overrides for the register count, for testing.
  if (VF == 1) {
    if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
      TargetNumRegisters = ForceTargetNumScalarRegs;
  } else {
    if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
      TargetNumRegisters = ForceTargetNumVectorRegs;
  }

  RegisterUsage R = calculateRegisterUsage({VF})[0];
  // We divide by these constants so assume that we have at least one
  // instruction that uses at least one register.
  R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);
  R.NumInstructions = std::max(R.NumInstructions, 1U);

  // We calculate the interleave count using the following formula.
  // Subtract the number of loop invariants from the number of available
  // registers. These registers are used by all of the interleaved instances.
  // Next, divide the remaining registers by the number of registers that is
  // required by the loop, in order to estimate how many parallel instances
  // fit without causing spills. All of this is rounded down if necessary to be
  // a power of two. We want power of two interleave count to simplify any
  // addressing operations or alignment considerations.
  unsigned IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) /
                              R.MaxLocalUsers);

  // Don't count the induction variable as interleaved.
  if (EnableIndVarRegisterHeur)
    IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) /
                       std::max(1U, (R.MaxLocalUsers - 1)));

  // Clamp the interleave ranges to reasonable counts.
  unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);

  // Check if the user has overridden the max.
  if (VF == 1) {
    if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
      MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
  } else {
    if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
      MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
  }

  // If we did not calculate the cost for VF (because the user selected the VF)
  // then we calculate the cost of VF here.
  if (LoopCost == 0)
    LoopCost = expectedCost(VF).first;

  // Clamp the calculated IC to be between the 1 and the max interleave count
  // that the target allows.
  if (IC > MaxInterleaveCount)
    IC = MaxInterleaveCount;
  else if (IC < 1)
    IC = 1;

  // Interleave if we vectorized this loop and there is a reduction that could
  // benefit from interleaving.
  if (VF > 1 && Legal->getReductionVars()->size()) {
    DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
    return IC;
  }

  // Note that if we've already vectorized the loop we will have done the
  // runtime check and so interleaving won't require further checks.
  bool InterleavingRequiresRuntimePointerCheck =
      (VF == 1 && Legal->getRuntimePointerChecking()->Need);

  // We want to interleave small loops in order to reduce the loop overhead and
  // potentially expose ILP opportunities.
  DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
  if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
    // We assume that the cost overhead is 1 and we use the cost model
    // to estimate the cost of the loop and interleave until the cost of the
    // loop overhead is about 5% of the cost of the loop.
    unsigned SmallIC =
        std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));

    // Interleave until store/load ports (estimated by max interleave count) are
    // saturated.
    unsigned NumStores = Legal->getNumStores();
    unsigned NumLoads = Legal->getNumLoads();
    unsigned StoresIC = IC / (NumStores ? NumStores : 1);
    unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);

    // If we have a scalar reduction (vector reductions are already dealt with
    // by this point), we can increase the critical path length if the loop
    // we're interleaving is inside another loop. Limit, by default to 2, so the
    // critical path only gets increased by one reduction operation.
    if (Legal->getReductionVars()->size() && TheLoop->getLoopDepth() > 1) {
      unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
      SmallIC = std::min(SmallIC, F);
      StoresIC = std::min(StoresIC, F);
      LoadsIC = std::min(LoadsIC, F);
    }

    if (EnableLoadStoreRuntimeInterleave &&
        std::max(StoresIC, LoadsIC) > SmallIC) {
      DEBUG(dbgs() << "LV: Interleaving to saturate store or load ports.\n");
      return std::max(StoresIC, LoadsIC);
    }

    DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
    return SmallIC;
  }

  // Interleave if this is a large loop (small loops are already dealt with by
  // this point) that could benefit from interleaving.
  bool HasReductions = (Legal->getReductionVars()->size() > 0);
  if (TTI.enableAggressiveInterleaving(HasReductions)) {
    DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
    return IC;
  }

  DEBUG(dbgs() << "LV: Not Interleaving.\n");
  return 1;
}
6647 
// Estimates, for each candidate VF in VFs, the maximum number of registers
// simultaneously live in the loop (a live-interval analysis over an RPO
// numbering of the loop body), plus the number of loop-invariant values.
SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
  // This function calculates the register usage by measuring the highest number
  // of values that are alive at a single location. Obviously, this is a very
  // rough estimation. We scan the loop in a topological order in order and
  // assign a number to each instruction. We use RPO to ensure that defs are
  // met before their users. We assume that each instruction that has in-loop
  // users starts an interval. We record every time that an in-loop value is
  // used, so we have a list of the first and last occurrences of each
  // instruction. Next, we transpose this data structure into a multi map that
  // holds the list of intervals that *end* at a specific location. This multi
  // map allows us to perform a linear search. We scan the instructions linearly
  // and record each time that a new interval starts, by placing it in a set.
  // If we find this value in the multi-map then we remove it from the set.
  // The max register usage is the maximum size of the set.
  // We also search for instructions that are defined outside the loop, but are
  // used inside the loop. We need this number separately from the max-interval
  // usage number because when we unroll, loop-invariant values do not take
  // more register.
  LoopBlocksDFS DFS(TheLoop);
  DFS.perform(LI);

  RegisterUsage RU;
  RU.NumInstructions = 0;

  // Each 'key' in the map opens a new interval. The values
  // of the map are the index of the 'last seen' usage of the
  // instruction that is the key.
  typedef DenseMap<Instruction *, unsigned> IntervalMap;
  // Maps an instruction index (the RPO position assigned below) back to the
  // instruction itself.
  DenseMap<unsigned, Instruction *> IdxToInstr;
  // Marks the end of each interval.
  IntervalMap EndPoint;
  // Holds the instructions that have at least one in-loop user (i.e., those
  // that actually open a live interval).
  SmallSet<Instruction *, 8> Ends;
  // Saves the list of values that are used in the loop but are
  // defined outside the loop, such as arguments and constants.
  SmallPtrSet<Value *, 8> LoopInvariants;

  // First pass: number every instruction in RPO and record, for each value,
  // the index of its last in-loop use.
  unsigned Index = 0;
  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    RU.NumInstructions += BB->size();
    for (Instruction &I : *BB) {
      IdxToInstr[Index++] = &I;

      // Save the end location of each USE.
      for (Value *U : I.operands()) {
        auto *Instr = dyn_cast<Instruction>(U);

        // Ignore non-instruction values such as arguments, constants, etc.
        if (!Instr)
          continue;

        // If this instruction is outside the loop then record it and continue.
        if (!TheLoop->contains(Instr)) {
          LoopInvariants.insert(Instr);
          continue;
        }

        // Overwrite previous end points.
        EndPoint[Instr] = Index;
        Ends.insert(Instr);
      }
    }
  }

  // Saves the list of intervals that end with the index in 'key'.
  typedef SmallVector<Instruction *, 2> InstrList;
  DenseMap<unsigned, InstrList> TransposeEnds;

  // Transpose the EndPoints to a list of values that end at each index.
  for (auto &Interval : EndPoint)
    TransposeEnds[Interval.second].push_back(Interval.first);

  SmallSet<Instruction *, 8> OpenIntervals;

  // Get the size of the widest register. (-1U means "unbounded" when LAA
  // reported no limiting dependence distance.)
  unsigned MaxSafeDepDist = -1U;
  if (Legal->getMaxSafeDepDistBytes() != -1U)
    MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
  unsigned WidestRegister =
      std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
  const DataLayout &DL = TheFunction->getParent()->getDataLayout();

  SmallVector<RegisterUsage, 8> RUs(VFs.size());
  SmallVector<unsigned, 8> MaxUsages(VFs.size(), 0);

  DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");

  // A lambda that gets the register usage for the given type and VF: how many
  // registers a <VF x Ty> value occupies, at least one.
  auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
    if (Ty->isTokenTy())
      return 0U;
    unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
    return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
  };

  // Second pass: sweep the numbered instructions, maintaining the set of
  // open intervals, and record the peak usage for each VF.
  for (unsigned int i = 0; i < Index; ++i) {
    Instruction *I = IdxToInstr[i];

    // Remove all of the instructions that end at this location.
    InstrList &List = TransposeEnds[i];
    for (Instruction *ToRemove : List)
      OpenIntervals.erase(ToRemove);

    // Ignore instructions that are never used within the loop.
    if (!Ends.count(I))
      continue;

    // Skip ignored values.
    if (ValuesToIgnore.count(I))
      continue;

    // For each VF find the maximum usage of registers.
    for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
      if (VFs[j] == 1) {
        MaxUsages[j] = std::max(MaxUsages[j], OpenIntervals.size());
        continue;
      }
      collectUniformsAndScalars(VFs[j]);
      // Count the number of live intervals.
      unsigned RegUsage = 0;
      for (auto Inst : OpenIntervals) {
        // Skip ignored values for VF > 1.
        if (VecValuesToIgnore.count(Inst) ||
            isScalarAfterVectorization(Inst, VFs[j]))
          continue;
        RegUsage += GetRegUsage(Inst->getType(), VFs[j]);
      }
      MaxUsages[j] = std::max(MaxUsages[j], RegUsage);
    }

    DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
                 << OpenIntervals.size() << '\n');

    // Add the current instruction to the list of open intervals.
    OpenIntervals.insert(I);
  }

  // Finally, account for the loop-invariant values and package the results,
  // one RegisterUsage per candidate VF.
  for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
    unsigned Invariant = 0;
    if (VFs[i] == 1)
      Invariant = LoopInvariants.size();
    else {
      for (auto Inst : LoopInvariants)
        Invariant += GetRegUsage(Inst->getType(), VFs[i]);
    }

    DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n');
    DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n');
    DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n');
    DEBUG(dbgs() << "LV(REG): LoopSize: " << RU.NumInstructions << '\n');

    RU.LoopInvariantRegs = Invariant;
    RU.MaxLocalUsers = MaxUsages[i];
    RUs[i] = RU;
  }

  return RUs;
}
6808 
6809 void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
6810 
6811   // If we aren't vectorizing the loop, or if we've already collected the
6812   // instructions to scalarize, there's nothing to do. Collection may already
6813   // have occurred if we have a user-selected VF and are now computing the
6814   // expected cost for interleaving.
6815   if (VF < 2 || InstsToScalarize.count(VF))
6816     return;
6817 
6818   // Initialize a mapping for VF in InstsToScalalarize. If we find that it's
6819   // not profitable to scalarize any instructions, the presence of VF in the
6820   // map will indicate that we've analyzed it already.
6821   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
6822 
6823   // Find all the instructions that are scalar with predication in the loop and
6824   // determine if it would be better to not if-convert the blocks they are in.
6825   // If so, we also record the instructions to scalarize.
6826   for (BasicBlock *BB : TheLoop->blocks()) {
6827     if (!Legal->blockNeedsPredication(BB))
6828       continue;
6829     for (Instruction &I : *BB)
6830       if (Legal->isScalarWithPredication(&I)) {
6831         ScalarCostsTy ScalarCosts;
6832         if (computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
6833           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
6834 
6835         // Remember that BB will remain after vectorization.
6836         PredicatedBBsAfterVectorization.insert(BB);
6837       }
6838   }
6839 }
6840 
// Computes the cost discount obtained by scalarizing PredInst and the
// single-use expression chain feeding it, instead of vectorizing and
// if-converting. A non-negative return value means scalarizing is at least as
// cheap; the per-instruction scalar costs are recorded in ScalarCosts.
int LoopVectorizationCostModel::computePredInstDiscount(
    Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
    unsigned VF) {

  assert(!isUniformAfterVectorization(PredInst, VF) &&
         "Instruction marked uniform-after-vectorization will be predicated");

  // Initialize the discount to zero, meaning that the scalar version and the
  // vector version cost the same.
  int Discount = 0;

  // Holds instructions to analyze. The instructions we visit are mapped in
  // ScalarCosts. Those instructions are the ones that would be scalarized if
  // we find that the scalar version costs less.
  SmallVector<Instruction *, 8> Worklist;

  // Returns true if the given instruction can be scalarized.
  auto canBeScalarized = [&](Instruction *I) -> bool {

    // We only attempt to scalarize instructions forming a single-use chain
    // from the original predicated block that would otherwise be vectorized.
    // Although not strictly necessary, we give up on instructions we know will
    // already be scalar to avoid traversing chains that are unlikely to be
    // beneficial.
    if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
        isScalarAfterVectorization(I, VF))
      return false;

    // If the instruction is scalar with predication, it will be analyzed
    // separately. We ignore it within the context of PredInst.
    if (Legal->isScalarWithPredication(I))
      return false;

    // If any of the instruction's operands are uniform after vectorization,
    // the instruction cannot be scalarized. This prevents, for example, a
    // masked load from being scalarized.
    //
    // We assume we will only emit a value for lane zero of an instruction
    // marked uniform after vectorization, rather than VF identical values.
    // Thus, if we scalarize an instruction that uses a uniform, we would
    // create uses of values corresponding to the lanes we aren't emitting code
    // for. This behavior can be changed by allowing getScalarValue to clone
    // the lane zero values for uniforms rather than asserting.
    for (Use &U : I->operands())
      if (auto *J = dyn_cast<Instruction>(U.get()))
        if (isUniformAfterVectorization(J, VF))
          return false;

    // Otherwise, we can scalarize the instruction.
    return true;
  };

  // Returns true if an operand that cannot be scalarized must be extracted
  // from a vector. We will account for this scalarization overhead below. Note
  // that the non-void predicated instructions are placed in their own blocks,
  // and their return values are inserted into vectors. Thus, an extract would
  // still be required.
  auto needsExtract = [&](Instruction *I) -> bool {
    return TheLoop->contains(I) && !isScalarAfterVectorization(I, VF);
  };

  // Compute the expected cost discount from scalarizing the entire expression
  // feeding the predicated instruction. We currently only consider expressions
  // that are single-use instruction chains.
  Worklist.push_back(PredInst);
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();

    // If we've already analyzed the instruction, there's nothing to do.
    if (ScalarCosts.count(I))
      continue;

    // Compute the cost of the vector instruction. Note that this cost already
    // includes the scalarization overhead of the predicated instruction.
    unsigned VectorCost = getInstructionCost(I, VF).first;

    // Compute the cost of the scalarized instruction. This cost is the cost of
    // the instruction as if it wasn't if-converted and instead remained in the
    // predicated block. We will scale this cost by block probability after
    // computing the scalarization overhead.
    unsigned ScalarCost = VF * getInstructionCost(I, 1).first;

    // Compute the scalarization overhead of needed insertelement instructions
    // and phi nodes.
    if (Legal->isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
      ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF),
                                                 true, false);
      ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
    }

    // Compute the scalarization overhead of needed extractelement
    // instructions. For each of the instruction's operands, if the operand can
    // be scalarized, add it to the worklist; otherwise, account for the
    // overhead.
    for (Use &U : I->operands())
      if (auto *J = dyn_cast<Instruction>(U.get())) {
        assert(VectorType::isValidElementType(J->getType()) &&
               "Instruction has non-scalar type");
        if (canBeScalarized(J))
          Worklist.push_back(J);
        else if (needsExtract(J))
          ScalarCost += TTI.getScalarizationOverhead(
                              ToVectorTy(J->getType(),VF), false, true);
      }

    // Scale the total scalar cost by block probability.
    ScalarCost /= getReciprocalPredBlockProb();

    // Compute the discount. A non-negative discount means the vector version
    // of the instruction costs more, and scalarizing would be beneficial.
    Discount += VectorCost - ScalarCost;
    ScalarCosts[I] = ScalarCost;
  }

  return Discount;
}
6957 
6958 LoopVectorizationCostModel::VectorizationCostTy
6959 LoopVectorizationCostModel::expectedCost(unsigned VF) {
6960   VectorizationCostTy Cost;
6961 
6962   // Collect Uniform and Scalar instructions after vectorization with VF.
6963   collectUniformsAndScalars(VF);
6964 
6965   // Collect the instructions (and their associated costs) that will be more
6966   // profitable to scalarize.
6967   collectInstsToScalarize(VF);
6968 
6969   // For each block.
6970   for (BasicBlock *BB : TheLoop->blocks()) {
6971     VectorizationCostTy BlockCost;
6972 
6973     // For each instruction in the old loop.
6974     for (Instruction &I : *BB) {
6975       // Skip dbg intrinsics.
6976       if (isa<DbgInfoIntrinsic>(I))
6977         continue;
6978 
6979       // Skip ignored values.
6980       if (ValuesToIgnore.count(&I))
6981         continue;
6982 
6983       VectorizationCostTy C = getInstructionCost(&I, VF);
6984 
6985       // Check if we should override the cost.
6986       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
6987         C.first = ForceTargetInstructionCost;
6988 
6989       BlockCost.first += C.first;
6990       BlockCost.second |= C.second;
6991       DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first << " for VF "
6992                    << VF << " For instruction: " << I << '\n');
6993     }
6994 
6995     // If we are vectorizing a predicated block, it will have been
6996     // if-converted. This means that the block's instructions (aside from
6997     // stores and instructions that may divide by zero) will now be
6998     // unconditionally executed. For the scalar case, we may not always execute
6999     // the predicated block. Thus, scale the block's cost by the probability of
7000     // executing it.
7001     if (VF == 1 && Legal->blockNeedsPredication(BB))
7002       BlockCost.first /= getReciprocalPredBlockProb();
7003 
7004     Cost.first += BlockCost.first;
7005     Cost.second |= BlockCost.second;
7006   }
7007 
7008   return Cost;
7009 }
7010 
7011 /// \brief Gets Address Access SCEV after verifying that the access pattern
7012 /// is loop invariant except the induction variable dependence.
7013 ///
7014 /// This SCEV can be sent to the Target in order to estimate the address
7015 /// calculation cost.
7016 static const SCEV *getAddressAccessSCEV(
7017               Value *Ptr,
7018               LoopVectorizationLegality *Legal,
7019               ScalarEvolution *SE,
7020               const Loop *TheLoop) {
7021   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
7022   if (!Gep)
7023     return nullptr;
7024 
7025   // We are looking for a gep with all loop invariant indices except for one
7026   // which should be an induction variable.
7027   unsigned NumOperands = Gep->getNumOperands();
7028   for (unsigned i = 1; i < NumOperands; ++i) {
7029     Value *Opd = Gep->getOperand(i);
7030     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
7031         !Legal->isInductionVariable(Opd))
7032       return nullptr;
7033   }
7034 
7035   // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV.
7036   return SE->getSCEV(Ptr);
7037 }
7038 
7039 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
7040   return Legal->hasStride(I->getOperand(0)) ||
7041          Legal->hasStride(I->getOperand(1));
7042 }
7043 
7044 unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
7045                                                                  unsigned VF) {
7046   Type *ValTy = getMemInstValueType(I);
7047   auto SE = PSE.getSE();
7048 
7049   unsigned Alignment = getMemInstAlignment(I);
7050   unsigned AS = getMemInstAddressSpace(I);
7051   Value *Ptr = getPointerOperand(I);
7052   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
7053 
7054   // Figure out whether the access is strided and get the stride value
7055   // if it's known in compile time
7056   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, SE, TheLoop);
7057 
7058   // Get the cost of the scalar memory instruction and address computation.
7059   unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
7060 
7061   Cost += VF *
7062           TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
7063                               AS, I);
7064 
7065   // Get the overhead of the extractelement and insertelement instructions
7066   // we might create due to scalarization.
7067   Cost += getScalarizationOverhead(I, VF, TTI);
7068 
7069   // If we have a predicated store, it may not be executed for each vector
7070   // lane. Scale the cost by the probability of executing the predicated
7071   // block.
7072   if (Legal->isScalarWithPredication(I))
7073     Cost /= getReciprocalPredBlockProb();
7074 
7075   return Cost;
7076 }
7077 
7078 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
7079                                                              unsigned VF) {
7080   Type *ValTy = getMemInstValueType(I);
7081   Type *VectorTy = ToVectorTy(ValTy, VF);
7082   unsigned Alignment = getMemInstAlignment(I);
7083   Value *Ptr = getPointerOperand(I);
7084   unsigned AS = getMemInstAddressSpace(I);
7085   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
7086 
7087   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
7088          "Stride should be 1 or -1 for consecutive memory access");
7089   unsigned Cost = 0;
7090   if (Legal->isMaskRequired(I))
7091     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
7092   else
7093     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I);
7094 
7095   bool Reverse = ConsecutiveStride < 0;
7096   if (Reverse)
7097     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
7098   return Cost;
7099 }
7100 
7101 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
7102                                                          unsigned VF) {
7103   LoadInst *LI = cast<LoadInst>(I);
7104   Type *ValTy = LI->getType();
7105   Type *VectorTy = ToVectorTy(ValTy, VF);
7106   unsigned Alignment = LI->getAlignment();
7107   unsigned AS = LI->getPointerAddressSpace();
7108 
7109   return TTI.getAddressComputationCost(ValTy) +
7110          TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) +
7111          TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
7112 }
7113 
7114 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
7115                                                           unsigned VF) {
7116   Type *ValTy = getMemInstValueType(I);
7117   Type *VectorTy = ToVectorTy(ValTy, VF);
7118   unsigned Alignment = getMemInstAlignment(I);
7119   Value *Ptr = getPointerOperand(I);
7120 
7121   return TTI.getAddressComputationCost(VectorTy) +
7122          TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
7123                                     Legal->isMaskRequired(I), Alignment);
7124 }
7125 
7126 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
7127                                                             unsigned VF) {
7128   Type *ValTy = getMemInstValueType(I);
7129   Type *VectorTy = ToVectorTy(ValTy, VF);
7130   unsigned AS = getMemInstAddressSpace(I);
7131 
7132   auto Group = Legal->getInterleavedAccessGroup(I);
7133   assert(Group && "Fail to get an interleaved access group.");
7134 
7135   unsigned InterleaveFactor = Group->getFactor();
7136   Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
7137 
7138   // Holds the indices of existing members in an interleaved load group.
7139   // An interleaved store group doesn't need this as it doesn't allow gaps.
7140   SmallVector<unsigned, 4> Indices;
7141   if (isa<LoadInst>(I)) {
7142     for (unsigned i = 0; i < InterleaveFactor; i++)
7143       if (Group->getMember(i))
7144         Indices.push_back(i);
7145   }
7146 
7147   // Calculate the cost of the whole interleaved group.
7148   unsigned Cost = TTI.getInterleavedMemoryOpCost(I->getOpcode(), WideVecTy,
7149                                                  Group->getFactor(), Indices,
7150                                                  Group->getAlignment(), AS);
7151 
7152   if (Group->isReverse())
7153     Cost += Group->getNumMembers() *
7154             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
7155   return Cost;
7156 }
7157 
7158 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
7159                                                               unsigned VF) {
7160 
7161   // Calculate scalar cost only. Vectorization cost should be ready at this
7162   // moment.
7163   if (VF == 1) {
7164     Type *ValTy = getMemInstValueType(I);
7165     unsigned Alignment = getMemInstAlignment(I);
7166     unsigned AS = getMemInstAddressSpace(I);
7167 
7168     return TTI.getAddressComputationCost(ValTy) +
7169            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I);
7170   }
7171   return getWideningCost(I, VF);
7172 }
7173 
7174 LoopVectorizationCostModel::VectorizationCostTy
7175 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
7176   // If we know that this instruction will remain uniform, check the cost of
7177   // the scalar version.
7178   if (isUniformAfterVectorization(I, VF))
7179     VF = 1;
7180 
7181   if (VF > 1 && isProfitableToScalarize(I, VF))
7182     return VectorizationCostTy(InstsToScalarize[VF][I], false);
7183 
7184   // Forced scalars do not have any scalarization overhead.
7185   if (VF > 1 && ForcedScalars.count(VF) &&
7186       ForcedScalars.find(VF)->second.count(I))
7187     return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false);
7188 
7189   Type *VectorTy;
7190   unsigned C = getInstructionCost(I, VF, VectorTy);
7191 
7192   bool TypeNotScalarized =
7193       VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF;
7194   return VectorizationCostTy(C, TypeNotScalarized);
7195 }
7196 
/// Make and record a widening decision (widen, interleave, gather/scatter,
/// or scalarize) together with its cost for every memory instruction in the
/// loop at vectorization factor \p VF, so that later cost queries and code
/// generation agree on a single strategy per instruction.
void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
  // Scalar loops need no widening decisions.
  if (VF == 1)
    return;
  for (BasicBlock *BB : TheLoop->blocks()) {
    // For each instruction in the old loop.
    for (Instruction &I : *BB) {
      // Only memory instructions have a pointer operand; skip the rest.
      Value *Ptr = getPointerOperand(&I);
      if (!Ptr)
        continue;

      if (isa<LoadInst>(&I) && Legal->isUniform(Ptr)) {
        // Scalar load + broadcast
        unsigned Cost = getUniformMemOpCost(&I, VF);
        setWideningDecision(&I, VF, CM_Scalarize, Cost);
        continue;
      }

      // We assume that widening is the best solution when possible.
      if (Legal->memoryInstructionCanBeWidened(&I, VF)) {
        unsigned Cost = getConsecutiveMemOpCost(&I, VF);
        setWideningDecision(&I, VF, CM_Widen, Cost);
        continue;
      }

      // Choose between Interleaving, Gather/Scatter or Scalarization.
      // UINT_MAX marks a strategy as unavailable in the comparisons below.
      unsigned InterleaveCost = UINT_MAX;
      unsigned NumAccesses = 1;
      if (Legal->isAccessInterleaved(&I)) {
        auto Group = Legal->getInterleavedAccessGroup(&I);
        assert(Group && "Fail to get an interleaved access group.");

        // Make one decision for the whole group.
        if (getWideningDecision(&I, VF) != CM_Unknown)
          continue;

        // Group costs cover all members, so scale the per-access
        // alternatives by the member count for a fair comparison.
        NumAccesses = Group->getNumMembers();
        InterleaveCost = getInterleaveGroupCost(&I, VF);
      }

      unsigned GatherScatterCost =
          Legal->isLegalGatherOrScatter(&I)
              ? getGatherScatterCost(&I, VF) * NumAccesses
              : UINT_MAX;

      unsigned ScalarizationCost =
          getMemInstScalarizationCost(&I, VF) * NumAccesses;

      // Choose better solution for the current VF,
      // write down this decision and use it during vectorization.
      // Ties between interleaving and gather/scatter favor interleaving;
      // ties with scalarization favor scalarization.
      unsigned Cost;
      InstWidening Decision;
      if (InterleaveCost <= GatherScatterCost &&
          InterleaveCost < ScalarizationCost) {
        Decision = CM_Interleave;
        Cost = InterleaveCost;
      } else if (GatherScatterCost < ScalarizationCost) {
        Decision = CM_GatherScatter;
        Cost = GatherScatterCost;
      } else {
        Decision = CM_Scalarize;
        Cost = ScalarizationCost;
      }
      // If the instructions belongs to an interleave group, the whole group
      // receives the same decision. The whole group receives the cost, but
      // the cost will actually be assigned to one instruction.
      if (auto Group = Legal->getInterleavedAccessGroup(&I))
        setWideningDecision(Group, VF, Decision, Cost);
      else
        setWideningDecision(&I, VF, Decision, Cost);
    }
  }

  // Make sure that any load of address and any other address computation
  // remains scalar unless there is gather/scatter support. This avoids
  // inevitable extracts into address registers, and also has the benefit of
  // activating LSR more, since that pass can't optimize vectorized
  // addresses.
  if (TTI.prefersVectorizedAddressing())
    return;

  // Start with all scalar pointer uses.
  SmallPtrSet<Instruction *, 8> AddrDefs;
  for (BasicBlock *BB : TheLoop->blocks())
    for (Instruction &I : *BB) {
      Instruction *PtrDef =
        dyn_cast_or_null<Instruction>(getPointerOperand(&I));
      if (PtrDef && TheLoop->contains(PtrDef) &&
          getWideningDecision(&I, VF) != CM_GatherScatter)
        AddrDefs.insert(PtrDef);
    }

  // Add all instructions used to generate the addresses.
  // Note: the closure only follows operands defined in the same basic block
  // and stops at phi nodes.
  SmallVector<Instruction *, 4> Worklist;
  for (auto *I : AddrDefs)
    Worklist.push_back(I);
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();
    for (auto &Op : I->operands())
      if (auto *InstOp = dyn_cast<Instruction>(Op))
        if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
            AddrDefs.insert(InstOp).second == true)
          Worklist.push_back(InstOp);
  }

  for (auto *I : AddrDefs) {
    if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled in
      // by cost functions, but since this involves the task of finding out
      // if the loaded register is involved in an address computation, it is
      // instead changed here when we know this is the case.
      if (getWideningDecision(I, VF) == CM_Widen)
        // Scalarize a widened load of address.
        setWideningDecision(I, VF, CM_Scalarize,
                            (VF * getMemoryInstructionCost(I, 1)));
      else if (auto Group = Legal->getInterleavedAccessGroup(I)) {
        // Scalarize an interleave group of address loads.
        for (unsigned I = 0; I < Group->getFactor(); ++I) {
          if (Instruction *Member = Group->getMember(I))
            setWideningDecision(Member, VF, CM_Scalarize,
                                (VF * getMemoryInstructionCost(Member, 1)));
        }
      }
    } else
      // Make sure I gets scalarized and a cost estimate without
      // scalarization overhead.
      ForcedScalars[VF].insert(I);
  }
}
7325 
/// Compute the cost of instruction \p I at vectorization factor \p VF.
/// \p VectorTy is an out-parameter set to the type the instruction will be
/// costed with (scalar type if the instruction stays scalar, vector type
/// otherwise); some cases below adjust it further.
unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
                                                        unsigned VF,
                                                        Type *&VectorTy) {
  Type *RetTy = I->getType();
  // Instructions scheduled for bit-width minimization are costed at the
  // narrowed type.
  if (canTruncateToMinimalBitwidth(I, VF))
    RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
  VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
  auto SE = PSE.getSE();

  // TODO: We need to estimate the cost of intrinsic calls.
  switch (I->getOpcode()) {
  case Instruction::GetElementPtr:
    // We mark this instruction as zero-cost because the cost of GEPs in
    // vectorized code depends on whether the corresponding memory instruction
    // is scalarized or not. Therefore, we handle GEPs with the memory
    // instruction cost.
    return 0;
  case Instruction::Br: {
    // In cases of scalarized and predicated instructions, there will be VF
    // predicated blocks in the vectorized loop. Each branch around these
    // blocks requires also an extract of its vector compare i1 element.
    bool ScalarPredicatedBB = false;
    BranchInst *BI = cast<BranchInst>(I);
    if (VF > 1 && BI->isConditional() &&
        (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
         PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
      ScalarPredicatedBB = true;

    if (ScalarPredicatedBB) {
      // Return cost for branches around scalarized and predicated blocks.
      Type *Vec_i1Ty =
          VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
      return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) +
              (TTI.getCFInstrCost(Instruction::Br) * VF));
    } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1)
      // The back-edge branch will remain, as will all scalar branches.
      return TTI.getCFInstrCost(Instruction::Br);
    else
      // This branch will be eliminated by if-conversion.
      return 0;
    // Note: We currently assume zero cost for an unconditional branch inside
    // a predicated block since it will become a fall-through, although we
    // may decide in the future to call TTI for all branches.
  }
  case Instruction::PHI: {
    auto *Phi = cast<PHINode>(I);

    // First-order recurrences are replaced by vector shuffles inside the loop.
    if (VF > 1 && Legal->isFirstOrderRecurrence(Phi))
      return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
                                VectorTy, VF - 1, VectorTy);

    // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
    // converted into select instructions. We require N - 1 selects per phi
    // node, where N is the number of incoming values.
    if (VF > 1 && Phi->getParent() != TheLoop->getHeader())
      return (Phi->getNumIncomingValues() - 1) *
             TTI.getCmpSelInstrCost(
                 Instruction::Select, ToVectorTy(Phi->getType(), VF),
                 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF));

    return TTI.getCFInstrCost(Instruction::PHI);
  }
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
    // If we have a predicated instruction, it may not be executed for each
    // vector lane. Get the scalarization cost and scale this amount by the
    // probability of executing the predicated block. If the instruction is not
    // predicated, we fall through to the next case.
    if (VF > 1 && Legal->isScalarWithPredication(I)) {
      unsigned Cost = 0;

      // These instructions have a non-void type, so account for the phi nodes
      // that we will create. This cost is likely to be zero. The phi node
      // cost, if any, should be scaled by the block probability because it
      // models a copy at the end of each predicated block.
      Cost += VF * TTI.getCFInstrCost(Instruction::PHI);

      // The cost of the non-predicated instruction.
      Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy);

      // The cost of insertelement and extractelement instructions needed for
      // scalarization.
      Cost += getScalarizationOverhead(I, VF, TTI);

      // Scale the cost by the probability of executing the predicated blocks.
      // This assumes the predicated block for each vector lane is equally
      // likely.
      return Cost / getReciprocalPredBlockProb();
    }
    LLVM_FALLTHROUGH;
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Since we will replace the stride by 1 the multiplication should go away.
    if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
      return 0;
    // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this are shifts on x86.
    TargetTransformInfo::OperandValueKind Op1VK =
        TargetTransformInfo::OK_AnyValue;
    TargetTransformInfo::OperandValueKind Op2VK =
        TargetTransformInfo::OK_AnyValue;
    TargetTransformInfo::OperandValueProperties Op1VP =
        TargetTransformInfo::OP_None;
    TargetTransformInfo::OperandValueProperties Op2VP =
        TargetTransformInfo::OP_None;
    Value *Op2 = I->getOperand(1);

    // Check for a splat or for a non uniform vector of constants.
    if (isa<ConstantInt>(Op2)) {
      ConstantInt *CInt = cast<ConstantInt>(Op2);
      if (CInt && CInt->getValue().isPowerOf2())
        Op2VP = TargetTransformInfo::OP_PowerOf2;
      Op2VK = TargetTransformInfo::OK_UniformConstantValue;
    } else if (isa<ConstantVector>(Op2) || isa<ConstantDataVector>(Op2)) {
      Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
      Constant *SplatValue = cast<Constant>(Op2)->getSplatValue();
      if (SplatValue) {
        ConstantInt *CInt = dyn_cast<ConstantInt>(SplatValue);
        if (CInt && CInt->getValue().isPowerOf2())
          Op2VP = TargetTransformInfo::OP_PowerOf2;
        Op2VK = TargetTransformInfo::OK_UniformConstantValue;
      }
    } else if (Legal->isUniform(Op2)) {
      Op2VK = TargetTransformInfo::OK_UniformValue;
    }
    SmallVector<const Value *, 4> Operands(I->operand_values());
    // An instruction that stays scalar is replicated VF times.
    unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
    return N * TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK,
                                          Op2VK, Op1VP, Op2VP, Operands);
  }
  case Instruction::Select: {
    SelectInst *SI = cast<SelectInst>(I);
    // A loop-invariant condition stays scalar; otherwise it is widened too.
    const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
    bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
    Type *CondTy = SI->getCondition()->getType();
    if (!ScalarCond)
      CondTy = VectorType::get(CondTy, VF);

    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I);
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    Type *ValTy = I->getOperand(0)->getType();
    // If the compared value will be shrunk, cost the compare at the
    // minimized bit width.
    Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
    if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
      ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
    VectorTy = ToVectorTy(ValTy, VF);
    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I);
  }
  case Instruction::Store:
  case Instruction::Load: {
    // Use the scalar type for VectorTy when the access will be scalarized.
    unsigned Width = VF;
    if (Width > 1) {
      InstWidening Decision = getWideningDecision(I, Width);
      assert(Decision != CM_Unknown &&
             "CM decision should be taken at this point");
      if (Decision == CM_Scalarize)
        Width = 1;
    }
    VectorTy = ToVectorTy(getMemInstValueType(I), Width);
    return getMemoryInstructionCost(I, VF);
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    // We optimize the truncation of induction variables having constant
    // integer steps. The cost of these truncations is the same as the scalar
    // operation.
    if (isOptimizableIVTruncate(I, VF)) {
      auto *Trunc = cast<TruncInst>(I);
      return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
                                  Trunc->getSrcTy(), Trunc);
    }

    Type *SrcScalarTy = I->getOperand(0)->getType();
    Type *SrcVecTy =
        VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
    if (canTruncateToMinimalBitwidth(I, VF)) {
      // This cast is going to be shrunk. This may remove the cast or it might
      // turn it into slightly different cast. For example, if MinBW == 16,
      // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
      //
      // Calculate the modified src and dest types.
      Type *MinVecTy = VectorTy;
      if (I->getOpcode() == Instruction::Trunc) {
        SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
        VectorTy =
            largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
      } else if (I->getOpcode() == Instruction::ZExt ||
                 I->getOpcode() == Instruction::SExt) {
        SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
        VectorTy =
            smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
      }
    }

    // A cast that stays scalar is replicated VF times.
    unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
    return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I);
  }
  case Instruction::Call: {
    // Cost the call as a vector library call, and if the call maps to a
    // vector intrinsic, take the cheaper of the two.
    bool NeedToScalarize;
    CallInst *CI = cast<CallInst>(I);
    unsigned CallCost = getVectorCallCost(CI, VF, TTI, TLI, NeedToScalarize);
    if (getVectorIntrinsicIDForCall(CI, TLI))
      return std::min(CallCost, getVectorIntrinsicCost(CI, VF, TTI, TLI));
    return CallCost;
  }
  default:
    // The cost of executing VF copies of the scalar instruction. This opcode
    // is unknown. Assume that it is the same as 'mul'.
    return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) +
           getScalarizationOverhead(I, VF, TTI);
  } // end of switch.
}
7564 
// Legacy pass-manager registration: define the pass identifier and register
// LoopVectorize along with the analysis passes it depends on.
char LoopVectorize::ID = 0;
static const char lv_name[] = "Loop Vectorization";
INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
7581 
namespace llvm {
// Factory function that creates a LoopVectorize pass instance with the given
// options (forwarded unchanged to the pass constructor).
Pass *createLoopVectorizePass(bool NoUnrolling, bool AlwaysVectorize) {
  return new LoopVectorize(NoUnrolling, AlwaysVectorize);
}
}
7587 
7588 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
7589 
7590   // Check if the pointer operand of a load or store instruction is
7591   // consecutive.
7592   if (auto *Ptr = getPointerOperand(Inst))
7593     return Legal->isConsecutivePtr(Ptr);
7594   return false;
7595 }
7596 
7597 void LoopVectorizationCostModel::collectValuesToIgnore() {
7598   // Ignore ephemeral values.
7599   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
7600 
7601   // Ignore type-promoting instructions we identified during reduction
7602   // detection.
7603   for (auto &Reduction : *Legal->getReductionVars()) {
7604     RecurrenceDescriptor &RedDes = Reduction.second;
7605     SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
7606     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7607   }
7608 }
7609 
7610 LoopVectorizationCostModel::VectorizationFactor
7611 LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) {
7612 
7613   // Width 1 means no vectorize, cost 0 means uncomputed cost.
7614   const LoopVectorizationCostModel::VectorizationFactor NoVectorization = {1U,
7615                                                                            0U};
7616   Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(OptForSize);
7617   if (!MaybeMaxVF.hasValue()) // Cases considered too costly to vectorize.
7618     return NoVectorization;
7619 
7620   if (UserVF) {
7621     DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7622     assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
7623     // Collect the instructions (and their associated costs) that will be more
7624     // profitable to scalarize.
7625     CM.selectUserVectorizationFactor(UserVF);
7626     return {UserVF, 0};
7627   }
7628 
7629   unsigned MaxVF = MaybeMaxVF.getValue();
7630   assert(MaxVF != 0 && "MaxVF is zero.");
7631   if (MaxVF == 1)
7632     return NoVectorization;
7633 
7634   // Select the optimal vectorization factor.
7635   return CM.selectVectorizationFactor(MaxVF);
7636 }
7637 
7638 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV) {
7639   // Perform the actual loop transformation.
7640 
7641   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
7642   ILV.createVectorizedLoopSkeleton();
7643 
7644   //===------------------------------------------------===//
7645   //
7646   // Notice: any optimization or new instruction that go
7647   // into the code below should also be implemented in
7648   // the cost-model.
7649   //
7650   //===------------------------------------------------===//
7651 
7652   // 2. Copy and widen instructions from the old loop into the new loop.
7653 
7654   // Collect instructions from the original loop that will become trivially dead
7655   // in the vectorized loop. We don't need to vectorize these instructions. For
7656   // example, original induction update instructions can become dead because we
7657   // separately emit induction "steps" when generating code for the new loop.
7658   // Similarly, we create a new latch condition when setting up the structure
7659   // of the new loop, so the old one can become dead.
7660   SmallPtrSet<Instruction *, 4> DeadInstructions;
7661   collectTriviallyDeadInstructions(DeadInstructions);
7662 
7663   // Scan the loop in a topological order to ensure that defs are vectorized
7664   // before users.
7665   LoopBlocksDFS DFS(OrigLoop);
7666   DFS.perform(LI);
7667 
7668   // Vectorize all instructions in the original loop that will not become
7669   // trivially dead when vectorized.
7670   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO()))
7671     for (Instruction &I : *BB)
7672       if (!DeadInstructions.count(&I))
7673         ILV.vectorizeInstruction(I);
7674 
7675   // 3. Fix the vectorized code: take care of header phi's, live-outs,
7676   //    predication, updating analyses.
7677   ILV.fixVectorizedLoop();
7678 }
7679 
7680 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
7681     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
7682   BasicBlock *Latch = OrigLoop->getLoopLatch();
7683 
7684   // We create new control-flow for the vectorized loop, so the original
7685   // condition will be dead after vectorization if it's only used by the
7686   // branch.
7687   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
7688   if (Cmp && Cmp->hasOneUse())
7689     DeadInstructions.insert(Cmp);
7690 
7691   // We create new "steps" for induction variable updates to which the original
7692   // induction variables map. An original update instruction will be dead if
7693   // all its users except the induction variable are dead.
7694   for (auto &Induction : *Legal->getInductionVars()) {
7695     PHINode *Ind = Induction.first;
7696     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
7697     if (all_of(IndUpdate->users(), [&](User *U) -> bool {
7698           return U == Ind || DeadInstructions.count(cast<Instruction>(U));
7699         }))
7700       DeadInstructions.insert(IndUpdate);
7701   }
7702 }
7703 
7704 void InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr) {
7705   auto *SI = dyn_cast<StoreInst>(Instr);
7706   bool IfPredicateInstr = (SI && Legal->blockNeedsPredication(SI->getParent()));
7707 
7708   return scalarizeInstruction(Instr, IfPredicateInstr);
7709 }
7710 
// With VF == 1 there are no lanes to reverse; the value is returned as is.
Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
7712 
// With VF == 1 broadcasting is the identity; the scalar value is returned
// unchanged.
Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
7714 
7715 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
7716                                         Instruction::BinaryOps BinOp) {
7717   // When unrolling and the VF is 1, we only need to add a simple scalar.
7718   Type *Ty = Val->getType();
7719   assert(!Ty->isVectorTy() && "Val must be a scalar");
7720 
7721   if (Ty->isFloatingPointTy()) {
7722     Constant *C = ConstantFP::get(Ty, (double)StartIdx);
7723 
7724     // Floating point operations had to be 'fast' to enable the unrolling.
7725     Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
7726     return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
7727   }
7728   Constant *C = ConstantInt::get(Ty, StartIdx);
7729   return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
7730 }
7731 
7732 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
7733   SmallVector<Metadata *, 4> MDs;
7734   // Reserve first location for self reference to the LoopID metadata node.
7735   MDs.push_back(nullptr);
7736   bool IsUnrollMetadata = false;
7737   MDNode *LoopID = L->getLoopID();
7738   if (LoopID) {
7739     // First find existing loop unrolling disable metadata.
7740     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
7741       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
7742       if (MD) {
7743         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7744         IsUnrollMetadata =
7745             S && S->getString().startswith("llvm.loop.unroll.disable");
7746       }
7747       MDs.push_back(LoopID->getOperand(i));
7748     }
7749   }
7750 
7751   if (!IsUnrollMetadata) {
7752     // Add runtime unroll disable metadata.
7753     LLVMContext &Context = L->getHeader()->getContext();
7754     SmallVector<Metadata *, 1> DisableOperands;
7755     DisableOperands.push_back(
7756         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7757     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7758     MDs.push_back(DisableNode);
7759     MDNode *NewLoopID = MDNode::get(Context, MDs);
7760     // Set operand 0 to refer to the loop id itself.
7761     NewLoopID->replaceOperandWith(0, NewLoopID);
7762     L->setLoopID(NewLoopID);
7763   }
7764 }
7765 
// Process a single innermost loop: check legality, run the cost model, then
// vectorize and/or interleave the loop and emit optimization remarks.
// Returns true if the function's IR was modified.
bool LoopVectorizePass::processLoop(Loop *L) {
  assert(L->empty() && "Only process inner loops.");

#ifndef NDEBUG
  const std::string DebugLocStr = getDebugLocString(L);
#endif /* NDEBUG */

  DEBUG(dbgs() << "\nLV: Checking a loop in \""
               << L->getHeader()->getParent()->getName() << "\" from "
               << DebugLocStr << "\n");

  LoopVectorizeHints Hints(L, DisableUnrolling, *ORE);

  DEBUG(dbgs() << "LV: Loop hints:"
               << " force="
               << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
                       ? "disabled"
                       : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
                              ? "enabled"
                              : "?"))
               << " width=" << Hints.getWidth()
               << " unroll=" << Hints.getInterleave() << "\n");

  // Function containing loop
  Function *F = L->getHeader()->getParent();

  // Looking at the diagnostic output is the only way to determine if a loop
  // was vectorized (other than looking at the IR or machine code), so it
  // is important to generate an optimization remark for each loop. Most of
  // these messages are generated as OptimizationRemarkAnalysis. Remarks
  // generated as OptimizationRemark and OptimizationRemarkMissed are
  // less verbose reporting vectorized loops and unvectorized loops that may
  // benefit from vectorization, respectively.

  if (!Hints.allowVectorization(F, L, AlwaysVectorize)) {
    DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
    return false;
  }

  // Wrap SCEV for this loop so legality checks can add run-time-checked
  // predicates where a static proof is not possible.
  PredicatedScalarEvolution PSE(*SE, *L);

  // Check if it is legal to vectorize the loop.
  LoopVectorizationRequirements Requirements(*ORE);
  LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, TTI, GetLAA, LI, ORE,
                                &Requirements, &Hints);
  if (!LVL.canVectorize()) {
    DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
    emitMissedWarning(F, L, Hints, ORE);
    return false;
  }

  // Check the function attributes to find out if this function should be
  // optimized for size.
  bool OptForSize =
      Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->optForSize();

  // Check the loop for a trip count threshold: vectorize loops with a tiny trip
  // count by optimizing for size, to minimize overheads.
  // A result of 0 from SCEV is treated as "no constant bound known".
  unsigned ExpectedTC = SE->getSmallConstantMaxTripCount(L);
  bool HasExpectedTC = (ExpectedTC > 0);

  if (!HasExpectedTC && LoopVectorizeWithBlockFrequency) {
    // Fall back to a profile-based estimate of the trip count.
    auto EstimatedTC = getLoopEstimatedTripCount(L);
    if (EstimatedTC) {
      ExpectedTC = *EstimatedTC;
      HasExpectedTC = true;
    }
  }

  if (HasExpectedTC && ExpectedTC < TinyTripCountVectorThreshold) {
    DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
                 << "This loop is worth vectorizing only if no scalar "
                 << "iteration overheads are incurred.");
    if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
      DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
    else {
      DEBUG(dbgs() << "\n");
      // Loops with a very small trip count are considered for vectorization
      // under OptForSize, thereby making sure the cost of their loop body is
      // dominant, free of runtime guards and scalar iteration overheads.
      OptForSize = true;
    }
  }

  // Check the function attributes to see if implicit floats are allowed.
  // FIXME: This check doesn't seem possibly correct -- what if the loop is
  // an integer loop and the vector instructions selected are purely integer
  // vector instructions?
  if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
    DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat"
                    "attribute is used.\n");
    ORE->emit(createMissedAnalysis(Hints.vectorizeAnalysisPassName(),
                                   "NoImplicitFloat", L)
              << "loop not vectorized due to NoImplicitFloat attribute");
    emitMissedWarning(F, L, Hints, ORE);
    return false;
  }

  // Check if the target supports potentially unsafe FP vectorization.
  // FIXME: Add a check for the type of safety issue (denormal, signaling)
  // for the target we're vectorizing for, to make sure none of the
  // additional fp-math flags can help.
  if (Hints.isPotentiallyUnsafe() &&
      TTI->isFPVectorizationPotentiallyUnsafe()) {
    DEBUG(dbgs() << "LV: Potentially unsafe FP op prevents vectorization.\n");
    ORE->emit(
        createMissedAnalysis(Hints.vectorizeAnalysisPassName(), "UnsafeFP", L)
        << "loop not vectorized due to unsafe FP support.");
    emitMissedWarning(F, L, Hints, ORE);
    return false;
  }

  // Use the cost model.
  LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, F,
                                &Hints);
  CM.collectValuesToIgnore();

  // Use the planner for vectorization.
  LoopVectorizationPlanner LVP(L, LI, &LVL, CM);

  // Get user vectorization factor.
  unsigned UserVF = Hints.getWidth();

  // Plan how to best vectorize, return the best VF and its cost.
  LoopVectorizationCostModel::VectorizationFactor VF =
      LVP.plan(OptForSize, UserVF);

  // Select the interleave count.
  unsigned IC = CM.selectInterleaveCount(OptForSize, VF.Width, VF.Cost);

  // Get user interleave count.
  unsigned UserIC = Hints.getInterleave();

  // Identify the diagnostic messages that should be produced.
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (Requirements.doesNotMeet(F, L, Hints)) {
    DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
                    "requirements.\n");
    emitMissedWarning(F, L, Hints, ORE);
    return false;
  }

  if (VF.Width == 1) {
    // A chosen width of 1 means plain vectorization is not worthwhile;
    // interleaving may still be.
    DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }

  if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    DEBUG(dbgs()
          << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
    ORE->emit(OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                         L->getStartLoc(), L->getHeader())
              << VecDiagMsg.second);
    ORE->emit(OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                         L->getStartLoc(), L->getHeader())
              << IntDiagMsg.second);
    return false;
  } else if (!VectorizeLoop && InterleaveLoop) {
    DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    ORE->emit(OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                         L->getStartLoc(), L->getHeader())
              << VecDiagMsg.second);
  } else if (VectorizeLoop && !InterleaveLoop) {
    DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in "
                 << DebugLocStr << '\n');
    ORE->emit(OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
                                         L->getStartLoc(), L->getHeader())
              << IntDiagMsg.second);
  } else if (VectorizeLoop && InterleaveLoop) {
    DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in "
                 << DebugLocStr << '\n');
    DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
  }

  using namespace ore;
  if (!VectorizeLoop) {
    assert(IC > 1 && "interleave count should not be 1 or 0");
    // If we decided that it is not legal to vectorize the loop, then
    // interleave it.
    InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
                               &CM);
    LVP.executePlan(Unroller);

    ORE->emit(OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
                                 L->getHeader())
              << "interleaved loop (interleaved count: "
              << NV("InterleaveCount", IC) << ")");
  } else {
    // If we decided that it is *legal* to vectorize the loop, then do it.
    InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
                           &LVL, &CM);
    LVP.executePlan(LB);
    ++LoopsVectorized;

    // Add metadata to disable runtime unrolling a scalar loop when there are
    // no runtime checks about strides and memory. A scalar loop that is
    // rarely used is not worth unrolling.
    if (!LB.areSafetyChecksAdded())
      AddRuntimeUnrollDisableMetaData(L);

    // Report the vectorization decision.
    ORE->emit(OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
                                 L->getHeader())
              << "vectorized loop (vectorization width: "
              << NV("VectorizationFactor", VF.Width)
              << ", interleaved count: " << NV("InterleaveCount", IC) << ")");
  }

  // Mark the loop as already vectorized to avoid vectorizing again.
  Hints.setAlreadyVectorized();

  DEBUG(verifyFunction(*L->getHeader()->getParent()));
  return true;
}
8011 
8012 bool LoopVectorizePass::runImpl(
8013     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
8014     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
8015     DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
8016     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
8017     OptimizationRemarkEmitter &ORE_) {
8018 
8019   SE = &SE_;
8020   LI = &LI_;
8021   TTI = &TTI_;
8022   DT = &DT_;
8023   BFI = &BFI_;
8024   TLI = TLI_;
8025   AA = &AA_;
8026   AC = &AC_;
8027   GetLAA = &GetLAA_;
8028   DB = &DB_;
8029   ORE = &ORE_;
8030 
8031   // Don't attempt if
8032   // 1. the target claims to have no vector registers, and
8033   // 2. interleaving won't help ILP.
8034   //
8035   // The second condition is necessary because, even if the target has no
8036   // vector registers, loop vectorization may still enable scalar
8037   // interleaving.
8038   if (!TTI->getNumberOfRegisters(true) && TTI->getMaxInterleaveFactor(1) < 2)
8039     return false;
8040 
8041   bool Changed = false;
8042 
8043   // The vectorizer requires loops to be in simplified form.
8044   // Since simplification may add new inner loops, it has to run before the
8045   // legality and profitability checks. This means running the loop vectorizer
8046   // will simplify all loops, regardless of whether anything end up being
8047   // vectorized.
8048   for (auto &L : *LI)
8049     Changed |= simplifyLoop(L, DT, LI, SE, AC, false /* PreserveLCSSA */);
8050 
8051   // Build up a worklist of inner-loops to vectorize. This is necessary as
8052   // the act of vectorizing or partially unrolling a loop creates new loops
8053   // and can invalidate iterators across the loops.
8054   SmallVector<Loop *, 8> Worklist;
8055 
8056   for (Loop *L : *LI)
8057     addAcyclicInnerLoop(*L, Worklist);
8058 
8059   LoopsAnalyzed += Worklist.size();
8060 
8061   // Now walk the identified inner loops.
8062   while (!Worklist.empty()) {
8063     Loop *L = Worklist.pop_back_val();
8064 
8065     // For the inner loops we actually process, form LCSSA to simplify the
8066     // transform.
8067     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
8068 
8069     Changed |= processLoop(L);
8070   }
8071 
8072   // Process each loop nest in the function.
8073   return Changed;
8074 
8075 }
8076 
8077 
8078 PreservedAnalyses LoopVectorizePass::run(Function &F,
8079                                          FunctionAnalysisManager &AM) {
8080     auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
8081     auto &LI = AM.getResult<LoopAnalysis>(F);
8082     auto &TTI = AM.getResult<TargetIRAnalysis>(F);
8083     auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
8084     auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
8085     auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
8086     auto &AA = AM.getResult<AAManager>(F);
8087     auto &AC = AM.getResult<AssumptionAnalysis>(F);
8088     auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
8089     auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
8090 
8091     auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
8092     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
8093         [&](Loop &L) -> const LoopAccessInfo & {
8094       LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI};
8095       return LAM.getResult<LoopAccessAnalysis>(L, AR);
8096     };
8097     bool Changed =
8098         runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE);
8099     if (!Changed)
8100       return PreservedAnalyses::all();
8101     PreservedAnalyses PA;
8102     PA.preserve<LoopAnalysis>();
8103     PA.preserve<DominatorTreeAnalysis>();
8104     PA.preserve<BasicAA>();
8105     PA.preserve<GlobalsAA>();
8106     return PA;
8107 }
8108