1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
11 // and generates target-independent LLVM-IR.
12 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
13 // of instructions in order to estimate the profitability of vectorization.
14 //
15 // The loop vectorizer combines consecutive loop iterations into a single
16 // 'wide' iteration. After this transformation the index is incremented
17 // by the SIMD vector width, and not by one.
18 //
19 // This pass has three parts:
20 // 1. The main loop pass that drives the different parts.
21 // 2. LoopVectorizationLegality - A unit that checks for the legality
22 //    of the vectorization.
23 // 3. InnerLoopVectorizer - A unit that performs the actual
24 //    widening of instructions.
25 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
26 //    of vectorization. It decides on the optimal vector width, which
27 //    can be one, if vectorization is not profitable.
28 //
29 //===----------------------------------------------------------------------===//
30 //
31 // The reduction-variable vectorization is based on the paper:
32 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
33 //
34 // Variable uniformity checks are inspired by:
35 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
36 //
37 // The interleaved access vectorization is based on the paper:
38 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
39 //  Data for SIMD
40 //
41 // Other ideas/concepts are from:
42 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
43 //
44 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
45 //  Vectorizing Compilers.
46 //
47 //===----------------------------------------------------------------------===//
48 
49 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
50 #include "LoopVectorizationPlanner.h"
51 #include "llvm/ADT/APInt.h"
52 #include "llvm/ADT/ArrayRef.h"
53 #include "llvm/ADT/DenseMap.h"
54 #include "llvm/ADT/DenseMapInfo.h"
55 #include "llvm/ADT/Hashing.h"
56 #include "llvm/ADT/MapVector.h"
57 #include "llvm/ADT/None.h"
58 #include "llvm/ADT/Optional.h"
59 #include "llvm/ADT/STLExtras.h"
60 #include "llvm/ADT/SetVector.h"
61 #include "llvm/ADT/SmallPtrSet.h"
62 #include "llvm/ADT/SmallSet.h"
63 #include "llvm/ADT/SmallVector.h"
64 #include "llvm/ADT/Statistic.h"
65 #include "llvm/ADT/StringRef.h"
66 #include "llvm/ADT/Twine.h"
67 #include "llvm/ADT/iterator_range.h"
68 #include "llvm/Analysis/AssumptionCache.h"
69 #include "llvm/Analysis/BasicAliasAnalysis.h"
70 #include "llvm/Analysis/BlockFrequencyInfo.h"
71 #include "llvm/Analysis/CFG.h"
72 #include "llvm/Analysis/CodeMetrics.h"
73 #include "llvm/Analysis/DemandedBits.h"
74 #include "llvm/Analysis/GlobalsModRef.h"
75 #include "llvm/Analysis/LoopAccessAnalysis.h"
76 #include "llvm/Analysis/LoopAnalysisManager.h"
77 #include "llvm/Analysis/LoopInfo.h"
78 #include "llvm/Analysis/LoopIterator.h"
79 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
80 #include "llvm/Analysis/ScalarEvolution.h"
81 #include "llvm/Analysis/ScalarEvolutionExpander.h"
82 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
83 #include "llvm/Analysis/TargetLibraryInfo.h"
84 #include "llvm/Analysis/TargetTransformInfo.h"
85 #include "llvm/Analysis/VectorUtils.h"
86 #include "llvm/IR/Attributes.h"
87 #include "llvm/IR/BasicBlock.h"
88 #include "llvm/IR/CFG.h"
89 #include "llvm/IR/Constant.h"
90 #include "llvm/IR/Constants.h"
91 #include "llvm/IR/DataLayout.h"
92 #include "llvm/IR/DebugInfoMetadata.h"
93 #include "llvm/IR/DebugLoc.h"
94 #include "llvm/IR/DerivedTypes.h"
95 #include "llvm/IR/DiagnosticInfo.h"
96 #include "llvm/IR/Dominators.h"
97 #include "llvm/IR/Function.h"
98 #include "llvm/IR/IRBuilder.h"
99 #include "llvm/IR/InstrTypes.h"
100 #include "llvm/IR/Instruction.h"
101 #include "llvm/IR/Instructions.h"
102 #include "llvm/IR/IntrinsicInst.h"
103 #include "llvm/IR/Intrinsics.h"
104 #include "llvm/IR/LLVMContext.h"
105 #include "llvm/IR/Metadata.h"
106 #include "llvm/IR/Module.h"
107 #include "llvm/IR/Operator.h"
108 #include "llvm/IR/Type.h"
109 #include "llvm/IR/Use.h"
110 #include "llvm/IR/User.h"
111 #include "llvm/IR/Value.h"
112 #include "llvm/IR/ValueHandle.h"
113 #include "llvm/IR/Verifier.h"
114 #include "llvm/Pass.h"
115 #include "llvm/Support/Casting.h"
116 #include "llvm/Support/CommandLine.h"
117 #include "llvm/Support/Compiler.h"
118 #include "llvm/Support/Debug.h"
119 #include "llvm/Support/ErrorHandling.h"
120 #include "llvm/Support/MathExtras.h"
121 #include "llvm/Support/raw_ostream.h"
122 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
123 #include "llvm/Transforms/Utils/LoopSimplify.h"
124 #include "llvm/Transforms/Utils/LoopUtils.h"
125 #include "llvm/Transforms/Utils/LoopVersioning.h"
126 #include <algorithm>
127 #include <cassert>
128 #include <cstdint>
129 #include <cstdlib>
130 #include <functional>
131 #include <iterator>
132 #include <limits>
133 #include <memory>
134 #include <string>
135 #include <tuple>
136 #include <utility>
137 #include <vector>
138 
139 using namespace llvm;
140 
141 #define LV_NAME "loop-vectorize"
142 #define DEBUG_TYPE LV_NAME
143 
144 STATISTIC(LoopsVectorized, "Number of loops vectorized");
145 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
146 
147 static cl::opt<bool>
148     EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
149                        cl::desc("Enable if-conversion during vectorization."));
150 
151 /// Loops with a known constant trip count below this number are vectorized only
152 /// if no scalar iteration overheads are incurred.
153 static cl::opt<unsigned> TinyTripCountVectorThreshold(
154     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
155     cl::desc("Loops with a constant trip count that is smaller than this "
156              "value are vectorized only if no scalar iteration overheads "
157              "are incurred."));
158 
159 static cl::opt<bool> MaximizeBandwidth(
160     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
161     cl::desc("Maximize bandwidth when selecting vectorization factor which "
162              "will be determined by the smallest type in loop."));
163 
164 static cl::opt<bool> EnableInterleavedMemAccesses(
165     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
166     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
167 
168 /// Maximum factor for an interleaved memory access.
169 static cl::opt<unsigned> MaxInterleaveGroupFactor(
170     "max-interleave-group-factor", cl::Hidden,
171     cl::desc("Maximum factor for an interleaved access group (default = 8)"),
172     cl::init(8));
173 
174 /// We don't interleave loops with a known constant trip count below this
175 /// number.
176 static const unsigned TinyTripCountInterleaveThreshold = 128;
177 
178 static cl::opt<unsigned> ForceTargetNumScalarRegs(
179     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
180     cl::desc("A flag that overrides the target's number of scalar registers."));
181 
182 static cl::opt<unsigned> ForceTargetNumVectorRegs(
183     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
184     cl::desc("A flag that overrides the target's number of vector registers."));
185 
186 /// Maximum vectorization interleave count.
187 static const unsigned MaxInterleaveFactor = 16;
188 
189 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
190     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
191     cl::desc("A flag that overrides the target's max interleave factor for "
192              "scalar loops."));
193 
194 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
195     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
196     cl::desc("A flag that overrides the target's max interleave factor for "
197              "vectorized loops."));
198 
199 static cl::opt<unsigned> ForceTargetInstructionCost(
200     "force-target-instruction-cost", cl::init(0), cl::Hidden,
201     cl::desc("A flag that overrides the target's expected cost for "
202              "an instruction to a single constant value. Mostly "
203              "useful for getting consistent testing."));
204 
205 static cl::opt<unsigned> SmallLoopCost(
206     "small-loop-cost", cl::init(20), cl::Hidden,
207     cl::desc(
208         "The cost of a loop that is considered 'small' by the interleaver."));
209 
210 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
211     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
212     cl::desc("Enable the use of the block frequency analysis to access PGO "
213              "heuristics minimizing code growth in cold regions and being more "
214              "aggressive in hot regions."));
215 
216 // Runtime interleave loops for load/store throughput.
217 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
218     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
219     cl::desc(
220         "Enable runtime interleaving until load/store ports are saturated"));
221 
222 /// The number of stores in a loop that are allowed to need predication.
223 static cl::opt<unsigned> NumberOfStoresToPredicate(
224     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
225     cl::desc("Max number of stores to be predicated behind an if."));
226 
227 static cl::opt<bool> EnableIndVarRegisterHeur(
228     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
229     cl::desc("Count the induction variable only once when interleaving"));
230 
231 static cl::opt<bool> EnableCondStoresVectorization(
232     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
233     cl::desc("Enable if predication of stores during vectorization."));
234 
235 static cl::opt<unsigned> MaxNestedScalarReductionIC(
236     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
237     cl::desc("The maximum interleave count to use when interleaving a scalar "
238              "reduction in a nested loop."));
239 
240 static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
241     "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
242     cl::desc("The maximum allowed number of runtime memory checks with a "
243              "vectorize(enable) pragma."));
244 
245 static cl::opt<unsigned> VectorizeSCEVCheckThreshold(
246     "vectorize-scev-check-threshold", cl::init(16), cl::Hidden,
247     cl::desc("The maximum number of SCEV checks allowed."));
248 
249 static cl::opt<unsigned> PragmaVectorizeSCEVCheckThreshold(
250     "pragma-vectorize-scev-check-threshold", cl::init(128), cl::Hidden,
251     cl::desc("The maximum number of SCEV checks allowed with a "
252              "vectorize(enable) pragma"));
253 
254 /// Create an analysis remark that explains why vectorization failed
255 ///
256 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
257 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
258 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
259 /// the location of the remark.  \return the remark object that can be
260 /// streamed to.
261 static OptimizationRemarkAnalysis
262 createMissedAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop,
263                      Instruction *I = nullptr) {
264   Value *CodeRegion = TheLoop->getHeader();
265   DebugLoc DL = TheLoop->getStartLoc();
266 
267   if (I) {
268     CodeRegion = I->getParent();
269     // If there is no debug location attached to the instruction, revert back to
270     // using the loop's.
271     if (I->getDebugLoc())
272       DL = I->getDebugLoc();
273   }
274 
275   OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
276   R << "loop not vectorized: ";
277   return R;
278 }
279 
280 namespace {
281 
282 class LoopVectorizationRequirements;
283 
284 } // end anonymous namespace
285 
286 /// A helper function for converting Scalar types to vector types.
287 /// If the incoming type is void, we return void. If the VF is 1, we return
288 /// the scalar type.
289 static Type *ToVectorTy(Type *Scalar, unsigned VF) {
290   if (Scalar->isVoidTy() || VF == 1)
291     return Scalar;
292   return VectorType::get(Scalar, VF);
293 }
294 
295 // FIXME: The following helper functions have multiple implementations
296 // in the project. They can be effectively organized in a common Load/Store
297 // utilities unit.
298 
299 /// A helper function that returns the type of loaded or stored value.
300 static Type *getMemInstValueType(Value *I) {
301   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
302          "Expected Load or Store instruction");
303   if (auto *LI = dyn_cast<LoadInst>(I))
304     return LI->getType();
305   return cast<StoreInst>(I)->getValueOperand()->getType();
306 }
307 
308 /// A helper function that returns the alignment of load or store instruction.
309 static unsigned getMemInstAlignment(Value *I) {
310   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
311          "Expected Load or Store instruction");
312   if (auto *LI = dyn_cast<LoadInst>(I))
313     return LI->getAlignment();
314   return cast<StoreInst>(I)->getAlignment();
315 }
316 
317 /// A helper function that returns the address space of the pointer operand of
318 /// load or store instruction.
319 static unsigned getMemInstAddressSpace(Value *I) {
320   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
321          "Expected Load or Store instruction");
322   if (auto *LI = dyn_cast<LoadInst>(I))
323     return LI->getPointerAddressSpace();
324   return cast<StoreInst>(I)->getPointerAddressSpace();
325 }
326 
327 /// A helper function that returns true if the given type is irregular. The
328 /// type is irregular if its allocated size doesn't equal the store size of an
329 /// element of the corresponding vector type at the given vectorization factor.
330 static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
331   // Determine if an array of VF elements of type Ty is "bitcast compatible"
332   // with a <VF x Ty> vector.
333   if (VF > 1) {
334     auto *VectorTy = VectorType::get(Ty, VF);
335     return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
336   }
337 
338   // If the vectorization factor is one, we just check if an array of type Ty
339   // requires padding between elements.
340   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
341 }
342 
343 /// A helper function that returns the reciprocal of the block probability of
344 /// predicated blocks. If we return X, we are assuming the predicated block
345 /// will execute once for every X iterations of the loop header.
346 ///
347 /// TODO: We should use actual block probability here, if available. Currently,
348 ///       we always assume predicated blocks have a 50% chance of executing.
349 static unsigned getReciprocalPredBlockProb() { return 2; }
350 
351 /// A helper function that adds a 'fast' flag to floating-point operations.
352 static Value *addFastMathFlag(Value *V) {
353   if (isa<FPMathOperator>(V)) {
354     FastMathFlags Flags;
355     Flags.setFast();
356     cast<Instruction>(V)->setFastMathFlags(Flags);
357   }
358   return V;
359 }
360 
361 /// A helper function that returns an integer or floating-point constant with
362 /// value C.
363 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
364   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
365                            : ConstantFP::get(Ty, C);
366 }
367 
368 namespace llvm {
369 
370 /// InnerLoopVectorizer vectorizes loops which contain only one basic
371 /// block to a specified vectorization factor (VF).
372 /// This class performs the widening of scalars into vectors, or multiple
373 /// scalars. This class also implements the following features:
374 /// * It inserts an epilogue loop for handling loops that don't have iteration
375 ///   counts that are known to be a multiple of the vectorization factor.
376 /// * It handles the code generation for reduction variables.
377 /// * Scalarization (implementation using scalars) of un-vectorizable
378 ///   instructions.
379 /// InnerLoopVectorizer does not perform any vectorization-legality
380 /// checks, and relies on the caller to check for the different legality
381 /// aspects. The InnerLoopVectorizer relies on the
382 /// LoopVectorizationLegality class to provide information about the induction
383 /// and reduction variables that were found to a given vectorization factor.
384 class InnerLoopVectorizer {
385 public:
386   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
387                       LoopInfo *LI, DominatorTree *DT,
388                       const TargetLibraryInfo *TLI,
389                       const TargetTransformInfo *TTI, AssumptionCache *AC,
390                       OptimizationRemarkEmitter *ORE, unsigned VecWidth,
391                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
392                       LoopVectorizationCostModel *CM)
393       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
394         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
395         Builder(PSE.getSE()->getContext()),
396         VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
397   virtual ~InnerLoopVectorizer() = default;
398 
399   /// Create a new empty loop. Unlink the old loop and connect the new one.
400   /// Return the pre-header block of the new loop.
401   BasicBlock *createVectorizedLoopSkeleton();
402 
403   /// Widen a single instruction within the innermost loop.
404   void widenInstruction(Instruction &I);
405 
406   /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
407   void fixVectorizedLoop();
408 
409   // Return true if any runtime check is added.
410   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
411 
412   /// A type for vectorized values in the new loop. Each value from the
413   /// original loop, when vectorized, is represented by UF vector values in the
414   /// new unrolled loop, where UF is the unroll factor.
415   using VectorParts = SmallVector<Value *, 2>;
416 
417   /// Vectorize a single PHINode in a block. This method handles the induction
418   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
419   /// arbitrary length vectors.
420   void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
421 
422   /// A helper function to scalarize a single Instruction in the innermost loop.
423   /// Generates a sequence of scalar instances for each lane between \p MinLane
424   /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
425   /// inclusive..
426   void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
427                             bool IfPredicateInstr);
428 
429   /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
430   /// is provided, the integer induction variable will first be truncated to
431   /// the corresponding type.
432   void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
433 
434   /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
435   /// vector or scalar value on-demand if one is not yet available. When
436   /// vectorizing a loop, we visit the definition of an instruction before its
437   /// uses. When visiting the definition, we either vectorize or scalarize the
438   /// instruction, creating an entry for it in the corresponding map. (In some
439   /// cases, such as induction variables, we will create both vector and scalar
440   /// entries.) Then, as we encounter uses of the definition, we derive values
441   /// for each scalar or vector use unless such a value is already available.
442   /// For example, if we scalarize a definition and one of its uses is vector,
443   /// we build the required vector on-demand with an insertelement sequence
444   /// when visiting the use. Otherwise, if the use is scalar, we can use the
445   /// existing scalar definition.
446   ///
447   /// Return a value in the new loop corresponding to \p V from the original
448   /// loop at unroll index \p Part. If the value has already been vectorized,
449   /// the corresponding vector entry in VectorLoopValueMap is returned. If,
450   /// however, the value has a scalar entry in VectorLoopValueMap, we construct
451   /// a new vector value on-demand by inserting the scalar values into a vector
452   /// with an insertelement sequence. If the value has been neither vectorized
453   /// nor scalarized, it must be loop invariant, so we simply broadcast the
454   /// value into a vector.
455   Value *getOrCreateVectorValue(Value *V, unsigned Part);
456 
457   /// Return a value in the new loop corresponding to \p V from the original
458   /// loop at unroll and vector indices \p Instance. If the value has been
459   /// vectorized but not scalarized, the necessary extractelement instruction
460   /// will be generated.
461   Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
462 
463   /// Construct the vector value of a scalarized value \p V one lane at a time.
464   void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
465 
466   /// Try to vectorize the interleaved access group that \p Instr belongs to.
467   void vectorizeInterleaveGroup(Instruction *Instr);
468 
469   /// Vectorize Load and Store instructions, optionally masking the vector
470   /// operations if \p BlockInMask is non-null.
471   void vectorizeMemoryInstruction(Instruction *Instr,
472                                   VectorParts *BlockInMask = nullptr);
473 
474   /// \brief Set the debug location in the builder using the debug location in
475   /// the instruction.
476   void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
477 
478 protected:
479   friend class LoopVectorizationPlanner;
480 
481   /// A small list of PHINodes.
482   using PhiVector = SmallVector<PHINode *, 4>;
483 
484   /// A type for scalarized values in the new loop. Each value from the
485   /// original loop, when scalarized, is represented by UF x VF scalar values
486   /// in the new unrolled loop, where UF is the unroll factor and VF is the
487   /// vectorization factor.
488   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
489 
490   /// Set up the values of the IVs correctly when exiting the vector loop.
491   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
492                     Value *CountRoundDown, Value *EndValue,
493                     BasicBlock *MiddleBlock);
494 
495   /// Create a new induction variable inside L.
496   PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
497                                    Value *Step, Instruction *DL);
498 
499   /// Handle all cross-iteration phis in the header.
500   void fixCrossIterationPHIs();
501 
502   /// Fix a first-order recurrence. This is the second phase of vectorizing
503   /// this phi node.
504   void fixFirstOrderRecurrence(PHINode *Phi);
505 
506   /// Fix a reduction cross-iteration phi. This is the second phase of
507   /// vectorizing this phi node.
508   void fixReduction(PHINode *Phi);
509 
510   /// \brief The Loop exit block may have single value PHI nodes with some
511   /// incoming value. While vectorizing we only handled real values
512   /// that were defined inside the loop and we should have one value for
513   /// each predecessor of its parent basic block. See PR14725.
514   void fixLCSSAPHIs();
515 
516   /// Iteratively sink the scalarized operands of a predicated instruction into
517   /// the block that was created for it.
518   void sinkScalarOperands(Instruction *PredInst);
519 
520   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
521   /// represented as.
522   void truncateToMinimalBitwidths();
523 
524   /// Insert the new loop to the loop hierarchy and pass manager
525   /// and update the analysis passes.
526   void updateAnalysis();
527 
528   /// Create a broadcast instruction. This method generates a broadcast
529   /// instruction (shuffle) for loop invariant values and for the induction
530   /// value. If this is the induction variable then we extend it to N, N+1, ...
531   /// this is needed because each iteration in the loop corresponds to a SIMD
532   /// element.
533   virtual Value *getBroadcastInstrs(Value *V);
534 
535   /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
536   /// to each vector element of Val. The sequence starts at StartIndex.
537   /// \p Opcode is relevant for FP induction variable.
538   virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
539                                Instruction::BinaryOps Opcode =
540                                Instruction::BinaryOpsEnd);
541 
542   /// Compute scalar induction steps. \p ScalarIV is the scalar induction
543   /// variable on which to base the steps, \p Step is the size of the step, and
544   /// \p EntryVal is the value from the original loop that maps to the steps.
545   /// Note that \p EntryVal doesn't have to be an induction variable - it
546   /// can also be a truncate instruction.
547   void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
548                         const InductionDescriptor &ID);
549 
550   /// Create a vector induction phi node based on an existing scalar one. \p
551   /// EntryVal is the value from the original loop that maps to the vector phi
552   /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
553   /// truncate instruction, instead of widening the original IV, we widen a
554   /// version of the IV truncated to \p EntryVal's type.
555   void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
556                                        Value *Step, Instruction *EntryVal);
557 
558   /// Returns true if an instruction \p I should be scalarized instead of
559   /// vectorized for the chosen vectorization factor.
560   bool shouldScalarizeInstruction(Instruction *I) const;
561 
562   /// Returns true if we should generate a scalar version of \p IV.
563   bool needsScalarInduction(Instruction *IV) const;
564 
565   /// If there is a cast involved in the induction variable \p ID, which should
566   /// be ignored in the vectorized loop body, this function records the
567   /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
568   /// cast. We had already proved that the casted Phi is equal to the uncasted
569   /// Phi in the vectorized loop (under a runtime guard), and therefore
570   /// there is no need to vectorize the cast - the same value can be used in the
571   /// vector loop for both the Phi and the cast.
572   /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified,
573   /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
574   ///
575   /// \p EntryVal is the value from the original loop that maps to the vector
576   /// phi node and is used to distinguish what is the IV currently being
577   /// processed - original one (if \p EntryVal is a phi corresponding to the
578   /// original IV) or the "newly-created" one based on the proof mentioned above
579   /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
580   /// latter case \p EntryVal is a TruncInst and we must not record anything for
581   /// that IV, but it's error-prone to expect callers of this routine to care
582   /// about that, hence this explicit parameter.
583   void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
584                                              const Instruction *EntryVal,
585                                              Value *VectorLoopValue,
586                                              unsigned Part,
587                                              unsigned Lane = UINT_MAX);
588 
589   /// Generate a shuffle sequence that will reverse the vector Vec.
590   virtual Value *reverseVector(Value *Vec);
591 
592   /// Returns (and creates if needed) the original loop trip count.
593   Value *getOrCreateTripCount(Loop *NewLoop);
594 
595   /// Returns (and creates if needed) the trip count of the widened loop.
596   Value *getOrCreateVectorTripCount(Loop *NewLoop);
597 
598   /// Returns a bitcasted value to the requested vector type.
599   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
600   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
601                                 const DataLayout &DL);
602 
603   /// Emit a bypass check to see if the vector trip count is zero, including if
604   /// it overflows.
605   void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
606 
607   /// Emit a bypass check to see if all of the SCEV assumptions we've
608   /// had to make are correct.
609   void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
610 
611   /// Emit bypass checks to check any memory assumptions we may have made.
612   void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
613 
614   /// Add additional metadata to \p To that was not present on \p Orig.
615   ///
616   /// Currently this is used to add the noalias annotations based on the
617   /// inserted memchecks.  Use this for instructions that are *cloned* into the
618   /// vector loop.
619   void addNewMetadata(Instruction *To, const Instruction *Orig);
620 
621   /// Add metadata from one instruction to another.
622   ///
623   /// This includes both the original MDs from \p From and additional ones (\see
624   /// addNewMetadata).  Use this for *newly created* instructions in the vector
625   /// loop.
626   void addMetadata(Instruction *To, Instruction *From);
627 
628   /// \brief Similar to the previous function but it adds the metadata to a
629   /// vector of instructions.
630   void addMetadata(ArrayRef<Value *> To, Instruction *From);
631 
632   /// The original loop.
633   Loop *OrigLoop;
634 
635   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
636   /// dynamic knowledge to simplify SCEV expressions and converts them to a
637   /// more usable form.
638   PredicatedScalarEvolution &PSE;
639 
640   /// Loop Info.
641   LoopInfo *LI;
642 
643   /// Dominator Tree.
644   DominatorTree *DT;
645 
646   /// Alias Analysis.
647   AliasAnalysis *AA;
648 
649   /// Target Library Info.
650   const TargetLibraryInfo *TLI;
651 
652   /// Target Transform Info.
653   const TargetTransformInfo *TTI;
654 
655   /// Assumption Cache.
656   AssumptionCache *AC;
657 
658   /// Interface to emit optimization remarks.
659   OptimizationRemarkEmitter *ORE;
660 
661   /// \brief LoopVersioning.  It's only set up (non-null) if memchecks were
662   /// used.
663   ///
664   /// This is currently only used to add no-alias metadata based on the
665   /// memchecks.  The actually versioning is performed manually.
666   std::unique_ptr<LoopVersioning> LVer;
667 
668   /// The vectorization SIMD factor to use. Each vector will have this many
669   /// vector elements.
670   unsigned VF;
671 
672   /// The vectorization unroll factor to use. Each scalar is vectorized to this
673   /// many different vector instructions.
674   unsigned UF;
675 
676   /// The builder that we use
677   IRBuilder<> Builder;
678 
679   // --- Vectorization state ---
680 
681   /// The vector-loop preheader.
682   BasicBlock *LoopVectorPreHeader;
683 
684   /// The scalar-loop preheader.
685   BasicBlock *LoopScalarPreHeader;
686 
687   /// Middle Block between the vector and the scalar.
688   BasicBlock *LoopMiddleBlock;
689 
690   /// The ExitBlock of the scalar loop.
691   BasicBlock *LoopExitBlock;
692 
693   /// The vector loop body.
694   BasicBlock *LoopVectorBody;
695 
696   /// The scalar loop body.
697   BasicBlock *LoopScalarBody;
698 
699   /// A list of all bypass blocks. The first block is the entry of the loop.
700   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
701 
702   /// The new Induction variable which was added to the new block.
703   PHINode *Induction = nullptr;
704 
705   /// The induction variable of the old basic block.
706   PHINode *OldInduction = nullptr;
707 
708   /// Maps values from the original loop to their corresponding values in the
709   /// vectorized loop. A key value can map to either vector values, scalar
710   /// values or both kinds of values, depending on whether the key was
711   /// vectorized and scalarized.
712   VectorizerValueMap VectorLoopValueMap;
713 
714   /// Store instructions that were predicated.
715   SmallVector<Instruction *, 4> PredicatedInstructions;
716 
717   /// Trip count of the original loop.
718   Value *TripCount = nullptr;
719 
720   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
721   Value *VectorTripCount = nullptr;
722 
723   /// The legality analysis.
724   LoopVectorizationLegality *Legal;
725 
726   /// The profitablity analysis.
727   LoopVectorizationCostModel *Cost;
728 
729   // Record whether runtime checks are added.
730   bool AddedSafetyChecks = false;
731 
732   // Holds the end values for each induction variable. We save the end values
733   // so we can later fix-up the external users of the induction variables.
734   DenseMap<PHINode *, Value *> IVEndValues;
735 };
736 
737 class InnerLoopUnroller : public InnerLoopVectorizer {
738 public:
739   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
740                     LoopInfo *LI, DominatorTree *DT,
741                     const TargetLibraryInfo *TLI,
742                     const TargetTransformInfo *TTI, AssumptionCache *AC,
743                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
744                     LoopVectorizationLegality *LVL,
745                     LoopVectorizationCostModel *CM)
746       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
747                             UnrollFactor, LVL, CM) {}
748 
749 private:
750   Value *getBroadcastInstrs(Value *V) override;
751   Value *getStepVector(Value *Val, int StartIdx, Value *Step,
752                        Instruction::BinaryOps Opcode =
753                        Instruction::BinaryOpsEnd) override;
754   Value *reverseVector(Value *Vec) override;
755 };
756 
757 } // end namespace llvm
758 
759 /// \brief Look for a meaningful debug location on the instruction or it's
760 /// operands.
761 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
762   if (!I)
763     return I;
764 
765   DebugLoc Empty;
766   if (I->getDebugLoc() != Empty)
767     return I;
768 
769   for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
770     if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
771       if (OpInst->getDebugLoc() != Empty)
772         return OpInst;
773   }
774 
775   return I;
776 }
777 
778 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
779   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
780     const DILocation *DIL = Inst->getDebugLoc();
781     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
782         !isa<DbgInfoIntrinsic>(Inst))
783       B.SetCurrentDebugLocation(DIL->cloneWithDuplicationFactor(UF * VF));
784     else
785       B.SetCurrentDebugLocation(DIL);
786   } else
787     B.SetCurrentDebugLocation(DebugLoc());
788 }
789 
790 #ifndef NDEBUG
791 /// \return string containing a file name and a line # for the given loop.
792 static std::string getDebugLocString(const Loop *L) {
793   std::string Result;
794   if (L) {
795     raw_string_ostream OS(Result);
796     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
797       LoopDbgLoc.print(OS);
798     else
799       // Just print the module name.
800       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
801     OS.flush();
802   }
803   return Result;
804 }
805 #endif
806 
807 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
808                                          const Instruction *Orig) {
809   // If the loop was versioned with memchecks, add the corresponding no-alias
810   // metadata.
811   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
812     LVer->annotateInstWithNoAlias(To, Orig);
813 }
814 
815 void InnerLoopVectorizer::addMetadata(Instruction *To,
816                                       Instruction *From) {
817   propagateMetadata(To, From);
818   addNewMetadata(To, From);
819 }
820 
821 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
822                                       Instruction *From) {
823   for (Value *V : To) {
824     if (Instruction *I = dyn_cast<Instruction>(V))
825       addMetadata(I, From);
826   }
827 }
828 
829 namespace llvm {
830 
831 /// \brief The group of interleaved loads/stores sharing the same stride and
832 /// close to each other.
833 ///
834 /// Each member in this group has an index starting from 0, and the largest
835 /// index should be less than interleaved factor, which is equal to the absolute
836 /// value of the access's stride.
837 ///
838 /// E.g. An interleaved load group of factor 4:
839 ///        for (unsigned i = 0; i < 1024; i+=4) {
840 ///          a = A[i];                           // Member of index 0
841 ///          b = A[i+1];                         // Member of index 1
842 ///          d = A[i+3];                         // Member of index 3
843 ///          ...
844 ///        }
845 ///
846 ///      An interleaved store group of factor 4:
847 ///        for (unsigned i = 0; i < 1024; i+=4) {
848 ///          ...
849 ///          A[i]   = a;                         // Member of index 0
850 ///          A[i+1] = b;                         // Member of index 1
851 ///          A[i+2] = c;                         // Member of index 2
852 ///          A[i+3] = d;                         // Member of index 3
853 ///        }
854 ///
855 /// Note: the interleaved load group could have gaps (missing members), but
856 /// the interleaved store group doesn't allow gaps.
857 class InterleaveGroup {
858 public:
859   InterleaveGroup(Instruction *Instr, int Stride, unsigned Align)
860       : Align(Align), InsertPos(Instr) {
861     assert(Align && "The alignment should be non-zero");
862 
863     Factor = std::abs(Stride);
864     assert(Factor > 1 && "Invalid interleave factor");
865 
866     Reverse = Stride < 0;
867     Members[0] = Instr;
868   }
869 
870   bool isReverse() const { return Reverse; }
871   unsigned getFactor() const { return Factor; }
872   unsigned getAlignment() const { return Align; }
873   unsigned getNumMembers() const { return Members.size(); }
874 
875   /// \brief Try to insert a new member \p Instr with index \p Index and
876   /// alignment \p NewAlign. The index is related to the leader and it could be
877   /// negative if it is the new leader.
878   ///
879   /// \returns false if the instruction doesn't belong to the group.
880   bool insertMember(Instruction *Instr, int Index, unsigned NewAlign) {
881     assert(NewAlign && "The new member's alignment should be non-zero");
882 
883     int Key = Index + SmallestKey;
884 
885     // Skip if there is already a member with the same index.
886     if (Members.count(Key))
887       return false;
888 
889     if (Key > LargestKey) {
890       // The largest index is always less than the interleave factor.
891       if (Index >= static_cast<int>(Factor))
892         return false;
893 
894       LargestKey = Key;
895     } else if (Key < SmallestKey) {
896       // The largest index is always less than the interleave factor.
897       if (LargestKey - Key >= static_cast<int>(Factor))
898         return false;
899 
900       SmallestKey = Key;
901     }
902 
903     // It's always safe to select the minimum alignment.
904     Align = std::min(Align, NewAlign);
905     Members[Key] = Instr;
906     return true;
907   }
908 
909   /// \brief Get the member with the given index \p Index
910   ///
911   /// \returns nullptr if contains no such member.
912   Instruction *getMember(unsigned Index) const {
913     int Key = SmallestKey + Index;
914     if (!Members.count(Key))
915       return nullptr;
916 
917     return Members.find(Key)->second;
918   }
919 
920   /// \brief Get the index for the given member. Unlike the key in the member
921   /// map, the index starts from 0.
922   unsigned getIndex(Instruction *Instr) const {
923     for (auto I : Members)
924       if (I.second == Instr)
925         return I.first - SmallestKey;
926 
927     llvm_unreachable("InterleaveGroup contains no such member");
928   }
929 
930   Instruction *getInsertPos() const { return InsertPos; }
931   void setInsertPos(Instruction *Inst) { InsertPos = Inst; }
932 
933   /// Add metadata (e.g. alias info) from the instructions in this group to \p
934   /// NewInst.
935   ///
936   /// FIXME: this function currently does not add noalias metadata a'la
937   /// addNewMedata.  To do that we need to compute the intersection of the
938   /// noalias info from all members.
939   void addMetadata(Instruction *NewInst) const {
940     SmallVector<Value *, 4> VL;
941     std::transform(Members.begin(), Members.end(), std::back_inserter(VL),
942                    [](std::pair<int, Instruction *> p) { return p.second; });
943     propagateMetadata(NewInst, VL);
944   }
945 
946 private:
947   unsigned Factor; // Interleave Factor.
948   bool Reverse;
949   unsigned Align;
950   DenseMap<int, Instruction *> Members;
951   int SmallestKey = 0;
952   int LargestKey = 0;
953 
954   // To avoid breaking dependences, vectorized instructions of an interleave
955   // group should be inserted at either the first load or the last store in
956   // program order.
957   //
958   // E.g. %even = load i32             // Insert Position
959   //      %add = add i32 %even         // Use of %even
960   //      %odd = load i32
961   //
962   //      store i32 %even
963   //      %odd = add i32               // Def of %odd
964   //      store i32 %odd               // Insert Position
965   Instruction *InsertPos;
966 };
967 } // end namespace llvm
968 
969 namespace {
970 
971 /// \brief Drive the analysis of interleaved memory accesses in the loop.
972 ///
973 /// Use this class to analyze interleaved accesses only when we can vectorize
974 /// a loop. Otherwise it's meaningless to do analysis as the vectorization
975 /// on interleaved accesses is unsafe.
976 ///
977 /// The analysis collects interleave groups and records the relationships
978 /// between the member and the group in a map.
979 class InterleavedAccessInfo {
980 public:
981   InterleavedAccessInfo(PredicatedScalarEvolution &PSE, Loop *L,
982                         DominatorTree *DT, LoopInfo *LI,
983                         const LoopAccessInfo *LAI)
984     : PSE(PSE), TheLoop(L), DT(DT), LI(LI), LAI(LAI) {}
985 
986   ~InterleavedAccessInfo() {
987     SmallSet<InterleaveGroup *, 4> DelSet;
988     // Avoid releasing a pointer twice.
989     for (auto &I : InterleaveGroupMap)
990       DelSet.insert(I.second);
991     for (auto *Ptr : DelSet)
992       delete Ptr;
993   }
994 
995   /// \brief Analyze the interleaved accesses and collect them in interleave
996   /// groups. Substitute symbolic strides using \p Strides.
997   void analyzeInterleaving();
998 
999   /// \brief Check if \p Instr belongs to any interleave group.
1000   bool isInterleaved(Instruction *Instr) const {
1001     return InterleaveGroupMap.count(Instr);
1002   }
1003 
1004   /// \brief Get the interleave group that \p Instr belongs to.
1005   ///
1006   /// \returns nullptr if doesn't have such group.
1007   InterleaveGroup *getInterleaveGroup(Instruction *Instr) const {
1008     if (InterleaveGroupMap.count(Instr))
1009       return InterleaveGroupMap.find(Instr)->second;
1010     return nullptr;
1011   }
1012 
1013   /// \brief Returns true if an interleaved group that may access memory
1014   /// out-of-bounds requires a scalar epilogue iteration for correctness.
1015   bool requiresScalarEpilogue() const { return RequiresScalarEpilogue; }
1016 
1017 private:
1018   /// A wrapper around ScalarEvolution, used to add runtime SCEV checks.
1019   /// Simplifies SCEV expressions in the context of existing SCEV assumptions.
1020   /// The interleaved access analysis can also add new predicates (for example
1021   /// by versioning strides of pointers).
1022   PredicatedScalarEvolution &PSE;
1023 
1024   Loop *TheLoop;
1025   DominatorTree *DT;
1026   LoopInfo *LI;
1027   const LoopAccessInfo *LAI;
1028 
1029   /// True if the loop may contain non-reversed interleaved groups with
1030   /// out-of-bounds accesses. We ensure we don't speculatively access memory
1031   /// out-of-bounds by executing at least one scalar epilogue iteration.
1032   bool RequiresScalarEpilogue = false;
1033 
1034   /// Holds the relationships between the members and the interleave group.
1035   DenseMap<Instruction *, InterleaveGroup *> InterleaveGroupMap;
1036 
1037   /// Holds dependences among the memory accesses in the loop. It maps a source
1038   /// access to a set of dependent sink accesses.
1039   DenseMap<Instruction *, SmallPtrSet<Instruction *, 2>> Dependences;
1040 
1041   /// \brief The descriptor for a strided memory access.
1042   struct StrideDescriptor {
1043     StrideDescriptor() = default;
1044     StrideDescriptor(int64_t Stride, const SCEV *Scev, uint64_t Size,
1045                      unsigned Align)
1046         : Stride(Stride), Scev(Scev), Size(Size), Align(Align) {}
1047 
1048     // The access's stride. It is negative for a reverse access.
1049     int64_t Stride = 0;
1050 
1051     // The scalar expression of this access.
1052     const SCEV *Scev = nullptr;
1053 
1054     // The size of the memory object.
1055     uint64_t Size = 0;
1056 
1057     // The alignment of this access.
1058     unsigned Align = 0;
1059   };
1060 
1061   /// \brief A type for holding instructions and their stride descriptors.
1062   using StrideEntry = std::pair<Instruction *, StrideDescriptor>;
1063 
1064   /// \brief Create a new interleave group with the given instruction \p Instr,
1065   /// stride \p Stride and alignment \p Align.
1066   ///
1067   /// \returns the newly created interleave group.
1068   InterleaveGroup *createInterleaveGroup(Instruction *Instr, int Stride,
1069                                          unsigned Align) {
1070     assert(!InterleaveGroupMap.count(Instr) &&
1071            "Already in an interleaved access group");
1072     InterleaveGroupMap[Instr] = new InterleaveGroup(Instr, Stride, Align);
1073     return InterleaveGroupMap[Instr];
1074   }
1075 
1076   /// \brief Release the group and remove all the relationships.
1077   void releaseGroup(InterleaveGroup *Group) {
1078     for (unsigned i = 0; i < Group->getFactor(); i++)
1079       if (Instruction *Member = Group->getMember(i))
1080         InterleaveGroupMap.erase(Member);
1081 
1082     delete Group;
1083   }
1084 
1085   /// \brief Collect all the accesses with a constant stride in program order.
1086   void collectConstStrideAccesses(
1087       MapVector<Instruction *, StrideDescriptor> &AccessStrideInfo,
1088       const ValueToValueMap &Strides);
1089 
1090   /// \brief Returns true if \p Stride is allowed in an interleaved group.
1091   static bool isStrided(int Stride) {
1092     unsigned Factor = std::abs(Stride);
1093     return Factor >= 2 && Factor <= MaxInterleaveGroupFactor;
1094   }
1095 
1096   /// \brief Returns true if \p BB is a predicated block.
1097   bool isPredicated(BasicBlock *BB) const {
1098     return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
1099   }
1100 
1101   /// \brief Returns true if LoopAccessInfo can be used for dependence queries.
1102   bool areDependencesValid() const {
1103     return LAI && LAI->getDepChecker().getDependences();
1104   }
1105 
1106   /// \brief Returns true if memory accesses \p A and \p B can be reordered, if
1107   /// necessary, when constructing interleaved groups.
1108   ///
1109   /// \p A must precede \p B in program order. We return false if reordering is
1110   /// not necessary or is prevented because \p A and \p B may be dependent.
1111   bool canReorderMemAccessesForInterleavedGroups(StrideEntry *A,
1112                                                  StrideEntry *B) const {
1113     // Code motion for interleaved accesses can potentially hoist strided loads
1114     // and sink strided stores. The code below checks the legality of the
1115     // following two conditions:
1116     //
1117     // 1. Potentially moving a strided load (B) before any store (A) that
1118     //    precedes B, or
1119     //
1120     // 2. Potentially moving a strided store (A) after any load or store (B)
1121     //    that A precedes.
1122     //
1123     // It's legal to reorder A and B if we know there isn't a dependence from A
1124     // to B. Note that this determination is conservative since some
1125     // dependences could potentially be reordered safely.
1126 
1127     // A is potentially the source of a dependence.
1128     auto *Src = A->first;
1129     auto SrcDes = A->second;
1130 
1131     // B is potentially the sink of a dependence.
1132     auto *Sink = B->first;
1133     auto SinkDes = B->second;
1134 
1135     // Code motion for interleaved accesses can't violate WAR dependences.
1136     // Thus, reordering is legal if the source isn't a write.
1137     if (!Src->mayWriteToMemory())
1138       return true;
1139 
1140     // At least one of the accesses must be strided.
1141     if (!isStrided(SrcDes.Stride) && !isStrided(SinkDes.Stride))
1142       return true;
1143 
1144     // If dependence information is not available from LoopAccessInfo,
1145     // conservatively assume the instructions can't be reordered.
1146     if (!areDependencesValid())
1147       return false;
1148 
1149     // If we know there is a dependence from source to sink, assume the
1150     // instructions can't be reordered. Otherwise, reordering is legal.
1151     return !Dependences.count(Src) || !Dependences.lookup(Src).count(Sink);
1152   }
1153 
1154   /// \brief Collect the dependences from LoopAccessInfo.
1155   ///
1156   /// We process the dependences once during the interleaved access analysis to
1157   /// enable constant-time dependence queries.
1158   void collectDependences() {
1159     if (!areDependencesValid())
1160       return;
1161     auto *Deps = LAI->getDepChecker().getDependences();
1162     for (auto Dep : *Deps)
1163       Dependences[Dep.getSource(*LAI)].insert(Dep.getDestination(*LAI));
1164   }
1165 };
1166 
1167 /// Utility class for getting and setting loop vectorizer hints in the form
1168 /// of loop metadata.
1169 /// This class keeps a number of loop annotations locally (as member variables)
1170 /// and can, upon request, write them back as metadata on the loop. It will
1171 /// initially scan the loop for existing metadata, and will update the local
1172 /// values based on information in the loop.
1173 /// We cannot write all values to metadata, as the mere presence of some info,
1174 /// for example 'force', means a decision has been made. So, we need to be
1175 /// careful NOT to add them if the user hasn't specifically asked so.
1176 class LoopVectorizeHints {
1177   enum HintKind { HK_WIDTH, HK_UNROLL, HK_FORCE, HK_ISVECTORIZED };
1178 
1179   /// Hint - associates name and validation with the hint value.
1180   struct Hint {
1181     const char *Name;
1182     unsigned Value; // This may have to change for non-numeric values.
1183     HintKind Kind;
1184 
1185     Hint(const char *Name, unsigned Value, HintKind Kind)
1186         : Name(Name), Value(Value), Kind(Kind) {}
1187 
1188     bool validate(unsigned Val) {
1189       switch (Kind) {
1190       case HK_WIDTH:
1191         return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth;
1192       case HK_UNROLL:
1193         return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor;
1194       case HK_FORCE:
1195         return (Val <= 1);
1196       case HK_ISVECTORIZED:
1197         return (Val==0 || Val==1);
1198       }
1199       return false;
1200     }
1201   };
1202 
1203   /// Vectorization width.
1204   Hint Width;
1205 
1206   /// Vectorization interleave factor.
1207   Hint Interleave;
1208 
1209   /// Vectorization forced
1210   Hint Force;
1211 
1212   /// Already Vectorized
1213   Hint IsVectorized;
1214 
1215   /// Return the loop metadata prefix.
1216   static StringRef Prefix() { return "llvm.loop."; }
1217 
1218   /// True if there is any unsafe math in the loop.
1219   bool PotentiallyUnsafe = false;
1220 
1221 public:
1222   enum ForceKind {
1223     FK_Undefined = -1, ///< Not selected.
1224     FK_Disabled = 0,   ///< Forcing disabled.
1225     FK_Enabled = 1,    ///< Forcing enabled.
1226   };
1227 
1228   LoopVectorizeHints(const Loop *L, bool DisableInterleaving,
1229                      OptimizationRemarkEmitter &ORE)
1230       : Width("vectorize.width", VectorizerParams::VectorizationFactor,
1231               HK_WIDTH),
1232         Interleave("interleave.count", DisableInterleaving, HK_UNROLL),
1233         Force("vectorize.enable", FK_Undefined, HK_FORCE),
1234         IsVectorized("isvectorized", 0, HK_ISVECTORIZED), TheLoop(L), ORE(ORE) {
1235     // Populate values with existing loop metadata.
1236     getHintsFromMetadata();
1237 
1238     // force-vector-interleave overrides DisableInterleaving.
1239     if (VectorizerParams::isInterleaveForced())
1240       Interleave.Value = VectorizerParams::VectorizationInterleave;
1241 
1242     if (IsVectorized.Value != 1)
1243       // If the vectorization width and interleaving count are both 1 then
1244       // consider the loop to have been already vectorized because there's
1245       // nothing more that we can do.
1246       IsVectorized.Value = Width.Value == 1 && Interleave.Value == 1;
1247     DEBUG(if (DisableInterleaving && Interleave.Value == 1) dbgs()
1248           << "LV: Interleaving disabled by the pass manager\n");
1249   }
1250 
1251   /// Mark the loop L as already vectorized by setting the width to 1.
1252   void setAlreadyVectorized() {
1253     IsVectorized.Value = 1;
1254     Hint Hints[] = {IsVectorized};
1255     writeHintsToMetadata(Hints);
1256   }
1257 
1258   bool allowVectorization(Function *F, Loop *L, bool AlwaysVectorize) const {
1259     if (getForce() == LoopVectorizeHints::FK_Disabled) {
1260       DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n");
1261       emitRemarkWithHints();
1262       return false;
1263     }
1264 
1265     if (!AlwaysVectorize && getForce() != LoopVectorizeHints::FK_Enabled) {
1266       DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n");
1267       emitRemarkWithHints();
1268       return false;
1269     }
1270 
1271     if (getIsVectorized() == 1) {
1272       DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
1273       // FIXME: Add interleave.disable metadata. This will allow
1274       // vectorize.disable to be used without disabling the pass and errors
1275       // to differentiate between disabled vectorization and a width of 1.
1276       ORE.emit([&]() {
1277         return OptimizationRemarkAnalysis(vectorizeAnalysisPassName(),
1278                                           "AllDisabled", L->getStartLoc(),
1279                                           L->getHeader())
1280                << "loop not vectorized: vectorization and interleaving are "
1281                   "explicitly disabled, or the loop has already been "
1282                   "vectorized";
1283       });
1284       return false;
1285     }
1286 
1287     return true;
1288   }
1289 
1290   /// Dumps all the hint information.
1291   void emitRemarkWithHints() const {
1292     using namespace ore;
1293 
1294     ORE.emit([&]() {
1295       if (Force.Value == LoopVectorizeHints::FK_Disabled)
1296         return OptimizationRemarkMissed(LV_NAME, "MissedExplicitlyDisabled",
1297                                         TheLoop->getStartLoc(),
1298                                         TheLoop->getHeader())
1299                << "loop not vectorized: vectorization is explicitly disabled";
1300       else {
1301         OptimizationRemarkMissed R(LV_NAME, "MissedDetails",
1302                                    TheLoop->getStartLoc(),
1303                                    TheLoop->getHeader());
1304         R << "loop not vectorized";
1305         if (Force.Value == LoopVectorizeHints::FK_Enabled) {
1306           R << " (Force=" << NV("Force", true);
1307           if (Width.Value != 0)
1308             R << ", Vector Width=" << NV("VectorWidth", Width.Value);
1309           if (Interleave.Value != 0)
1310             R << ", Interleave Count="
1311               << NV("InterleaveCount", Interleave.Value);
1312           R << ")";
1313         }
1314         return R;
1315       }
1316     });
1317   }
1318 
1319   unsigned getWidth() const { return Width.Value; }
1320   unsigned getInterleave() const { return Interleave.Value; }
1321   unsigned getIsVectorized() const { return IsVectorized.Value; }
1322   enum ForceKind getForce() const { return (ForceKind)Force.Value; }
1323 
1324   /// \brief If hints are provided that force vectorization, use the AlwaysPrint
1325   /// pass name to force the frontend to print the diagnostic.
1326   const char *vectorizeAnalysisPassName() const {
1327     if (getWidth() == 1)
1328       return LV_NAME;
1329     if (getForce() == LoopVectorizeHints::FK_Disabled)
1330       return LV_NAME;
1331     if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth() == 0)
1332       return LV_NAME;
1333     return OptimizationRemarkAnalysis::AlwaysPrint;
1334   }
1335 
1336   bool allowReordering() const {
    // When loop hints that enable vectorization are provided, we allow the
    // vectorizer to change the order of operations given by the scalar loop.
    // This is not enabled by default because it can be unsafe or inefficient.
    // For example,
1340     // reordering floating-point operations will change the way round-off
1341     // error accumulates in the loop.
1342     return getForce() == LoopVectorizeHints::FK_Enabled || getWidth() > 1;
1343   }
1344 
1345   bool isPotentiallyUnsafe() const {
1346     // Avoid FP vectorization if the target is unsure about proper support.
1347     // This may be related to the SIMD unit in the target not handling
1348     // IEEE 754 FP ops properly, or bad single-to-double promotions.
1349     // Otherwise, a sequence of vectorized loops, even without reduction,
1350     // could lead to different end results on the destination vectors.
1351     return getForce() != LoopVectorizeHints::FK_Enabled && PotentiallyUnsafe;
1352   }
1353 
1354   void setPotentiallyUnsafe() { PotentiallyUnsafe = true; }
1355 
1356 private:
1357   /// Find hints specified in the loop metadata and update local values.
1358   void getHintsFromMetadata() {
1359     MDNode *LoopID = TheLoop->getLoopID();
1360     if (!LoopID)
1361       return;
1362 
1363     // First operand should refer to the loop id itself.
1364     assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
1365     assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
1366 
1367     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
1368       const MDString *S = nullptr;
1369       SmallVector<Metadata *, 4> Args;
1370 
      // The expected hint is either an MDString or an MDNode whose first
      // operand is an MDString.
1373       if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) {
1374         if (!MD || MD->getNumOperands() == 0)
1375           continue;
1376         S = dyn_cast<MDString>(MD->getOperand(0));
1377         for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i)
1378           Args.push_back(MD->getOperand(i));
1379       } else {
1380         S = dyn_cast<MDString>(LoopID->getOperand(i));
1381         assert(Args.size() == 0 && "too many arguments for MDString");
1382       }
1383 
1384       if (!S)
1385         continue;
1386 
1387       // Check if the hint starts with the loop metadata prefix.
1388       StringRef Name = S->getString();
1389       if (Args.size() == 1)
1390         setHint(Name, Args[0]);
1391     }
1392   }
1393 
  /// Checks a string hint with one operand and sets the value if valid.
1395   void setHint(StringRef Name, Metadata *Arg) {
1396     if (!Name.startswith(Prefix()))
1397       return;
1398     Name = Name.substr(Prefix().size(), StringRef::npos);
1399 
1400     const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg);
1401     if (!C)
1402       return;
1403     unsigned Val = C->getZExtValue();
1404 
1405     Hint *Hints[] = {&Width, &Interleave, &Force, &IsVectorized};
1406     for (auto H : Hints) {
1407       if (Name == H->Name) {
1408         if (H->validate(Val))
1409           H->Value = Val;
1410         else
1411           DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n");
1412         break;
1413       }
1414     }
1415   }
1416 
1417   /// Create a new hint from name / value pair.
1418   MDNode *createHintMetadata(StringRef Name, unsigned V) const {
1419     LLVMContext &Context = TheLoop->getHeader()->getContext();
1420     Metadata *MDs[] = {MDString::get(Context, Name),
1421                        ConstantAsMetadata::get(
1422                            ConstantInt::get(Type::getInt32Ty(Context), V))};
1423     return MDNode::get(Context, MDs);
1424   }
1425 
1426   /// Matches metadata with hint name.
1427   bool matchesHintMetadataName(MDNode *Node, ArrayRef<Hint> HintTypes) {
1428     MDString *Name = dyn_cast<MDString>(Node->getOperand(0));
1429     if (!Name)
1430       return false;
1431 
1432     for (auto H : HintTypes)
1433       if (Name->getString().endswith(H.Name))
1434         return true;
1435     return false;
1436   }
1437 
1438   /// Sets current hints into loop metadata, keeping other values intact.
1439   void writeHintsToMetadata(ArrayRef<Hint> HintTypes) {
1440     if (HintTypes.empty())
1441       return;
1442 
    // Reserve the first element for the LoopID self-reference (see below).
1444     SmallVector<Metadata *, 4> MDs(1);
    // If the loop already has metadata, keep the operands that are not being
    // overwritten by the hints below.
1446     MDNode *LoopID = TheLoop->getLoopID();
1447     if (LoopID) {
1448       for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
1449         MDNode *Node = cast<MDNode>(LoopID->getOperand(i));
1450         // If node in update list, ignore old value.
1451         if (!matchesHintMetadataName(Node, HintTypes))
1452           MDs.push_back(Node);
1453       }
1454     }
1455 
1456     // Now, add the missing hints.
1457     for (auto H : HintTypes)
1458       MDs.push_back(createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value));
1459 
1460     // Replace current metadata node with new one.
1461     LLVMContext &Context = TheLoop->getHeader()->getContext();
1462     MDNode *NewLoopID = MDNode::get(Context, MDs);
1463     // Set operand 0 to refer to the loop id itself.
1464     NewLoopID->replaceOperandWith(0, NewLoopID);
1465 
1466     TheLoop->setLoopID(NewLoopID);
1467   }
1468 
1469   /// The loop these hints belong to.
1470   const Loop *TheLoop;
1471 
1472   /// Interface to emit optimization remarks.
1473   OptimizationRemarkEmitter &ORE;
1474 };
1475 
1476 } // end anonymous namespace
1477 
1478 static void emitMissedWarning(Function *F, Loop *L,
1479                               const LoopVectorizeHints &LH,
1480                               OptimizationRemarkEmitter *ORE) {
1481   LH.emitRemarkWithHints();
1482 
1483   if (LH.getForce() == LoopVectorizeHints::FK_Enabled) {
1484     if (LH.getWidth() != 1)
1485       ORE->emit(DiagnosticInfoOptimizationFailure(
1486                     DEBUG_TYPE, "FailedRequestedVectorization",
1487                     L->getStartLoc(), L->getHeader())
1488                 << "loop not vectorized: "
1489                 << "failed explicitly specified loop vectorization");
1490     else if (LH.getInterleave() != 1)
1491       ORE->emit(DiagnosticInfoOptimizationFailure(
1492                     DEBUG_TYPE, "FailedRequestedInterleaving", L->getStartLoc(),
1493                     L->getHeader())
1494                 << "loop not interleaved: "
1495                 << "failed explicitly specified loop interleaving");
1496   }
1497 }
1498 
1499 namespace llvm {
1500 
1501 /// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
1502 /// to what vectorization factor.
1503 /// This class does not look at the profitability of vectorization, only the
1504 /// legality. This class has two main kinds of checks:
1505 /// * Memory checks - The code in canVectorizeMemory checks if vectorization
1506 ///   will change the order of memory accesses in a way that will change the
1507 ///   correctness of the program.
/// * Scalars checks - The code in canVectorizeInstrs and canVectorizeMemory
///   checks for a number of different conditions, such as the availability of
///   a single induction variable, that all types are supported and
///   vectorizable, etc. This code reflects the capabilities of
///   InnerLoopVectorizer.
/// This class is also used by InnerLoopVectorizer for identifying the
/// induction variables and the different reduction variables.
1514 class LoopVectorizationLegality {
1515 public:
1516   LoopVectorizationLegality(
1517       Loop *L, PredicatedScalarEvolution &PSE, DominatorTree *DT,
1518       TargetLibraryInfo *TLI, AliasAnalysis *AA, Function *F,
1519       std::function<const LoopAccessInfo &(Loop &)> *GetLAA, LoopInfo *LI,
1520       OptimizationRemarkEmitter *ORE, LoopVectorizationRequirements *R,
1521       LoopVectorizeHints *H, DemandedBits *DB, AssumptionCache *AC)
1522       : TheLoop(L), PSE(PSE), TLI(TLI), DT(DT), GetLAA(GetLAA),
1523         ORE(ORE), Requirements(R), Hints(H), DB(DB), AC(AC) {}
1524 
1525   /// ReductionList contains the reduction descriptors for all
1526   /// of the reductions that were found in the loop.
1527   using ReductionList = DenseMap<PHINode *, RecurrenceDescriptor>;
1528 
1529   /// InductionList saves induction variables and maps them to the
1530   /// induction descriptor.
1531   using InductionList = MapVector<PHINode *, InductionDescriptor>;
1532 
1533   /// RecurrenceSet contains the phi nodes that are recurrences other than
1534   /// inductions and reductions.
1535   using RecurrenceSet = SmallPtrSet<const PHINode *, 8>;
1536 
1537   /// Returns true if it is legal to vectorize this loop.
1538   /// This does not mean that it is profitable to vectorize this
1539   /// loop, only that it is legal to do so.
1540   bool canVectorize();
1541 
1542   /// Returns the primary induction variable.
1543   PHINode *getPrimaryInduction() { return PrimaryInduction; }
1544 
1545   /// Returns the reduction variables found in the loop.
1546   ReductionList *getReductionVars() { return &Reductions; }
1547 
1548   /// Returns the induction variables found in the loop.
1549   InductionList *getInductionVars() { return &Inductions; }
1550 
1551   /// Return the first-order recurrences found in the loop.
1552   RecurrenceSet *getFirstOrderRecurrences() { return &FirstOrderRecurrences; }
1553 
1554   /// Return the set of instructions to sink to handle first-order recurrences.
1555   DenseMap<Instruction *, Instruction *> &getSinkAfter() { return SinkAfter; }
1556 
1557   /// Returns the widest induction type.
1558   Type *getWidestInductionType() { return WidestIndTy; }
1559 
1560   /// Returns True if V is a Phi node of an induction variable in this loop.
1561   bool isInductionPhi(const Value *V);
1562 
1563   /// Returns True if V is a cast that is part of an induction def-use chain,
1564   /// and had been proven to be redundant under a runtime guard (in other
1565   /// words, the cast has the same SCEV expression as the induction phi).
1566   bool isCastedInductionVariable(const Value *V);
1567 
1568   /// Returns True if V can be considered as an induction variable in this
1569   /// loop. V can be the induction phi, or some redundant cast in the def-use
  /// chain of the induction phi.
1571   bool isInductionVariable(const Value *V);
1572 
1573   /// Returns True if PN is a reduction variable in this loop.
1574   bool isReductionVariable(PHINode *PN) { return Reductions.count(PN); }
1575 
1576   /// Returns True if Phi is a first-order recurrence in this loop.
1577   bool isFirstOrderRecurrence(const PHINode *Phi);
1578 
1579   /// Return true if the block BB needs to be predicated in order for the loop
1580   /// to be vectorized.
1581   bool blockNeedsPredication(BasicBlock *BB);
1582 
1583   /// Check if this pointer is consecutive when vectorizing. This happens
  /// when the last index of the GEP is the induction variable, or when the
1585   /// pointer itself is an induction variable.
1586   /// This check allows us to vectorize A[idx] into a wide load/store.
1587   /// Returns:
1588   /// 0 - Stride is unknown or non-consecutive.
1589   /// 1 - Address is consecutive.
1590   /// -1 - Address is consecutive, and decreasing.
1591   /// NOTE: This method must only be used before modifying the original scalar
1592   /// loop. Do not use after invoking 'createVectorizedLoopSkeleton' (PR34965).
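  ///
  /// As a rough illustration (assumed C-like examples, not from this code),
  /// for an induction variable i one would expect:
  ///   A[i]     -> 1   (consecutive, increasing)
  ///   A[N - i] -> -1  (consecutive, decreasing)
  ///   A[2 * i] -> 0   (strided, not consecutive)
  ///   A[B[i]]  -> 0   (stride unknown)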
1593   int isConsecutivePtr(Value *Ptr);
1594 
1595   /// Returns true if the value V is uniform within the loop.
1596   bool isUniform(Value *V);
1597 
  /// Returns the information that we collected about runtime memory checks.
1599   const RuntimePointerChecking *getRuntimePointerChecking() const {
1600     return LAI->getRuntimePointerChecking();
1601   }
1602 
1603   const LoopAccessInfo *getLAI() const { return LAI; }
1604 
1605   unsigned getMaxSafeDepDistBytes() { return LAI->getMaxSafeDepDistBytes(); }
1606 
1607   uint64_t getMaxSafeRegisterWidth() const {
    return LAI->getDepChecker().getMaxSafeRegisterWidth();
1609   }
1610 
1611   bool hasStride(Value *V) { return LAI->hasStride(V); }
1612 
  /// Returns true if the vector representation of the instruction \p I
  /// requires a mask.
1615   bool isMaskRequired(const Instruction *I) { return (MaskedOp.count(I) != 0); }
1616 
1617   unsigned getNumStores() const { return LAI->getNumStores(); }
1618   unsigned getNumLoads() const { return LAI->getNumLoads(); }
1619 
  /// Returns true if the NoNaN attribute is set on the function.
1621   bool hasFunNoNaNAttr() const { return HasFunNoNaNAttr; }
1622 
1623 private:
1624   /// Check if a single basic block loop is vectorizable.
1625   /// At this point we know that this is a loop with a constant trip count
1626   /// and we only need to check individual instructions.
1627   bool canVectorizeInstrs();
1628 
1629   /// When we vectorize loops we may change the order in which
1630   /// we read and write from memory. This method checks if it is
  /// legal to vectorize the code, considering only memory constraints.
  /// Returns true if the loop is vectorizable.
1633   bool canVectorizeMemory();
1634 
1635   /// Return true if we can vectorize this loop using the IF-conversion
1636   /// transformation.
1637   bool canVectorizeWithIfConvert();
1638 
1639   /// Return true if all of the instructions in the block can be speculatively
1640   /// executed. \p SafePtrs is a list of addresses that are known to be legal
1641   /// and we know that we can read from them without segfault.
1642   bool blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs);
1643 
1644   /// Updates the vectorization state by adding \p Phi to the inductions list.
1645   /// This can set \p Phi as the main induction of the loop if \p Phi is a
1646   /// better choice for the main induction than the existing one.
1647   void addInductionPhi(PHINode *Phi, const InductionDescriptor &ID,
1648                        SmallPtrSetImpl<Value *> &AllowedExit);
1649 
1650   /// Create an analysis remark that explains why vectorization failed
1651   ///
1652   /// \p RemarkName is the identifier for the remark.  If \p I is passed it is
1653   /// an instruction that prevents vectorization.  Otherwise the loop is used
1654   /// for the location of the remark.  \return the remark object that can be
1655   /// streamed to.
1656   OptimizationRemarkAnalysis
1657   createMissedAnalysis(StringRef RemarkName, Instruction *I = nullptr) const {
1658     return ::createMissedAnalysis(Hints->vectorizeAnalysisPassName(),
1659                                   RemarkName, TheLoop, I);
1660   }
1661 
  /// \brief If an access has a symbolic stride, this maps the pointer value to
1663   /// the stride symbol.
1664   const ValueToValueMap *getSymbolicStrides() {
1665     // FIXME: Currently, the set of symbolic strides is sometimes queried before
1666     // it's collected.  This happens from canVectorizeWithIfConvert, when the
1667     // pointer is checked to reference consecutive elements suitable for a
1668     // masked access.
1669     return LAI ? &LAI->getSymbolicStrides() : nullptr;
1670   }
1671 
1672   /// The loop that we evaluate.
1673   Loop *TheLoop;
1674 
1675   /// A wrapper around ScalarEvolution used to add runtime SCEV checks.
1676   /// Applies dynamic knowledge to simplify SCEV expressions in the context
1677   /// of existing SCEV assumptions. The analysis will also add a minimal set
1678   /// of new predicates if this is required to enable vectorization and
1679   /// unrolling.
1680   PredicatedScalarEvolution &PSE;
1681 
1682   /// Target Library Info.
1683   TargetLibraryInfo *TLI;
1684 
1685   /// Dominator Tree.
1686   DominatorTree *DT;
1687 
1688   // LoopAccess analysis.
1689   std::function<const LoopAccessInfo &(Loop &)> *GetLAA;
1690 
1691   // And the loop-accesses info corresponding to this loop.  This pointer is
1692   // null until canVectorizeMemory sets it up.
1693   const LoopAccessInfo *LAI = nullptr;
1694 
1695   /// Interface to emit optimization remarks.
1696   OptimizationRemarkEmitter *ORE;
1697 
1698   //  ---  vectorization state --- //
1699 
1700   /// Holds the primary induction variable. This is the counter of the
1701   /// loop.
1702   PHINode *PrimaryInduction = nullptr;
1703 
1704   /// Holds the reduction variables.
1705   ReductionList Reductions;
1706 
1707   /// Holds all of the induction variables that we found in the loop.
1708   /// Notice that inductions don't need to start at zero and that induction
1709   /// variables can be pointers.
1710   InductionList Inductions;
1711 
1712   /// Holds all the casts that participate in the update chain of the induction
1713   /// variables, and that have been proven to be redundant (possibly under a
1714   /// runtime guard). These casts can be ignored when creating the vectorized
1715   /// loop body.
1716   SmallPtrSet<Instruction *, 4> InductionCastsToIgnore;
1717 
1718   /// Holds the phi nodes that are first-order recurrences.
1719   RecurrenceSet FirstOrderRecurrences;
1720 
1721   /// Holds instructions that need to sink past other instructions to handle
1722   /// first-order recurrences.
1723   DenseMap<Instruction *, Instruction *> SinkAfter;
1724 
1725   /// Holds the widest induction type encountered.
1726   Type *WidestIndTy = nullptr;
1727 
1728   /// Allowed outside users. This holds the induction and reduction
1729   /// vars which can be accessed from outside the loop.
1730   SmallPtrSet<Value *, 4> AllowedExit;
1731 
1732   /// Can we assume the absence of NaNs.
1733   bool HasFunNoNaNAttr = false;
1734 
1735   /// Vectorization requirements that will go through late-evaluation.
1736   LoopVectorizationRequirements *Requirements;
1737 
1738   /// Used to emit an analysis of any legality issues.
1739   LoopVectorizeHints *Hints;
1740 
  /// The demanded bits analysis is used to compute the minimum type size in
1742   /// which a reduction can be computed.
1743   DemandedBits *DB;
1744 
1745   /// The assumption cache analysis is used to compute the minimum type size in
1746   /// which a reduction can be computed.
1747   AssumptionCache *AC;
1748 
1749   /// While vectorizing these instructions we have to generate a
  /// call to the appropriate masked intrinsic.
1751   SmallPtrSet<const Instruction *, 8> MaskedOp;
1752 };
1753 
1754 /// LoopVectorizationCostModel - estimates the expected speedups due to
1755 /// vectorization.
/// In many cases vectorization is not profitable. This can happen for a
/// number of reasons. In this class we mainly attempt to predict the
1758 /// expected speedup/slowdowns due to the supported instruction set. We use the
1759 /// TargetTransformInfo to query the different backends for the cost of
1760 /// different operations.
1761 class LoopVectorizationCostModel {
1762 public:
1763   LoopVectorizationCostModel(Loop *L, PredicatedScalarEvolution &PSE,
1764                              LoopInfo *LI, LoopVectorizationLegality *Legal,
1765                              const TargetTransformInfo &TTI,
1766                              const TargetLibraryInfo *TLI, DemandedBits *DB,
1767                              AssumptionCache *AC,
1768                              OptimizationRemarkEmitter *ORE, const Function *F,
1769                              const LoopVectorizeHints *Hints,
1770                              InterleavedAccessInfo &IAI)
1771       : TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB),
1772     AC(AC), ORE(ORE), TheFunction(F), Hints(Hints), InterleaveInfo(IAI) {}
1773 
1774   /// \return An upper bound for the vectorization factor, or None if
1775   /// vectorization should be avoided up front.
1776   Optional<unsigned> computeMaxVF(bool OptForSize);
1777 
1778   /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not zero,
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
1782   VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
1783 
1784   /// Setup cost-based decisions for user vectorization factor.
1785   void selectUserVectorizationFactor(unsigned UserVF) {
1786     collectUniformsAndScalars(UserVF);
1787     collectInstsToScalarize(UserVF);
1788   }
1789 
1790   /// \return The size (in bits) of the smallest and widest types in the code
1791   /// that needs to be vectorized. We ignore values that remain scalar such as
1792   /// 64 bit loop indices.
1793   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1794 
1795   /// \return The desired interleave count.
1796   /// If interleave count has been specified by metadata it will be returned.
1797   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1798   /// are the selected vectorization factor and the cost of the selected VF.
1799   unsigned selectInterleaveCount(bool OptForSize, unsigned VF,
1800                                  unsigned LoopCost);
1801 
  /// A memory access instruction may be vectorized in more than one way.
  /// The form an instruction takes after vectorization depends on its cost.
  /// This function makes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
1809   void setCostBasedWideningDecision(unsigned VF);
1810 
1811   /// \brief A struct that represents some properties of the register usage
1812   /// of a loop.
1813   struct RegisterUsage {
1814     /// Holds the number of loop invariant values that are used in the loop.
1815     unsigned LoopInvariantRegs;
1816 
1817     /// Holds the maximum number of concurrent live intervals in the loop.
1818     unsigned MaxLocalUsers;
1819   };
1820 
  /// \return Returns information about the register usage of the loop for the
  /// given vectorization factors.
1823   SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);
1824 
1825   /// Collect values we want to ignore in the cost model.
1826   void collectValuesToIgnore();
1827 
1828   /// \returns The smallest bitwidth each instruction can be represented with.
1829   /// The vector equivalents of these instructions should be truncated to this
1830   /// type.
1831   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1832     return MinBWs;
1833   }
1834 
1835   /// \returns True if it is more profitable to scalarize instruction \p I for
1836   /// vectorization factor \p VF.
1837   bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
1838     assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");
1839     auto Scalars = InstsToScalarize.find(VF);
1840     assert(Scalars != InstsToScalarize.end() &&
1841            "VF not yet analyzed for scalarization profitability");
1842     return Scalars->second.count(I);
1843   }
1844 
1845   /// Returns true if \p I is known to be uniform after vectorization.
1846   bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
1847     if (VF == 1)
1848       return true;
1849     assert(Uniforms.count(VF) && "VF not yet analyzed for uniformity");
1850     auto UniformsPerVF = Uniforms.find(VF);
1851     return UniformsPerVF->second.count(I);
1852   }
1853 
1854   /// Returns true if \p I is known to be scalar after vectorization.
1855   bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
1856     if (VF == 1)
1857       return true;
1858     assert(Scalars.count(VF) && "Scalar values are not calculated for VF");
1859     auto ScalarsPerVF = Scalars.find(VF);
1860     return ScalarsPerVF->second.count(I);
1861   }
1862 
1863   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1864   /// for vectorization factor \p VF.
1865   bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
1866     return VF > 1 && MinBWs.count(I) && !isProfitableToScalarize(I, VF) &&
1867            !isScalarAfterVectorization(I, VF);
1868   }
1869 
1870   /// Decision that was taken during cost calculation for memory instruction.
1871   enum InstWidening {
1872     CM_Unknown,
1873     CM_Widen,         // For consecutive accesses with stride +1.
1874     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1875     CM_Interleave,
1876     CM_GatherScatter,
1877     CM_Scalarize
1878   };
1879 
1880   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1881   /// instruction \p I and vector width \p VF.
1882   void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
1883                            unsigned Cost) {
1884     assert(VF >= 2 && "Expected VF >=2");
1885     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1886   }
1887 
1888   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1889   /// interleaving group \p Grp and vector width \p VF.
1890   void setWideningDecision(const InterleaveGroup *Grp, unsigned VF,
1891                            InstWidening W, unsigned Cost) {
1892     assert(VF >= 2 && "Expected VF >=2");
    /// Broadcast this decision to all instructions inside the group.
1894     /// But the cost will be assigned to one instruction only.
1895     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1896       if (auto *I = Grp->getMember(i)) {
1897         if (Grp->getInsertPos() == I)
1898           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1899         else
1900           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1901       }
1902     }
1903   }
1904 
1905   /// Return the cost model decision for the given instruction \p I and vector
1906   /// width \p VF. Return CM_Unknown if this instruction did not pass
1907   /// through the cost modeling.
1908   InstWidening getWideningDecision(Instruction *I, unsigned VF) {
1909     assert(VF >= 2 && "Expected VF >=2");
1910     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1911     auto Itr = WideningDecisions.find(InstOnVF);
1912     if (Itr == WideningDecisions.end())
1913       return CM_Unknown;
1914     return Itr->second.first;
1915   }
1916 
1917   /// Return the vectorization cost for the given instruction \p I and vector
1918   /// width \p VF.
1919   unsigned getWideningCost(Instruction *I, unsigned VF) {
1920     assert(VF >= 2 && "Expected VF >=2");
1921     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1922     assert(WideningDecisions.count(InstOnVF) && "The cost is not calculated");
1923     return WideningDecisions[InstOnVF].second;
1924   }
1925 
1926   /// Return True if instruction \p I is an optimizable truncate whose operand
1927   /// is an induction variable. Such a truncate will be removed by adding a new
1928   /// induction variable with the destination type.
1929   bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
1930     // If the instruction is not a truncate, return false.
1931     auto *Trunc = dyn_cast<TruncInst>(I);
1932     if (!Trunc)
1933       return false;
1934 
1935     // Get the source and destination types of the truncate.
1936     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1937     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1938 
1939     // If the truncate is free for the given types, return false. Replacing a
1940     // free truncate with an induction variable would add an induction variable
1941     // update instruction to each iteration of the loop. We exclude from this
1942     // check the primary induction variable since it will need an update
1943     // instruction regardless.
1944     Value *Op = Trunc->getOperand(0);
1945     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1946       return false;
1947 
1948     // If the truncated value is not an induction variable, return false.
1949     return Legal->isInductionPhi(Op);
1950   }
1951 
1952   /// Collects the instructions to scalarize for each predicated instruction in
1953   /// the loop.
1954   void collectInstsToScalarize(unsigned VF);
1955 
1956   /// Collect Uniform and Scalar values for the given \p VF.
1957   /// The sets depend on CM decision for Load/Store instructions
1958   /// that may be vectorized as interleave, gather-scatter or scalarized.
1959   void collectUniformsAndScalars(unsigned VF) {
1960     // Do the analysis once.
1961     if (VF == 1 || Uniforms.count(VF))
1962       return;
1963     setCostBasedWideningDecision(VF);
1964     collectLoopUniforms(VF);
1965     collectLoopScalars(VF);
1966   }
1967 
1968   /// Returns true if the target machine supports masked store operation
1969   /// for the given \p DataType and kind of access to \p Ptr.
1970   bool isLegalMaskedStore(Type *DataType, Value *Ptr) {
1971     return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedStore(DataType);
1972   }
1973 
1974   /// Returns true if the target machine supports masked load operation
1975   /// for the given \p DataType and kind of access to \p Ptr.
1976   bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {
1977     return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedLoad(DataType);
1978   }
1979 
1980   /// Returns true if the target machine supports masked scatter operation
1981   /// for the given \p DataType.
1982   bool isLegalMaskedScatter(Type *DataType) {
1983     return TTI.isLegalMaskedScatter(DataType);
1984   }
1985 
1986   /// Returns true if the target machine supports masked gather operation
1987   /// for the given \p DataType.
1988   bool isLegalMaskedGather(Type *DataType) {
1989     return TTI.isLegalMaskedGather(DataType);
1990   }
1991 
1992   /// Returns true if the target machine can represent \p V as a masked gather
1993   /// or scatter operation.
1994   bool isLegalGatherOrScatter(Value *V) {
1995     bool LI = isa<LoadInst>(V);
1996     bool SI = isa<StoreInst>(V);
1997     if (!LI && !SI)
1998       return false;
1999     auto *Ty = getMemInstValueType(V);
2000     return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty));
2001   }
2002 
2003   /// Returns true if \p I is an instruction that will be scalarized with
2004   /// predication. Such instructions include conditional stores and
2005   /// instructions that may divide by zero.
2006   bool isScalarWithPredication(Instruction *I);
2007 
2008   /// Returns true if \p I is a memory instruction with consecutive memory
2009   /// access that can be widened.
2010   bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
2011 
2012   /// \brief Check if \p Instr belongs to any interleaved access group.
2013   bool isAccessInterleaved(Instruction *Instr) {
2014     return InterleaveInfo.isInterleaved(Instr);
2015   }
2016 
2017   /// \brief Get the interleaved access group that \p Instr belongs to.
2018   const InterleaveGroup *getInterleavedAccessGroup(Instruction *Instr) {
2019     return InterleaveInfo.getInterleaveGroup(Instr);
2020   }
2021 
2022   /// \brief Returns true if an interleaved group requires a scalar iteration
2023   /// to handle accesses with gaps.
2024   bool requiresScalarEpilogue() const {
2025     return InterleaveInfo.requiresScalarEpilogue();
2026   }
2027 
2028 private:
2029   unsigned NumPredStores = 0;
2030 
2031   /// \return An upper bound for the vectorization factor, larger than zero.
2032   /// One is returned if vectorization should best be avoided due to cost.
2033   unsigned computeFeasibleMaxVF(bool OptForSize, unsigned ConstTripCount);
2034 
  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
2042   using VectorizationCostTy = std::pair<unsigned, bool>;
2043 
2044   /// Returns the expected execution cost. The unit of the cost does
2045   /// not matter because we use the 'cost' units to compare different
2046   /// vector widths. The cost that is returned is *not* normalized by
2047   /// the factor width.
2048   VectorizationCostTy expectedCost(unsigned VF);
2049 
2050   /// Returns the execution time cost of an instruction for a given vector
2051   /// width. Vector width of one means scalar.
2052   VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
2053 
2054   /// The cost-computation logic from getInstructionCost which provides
2055   /// the vector type as an output parameter.
2056   unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
2057 
2058   /// Calculate vectorization cost of memory instruction \p I.
2059   unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);
2060 
2061   /// The cost computation for scalarized memory instruction.
2062   unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
2063 
2064   /// The cost computation for interleaving group of memory instructions.
2065   unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
2066 
2067   /// The cost computation for Gather/Scatter instruction.
2068   unsigned getGatherScatterCost(Instruction *I, unsigned VF);
2069 
2070   /// The cost computation for widening instruction \p I with consecutive
2071   /// memory access.
2072   unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
2073 
2074   /// The cost calculation for Load instruction \p I with uniform pointer -
2075   /// scalar load + broadcast.
2076   unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
2077 
  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
2080   bool isConsecutiveLoadOrStore(Instruction *I);
2081 
2082   /// Returns true if an artificially high cost for emulated masked memrefs
2083   /// should be used.
2084   bool useEmulatedMaskMemRefHack(Instruction *I);
2085 
2086   /// Create an analysis remark that explains why vectorization failed
2087   ///
2088   /// \p RemarkName is the identifier for the remark.  \return the remark object
2089   /// that can be streamed to.
2090   OptimizationRemarkAnalysis createMissedAnalysis(StringRef RemarkName) {
2091     return ::createMissedAnalysis(Hints->vectorizeAnalysisPassName(),
2092                                   RemarkName, TheLoop);
2093   }
2094 
2095   /// Map of scalar integer values to the smallest bitwidth they can be legally
2096   /// represented as. The vector equivalents of these values should be truncated
2097   /// to this type.
2098   MapVector<Instruction *, uint64_t> MinBWs;
2099 
2100   /// A type representing the costs for instructions if they were to be
2101   /// scalarized rather than vectorized. The entries are Instruction-Cost
2102   /// pairs.
2103   using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
2104 
  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
2107   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
2108 
2109   /// A map holding scalar costs for different vectorization factors. The
2110   /// presence of a cost for an instruction in the mapping indicates that the
2111   /// instruction will be scalarized when vectorizing with the associated
2112   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
2113   DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
2114 
2115   /// Holds the instructions known to be uniform after vectorization.
2116   /// The data is collected per VF.
2117   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;
2118 
2119   /// Holds the instructions known to be scalar after vectorization.
2120   /// The data is collected per VF.
2121   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;
2122 
2123   /// Holds the instructions (address computations) that are forced to be
2124   /// scalarized.
2125   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars;
2126 
2127   /// Returns the expected difference in cost from scalarizing the expression
2128   /// feeding a predicated instruction \p PredInst. The instructions to
2129   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
2130   /// non-negative return value implies the expression will be scalarized.
2131   /// Currently, only single-use chains are considered for scalarization.
2132   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
2133                               unsigned VF);
2134 
2135   /// Collect the instructions that are uniform after vectorization. An
2136   /// instruction is uniform if we represent it with a single scalar value in
2137   /// the vectorized loop corresponding to each vector iteration. Examples of
2138   /// uniform instructions include pointer operands of consecutive or
2139   /// interleaved memory accesses. Note that although uniformity implies an
2140   /// instruction will be scalar, the reverse is not true. In general, a
2141   /// scalarized instruction will be represented by VF scalar values in the
2142   /// vectorized loop, each corresponding to an iteration of the original
2143   /// scalar loop.
2144   void collectLoopUniforms(unsigned VF);
2145 
2146   /// Collect the instructions that are scalar after vectorization. An
2147   /// instruction is scalar if it is known to be uniform or will be scalarized
2148   /// during vectorization. Non-uniform scalarized instructions will be
2149   /// represented by VF values in the vectorized loop, each corresponding to an
2150   /// iteration of the original scalar loop.
2151   void collectLoopScalars(unsigned VF);
2152 
2153   /// Keeps cost model vectorization decision and cost for instructions.
2154   /// Right now it is used for memory instructions only.
2155   using DecisionList = DenseMap<std::pair<Instruction *, unsigned>,
2156                                 std::pair<InstWidening, unsigned>>;
2157 
2158   DecisionList WideningDecisions;
2159 
2160 public:
2161   /// The loop that we evaluate.
2162   Loop *TheLoop;
2163 
2164   /// Predicated scalar evolution analysis.
2165   PredicatedScalarEvolution &PSE;
2166 
2167   /// Loop Info analysis.
2168   LoopInfo *LI;
2169 
2170   /// Vectorization legality.
2171   LoopVectorizationLegality *Legal;
2172 
2173   /// Vector target information.
2174   const TargetTransformInfo &TTI;
2175 
2176   /// Target Library Info.
2177   const TargetLibraryInfo *TLI;
2178 
2179   /// Demanded bits analysis.
2180   DemandedBits *DB;
2181 
2182   /// Assumption cache.
2183   AssumptionCache *AC;
2184 
2185   /// Interface to emit optimization remarks.
2186   OptimizationRemarkEmitter *ORE;
2187 
2188   const Function *TheFunction;
2189 
2190   /// Loop Vectorize Hint.
2191   const LoopVectorizeHints *Hints;
2192 
2193   /// The interleave access information contains groups of interleaved accesses
2194   /// with the same stride and close to each other.
2195   InterleavedAccessInfo &InterleaveInfo;
2196 
2197   /// Values to ignore in the cost model.
2198   SmallPtrSet<const Value *, 16> ValuesToIgnore;
2199 
2200   /// Values to ignore in the cost model when VF > 1.
2201   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
2202 };
2203 
2204 } // end namespace llvm
2205 
2206 namespace {
2207 
2208 /// \brief This holds vectorization requirements that must be verified late in
/// the process. The requirements are set by the legality and cost model. Once
2210 /// vectorization has been determined to be possible and profitable the
2211 /// requirements can be verified by looking for metadata or compiler options.
2212 /// For example, some loops require FP commutativity which is only allowed if
2213 /// vectorization is explicitly specified or if the fast-math compiler option
2214 /// has been provided.
2215 /// Late evaluation of these requirements allows helpful diagnostics to be
/// composed that tell the user what needs to be done to vectorize the loop. For
/// example, by specifying #pragma clang loop vectorize or -ffast-math. Late
/// evaluation should be used only when diagnostics can be generated that can be
/// followed by a non-expert user.
2220 class LoopVectorizationRequirements {
2221 public:
2222   LoopVectorizationRequirements(OptimizationRemarkEmitter &ORE) : ORE(ORE) {}
2223 
2224   void addUnsafeAlgebraInst(Instruction *I) {
2225     // First unsafe algebra instruction.
2226     if (!UnsafeAlgebraInst)
2227       UnsafeAlgebraInst = I;
2228   }
2229 
2230   void addRuntimePointerChecks(unsigned Num) { NumRuntimePointerChecks = Num; }
2231 
2232   bool doesNotMeet(Function *F, Loop *L, const LoopVectorizeHints &Hints) {
2233     const char *PassName = Hints.vectorizeAnalysisPassName();
2234     bool Failed = false;
2235     if (UnsafeAlgebraInst && !Hints.allowReordering()) {
2236       ORE.emit([&]() {
2237         return OptimizationRemarkAnalysisFPCommute(
2238                    PassName, "CantReorderFPOps",
2239                    UnsafeAlgebraInst->getDebugLoc(),
2240                    UnsafeAlgebraInst->getParent())
2241                << "loop not vectorized: cannot prove it is safe to reorder "
2242                   "floating-point operations";
2243       });
2244       Failed = true;
2245     }
2246 
2247     // Test if runtime memcheck thresholds are exceeded.
2248     bool PragmaThresholdReached =
2249         NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
2250     bool ThresholdReached =
2251         NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
2252     if ((ThresholdReached && !Hints.allowReordering()) ||
2253         PragmaThresholdReached) {
2254       ORE.emit([&]() {
2255         return OptimizationRemarkAnalysisAliasing(PassName, "CantReorderMemOps",
2256                                                   L->getStartLoc(),
2257                                                   L->getHeader())
2258                << "loop not vectorized: cannot prove it is safe to reorder "
2259                   "memory operations";
2260       });
2261       DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
2262       Failed = true;
2263     }
2264 
2265     return Failed;
2266   }
2267 
2268 private:
2269   unsigned NumRuntimePointerChecks = 0;
2270   Instruction *UnsafeAlgebraInst = nullptr;
2271 
2272   /// Interface to emit optimization remarks.
2273   OptimizationRemarkEmitter &ORE;
2274 };
2275 
2276 } // end anonymous namespace
2277 
2278 static void addAcyclicInnerLoop(Loop &L, LoopInfo &LI,
2279                                 SmallVectorImpl<Loop *> &V) {
2280   if (L.empty()) {
2281     LoopBlocksRPO RPOT(&L);
2282     RPOT.perform(&LI);
2283     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, LI))
2284       V.push_back(&L);
2285     return;
2286   }
2287   for (Loop *InnerL : L)
2288     addAcyclicInnerLoop(*InnerL, LI, V);
2289 }
2290 
2291 namespace {
2292 
2293 /// The LoopVectorize Pass.
2294 struct LoopVectorize : public FunctionPass {
2295   /// Pass identification, replacement for typeid
2296   static char ID;
2297 
2298   LoopVectorizePass Impl;
2299 
2300   explicit LoopVectorize(bool NoUnrolling = false, bool AlwaysVectorize = true)
2301       : FunctionPass(ID) {
2302     Impl.DisableUnrolling = NoUnrolling;
2303     Impl.AlwaysVectorize = AlwaysVectorize;
2304     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
2305   }
2306 
2307   bool runOnFunction(Function &F) override {
2308     if (skipFunction(F))
2309       return false;
2310 
2311     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
2312     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
2313     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
2314     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
2315     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
2316     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
2317     auto *TLI = TLIP ? &TLIP->getTLI() : nullptr;
2318     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2319     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2320     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
2321     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
2322     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
2323 
2324     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
2325         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
2326 
2327     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
2328                         GetLAA, *ORE);
2329   }
2330 
2331   void getAnalysisUsage(AnalysisUsage &AU) const override {
2332     AU.addRequired<AssumptionCacheTracker>();
2333     AU.addRequired<BlockFrequencyInfoWrapperPass>();
2334     AU.addRequired<DominatorTreeWrapperPass>();
2335     AU.addRequired<LoopInfoWrapperPass>();
2336     AU.addRequired<ScalarEvolutionWrapperPass>();
2337     AU.addRequired<TargetTransformInfoWrapperPass>();
2338     AU.addRequired<AAResultsWrapperPass>();
2339     AU.addRequired<LoopAccessLegacyAnalysis>();
2340     AU.addRequired<DemandedBitsWrapperPass>();
2341     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
2342     AU.addPreserved<LoopInfoWrapperPass>();
2343     AU.addPreserved<DominatorTreeWrapperPass>();
2344     AU.addPreserved<BasicAAWrapperPass>();
2345     AU.addPreserved<GlobalsAAWrapperPass>();
2346   }
2347 };
2348 
2349 } // end anonymous namespace
2350 
2351 //===----------------------------------------------------------------------===//
2352 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2353 // LoopVectorizationCostModel and LoopVectorizationPlanner.
2354 //===----------------------------------------------------------------------===//
2355 
2356 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
2357   // We need to place the broadcast of invariant variables outside the loop.
2358   Instruction *Instr = dyn_cast<Instruction>(V);
2359   bool NewInstr = (Instr && Instr->getParent() == LoopVectorBody);
2360   bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr;
2361 
2362   // Place the code for broadcasting invariant variables in the new preheader.
2363   IRBuilder<>::InsertPointGuard Guard(Builder);
2364   if (Invariant)
2365     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2366 
2367   // Broadcast the scalar into all locations in the vector.
2368   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
2369 
2370   return Shuf;
2371 }
2372 
2373 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
2374     const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
2375   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
2376          "Expected either an induction phi-node or a truncate of it!");
2377   Value *Start = II.getStartValue();
2378 
2379   // Construct the initial value of the vector IV in the vector loop preheader
2380   auto CurrIP = Builder.saveIP();
2381   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2382   if (isa<TruncInst>(EntryVal)) {
2383     assert(Start->getType()->isIntegerTy() &&
2384            "Truncation requires an integer type");
2385     auto *TruncType = cast<IntegerType>(EntryVal->getType());
2386     Step = Builder.CreateTrunc(Step, TruncType);
2387     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
2388   }
2389   Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
2390   Value *SteppedStart =
2391       getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
2392 
2393   // We create vector phi nodes for both integer and floating-point induction
2394   // variables. Here, we determine the kind of arithmetic we will perform.
2395   Instruction::BinaryOps AddOp;
2396   Instruction::BinaryOps MulOp;
2397   if (Step->getType()->isIntegerTy()) {
2398     AddOp = Instruction::Add;
2399     MulOp = Instruction::Mul;
2400   } else {
2401     AddOp = II.getInductionOpcode();
2402     MulOp = Instruction::FMul;
2403   }
2404 
2405   // Multiply the vectorization factor by the step using integer or
2406   // floating-point arithmetic as appropriate.
2407   Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
2408   Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
2409 
2410   // Create a vector splat to use in the induction update.
2411   //
2412   // FIXME: If the step is non-constant, we create the vector splat with
2413   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
2414   //        handle a constant vector splat.
2415   Value *SplatVF = isa<Constant>(Mul)
2416                        ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
2417                        : Builder.CreateVectorSplat(VF, Mul);
2418   Builder.restoreIP(CurrIP);
2419 
2420   // We may need to add the step a number of times, depending on the unroll
2421   // factor. The last of those goes into the PHI.
2422   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
2423                                     &*LoopVectorBody->getFirstInsertionPt());
2424   Instruction *LastInduction = VecInd;
2425   for (unsigned Part = 0; Part < UF; ++Part) {
2426     VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
2427 
2428     if (isa<TruncInst>(EntryVal))
2429       addMetadata(LastInduction, EntryVal);
2430     recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
2431 
2432     LastInduction = cast<Instruction>(addFastMathFlag(
2433         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
2434   }
2435 
2436   // Move the last step to the end of the latch block. This ensures consistent
2437   // placement of all induction updates.
2438   auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
2439   auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
2440   auto *ICmp = cast<Instruction>(Br->getCondition());
2441   LastInduction->moveBefore(ICmp);
2442   LastInduction->setName("vec.ind.next");
2443 
2444   VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
2445   VecInd->addIncoming(LastInduction, LoopVectorLatch);
2446 }
2447 
2448 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
2449   return Cost->isScalarAfterVectorization(I, VF) ||
2450          Cost->isProfitableToScalarize(I, VF);
2451 }
2452 
2453 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
2454   if (shouldScalarizeInstruction(IV))
2455     return true;
2456   auto isScalarInst = [&](User *U) -> bool {
2457     auto *I = cast<Instruction>(U);
2458     return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
2459   };
2460   return llvm::any_of(IV->users(), isScalarInst);
2461 }
2462 
2463 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
2464     const InductionDescriptor &ID, const Instruction *EntryVal,
2465     Value *VectorLoopVal, unsigned Part, unsigned Lane) {
2466   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
2467          "Expected either an induction phi-node or a truncate of it!");
2468 
2469   // This induction variable is not the phi from the original loop but the
  // newly-created IV based on the proof that the casted Phi is equal to the
  // uncasted Phi in the vectorized loop (possibly under a runtime guard). It
  // re-uses the same InductionDescriptor that the original IV uses, but we
  // don't have to do any recording in this case - that is done when the
  // original IV is processed.
2475   if (isa<TruncInst>(EntryVal))
2476     return;
2477 
2478   const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
2479   if (Casts.empty())
2480     return;
2481   // Only the first Cast instruction in the Casts vector is of interest.
  // The rest of the Casts (if they exist) have no uses outside the
2483   // induction update chain itself.
2484   Instruction *CastInst = *Casts.begin();
2485   if (Lane < UINT_MAX)
2486     VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
2487   else
2488     VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
2489 }
2490 
2491 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
2492   assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
2493          "Primary induction variable must have an integer type");
2494 
2495   auto II = Legal->getInductionVars()->find(IV);
2496   assert(II != Legal->getInductionVars()->end() && "IV is not an induction");
2497 
2498   auto ID = II->second;
2499   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
2500 
2501   // The scalar value to broadcast. This will be derived from the canonical
2502   // induction variable.
2503   Value *ScalarIV = nullptr;
2504 
2505   // The value from the original loop to which we are mapping the new induction
2506   // variable.
2507   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
2508 
2509   // True if we have vectorized the induction variable.
2510   auto VectorizedIV = false;
2511 
2512   // Determine if we want a scalar version of the induction variable. This is
2513   // true if the induction variable itself is not widened, or if it has at
2514   // least one user in the loop that is not widened.
2515   auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal);
2516 
2517   // Generate code for the induction step. Note that induction steps are
  // required to be loop-invariant.
2519   assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) &&
2520          "Induction step should be loop invariant");
2521   auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
2522   Value *Step = nullptr;
2523   if (PSE.getSE()->isSCEVable(IV->getType())) {
2524     SCEVExpander Exp(*PSE.getSE(), DL, "induction");
2525     Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
2526                              LoopVectorPreHeader->getTerminator());
2527   } else {
2528     Step = cast<SCEVUnknown>(ID.getStep())->getValue();
2529   }
2530 
2531   // Try to create a new independent vector induction variable. If we can't
2532   // create the phi node, we will splat the scalar induction variable in each
2533   // loop iteration.
2534   if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) {
2535     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
2536     VectorizedIV = true;
2537   }
2538 
2539   // If we haven't yet vectorized the induction variable, or if we will create
2540   // a scalar one, we need to define the scalar induction variable and step
2541   // values. If we were given a truncation type, truncate the canonical
2542   // induction variable and step. Otherwise, derive these values from the
2543   // induction descriptor.
2544   if (!VectorizedIV || NeedsScalarIV) {
2545     ScalarIV = Induction;
2546     if (IV != OldInduction) {
2547       ScalarIV = IV->getType()->isIntegerTy()
2548                      ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
2549                      : Builder.CreateCast(Instruction::SIToFP, Induction,
2550                                           IV->getType());
2551       ScalarIV = ID.transform(Builder, ScalarIV, PSE.getSE(), DL);
2552       ScalarIV->setName("offset.idx");
2553     }
2554     if (Trunc) {
2555       auto *TruncType = cast<IntegerType>(Trunc->getType());
2556       assert(Step->getType()->isIntegerTy() &&
2557              "Truncation requires an integer step");
2558       ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
2559       Step = Builder.CreateTrunc(Step, TruncType);
2560     }
2561   }
2562 
2563   // If we haven't yet vectorized the induction variable, splat the scalar
2564   // induction variable, and build the necessary step vectors.
2565   // TODO: Don't do it unless the vectorized IV is really required.
2566   if (!VectorizedIV) {
2567     Value *Broadcasted = getBroadcastInstrs(ScalarIV);
2568     for (unsigned Part = 0; Part < UF; ++Part) {
2569       Value *EntryPart =
2570           getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
2571       VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
2572       if (Trunc)
2573         addMetadata(EntryPart, Trunc);
2574       recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
2575     }
2576   }
2577 
2578   // If an induction variable is only used for counting loop iterations or
2579   // calculating addresses, it doesn't need to be widened. Create scalar steps
2580   // that can be used by instructions we will later scalarize. Note that the
2581   // addition of the scalar steps will not increase the number of instructions
2582   // in the loop in the common case prior to InstCombine. We will be trading
2583   // one vector extract for each scalar step.
2584   if (NeedsScalarIV)
2585     buildScalarSteps(ScalarIV, Step, EntryVal, ID);
2586 }
2587 
2588 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
2589                                           Instruction::BinaryOps BinOp) {
2590   // Create and check the types.
2591   assert(Val->getType()->isVectorTy() && "Must be a vector");
2592   int VLen = Val->getType()->getVectorNumElements();
2593 
2594   Type *STy = Val->getType()->getScalarType();
2595   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2596          "Induction Step must be an integer or FP");
2597   assert(Step->getType() == STy && "Step has wrong type");
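  // For example (a sketch, values chosen only for illustration): with
  // VLen = 4, StartIdx = 4 and Step = 2, the integer path below computes
  //   Val + <4, 5, 6, 7> * 2  ==  Val + <8, 10, 12, 14>
  // i.e. lane i of the result holds Val[i] + (StartIdx + i) * Step.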
2598 
2599   SmallVector<Constant *, 8> Indices;
2600 
2601   if (STy->isIntegerTy()) {
    // Create a vector of consecutive numbers from StartIdx to
    // StartIdx + VLen - 1.
2603     for (int i = 0; i < VLen; ++i)
2604       Indices.push_back(ConstantInt::get(STy, StartIdx + i));
2605 
2606     // Add the consecutive indices to the vector value.
2607     Constant *Cv = ConstantVector::get(Indices);
2608     assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
2609     Step = Builder.CreateVectorSplat(VLen, Step);
2610     assert(Step->getType() == Val->getType() && "Invalid step vec");
    // FIXME: The newly created binary instructions should contain nsw/nuw
    // flags, which can be taken from the original scalar operations.
2613     Step = Builder.CreateMul(Cv, Step);
2614     return Builder.CreateAdd(Val, Step, "induction");
2615   }
2616 
2617   // Floating point induction.
2618   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2619          "Binary Opcode should be specified for FP induction");
  // Create a vector of consecutive numbers from StartIdx to
  // StartIdx + VLen - 1.
2621   for (int i = 0; i < VLen; ++i)
2622     Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
2623 
2624   // Add the consecutive indices to the vector value.
2625   Constant *Cv = ConstantVector::get(Indices);
2626 
2627   Step = Builder.CreateVectorSplat(VLen, Step);
2628 
2629   // Floating point operations had to be 'fast' to enable the induction.
2630   FastMathFlags Flags;
2631   Flags.setFast();
2632 
2633   Value *MulOp = Builder.CreateFMul(Cv, Step);
  if (isa<Instruction>(MulOp))
    // The multiply may have been folded to a constant, so check before
    // setting the fast-math flags.
    cast<Instruction>(MulOp)->setFastMathFlags(Flags);
2637 
2638   Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2639   if (isa<Instruction>(BOp))
2640     cast<Instruction>(BOp)->setFastMathFlags(Flags);
2641   return BOp;
2642 }
2643 
2644 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
2645                                            Instruction *EntryVal,
2646                                            const InductionDescriptor &ID) {
2647   // We shouldn't have to build scalar steps if we aren't vectorizing.
2648   assert(VF > 1 && "VF should be greater than one");
2649 
  // Get the value type and ensure it and the step have the same type.
2651   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2652   assert(ScalarIVTy == Step->getType() &&
2653          "Val and Step should have the same type");
2654 
2655   // We build scalar steps for both integer and floating-point induction
2656   // variables. Here, we determine the kind of arithmetic we will perform.
2657   Instruction::BinaryOps AddOp;
2658   Instruction::BinaryOps MulOp;
2659   if (ScalarIVTy->isIntegerTy()) {
2660     AddOp = Instruction::Add;
2661     MulOp = Instruction::Mul;
2662   } else {
2663     AddOp = ID.getInductionOpcode();
2664     MulOp = Instruction::FMul;
2665   }
2666 
2667   // Determine the number of scalars we need to generate for each unroll
2668   // iteration. If EntryVal is uniform, we only need to generate the first
2669   // lane. Otherwise, we generate all VF values.
2670   unsigned Lanes =
2671       Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1
2672                                                                          : VF;
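  // For example (an illustrative sketch): with UF = 2, VF = 4, ScalarIV = %i
  // and Step = 1, part 0 produces the scalars %i + 0 .. %i + 3 and part 1
  // produces %i + 4 .. %i + 7. If EntryVal is uniform, only lane 0 of each
  // part (%i + 0 and %i + 4) is generated.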
2673   // Compute the scalar steps and save the results in VectorLoopValueMap.
2674   for (unsigned Part = 0; Part < UF; ++Part) {
2675     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2676       auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
2677       auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
2678       auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
2679       VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
2680       recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
2681     }
2682   }
2683 }
2684 
2685 int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
  const ValueToValueMap &Strides =
      getSymbolicStrides() ? *getSymbolicStrides() : ValueToValueMap();
2688 
2689   int Stride = getPtrStride(PSE, Ptr, TheLoop, Strides, true, false);
2690   if (Stride == 1 || Stride == -1)
2691     return Stride;
2692   return 0;
2693 }
2694 
2695 bool LoopVectorizationLegality::isUniform(Value *V) {
2696   return LAI->isUniform(V);
2697 }
2698 
2699 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
2700   assert(V != Induction && "The new induction variable should not be used.");
2701   assert(!V->getType()->isVectorTy() && "Can't widen a vector");
2702   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2703 
2704   // If we have a stride that is replaced by one, do it here.
2705   if (Legal->hasStride(V))
2706     V = ConstantInt::get(V->getType(), 1);
2707 
2708   // If we have a vector mapped to this value, return it.
2709   if (VectorLoopValueMap.hasVectorValue(V, Part))
2710     return VectorLoopValueMap.getVectorValue(V, Part);
2711 
2712   // If the value has not been vectorized, check if it has been scalarized
2713   // instead. If it has been scalarized, and we actually need the value in
2714   // vector form, we will construct the vector values on demand.
2715   if (VectorLoopValueMap.hasAnyScalarValue(V)) {
2716     Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
2717 
2718     // If we've scalarized a value, that value should be an instruction.
2719     auto *I = cast<Instruction>(V);
2720 
2721     // If we aren't vectorizing, we can just copy the scalar map values over to
2722     // the vector map.
2723     if (VF == 1) {
2724       VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
2725       return ScalarValue;
2726     }
2727 
2728     // Get the last scalar instruction we generated for V and Part. If the value
2729     // is known to be uniform after vectorization, this corresponds to lane zero
2730     // of the Part unroll iteration. Otherwise, the last instruction is the one
2731     // we created for the last vector lane of the Part unroll iteration.
2732     unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
2733     auto *LastInst = cast<Instruction>(
2734         VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
2735 
2736     // Set the insert point after the last scalarized instruction. This ensures
2737     // the insertelement sequence will directly follow the scalar definitions.
2738     auto OldIP = Builder.saveIP();
2739     auto NewIP = std::next(BasicBlock::iterator(LastInst));
2740     Builder.SetInsertPoint(&*NewIP);
2741 
2742     // However, if we are vectorizing, we need to construct the vector values.
2743     // If the value is known to be uniform after vectorization, we can just
2744     // broadcast the scalar value corresponding to lane zero for each unroll
2745     // iteration. Otherwise, we construct the vector values using insertelement
2746     // instructions. Since the resulting vectors are stored in
2747     // VectorLoopValueMap, we will only generate the insertelements once.
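    // For example (a sketch of the non-uniform case with VF = 4), the packing
    // emitted below looks like:
    //   %p0 = insertelement <4 x i32> undef, i32 %scalar.lane0, i32 0
    //   %p1 = insertelement <4 x i32> %p0,   i32 %scalar.lane1, i32 1
    //   %p2 = insertelement <4 x i32> %p1,   i32 %scalar.lane2, i32 2
    //   %p3 = insertelement <4 x i32> %p2,   i32 %scalar.lane3, i32 3
    // where the %scalar.laneN names are placeholders for the scalarized
    // values.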
2748     Value *VectorValue = nullptr;
2749     if (Cost->isUniformAfterVectorization(I, VF)) {
2750       VectorValue = getBroadcastInstrs(ScalarValue);
2751       VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
2752     } else {
2753       // Initialize packing with insertelements to start from undef.
2754       Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
2755       VectorLoopValueMap.setVectorValue(V, Part, Undef);
2756       for (unsigned Lane = 0; Lane < VF; ++Lane)
2757         packScalarIntoVectorValue(V, {Part, Lane});
2758       VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
2759     }
2760     Builder.restoreIP(OldIP);
2761     return VectorValue;
2762   }
2763 
2764   // If this scalar is unknown, assume that it is a constant or that it is
2765   // loop invariant. Broadcast V and save the value for future uses.
2766   Value *B = getBroadcastInstrs(V);
2767   VectorLoopValueMap.setVectorValue(V, Part, B);
2768   return B;
2769 }
2770 
2771 Value *
2772 InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
2773                                             const VPIteration &Instance) {
2774   // If the value is not an instruction contained in the loop, it should
2775   // already be scalar.
2776   if (OrigLoop->isLoopInvariant(V))
2777     return V;
2778 
  assert((Instance.Lane == 0 ||
          !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)) &&
         "Uniform values only have lane zero");
2782 
2783   // If the value from the original loop has not been vectorized, it is
2784   // represented by UF x VF scalar values in the new loop. Return the requested
2785   // scalar value.
2786   if (VectorLoopValueMap.hasScalarValue(V, Instance))
2787     return VectorLoopValueMap.getScalarValue(V, Instance);
2788 
2789   // If the value has not been scalarized, get its entry in VectorLoopValueMap
2790   // for the given unroll part. If this entry is not a vector type (i.e., the
2791   // vectorization factor is one), there is no need to generate an
2792   // extractelement instruction.
2793   auto *U = getOrCreateVectorValue(V, Instance.Part);
2794   if (!U->getType()->isVectorTy()) {
2795     assert(VF == 1 && "Value not scalarized has non-vector type");
2796     return U;
2797   }
2798 
2799   // Otherwise, the value from the original loop has been vectorized and is
2800   // represented by UF vector values. Extract and return the requested scalar
2801   // value from the appropriate vector lane.
2802   return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
2803 }
2804 
2805 void InnerLoopVectorizer::packScalarIntoVectorValue(
2806     Value *V, const VPIteration &Instance) {
2807   assert(V != Induction && "The new induction variable should not be used.");
2808   assert(!V->getType()->isVectorTy() && "Can't pack a vector");
2809   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2810 
2811   Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
2812   Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
2813   VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
2814                                             Builder.getInt32(Instance.Lane));
2815   VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
2816 }
2817 
2818 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2819   assert(Vec->getType()->isVectorTy() && "Invalid type");
2820   SmallVector<Constant *, 8> ShuffleMask;
2821   for (unsigned i = 0; i < VF; ++i)
2822     ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
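  // E.g., for VF = 4 this builds the mask <3, 2, 1, 0>, so the shuffle below
  // returns the input vector with its lanes in reverse order.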
2823 
2824   return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
2825                                      ConstantVector::get(ShuffleMask),
2826                                      "reverse");
2827 }
2828 
2829 // Try to vectorize the interleave group that \p Instr belongs to.
2830 //
2831 // E.g. Translate following interleaved load group (factor = 3):
2832 //   for (i = 0; i < N; i+=3) {
2833 //     R = Pic[i];             // Member of index 0
2834 //     G = Pic[i+1];           // Member of index 1
2835 //     B = Pic[i+2];           // Member of index 2
2836 //     ... // do something to R, G, B
2837 //   }
2838 // To:
2839 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2840 //   %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9>   ; R elements
2841 //   %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10>  ; G elements
2842 //   %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11>  ; B elements
2843 //
2844 // Or translate following interleaved store group (factor = 3):
2845 //   for (i = 0; i < N; i+=3) {
2846 //     ... do something to R, G, B
2847 //     Pic[i]   = R;           // Member of index 0
2848 //     Pic[i+1] = G;           // Member of index 1
2849 //     Pic[i+2] = B;           // Member of index 2
2850 //   }
2851 // To:
2852 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2853 //   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2854 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2855 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2856 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2857 void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
2858   const InterleaveGroup *Group = Cost->getInterleavedAccessGroup(Instr);
  assert(Group && "Failed to get an interleaved access group.");
2860 
2861   // Skip if current instruction is not the insert position.
2862   if (Instr != Group->getInsertPos())
2863     return;
2864 
2865   const DataLayout &DL = Instr->getModule()->getDataLayout();
2866   Value *Ptr = getLoadStorePointerOperand(Instr);
2867 
2868   // Prepare for the vector type of the interleaved load/store.
2869   Type *ScalarTy = getMemInstValueType(Instr);
2870   unsigned InterleaveFactor = Group->getFactor();
2871   Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
2872   Type *PtrTy = VecTy->getPointerTo(getMemInstAddressSpace(Instr));
2873 
2874   // Prepare for the new pointers.
2875   setDebugLocFromInst(Builder, Ptr);
2876   SmallVector<Value *, 2> NewPtrs;
2877   unsigned Index = Group->getIndex(Instr);
2878 
2879   // If the group is reverse, adjust the index to refer to the last vector lane
2880   // instead of the first. We adjust the index from the first vector lane,
2881   // rather than directly getting the pointer for lane VF - 1, because the
2882   // pointer operand of the interleaved access is supposed to be uniform. For
2883   // uniform instructions, we're only required to generate a value for the
2884   // first vector lane in each unroll iteration.
2885   if (Group->isReverse())
2886     Index += (VF - 1) * Group->getFactor();
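  // For example (a sketch): with VF = 4 and a factor-3 group, a member at
  // index 1 gets an adjusted index of 1 + (4 - 1) * 3 = 10, so the wide
  // access is based at the index-0 member of the tuple accessed by the last
  // vector lane.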
2887 
2888   for (unsigned Part = 0; Part < UF; Part++) {
2889     Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0});
2890 
    // Note that the current instruction could be at any index in the group.
    // We need to adjust the address to that of the member at index 0.
2893     //
2894     // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
2895     //       b = A[i];       // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
2897     //
2898     // E.g.  A[i+1] = a;     // Member of index 1
2899     //       A[i]   = b;     // Member of index 0
2900     //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].
2902     NewPtr = Builder.CreateGEP(NewPtr, Builder.getInt32(-Index));
2903 
2904     // Cast to the vector pointer type.
2905     NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy));
2906   }
2907 
2908   setDebugLocFromInst(Builder, Instr);
2909   Value *UndefVec = UndefValue::get(VecTy);
2910 
2911   // Vectorize the interleaved load group.
2912   if (isa<LoadInst>(Instr)) {
2913     // For each unroll part, create a wide load for the group.
2914     SmallVector<Value *, 2> NewLoads;
2915     for (unsigned Part = 0; Part < UF; Part++) {
2916       auto *NewLoad = Builder.CreateAlignedLoad(
2917           NewPtrs[Part], Group->getAlignment(), "wide.vec");
2918       Group->addMetadata(NewLoad);
2919       NewLoads.push_back(NewLoad);
2920     }
2921 
2922     // For each member in the group, shuffle out the appropriate data from the
2923     // wide loads.
2924     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2925       Instruction *Member = Group->getMember(I);
2926 
2927       // Skip the gaps in the group.
2928       if (!Member)
2929         continue;
2930 
2931       Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF);
2932       for (unsigned Part = 0; Part < UF; Part++) {
2933         Value *StridedVec = Builder.CreateShuffleVector(
2934             NewLoads[Part], UndefVec, StrideMask, "strided.vec");
2935 
        // If this member has a different type, cast the result to that type.
2937         if (Member->getType() != ScalarTy) {
2938           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2939           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2940         }
2941 
2942         if (Group->isReverse())
2943           StridedVec = reverseVector(StridedVec);
2944 
2945         VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
2946       }
2947     }
2948     return;
2949   }
2950 
  // The subvector type for the current instruction.
2952   VectorType *SubVT = VectorType::get(ScalarTy, VF);
2953 
2954   // Vectorize the interleaved store group.
2955   for (unsigned Part = 0; Part < UF; Part++) {
2956     // Collect the stored vector from each member.
2957     SmallVector<Value *, 4> StoredVecs;
2958     for (unsigned i = 0; i < InterleaveFactor; i++) {
      // An interleaved store group doesn't allow gaps, so each index has a
      // member.
      Instruction *Member = Group->getMember(i);
      assert(Member &&
             "Failed to get a member from an interleaved store group");
2962 
2963       Value *StoredVec = getOrCreateVectorValue(
2964           cast<StoreInst>(Member)->getValueOperand(), Part);
2965       if (Group->isReverse())
2966         StoredVec = reverseVector(StoredVec);
2967 
      // If this member has a different type, cast it to the unified type.
      if (StoredVec->getType() != SubVT)
2971         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2972 
2973       StoredVecs.push_back(StoredVec);
2974     }
2975 
2976     // Concatenate all vectors into a wide vector.
2977     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2978 
2979     // Interleave the elements in the wide vector.
2980     Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor);
2981     Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
2982                                               "interleaved.vec");
2983 
2984     Instruction *NewStoreInstr =
2985         Builder.CreateAlignedStore(IVec, NewPtrs[Part], Group->getAlignment());
2986 
2987     Group->addMetadata(NewStoreInstr);
2988   }
2989 }
2990 
2991 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
2992                                                      VectorParts *BlockInMask) {
2993   // Attempt to issue a wide load.
2994   LoadInst *LI = dyn_cast<LoadInst>(Instr);
2995   StoreInst *SI = dyn_cast<StoreInst>(Instr);
2996 
2997   assert((LI || SI) && "Invalid Load/Store instruction");
2998 
2999   LoopVectorizationCostModel::InstWidening Decision =
3000       Cost->getWideningDecision(Instr, VF);
3001   assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
3002          "CM decision should be taken at this point");
3003   if (Decision == LoopVectorizationCostModel::CM_Interleave)
3004     return vectorizeInterleaveGroup(Instr);
3005 
3006   Type *ScalarDataTy = getMemInstValueType(Instr);
3007   Type *DataTy = VectorType::get(ScalarDataTy, VF);
3008   Value *Ptr = getLoadStorePointerOperand(Instr);
3009   unsigned Alignment = getMemInstAlignment(Instr);
  // An alignment of 0 means target ABI alignment. In that case, use the ABI
  // alignment of the scalar data type.
3012   const DataLayout &DL = Instr->getModule()->getDataLayout();
3013   if (!Alignment)
3014     Alignment = DL.getABITypeAlignment(ScalarDataTy);
3015   unsigned AddressSpace = getMemInstAddressSpace(Instr);
3016 
3017   // Determine if the pointer operand of the access is either consecutive or
3018   // reverse consecutive.
3019   bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
3020   bool ConsecutiveStride =
3021       Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
3022   bool CreateGatherScatter =
3023       (Decision == LoopVectorizationCostModel::CM_GatherScatter);
3024 
3025   // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
3026   // gather/scatter. Otherwise Decision should have been to Scalarize.
3027   assert((ConsecutiveStride || CreateGatherScatter) &&
3028          "The instruction should be scalarized");
3029 
3030   // Handle consecutive loads/stores.
3031   if (ConsecutiveStride)
3032     Ptr = getOrCreateScalarValue(Ptr, {0, 0});
3033 
3034   VectorParts Mask;
3035   bool isMaskRequired = BlockInMask;
3036   if (isMaskRequired)
3037     Mask = *BlockInMask;
3038 
3039   // Handle Stores:
3040   if (SI) {
3041     setDebugLocFromInst(Builder, SI);
3042 
3043     for (unsigned Part = 0; Part < UF; ++Part) {
3044       Instruction *NewSI = nullptr;
3045       Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part);
3046       if (CreateGatherScatter) {
3047         Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
3048         Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
3049         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
3050                                             MaskPart);
3051       } else {
3052         // Calculate the pointer for the specific unroll-part.
3053         Value *PartPtr =
3054             Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF));
3055 
3056         if (Reverse) {
3057           // If we store to reverse consecutive memory locations, then we need
3058           // to reverse the order of elements in the stored value.
3059           StoredVal = reverseVector(StoredVal);
3060           // We don't want to update the value in the map as it might be used in
3061           // another expression. So don't call resetVectorValue(StoredVal).
3062 
3063           // If the address is consecutive but reversed, then the
3064           // wide store needs to start at the last vector element.
3065           PartPtr =
3066               Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF));
3067           PartPtr =
3068               Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF));
3069           if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
3070             Mask[Part] = reverseVector(Mask[Part]);
3071         }
3072 
3073         Value *VecPtr =
3074             Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
3075 
3076         if (isMaskRequired)
3077           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
3078                                             Mask[Part]);
3079         else
3080           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
3081       }
3082       addMetadata(NewSI, SI);
3083     }
3084     return;
3085   }
3086 
3087   // Handle loads.
3088   assert(LI && "Must have a load instruction");
3089   setDebugLocFromInst(Builder, LI);
3090   for (unsigned Part = 0; Part < UF; ++Part) {
3091     Value *NewLI;
3092     if (CreateGatherScatter) {
3093       Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
3094       Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
3095       NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
3096                                          nullptr, "wide.masked.gather");
3097       addMetadata(NewLI, LI);
3098     } else {
3099       // Calculate the pointer for the specific unroll-part.
3100       Value *PartPtr =
3101           Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF));
3102 
3103       if (Reverse) {
3104         // If the address is consecutive but reversed, then the
3105         // wide load needs to start at the last vector element.
3106         PartPtr = Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF));
3107         PartPtr = Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF));
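        // For example (a sketch): with VF = 4 and Part = 1, the two GEPs
        // above compute Ptr - 4 - 3 = Ptr - 7, so the wide load covers the
        // reversed elements for this unroll part.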
3108         if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
3109           Mask[Part] = reverseVector(Mask[Part]);
3110       }
3111 
3112       Value *VecPtr =
3113           Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
3114       if (isMaskRequired)
3115         NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part],
3116                                          UndefValue::get(DataTy),
3117                                          "wide.masked.load");
3118       else
3119         NewLI = Builder.CreateAlignedLoad(VecPtr, Alignment, "wide.load");
3120 
      // Add metadata to the load, but set the mapped vector value to the
      // reversed shuffle.
3122       addMetadata(NewLI, LI);
3123       if (Reverse)
3124         NewLI = reverseVector(NewLI);
3125     }
3126     VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
3127   }
3128 }
3129 
3130 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
3131                                                const VPIteration &Instance,
3132                                                bool IfPredicateInstr) {
3133   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
3134 
3135   setDebugLocFromInst(Builder, Instr);
3136 
3137   // Does this instruction return a value ?
3138   bool IsVoidRetTy = Instr->getType()->isVoidTy();
3139 
3140   Instruction *Cloned = Instr->clone();
3141   if (!IsVoidRetTy)
3142     Cloned->setName(Instr->getName() + ".cloned");
3143 
3144   // Replace the operands of the cloned instructions with their scalar
3145   // equivalents in the new loop.
3146   for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
3147     auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance);
3148     Cloned->setOperand(op, NewOp);
3149   }
3150   addNewMetadata(Cloned, Instr);
3151 
3152   // Place the cloned scalar in the new loop.
3153   Builder.Insert(Cloned);
3154 
3155   // Add the cloned scalar to the scalar map entry.
3156   VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
3157 
3158   // If we just cloned a new assumption, add it the assumption cache.
3159   if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
3160     if (II->getIntrinsicID() == Intrinsic::assume)
3161       AC->registerAssumption(II);
3162 
3163   // End if-block.
3164   if (IfPredicateInstr)
3165     PredicatedInstructions.push_back(Cloned);
3166 }
3167 
3168 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
3169                                                       Value *End, Value *Step,
3170                                                       Instruction *DL) {
3171   BasicBlock *Header = L->getHeader();
3172   BasicBlock *Latch = L->getLoopLatch();
3173   // As we're just creating this loop, it's possible no latch exists
3174   // yet. If so, use the header as this will be a single block loop.
3175   if (!Latch)
3176     Latch = Header;
3177 
3178   IRBuilder<> Builder(&*Header->getFirstInsertionPt());
3179   Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
3180   setDebugLocFromInst(Builder, OldInst);
3181   auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
3182 
3183   Builder.SetInsertPoint(Latch->getTerminator());
3184   setDebugLocFromInst(Builder, OldInst);
3185 
3186   // Create i+1 and fill the PHINode.
3187   Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
3188   Induction->addIncoming(Start, L->getLoopPreheader());
3189   Induction->addIncoming(Next, Latch);
3190   // Create the compare.
3191   Value *ICmp = Builder.CreateICmpEQ(Next, End);
3192   Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
3193 
3194   // Now we have two terminators. Remove the old one from the block.
3195   Latch->getTerminator()->eraseFromParent();
3196 
3197   return Induction;
3198 }
3199 
3200 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
3201   if (TripCount)
3202     return TripCount;
3203 
3204   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
3205   // Find the loop boundaries.
3206   ScalarEvolution *SE = PSE.getSE();
3207   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
3208   assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
3209          "Invalid loop count");
3210 
3211   Type *IdxTy = Legal->getWidestInductionType();
3212 
  // The exit count might have the type of i64 while the phi is i32. This can
  // happen if we have an induction variable that is sign-extended before the
  // compare. The only way we could have obtained a backedge-taken count in
  // that situation is if the induction variable was signed and therefore did
  // not overflow, so the truncation is legal.
3218   if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() >
3219       IdxTy->getPrimitiveSizeInBits())
3220     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
3221   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
3222 
3223   // Get the total trip count from the count by adding 1.
3224   const SCEV *ExitCount = SE->getAddExpr(
3225       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
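  // For example, a loop `for (i = 0; i < n; ++i)` with n > 0 has a
  // backedge-taken count of n - 1 and an overall trip count of n.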
3226 
3227   const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
3228 
3229   // Expand the trip count and place the new instructions in the preheader.
3230   // Notice that the pre-header does not change, only the loop body.
3231   SCEVExpander Exp(*SE, DL, "induction");
3232 
3233   // Count holds the overall loop count (N).
3234   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
3235                                 L->getLoopPreheader()->getTerminator());
3236 
3237   if (TripCount->getType()->isPointerTy())
3238     TripCount =
3239         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
3240                                     L->getLoopPreheader()->getTerminator());
3241 
3242   return TripCount;
3243 }
3244 
3245 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
3246   if (VectorTripCount)
3247     return VectorTripCount;
3248 
3249   Value *TC = getOrCreateTripCount(L);
3250   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
3251 
3252   // Now we need to generate the expression for the part of the loop that the
3253   // vectorized body will execute. This is equal to N - (N % Step) if scalar
3254   // iterations are not required for correctness, or N - Step, otherwise. Step
3255   // is equal to the vectorization factor (number of SIMD elements) times the
3256   // unroll factor (number of SIMD instructions).
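  // For example (a sketch): with a trip count of 21, VF = 4 and UF = 2, Step
  // is 8, the remainder is 5, and the vector trip count becomes 16; the
  // remaining 5 iterations run in the scalar remainder loop.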
3257   Constant *Step = ConstantInt::get(TC->getType(), VF * UF);
3258   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
3259 
3260   // If there is a non-reversed interleaved group that may speculatively access
3261   // memory out-of-bounds, we need to ensure that there will be at least one
3262   // iteration of the scalar epilogue loop. Thus, if the step evenly divides
3263   // the trip count, we set the remainder to be equal to the step. If the step
3264   // does not evenly divide the trip count, no adjustment is necessary since
3265   // there will already be scalar iterations. Note that the minimum iterations
3266   // check ensures that N >= Step.
3267   if (VF > 1 && Cost->requiresScalarEpilogue()) {
3268     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
3269     R = Builder.CreateSelect(IsZero, Step, R);
3270   }
3271 
3272   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
3273 
3274   return VectorTripCount;
3275 }
3276 
3277 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
3278                                                    const DataLayout &DL) {
3279   // Verify that V is a vector type with same number of elements as DstVTy.
3280   unsigned VF = DstVTy->getNumElements();
3281   VectorType *SrcVecTy = cast<VectorType>(V->getType());
3282   assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
3283   Type *SrcElemTy = SrcVecTy->getElementType();
3284   Type *DstElemTy = DstVTy->getElementType();
3285   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
3286          "Vector elements must have same size");
3287 
3288   // Do a direct cast if element types are castable.
3289   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
3290     return Builder.CreateBitOrPointerCast(V, DstVTy);
3291   }
3292   // V cannot be directly casted to desired vector type.
3293   // May happen when V is a floating point vector but DstVTy is a vector of
3294   // pointers or vice-versa. Handle this using a two-step bitcast using an
3295   // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
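  // For example (a sketch, assuming 64-bit pointers): casting <2 x double> to
  // <2 x i8*> goes through the intermediate integer type <2 x i64>.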
3296   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
3297          "Only one type should be a pointer type");
3298   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
3299          "Only one type should be a floating point type");
3300   Type *IntTy =
3301       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
3302   VectorType *VecIntTy = VectorType::get(IntTy, VF);
3303   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
3304   return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
3305 }
3306 
3307 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
3308                                                          BasicBlock *Bypass) {
3309   Value *Count = getOrCreateTripCount(L);
3310   BasicBlock *BB = L->getLoopPreheader();
3311   IRBuilder<> Builder(BB->getTerminator());
3312 
3313   // Generate code to check if the loop's trip count is less than VF * UF, or
3314   // equal to it in case a scalar epilogue is required; this implies that the
3315   // vector trip count is zero. This check also covers the case where adding one
3316   // to the backedge-taken count overflowed leading to an incorrect trip count
3317   // of zero. In this case we will also jump to the scalar loop.
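  // For example (a sketch): with VF = 4 and UF = 2 the bound is 8, so a trip
  // count of 5 satisfies the check below and execution branches to the scalar
  // loop.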
3318   auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
3319                                           : ICmpInst::ICMP_ULT;
3320   Value *CheckMinIters = Builder.CreateICmp(
3321       P, Count, ConstantInt::get(Count->getType(), VF * UF), "min.iters.check");
3322 
3323   BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
3324   // Update dominator tree immediately if the generated block is a
3325   // LoopBypassBlock because SCEV expansions to generate loop bypass
3326   // checks may query it before the current function is finished.
3327   DT->addNewBlock(NewBB, BB);
3328   if (L->getParentLoop())
3329     L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
3330   ReplaceInstWithInst(BB->getTerminator(),
3331                       BranchInst::Create(Bypass, NewBB, CheckMinIters));
3332   LoopBypassBlocks.push_back(BB);
3333 }
3334 
3335 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
3336   BasicBlock *BB = L->getLoopPreheader();
3337 
  // Generate the code to check that the SCEV assumptions that we made hold at
  // runtime. We want the new basic block to start at the first instruction in
  // a sequence of instructions that form a check.
3341   SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
3342                    "scev.check");
3343   Value *SCEVCheck =
3344       Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator());
3345 
3346   if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
3347     if (C->isZero())
3348       return;
3349 
  // Create a new block containing the SCEV check.
3351   BB->setName("vector.scevcheck");
3352   auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
3353   // Update dominator tree immediately if the generated block is a
3354   // LoopBypassBlock because SCEV expansions to generate loop bypass
3355   // checks may query it before the current function is finished.
3356   DT->addNewBlock(NewBB, BB);
3357   if (L->getParentLoop())
3358     L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
3359   ReplaceInstWithInst(BB->getTerminator(),
3360                       BranchInst::Create(Bypass, NewBB, SCEVCheck));
3361   LoopBypassBlocks.push_back(BB);
3362   AddedSafetyChecks = true;
3363 }
3364 
3365 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
3366   BasicBlock *BB = L->getLoopPreheader();
3367 
3368   // Generate the code that checks in runtime if arrays overlap. We put the
3369   // checks into a separate block to make the more common case of few elements
3370   // faster.
3371   Instruction *FirstCheckInst;
3372   Instruction *MemRuntimeCheck;
3373   std::tie(FirstCheckInst, MemRuntimeCheck) =
3374       Legal->getLAI()->addRuntimeChecks(BB->getTerminator());
3375   if (!MemRuntimeCheck)
3376     return;
3377 
3378   // Create a new block containing the memory check.
3379   BB->setName("vector.memcheck");
3380   auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
3381   // Update dominator tree immediately if the generated block is a
3382   // LoopBypassBlock because SCEV expansions to generate loop bypass
3383   // checks may query it before the current function is finished.
3384   DT->addNewBlock(NewBB, BB);
3385   if (L->getParentLoop())
3386     L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
3387   ReplaceInstWithInst(BB->getTerminator(),
3388                       BranchInst::Create(Bypass, NewBB, MemRuntimeCheck));
3389   LoopBypassBlocks.push_back(BB);
3390   AddedSafetyChecks = true;
3391 
3392   // We currently don't use LoopVersioning for the actual loop cloning but we
3393   // still use it to add the noalias metadata.
3394   LVer = llvm::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
3395                                            PSE.getSE());
3396   LVer->prepareNoAliasMetadata();
3397 }
3398 
3399 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3400   /*
3401    In this function we generate a new loop. The new loop will contain
3402    the vectorized instructions while the old loop will continue to run the
3403    scalar remainder.
3404 
3405        [ ] <-- loop iteration number check.
3406     /   |
3407    /    v
3408   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
3409   |  /  |
3410   | /   v
3411   ||   [ ]     <-- vector pre header.
3412   |/    |
3413   |     v
3414   |    [  ] \
3415   |    [  ]_|   <-- vector loop.
3416   |     |
3417   |     v
3418   |   -[ ]   <--- middle-block.
3419   |  /  |
3420   | /   v
3421   -|- >[ ]     <--- new preheader.
3422    |    |
3423    |    v
3424    |   [ ] \
3425    |   [ ]_|   <-- old scalar loop to handle remainder.
3426     \   |
3427      \  v
3428       >[ ]     <-- exit block.
3429    ...
3430    */
3431 
3432   BasicBlock *OldBasicBlock = OrigLoop->getHeader();
3433   BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
3434   BasicBlock *ExitBlock = OrigLoop->getExitBlock();
3435   assert(VectorPH && "Invalid loop structure");
3436   assert(ExitBlock && "Must have an exit block");
3437 
  // Some loops have a single integer induction variable, while other loops
  // don't. One example is loops over C++ iterators, which often have multiple
  // pointer induction variables. In the code below we also support the case
  // where we don't have a single induction variable.
3442   //
3443   // We try to obtain an induction variable from the original loop as hard
3444   // as possible. However if we don't find one that:
3445   //   - is an integer
3446   //   - counts from zero, stepping by one
3447   //   - is the size of the widest induction variable type
3448   // then we create a new one.
3449   OldInduction = Legal->getPrimaryInduction();
3450   Type *IdxTy = Legal->getWidestInductionType();
3451 
  // Split the single-block loop into the two-loop structure described above.
3453   BasicBlock *VecBody =
3454       VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
3455   BasicBlock *MiddleBlock =
3456       VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
3457   BasicBlock *ScalarPH =
3458       MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
3459 
3460   // Create and register the new vector loop.
3461   Loop *Lp = LI->AllocateLoop();
3462   Loop *ParentLoop = OrigLoop->getParentLoop();
3463 
3464   // Insert the new loop into the loop nest and register the new basic blocks
3465   // before calling any utilities such as SCEV that require valid LoopInfo.
3466   if (ParentLoop) {
3467     ParentLoop->addChildLoop(Lp);
3468     ParentLoop->addBasicBlockToLoop(ScalarPH, *LI);
3469     ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI);
3470   } else {
3471     LI->addTopLevelLoop(Lp);
3472   }
3473   Lp->addBasicBlockToLoop(VecBody, *LI);
3474 
3475   // Find the loop boundaries.
3476   Value *Count = getOrCreateTripCount(Lp);
3477 
3478   Value *StartIdx = ConstantInt::get(IdxTy, 0);
3479 
3480   // Now, compare the new count to zero. If it is zero skip the vector loop and
3481   // jump to the scalar loop. This check also covers the case where the
3482   // backedge-taken count is uint##_max: adding one to it will overflow leading
3483   // to an incorrect trip count of zero. In this (rare) case we will also jump
3484   // to the scalar loop.
3485   emitMinimumIterationCountCheck(Lp, ScalarPH);
3486 
3487   // Generate the code to check any assumptions that we've made for SCEV
3488   // expressions.
3489   emitSCEVChecks(Lp, ScalarPH);
3490 
3491   // Generate the code that checks in runtime if arrays overlap. We put the
3492   // checks into a separate block to make the more common case of few elements
3493   // faster.
3494   emitMemRuntimeChecks(Lp, ScalarPH);
3495 
3496   // Generate the induction variable.
3497   // The loop step is equal to the vectorization factor (num of SIMD elements)
3498   // times the unroll factor (num of SIMD instructions).
3499   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
3500   Constant *Step = ConstantInt::get(IdxTy, VF * UF);
3501   Induction =
3502       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
3503                               getDebugLocFromInstOrOperands(OldInduction));
3504 
3505   // We are going to resume the execution of the scalar loop.
3506   // Go over all of the induction variables that we found and fix the
3507   // PHIs that are left in the scalar version of the loop.
3508   // The starting values of PHI nodes depend on the counter of the last
3509   // iteration in the vectorized loop.
3510   // If we come from a bypass edge then we need to start from the original
3511   // start value.
3512 
3513   // This variable saves the new starting index for the scalar loop. It is used
3514   // to test if there are any tail iterations left once the vector loop has
3515   // completed.
3516   LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
3517   for (auto &InductionEntry : *List) {
3518     PHINode *OrigPhi = InductionEntry.first;
3519     InductionDescriptor II = InductionEntry.second;
3520 
    // Create phi nodes to merge from the backedge-taken check block.
3522     PHINode *BCResumeVal = PHINode::Create(
3523         OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator());
3524     Value *&EndValue = IVEndValues[OrigPhi];
3525     if (OrigPhi == OldInduction) {
3526       // We know what the end value is.
3527       EndValue = CountRoundDown;
3528     } else {
3529       IRBuilder<> B(Lp->getLoopPreheader()->getTerminator());
3530       Type *StepType = II.getStep()->getType();
3531       Instruction::CastOps CastOp =
3532         CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
3533       Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
3534       const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
3535       EndValue = II.transform(B, CRD, PSE.getSE(), DL);
3536       EndValue->setName("ind.end");
3537     }
3538 
3539     // The new PHI merges the original incoming value, in case of a bypass,
3540     // or the value at the end of the vectorized loop.
3541     BCResumeVal->addIncoming(EndValue, MiddleBlock);
3542 
3543     // Fix the scalar body counter (PHI node).
3544     unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH);
3545 
3546     // The old induction's phi node in the scalar body needs the truncated
3547     // value.
3548     for (BasicBlock *BB : LoopBypassBlocks)
3549       BCResumeVal->addIncoming(II.getStartValue(), BB);
3550     OrigPhi->setIncomingValue(BlockIdx, BCResumeVal);
3551   }
3552 
3553   // Add a check in the middle block to see if we have completed
3554   // all of the iterations in the first vector loop.
3555   // If (N - N%VF) == N, then we *don't* need to run the remainder.
3556   Value *CmpN =
3557       CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
3558                       CountRoundDown, "cmp.n", MiddleBlock->getTerminator());
3559   ReplaceInstWithInst(MiddleBlock->getTerminator(),
3560                       BranchInst::Create(ExitBlock, ScalarPH, CmpN));
3561 
3562   // Get ready to start creating new instructions into the vectorized body.
3563   Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt());
3564 
3565   // Save the state.
3566   LoopVectorPreHeader = Lp->getLoopPreheader();
3567   LoopScalarPreHeader = ScalarPH;
3568   LoopMiddleBlock = MiddleBlock;
3569   LoopExitBlock = ExitBlock;
3570   LoopVectorBody = VecBody;
3571   LoopScalarBody = OldBasicBlock;
3572 
3573   // Keep all loop hints from the original loop on the vector loop (we'll
3574   // replace the vectorizer-specific hints below).
3575   if (MDNode *LID = OrigLoop->getLoopID())
3576     Lp->setLoopID(LID);
3577 
3578   LoopVectorizeHints Hints(Lp, true, *ORE);
3579   Hints.setAlreadyVectorized();
3580 
3581   return LoopVectorPreHeader;
3582 }
3583 
3584 // Fix up external users of the induction variable. At this point, we are
3585 // in LCSSA form, with all external PHIs that use the IV having one input value,
3586 // coming from the remainder loop. We need those PHIs to also have a correct
3587 // value for the IV when arriving directly from the middle block.
3588 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3589                                        const InductionDescriptor &II,
3590                                        Value *CountRoundDown, Value *EndValue,
3591                                        BasicBlock *MiddleBlock) {
3592   // There are two kinds of external IV usages - those that use the value
3593   // computed in the last iteration (the PHI) and those that use the penultimate
3594   // value (the value that feeds into the phi from the loop latch).
3595   // We allow both, but they, obviously, have different values.
3596 
3597   assert(OrigLoop->getExitBlock() && "Expected a single exit block");
3598 
3599   DenseMap<Value *, Value *> MissingVals;
3600 
3601   // An external user of the last iteration's value should see the value that
3602   // the remainder loop uses to initialize its own IV.
3603   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3604   for (User *U : PostInc->users()) {
3605     Instruction *UI = cast<Instruction>(U);
3606     if (!OrigLoop->contains(UI)) {
3607       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3608       MissingVals[UI] = EndValue;
3609     }
3610   }
3611 
  // An external user of the penultimate value needs to see EndValue - Step.
3613   // The simplest way to get this is to recompute it from the constituent SCEVs,
3614   // that is Start + (Step * (CRD - 1)).
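  // For example (a sketch): for an IV starting at 0 with step 2 and
  // CountRoundDown = 8, the penultimate value is 0 + 2 * (8 - 1) = 14.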
3615   for (User *U : OrigPhi->users()) {
3616     auto *UI = cast<Instruction>(U);
3617     if (!OrigLoop->contains(UI)) {
3618       const DataLayout &DL =
3619           OrigLoop->getHeader()->getModule()->getDataLayout();
3620       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3621 
3622       IRBuilder<> B(MiddleBlock->getTerminator());
3623       Value *CountMinusOne = B.CreateSub(
3624           CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3625       Value *CMO =
3626           !II.getStep()->getType()->isIntegerTy()
3627               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3628                              II.getStep()->getType())
3629               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3630       CMO->setName("cast.cmo");
3631       Value *Escape = II.transform(B, CMO, PSE.getSE(), DL);
3632       Escape->setName("ind.escape");
3633       MissingVals[UI] = Escape;
3634     }
3635   }
3636 
3637   for (auto &I : MissingVals) {
3638     PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each other,
    // that is, %IV2 = phi [...], [ %IV1, %latch ].
3641     // In this case, if IV1 has an external use, we need to avoid adding both
3642     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3643     // don't already have an incoming value for the middle block.
3644     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3645       PHI->addIncoming(I.second, MiddleBlock);
3646   }
3647 }
3648 
3649 namespace {
3650 
3651 struct CSEDenseMapInfo {
3652   static bool canHandle(const Instruction *I) {
3653     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3654            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3655   }
3656 
3657   static inline Instruction *getEmptyKey() {
3658     return DenseMapInfo<Instruction *>::getEmptyKey();
3659   }
3660 
3661   static inline Instruction *getTombstoneKey() {
3662     return DenseMapInfo<Instruction *>::getTombstoneKey();
3663   }
3664 
3665   static unsigned getHashValue(const Instruction *I) {
3666     assert(canHandle(I) && "Unknown instruction!");
3667     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3668                                                            I->value_op_end()));
3669   }
3670 
3671   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3672     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3673         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3674       return LHS == RHS;
3675     return LHS->isIdenticalTo(RHS);
3676   }
3677 };
3678 
3679 } // end anonymous namespace
3680 
/// \brief Perform common subexpression elimination (CSE) of induction variable
/// instructions.
static void cse(BasicBlock *BB) {
  // Perform simple CSE.
3684   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3685   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3686     Instruction *In = &*I++;
3687 
3688     if (!CSEDenseMapInfo::canHandle(In))
3689       continue;
3690 
3691     // Check if we can replace this instruction with any of the
3692     // visited instructions.
3693     if (Instruction *V = CSEMap.lookup(In)) {
3694       In->replaceAllUsesWith(V);
3695       In->eraseFromParent();
3696       continue;
3697     }
3698 
3699     CSEMap[In] = In;
3700   }
3701 }
3702 
3703 /// \brief Estimate the overhead of scalarizing an instruction. This is a
3704 /// convenience wrapper for the type-based getScalarizationOverhead API.
3705 static unsigned getScalarizationOverhead(Instruction *I, unsigned VF,
3706                                          const TargetTransformInfo &TTI) {
3707   if (VF == 1)
3708     return 0;
3709 
3710   unsigned Cost = 0;
3711   Type *RetTy = ToVectorTy(I->getType(), VF);
3712   if (!RetTy->isVoidTy() &&
3713       (!isa<LoadInst>(I) ||
3714        !TTI.supportsEfficientVectorElementLoadStore()))
3715     Cost += TTI.getScalarizationOverhead(RetTy, true, false);
3716 
3717   if (CallInst *CI = dyn_cast<CallInst>(I)) {
3718     SmallVector<const Value *, 4> Operands(CI->arg_operands());
3719     Cost += TTI.getOperandsScalarizationOverhead(Operands, VF);
3720   }
3721   else if (!isa<StoreInst>(I) ||
3722            !TTI.supportsEfficientVectorElementLoadStore()) {
3723     SmallVector<const Value *, 4> Operands(I->operand_values());
3724     Cost += TTI.getOperandsScalarizationOverhead(Operands, VF);
3725   }
3726 
3727   return Cost;
3728 }
3729 
// Estimate the cost of a call instruction CI if it were vectorized with factor
// VF. Return the cost of the instruction, including scalarization overhead if
// it's needed. The flag NeedToScalarize shows if the call needs to be
// scalarized - i.e., either a vector version isn't available or it is too
// expensive.
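// For example (a sketch with made-up costs): with VF = 4, a scalar call cost
// of 10 and a scalarization overhead of 6, the scalarized cost is
// 4 * 10 + 6 = 46; this is returned unless a vector variant with a lower cost
// is available.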
3734 static unsigned getVectorCallCost(CallInst *CI, unsigned VF,
3735                                   const TargetTransformInfo &TTI,
3736                                   const TargetLibraryInfo *TLI,
3737                                   bool &NeedToScalarize) {
3738   Function *F = CI->getCalledFunction();
3739   StringRef FnName = CI->getCalledFunction()->getName();
3740   Type *ScalarRetTy = CI->getType();
3741   SmallVector<Type *, 4> Tys, ScalarTys;
3742   for (auto &ArgOp : CI->arg_operands())
3743     ScalarTys.push_back(ArgOp->getType());
3744 
3745   // Estimate cost of scalarized vector call. The source operands are assumed
3746   // to be vectors, so we need to extract individual elements from there,
3747   // execute VF scalar calls, and then gather the result into the vector return
3748   // value.
3749   unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
3750   if (VF == 1)
3751     return ScalarCallCost;
3752 
3753   // Compute corresponding vector type for return value and arguments.
3754   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3755   for (Type *ScalarTy : ScalarTys)
3756     Tys.push_back(ToVectorTy(ScalarTy, VF));
3757 
3758   // Compute costs of unpacking argument values for the scalar calls and
3759   // packing the return values to a vector.
3760   unsigned ScalarizationCost = getScalarizationOverhead(CI, VF, TTI);
3761 
3762   unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
3763 
3764   // If we can't emit a vector call for this function, then the currently found
3765   // cost is the cost we need to return.
3766   NeedToScalarize = true;
3767   if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin())
3768     return Cost;
3769 
3770   // If the corresponding vector cost is cheaper, return its cost.
3771   unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
3772   if (VectorCallCost < Cost) {
3773     NeedToScalarize = false;
3774     return VectorCallCost;
3775   }
3776   return Cost;
3777 }
3778 
3779 // Estimate cost of an intrinsic call instruction CI if it were vectorized with
3780 // factor VF.  Return the cost of the instruction, including scalarization
3781 // overhead if it's needed.
3782 static unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF,
3783                                        const TargetTransformInfo &TTI,
3784                                        const TargetLibraryInfo *TLI) {
3785   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3786   assert(ID && "Expected intrinsic call!");
3787 
3788   FastMathFlags FMF;
3789   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3790     FMF = FPMO->getFastMathFlags();
3791 
3792   SmallVector<Value *, 4> Operands(CI->arg_operands());
3793   return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF);
3794 }
3795 
3796 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3797   auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3798   auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3799   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3800 }
3801 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3802   auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3803   auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3804   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3805 }
3806 
3807 void InnerLoopVectorizer::truncateToMinimalBitwidths() {
3808   // For every instruction `I` in MinBWs, truncate the operands, create a
3809   // truncated version of `I` and reextend its result. InstCombine runs
3810   // later and will remove any ext/trunc pairs.
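  //
  // For example (a sketch, assuming VF = 4 and an i32 add known to need only
  // 8 bits): the operands are truncated to <4 x i8>, the add is re-created in
  // <4 x i8>, and the result is zero-extended back to <4 x i32>.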
3811   SmallPtrSet<Value *, 4> Erased;
3812   for (const auto &KV : Cost->getMinimalBitwidths()) {
3813     // If the value wasn't vectorized, we must maintain the original scalar
3814     // type. The absence of the value from VectorLoopValueMap indicates that it
3815     // wasn't vectorized.
3816     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3817       continue;
3818     for (unsigned Part = 0; Part < UF; ++Part) {
3819       Value *I = getOrCreateVectorValue(KV.first, Part);
3820       if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3821         continue;
3822       Type *OriginalTy = I->getType();
3823       Type *ScalarTruncatedTy =
3824           IntegerType::get(OriginalTy->getContext(), KV.second);
3825       Type *TruncatedTy = VectorType::get(ScalarTruncatedTy,
3826                                           OriginalTy->getVectorNumElements());
3827       if (TruncatedTy == OriginalTy)
3828         continue;
3829 
3830       IRBuilder<> B(cast<Instruction>(I));
3831       auto ShrinkOperand = [&](Value *V) -> Value * {
3832         if (auto *ZI = dyn_cast<ZExtInst>(V))
3833           if (ZI->getSrcTy() == TruncatedTy)
3834             return ZI->getOperand(0);
3835         return B.CreateZExtOrTrunc(V, TruncatedTy);
3836       };
3837 
3838       // The actual instruction modification depends on the instruction type,
3839       // unfortunately.
3840       Value *NewI = nullptr;
3841       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3842         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3843                              ShrinkOperand(BO->getOperand(1)));
3844 
3845         // Any wrapping introduced by shrinking this operation shouldn't be
3846         // considered undefined behavior. So, we can't unconditionally copy
3847         // arithmetic wrapping flags to NewI.
3848         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3849       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3850         NewI =
3851             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3852                          ShrinkOperand(CI->getOperand(1)));
3853       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3854         NewI = B.CreateSelect(SI->getCondition(),
3855                               ShrinkOperand(SI->getTrueValue()),
3856                               ShrinkOperand(SI->getFalseValue()));
3857       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3858         switch (CI->getOpcode()) {
3859         default:
3860           llvm_unreachable("Unhandled cast!");
3861         case Instruction::Trunc:
3862           NewI = ShrinkOperand(CI->getOperand(0));
3863           break;
3864         case Instruction::SExt:
3865           NewI = B.CreateSExtOrTrunc(
3866               CI->getOperand(0),
3867               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3868           break;
3869         case Instruction::ZExt:
3870           NewI = B.CreateZExtOrTrunc(
3871               CI->getOperand(0),
3872               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3873           break;
3874         }
3875       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3876         auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements();
3877         auto *O0 = B.CreateZExtOrTrunc(
3878             SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3879         auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements();
3880         auto *O1 = B.CreateZExtOrTrunc(
3881             SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3882 
3883         NewI = B.CreateShuffleVector(O0, O1, SI->getMask());
3884       } else if (isa<LoadInst>(I)) {
3885         // Don't do anything with the operands, just extend the result.
3886         continue;
3887       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3888         auto Elements = IE->getOperand(0)->getType()->getVectorNumElements();
3889         auto *O0 = B.CreateZExtOrTrunc(
3890             IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3891         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3892         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3893       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3894         auto Elements = EE->getOperand(0)->getType()->getVectorNumElements();
3895         auto *O0 = B.CreateZExtOrTrunc(
3896             EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3897         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3898       } else {
3899         llvm_unreachable("Unhandled instruction type!");
3900       }
3901 
3902       // Lastly, extend the result.
3903       NewI->takeName(cast<Instruction>(I));
3904       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3905       I->replaceAllUsesWith(Res);
3906       cast<Instruction>(I)->eraseFromParent();
3907       Erased.insert(I);
3908       VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3909     }
3910   }
3911 
3912   // We'll have created a bunch of ZExts that are now parentless. Clean up.
3913   for (const auto &KV : Cost->getMinimalBitwidths()) {
3914     // If the value wasn't vectorized, we must maintain the original scalar
3915     // type. The absence of the value from VectorLoopValueMap indicates that it
3916     // wasn't vectorized.
3917     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3918       continue;
3919     for (unsigned Part = 0; Part < UF; ++Part) {
3920       Value *I = getOrCreateVectorValue(KV.first, Part);
3921       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3922       if (Inst && Inst->use_empty()) {
3923         Value *NewI = Inst->getOperand(0);
3924         Inst->eraseFromParent();
3925         VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3926       }
3927     }
3928   }
3929 }
3930 
3931 void InnerLoopVectorizer::fixVectorizedLoop() {
3932   // Insert truncates and extends for any truncated instructions as hints to
3933   // InstCombine.
3934   if (VF > 1)
3935     truncateToMinimalBitwidths();
3936 
3937   // At this point every instruction in the original loop is widened to a
3938   // vector form. Now we need to fix the recurrences in the loop. These PHI
3939   // nodes are currently empty because we did not want to introduce cycles.
3940   // This is the second stage of vectorizing recurrences.
3941   fixCrossIterationPHIs();
3942 
3943   // Update the dominator tree.
3944   //
3945   // FIXME: After creating the structure of the new loop, the dominator tree is
3946   //        no longer up-to-date, and it remains that way until we update it
3947   //        here. An out-of-date dominator tree is problematic for SCEV,
3948   //        because SCEVExpander uses it to guide code generation. The
  //        vectorizer uses SCEVExpander in several places. Instead, we should
3950   //        keep the dominator tree up-to-date as we go.
3951   updateAnalysis();
3952 
3953   // Fix-up external users of the induction variables.
3954   for (auto &Entry : *Legal->getInductionVars())
3955     fixupIVUsers(Entry.first, Entry.second,
3956                  getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3957                  IVEndValues[Entry.first], LoopMiddleBlock);
3958 
3959   fixLCSSAPHIs();
3960   for (Instruction *PI : PredicatedInstructions)
3961     sinkScalarOperands(&*PI);
3962 
3963   // Remove redundant induction instructions.
3964   cse(LoopVectorBody);
3965 }
3966 
3967 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3968   // In order to support recurrences we need to be able to vectorize Phi nodes.
3969   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3970   // stage #2: We now need to fix the recurrences by adding incoming edges to
3971   // the currently empty PHI nodes. At this point every instruction in the
3972   // original loop is widened to a vector form so we can use them to construct
3973   // the incoming edges.
3974   for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3975     // Handle first-order recurrences and reductions that need to be fixed.
3976     if (Legal->isFirstOrderRecurrence(&Phi))
3977       fixFirstOrderRecurrence(&Phi);
3978     else if (Legal->isReductionVariable(&Phi))
3979       fixReduction(&Phi);
3980   }
3981 }
3982 
3983 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3984   // This is the second phase of vectorizing first-order recurrences. An
3985   // overview of the transformation is described below. Suppose we have the
3986   // following loop.
3987   //
3988   //   for (int i = 0; i < n; ++i)
3989   //     b[i] = a[i] - a[i - 1];
3990   //
3991   // There is a first-order recurrence on "a". For this loop, the shorthand
3992   // scalar IR looks like:
3993   //
3994   //   scalar.ph:
3995   //     s_init = a[-1]
3996   //     br scalar.body
3997   //
3998   //   scalar.body:
3999   //     i = phi [0, scalar.ph], [i+1, scalar.body]
4000   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
4001   //     s2 = a[i]
4002   //     b[i] = s2 - s1
4003   //     br cond, scalar.body, ...
4004   //
  // In this example, s1 is a recurrence because its value depends on the
4006   // previous iteration. In the first phase of vectorization, we created a
4007   // temporary value for s1. We now complete the vectorization and produce the
4008   // shorthand vector IR shown below (for VF = 4, UF = 1).
4009   //
4010   //   vector.ph:
4011   //     v_init = vector(..., ..., ..., a[-1])
4012   //     br vector.body
4013   //
4014   //   vector.body
4015   //     i = phi [0, vector.ph], [i+4, vector.body]
4016   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
4017   //     v2 = a[i, i+1, i+2, i+3];
4018   //     v3 = vector(v1(3), v2(0, 1, 2))
4019   //     b[i, i+1, i+2, i+3] = v2 - v3
4020   //     br cond, vector.body, middle.block
4021   //
4022   //   middle.block:
4023   //     x = v2(3)
4024   //     br scalar.ph
4025   //
4026   //   scalar.ph:
4027   //     s_init = phi [x, middle.block], [a[-1], otherwise]
4028   //     br scalar.body
4029   //
  // After the vector loop finishes executing, we extract the next value of
  // the recurrence (x) to use as the initial value in the scalar loop.
4032 
4033   // Get the original loop preheader and single loop latch.
4034   auto *Preheader = OrigLoop->getLoopPreheader();
4035   auto *Latch = OrigLoop->getLoopLatch();
4036 
4037   // Get the initial and previous values of the scalar recurrence.
4038   auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
4039   auto *Previous = Phi->getIncomingValueForBlock(Latch);
4040 
4041   // Create a vector from the initial value.
4042   auto *VectorInit = ScalarInit;
4043   if (VF > 1) {
4044     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4045     VectorInit = Builder.CreateInsertElement(
4046         UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
4047         Builder.getInt32(VF - 1), "vector.recur.init");
4048   }
4049 
4050   // We constructed a temporary phi node in the first phase of vectorization.
4051   // This phi node will eventually be deleted.
4052   Builder.SetInsertPoint(
4053       cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
4054 
4055   // Create a phi node for the new recurrence. The current value will either be
4056   // the initial value inserted into a vector or loop-varying vector value.
4057   auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
4058   VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
4059 
4060   // Get the vectorized previous value of the last part UF - 1. It appears last
4061   // among all unrolled iterations, due to the order of their construction.
4062   Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
4063 
4064   // Set the insertion point after the previous value if it is an instruction.
4065   // Note that the previous value may have been constant-folded so it is not
4066   // guaranteed to be an instruction in the vector loop. Also, if the previous
4067   // value is a phi node, we should insert after all the phi nodes to avoid
4068   // breaking basic block verification.
4069   if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart) ||
4070       isa<PHINode>(PreviousLastPart))
4071     Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
4072   else
4073     Builder.SetInsertPoint(
4074         &*++BasicBlock::iterator(cast<Instruction>(PreviousLastPart)));
4075 
4076   // We will construct a vector for the recurrence by combining the values for
4077   // the current and previous iterations. This is the required shuffle mask.
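  // For example, with VF = 4 the mask is <3, 4, 5, 6>: the last element of
  // the first (previous) vector followed by the first three elements of the
  // second (current) vector, matching v3 = vector(v1(3), v2(0, 1, 2)) above.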
4078   SmallVector<Constant *, 8> ShuffleMask(VF);
4079   ShuffleMask[0] = Builder.getInt32(VF - 1);
4080   for (unsigned I = 1; I < VF; ++I)
4081     ShuffleMask[I] = Builder.getInt32(I + VF - 1);
4082 
4083   // The vector from which to take the initial value for the current iteration
4084   // (actual or unrolled). Initially, this is the vector phi node.
4085   Value *Incoming = VecPhi;
4086 
4087   // Shuffle the current and previous vector and update the vector parts.
4088   for (unsigned Part = 0; Part < UF; ++Part) {
4089     Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
4090     Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
4091     auto *Shuffle =
4092         VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
4093                                              ConstantVector::get(ShuffleMask))
4094                : Incoming;
4095     PhiPart->replaceAllUsesWith(Shuffle);
4096     cast<Instruction>(PhiPart)->eraseFromParent();
4097     VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
4098     Incoming = PreviousPart;
4099   }
4100 
4101   // Fix the latch value of the new recurrence in the vector loop.
4102   VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4103 
4104   // Extract the last vector element in the middle block. This will be the
4105   // initial value for the recurrence when jumping to the scalar loop.
4106   auto *ExtractForScalar = Incoming;
4107   if (VF > 1) {
4108     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4109     ExtractForScalar = Builder.CreateExtractElement(
4110         ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
4111   }
  // Extract the second-to-last element in the middle block if the
  // Phi is used outside the loop. We need to extract the phi itself
  // and not the last element (the phi update in the current iteration). This
  // will be the value when jumping to the exit block from LoopMiddleBlock,
  // when the scalar loop is not run at all.
4117   Value *ExtractForPhiUsedOutsideLoop = nullptr;
4118   if (VF > 1)
4119     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
4120         Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
  // When the loop is unrolled without vectorizing, initialize
  // ExtractForPhiUsedOutsideLoop with the unrolled value just prior to
  // `Incoming`. This is analogous to the vectorized case above: extracting the
  // second-to-last element when VF > 1.
4125   else if (UF > 1)
4126     ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
4127 
4128   // Fix the initial value of the original recurrence in the scalar loop.
4129   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
4130   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
4131   for (auto *BB : predecessors(LoopScalarPreHeader)) {
4132     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
4133     Start->addIncoming(Incoming, BB);
4134   }
4135 
4136   Phi->setIncomingValue(Phi->getBasicBlockIndex(LoopScalarPreHeader), Start);
4137   Phi->setName("scalar.recur");
4138 
4139   // Finally, fix users of the recurrence outside the loop. The users will need
4140   // either the last value of the scalar recurrence or the last value of the
4141   // vector recurrence we extracted in the middle block. Since the loop is in
4142   // LCSSA form, we just need to find the phi node for the original scalar
4143   // recurrence in the exit block, and then add an edge for the middle block.
4144   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4145     if (LCSSAPhi.getIncomingValue(0) == Phi) {
4146       LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
4147       break;
4148     }
4149   }
4150 }
4151 
4152 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
4153   Constant *Zero = Builder.getInt32(0);
4154 
  // Get its reduction variable descriptor.
4156   assert(Legal->isReductionVariable(Phi) &&
4157          "Unable to find the reduction variable");
4158   RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi];
4159 
4160   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
4161   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
4162   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
4163   RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
4164     RdxDesc.getMinMaxRecurrenceKind();
4165   setDebugLocFromInst(Builder, ReductionStartValue);
4166 
4167   // We need to generate a reduction vector from the incoming scalar.
4168   // To do so, we need to generate the 'identity' vector and override
4169   // one of the elements with the incoming scalar reduction. We need
4170   // to do it in the vector-loop preheader.
4171   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4172 
4173   // This is the vector-clone of the value that leaves the loop.
4174   Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
4175 
  // Find the reduction identity variable. Zero for addition, or, and xor;
  // one for multiplication; -1 for and.
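  // For example (a sketch for an integer add reduction with VF = 4 and scalar
  // start value %s): Identity = <0, 0, 0, 0> and VectorStart = <%s, 0, 0, 0>.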
4178   Value *Identity;
4179   Value *VectorStart;
4180   if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
4181       RK == RecurrenceDescriptor::RK_FloatMinMax) {
    // MinMax reductions have the start value as their identity.
4183     if (VF == 1) {
4184       VectorStart = Identity = ReductionStartValue;
4185     } else {
4186       VectorStart = Identity =
4187         Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
4188     }
4189   } else {
4190     // Handle other reduction kinds:
4191     Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
4192         RK, VecTy->getScalarType());
4193     if (VF == 1) {
4194       Identity = Iden;
4195       // This vector is the Identity vector where the first element is the
4196       // incoming scalar reduction.
4197       VectorStart = ReductionStartValue;
4198     } else {
4199       Identity = ConstantVector::getSplat(VF, Iden);
4200 
4201       // This vector is the Identity vector where the first element is the
4202       // incoming scalar reduction.
4203       VectorStart =
4204         Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
4205     }
4206   }
4207 
4208   // Fix the vector-loop phi.
4209 
4210   // Reductions do not have to start at zero. They can start with
  // any loop-invariant value.
4212   BasicBlock *Latch = OrigLoop->getLoopLatch();
4213   Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
4214   for (unsigned Part = 0; Part < UF; ++Part) {
4215     Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
4216     Value *Val = getOrCreateVectorValue(LoopVal, Part);
    // Make sure to add the reduction start value only to the
    // first unroll part.
4219     Value *StartVal = (Part == 0) ? VectorStart : Identity;
4220     cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
4221     cast<PHINode>(VecRdxPhi)
4222       ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4223   }
4224 
4225   // Before each round, move the insertion point right between
4226   // the PHIs and the values we are going to write.
4227   // This allows us to write both PHINodes and the extractelement
4228   // instructions.
4229   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4230 
4231   setDebugLocFromInst(Builder, LoopExitInst);
4232 
4233   // If the vector reduction can be performed in a smaller type, we truncate
4234   // then extend the loop exit value to enable InstCombine to evaluate the
4235   // entire expression in the smaller type.
4236   if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
4237     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
4238     Builder.SetInsertPoint(
4239         LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
4240     VectorParts RdxParts(UF);
4241     for (unsigned Part = 0; Part < UF; ++Part) {
4242       RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
4243       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4244       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
4245                                         : Builder.CreateZExt(Trunc, VecTy);
4246       for (Value::user_iterator UI = RdxParts[Part]->user_begin();
4247            UI != RdxParts[Part]->user_end();)
4248         if (*UI != Trunc) {
4249           (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
4250           RdxParts[Part] = Extnd;
4251         } else {
4252           ++UI;
4253         }
4254     }
4255     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4256     for (unsigned Part = 0; Part < UF; ++Part) {
4257       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4258       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
4259     }
4260   }
4261 
4262   // Reduce all of the unrolled parts into a single vector.
4263   Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
4264   unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
4265   setDebugLocFromInst(Builder, ReducedPartRdx);
4266   for (unsigned Part = 1; Part < UF; ++Part) {
4267     Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
4268     if (Op != Instruction::ICmp && Op != Instruction::FCmp)
4269       // Floating point operations had to be 'fast' to enable the reduction.
4270       ReducedPartRdx = addFastMathFlag(
4271           Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
4272                               ReducedPartRdx, "bin.rdx"));
4273     else
4274       ReducedPartRdx = RecurrenceDescriptor::createMinMaxOp(
4275           Builder, MinMaxKind, ReducedPartRdx, RdxPart);
4276   }
4277 
4278   if (VF > 1) {
4279     bool NoNaN = Legal->hasFunNoNaNAttr();
4280     ReducedPartRdx =
4281         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
4282     // If the reduction can be performed in a smaller type, we need to extend
4283     // the reduction to the wider type before we branch to the original loop.
4284     if (Phi->getType() != RdxDesc.getRecurrenceType())
4285       ReducedPartRdx =
4286         RdxDesc.isSigned()
4287         ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
4288         : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
4289   }
4290 
4291   // Create a phi node that merges control-flow from the backedge-taken check
4292   // block and the middle block.
4293   PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
4294                                         LoopScalarPreHeader->getTerminator());
4295   for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
4296     BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
4297   BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
4298 
4299   // Now, we need to fix the users of the reduction variable
4300   // inside and outside of the scalar remainder loop.
4301   // We know that the loop is in LCSSA form. We need to update the
4302   // PHI nodes in the exit blocks.
4303   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4304     // All PHINodes need to have a single entry edge, or two if
4305     // we already fixed them.
4306     assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
4307 
4308     // We found a reduction value exit-PHI. Update it with the
4309     // incoming bypass edge.
4310     if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
4311       LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4312   } // end of the LCSSA phi scan.
4313 
  // Fix the scalar loop reduction variable with the incoming reduction sum
  // from the vector body and from the backedge value.
4316   int IncomingEdgeBlockIdx =
4317     Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4318   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4319   // Pick the other block.
4320   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4321   Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4322   Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4323 }
4324 
4325 void InnerLoopVectorizer::fixLCSSAPHIs() {
4326   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4327     if (LCSSAPhi.getNumIncomingValues() == 1) {
4328       assert(OrigLoop->isLoopInvariant(LCSSAPhi.getIncomingValue(0)) &&
4329              "Incoming value isn't loop invariant");
4330       LCSSAPhi.addIncoming(LCSSAPhi.getIncomingValue(0), LoopMiddleBlock);
4331     }
4332   }
4333 }
4334 
4335 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4336   // The basic block and loop containing the predicated instruction.
4337   auto *PredBB = PredInst->getParent();
4338   auto *VectorLoop = LI->getLoopFor(PredBB);
4339 
4340   // Initialize a worklist with the operands of the predicated instruction.
4341   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4342 
4343   // Holds instructions that we need to analyze again. An instruction may be
4344   // reanalyzed if we don't yet know if we can sink it or not.
4345   SmallVector<Instruction *, 8> InstsToReanalyze;
4346 
4347   // Returns true if a given use occurs in the predicated block. Phi nodes use
4348   // their operands in their corresponding predecessor blocks.
4349   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4350     auto *I = cast<Instruction>(U.getUser());
4351     BasicBlock *BB = I->getParent();
4352     if (auto *Phi = dyn_cast<PHINode>(I))
4353       BB = Phi->getIncomingBlock(
4354           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4355     return BB == PredBB;
4356   };
4357 
4358   // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, its
  // operands are then added to the worklist. The algorithm ends when a full
  // pass over the worklist sinks no instructions.
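  //
  // For example (a sketch, not tied to a particular input): if a scalarized
  // address computation feeds only a predicated store that already lives in
  // PredBB, the first pass sinks the address computation; its operands are
  // then reconsidered and may be sunk on a later pass once all of their uses
  // reside in PredBB.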
4362   bool Changed;
4363   do {
4364     // Add the instructions that need to be reanalyzed to the worklist, and
4365     // reset the changed indicator.
4366     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4367     InstsToReanalyze.clear();
4368     Changed = false;
4369 
4370     while (!Worklist.empty()) {
4371       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4372 
4373       // We can't sink an instruction if it is a phi node, is already in the
4374       // predicated block, is not in the loop, or may have side effects.
4375       if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
4376           !VectorLoop->contains(I) || I->mayHaveSideEffects())
4377         continue;
4378 
4379       // It's legal to sink the instruction if all its uses occur in the
4380       // predicated block. Otherwise, there's nothing to do yet, and we may
4381       // need to reanalyze the instruction.
4382       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4383         InstsToReanalyze.push_back(I);
4384         continue;
4385       }
4386 
4387       // Move the instruction to the beginning of the predicated block, and add
      // its operands to the worklist.
4389       I->moveBefore(&*PredBB->getFirstInsertionPt());
4390       Worklist.insert(I->op_begin(), I->op_end());
4391 
4392       // The sinking may have enabled other instructions to be sunk, so we will
4393       // need to iterate.
4394       Changed = true;
4395     }
4396   } while (Changed);
4397 }
4398 
4399 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
4400                                               unsigned VF) {
4401   assert(PN->getParent() == OrigLoop->getHeader() &&
4402          "Non-header phis should have been handled elsewhere");
4403 
4404   PHINode *P = cast<PHINode>(PN);
4405   // In order to support recurrences we need to be able to vectorize Phi nodes.
4406   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4407   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4408   // this value when we vectorize all of the instructions that use the PHI.
4409   if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
4410     for (unsigned Part = 0; Part < UF; ++Part) {
4411       // This is phase one of vectorizing PHIs.
4412       Type *VecTy =
4413           (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
4414       Value *EntryPart = PHINode::Create(
4415           VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4416       VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
4417     }
4418     return;
4419   }
4420 
4421   setDebugLocFromInst(Builder, P);
4422 
4423   // This PHINode must be an induction variable.
4424   // Make sure that we know about it.
4425   assert(Legal->getInductionVars()->count(P) && "Not an induction variable");
4426 
4427   InductionDescriptor II = Legal->getInductionVars()->lookup(P);
4428   const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4429 
4430   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4431   // which can be found from the original scalar operations.
4432   switch (II.getKind()) {
4433   case InductionDescriptor::IK_NoInduction:
4434     llvm_unreachable("Unknown induction");
4435   case InductionDescriptor::IK_IntInduction:
4436   case InductionDescriptor::IK_FpInduction:
4437     llvm_unreachable("Integer/fp induction is handled elsewhere.");
4438   case InductionDescriptor::IK_PtrInduction: {
4439     // Handle the pointer induction variable case.
4440     assert(P->getType()->isPointerTy() && "Unexpected type.");
4441     // This is the normalized GEP that starts counting at zero.
4442     Value *PtrInd = Induction;
4443     PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
4444     // Determine the number of scalars we need to generate for each unroll
4445     // iteration. If the instruction is uniform, we only need to generate the
4446     // first lane. Otherwise, we generate all VF values.
4447     unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
4448     // These are the scalar results. Notice that we don't generate vector GEPs
4449     // because scalar GEPs result in better code.
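    //
    // For example (a sketch, assuming UF = 2, VF = 4 and a non-uniform
    // pointer induction): we emit eight scalar "next.gep" values, one per
    // (Part, Lane) pair, derived from PtrInd + 0 .. PtrInd + 7.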
4450     for (unsigned Part = 0; Part < UF; ++Part) {
4451       for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4452         Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
4453         Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4454         Value *SclrGep = II.transform(Builder, GlobalIdx, PSE.getSE(), DL);
4455         SclrGep->setName("next.gep");
4456         VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
4457       }
4458     }
4459     return;
4460   }
4461   }
4462 }
4463 
4464 /// A helper function for checking whether an integer division-related
4465 /// instruction may divide by zero (in which case it must be predicated if
4466 /// executed conditionally in the scalar code).
4467 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
/// Non-zero divisors that are not compile-time constants will not be
4469 /// converted into multiplication, so we will still end up scalarizing
4470 /// the division, but can do so w/o predication.
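///
/// For example (a sketch; the exact IR depends on the surrounding code):
///   %q = udiv i32 %a, %b   ; non-constant divisor, may be zero -> predicate
///   %r = udiv i32 %a, 7    ; constant non-zero divisor -> no predication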
4471 static bool mayDivideByZero(Instruction &I) {
4472   assert((I.getOpcode() == Instruction::UDiv ||
4473           I.getOpcode() == Instruction::SDiv ||
4474           I.getOpcode() == Instruction::URem ||
4475           I.getOpcode() == Instruction::SRem) &&
4476          "Unexpected instruction");
4477   Value *Divisor = I.getOperand(1);
4478   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4479   return !CInt || CInt->isZero();
4480 }
4481 
4482 void InnerLoopVectorizer::widenInstruction(Instruction &I) {
4483   switch (I.getOpcode()) {
4484   case Instruction::Br:
4485   case Instruction::PHI:
4486     llvm_unreachable("This instruction is handled by a different recipe.");
4487   case Instruction::GetElementPtr: {
4488     // Construct a vector GEP by widening the operands of the scalar GEP as
4489     // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4490     // results in a vector of pointers when at least one operand of the GEP
4491     // is vector-typed. Thus, to keep the representation compact, we only use
4492     // vector-typed operands for loop-varying values.
4493     auto *GEP = cast<GetElementPtrInst>(&I);
4494 
4495     if (VF > 1 && OrigLoop->hasLoopInvariantOperands(GEP)) {
4496       // If we are vectorizing, but the GEP has only loop-invariant operands,
4497       // the GEP we build (by only using vector-typed operands for
4498       // loop-varying values) would be a scalar pointer. Thus, to ensure we
4499       // produce a vector of pointers, we need to either arbitrarily pick an
4500       // operand to broadcast, or broadcast a clone of the original GEP.
4501       // Here, we broadcast a clone of the original.
4502       //
4503       // TODO: If at some point we decide to scalarize instructions having
4504       //       loop-invariant operands, this special case will no longer be
4505       //       required. We would add the scalarization decision to
4506       //       collectLoopScalars() and teach getVectorValue() to broadcast
4507       //       the lane-zero scalar value.
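      //
      // For example (a sketch, assuming VF = 4 and an i32* GEP with only
      // loop-invariant operands): the cloned scalar GEP produces a single
      // i32*, and the splat below broadcasts it to a <4 x i32*> whose lanes
      // are all equal.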
4508       auto *Clone = Builder.Insert(GEP->clone());
4509       for (unsigned Part = 0; Part < UF; ++Part) {
4510         Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4511         VectorLoopValueMap.setVectorValue(&I, Part, EntryPart);
4512         addMetadata(EntryPart, GEP);
4513       }
4514     } else {
4515       // If the GEP has at least one loop-varying operand, we are sure to
4516       // produce a vector of pointers. But if we are only unrolling, we want
4517       // to produce a scalar GEP for each unroll part. Thus, the GEP we
4518       // produce with the code below will be scalar (if VF == 1) or vector
4519       // (otherwise). Note that for the unroll-only case, we still maintain
      // values in the vector mapping (VectorLoopValueMap), as we do for other
4521       // instructions.
4522       for (unsigned Part = 0; Part < UF; ++Part) {
4523         // The pointer operand of the new GEP. If it's loop-invariant, we
4524         // won't broadcast it.
4525         auto *Ptr =
4526             OrigLoop->isLoopInvariant(GEP->getPointerOperand())
4527                 ? GEP->getPointerOperand()
4528                 : getOrCreateVectorValue(GEP->getPointerOperand(), Part);
4529 
4530         // Collect all the indices for the new GEP. If any index is
4531         // loop-invariant, we won't broadcast it.
4532         SmallVector<Value *, 4> Indices;
4533         for (auto &U : make_range(GEP->idx_begin(), GEP->idx_end())) {
4534           if (OrigLoop->isLoopInvariant(U.get()))
4535             Indices.push_back(U.get());
4536           else
4537             Indices.push_back(getOrCreateVectorValue(U.get(), Part));
4538         }
4539 
4540         // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4541         // but it should be a vector, otherwise.
4542         auto *NewGEP = GEP->isInBounds()
4543                            ? Builder.CreateInBoundsGEP(Ptr, Indices)
4544                            : Builder.CreateGEP(Ptr, Indices);
4545         assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
4546                "NewGEP is not a pointer vector");
4547         VectorLoopValueMap.setVectorValue(&I, Part, NewGEP);
4548         addMetadata(NewGEP, GEP);
4549       }
4550     }
4551 
4552     break;
4553   }
4554   case Instruction::UDiv:
4555   case Instruction::SDiv:
4556   case Instruction::SRem:
4557   case Instruction::URem:
4558   case Instruction::Add:
4559   case Instruction::FAdd:
4560   case Instruction::Sub:
4561   case Instruction::FSub:
4562   case Instruction::Mul:
4563   case Instruction::FMul:
4564   case Instruction::FDiv:
4565   case Instruction::FRem:
4566   case Instruction::Shl:
4567   case Instruction::LShr:
4568   case Instruction::AShr:
4569   case Instruction::And:
4570   case Instruction::Or:
4571   case Instruction::Xor: {
4572     // Just widen binops.
4573     auto *BinOp = cast<BinaryOperator>(&I);
4574     setDebugLocFromInst(Builder, BinOp);
4575 
4576     for (unsigned Part = 0; Part < UF; ++Part) {
4577       Value *A = getOrCreateVectorValue(BinOp->getOperand(0), Part);
4578       Value *B = getOrCreateVectorValue(BinOp->getOperand(1), Part);
4579       Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A, B);
4580 
4581       if (BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V))
4582         VecOp->copyIRFlags(BinOp);
4583 
4584       // Use this vector value for all users of the original instruction.
4585       VectorLoopValueMap.setVectorValue(&I, Part, V);
4586       addMetadata(V, BinOp);
4587     }
4588 
4589     break;
4590   }
4591   case Instruction::Select: {
4592     // Widen selects.
4593     // If the selector is loop invariant we can create a select
4594     // instruction with a scalar condition. Otherwise, use vector-select.
4595     auto *SE = PSE.getSE();
4596     bool InvariantCond =
4597         SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop);
4598     setDebugLocFromInst(Builder, &I);
4599 
    // The condition can be loop invariant but still defined inside the
    // loop. This means that we can't just use the original 'cond' value.
    // We have to take the 'vectorized' value and pick the first lane.
    // InstCombine will make this a no-op.
4604 
4605     auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0});
4606 
4607     for (unsigned Part = 0; Part < UF; ++Part) {
4608       Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part);
4609       Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part);
4610       Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part);
4611       Value *Sel =
4612           Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1);
4613       VectorLoopValueMap.setVectorValue(&I, Part, Sel);
4614       addMetadata(Sel, &I);
4615     }
4616 
4617     break;
4618   }
4619 
4620   case Instruction::ICmp:
4621   case Instruction::FCmp: {
4622     // Widen compares. Generate vector compares.
4623     bool FCmp = (I.getOpcode() == Instruction::FCmp);
4624     auto *Cmp = dyn_cast<CmpInst>(&I);
4625     setDebugLocFromInst(Builder, Cmp);
4626     for (unsigned Part = 0; Part < UF; ++Part) {
4627       Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part);
4628       Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part);
4629       Value *C = nullptr;
4630       if (FCmp) {
4631         // Propagate fast math flags.
4632         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4633         Builder.setFastMathFlags(Cmp->getFastMathFlags());
4634         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4635       } else {
4636         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4637       }
4638       VectorLoopValueMap.setVectorValue(&I, Part, C);
4639       addMetadata(C, &I);
4640     }
4641 
4642     break;
4643   }
4644 
4645   case Instruction::ZExt:
4646   case Instruction::SExt:
4647   case Instruction::FPToUI:
4648   case Instruction::FPToSI:
4649   case Instruction::FPExt:
4650   case Instruction::PtrToInt:
4651   case Instruction::IntToPtr:
4652   case Instruction::SIToFP:
4653   case Instruction::UIToFP:
4654   case Instruction::Trunc:
4655   case Instruction::FPTrunc:
4656   case Instruction::BitCast: {
4657     auto *CI = dyn_cast<CastInst>(&I);
4658     setDebugLocFromInst(Builder, CI);
4659 
4660     /// Vectorize casts.
4661     Type *DestTy =
4662         (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
4663 
4664     for (unsigned Part = 0; Part < UF; ++Part) {
4665       Value *A = getOrCreateVectorValue(CI->getOperand(0), Part);
4666       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4667       VectorLoopValueMap.setVectorValue(&I, Part, Cast);
4668       addMetadata(Cast, &I);
4669     }
4670     break;
4671   }
4672 
4673   case Instruction::Call: {
4674     // Ignore dbg intrinsics.
4675     if (isa<DbgInfoIntrinsic>(I))
4676       break;
4677     setDebugLocFromInst(Builder, &I);
4678 
4679     Module *M = I.getParent()->getParent()->getParent();
4680     auto *CI = cast<CallInst>(&I);
4681 
4682     StringRef FnName = CI->getCalledFunction()->getName();
4683     Function *F = CI->getCalledFunction();
4684     Type *RetTy = ToVectorTy(CI->getType(), VF);
4685     SmallVector<Type *, 4> Tys;
4686     for (Value *ArgOperand : CI->arg_operands())
4687       Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
4688 
4689     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4690 
    // The flag shows whether we use an intrinsic or a usual call for the
    // vectorized version of the instruction, i.e. whether it is beneficial to
    // emit the intrinsic call compared to the library call.
4694     bool NeedToScalarize;
4695     unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize);
4696     bool UseVectorIntrinsic =
4697         ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost;
4698     assert((UseVectorIntrinsic || !NeedToScalarize) &&
4699            "Instruction should be scalarized elsewhere.");
4700 
4701     for (unsigned Part = 0; Part < UF; ++Part) {
4702       SmallVector<Value *, 4> Args;
4703       for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
4704         Value *Arg = CI->getArgOperand(i);
4705         // Some intrinsics have a scalar argument - don't replace it with a
4706         // vector.
4707         if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i))
4708           Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part);
4709         Args.push_back(Arg);
4710       }
4711 
4712       Function *VectorF;
4713       if (UseVectorIntrinsic) {
4714         // Use vector version of the intrinsic.
4715         Type *TysForDecl[] = {CI->getType()};
4716         if (VF > 1)
4717           TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4718         VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4719       } else {
4720         // Use vector version of the library call.
4721         StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
4722         assert(!VFnName.empty() && "Vector function name is empty.");
4723         VectorF = M->getFunction(VFnName);
4724         if (!VectorF) {
4725           // Generate a declaration
4726           FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
4727           VectorF =
4728               Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
4729           VectorF->copyAttributesFrom(F);
4730         }
4731       }
4732       assert(VectorF && "Can't create vector function.");
4733 
4734       SmallVector<OperandBundleDef, 1> OpBundles;
4735       CI->getOperandBundlesAsDefs(OpBundles);
4736       CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
4737 
4738       if (isa<FPMathOperator>(V))
4739         V->copyFastMathFlags(CI);
4740 
4741       VectorLoopValueMap.setVectorValue(&I, Part, V);
4742       addMetadata(V, &I);
4743     }
4744 
4745     break;
4746   }
4747 
4748   default:
4749     // This instruction is not vectorized by simple widening.
4750     DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4751     llvm_unreachable("Unhandled instruction!");
4752   } // end of switch.
4753 }
4754 
4755 void InnerLoopVectorizer::updateAnalysis() {
4756   // Forget the original basic block.
4757   PSE.getSE()->forgetLoop(OrigLoop);
4758 
4759   // Update the dominator tree information.
4760   assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
4761          "Entry does not dominate exit.");
4762 
4763   DT->addNewBlock(LoopMiddleBlock,
4764                   LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4765   DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
4766   DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
4767   DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
4768   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
4769 }
4770 
4771 /// \brief Check whether it is safe to if-convert this phi node.
4772 ///
/// Phi nodes with constant expressions that can trap are not safe to
/// if-convert.
4775 static bool canIfConvertPHINodes(BasicBlock *BB) {
4776   for (PHINode &Phi : BB->phis()) {
4777     for (Value *V : Phi.incoming_values())
4778       if (auto *C = dyn_cast<Constant>(V))
4779         if (C->canTrap())
4780           return false;
4781   }
4782   return true;
4783 }
4784 
4785 bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
4786   if (!EnableIfConversion) {
4787     ORE->emit(createMissedAnalysis("IfConversionDisabled")
4788               << "if-conversion is disabled");
4789     return false;
4790   }
4791 
4792   assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");
4793 
4794   // A list of pointers that we can safely read and write to.
  SmallPtrSet<Value *, 8> SafePointers;
4796 
4797   // Collect safe addresses.
4798   for (BasicBlock *BB : TheLoop->blocks()) {
4799     if (blockNeedsPredication(BB))
4800       continue;
4801 
4802     for (Instruction &I : *BB)
4803       if (auto *Ptr = getLoadStorePointerOperand(&I))
        SafePointers.insert(Ptr);
4805   }
4806 
4807   // Collect the blocks that need predication.
4808   BasicBlock *Header = TheLoop->getHeader();
4809   for (BasicBlock *BB : TheLoop->blocks()) {
4810     // We don't support switch statements inside loops.
4811     if (!isa<BranchInst>(BB->getTerminator())) {
4812       ORE->emit(createMissedAnalysis("LoopContainsSwitch", BB->getTerminator())
4813                 << "loop contains a switch statement");
4814       return false;
4815     }
4816 
4817     // We must be able to predicate all blocks that need to be predicated.
4818     if (blockNeedsPredication(BB)) {
      if (!blockCanBePredicated(BB, SafePointers)) {
4820         ORE->emit(createMissedAnalysis("NoCFGForSelect", BB->getTerminator())
4821                   << "control flow cannot be substituted for a select");
4822         return false;
4823       }
4824     } else if (BB != Header && !canIfConvertPHINodes(BB)) {
4825       ORE->emit(createMissedAnalysis("NoCFGForSelect", BB->getTerminator())
4826                 << "control flow cannot be substituted for a select");
4827       return false;
4828     }
4829   }
4830 
4831   // We can if-convert this loop.
4832   return true;
4833 }
4834 
4835 bool LoopVectorizationLegality::canVectorize() {
4836   // Store the result and return it at the end instead of exiting early, in case
4837   // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
4838   bool Result = true;
4839 
4840   bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
4841   // We must have a loop in canonical form. Loops with indirectbr in them cannot
4842   // be canonicalized.
4843   if (!TheLoop->getLoopPreheader()) {
4844     DEBUG(dbgs() << "LV: Loop doesn't have a legal pre-header.\n");
4845     ORE->emit(createMissedAnalysis("CFGNotUnderstood")
4846               << "loop control flow is not understood by vectorizer");
4847     if (DoExtraAnalysis)
4848       Result = false;
4849     else
4850       return false;
4851   }
4852 
  // FIXME: The code is currently dead, since any loop sent to
  // LoopVectorizationLegality is already an innermost loop.
4855   //
4856   // We can only vectorize innermost loops.
4857   if (!TheLoop->empty()) {
4858     ORE->emit(createMissedAnalysis("NotInnermostLoop")
4859               << "loop is not the innermost loop");
4860     if (DoExtraAnalysis)
4861       Result = false;
4862     else
4863       return false;
4864   }
4865 
4866   // We must have a single backedge.
4867   if (TheLoop->getNumBackEdges() != 1) {
4868     ORE->emit(createMissedAnalysis("CFGNotUnderstood")
4869               << "loop control flow is not understood by vectorizer");
4870     if (DoExtraAnalysis)
4871       Result = false;
4872     else
4873       return false;
4874   }
4875 
4876   // We must have a single exiting block.
4877   if (!TheLoop->getExitingBlock()) {
4878     ORE->emit(createMissedAnalysis("CFGNotUnderstood")
4879               << "loop control flow is not understood by vectorizer");
4880     if (DoExtraAnalysis)
4881       Result = false;
4882     else
4883       return false;
4884   }
4885 
  // We only handle bottom-tested loops, i.e. loops in which the condition is
4887   // checked at the end of each iteration. With that we can assume that all
4888   // instructions in the loop are executed the same number of times.
4889   if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
4890     ORE->emit(createMissedAnalysis("CFGNotUnderstood")
4891               << "loop control flow is not understood by vectorizer");
4892     if (DoExtraAnalysis)
4893       Result = false;
4894     else
4895       return false;
4896   }
4897 
4898   // We need to have a loop header.
4899   DEBUG(dbgs() << "LV: Found a loop: " << TheLoop->getHeader()->getName()
4900                << '\n');
4901 
4902   // Check if we can if-convert non-single-bb loops.
4903   unsigned NumBlocks = TheLoop->getNumBlocks();
4904   if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
4905     DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
4906     if (DoExtraAnalysis)
4907       Result = false;
4908     else
4909       return false;
4910   }
4911 
4912   // Check if we can vectorize the instructions and CFG in this loop.
4913   if (!canVectorizeInstrs()) {
4914     DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n");
4915     if (DoExtraAnalysis)
4916       Result = false;
4917     else
4918       return false;
4919   }
4920 
4921   // Go over each instruction and look at memory deps.
4922   if (!canVectorizeMemory()) {
4923     DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n");
4924     if (DoExtraAnalysis)
4925       Result = false;
4926     else
4927       return false;
4928   }
4929 
4930   DEBUG(dbgs() << "LV: We can vectorize this loop"
4931                << (LAI->getRuntimePointerChecking()->Need
4932                        ? " (with a runtime bound check)"
4933                        : "")
4934                << "!\n");
4935 
4936   unsigned SCEVThreshold = VectorizeSCEVCheckThreshold;
4937   if (Hints->getForce() == LoopVectorizeHints::FK_Enabled)
4938     SCEVThreshold = PragmaVectorizeSCEVCheckThreshold;
4939 
4940   if (PSE.getUnionPredicate().getComplexity() > SCEVThreshold) {
4941     ORE->emit(createMissedAnalysis("TooManySCEVRunTimeChecks")
4942               << "Too many SCEV assumptions need to be made and checked "
4943               << "at runtime");
4944     DEBUG(dbgs() << "LV: Too many SCEV checks needed.\n");
4945     if (DoExtraAnalysis)
4946       Result = false;
4947     else
4948       return false;
4949   }
4950 
4951   // Okay! We've done all the tests. If any have failed, return false. Otherwise
4952   // we can vectorize, and at this point we don't have any other mem analysis
4953   // which may limit our maximum vectorization factor, so just return true with
4954   // no restrictions.
4955   return Result;
4956 }
4957 
4958 static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) {
4959   if (Ty->isPointerTy())
4960     return DL.getIntPtrType(Ty);
4961 
  // It is possible that chars or shorts overflow when we ask for the loop's
  // trip count; work around this by changing the type size.
4964   if (Ty->getScalarSizeInBits() < 32)
4965     return Type::getInt32Ty(Ty->getContext());
4966 
4967   return Ty;
4968 }
4969 
4970 static Type *getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) {
4971   Ty0 = convertPointerToIntegerType(DL, Ty0);
4972   Ty1 = convertPointerToIntegerType(DL, Ty1);
4973   if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits())
4974     return Ty0;
4975   return Ty1;
4976 }
4977 
4978 /// \brief Check that the instruction has outside loop users and is not an
4979 /// identified reduction variable.
4980 static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,
4981                                SmallPtrSetImpl<Value *> &AllowedExit) {
4982   // Reduction and Induction instructions are allowed to have exit users. All
4983   // other instructions must not have external users.
4984   if (!AllowedExit.count(Inst))
    // Check that all of the users of the instruction are inside the loop.
4986     for (User *U : Inst->users()) {
4987       Instruction *UI = cast<Instruction>(U);
4988       // This user may be a reduction exit value.
4989       if (!TheLoop->contains(UI)) {
4990         DEBUG(dbgs() << "LV: Found an outside user for : " << *UI << '\n');
4991         return true;
4992       }
4993     }
4994   return false;
4995 }
4996 
4997 void LoopVectorizationLegality::addInductionPhi(
4998     PHINode *Phi, const InductionDescriptor &ID,
4999     SmallPtrSetImpl<Value *> &AllowedExit) {
5000   Inductions[Phi] = ID;
5001 
  // In case this induction also comes with casts that we know we can ignore
  // in the vectorized loop body, record them here. All casts could be recorded
  // here for ignoring, but it suffices to record only the first (as it is the
  // only one that may be used outside the cast sequence).
5006   const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
5007   if (!Casts.empty())
5008     InductionCastsToIgnore.insert(*Casts.begin());
5009 
5010   Type *PhiTy = Phi->getType();
5011   const DataLayout &DL = Phi->getModule()->getDataLayout();
5012 
5013   // Get the widest type.
5014   if (!PhiTy->isFloatingPointTy()) {
5015     if (!WidestIndTy)
5016       WidestIndTy = convertPointerToIntegerType(DL, PhiTy);
5017     else
5018       WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy);
5019   }
5020 
5021   // Int inductions are special because we only allow one IV.
5022   if (ID.getKind() == InductionDescriptor::IK_IntInduction &&
5023       ID.getConstIntStepValue() &&
5024       ID.getConstIntStepValue()->isOne() &&
5025       isa<Constant>(ID.getStartValue()) &&
5026       cast<Constant>(ID.getStartValue())->isNullValue()) {
5027 
5028     // Use the phi node with the widest type as induction. Use the last
5029     // one if there are multiple (no good reason for doing this other
5030     // than it is expedient). We've checked that it begins at zero and
5031     // steps by one, so this is a canonical induction variable.
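    // For example (illustrative), the IV of 'for (i = 0; i < n; ++i)' starts
    // at zero and steps by one, so its phi is a candidate primary induction.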
5032     if (!PrimaryInduction || PhiTy == WidestIndTy)
5033       PrimaryInduction = Phi;
5034   }
5035 
5036   // Both the PHI node itself, and the "post-increment" value feeding
5037   // back into the PHI node may have external users.
5038   // We can allow those uses, except if the SCEVs we have for them rely
5039   // on predicates that only hold within the loop, since allowing the exit
5040   // currently means re-using this SCEV outside the loop.
5041   if (PSE.getUnionPredicate().isAlwaysTrue()) {
5042     AllowedExit.insert(Phi);
5043     AllowedExit.insert(Phi->getIncomingValueForBlock(TheLoop->getLoopLatch()));
5044   }
5045 
5046   DEBUG(dbgs() << "LV: Found an induction variable.\n");
5047 }
5048 
5049 bool LoopVectorizationLegality::canVectorizeInstrs() {
5050   BasicBlock *Header = TheLoop->getHeader();
5051 
5052   // Look for the attribute signaling the absence of NaNs.
5053   Function &F = *Header->getParent();
5054   HasFunNoNaNAttr =
5055       F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true";
5056 
5057   // For each block in the loop.
5058   for (BasicBlock *BB : TheLoop->blocks()) {
5059     // Scan the instructions in the block and look for hazards.
5060     for (Instruction &I : *BB) {
5061       if (auto *Phi = dyn_cast<PHINode>(&I)) {
5062         Type *PhiTy = Phi->getType();
5063         // Check that this PHI type is allowed.
5064         if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() &&
5065             !PhiTy->isPointerTy()) {
5066           ORE->emit(createMissedAnalysis("CFGNotUnderstood", Phi)
5067                     << "loop control flow is not understood by vectorizer");
5068           DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n");
5069           return false;
5070         }
5071 
5072         // If this PHINode is not in the header block, then we know that we
5073         // can convert it to select during if-conversion. No need to check if
5074         // the PHIs in this block are induction or reduction variables.
5075         if (BB != Header) {
5076           // Check that this instruction has no outside users or is an
5077           // identified reduction value with an outside user.
5078           if (!hasOutsideLoopUser(TheLoop, Phi, AllowedExit))
5079             continue;
5080           ORE->emit(createMissedAnalysis("NeitherInductionNorReduction", Phi)
5081                     << "value could not be identified as "
5082                        "an induction or reduction variable");
5083           return false;
5084         }
5085 
5086         // We only allow if-converted PHIs with exactly two incoming values.
5087         if (Phi->getNumIncomingValues() != 2) {
5088           ORE->emit(createMissedAnalysis("CFGNotUnderstood", Phi)
5089                     << "control flow not understood by vectorizer");
5090           DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
5091           return false;
5092         }
5093 
5094         RecurrenceDescriptor RedDes;
5095         if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes, DB, AC,
5096                                                  DT)) {
5097           if (RedDes.hasUnsafeAlgebra())
5098             Requirements->addUnsafeAlgebraInst(RedDes.getUnsafeAlgebraInst());
5099           AllowedExit.insert(RedDes.getLoopExitInstr());
5100           Reductions[Phi] = RedDes;
5101           continue;
5102         }
5103 
5104         InductionDescriptor ID;
5105         if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID)) {
5106           addInductionPhi(Phi, ID, AllowedExit);
5107           if (ID.hasUnsafeAlgebra() && !HasFunNoNaNAttr)
5108             Requirements->addUnsafeAlgebraInst(ID.getUnsafeAlgebraInst());
5109           continue;
5110         }
5111 
5112         if (RecurrenceDescriptor::isFirstOrderRecurrence(Phi, TheLoop,
5113                                                          SinkAfter, DT)) {
5114           FirstOrderRecurrences.insert(Phi);
5115           continue;
5116         }
5117 
        // As a last resort, coerce the PHI to an AddRec expression
        // and retry classifying it as an induction PHI.
5120         if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID, true)) {
5121           addInductionPhi(Phi, ID, AllowedExit);
5122           continue;
5123         }
5124 
5125         ORE->emit(createMissedAnalysis("NonReductionValueUsedOutsideLoop", Phi)
5126                   << "value that could not be identified as "
5127                      "reduction is used outside the loop");
5128         DEBUG(dbgs() << "LV: Found an unidentified PHI." << *Phi << "\n");
5129         return false;
5130       } // end of PHI handling
5131 
5132       // We handle calls that:
5133       //   * Are debug info intrinsics.
5134       //   * Have a mapping to an IR intrinsic.
5135       //   * Have a vector version available.
5136       auto *CI = dyn_cast<CallInst>(&I);
5137       if (CI && !getVectorIntrinsicIDForCall(CI, TLI) &&
5138           !isa<DbgInfoIntrinsic>(CI) &&
5139           !(CI->getCalledFunction() && TLI &&
5140             TLI->isFunctionVectorizable(CI->getCalledFunction()->getName()))) {
5141         ORE->emit(createMissedAnalysis("CantVectorizeCall", CI)
5142                   << "call instruction cannot be vectorized");
5143         DEBUG(dbgs() << "LV: Found a non-intrinsic, non-libfunc callsite.\n");
5144         return false;
5145       }
5146 
      // Intrinsics such as powi, cttz and ctlz are legal to vectorize if the
      // second argument is the same (i.e., loop invariant).
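      // For example (illustrative), 'powi(a[i], k)' with a loop-invariant 'k'
      // can be widened, whereas 'powi(a[i], b[i])' cannot.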
5149       if (CI && hasVectorInstrinsicScalarOpd(
5150                     getVectorIntrinsicIDForCall(CI, TLI), 1)) {
5151         auto *SE = PSE.getSE();
5152         if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(1)), TheLoop)) {
5153           ORE->emit(createMissedAnalysis("CantVectorizeIntrinsic", CI)
5154                     << "intrinsic instruction cannot be vectorized");
5155           DEBUG(dbgs() << "LV: Found unvectorizable intrinsic " << *CI << "\n");
5156           return false;
5157         }
5158       }
5159 
5160       // Check that the instruction return type is vectorizable.
5161       // Also, we can't vectorize extractelement instructions.
5162       if ((!VectorType::isValidElementType(I.getType()) &&
5163            !I.getType()->isVoidTy()) ||
5164           isa<ExtractElementInst>(I)) {
5165         ORE->emit(createMissedAnalysis("CantVectorizeInstructionReturnType", &I)
5166                   << "instruction return type cannot be vectorized");
5167         DEBUG(dbgs() << "LV: Found unvectorizable type.\n");
5168         return false;
5169       }
5170 
5171       // Check that the stored type is vectorizable.
5172       if (auto *ST = dyn_cast<StoreInst>(&I)) {
5173         Type *T = ST->getValueOperand()->getType();
5174         if (!VectorType::isValidElementType(T)) {
5175           ORE->emit(createMissedAnalysis("CantVectorizeStore", ST)
5176                     << "store instruction cannot be vectorized");
5177           return false;
5178         }
5179 
        // FP instructions can allow unsafe algebra, thus are vectorizable by
        // non-IEEE-754 compliant SIMD units.
5182         // This applies to floating-point math operations and calls, not memory
5183         // operations, shuffles, or casts, as they don't change precision or
5184         // semantics.
5185       } else if (I.getType()->isFloatingPointTy() && (CI || I.isBinaryOp()) &&
5186                  !I.isFast()) {
5187         DEBUG(dbgs() << "LV: Found FP op with unsafe algebra.\n");
5188         Hints->setPotentiallyUnsafe();
5189       }
5190 
5191       // Reduction instructions are allowed to have exit users.
5192       // All other instructions must not have external users.
5193       if (hasOutsideLoopUser(TheLoop, &I, AllowedExit)) {
5194         ORE->emit(createMissedAnalysis("ValueUsedOutsideLoop", &I)
5195                   << "value cannot be used outside the loop");
5196         return false;
5197       }
5198     } // next instr.
5199   }
5200 
5201   if (!PrimaryInduction) {
5202     DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
5203     if (Inductions.empty()) {
5204       ORE->emit(createMissedAnalysis("NoInductionVariable")
5205                 << "loop induction variable could not be identified");
5206       return false;
5207     }
5208   }
5209 
5210   // Now we know the widest induction type, check if our found induction
5211   // is the same size. If it's not, unset it here and InnerLoopVectorizer
5212   // will create another.
5213   if (PrimaryInduction && WidestIndTy != PrimaryInduction->getType())
5214     PrimaryInduction = nullptr;
5215 
5216   return true;
5217 }
5218 
5219 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
5220   // We should not collect Scalars more than once per VF. Right now, this
5221   // function is called from collectUniformsAndScalars(), which already does
5222   // this check. Collecting Scalars for VF=1 does not make any sense.
5223   assert(VF >= 2 && !Scalars.count(VF) &&
5224          "This function should not be visited twice for the same VF");
5225 
5226   SmallSetVector<Instruction *, 8> Worklist;
5227 
5228   // These sets are used to seed the analysis with pointers used by memory
5229   // accesses that will remain scalar.
5230   SmallSetVector<Instruction *, 8> ScalarPtrs;
5231   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
5232 
5233   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
5234   // The pointer operands of loads and stores will be scalar as long as the
5235   // memory access is not a gather or scatter operation. The value operand of a
5236   // store will remain scalar if the store is scalarized.
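  // For example (illustrative), a consecutive widened load still consumes its
  // pointer operand as a scalar, whereas a gathered load does not; a
  // scalarized store also consumes its value operand as a scalar.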
5237   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
5238     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
5239     assert(WideningDecision != CM_Unknown &&
5240            "Widening decision should be ready at this moment");
5241     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
5242       if (Ptr == Store->getValueOperand())
5243         return WideningDecision == CM_Scalarize;
5244     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
5245            "Ptr is neither a value or pointer operand");
5246     return WideningDecision != CM_GatherScatter;
5247   };
5248 
5249   // A helper that returns true if the given value is a bitcast or
5250   // getelementptr instruction contained in the loop.
5251   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
5252     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
5253             isa<GetElementPtrInst>(V)) &&
5254            !TheLoop->isLoopInvariant(V);
5255   };
5256 
5257   // A helper that evaluates a memory access's use of a pointer. If the use
5258   // will be a scalar use, and the pointer is only used by memory accesses, we
5259   // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
5260   // PossibleNonScalarPtrs.
5261   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
5262     // We only care about bitcast and getelementptr instructions contained in
5263     // the loop.
5264     if (!isLoopVaryingBitCastOrGEP(Ptr))
5265       return;
5266 
5267     // If the pointer has already been identified as scalar (e.g., if it was
5268     // also identified as uniform), there's nothing to do.
5269     auto *I = cast<Instruction>(Ptr);
5270     if (Worklist.count(I))
5271       return;
5272 
5273     // If the use of the pointer will be a scalar use, and all users of the
5274     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
5275     // place the pointer in PossibleNonScalarPtrs.
5276     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
5277           return isa<LoadInst>(U) || isa<StoreInst>(U);
5278         }))
5279       ScalarPtrs.insert(I);
5280     else
5281       PossibleNonScalarPtrs.insert(I);
5282   };
5283 
5284   // We seed the scalars analysis with three classes of instructions: (1)
5285   // instructions marked uniform-after-vectorization, (2) bitcast and
5286   // getelementptr instructions used by memory accesses requiring a scalar use,
5287   // and (3) pointer induction variables and their update instructions (we
5288   // currently only scalarize these).
5289   //
5290   // (1) Add to the worklist all instructions that have been identified as
5291   // uniform-after-vectorization.
5292   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
5293 
5294   // (2) Add to the worklist all bitcast and getelementptr instructions used by
5295   // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory access is not a gather or
5297   // scatter operation. The value operand of a store will remain scalar if the
5298   // store is scalarized.
5299   for (auto *BB : TheLoop->blocks())
5300     for (auto &I : *BB) {
5301       if (auto *Load = dyn_cast<LoadInst>(&I)) {
5302         evaluatePtrUse(Load, Load->getPointerOperand());
5303       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
5304         evaluatePtrUse(Store, Store->getPointerOperand());
5305         evaluatePtrUse(Store, Store->getValueOperand());
5306       }
5307     }
5308   for (auto *I : ScalarPtrs)
5309     if (!PossibleNonScalarPtrs.count(I)) {
5310       DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
5311       Worklist.insert(I);
5312     }
5313 
5314   // (3) Add to the worklist all pointer induction variables and their update
5315   // instructions.
5316   //
5317   // TODO: Once we are able to vectorize pointer induction variables we should
5318   //       no longer insert them into the worklist here.
5319   auto *Latch = TheLoop->getLoopLatch();
5320   for (auto &Induction : *Legal->getInductionVars()) {
5321     auto *Ind = Induction.first;
5322     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5323     if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
5324       continue;
5325     Worklist.insert(Ind);
5326     Worklist.insert(IndUpdate);
5327     DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
5328     DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate << "\n");
5329   }
5330 
5331   // Insert the forced scalars.
5332   // FIXME: Currently widenPHIInstruction() often creates a dead vector
5333   // induction variable when the PHI user is scalarized.
5334   if (ForcedScalars.count(VF))
5335     for (auto *I : ForcedScalars.find(VF)->second)
5336       Worklist.insert(I);
5337 
5338   // Expand the worklist by looking through any bitcasts and getelementptr
5339   // instructions we've already identified as scalar. This is similar to the
5340   // expansion step in collectLoopUniforms(); however, here we're only
5341   // expanding to include additional bitcasts and getelementptr instructions.
5342   unsigned Idx = 0;
5343   while (Idx != Worklist.size()) {
5344     Instruction *Dst = Worklist[Idx++];
5345     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
5346       continue;
5347     auto *Src = cast<Instruction>(Dst->getOperand(0));
5348     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
5349           auto *J = cast<Instruction>(U);
5350           return !TheLoop->contains(J) || Worklist.count(J) ||
5351                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
5352                   isScalarUse(J, Src));
5353         })) {
5354       Worklist.insert(Src);
5355       DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
5356     }
5357   }
5358 
5359   // An induction variable will remain scalar if all users of the induction
5360   // variable and induction variable update remain scalar.
5361   for (auto &Induction : *Legal->getInductionVars()) {
5362     auto *Ind = Induction.first;
5363     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5364 
5365     // We already considered pointer induction variables, so there's no reason
5366     // to look at their users again.
5367     //
5368     // TODO: Once we are able to vectorize pointer induction variables we
5369     //       should no longer skip over them here.
5370     if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
5371       continue;
5372 
5373     // Determine if all users of the induction variable are scalar after
5374     // vectorization.
5375     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
5376       auto *I = cast<Instruction>(U);
5377       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
5378     });
5379     if (!ScalarInd)
5380       continue;
5381 
5382     // Determine if all users of the induction variable update instruction are
5383     // scalar after vectorization.
5384     auto ScalarIndUpdate =
5385         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5386           auto *I = cast<Instruction>(U);
5387           return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
5388         });
5389     if (!ScalarIndUpdate)
5390       continue;
5391 
5392     // The induction variable and its update instruction will remain scalar.
5393     Worklist.insert(Ind);
5394     Worklist.insert(IndUpdate);
5395     DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
5396     DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate << "\n");
5397   }
5398 
5399   Scalars[VF].insert(Worklist.begin(), Worklist.end());
5400 }
5401 
5402 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) {
5403   if (!Legal->blockNeedsPredication(I->getParent()))
5404     return false;
  switch (I->getOpcode()) {
5406   default:
5407     break;
5408   case Instruction::Load:
5409   case Instruction::Store: {
5410     if (!Legal->isMaskRequired(I))
5411       return false;
5412     auto *Ptr = getLoadStorePointerOperand(I);
5413     auto *Ty = getMemInstValueType(I);
5414     return isa<LoadInst>(I) ?
5415         !(isLegalMaskedLoad(Ty, Ptr)  || isLegalMaskedGather(Ty))
5416       : !(isLegalMaskedStore(Ty, Ptr) || isLegalMaskedScatter(Ty));
5417   }
5418   case Instruction::UDiv:
5419   case Instruction::SDiv:
5420   case Instruction::SRem:
5421   case Instruction::URem:
5422     return mayDivideByZero(*I);
5423   }
5424   return false;
5425 }
5426 
5427 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
5428                                                                unsigned VF) {
5429   // Get and ensure we have a valid memory instruction.
5430   LoadInst *LI = dyn_cast<LoadInst>(I);
5431   StoreInst *SI = dyn_cast<StoreInst>(I);
5432   assert((LI || SI) && "Invalid memory instruction");
5433 
5434   auto *Ptr = getLoadStorePointerOperand(I);
5435 
5436   // In order to be widened, the pointer should be consecutive, first of all.
5437   if (!Legal->isConsecutivePtr(Ptr))
5438     return false;
5439 
5440   // If the instruction is a store located in a predicated block, it will be
5441   // scalarized.
5442   if (isScalarWithPredication(I))
5443     return false;
5444 
  // If the instruction's allocated size doesn't equal its type size, it
  // requires padding and will be scalarized.
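  // For example (illustrative), an i24 is padded out to 4 bytes in memory, so
  // consecutive i24 accesses are not bit-compatible with a <VF x i24> vector.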
5447   auto &DL = I->getModule()->getDataLayout();
5448   auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
5449   if (hasIrregularType(ScalarTy, DL, VF))
5450     return false;
5451 
5452   return true;
5453 }
5454 
5455 void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
5456   // We should not collect Uniforms more than once per VF. Right now,
5457   // this function is called from collectUniformsAndScalars(), which
5458   // already does this check. Collecting Uniforms for VF=1 does not make any
5459   // sense.
5460 
5461   assert(VF >= 2 && !Uniforms.count(VF) &&
5462          "This function should not be visited twice for the same VF");
5463 
  // Visit the list of Uniforms. Even if we find no uniform value, we won't
  // analyze this VF again: Uniforms.count(VF) will return 1.
5466   Uniforms[VF].clear();
5467 
5468   // We now know that the loop is vectorizable!
5469   // Collect instructions inside the loop that will remain uniform after
5470   // vectorization.
5471 
5472   // Global values, params and instructions outside of current loop are out of
5473   // scope.
5474   auto isOutOfScope = [&](Value *V) -> bool {
5475     Instruction *I = dyn_cast<Instruction>(V);
5476     return (!I || !TheLoop->contains(I));
5477   };
5478 
5479   SetVector<Instruction *> Worklist;
5480   BasicBlock *Latch = TheLoop->getLoopLatch();
5481 
5482   // Start with the conditional branch. If the branch condition is an
5483   // instruction contained in the loop that is only used by the branch, it is
5484   // uniform.
5485   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
5486   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) {
5487     Worklist.insert(Cmp);
5488     DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n");
5489   }
5490 
5491   // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
5492   // are pointers that are treated like consecutive pointers during
5493   // vectorization. The pointer operands of interleaved accesses are an
5494   // example.
5495   SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;
5496 
5497   // Holds pointer operands of instructions that are possibly non-uniform.
5498   SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;
5499 
5500   auto isUniformDecision = [&](Instruction *I, unsigned VF) {
5501     InstWidening WideningDecision = getWideningDecision(I, VF);
5502     assert(WideningDecision != CM_Unknown &&
5503            "Widening decision should be ready at this moment");
5504 
5505     return (WideningDecision == CM_Widen ||
5506             WideningDecision == CM_Widen_Reverse ||
5507             WideningDecision == CM_Interleave);
5508   };
5509   // Iterate over the instructions in the loop, and collect all
5510   // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
5511   // that a consecutive-like pointer operand will be scalarized, we collect it
5512   // in PossibleNonUniformPtrs instead. We use two sets here because a single
5513   // getelementptr instruction can be used by both vectorized and scalarized
5514   // memory instructions. For example, if a loop loads and stores from the same
5515   // location, but the store is conditional, the store will be scalarized, and
5516   // the getelementptr won't remain uniform.
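  // For example (illustrative):
  //   for (i) { t = A[i]; if (c) A[i] = t + 1; }
  // Here the getelementptr computing &A[i] feeds both the widened load and
  // the scalarized conditional store, so it is not treated as uniform.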
5517   for (auto *BB : TheLoop->blocks())
5518     for (auto &I : *BB) {
5519       // If there's no pointer operand, there's nothing to do.
5520       auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
5521       if (!Ptr)
5522         continue;
5523 
5524       // True if all users of Ptr are memory accesses that have Ptr as their
5525       // pointer operand.
5526       auto UsersAreMemAccesses =
5527           llvm::all_of(Ptr->users(), [&](User *U) -> bool {
5528             return getLoadStorePointerOperand(U) == Ptr;
5529           });
5530 
5531       // Ensure the memory instruction will not be scalarized or used by
5532       // gather/scatter, making its pointer operand non-uniform. If the pointer
5533       // operand is used by any instruction other than a memory access, we
5534       // conservatively assume the pointer operand may be non-uniform.
5535       if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
5536         PossibleNonUniformPtrs.insert(Ptr);
5537 
5538       // If the memory instruction will be vectorized and its pointer operand
5539       // is consecutive-like, or interleaving - the pointer operand should
5540       // remain uniform.
5541       else
5542         ConsecutiveLikePtrs.insert(Ptr);
5543     }
5544 
5545   // Add to the Worklist all consecutive and consecutive-like pointers that
5546   // aren't also identified as possibly non-uniform.
5547   for (auto *V : ConsecutiveLikePtrs)
5548     if (!PossibleNonUniformPtrs.count(V)) {
5549       DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n");
5550       Worklist.insert(V);
5551     }
5552 
  // Expand Worklist in topological order: whenever a new instruction
  // is added, its users should be either already inside Worklist, or
  // out of scope. This ensures a uniform instruction will only be used
  // by uniform instructions or out-of-scope instructions.
5557   unsigned idx = 0;
5558   while (idx != Worklist.size()) {
5559     Instruction *I = Worklist[idx++];
5560 
5561     for (auto OV : I->operand_values()) {
5562       if (isOutOfScope(OV))
5563         continue;
5564       auto *OI = cast<Instruction>(OV);
5565       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
5566             auto *J = cast<Instruction>(U);
5567             return !TheLoop->contains(J) || Worklist.count(J) ||
5568                    (OI == getLoadStorePointerOperand(J) &&
5569                     isUniformDecision(J, VF));
5570           })) {
5571         Worklist.insert(OI);
5572         DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n");
5573       }
5574     }
5575   }
5576 
5577   // Returns true if Ptr is the pointer operand of a memory access instruction
5578   // I, and I is known to not require scalarization.
5579   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
5580     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
5581   };
5582 
5583   // For an instruction to be added into Worklist above, all its users inside
5584   // the loop should also be in Worklist. However, this condition cannot be
5585   // true for phi nodes that form a cyclic dependence. We must process phi
5586   // nodes separately. An induction variable will remain uniform if all users
5587   // of the induction variable and induction variable update remain uniform.
5588   // The code below handles both pointer and non-pointer induction variables.
5589   for (auto &Induction : *Legal->getInductionVars()) {
5590     auto *Ind = Induction.first;
5591     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5592 
5593     // Determine if all users of the induction variable are uniform after
5594     // vectorization.
5595     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
5596       auto *I = cast<Instruction>(U);
5597       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
5598              isVectorizedMemAccessUse(I, Ind);
5599     });
5600     if (!UniformInd)
5601       continue;
5602 
5603     // Determine if all users of the induction variable update instruction are
5604     // uniform after vectorization.
5605     auto UniformIndUpdate =
5606         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5607           auto *I = cast<Instruction>(U);
5608           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
5609                  isVectorizedMemAccessUse(I, IndUpdate);
5610         });
5611     if (!UniformIndUpdate)
5612       continue;
5613 
5614     // The induction variable and its update instruction will remain uniform.
5615     Worklist.insert(Ind);
5616     Worklist.insert(IndUpdate);
5617     DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ind << "\n");
5618     DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate << "\n");
5619   }
5620 
5621   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
5622 }
5623 
5624 bool LoopVectorizationLegality::canVectorizeMemory() {
5625   LAI = &(*GetLAA)(*TheLoop);
5626   const OptimizationRemarkAnalysis *LAR = LAI->getReport();
5627   if (LAR) {
5628     ORE->emit([&]() {
5629       return OptimizationRemarkAnalysis(Hints->vectorizeAnalysisPassName(),
5630                                         "loop not vectorized: ", *LAR);
5631     });
5632   }
5633   if (!LAI->canVectorizeMemory())
5634     return false;
5635 
5636   if (LAI->hasStoreToLoopInvariantAddress()) {
5637     ORE->emit(createMissedAnalysis("CantVectorizeStoreToLoopInvariantAddress")
5638               << "write to a loop invariant address could not be vectorized");
5639     DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n");
5640     return false;
5641   }
5642 
5643   Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks());
5644   PSE.addPredicate(LAI->getPSE().getUnionPredicate());
5645 
5646   return true;
5647 }
5648 
5649 bool LoopVectorizationLegality::isInductionPhi(const Value *V) {
5650   Value *In0 = const_cast<Value *>(V);
5651   PHINode *PN = dyn_cast_or_null<PHINode>(In0);
5652   if (!PN)
5653     return false;
5654 
5655   return Inductions.count(PN);
5656 }
5657 
5658 bool LoopVectorizationLegality::isCastedInductionVariable(const Value *V) {
5659   auto *Inst = dyn_cast<Instruction>(V);
5660   return (Inst && InductionCastsToIgnore.count(Inst));
5661 }
5662 
5663 bool LoopVectorizationLegality::isInductionVariable(const Value *V) {
5664   return isInductionPhi(V) || isCastedInductionVariable(V);
5665 }
5666 
5667 bool LoopVectorizationLegality::isFirstOrderRecurrence(const PHINode *Phi) {
5668   return FirstOrderRecurrences.count(Phi);
5669 }
5670 
5671 bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) {
5672   return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
5673 }
5674 
5675 bool LoopVectorizationLegality::blockCanBePredicated(
5676     BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs) {
5677   const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();
5678 
5679   for (Instruction &I : *BB) {
5680     // Check that we don't have a constant expression that can trap as operand.
5681     for (Value *Operand : I.operands()) {
5682       if (auto *C = dyn_cast<Constant>(Operand))
5683         if (C->canTrap())
5684           return false;
5685     }
5686     // We might be able to hoist the load.
5687     if (I.mayReadFromMemory()) {
5688       auto *LI = dyn_cast<LoadInst>(&I);
5689       if (!LI)
5690         return false;
5691       if (!SafePtrs.count(LI->getPointerOperand())) {
5692         // !llvm.mem.parallel_loop_access implies if-conversion safety.
5693         // Otherwise, record that the load needs (real or emulated) masking
5694         // and let the cost model decide.
5695         if (!IsAnnotatedParallel)
5696           MaskedOp.insert(LI);
5697         continue;
5698       }
5699     }
5700 
5701     if (I.mayWriteToMemory()) {
5702       auto *SI = dyn_cast<StoreInst>(&I);
5703       if (!SI)
5704         return false;
5705       // Predicated store requires some form of masking:
5706       // 1) masked store HW instruction,
5707       // 2) emulation via load-blend-store (only if safe and legal to do so,
5708       //    be aware on the race conditions), or
5709       // 3) element-by-element predicate check and scalar store.
5710       MaskedOp.insert(SI);
5711       continue;
5712     }
5713     if (I.mayThrow())
5714       return false;
5715   }
5716 
5717   return true;
5718 }
5719 
5720 void InterleavedAccessInfo::collectConstStrideAccesses(
5721     MapVector<Instruction *, StrideDescriptor> &AccessStrideInfo,
5722     const ValueToValueMap &Strides) {
5723   auto &DL = TheLoop->getHeader()->getModule()->getDataLayout();
5724 
5725   // Since it's desired that the load/store instructions be maintained in
5726   // "program order" for the interleaved access analysis, we have to visit the
5727   // blocks in the loop in reverse postorder (i.e., in a topological order).
5728   // Such an ordering will ensure that any load/store that may be executed
5729   // before a second load/store will precede the second load/store in
5730   // AccessStrideInfo.
5731   LoopBlocksDFS DFS(TheLoop);
5732   DFS.perform(LI);
5733   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO()))
5734     for (auto &I : *BB) {
5735       auto *LI = dyn_cast<LoadInst>(&I);
5736       auto *SI = dyn_cast<StoreInst>(&I);
5737       if (!LI && !SI)
5738         continue;
5739 
5740       Value *Ptr = getLoadStorePointerOperand(&I);
5741       // We don't check wrapping here because we don't know yet if Ptr will be
5742       // part of a full group or a group with gaps. Checking wrapping for all
5743       // pointers (even those that end up in groups with no gaps) will be overly
5744       // conservative. For full groups, wrapping should be ok since if we would
5745       // wrap around the address space we would do a memory access at nullptr
5746       // even without the transformation. The wrapping checks are therefore
5747       // deferred until after we've formed the interleaved groups.
5748       int64_t Stride = getPtrStride(PSE, Ptr, TheLoop, Strides,
5749                                     /*Assume=*/true, /*ShouldCheckWrap=*/false);
5750 
5751       const SCEV *Scev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr);
5752       PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
5753       uint64_t Size = DL.getTypeAllocSize(PtrTy->getElementType());
5754 
5755       // An alignment of 0 means target ABI alignment.
5756       unsigned Align = getMemInstAlignment(&I);
5757       if (!Align)
5758         Align = DL.getABITypeAlignment(PtrTy->getElementType());
5759 
5760       AccessStrideInfo[&I] = StrideDescriptor(Stride, Scev, Size, Align);
5761     }
5762 }
5763 
5764 // Analyze interleaved accesses and collect them into interleaved load and
5765 // store groups.
5766 //
5767 // When generating code for an interleaved load group, we effectively hoist all
5768 // loads in the group to the location of the first load in program order. When
5769 // generating code for an interleaved store group, we sink all stores to the
5770 // location of the last store. This code motion can change the order of load
5771 // and store instructions and may break dependences.
5772 //
5773 // The code generation strategy mentioned above ensures that we won't violate
5774 // any write-after-read (WAR) dependences.
5775 //
5776 // E.g., for the WAR dependence:  a = A[i];      // (1)
5777 //                                A[i] = b;      // (2)
5778 //
5779 // The store group of (2) is always inserted at or below (2), and the load
5780 // group of (1) is always inserted at or above (1). Thus, the instructions will
5781 // never be reordered. All other dependences are checked to ensure the
5782 // correctness of the instruction reordering.
5783 //
5784 // The algorithm visits all memory accesses in the loop in bottom-up program
5785 // order. Program order is established by traversing the blocks in the loop in
5786 // reverse postorder when collecting the accesses.
5787 //
5788 // We visit the memory accesses in bottom-up order because it can simplify the
5789 // construction of store groups in the presence of write-after-write (WAW)
5790 // dependences.
5791 //
5792 // E.g., for the WAW dependence:  A[i] = a;      // (1)
5793 //                                A[i] = b;      // (2)
5794 //                                A[i + 1] = c;  // (3)
5795 //
5796 // We will first create a store group with (3) and (2). (1) can't be added to
5797 // this group because it and (2) are dependent. However, (1) can be grouped
5798 // with other accesses that may precede it in program order. Note that a
5799 // bottom-up order does not imply that WAW dependences should not be checked.
5800 void InterleavedAccessInfo::analyzeInterleaving() {
5801   DEBUG(dbgs() << "LV: Analyzing interleaved accesses...\n");
5802   const ValueToValueMap &Strides = LAI->getSymbolicStrides();
5803 
5804   // Holds all accesses with a constant stride.
5805   MapVector<Instruction *, StrideDescriptor> AccessStrideInfo;
5806   collectConstStrideAccesses(AccessStrideInfo, Strides);
5807 
5808   if (AccessStrideInfo.empty())
5809     return;
5810 
5811   // Collect the dependences in the loop.
5812   collectDependences();
5813 
5814   // Holds all interleaved store groups temporarily.
5815   SmallSetVector<InterleaveGroup *, 4> StoreGroups;
5816   // Holds all interleaved load groups temporarily.
5817   SmallSetVector<InterleaveGroup *, 4> LoadGroups;
5818 
5819   // Search in bottom-up program order for pairs of accesses (A and B) that can
5820   // form interleaved load or store groups. In the algorithm below, access A
5821   // precedes access B in program order. We initialize a group for B in the
5822   // outer loop of the algorithm, and then in the inner loop, we attempt to
5823   // insert each A into B's group if:
5824   //
5825   //  1. A and B have the same stride,
5826   //  2. A and B have the same memory object size, and
5827   //  3. A belongs in B's group according to its distance from B.
5828   //
5829   // Special care is taken to ensure group formation will not break any
5830   // dependences.
5831   for (auto BI = AccessStrideInfo.rbegin(), E = AccessStrideInfo.rend();
5832        BI != E; ++BI) {
5833     Instruction *B = BI->first;
5834     StrideDescriptor DesB = BI->second;
5835 
5836     // Initialize a group for B if it has an allowable stride. Even if we don't
5837     // create a group for B, we continue with the bottom-up algorithm to ensure
5838     // we don't break any of B's dependences.
5839     InterleaveGroup *Group = nullptr;
5840     if (isStrided(DesB.Stride)) {
5841       Group = getInterleaveGroup(B);
5842       if (!Group) {
5843         DEBUG(dbgs() << "LV: Creating an interleave group with:" << *B << '\n');
5844         Group = createInterleaveGroup(B, DesB.Stride, DesB.Align);
5845       }
5846       if (B->mayWriteToMemory())
5847         StoreGroups.insert(Group);
5848       else
5849         LoadGroups.insert(Group);
5850     }
5851 
5852     for (auto AI = std::next(BI); AI != E; ++AI) {
5853       Instruction *A = AI->first;
5854       StrideDescriptor DesA = AI->second;
5855 
5856       // Our code motion strategy implies that we can't have dependences
5857       // between accesses in an interleaved group and other accesses located
5858       // between the first and last member of the group. Note that this also
5859       // means that a group can't have more than one member at a given offset.
5860       // The accesses in a group can have dependences with other accesses, but
5861       // we must ensure we don't extend the boundaries of the group such that
5862       // we encompass those dependent accesses.
5863       //
5864       // For example, assume we have the sequence of accesses shown below in a
5865       // stride-2 loop:
5866       //
5867       //  (1, 2) is a group | A[i]   = a;  // (1)
5868       //                    | A[i-1] = b;  // (2) |
5869       //                      A[i-3] = c;  // (3)
5870       //                      A[i]   = d;  // (4) | (2, 4) is not a group
5871       //
5872       // Because accesses (2) and (3) are dependent, we can group (2) with (1)
5873       // but not with (4). If we did, the dependent access (3) would be within
5874       // the boundaries of the (2, 4) group.
5875       if (!canReorderMemAccessesForInterleavedGroups(&*AI, &*BI)) {
5876         // If a dependence exists and A is already in a group, we know that A
5877         // must be a store since A precedes B and WAR dependences are allowed.
5878         // Thus, A would be sunk below B. We release A's group to prevent this
5879         // illegal code motion. A will then be free to form another group with
5880         // instructions that precede it.
5881         if (isInterleaved(A)) {
5882           InterleaveGroup *StoreGroup = getInterleaveGroup(A);
5883           StoreGroups.remove(StoreGroup);
5884           releaseGroup(StoreGroup);
5885         }
5886 
5887         // If a dependence exists and A is not already in a group (or it was
5888         // and we just released it), B might be hoisted above A (if B is a
5889         // load) or another store might be sunk below A (if B is a store). In
5890         // either case, we can't add additional instructions to B's group. B
5891         // will only form a group with instructions that it precedes.
5892         break;
5893       }
5894 
5895       // At this point, we've checked for illegal code motion. If either A or B
5896       // isn't strided, there's nothing left to do.
5897       if (!isStrided(DesA.Stride) || !isStrided(DesB.Stride))
5898         continue;
5899 
      // Ignore A if it's already in a group or isn't the same kind of memory
      // operation as B.
      // Note that mayReadFromMemory() isn't mutually exclusive with
      // mayWriteToMemory() in the case of atomic loads. We shouldn't see those
      // here; canVectorizeMemory() should have returned false - unless we
      // asked for optimization remarks.
5906       if (isInterleaved(A) || (A->mayReadFromMemory() != B->mayReadFromMemory())
5907           || (A->mayWriteToMemory() != B->mayWriteToMemory()))
5908         continue;
5909 
5910       // Check rules 1 and 2. Ignore A if its stride or size is different from
5911       // that of B.
5912       if (DesA.Stride != DesB.Stride || DesA.Size != DesB.Size)
5913         continue;
5914 
      // Ignore A if the memory objects of A and B don't belong to the same
      // address space.
5917       if (getMemInstAddressSpace(A) != getMemInstAddressSpace(B))
5918         continue;
5919 
5920       // Calculate the distance from A to B.
5921       const SCEVConstant *DistToB = dyn_cast<SCEVConstant>(
5922           PSE.getSE()->getMinusSCEV(DesA.Scev, DesB.Scev));
5923       if (!DistToB)
5924         continue;
5925       int64_t DistanceToB = DistToB->getAPInt().getSExtValue();
5926 
5927       // Check rule 3. Ignore A if its distance to B is not a multiple of the
5928       // size.
5929       if (DistanceToB % static_cast<int64_t>(DesB.Size))
5930         continue;
5931 
5932       // Ignore A if either A or B is in a predicated block. Although we
5933       // currently prevent group formation for predicated accesses, we may be
5934       // able to relax this limitation in the future once we handle more
5935       // complicated blocks.
5936       if (isPredicated(A->getParent()) || isPredicated(B->getParent()))
5937         continue;
5938 
5939       // The index of A is the index of B plus A's distance to B in multiples
5940       // of the size.
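      // For example (illustrative), with 4-byte elements and A accessing 8
      // bytes below B, DistanceToB is -8 and A lands two slots before B.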
5941       int IndexA =
5942           Group->getIndex(B) + DistanceToB / static_cast<int64_t>(DesB.Size);
5943 
5944       // Try to insert A into B's group.
5945       if (Group->insertMember(A, IndexA, DesA.Align)) {
5946         DEBUG(dbgs() << "LV: Inserted:" << *A << '\n'
5947                      << "    into the interleave group with" << *B << '\n');
5948         InterleaveGroupMap[A] = Group;
5949 
5950         // Set the first load in program order as the insert position.
5951         if (A->mayReadFromMemory())
5952           Group->setInsertPos(A);
5953       }
5954     } // Iteration over A accesses.
5955   } // Iteration over B accesses.
5956 
5957   // Remove interleaved store groups with gaps.
5958   for (InterleaveGroup *Group : StoreGroups)
5959     if (Group->getNumMembers() != Group->getFactor()) {
5960       DEBUG(dbgs() << "LV: Invalidate candidate interleaved store group due "
5961                       "to gaps.\n");
5962       releaseGroup(Group);
5963     }
5964   // Remove interleaved groups with gaps (currently only loads) whose memory
5965   // accesses may wrap around. We have to revisit the getPtrStride analysis,
5966   // this time with ShouldCheckWrap=true, since collectConstStrideAccesses does
5967   // not check wrapping (see documentation there).
5968   // FORNOW we use Assume=false;
5969   // TODO: Change to Assume=true but making sure we don't exceed the threshold
5970   // of runtime SCEV assumptions checks (thereby potentially failing to
5971   // vectorize altogether).
5972   // Additional optional optimizations:
5973   // TODO: If we are peeling the loop and we know that the first pointer doesn't
5974   // wrap then we can deduce that all pointers in the group don't wrap.
5975   // This means that we can forcefully peel the loop in order to only have to
  // check the first pointer for no-wrap. Once we change to Assume=true, we'll
  // only need at most one runtime check per interleaved group.
5978   for (InterleaveGroup *Group : LoadGroups) {
    // Case 1: A full group. We can skip the checks; for full groups, if the
    // wide load would wrap around the address space we would do a memory
    // access at nullptr even without the transformation.
5982     if (Group->getNumMembers() == Group->getFactor())
5983       continue;
5984 
    // Case 2: If the first and last members of the group don't wrap, this
    // implies that none of the pointers in the group wrap.
    // So we check only group member 0 (which is always guaranteed to exist)
    // and group member Factor - 1; if the latter doesn't exist we rely on
    // peeling (if it is a non-reversed access -- see Case 3).
5990     Value *FirstMemberPtr = getLoadStorePointerOperand(Group->getMember(0));
5991     if (!getPtrStride(PSE, FirstMemberPtr, TheLoop, Strides, /*Assume=*/false,
5992                       /*ShouldCheckWrap=*/true)) {
5993       DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to "
5994                       "first group member potentially pointer-wrapping.\n");
5995       releaseGroup(Group);
5996       continue;
5997     }
5998     Instruction *LastMember = Group->getMember(Group->getFactor() - 1);
5999     if (LastMember) {
6000       Value *LastMemberPtr = getLoadStorePointerOperand(LastMember);
6001       if (!getPtrStride(PSE, LastMemberPtr, TheLoop, Strides, /*Assume=*/false,
6002                         /*ShouldCheckWrap=*/true)) {
6003         DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to "
6004                         "last group member potentially pointer-wrapping.\n");
6005         releaseGroup(Group);
6006       }
6007     } else {
6008       // Case 3: A non-reversed interleaved load group with gaps: We need
6009       // to execute at least one scalar epilogue iteration. This will ensure
6010       // we don't speculatively access memory out-of-bounds. We only need
6011       // to look for a member at index factor - 1, since every group must have
6012       // a member at index zero.
6013       if (Group->isReverse()) {
6014         DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to "
6015                         "a reverse access with gaps.\n");
6016         releaseGroup(Group);
6017         continue;
6018       }
6019       DEBUG(dbgs() << "LV: Interleaved group requires epilogue iteration.\n");
6020       RequiresScalarEpilogue = true;
6021     }
6022   }
6023 }
6024 
6025 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
6026   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do this, since the check is still likely to
    // be dynamically uniform if the target can skip it.
6029     DEBUG(dbgs() << "LV: Not inserting runtime ptr check for divergent target");
6030 
6031     ORE->emit(
6032       createMissedAnalysis("CantVersionLoopWithDivergentTarget")
6033       << "runtime pointer checks needed. Not enabled for divergent target");
6034 
6035     return None;
6036   }
6037 
6038   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
6039   if (!OptForSize) // Remaining checks deal with scalar loop when OptForSize.
6040     return computeFeasibleMaxVF(OptForSize, TC);
6041 
6042   if (Legal->getRuntimePointerChecking()->Need) {
6043     ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize")
6044               << "runtime pointer checks needed. Enable vectorization of this "
6045                  "loop with '#pragma clang loop vectorize(enable)' when "
6046                  "compiling with -Os/-Oz");
6047     DEBUG(dbgs()
6048           << "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n");
6049     return None;
6050   }
6051 
6052   // If we optimize the program for size, avoid creating the tail loop.
6053   DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
6054 
  // If we don't know the precise trip count, or the loop runs only once,
  // don't try to vectorize.
6056   if (TC < 2) {
6057     ORE->emit(
6058         createMissedAnalysis("UnknownLoopCountComplexCFG")
6059         << "unable to calculate the loop count due to complex control flow");
6060     DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
6061     return None;
6062   }
6063 
6064   unsigned MaxVF = computeFeasibleMaxVF(OptForSize, TC);
6065 
6066   if (TC % MaxVF != 0) {
6067     // If the trip count that we found modulo the vectorization factor is not
6068     // zero then we require a tail.
6069     // FIXME: look for a smaller MaxVF that does divide TC rather than give up.
6070     // FIXME: return None if loop requiresScalarEpilog(<MaxVF>), or look for a
6071     //        smaller MaxVF that does not require a scalar epilog.
6072 
6073     ORE->emit(createMissedAnalysis("NoTailLoopWithOptForSize")
6074               << "cannot optimize for size and vectorize at the "
6075                  "same time. Enable vectorization of this loop "
6076                  "with '#pragma clang loop vectorize(enable)' "
6077                  "when compiling with -Os/-Oz");
6078     DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
6079     return None;
6080   }
6081 
6082   return MaxVF;
6083 }
6084 
6085 unsigned
6086 LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize,
6087                                                  unsigned ConstTripCount) {
6088   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
6089   unsigned SmallestType, WidestType;
6090   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
6091   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
6092 
6093   // Get the maximum safe dependence distance in bits computed by LAA.
6094   // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory access that is most restrictive (involved in the smallest
6096   // dependence distance).
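  // For example (illustrative), if the smallest dependence distance LAA found
  // is 16 bytes between i32 accesses, then MaxVF is 4 and
  // MaxSafeRegisterWidth is 4 * 4 * 8 = 128 bits.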
6097   unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();
6098 
6099   WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
6100 
6101   unsigned MaxVectorSize = WidestRegister / WidestType;
6102 
6103   DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType << " / "
6104                << WidestType << " bits.\n");
6105   DEBUG(dbgs() << "LV: The Widest register safe to use is: " << WidestRegister
6106                << " bits.\n");
6107 
6108   assert(MaxVectorSize <= 64 && "Did not expect to pack so many elements"
6109                                 " into one vector!");
6110   if (MaxVectorSize == 0) {
6111     DEBUG(dbgs() << "LV: The target has no vector registers.\n");
6112     MaxVectorSize = 1;
6113     return MaxVectorSize;
6114   } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
6115              isPowerOf2_32(ConstTripCount)) {
    // We need to clamp the VF to the ConstTripCount. There is no point in
    // choosing a higher viable VF as done in the loop below.
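    // For example (illustrative), with a constant trip count of 4 and 8-lane
    // registers, the MaxVF is clamped down to 4.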
6118     DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
6119                  << ConstTripCount << "\n");
6120     MaxVectorSize = ConstTripCount;
6121     return MaxVectorSize;
6122   }
6123 
6124   unsigned MaxVF = MaxVectorSize;
6125   if (TTI.shouldMaximizeVectorBandwidth(OptForSize) ||
6126       (MaximizeBandwidth && !OptForSize)) {
6127     // Collect all viable vectorization factors larger than the default MaxVF
6128     // (i.e. MaxVectorSize).
6129     SmallVector<unsigned, 8> VFs;
6130     unsigned NewMaxVectorSize = WidestRegister / SmallestType;
6131     for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
6132       VFs.push_back(VS);
6133 
6134     // For each VF calculate its register usage.
6135     auto RUs = calculateRegisterUsage(VFs);
6136 
6137     // Select the largest VF which doesn't require more registers than existing
6138     // ones.
6139     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true);
6140     for (int i = RUs.size() - 1; i >= 0; --i) {
6141       if (RUs[i].MaxLocalUsers <= TargetNumRegisters) {
6142         MaxVF = VFs[i];
6143         break;
6144       }
6145     }
6146     if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
6147       if (MaxVF < MinVF) {
6148         DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
6149                      << ") with target's minimum: " << MinVF << '\n');
6150         MaxVF = MinVF;
6151       }
6152     }
6153   }
6154   return MaxVF;
6155 }
6156 
6157 VectorizationFactor
6158 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
6159   float Cost = expectedCost(1).first;
6160   const float ScalarCost = Cost;
6161   unsigned Width = 1;
6162   DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
6163 
6164   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
6165   // Ignore scalar width, because the user explicitly wants vectorization.
6166   if (ForceVectorization && MaxVF > 1) {
6167     Width = 2;
6168     Cost = expectedCost(Width).first / (float)Width;
6169   }
6170 
6171   for (unsigned i = 2; i <= MaxVF; i *= 2) {
    // Notice that the vector loop needs to be executed fewer times, so
    // we need to divide the cost of the vector loop by the width of
    // the vector elements.
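    // For example (illustrative), if the VF=4 body costs 20 while the scalar
    // body costs 8, the per-lane vector cost is 20 / 4 = 5, which beats 8.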
6175     VectorizationCostTy C = expectedCost(i);
6176     float VectorCost = C.first / (float)i;
6177     DEBUG(dbgs() << "LV: Vector loop of width " << i
6178                  << " costs: " << (int)VectorCost << ".\n");
6179     if (!C.second && !ForceVectorization) {
6180       DEBUG(
6181           dbgs() << "LV: Not considering vector loop of width " << i
6182                  << " because it will not generate any vector instructions.\n");
6183       continue;
6184     }
6185     if (VectorCost < Cost) {
6186       Cost = VectorCost;
6187       Width = i;
6188     }
6189   }
6190 
6191   if (!EnableCondStoresVectorization && NumPredStores) {
6192     ORE->emit(createMissedAnalysis("ConditionalStore")
6193               << "store that is conditionally executed prevents vectorization");
6194     DEBUG(dbgs() << "LV: No vectorization. There are conditional stores.\n");
6195     Width = 1;
6196     Cost = ScalarCost;
6197   }
6198 
6199   DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
6200         << "LV: Vectorization seems to be not beneficial, "
6201         << "but was forced by a user.\n");
6202   DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
6203   VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)};
6204   return Factor;
6205 }
6206 
6207 std::pair<unsigned, unsigned>
6208 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
6209   unsigned MinWidth = -1U;
6210   unsigned MaxWidth = 8;
6211   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
6212 
6213   // For each block.
6214   for (BasicBlock *BB : TheLoop->blocks()) {
6215     // For each instruction in the loop.
6216     for (Instruction &I : *BB) {
6217       Type *T = I.getType();
6218 
6219       // Skip ignored values.
6220       if (ValuesToIgnore.count(&I))
6221         continue;
6222 
6223       // Only examine Loads, Stores and PHINodes.
6224       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
6225         continue;
6226 
6227       // Examine PHI nodes that are reduction variables. Update the type to
6228       // account for the recurrence type.
6229       if (auto *PN = dyn_cast<PHINode>(&I)) {
6230         if (!Legal->isReductionVariable(PN))
6231           continue;
6232         RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN];
6233         T = RdxDesc.getRecurrenceType();
6234       }
6235 
6236       // Examine the stored values.
6237       if (auto *ST = dyn_cast<StoreInst>(&I))
6238         T = ST->getValueOperand()->getType();
6239 
6240       // Ignore loaded pointer types and stored pointer types that are not
6241       // vectorizable.
6242       //
6243       // FIXME: The check here attempts to predict whether a load or store will
6244       //        be vectorized. We only know this for certain after a VF has
6245       //        been selected. Here, we assume that if an access can be
6246       //        vectorized, it will be. We should also look at extending this
6247       //        optimization to non-pointer types.
6248       //
6249       if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
6250           !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
6251         continue;
6252 
6253       MinWidth = std::min(MinWidth,
6254                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
6255       MaxWidth = std::max(MaxWidth,
6256                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
6257     }
6258   }
6259 
6260   return {MinWidth, MaxWidth};
6261 }
6262 
6263 unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
6264                                                            unsigned VF,
6265                                                            unsigned LoopCost) {
6266   // -- The interleave heuristics --
6267   // We interleave the loop in order to expose ILP and reduce the loop overhead.
6268   // There are many micro-architectural considerations that we can't predict
6269   // at this level. For example, frontend pressure (on decode or fetch) due to
6270   // code size, or the number and capabilities of the execution ports.
6271   //
6272   // We use the following heuristics to select the interleave count:
6273   // 1. If the code has reductions, then we interleave to break the cross
6274   // iteration dependency.
6275   // 2. If the loop is really small, then we interleave to reduce the loop
6276   // overhead.
6277   // 3. We don't interleave if we think that we will spill registers to memory
6278   // due to the increased register pressure.
6279 
6280   // When we optimize for size, we don't interleave.
6281   if (OptForSize)
6282     return 1;
6283 
6284   // The max safe dependence distance already limits concurrency; do not interleave.
6285   if (Legal->getMaxSafeDepDistBytes() != -1U)
6286     return 1;
6287 
6288   // Do not interleave loops with a relatively small trip count.
6289   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
6290   if (TC > 1 && TC < TinyTripCountInterleaveThreshold)
6291     return 1;
6292 
6293   unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1);
6294   DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
6295                << " registers\n");
6296 
6297   if (VF == 1) {
6298     if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
6299       TargetNumRegisters = ForceTargetNumScalarRegs;
6300   } else {
6301     if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
6302       TargetNumRegisters = ForceTargetNumVectorRegs;
6303   }
6304 
6305   RegisterUsage R = calculateRegisterUsage({VF})[0];
6306   // We divide by MaxLocalUsers below, so assume that we have at least one
6307   // instruction that uses at least one register.
6308   R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);
6309 
6310   // We calculate the interleave count using the following formula.
6311   // Subtract the number of loop invariants from the number of available
6312   // registers. These registers are used by all of the interleaved instances.
6313   // Next, divide the remaining registers by the number of registers that is
6314   // required by the loop, in order to estimate how many parallel instances
6315   // fit without causing spills. All of this is rounded down if necessary to be
6316   // a power of two. We want a power-of-two interleave count to simplify any
6317   // addressing operations or alignment considerations.
6318   unsigned IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) /
6319                               R.MaxLocalUsers);
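       // For example (illustrative numbers), with TargetNumRegisters == 16,
       // R.LoopInvariantRegs == 2 and R.MaxLocalUsers == 3, this yields
       // IC == PowerOf2Floor(14 / 3) == 4.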
6320 
6321   // Don't count the induction variable as interleaved.
6322   if (EnableIndVarRegisterHeur)
6323     IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) /
6324                        std::max(1U, (R.MaxLocalUsers - 1)));
6325 
6326   // Clamp the interleave ranges to reasonable counts.
6327   unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
6328 
6329   // Check if the user has overridden the max.
6330   if (VF == 1) {
6331     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
6332       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
6333   } else {
6334     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
6335       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
6336   }
6337 
6338   // If we did not calculate the cost for VF (because the user selected the VF)
6339   // then we calculate the cost of VF here.
6340   if (LoopCost == 0)
6341     LoopCost = expectedCost(VF).first;
6342 
6343   // Clamp the calculated IC to be between 1 and the maximum interleave count
6344   // that the target allows.
6345   if (IC > MaxInterleaveCount)
6346     IC = MaxInterleaveCount;
6347   else if (IC < 1)
6348     IC = 1;
6349 
6350   // Interleave if we vectorized this loop and there is a reduction that could
6351   // benefit from interleaving.
6352   if (VF > 1 && !Legal->getReductionVars()->empty()) {
6353     DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
6354     return IC;
6355   }
6356 
6357   // Note that if we've already vectorized the loop we will have done the
6358   // runtime check and so interleaving won't require further checks.
6359   bool InterleavingRequiresRuntimePointerCheck =
6360       (VF == 1 && Legal->getRuntimePointerChecking()->Need);
6361 
6362   // We want to interleave small loops in order to reduce the loop overhead and
6363   // potentially expose ILP opportunities.
6364   DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
6365   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
6366     // We assume that the loop overhead cost is 1. We use the cost model to
6367     // estimate the cost of the loop body and interleave until the loop
6368     // overhead is about 5% of the total cost of the loop.
6369     unsigned SmallIC =
6370         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
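         // For example (illustrative numbers), if SmallLoopCost / LoopCost == 5,
         // SmallIC is at most PowerOf2Floor(5) == 4, further capped by IC.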
6371 
6372     // Interleave until store/load ports (estimated by max interleave count) are
6373     // saturated.
6374     unsigned NumStores = Legal->getNumStores();
6375     unsigned NumLoads = Legal->getNumLoads();
6376     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
6377     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
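         // For example (illustrative numbers), with IC == 8, two stores and one
         // load in the loop, StoresIC == 4 and LoadsIC == 8.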
6378 
6379     // If we have a scalar reduction (vector reductions are already dealt with
6380     // by this point), we can increase the critical path length if the loop
6381     // we're interleaving is inside another loop. Limit it, by default, to 2 so
6382     // that the critical path is only increased by one reduction operation.
6383     if (!Legal->getReductionVars()->empty() && TheLoop->getLoopDepth() > 1) {
6384       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
6385       SmallIC = std::min(SmallIC, F);
6386       StoresIC = std::min(StoresIC, F);
6387       LoadsIC = std::min(LoadsIC, F);
6388     }
6389 
6390     if (EnableLoadStoreRuntimeInterleave &&
6391         std::max(StoresIC, LoadsIC) > SmallIC) {
6392       DEBUG(dbgs() << "LV: Interleaving to saturate store or load ports.\n");
6393       return std::max(StoresIC, LoadsIC);
6394     }
6395 
6396     DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
6397     return SmallIC;
6398   }
6399 
6400   // Interleave if this is a large loop (small loops are already dealt with by
6401   // this point) that could benefit from interleaving.
6402   bool HasReductions = !Legal->getReductionVars()->empty();
6403   if (TTI.enableAggressiveInterleaving(HasReductions)) {
6404     DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
6405     return IC;
6406   }
6407 
6408   DEBUG(dbgs() << "LV: Not Interleaving.\n");
6409   return 1;
6410 }
6411 
6412 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
6413 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
6414   // This function calculates the register usage by measuring the highest number
6415   // of values that are alive at a single location. Obviously, this is a very
6416   // rough estimation. We scan the loop in a topological order and
6417   // assign a number to each instruction. We use RPO to ensure that defs are
6418   // met before their users. We assume that each instruction that has in-loop
6419   // users starts an interval. We record every time that an in-loop value is
6420   // used, so we have a list of the first and last occurrences of each
6421   // instruction. Next, we transpose this data structure into a multi map that
6422   // holds the list of intervals that *end* at a specific location. This multi
6423   // map allows us to perform a linear search. We scan the instructions linearly
6424   // and record each time that a new interval starts, by placing it in a set.
6425   // If we find this value in the multi-map then we remove it from the set.
6426   // The max register usage is the maximum size of the set.
6427   // We also search for instructions that are defined outside the loop, but are
6428   // used inside the loop. We need this number separately from the max-interval
6429   // usage number because when we unroll, loop-invariant values do not take
6430   // more registers.
6431   LoopBlocksDFS DFS(TheLoop);
6432   DFS.perform(LI);
6433 
6434   RegisterUsage RU;
6435 
6436   // Each 'key' in the map opens a new interval. The values
6437   // of the map are the index of the 'last seen' usage of the
6438   // instruction that is the key.
6439   using IntervalMap = DenseMap<Instruction *, unsigned>;
6440 
6441   // Maps each instruction index to the corresponding instruction.
6442   DenseMap<unsigned, Instruction *> IdxToInstr;
6443   // Marks the end of each interval.
6444   IntervalMap EndPoint;
6445   // Saves the set of instructions that are used within the loop.
6446   SmallSet<Instruction *, 8> Ends;
6447   // Saves the list of values that are used in the loop but are
6448   // defined outside the loop, such as arguments and constants.
6449   SmallPtrSet<Value *, 8> LoopInvariants;
6450 
6451   unsigned Index = 0;
6452   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
6453     for (Instruction &I : *BB) {
6454       IdxToInstr[Index++] = &I;
6455 
6456       // Save the end location of each USE.
6457       for (Value *U : I.operands()) {
6458         auto *Instr = dyn_cast<Instruction>(U);
6459 
6460         // Ignore non-instruction values such as arguments, constants, etc.
6461         if (!Instr)
6462           continue;
6463 
6464         // If this instruction is outside the loop then record it and continue.
6465         if (!TheLoop->contains(Instr)) {
6466           LoopInvariants.insert(Instr);
6467           continue;
6468         }
6469 
6470         // Overwrite previous end points.
6471         EndPoint[Instr] = Index;
6472         Ends.insert(Instr);
6473       }
6474     }
6475   }
6476 
6477   // Saves the list of intervals that end with the index in 'key'.
6478   using InstrList = SmallVector<Instruction *, 2>;
6479   DenseMap<unsigned, InstrList> TransposeEnds;
6480 
6481   // Transpose the EndPoints to a list of values that end at each index.
6482   for (auto &Interval : EndPoint)
6483     TransposeEnds[Interval.second].push_back(Interval.first);
6484 
6485   SmallSet<Instruction *, 8> OpenIntervals;
6486 
6487   // Get the size of the widest register.
6488   unsigned MaxSafeDepDist = -1U;
6489   if (Legal->getMaxSafeDepDistBytes() != -1U)
6490     MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
6491   unsigned WidestRegister =
6492       std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
6493   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
6494 
6495   SmallVector<RegisterUsage, 8> RUs(VFs.size());
6496   SmallVector<unsigned, 8> MaxUsages(VFs.size(), 0);
6497 
6498   DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
6499 
6500   // A lambda that gets the register usage for the given type and VF.
6501   auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
6502     if (Ty->isTokenTy())
6503       return 0U;
6504     unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
6505     return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
6506   };
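       // For example (illustrative numbers), an i32 value at VF == 8 with a
       // 128-bit widest register needs max(1, 8 * 32 / 128) == 2 registers.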
6507 
6508   for (unsigned int i = 0; i < Index; ++i) {
6509     Instruction *I = IdxToInstr[i];
6510 
6511     // Remove all of the instructions that end at this location.
6512     InstrList &List = TransposeEnds[i];
6513     for (Instruction *ToRemove : List)
6514       OpenIntervals.erase(ToRemove);
6515 
6516     // Ignore instructions that are never used within the loop.
6517     if (!Ends.count(I))
6518       continue;
6519 
6520     // Skip ignored values.
6521     if (ValuesToIgnore.count(I))
6522       continue;
6523 
6524     // For each VF find the maximum usage of registers.
6525     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
6526       if (VFs[j] == 1) {
6527         MaxUsages[j] = std::max(MaxUsages[j], OpenIntervals.size());
6528         continue;
6529       }
6530       collectUniformsAndScalars(VFs[j]);
6531       // Count the number of live intervals.
6532       unsigned RegUsage = 0;
6533       for (auto Inst : OpenIntervals) {
6534         // Skip ignored values for VF > 1.
6535         if (VecValuesToIgnore.count(Inst) ||
6536             isScalarAfterVectorization(Inst, VFs[j]))
6537           continue;
6538         RegUsage += GetRegUsage(Inst->getType(), VFs[j]);
6539       }
6540       MaxUsages[j] = std::max(MaxUsages[j], RegUsage);
6541     }
6542 
6543     DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
6544                  << OpenIntervals.size() << '\n');
6545 
6546     // Add the current instruction to the list of open intervals.
6547     OpenIntervals.insert(I);
6548   }
6549 
6550   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
6551     unsigned Invariant = 0;
6552     if (VFs[i] == 1)
6553       Invariant = LoopInvariants.size();
6554     else {
6555       for (auto Inst : LoopInvariants)
6556         Invariant += GetRegUsage(Inst->getType(), VFs[i]);
6557     }
6558 
6559     DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n');
6560     DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n');
6561     DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n');
6562 
6563     RU.LoopInvariantRegs = Invariant;
6564     RU.MaxLocalUsers = MaxUsages[i];
6565     RUs[i] = RU;
6566   }
6567 
6568   return RUs;
6569 }
6570 
6571 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I) {
6572   // TODO: Cost model for emulated masked load/store is completely
6573   // broken. This hack guides the cost model to use an artificially
6574   // high enough value to practically disable vectorization with such
6575   // operations, except where previously deployed legality hack allowed
6576   // using very low cost values. This is to avoid regressions coming simply
6577   // from moving the "masked load/store" check from legality to the cost model.
6578   // Masked load/gather emulation was previously never allowed.
6579   // Only a limited amount of masked store/scatter emulation was allowed.
6580   assert(isScalarWithPredication(I) &&
6581          "Expecting a scalar emulated instruction");
6582   return isa<LoadInst>(I) ||
6583          (isa<StoreInst>(I) &&
6584           NumPredStores > NumberOfStoresToPredicate);
6585 }
6586 
6587 void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
6588   // If we aren't vectorizing the loop, or if we've already collected the
6589   // instructions to scalarize, there's nothing to do. Collection may already
6590   // have occurred if we have a user-selected VF and are now computing the
6591   // expected cost for interleaving.
6592   if (VF < 2 || InstsToScalarize.count(VF))
6593     return;
6594 
6595   // Initialize a mapping for VF in InstsToScalarize. If we find that it's
6596   // not profitable to scalarize any instructions, the presence of VF in the
6597   // map will indicate that we've analyzed it already.
6598   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
6599 
6600   // Find all the instructions that are scalar with predication in the loop and
6601   // determine if it would be better to not if-convert the blocks they are in.
6602   // If so, we also record the instructions to scalarize.
6603   for (BasicBlock *BB : TheLoop->blocks()) {
6604     if (!Legal->blockNeedsPredication(BB))
6605       continue;
6606     for (Instruction &I : *BB)
6607       if (isScalarWithPredication(&I)) {
6608         ScalarCostsTy ScalarCosts;
6609         // Do not apply discount logic if hacked cost is needed
6610         // for emulated masked memrefs.
6611         if (!useEmulatedMaskMemRefHack(&I) &&
6612             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
6613           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
6614         // Remember that BB will remain after vectorization.
6615         PredicatedBBsAfterVectorization.insert(BB);
6616       }
6617   }
6618 }
6619 
6620 int LoopVectorizationCostModel::computePredInstDiscount(
6621     Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
6622     unsigned VF) {
6623   assert(!isUniformAfterVectorization(PredInst, VF) &&
6624          "Instruction marked uniform-after-vectorization will be predicated");
6625 
6626   // Initialize the discount to zero, meaning that the scalar version and the
6627   // vector version cost the same.
6628   int Discount = 0;
6629 
6630   // Holds instructions to analyze. The instructions we visit are mapped in
6631   // ScalarCosts. Those instructions are the ones that would be scalarized if
6632   // we find that the scalar version costs less.
6633   SmallVector<Instruction *, 8> Worklist;
6634 
6635   // Returns true if the given instruction can be scalarized.
6636   auto canBeScalarized = [&](Instruction *I) -> bool {
6637     // We only attempt to scalarize instructions forming a single-use chain
6638     // from the original predicated block that would otherwise be vectorized.
6639     // Although not strictly necessary, we give up on instructions we know will
6640     // already be scalar to avoid traversing chains that are unlikely to be
6641     // beneficial.
6642     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
6643         isScalarAfterVectorization(I, VF))
6644       return false;
6645 
6646     // If the instruction is scalar with predication, it will be analyzed
6647     // separately. We ignore it within the context of PredInst.
6648     if (isScalarWithPredication(I))
6649       return false;
6650 
6651     // If any of the instruction's operands are uniform after vectorization,
6652     // the instruction cannot be scalarized. This prevents, for example, a
6653     // masked load from being scalarized.
6654     //
6655     // We assume we will only emit a value for lane zero of an instruction
6656     // marked uniform after vectorization, rather than VF identical values.
6657     // Thus, if we scalarize an instruction that uses a uniform, we would
6658     // create uses of values corresponding to the lanes we aren't emitting code
6659     // for. This behavior can be changed by allowing getScalarValue to clone
6660     // the lane zero values for uniforms rather than asserting.
6661     for (Use &U : I->operands())
6662       if (auto *J = dyn_cast<Instruction>(U.get()))
6663         if (isUniformAfterVectorization(J, VF))
6664           return false;
6665 
6666     // Otherwise, we can scalarize the instruction.
6667     return true;
6668   };
6669 
6670   // Returns true if an operand that cannot be scalarized must be extracted
6671   // from a vector. We will account for this scalarization overhead below. Note
6672   // that the non-void predicated instructions are placed in their own blocks,
6673   // and their return values are inserted into vectors. Thus, an extract would
6674   // still be required.
6675   auto needsExtract = [&](Instruction *I) -> bool {
6676     return TheLoop->contains(I) && !isScalarAfterVectorization(I, VF);
6677   };
6678 
6679   // Compute the expected cost discount from scalarizing the entire expression
6680   // feeding the predicated instruction. We currently only consider expressions
6681   // that are single-use instruction chains.
6682   Worklist.push_back(PredInst);
6683   while (!Worklist.empty()) {
6684     Instruction *I = Worklist.pop_back_val();
6685 
6686     // If we've already analyzed the instruction, there's nothing to do.
6687     if (ScalarCosts.count(I))
6688       continue;
6689 
6690     // Compute the cost of the vector instruction. Note that this cost already
6691     // includes the scalarization overhead of the predicated instruction.
6692     unsigned VectorCost = getInstructionCost(I, VF).first;
6693 
6694     // Compute the cost of the scalarized instruction. This cost is the cost of
6695     // the instruction as if it wasn't if-converted and instead remained in the
6696     // predicated block. We will scale this cost by block probability after
6697     // computing the scalarization overhead.
6698     unsigned ScalarCost = VF * getInstructionCost(I, 1).first;
6699 
6700     // Compute the scalarization overhead of needed insertelement instructions
6701     // and phi nodes.
6702     if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
6703       ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF),
6704                                                  true, false);
6705       ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
6706     }
6707 
6708     // Compute the scalarization overhead of needed extractelement
6709     // instructions. For each of the instruction's operands, if the operand can
6710     // be scalarized, add it to the worklist; otherwise, account for the
6711     // overhead.
6712     for (Use &U : I->operands())
6713       if (auto *J = dyn_cast<Instruction>(U.get())) {
6714         assert(VectorType::isValidElementType(J->getType()) &&
6715                "Instruction has non-scalar type");
6716         if (canBeScalarized(J))
6717           Worklist.push_back(J);
6718         else if (needsExtract(J))
6719           ScalarCost += TTI.getScalarizationOverhead(
6720               ToVectorTy(J->getType(), VF), false, true);
6721       }
6722 
6723     // Scale the total scalar cost by block probability.
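         // getReciprocalPredBlockProb() returns the reciprocal of the estimated
         // probability that the predicated block executes, so dividing by it
         // models the block not running on every iteration.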
6724     ScalarCost /= getReciprocalPredBlockProb();
6725 
6726     // Compute the discount. A non-negative discount means the vector version
6727     // of the instruction costs more, and scalarizing would be beneficial.
6728     Discount += VectorCost - ScalarCost;
6729     ScalarCosts[I] = ScalarCost;
6730   }
6731 
6732   return Discount;
6733 }
6734 
6735 LoopVectorizationCostModel::VectorizationCostTy
6736 LoopVectorizationCostModel::expectedCost(unsigned VF) {
6737   VectorizationCostTy Cost;
6738 
6739   // For each block.
6740   for (BasicBlock *BB : TheLoop->blocks()) {
6741     VectorizationCostTy BlockCost;
6742 
6743     // For each instruction in the old loop.
6744     for (Instruction &I : *BB) {
6745       // Skip dbg intrinsics.
6746       if (isa<DbgInfoIntrinsic>(I))
6747         continue;
6748 
6749       // Skip ignored values.
6750       if (ValuesToIgnore.count(&I) ||
6751           (VF > 1 && VecValuesToIgnore.count(&I)))
6752         continue;
6753 
6754       VectorizationCostTy C = getInstructionCost(&I, VF);
6755 
6756       // Check if we should override the cost.
6757       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
6758         C.first = ForceTargetInstructionCost;
6759 
6760       BlockCost.first += C.first;
6761       BlockCost.second |= C.second;
6762       DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first << " for VF "
6763                    << VF << " For instruction: " << I << '\n');
6764     }
6765 
6766     // If we are vectorizing a predicated block, it will have been
6767     // if-converted. This means that the block's instructions (aside from
6768     // stores and instructions that may divide by zero) will now be
6769     // unconditionally executed. For the scalar case, we may not always execute
6770     // the predicated block. Thus, scale the block's cost by the probability of
6771     // executing it.
6772     if (VF == 1 && Legal->blockNeedsPredication(BB))
6773       BlockCost.first /= getReciprocalPredBlockProb();
6774 
6775     Cost.first += BlockCost.first;
6776     Cost.second |= BlockCost.second;
6777   }
6778 
6779   return Cost;
6780 }
6781 
6782 /// \brief Gets Address Access SCEV after verifying that the access pattern
6783 /// is loop invariant except the induction variable dependence.
6784 ///
6785 /// This SCEV can be sent to the Target in order to estimate the address
6786 /// calculation cost.
6787 static const SCEV *getAddressAccessSCEV(
6788               Value *Ptr,
6789               LoopVectorizationLegality *Legal,
6790               PredicatedScalarEvolution &PSE,
6791               const Loop *TheLoop) {
6792 
6793   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
6794   if (!Gep)
6795     return nullptr;
6796 
6797   // We are looking for a gep with all loop invariant indices except for one
6798   // which should be an induction variable.
6799   auto SE = PSE.getSE();
6800   unsigned NumOperands = Gep->getNumOperands();
6801   for (unsigned i = 1; i < NumOperands; ++i) {
6802     Value *Opd = Gep->getOperand(i);
6803     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
6804         !Legal->isInductionVariable(Opd))
6805       return nullptr;
6806   }
6807 
6808   // Now we know we have a GEP of the form (ptr, %inv, %ind, %inv). Return the SCEV of Ptr.
6809   return PSE.getSCEV(Ptr);
6810 }
6811 
6812 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
6813   return Legal->hasStride(I->getOperand(0)) ||
6814          Legal->hasStride(I->getOperand(1));
6815 }
6816 
6817 unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
6818                                                                  unsigned VF) {
6819   Type *ValTy = getMemInstValueType(I);
6820   auto SE = PSE.getSE();
6821 
6822   unsigned Alignment = getMemInstAlignment(I);
6823   unsigned AS = getMemInstAddressSpace(I);
6824   Value *Ptr = getLoadStorePointerOperand(I);
6825   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
6826 
6827   // Figure out whether the access is strided and get the stride value
6828   // if it's known at compile time.
6829   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
6830 
6831   // Get the cost of the scalar memory instruction and address computation.
6832   unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
6833 
6834   Cost += VF *
6835           TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
6836                               AS, I);
6837 
6838   // Get the overhead of the extractelement and insertelement instructions
6839   // we might create due to scalarization.
6840   Cost += getScalarizationOverhead(I, VF, TTI);
6841 
6842   // If we have a predicated store, it may not be executed for each vector
6843   // lane. Scale the cost by the probability of executing the predicated
6844   // block.
6845   if (isScalarWithPredication(I)) {
6846     Cost /= getReciprocalPredBlockProb();
6847 
6848     if (useEmulatedMaskMemRefHack(I))
6849       // Artificially setting to a high enough value to practically disable
6850       // vectorization with such operations.
6851       Cost = 3000000;
6852   }
6853 
6854   return Cost;
6855 }
6856 
6857 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
6858                                                              unsigned VF) {
6859   Type *ValTy = getMemInstValueType(I);
6860   Type *VectorTy = ToVectorTy(ValTy, VF);
6861   unsigned Alignment = getMemInstAlignment(I);
6862   Value *Ptr = getLoadStorePointerOperand(I);
6863   unsigned AS = getMemInstAddressSpace(I);
6864   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
6865 
6866   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6867          "Stride should be 1 or -1 for consecutive memory access");
6868   unsigned Cost = 0;
6869   if (Legal->isMaskRequired(I))
6870     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
6871   else
6872     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I);
6873 
6874   bool Reverse = ConsecutiveStride < 0;
6875   if (Reverse)
6876     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
6877   return Cost;
6878 }
6879 
6880 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
6881                                                          unsigned VF) {
6882   LoadInst *LI = cast<LoadInst>(I);
6883   Type *ValTy = LI->getType();
6884   Type *VectorTy = ToVectorTy(ValTy, VF);
6885   unsigned Alignment = LI->getAlignment();
6886   unsigned AS = LI->getPointerAddressSpace();
6887 
6888   return TTI.getAddressComputationCost(ValTy) +
6889          TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) +
6890          TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
6891 }
6892 
6893 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6894                                                           unsigned VF) {
6895   Type *ValTy = getMemInstValueType(I);
6896   Type *VectorTy = ToVectorTy(ValTy, VF);
6897   unsigned Alignment = getMemInstAlignment(I);
6898   Value *Ptr = getLoadStorePointerOperand(I);
6899 
6900   return TTI.getAddressComputationCost(VectorTy) +
6901          TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
6902                                     Legal->isMaskRequired(I), Alignment);
6903 }
6904 
6905 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6906                                                             unsigned VF) {
6907   Type *ValTy = getMemInstValueType(I);
6908   Type *VectorTy = ToVectorTy(ValTy, VF);
6909   unsigned AS = getMemInstAddressSpace(I);
6910 
6911   auto Group = getInterleavedAccessGroup(I);
6912   assert(Group && "Failed to get an interleaved access group.");
6913 
6914   unsigned InterleaveFactor = Group->getFactor();
6915   Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
6916 
6917   // Holds the indices of existing members in an interleaved load group.
6918   // An interleaved store group doesn't need this as it doesn't allow gaps.
6919   SmallVector<unsigned, 4> Indices;
6920   if (isa<LoadInst>(I)) {
6921     for (unsigned i = 0; i < InterleaveFactor; i++)
6922       if (Group->getMember(i))
6923         Indices.push_back(i);
6924   }
6925 
6926   // Calculate the cost of the whole interleaved group.
6927   unsigned Cost = TTI.getInterleavedMemoryOpCost(I->getOpcode(), WideVecTy,
6928                                                  Group->getFactor(), Indices,
6929                                                  Group->getAlignment(), AS);
6930 
6931   if (Group->isReverse())
6932     Cost += Group->getNumMembers() *
6933             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
6934   return Cost;
6935 }
6936 
6937 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
6938                                                               unsigned VF) {
6939   // Calculate scalar cost only. Vectorization cost should be ready at this
6940   // moment.
6941   if (VF == 1) {
6942     Type *ValTy = getMemInstValueType(I);
6943     unsigned Alignment = getMemInstAlignment(I);
6944     unsigned AS = getMemInstAddressSpace(I);
6945 
6946     return TTI.getAddressComputationCost(ValTy) +
6947            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I);
6948   }
6949   return getWideningCost(I, VF);
6950 }
6951 
6952 LoopVectorizationCostModel::VectorizationCostTy
6953 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
6954   // If we know that this instruction will remain uniform, check the cost of
6955   // the scalar version.
6956   if (isUniformAfterVectorization(I, VF))
6957     VF = 1;
6958 
6959   if (VF > 1 && isProfitableToScalarize(I, VF))
6960     return VectorizationCostTy(InstsToScalarize[VF][I], false);
6961 
6962   // Forced scalars do not have any scalarization overhead.
6963   if (VF > 1 && ForcedScalars.count(VF) &&
6964       ForcedScalars.find(VF)->second.count(I))
6965     return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false);
6966 
6967   Type *VectorTy;
6968   unsigned C = getInstructionCost(I, VF, VectorTy);
6969 
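       // For a vector type, the result counts as scalarized when the target must
       // split it into at least VF parts, i.e. each part effectively holds a
       // single element.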
6970   bool TypeNotScalarized =
6971       VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF;
6972   return VectorizationCostTy(C, TypeNotScalarized);
6973 }
6974 
6975 void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
6976   if (VF == 1)
6977     return;
6978   NumPredStores = 0;
6979   for (BasicBlock *BB : TheLoop->blocks()) {
6980     // For each instruction in the old loop.
6981     for (Instruction &I : *BB) {
6982       Value *Ptr = getLoadStorePointerOperand(&I);
6983       if (!Ptr)
6984         continue;
6985 
6986       if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
6987         NumPredStores++;
6988       if (isa<LoadInst>(&I) && Legal->isUniform(Ptr)) {
6989         // Scalar load + broadcast
6990         unsigned Cost = getUniformMemOpCost(&I, VF);
6991         setWideningDecision(&I, VF, CM_Scalarize, Cost);
6992         continue;
6993       }
6994 
6995       // We assume that widening is the best solution when possible.
6996       if (memoryInstructionCanBeWidened(&I, VF)) {
6997         unsigned Cost = getConsecutiveMemOpCost(&I, VF);
6998         int ConsecutiveStride =
6999                Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
7000         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
7001                "Expected consecutive stride.");
7002         InstWidening Decision =
7003             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
7004         setWideningDecision(&I, VF, Decision, Cost);
7005         continue;
7006       }
7007 
7008       // Choose between Interleaving, Gather/Scatter or Scalarization.
7009       unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
7010       unsigned NumAccesses = 1;
7011       if (isAccessInterleaved(&I)) {
7012         auto Group = getInterleavedAccessGroup(&I);
7013         assert(Group && "Failed to get an interleaved access group.");
7014 
7015         // Make one decision for the whole group.
7016         if (getWideningDecision(&I, VF) != CM_Unknown)
7017           continue;
7018 
7019         NumAccesses = Group->getNumMembers();
7020         InterleaveCost = getInterleaveGroupCost(&I, VF);
7021       }
7022 
7023       unsigned GatherScatterCost =
7024           isLegalGatherOrScatter(&I)
7025               ? getGatherScatterCost(&I, VF) * NumAccesses
7026               : std::numeric_limits<unsigned>::max();
7027 
7028       unsigned ScalarizationCost =
7029           getMemInstScalarizationCost(&I, VF) * NumAccesses;
7030 
7031       // Choose better solution for the current VF,
7032       // write down this decision and use it during vectorization.
7033       unsigned Cost;
7034       InstWidening Decision;
7035       if (InterleaveCost <= GatherScatterCost &&
7036           InterleaveCost < ScalarizationCost) {
7037         Decision = CM_Interleave;
7038         Cost = InterleaveCost;
7039       } else if (GatherScatterCost < ScalarizationCost) {
7040         Decision = CM_GatherScatter;
7041         Cost = GatherScatterCost;
7042       } else {
7043         Decision = CM_Scalarize;
7044         Cost = ScalarizationCost;
7045       }
7046       // If the instruction belongs to an interleave group, the whole group
7047       // receives the same decision. The whole group receives the cost, but
7048       // the cost will actually be assigned to one instruction.
7049       if (auto Group = getInterleavedAccessGroup(&I))
7050         setWideningDecision(Group, VF, Decision, Cost);
7051       else
7052         setWideningDecision(&I, VF, Decision, Cost);
7053     }
7054   }
7055 
7056   // Make sure that any load of an address and any other address computation
7057   // remain scalar unless there is gather/scatter support. This avoids
7058   // inevitable extracts into address registers, and also has the benefit of
7059   // activating LSR more, since that pass can't optimize vectorized
7060   // addresses.
7061   if (TTI.prefersVectorizedAddressing())
7062     return;
7063 
7064   // Start with all scalar pointer uses.
7065   SmallPtrSet<Instruction *, 8> AddrDefs;
7066   for (BasicBlock *BB : TheLoop->blocks())
7067     for (Instruction &I : *BB) {
7068       Instruction *PtrDef =
7069         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
7070       if (PtrDef && TheLoop->contains(PtrDef) &&
7071           getWideningDecision(&I, VF) != CM_GatherScatter)
7072         AddrDefs.insert(PtrDef);
7073     }
7074 
7075   // Add all instructions used to generate the addresses.
7076   SmallVector<Instruction *, 4> Worklist;
7077   for (auto *I : AddrDefs)
7078     Worklist.push_back(I);
7079   while (!Worklist.empty()) {
7080     Instruction *I = Worklist.pop_back_val();
7081     for (auto &Op : I->operands())
7082       if (auto *InstOp = dyn_cast<Instruction>(Op))
7083         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
7084             AddrDefs.insert(InstOp).second)
7085           Worklist.push_back(InstOp);
7086   }
7087 
7088   for (auto *I : AddrDefs) {
7089     if (isa<LoadInst>(I)) {
7090       // Setting the desired widening decision should ideally be handled by
7091       // the cost functions, but since this involves the task of finding out
7092       // if the loaded register is involved in an address computation, it is
7093       // instead changed here when we know this is the case.
7094       InstWidening Decision = getWideningDecision(I, VF);
7095       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
7096         // Scalarize a widened load of address.
7097         setWideningDecision(I, VF, CM_Scalarize,
7098                             (VF * getMemoryInstructionCost(I, 1)));
7099       else if (auto Group = getInterleavedAccessGroup(I)) {
7100         // Scalarize an interleave group of address loads.
7101         for (unsigned I = 0; I < Group->getFactor(); ++I) {
7102           if (Instruction *Member = Group->getMember(I))
7103             setWideningDecision(Member, VF, CM_Scalarize,
7104                                 (VF * getMemoryInstructionCost(Member, 1)));
7105         }
7106       }
7107     } else
7108       // Make sure I gets scalarized and a cost estimate without
7109       // scalarization overhead.
7110       ForcedScalars[VF].insert(I);
7111   }
7112 }
7113 
7114 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
7115                                                         unsigned VF,
7116                                                         Type *&VectorTy) {
7117   Type *RetTy = I->getType();
7118   if (canTruncateToMinimalBitwidth(I, VF))
7119     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
7120   VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
7121   auto SE = PSE.getSE();
7122 
7123   // TODO: We need to estimate the cost of intrinsic calls.
7124   switch (I->getOpcode()) {
7125   case Instruction::GetElementPtr:
7126     // We mark this instruction as zero-cost because the cost of GEPs in
7127     // vectorized code depends on whether the corresponding memory instruction
7128     // is scalarized or not. Therefore, we handle GEPs with the memory
7129     // instruction cost.
7130     return 0;
7131   case Instruction::Br: {
7132     // In cases of scalarized and predicated instructions, there will be VF
7133     // predicated blocks in the vectorized loop. Each branch around these
7134     // blocks also requires an extract of its vector compare i1 element.
7135     bool ScalarPredicatedBB = false;
7136     BranchInst *BI = cast<BranchInst>(I);
7137     if (VF > 1 && BI->isConditional() &&
7138         (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
7139          PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
7140       ScalarPredicatedBB = true;
7141 
7142     if (ScalarPredicatedBB) {
7143       // Return cost for branches around scalarized and predicated blocks.
7144       Type *Vec_i1Ty =
7145           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
7146       return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) +
7147               (TTI.getCFInstrCost(Instruction::Br) * VF));
7148     } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1)
7149       // The back-edge branch will remain, as will all scalar branches.
7150       return TTI.getCFInstrCost(Instruction::Br);
7151     else
7152       // This branch will be eliminated by if-conversion.
7153       return 0;
7154     // Note: We currently assume zero cost for an unconditional branch inside
7155     // a predicated block since it will become a fall-through, although we
7156     // may decide in the future to call TTI for all branches.
7157   }
7158   case Instruction::PHI: {
7159     auto *Phi = cast<PHINode>(I);
7160 
7161     // First-order recurrences are replaced by vector shuffles inside the loop.
7162     if (VF > 1 && Legal->isFirstOrderRecurrence(Phi))
7163       return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
7164                                 VectorTy, VF - 1, VectorTy);
7165 
7166     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
7167     // converted into select instructions. We require N - 1 selects per phi
7168     // node, where N is the number of incoming values.
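         // For example, a phi with three incoming values lowers to two selects.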
7169     if (VF > 1 && Phi->getParent() != TheLoop->getHeader())
7170       return (Phi->getNumIncomingValues() - 1) *
7171              TTI.getCmpSelInstrCost(
7172                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
7173                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF));
7174 
7175     return TTI.getCFInstrCost(Instruction::PHI);
7176   }
7177   case Instruction::UDiv:
7178   case Instruction::SDiv:
7179   case Instruction::URem:
7180   case Instruction::SRem:
7181     // If we have a predicated instruction, it may not be executed for each
7182     // vector lane. Get the scalarization cost and scale this amount by the
7183     // probability of executing the predicated block. If the instruction is not
7184     // predicated, we fall through to the next case.
7185     if (VF > 1 && isScalarWithPredication(I)) {
7186       unsigned Cost = 0;
7187 
7188       // These instructions have a non-void type, so account for the phi nodes
7189       // that we will create. This cost is likely to be zero. The phi node
7190       // cost, if any, should be scaled by the block probability because it
7191       // models a copy at the end of each predicated block.
7192       Cost += VF * TTI.getCFInstrCost(Instruction::PHI);
7193 
7194       // The cost of the non-predicated instruction.
7195       Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy);
7196 
7197       // The cost of insertelement and extractelement instructions needed for
7198       // scalarization.
7199       Cost += getScalarizationOverhead(I, VF, TTI);
7200 
7201       // Scale the cost by the probability of executing the predicated blocks.
7202       // This assumes the predicated block for each vector lane is equally
7203       // likely.
7204       return Cost / getReciprocalPredBlockProb();
7205     }
7206     LLVM_FALLTHROUGH;
7207   case Instruction::Add:
7208   case Instruction::FAdd:
7209   case Instruction::Sub:
7210   case Instruction::FSub:
7211   case Instruction::Mul:
7212   case Instruction::FMul:
7213   case Instruction::FDiv:
7214   case Instruction::FRem:
7215   case Instruction::Shl:
7216   case Instruction::LShr:
7217   case Instruction::AShr:
7218   case Instruction::And:
7219   case Instruction::Or:
7220   case Instruction::Xor: {
7221     // Since we will replace the stride by 1, the multiplication should go away.
7222     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
7223       return 0;
7224     // Certain instructions can be cheaper to vectorize if they have a constant
7225     // second vector operand. One example of this are shifts on x86.
7226     TargetTransformInfo::OperandValueKind Op1VK =
7227         TargetTransformInfo::OK_AnyValue;
7228     TargetTransformInfo::OperandValueKind Op2VK =
7229         TargetTransformInfo::OK_AnyValue;
7230     TargetTransformInfo::OperandValueProperties Op1VP =
7231         TargetTransformInfo::OP_None;
7232     TargetTransformInfo::OperandValueProperties Op2VP =
7233         TargetTransformInfo::OP_None;
7234     Value *Op2 = I->getOperand(1);
7235 
7236     // Check for a splat or for a non-uniform vector of constants.
7237     if (isa<ConstantInt>(Op2)) {
7238       ConstantInt *CInt = cast<ConstantInt>(Op2);
7239       if (CInt && CInt->getValue().isPowerOf2())
7240         Op2VP = TargetTransformInfo::OP_PowerOf2;
7241       Op2VK = TargetTransformInfo::OK_UniformConstantValue;
7242     } else if (isa<ConstantVector>(Op2) || isa<ConstantDataVector>(Op2)) {
7243       Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
7244       Constant *SplatValue = cast<Constant>(Op2)->getSplatValue();
7245       if (SplatValue) {
7246         ConstantInt *CInt = dyn_cast<ConstantInt>(SplatValue);
7247         if (CInt && CInt->getValue().isPowerOf2())
7248           Op2VP = TargetTransformInfo::OP_PowerOf2;
7249         Op2VK = TargetTransformInfo::OK_UniformConstantValue;
7250       }
7251     } else if (Legal->isUniform(Op2)) {
7252       Op2VK = TargetTransformInfo::OK_UniformValue;
7253     }
7254     SmallVector<const Value *, 4> Operands(I->operand_values());
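         // N is VF when the instruction will be scalarized (VF scalar copies are
         // emitted) and 1 when a single vector instruction suffices.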
7255     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
7256     return N * TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK,
7257                                           Op2VK, Op1VP, Op2VP, Operands);
7258   }
7259   case Instruction::Select: {
7260     SelectInst *SI = cast<SelectInst>(I);
7261     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
7262     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
7263     Type *CondTy = SI->getCondition()->getType();
7264     if (!ScalarCond)
7265       CondTy = VectorType::get(CondTy, VF);
7266 
7267     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I);
7268   }
7269   case Instruction::ICmp:
7270   case Instruction::FCmp: {
7271     Type *ValTy = I->getOperand(0)->getType();
7272     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
7273     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
7274       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
7275     VectorTy = ToVectorTy(ValTy, VF);
7276     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I);
7277   }
7278   case Instruction::Store:
7279   case Instruction::Load: {
7280     unsigned Width = VF;
7281     if (Width > 1) {
7282       InstWidening Decision = getWideningDecision(I, Width);
7283       assert(Decision != CM_Unknown &&
7284              "CM decision should be taken at this point");
7285       if (Decision == CM_Scalarize)
7286         Width = 1;
7287     }
7288     VectorTy = ToVectorTy(getMemInstValueType(I), Width);
7289     return getMemoryInstructionCost(I, VF);
7290   }
7291   case Instruction::ZExt:
7292   case Instruction::SExt:
7293   case Instruction::FPToUI:
7294   case Instruction::FPToSI:
7295   case Instruction::FPExt:
7296   case Instruction::PtrToInt:
7297   case Instruction::IntToPtr:
7298   case Instruction::SIToFP:
7299   case Instruction::UIToFP:
7300   case Instruction::Trunc:
7301   case Instruction::FPTrunc:
7302   case Instruction::BitCast: {
7303     // We optimize the truncation of induction variables having constant
7304     // integer steps. The cost of these truncations is the same as the scalar
7305     // operation.
7306     if (isOptimizableIVTruncate(I, VF)) {
7307       auto *Trunc = cast<TruncInst>(I);
7308       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
7309                                   Trunc->getSrcTy(), Trunc);
7310     }
7311 
7312     Type *SrcScalarTy = I->getOperand(0)->getType();
7313     Type *SrcVecTy =
7314         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
7315     if (canTruncateToMinimalBitwidth(I, VF)) {
7316       // This cast is going to be shrunk. This may remove the cast or it might
7317       // turn it into a slightly different cast. For example, if MinBW == 16,
7318       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
7319       //
7320       // Calculate the modified src and dest types.
7321       Type *MinVecTy = VectorTy;
7322       if (I->getOpcode() == Instruction::Trunc) {
7323         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
7324         VectorTy =
7325             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7326       } else if (I->getOpcode() == Instruction::ZExt ||
7327                  I->getOpcode() == Instruction::SExt) {
7328         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
7329         VectorTy =
7330             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7331       }
7332     }
7333 
7334     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
7335     return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I);
7336   }
7337   case Instruction::Call: {
7338     bool NeedToScalarize;
7339     CallInst *CI = cast<CallInst>(I);
7340     unsigned CallCost = getVectorCallCost(CI, VF, TTI, TLI, NeedToScalarize);
7341     if (getVectorIntrinsicIDForCall(CI, TLI))
7342       return std::min(CallCost, getVectorIntrinsicCost(CI, VF, TTI, TLI));
7343     return CallCost;
7344   }
7345   default:
7346     // The cost of executing VF copies of the scalar instruction. This opcode
7347     // is unknown. Assume that it is the same as 'mul'.
7348     return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) +
7349            getScalarizationOverhead(I, VF, TTI);
7350   } // end of switch.
7351 }
7352 
7353 char LoopVectorize::ID = 0;
7354 
7355 static const char lv_name[] = "Loop Vectorization";
7356 
7357 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
7358 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
7359 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
7360 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
7361 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
7362 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
7363 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
7364 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
7365 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
7366 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
7367 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
7368 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
7369 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
7370 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
7371 
7372 namespace llvm {
7373 
7374 Pass *createLoopVectorizePass(bool NoUnrolling, bool AlwaysVectorize) {
7375   return new LoopVectorize(NoUnrolling, AlwaysVectorize);
7376 }
7377 
7378 } // end namespace llvm
7379 
7380 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
7381   // Check if the pointer operand of a load or store instruction is
7382   // consecutive.
7383   if (auto *Ptr = getLoadStorePointerOperand(Inst))
7384     return Legal->isConsecutivePtr(Ptr);
7385   return false;
7386 }
7387 
7388 void LoopVectorizationCostModel::collectValuesToIgnore() {
7389   // Ignore ephemeral values.
7390   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
7391 
7392   // Ignore type-promoting instructions we identified during reduction
7393   // detection.
7394   for (auto &Reduction : *Legal->getReductionVars()) {
7395     RecurrenceDescriptor &RedDes = Reduction.second;
7396     SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
7397     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7398   }
7399   // Ignore type-casting instructions we identified during induction
7400   // detection.
7401   for (auto &Induction : *Legal->getInductionVars()) {
7402     InductionDescriptor &IndDes = Induction.second;
7403     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7404     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7405   }
7406 }
7407 
7408 VectorizationFactor
7409 LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) {
7410   // Width 1 means no vectorization, cost 0 means uncomputed cost.
7411   const VectorizationFactor NoVectorization = {1U, 0U};
7412   Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(OptForSize);
7413   if (!MaybeMaxVF.hasValue()) // Cases considered too costly to vectorize.
7414     return NoVectorization;
7415 
7416   if (UserVF) {
7417     DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7418     assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
7419     // Collect the instructions (and their associated costs) that will be more
7420     // profitable to scalarize.
7421     CM.selectUserVectorizationFactor(UserVF);
7422     buildVPlans(UserVF, UserVF);
7423     DEBUG(printPlans(dbgs()));
7424     return {UserVF, 0};
7425   }
7426 
7427   unsigned MaxVF = MaybeMaxVF.getValue();
7428   assert(MaxVF != 0 && "MaxVF is zero.");
7429 
7430   for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
7431     // Collect Uniform and Scalar instructions after vectorization with VF.
7432     CM.collectUniformsAndScalars(VF);
7433 
7434     // Collect the instructions (and their associated costs) that will be more
7435     // profitable to scalarize.
7436     if (VF > 1)
7437       CM.collectInstsToScalarize(VF);
7438   }
7439 
7440   buildVPlans(1, MaxVF);
7441   DEBUG(printPlans(dbgs()));
7442   if (MaxVF == 1)
7443     return NoVectorization;
7444 
7445   // Select the optimal vectorization factor.
7446   return CM.selectVectorizationFactor(MaxVF);
7447 }
7448 
7449 void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) {
7450   DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF << '\n');
7451   BestVF = VF;
7452   BestUF = UF;
7453 
7454   erase_if(VPlans, [VF](const VPlanPtr &Plan) {
7455     return !Plan->hasVF(VF);
7456   });
7457   assert(VPlans.size() == 1 && "Best VF does not have a single VPlan.");
7458 }
7459 
7460 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
7461                                            DominatorTree *DT) {
7462   // Perform the actual loop transformation.
7463 
7464   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
7465   VPCallbackILV CallbackILV(ILV);
7466 
7467   VPTransformState State{BestVF, BestUF,      LI,
7468                          DT,     ILV.Builder, ILV.VectorLoopValueMap,
7469                          &ILV,   CallbackILV};
7470   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
7471 
7472   //===------------------------------------------------===//
7473   //
7474   // Notice: any optimization or new instruction that goes
7475   // into the code below should also be implemented in
7476   // the cost-model.
7477   //
7478   //===------------------------------------------------===//
7479 
7480   // 2. Copy and widen instructions from the old loop into the new loop.
7481   assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
7482   VPlans.front()->execute(&State);
7483 
7484   // 3. Fix the vectorized code: take care of header phi's, live-outs,
7485   //    predication, updating analyses.
7486   ILV.fixVectorizedLoop();
7487 }
7488 
7489 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
7490     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
7491   BasicBlock *Latch = OrigLoop->getLoopLatch();
7492 
7493   // We create new control-flow for the vectorized loop, so the original
7494   // condition will be dead after vectorization if it's only used by the
7495   // branch.
7496   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
7497   if (Cmp && Cmp->hasOneUse())
7498     DeadInstructions.insert(Cmp);
7499 
7500   // We create new "steps" for induction variable updates to which the original
7501   // induction variables map. An original update instruction will be dead if
7502   // all its users except the induction variable are dead.
7503   for (auto &Induction : *Legal->getInductionVars()) {
7504     PHINode *Ind = Induction.first;
7505     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
7506     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
7507           return U == Ind || DeadInstructions.count(cast<Instruction>(U));
7508         }))
7509       DeadInstructions.insert(IndUpdate);
7510 
7511     // We record as "Dead" also the type-casting instructions we had identified
7512     // during induction analysis. We don't need any handling for them in the
7513     // vectorized loop because we have proven that, under a proper runtime
7514     // test guarding the vectorized loop, the value of the phi, and the casted
7515     // value of the phi, are the same. The last instruction in this casting chain
7516     // will get its scalar/vector/widened def from the scalar/vector/widened def
7517     // of the respective phi node. Any other casts in the induction def-use chain
7518     // have no other uses outside the phi update chain, and will be ignored.
7519     InductionDescriptor &IndDes = Induction.second;
7520     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7521     DeadInstructions.insert(Casts.begin(), Casts.end());
7522   }
7523 }
7524 
7525 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
7526 
7527 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
7528 
7529 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
7530                                         Instruction::BinaryOps BinOp) {
7531   // When unrolling and the VF is 1, we only need to add a simple scalar.
7532   Type *Ty = Val->getType();
7533   assert(!Ty->isVectorTy() && "Val must be a scalar");
7534 
7535   if (Ty->isFloatingPointTy()) {
7536     Constant *C = ConstantFP::get(Ty, (double)StartIdx);
7537 
7538     // Floating point operations had to be 'fast' to enable the unrolling.
7539     Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
7540     return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
7541   }
7542   Constant *C = ConstantInt::get(Ty, StartIdx);
7543   return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
7544 }
7545 
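/// Attach llvm.loop.unroll.runtime.disable metadata to \p L, preserving any
/// existing operands of its loop ID, unless unroll-disabling metadata is
/// already present. As an illustrative sketch (metadata numbering is
/// hypothetical), a loop with no prior loop metadata ends up with a
/// self-referential loop ID of the form:
///   !0 = distinct !{!0, !1}
///   !1 = !{!"llvm.loop.unroll.runtime.disable"}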
7546 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
7547   SmallVector<Metadata *, 4> MDs;
7548   // Reserve first location for self reference to the LoopID metadata node.
7549   MDs.push_back(nullptr);
7550   bool IsUnrollMetadata = false;
7551   MDNode *LoopID = L->getLoopID();
7552   if (LoopID) {
7553     // First find existing loop unrolling disable metadata.
7554     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
7555       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
7556       if (MD) {
7557         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7558         IsUnrollMetadata =
7559             S && S->getString().startswith("llvm.loop.unroll.disable");
7560       }
7561       MDs.push_back(LoopID->getOperand(i));
7562     }
7563   }
7564 
7565   if (!IsUnrollMetadata) {
7566     // Add runtime unroll disable metadata.
7567     LLVMContext &Context = L->getHeader()->getContext();
7568     SmallVector<Metadata *, 1> DisableOperands;
7569     DisableOperands.push_back(
7570         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7571     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7572     MDs.push_back(DisableNode);
7573     MDNode *NewLoopID = MDNode::get(Context, MDs);
7574     // Set operand 0 to refer to the loop id itself.
7575     NewLoopID->replaceOperandWith(0, NewLoopID);
7576     L->setLoopID(NewLoopID);
7577   }
7578 }
7579 
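/// Evaluate \p Predicate on successive power-of-two VFs in
/// [Range.Start, Range.End), clamp Range.End down to the first VF whose answer
/// differs from that of Range.Start, and return the answer at Range.Start. For
/// example (illustrative), given Range = [1, 9) and a predicate that holds
/// only for VF >= 4, the range is clamped to [1, 4) and false is returned.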
7580 bool LoopVectorizationPlanner::getDecisionAndClampRange(
7581     const std::function<bool(unsigned)> &Predicate, VFRange &Range) {
7582   assert(Range.End > Range.Start && "Trying to test an empty VF range.");
7583   bool PredicateAtRangeStart = Predicate(Range.Start);
7584 
7585   for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2)
7586     if (Predicate(TmpVF) != PredicateAtRangeStart) {
7587       Range.End = TmpVF;
7588       break;
7589     }
7590 
7591   return PredicateAtRangeStart;
7592 }
7593 
7594 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
7595 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
7596 /// of VF's starting at a given VF and extending it as much as possible. Each
7597 /// vectorization decision can potentially shorten this sub-range during
7598 /// buildVPlan().
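/// For example (illustrative), with \p MinVF = 1 and \p MaxVF = 8, this may
/// produce one VPlan covering VFs {1, 2} and a second covering VFs {4, 8}, if
/// some vectorization decision changes between VF 2 and VF 4.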
7599 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {
7600 
7601   // Collect conditions feeding internal conditional branches; they need to be
7602   // represented in VPlan for it to model masking.
7603   SmallPtrSet<Value *, 1> NeedDef;
7604 
7605   auto *Latch = OrigLoop->getLoopLatch();
7606   for (BasicBlock *BB : OrigLoop->blocks()) {
7607     if (BB == Latch)
7608       continue;
7609     BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
7610     if (Branch && Branch->isConditional())
7611       NeedDef.insert(Branch->getCondition());
7612   }
7613 
7614   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
7615     VFRange SubRange = {VF, MaxVF + 1};
7616     VPlans.push_back(buildVPlan(SubRange, NeedDef));
7617     VF = SubRange.End;
7618   }
7619 }
7620 
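/// Compute (and cache) the mask guarding the CFG edge from \p Src to \p Dst.
/// For a conditional branch this is the branch condition (negated if \p Dst is
/// the false successor), AND'ed with \p Src's block-in mask unless that mask
/// is all-one; for an unconditional branch it is simply \p Src's mask.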
7621 VPValue *LoopVectorizationPlanner::createEdgeMask(BasicBlock *Src,
7622                                                   BasicBlock *Dst,
7623                                                   VPlanPtr &Plan) {
7624   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
7625 
7626   // Look for cached value.
7627   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
7628   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
7629   if (ECEntryIt != EdgeMaskCache.end())
7630     return ECEntryIt->second;
7631 
7632   VPValue *SrcMask = createBlockInMask(Src, Plan);
7633 
7634   // The terminator has to be a branch inst!
7635   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
7636   assert(BI && "Unexpected terminator found");
7637 
7638   if (!BI->isConditional())
7639     return EdgeMaskCache[Edge] = SrcMask;
7640 
7641   VPValue *EdgeMask = Plan->getVPValue(BI->getCondition());
7642   assert(EdgeMask && "No Edge Mask found for condition");
7643 
7644   if (BI->getSuccessor(0) != Dst)
7645     EdgeMask = Builder.createNot(EdgeMask);
7646 
7647   if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
7648     EdgeMask = Builder.createAnd(EdgeMask, SrcMask);
7649 
7650   return EdgeMaskCache[Edge] = EdgeMask;
7651 }
7652 
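/// Compute (and cache) the mask of block \p BB as the OR of its incoming edge
/// masks. A null VPValue represents an all-one mask, as used for the loop
/// header and for any block reached through an unmasked edge.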
7653 VPValue *LoopVectorizationPlanner::createBlockInMask(BasicBlock *BB,
7654                                                      VPlanPtr &Plan) {
7655   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
7656 
7657   // Look for cached value.
7658   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
7659   if (BCEntryIt != BlockMaskCache.end())
7660     return BCEntryIt->second;
7661 
7662   // All-one mask is modelled as no-mask following the convention for masked
7663   // load/store/gather/scatter. Initialize BlockMask to no-mask.
7664   VPValue *BlockMask = nullptr;
7665 
7666   // Loop incoming mask is all-one.
7667   if (OrigLoop->getHeader() == BB)
7668     return BlockMaskCache[BB] = BlockMask;
7669 
7670   // This is the block mask. We OR all incoming edges.
7671   for (auto *Predecessor : predecessors(BB)) {
7672     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
7673     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
7674       return BlockMaskCache[BB] = EdgeMask;
7675 
7676     if (!BlockMask) { // BlockMask has its initialized nullptr value.
7677       BlockMask = EdgeMask;
7678       continue;
7679     }
7680 
7681     BlockMask = Builder.createOr(BlockMask, EdgeMask);
7682   }
7683 
7684   return BlockMaskCache[BB] = BlockMask;
7685 }
7686 
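/// Return a new VPInterleaveRecipe if \p I is the insert position of an
/// interleave group whose widening decision is CM_Interleave for the VFs in
/// \p Range (clamping the range as needed); otherwise return nullptr.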
7687 VPInterleaveRecipe *
7688 LoopVectorizationPlanner::tryToInterleaveMemory(Instruction *I,
7689                                                 VFRange &Range) {
7690   const InterleaveGroup *IG = CM.getInterleavedAccessGroup(I);
7691   if (!IG)
7692     return nullptr;
7693 
7694   // Now check if IG is relevant for VF's in the given range.
7695   auto isIGMember = [&](Instruction *I) -> std::function<bool(unsigned)> {
7696     return [=](unsigned VF) -> bool {
7697       return (VF >= 2 && // Query is illegal for VF == 1
7698               CM.getWideningDecision(I, VF) ==
7699                   LoopVectorizationCostModel::CM_Interleave);
7700     };
7701   };
7702   if (!getDecisionAndClampRange(isIGMember(I), Range))
7703     return nullptr;
7704 
7705   // I is a member of an InterleaveGroup for VF's in the (possibly trimmed)
7706   // range. If it's the primary member of the IG, construct a VPInterleaveRecipe.
7707   // Otherwise, it's an adjunct member of the IG; do not construct any Recipe.
7708   assert(I == IG->getInsertPos() &&
7709          "Generating a recipe for an adjunct member of an interleave group");
7710 
7711   return new VPInterleaveRecipe(IG);
7712 }
7713 
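/// Return a new VPWidenMemoryInstructionRecipe if \p I is a load or store that
/// the cost model decided to widen (consecutive access or gather/scatter) for
/// the VFs in \p Range, attaching a block-in mask when one is required;
/// otherwise return nullptr.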
7714 VPWidenMemoryInstructionRecipe *
7715 LoopVectorizationPlanner::tryToWidenMemory(Instruction *I, VFRange &Range,
7716                                            VPlanPtr &Plan) {
7717   if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
7718     return nullptr;
7719 
7720   auto willWiden = [&](unsigned VF) -> bool {
7721     if (VF == 1)
7722       return false;
7723     if (CM.isScalarAfterVectorization(I, VF) ||
7724         CM.isProfitableToScalarize(I, VF))
7725       return false;
7726     LoopVectorizationCostModel::InstWidening Decision =
7727         CM.getWideningDecision(I, VF);
7728     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
7729            "CM decision should be taken at this point.");
7730     assert(Decision != LoopVectorizationCostModel::CM_Interleave &&
7731            "Interleave memory opportunity should be caught earlier.");
7732     return Decision != LoopVectorizationCostModel::CM_Scalarize;
7733   };
7734 
7735   if (!getDecisionAndClampRange(willWiden, Range))
7736     return nullptr;
7737 
7738   VPValue *Mask = nullptr;
7739   if (Legal->isMaskRequired(I))
7740     Mask = createBlockInMask(I->getParent(), Plan);
7741 
7742   return new VPWidenMemoryInstructionRecipe(*I, Mask);
7743 }
7744 
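/// Return a new VPWidenIntOrFpInductionRecipe if \p I is an integer or FP
/// induction phi, or a trunc of an induction variable that can be optimized
/// for the VFs in \p Range; otherwise return nullptr.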
7745 VPWidenIntOrFpInductionRecipe *
7746 LoopVectorizationPlanner::tryToOptimizeInduction(Instruction *I,
7747                                                  VFRange &Range) {
7748   if (PHINode *Phi = dyn_cast<PHINode>(I)) {
7749     // Check if this is an integer or fp induction. If so, build the recipe that
7750     // produces its scalar and vector values.
7751     InductionDescriptor II = Legal->getInductionVars()->lookup(Phi);
7752     if (II.getKind() == InductionDescriptor::IK_IntInduction ||
7753         II.getKind() == InductionDescriptor::IK_FpInduction)
7754       return new VPWidenIntOrFpInductionRecipe(Phi);
7755 
7756     return nullptr;
7757   }
7758 
7759   // Optimize the special case where the source is a constant integer
7760   // induction variable. Notice that we can only optimize the 'trunc' case
7761   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
7762   // (c) other casts depend on pointer size.
7763 
7764   // Determine whether \p K is a truncation based on an induction variable that
7765   // can be optimized.
7766   auto isOptimizableIVTruncate =
7767       [&](Instruction *K) -> std::function<bool(unsigned)> {
7768     return
7769         [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); };
7770   };
7771 
7772   if (isa<TruncInst>(I) &&
7773       getDecisionAndClampRange(isOptimizableIVTruncate(I), Range))
7774     return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
7775                                              cast<TruncInst>(I));
7776   return nullptr;
7777 }
7778 
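/// Return a new VPBlendRecipe that turns a non-header phi into a chain of
/// selects driven by the masks of its incoming edges; return nullptr for
/// header phis and for non-phi instructions.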
7779 VPBlendRecipe *
7780 LoopVectorizationPlanner::tryToBlend(Instruction *I, VPlanPtr &Plan) {
7781   PHINode *Phi = dyn_cast<PHINode>(I);
7782   if (!Phi || Phi->getParent() == OrigLoop->getHeader())
7783     return nullptr;
7784 
7785   // We know that all PHIs in non-header blocks are converted into selects, so
7786   // we don't have to worry about the insertion order and we can just use the
7787   // builder. At this point we generate the predication tree. There may be
7788   // duplications since this is a simple recursive scan, but future
7789   // optimizations will clean it up.
7790 
7791   SmallVector<VPValue *, 2> Masks;
7792   unsigned NumIncoming = Phi->getNumIncomingValues();
7793   for (unsigned In = 0; In < NumIncoming; In++) {
7794     VPValue *EdgeMask =
7795       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
7796     assert((EdgeMask || NumIncoming == 1) &&
7797            "Multiple predecessors with one having a full mask");
7798     if (EdgeMask)
7799       Masks.push_back(EdgeMask);
7800   }
7801   return new VPBlendRecipe(Phi, Masks);
7802 }
7803 
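/// Try to append a VPWidenRecipe for \p I to \p VPBB, provided \p I has a
/// vectorizable opcode and the cost model expects it to be widened for the VFs
/// in \p Range. Return true iff a widen recipe was created or extended.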
7804 bool LoopVectorizationPlanner::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
7805                                           VFRange &Range) {
7806   if (CM.isScalarWithPredication(I))
7807     return false;
7808 
7809   auto IsVectorizableOpcode = [](unsigned Opcode) {
7810     switch (Opcode) {
7811     case Instruction::Add:
7812     case Instruction::And:
7813     case Instruction::AShr:
7814     case Instruction::BitCast:
7815     case Instruction::Br:
7816     case Instruction::Call:
7817     case Instruction::FAdd:
7818     case Instruction::FCmp:
7819     case Instruction::FDiv:
7820     case Instruction::FMul:
7821     case Instruction::FPExt:
7822     case Instruction::FPToSI:
7823     case Instruction::FPToUI:
7824     case Instruction::FPTrunc:
7825     case Instruction::FRem:
7826     case Instruction::FSub:
7827     case Instruction::GetElementPtr:
7828     case Instruction::ICmp:
7829     case Instruction::IntToPtr:
7830     case Instruction::Load:
7831     case Instruction::LShr:
7832     case Instruction::Mul:
7833     case Instruction::Or:
7834     case Instruction::PHI:
7835     case Instruction::PtrToInt:
7836     case Instruction::SDiv:
7837     case Instruction::Select:
7838     case Instruction::SExt:
7839     case Instruction::Shl:
7840     case Instruction::SIToFP:
7841     case Instruction::SRem:
7842     case Instruction::Store:
7843     case Instruction::Sub:
7844     case Instruction::Trunc:
7845     case Instruction::UDiv:
7846     case Instruction::UIToFP:
7847     case Instruction::URem:
7848     case Instruction::Xor:
7849     case Instruction::ZExt:
7850       return true;
7851     }
7852     return false;
7853   };
7854 
7855   if (!IsVectorizableOpcode(I->getOpcode()))
7856     return false;
7857 
7858   if (CallInst *CI = dyn_cast<CallInst>(I)) {
7859     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
7860     if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
7861                ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
7862       return false;
7863   }
7864 
7865   auto willWiden = [&](unsigned VF) -> bool {
7866     if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) ||
7867                              CM.isProfitableToScalarize(I, VF)))
7868       return false;
7869     if (CallInst *CI = dyn_cast<CallInst>(I)) {
7870       Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
7871       // The following case may be scalarized depending on the VF.
7872       // The flag indicates whether we use an intrinsic or a plain call for the
7873       // vectorized version of the instruction, i.e. whether it is beneficial to
7874       // perform an intrinsic call rather than a library call.
7875       bool NeedToScalarize;
7876       unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize);
7877       bool UseVectorIntrinsic =
7878           ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost;
7879       return UseVectorIntrinsic || !NeedToScalarize;
7880     }
7881     if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
7882       assert(CM.getWideningDecision(I, VF) ==
7883                  LoopVectorizationCostModel::CM_Scalarize &&
7884              "Memory widening decisions should have been taken care by now");
7885       return false;
7886     }
7887     return true;
7888   };
7889 
7890   if (!getDecisionAndClampRange(willWiden, Range))
7891     return false;
7892 
7893   // Success: widen this instruction. We optimize the common case where
7894   // consecutive instructions can be represented by a single recipe.
7895   if (!VPBB->empty()) {
7896     VPWidenRecipe *LastWidenRecipe = dyn_cast<VPWidenRecipe>(&VPBB->back());
7897     if (LastWidenRecipe && LastWidenRecipe->appendInstruction(I))
7898       return true;
7899   }
7900 
7901   VPBB->appendRecipe(new VPWidenRecipe(I));
7902   return true;
7903 }
7904 
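/// Build a VPReplicateRecipe for an instruction that will be replicated as
/// scalar instances. A predicated instruction is additionally wrapped in an
/// if-then VPRegionBlock, in which case a new, empty VPBasicBlock is returned
/// for subsequent recipes.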
7905 VPBasicBlock *LoopVectorizationPlanner::handleReplication(
7906     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
7907     DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
7908     VPlanPtr &Plan) {
7909   bool IsUniform = getDecisionAndClampRange(
7910       [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); },
7911       Range);
7912 
7913   bool IsPredicated = CM.isScalarWithPredication(I);
7914   auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated);
7915 
7916   // Find if I uses a predicated instruction. If so, it will use its scalar
7917   // value. Avoid hoisting the insert-element which packs the scalar value into
7918   // a vector value, as that happens iff all users use the vector value.
7919   for (auto &Op : I->operands())
7920     if (auto *PredInst = dyn_cast<Instruction>(Op))
7921       if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
7922         PredInst2Recipe[PredInst]->setAlsoPack(false);
7923 
7924   // Finalize the recipe for Instr, first if it is not predicated.
7925   if (!IsPredicated) {
7926     DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
7927     VPBB->appendRecipe(Recipe);
7928     return VPBB;
7929   }
7930   DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
7931   assert(VPBB->getSuccessors().empty() &&
7932          "VPBB has successors when handling predicated replication.");
7933   // Record predicated instructions for above packing optimizations.
7934   PredInst2Recipe[I] = Recipe;
7935   VPBlockBase *Region =
7936     VPBB->setOneSuccessor(createReplicateRegion(I, Recipe, Plan));
7937   return cast<VPBasicBlock>(Region->setOneSuccessor(new VPBasicBlock()));
7938 }
7939 
7940 VPRegionBlock *
7941 LoopVectorizationPlanner::createReplicateRegion(Instruction *Instr,
7942                                                 VPRecipeBase *PredRecipe,
7943                                                 VPlanPtr &Plan) {
7944   // Instructions marked for predication are replicated and placed under an
7945   // if-then construct to prevent side-effects.
7946 
7947   // Generate recipes to compute the block mask for this region.
7948   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
7949 
7950   // Build the triangular if-then region.
7951   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
7952   assert(Instr->getParent() && "Predicated instruction not in any basic block");
7953   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
7954   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
7955   auto *PHIRecipe =
7956       Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr);
7957   auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
7958   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
7959   VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
7960 
7961   // Note: first set Entry as region entry and then connect successors starting
7962   // from it in order, to propagate the "parent" of each VPBasicBlock.
7963   Entry->setTwoSuccessors(Pred, Exit);
7964   Pred->setOneSuccessor(Exit);
7965 
7966   return Region;
7967 }
7968 
7969 LoopVectorizationPlanner::VPlanPtr
7970 LoopVectorizationPlanner::buildVPlan(VFRange &Range,
7971                                      const SmallPtrSetImpl<Value *> &NeedDef) {
7972   EdgeMaskCache.clear();
7973   BlockMaskCache.clear();
7974   DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
7975   DenseMap<Instruction *, Instruction *> SinkAfterInverse;
7976 
7977   // Collect instructions from the original loop that will become trivially dead
7978   // in the vectorized loop. We don't need to vectorize these instructions. For
7979   // example, original induction update instructions can become dead because we
7980   // separately emit induction "steps" when generating code for the new loop.
7981   // Similarly, we create a new latch condition when setting up the structure
7982   // of the new loop, so the old one can become dead.
7983   SmallPtrSet<Instruction *, 4> DeadInstructions;
7984   collectTriviallyDeadInstructions(DeadInstructions);
7985 
7986   // Hold a mapping from predicated instructions to their recipes, in order to
7987   // fix their AlsoPack behavior if a user is determined to replicate and use a
7988   // scalar instead of vector value.
7989   DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
7990 
7991   // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
7992   VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
7993   auto Plan = llvm::make_unique<VPlan>(VPBB);
7994 
7995   // Represent values that will have defs inside VPlan.
7996   for (Value *V : NeedDef)
7997     Plan->addVPValue(V);
7998 
7999   // Scan the body of the loop in a topological order to visit each basic block
8000   // after having visited its predecessor basic blocks.
8001   LoopBlocksDFS DFS(OrigLoop);
8002   DFS.perform(LI);
8003 
8004   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
8005     // Relevant instructions from basic block BB will be grouped into VPRecipe
8006     // ingredients and fill a new VPBasicBlock.
8007     unsigned VPBBsForBB = 0;
8008     auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
8009     VPBB->setOneSuccessor(FirstVPBBForBB);
8010     VPBB = FirstVPBBForBB;
8011     Builder.setInsertPoint(VPBB);
8012 
8013     std::vector<Instruction *> Ingredients;
8014 
8015     // Organize the ingredients to vectorize from current basic block in the
8016     // right order.
8017     for (Instruction &I : *BB) {
8018       Instruction *Instr = &I;
8019 
8020       // First filter out irrelevant instructions, to ensure no recipes are
8021       // built for them.
8022       if (isa<BranchInst>(Instr) || isa<DbgInfoIntrinsic>(Instr) ||
8023           DeadInstructions.count(Instr))
8024         continue;
8025 
8026       // I is a member of an InterleaveGroup for Range.Start. If it's an adjunct
8027       // member of the IG, do not construct any Recipe for it.
8028       const InterleaveGroup *IG = CM.getInterleavedAccessGroup(Instr);
8029       if (IG && Instr != IG->getInsertPos() &&
8030           Range.Start >= 2 && // Query is illegal for VF == 1
8031           CM.getWideningDecision(Instr, Range.Start) ==
8032               LoopVectorizationCostModel::CM_Interleave) {
8033         if (SinkAfterInverse.count(Instr))
8034           Ingredients.push_back(SinkAfterInverse.find(Instr)->second);
8035         continue;
8036       }
8037 
8038       // Move instructions to handle first-order recurrences, step 1: avoid
8039       // handling this instruction until after we've handled the instruction it
8040       // should follow.
8041       auto SAIt = SinkAfter.find(Instr);
8042       if (SAIt != SinkAfter.end()) {
8043         DEBUG(dbgs() << "Sinking" << *SAIt->first << " after" << *SAIt->second
8044                      << " to vectorize a 1st order recurrence.\n");
8045         SinkAfterInverse[SAIt->second] = Instr;
8046         continue;
8047       }
8048 
8049       Ingredients.push_back(Instr);
8050 
8051       // Move instructions to handle first-order recurrences, step 2: push the
8052       // instruction to be sunk at its insertion point.
8053       auto SAInvIt = SinkAfterInverse.find(Instr);
8054       if (SAInvIt != SinkAfterInverse.end())
8055         Ingredients.push_back(SAInvIt->second);
8056     }
8057 
8058     // Introduce each ingredient into VPlan.
8059     for (Instruction *Instr : Ingredients) {
8060       VPRecipeBase *Recipe = nullptr;
8061 
8062       // Check if Instr should belong to an interleave memory recipe, or already
8063       // does. In the latter case Instr is irrelevant.
8064       if ((Recipe = tryToInterleaveMemory(Instr, Range))) {
8065         VPBB->appendRecipe(Recipe);
8066         continue;
8067       }
8068 
8069       // Check if Instr is a memory operation that should be widened.
8070       if ((Recipe = tryToWidenMemory(Instr, Range, Plan))) {
8071         VPBB->appendRecipe(Recipe);
8072         continue;
8073       }
8074 
8075       // Check if Instr should form some PHI recipe.
8076       if ((Recipe = tryToOptimizeInduction(Instr, Range))) {
8077         VPBB->appendRecipe(Recipe);
8078         continue;
8079       }
8080       if ((Recipe = tryToBlend(Instr, Plan))) {
8081         VPBB->appendRecipe(Recipe);
8082         continue;
8083       }
8084       if (PHINode *Phi = dyn_cast<PHINode>(Instr)) {
8085         VPBB->appendRecipe(new VPWidenPHIRecipe(Phi));
8086         continue;
8087       }
8088 
8089       // Check if Instr is to be widened by a general VPWidenRecipe, after
8090       // having first checked for specific widening recipes that deal with
8091       // Interleave Groups, Inductions and Phi nodes.
8092       if (tryToWiden(Instr, VPBB, Range))
8093         continue;
8094 
8095       // Otherwise, if all widening options failed, the instruction is to be
8096       // replicated. This may create a successor for VPBB.
8097       VPBasicBlock *NextVPBB =
8098         handleReplication(Instr, Range, VPBB, PredInst2Recipe, Plan);
8099       if (NextVPBB != VPBB) {
8100         VPBB = NextVPBB;
8101         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
8102                                     : "");
8103       }
8104     }
8105   }
8106 
8107   // Discard the empty dummy pre-entry VPBasicBlock. Note that other
8108   // VPBasicBlocks may also be empty, such as the last one (VPBB), reflecting
8109   // original basic blocks with no recipes.
8110   VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
8111   assert(PreEntry->empty() && "Expecting empty pre-entry block.");
8112   VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
8113   PreEntry->disconnectSuccessor(Entry);
8114   delete PreEntry;
8115 
8116   std::string PlanName;
8117   raw_string_ostream RSO(PlanName);
8118   unsigned VF = Range.Start;
8119   Plan->addVF(VF);
8120   RSO << "Initial VPlan for VF={" << VF;
8121   for (VF *= 2; VF < Range.End; VF *= 2) {
8122     Plan->addVF(VF);
8123     RSO << "," << VF;
8124   }
8125   RSO << "},UF>=1";
8126   RSO.flush();
8127   Plan->setName(PlanName);
8128 
8129   return Plan;
8130 }
8131 
8132 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateVectorValues(
8133     Value *V, unsigned Part) {
8134   return ILV.getOrCreateVectorValue(V, Part);
8135 }
8136 
8137 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const {
8138   O << " +\n"
8139     << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
8140   IG->getInsertPos()->printAsOperand(O, false);
8141   O << "\\l\"";
8142   for (unsigned i = 0; i < IG->getFactor(); ++i)
8143     if (Instruction *I = IG->getMember(i))
8144       O << " +\n"
8145         << Indent << "\"  " << VPlanIngredient(I) << " " << i << "\\l\"";
8146 }
8147 
8148 void VPWidenRecipe::execute(VPTransformState &State) {
8149   for (auto &Instr : make_range(Begin, End))
8150     State.ILV->widenInstruction(Instr);
8151 }
8152 
8153 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
8154   assert(!State.Instance && "Int or FP induction being replicated.");
8155   State.ILV->widenIntOrFpInduction(IV, Trunc);
8156 }
8157 
8158 void VPWidenPHIRecipe::execute(VPTransformState &State) {
8159   State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
8160 }
8161 
8162 void VPBlendRecipe::execute(VPTransformState &State) {
8163   State.ILV->setDebugLocFromInst(State.Builder, Phi);
8164   // We know that all PHIs in non-header blocks are converted into
8165   // selects, so we don't have to worry about the insertion order and we
8166   // can just use the builder.
8167   // At this point we generate the predication tree. There may be
8168   // duplications since this is a simple recursive scan, but future
8169   // optimizations will clean it up.
8170 
8171   unsigned NumIncoming = Phi->getNumIncomingValues();
8172 
8173   assert((User || NumIncoming == 1) &&
8174          "Multiple predecessors with one having a full mask");
8175   // Generate a sequence of selects of the form:
8176   // SELECT(Mask3, In3,
8177   //      SELECT(Mask2, In2,
8178   //                   ( ...)))
8179   InnerLoopVectorizer::VectorParts Entry(State.UF);
8180   for (unsigned In = 0; In < NumIncoming; ++In) {
8181     for (unsigned Part = 0; Part < State.UF; ++Part) {
8182       // We might have single edge PHIs (blocks) - use an identity
8183       // 'select' for the first PHI operand.
8184       Value *In0 =
8185           State.ILV->getOrCreateVectorValue(Phi->getIncomingValue(In), Part);
8186       if (In == 0)
8187         Entry[Part] = In0; // Initialize with the first incoming value.
8188       else {
8189         // Select between the current value and the previous incoming edge
8190         // based on the incoming mask.
8191         Value *Cond = State.get(User->getOperand(In), Part);
8192         Entry[Part] =
8193             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
8194       }
8195     }
8196   }
8197   for (unsigned Part = 0; Part < State.UF; ++Part)
8198     State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
8199 }
8200 
8201 void VPInterleaveRecipe::execute(VPTransformState &State) {
8202   assert(!State.Instance && "Interleave group being replicated.");
8203   State.ILV->vectorizeInterleaveGroup(IG->getInsertPos());
8204 }
8205 
8206 void VPReplicateRecipe::execute(VPTransformState &State) {
8207   if (State.Instance) { // Generate a single instance.
8208     State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated);
8209     // Insert scalar instance packing it into a vector.
8210     if (AlsoPack && State.VF > 1) {
8211       // If we're constructing lane 0, initialize to start from undef.
8212       if (State.Instance->Lane == 0) {
8213         Value *Undef =
8214             UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
8215         State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
8216       }
8217       State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
8218     }
8219     return;
8220   }
8221 
8222   // Generate scalar instances for all VF lanes of all UF parts, unless the
8223   // instruction is uniform, in which case generate only the first lane for each
8224   // of the UF parts.
8225   unsigned EndLane = IsUniform ? 1 : State.VF;
8226   for (unsigned Part = 0; Part < State.UF; ++Part)
8227     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
8228       State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated);
8229 }
8230 
8231 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
8232   assert(State.Instance && "Branch on Mask works only on single instance.");
8233 
8234   unsigned Part = State.Instance->Part;
8235   unsigned Lane = State.Instance->Lane;
8236 
8237   Value *ConditionBit = nullptr;
8238   if (!User) // Block in mask is all-one.
8239     ConditionBit = State.Builder.getTrue();
8240   else {
8241     VPValue *BlockInMask = User->getOperand(0);
8242     ConditionBit = State.get(BlockInMask, Part);
8243     if (ConditionBit->getType()->isVectorTy())
8244       ConditionBit = State.Builder.CreateExtractElement(
8245           ConditionBit, State.Builder.getInt32(Lane));
8246   }
8247 
8248   // Replace the temporary unreachable terminator with a new conditional branch,
8249   // whose two destinations will be set later when they are created.
8250   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
8251   assert(isa<UnreachableInst>(CurrentTerminator) &&
8252          "Expected to replace unreachable terminator with conditional branch.");
8253   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
8254   CondBr->setSuccessor(0, nullptr);
8255   ReplaceInstWithInst(CurrentTerminator, CondBr);
8256 }
8257 
8258 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
8259   assert(State.Instance && "Predicated instruction PHI works per instance.");
8260   Instruction *ScalarPredInst = cast<Instruction>(
8261       State.ValueMap.getScalarValue(PredInst, *State.Instance));
8262   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
8263   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
8264   assert(PredicatingBB && "Predicated block has no single predecessor.");
8265 
8266   // By current pack/unpack logic we need to generate only a single phi node: if
8267   // a vector value for the predicated instruction exists at this point it means
8268   // the instruction has vector users only, and a phi for the vector value is
8269   // needed. In this case the recipe of the predicated instruction is marked to
8270   // also do that packing, thereby "hoisting" the insert-element sequence.
8271   // Otherwise, a phi node for the scalar value is needed.
8272   unsigned Part = State.Instance->Part;
8273   if (State.ValueMap.hasVectorValue(PredInst, Part)) {
8274     Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
8275     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
8276     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
8277     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
8278     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
8279     State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
8280   } else {
8281     Type *PredInstType = PredInst->getType();
8282     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
8283     Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
8284     Phi->addIncoming(ScalarPredInst, PredicatedBB);
8285     State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
8286   }
8287 }
8288 
8289 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
8290   if (!User)
8291     return State.ILV->vectorizeMemoryInstruction(&Instr);
8292 
8293   // Last (and currently only) operand is a mask.
8294   InnerLoopVectorizer::VectorParts MaskValues(State.UF);
8295   VPValue *Mask = User->getOperand(User->getNumOperands() - 1);
8296   for (unsigned Part = 0; Part < State.UF; ++Part)
8297     MaskValues[Part] = State.get(Mask, Part);
8298   State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues);
8299 }
8300 
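/// Process a single innermost loop: check legality, plan the vectorization via
/// the cost model and VPlan, and either vectorize, interleave, or bail out with
/// an appropriate optimization remark.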
8301 bool LoopVectorizePass::processLoop(Loop *L) {
8302   assert(L->empty() && "Only process inner loops.");
8303 
8304 #ifndef NDEBUG
8305   const std::string DebugLocStr = getDebugLocString(L);
8306 #endif /* NDEBUG */
8307 
8308   DEBUG(dbgs() << "\nLV: Checking a loop in \""
8309                << L->getHeader()->getParent()->getName() << "\" from "
8310                << DebugLocStr << "\n");
8311 
8312   LoopVectorizeHints Hints(L, DisableUnrolling, *ORE);
8313 
8314   DEBUG(dbgs() << "LV: Loop hints:"
8315                << " force="
8316                << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
8317                        ? "disabled"
8318                        : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
8319                               ? "enabled"
8320                               : "?"))
8321                << " width=" << Hints.getWidth()
8322                << " unroll=" << Hints.getInterleave() << "\n");
8323 
8324   // Function containing loop
8325   Function *F = L->getHeader()->getParent();
8326 
8327   // Looking at the diagnostic output is the only way to determine if a loop
8328   // was vectorized (other than looking at the IR or machine code), so it
8329   // is important to generate an optimization remark for each loop. Most of
8330   // these messages are generated as OptimizationRemarkAnalysis. Remarks
8331   // generated as OptimizationRemark and OptimizationRemarkMissed are
8332   // less verbose and report vectorized loops and unvectorized loops that may
8333   // benefit from vectorization, respectively.
8334 
8335   if (!Hints.allowVectorization(F, L, AlwaysVectorize)) {
8336     DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
8337     return false;
8338   }
8339 
8340   PredicatedScalarEvolution PSE(*SE, *L);
8341 
8342   // Check if it is legal to vectorize the loop.
8343   LoopVectorizationRequirements Requirements(*ORE);
8344   LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, GetLAA, LI, ORE,
8345                                 &Requirements, &Hints, DB, AC);
8346   if (!LVL.canVectorize()) {
8347     DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
8348     emitMissedWarning(F, L, Hints, ORE);
8349     return false;
8350   }
8351 
8352   // Check the function attributes to find out if this function should be
8353   // optimized for size.
8354   bool OptForSize =
8355       Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->optForSize();
8356 
8357   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
8358   // count by optimizing for size, to minimize overheads. Prefer a constant trip
8359   // count over profile data, and profile data over the upper-bound estimate.
8360   unsigned ExpectedTC = 0;
8361   bool HasExpectedTC = false;
8362   if (const SCEVConstant *ConstExits =
8363       dyn_cast<SCEVConstant>(SE->getBackedgeTakenCount(L))) {
8364     const APInt &ExitsCount = ConstExits->getAPInt();
8365     // We are interested in small values for ExpectedTC. Skip over those that
8366     // can't fit an unsigned.
8367     if (ExitsCount.ult(std::numeric_limits<unsigned>::max())) {
8368       ExpectedTC = static_cast<unsigned>(ExitsCount.getZExtValue()) + 1;
8369       HasExpectedTC = true;
8370     }
8371   }
8372   // ExpectedTC may be large because it's bound by a variable. Check
8373   // profiling information to validate we should vectorize.
8374   if (!HasExpectedTC && LoopVectorizeWithBlockFrequency) {
8375     auto EstimatedTC = getLoopEstimatedTripCount(L);
8376     if (EstimatedTC) {
8377       ExpectedTC = *EstimatedTC;
8378       HasExpectedTC = true;
8379     }
8380   }
8381   if (!HasExpectedTC) {
8382     ExpectedTC = SE->getSmallConstantMaxTripCount(L);
8383     HasExpectedTC = (ExpectedTC > 0);
8384   }
8385 
8386   if (HasExpectedTC && ExpectedTC < TinyTripCountVectorThreshold) {
8387     DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
8388                  << "This loop is worth vectorizing only if no scalar "
8389                  << "iteration overheads are incurred.");
8390     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
8391       DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
8392     else {
8393       DEBUG(dbgs() << "\n");
8394       // Loops with a very small trip count are considered for vectorization
8395       // under OptForSize, thereby making sure the cost of their loop body is
8396       // dominant, free of runtime guards and scalar iteration overheads.
8397       OptForSize = true;
8398     }
8399   }
8400 
8401   // Check the function attributes to see if implicit floats are allowed.
8402   // FIXME: This check doesn't seem possibly correct -- what if the loop is
8403   // an integer loop and the vector instructions selected are purely integer
8404   // vector instructions?
8405   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
8406     DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat"
8407                     " attribute is used.\n");
8408     ORE->emit(createMissedAnalysis(Hints.vectorizeAnalysisPassName(),
8409                                    "NoImplicitFloat", L)
8410               << "loop not vectorized due to NoImplicitFloat attribute");
8411     emitMissedWarning(F, L, Hints, ORE);
8412     return false;
8413   }
8414 
8415   // Check if the target supports potentially unsafe FP vectorization.
8416   // FIXME: Add a check for the type of safety issue (denormal, signaling)
8417   // for the target we're vectorizing for, to make sure none of the
8418   // additional fp-math flags can help.
8419   if (Hints.isPotentiallyUnsafe() &&
8420       TTI->isFPVectorizationPotentiallyUnsafe()) {
8421     DEBUG(dbgs() << "LV: Potentially unsafe FP op prevents vectorization.\n");
8422     ORE->emit(
8423         createMissedAnalysis(Hints.vectorizeAnalysisPassName(), "UnsafeFP", L)
8424         << "loop not vectorized due to unsafe FP support.");
8425     emitMissedWarning(F, L, Hints, ORE);
8426     return false;
8427   }
8428 
8429   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
8430   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
8431 
8432   // If an override option has been passed in for interleaved accesses, use it.
8433   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
8434     UseInterleaved = EnableInterleavedMemAccesses;
8435 
8436   // Analyze interleaved memory accesses.
8437   if (UseInterleaved) {
8438     IAI.analyzeInterleaving();
8439   }
8440 
8441   // Use the cost model.
8442   LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, F,
8443                                 &Hints, IAI);
8444   CM.collectValuesToIgnore();
8445 
8446   // Use the planner for vectorization.
8447   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM);
8448 
8449   // Get user vectorization factor.
8450   unsigned UserVF = Hints.getWidth();
8451 
8452   // Plan how to best vectorize, return the best VF and its cost.
8453   VectorizationFactor VF = LVP.plan(OptForSize, UserVF);
8454 
8455   // Select the interleave count.
8456   unsigned IC = CM.selectInterleaveCount(OptForSize, VF.Width, VF.Cost);
8457 
8458   // Get user interleave count.
8459   unsigned UserIC = Hints.getInterleave();
8460 
8461   // Identify the diagnostic messages that should be produced.
8462   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
8463   bool VectorizeLoop = true, InterleaveLoop = true;
8464   if (Requirements.doesNotMeet(F, L, Hints)) {
8465     DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
8466                     "requirements.\n");
8467     emitMissedWarning(F, L, Hints, ORE);
8468     return false;
8469   }
8470 
8471   if (VF.Width == 1) {
8472     DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
8473     VecDiagMsg = std::make_pair(
8474         "VectorizationNotBeneficial",
8475         "the cost-model indicates that vectorization is not beneficial");
8476     VectorizeLoop = false;
8477   }
8478 
8479   if (IC == 1 && UserIC <= 1) {
8480     // Tell the user interleaving is not beneficial.
8481     DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
8482     IntDiagMsg = std::make_pair(
8483         "InterleavingNotBeneficial",
8484         "the cost-model indicates that interleaving is not beneficial");
8485     InterleaveLoop = false;
8486     if (UserIC == 1) {
8487       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
8488       IntDiagMsg.second +=
8489           " and is explicitly disabled or interleave count is set to 1";
8490     }
8491   } else if (IC > 1 && UserIC == 1) {
8492     // Tell the user interleaving is beneficial, but it is explicitly disabled.
8493     DEBUG(dbgs()
8494           << "LV: Interleaving is beneficial but is explicitly disabled.");
8495     IntDiagMsg = std::make_pair(
8496         "InterleavingBeneficialButDisabled",
8497         "the cost-model indicates that interleaving is beneficial "
8498         "but is explicitly disabled or interleave count is set to 1");
8499     InterleaveLoop = false;
8500   }
8501 
8502   // Override IC if user provided an interleave count.
8503   IC = UserIC > 0 ? UserIC : IC;
8504 
8505   // Emit diagnostic messages, if any.
8506   const char *VAPassName = Hints.vectorizeAnalysisPassName();
8507   if (!VectorizeLoop && !InterleaveLoop) {
8508     // Do not vectorize or interleave the loop.
8509     ORE->emit([&]() {
8510       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
8511                                       L->getStartLoc(), L->getHeader())
8512              << VecDiagMsg.second;
8513     });
8514     ORE->emit([&]() {
8515       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
8516                                       L->getStartLoc(), L->getHeader())
8517              << IntDiagMsg.second;
8518     });
8519     return false;
8520   } else if (!VectorizeLoop && InterleaveLoop) {
8521     DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
8522     ORE->emit([&]() {
8523       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
8524                                         L->getStartLoc(), L->getHeader())
8525              << VecDiagMsg.second;
8526     });
8527   } else if (VectorizeLoop && !InterleaveLoop) {
8528     DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in "
8529                  << DebugLocStr << '\n');
8530     ORE->emit([&]() {
8531       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
8532                                         L->getStartLoc(), L->getHeader())
8533              << IntDiagMsg.second;
8534     });
8535   } else if (VectorizeLoop && InterleaveLoop) {
8536     DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in "
8537                  << DebugLocStr << '\n');
8538     DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
8539   }
8540 
8541   LVP.setBestPlan(VF.Width, IC);
8542 
8543   using namespace ore;
8544 
8545   if (!VectorizeLoop) {
8546     assert(IC > 1 && "interleave count should not be 1 or 0");
8547     // If we decided that it is not legal to vectorize the loop, then
8548     // interleave it.
8549     InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
8550                                &CM);
8551     LVP.executePlan(Unroller, DT);
8552 
8553     ORE->emit([&]() {
8554       return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
8555                                 L->getHeader())
8556              << "interleaved loop (interleaved count: "
8557              << NV("InterleaveCount", IC) << ")";
8558     });
8559   } else {
8560     // If we decided that it is *legal* to vectorize the loop, then do it.
8561     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
8562                            &LVL, &CM);
8563     LVP.executePlan(LB, DT);
8564     ++LoopsVectorized;
8565 
8566     // Add metadata to disable runtime unrolling of the scalar loop when there
8567     // are no runtime checks about strides and memory. A scalar loop that is
8568     // rarely used is not worth unrolling.
8569     if (!LB.areSafetyChecksAdded())
8570       AddRuntimeUnrollDisableMetaData(L);
8571 
8572     // Report the vectorization decision.
8573     ORE->emit([&]() {
8574       return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
8575                                 L->getHeader())
8576              << "vectorized loop (vectorization width: "
8577              << NV("VectorizationFactor", VF.Width)
8578              << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
8579     });
8580   }
8581 
8582   // Mark the loop as already vectorized to avoid vectorizing again.
8583   Hints.setAlreadyVectorized();
8584 
8585   DEBUG(verifyFunction(*L->getHeader()->getParent()));
8586   return true;
8587 }
8588 
8589 bool LoopVectorizePass::runImpl(
8590     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
8591     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
8592     DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
8593     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
8594     OptimizationRemarkEmitter &ORE_) {
8595   SE = &SE_;
8596   LI = &LI_;
8597   TTI = &TTI_;
8598   DT = &DT_;
8599   BFI = &BFI_;
8600   TLI = TLI_;
8601   AA = &AA_;
8602   AC = &AC_;
8603   GetLAA = &GetLAA_;
8604   DB = &DB_;
8605   ORE = &ORE_;
8606 
8607   // Don't attempt if
8608   // 1. the target claims to have no vector registers, and
8609   // 2. interleaving won't help ILP.
8610   //
8611   // The second condition is necessary because, even if the target has no
8612   // vector registers, loop vectorization may still enable scalar
8613   // interleaving.
8614   if (!TTI->getNumberOfRegisters(true) && TTI->getMaxInterleaveFactor(1) < 2)
8615     return false;
8616 
8617   bool Changed = false;
8618 
8619   // The vectorizer requires loops to be in simplified form.
8620   // Since simplification may add new inner loops, it has to run before the
8621   // legality and profitability checks. This means running the loop vectorizer
8622   // will simplify all loops, regardless of whether anything ends up being
8623   // vectorized.
8624   for (auto &L : *LI)
8625     Changed |= simplifyLoop(L, DT, LI, SE, AC, false /* PreserveLCSSA */);
8626 
8627   // Build up a worklist of inner-loops to vectorize. This is necessary as
8628   // the act of vectorizing or partially unrolling a loop creates new loops
8629   // and can invalidate iterators across the loops.
8630   SmallVector<Loop *, 8> Worklist;
8631 
8632   for (Loop *L : *LI)
8633     addAcyclicInnerLoop(*L, *LI, Worklist);
8634 
8635   LoopsAnalyzed += Worklist.size();
8636 
8637   // Now walk the identified inner loops.
8638   while (!Worklist.empty()) {
8639     Loop *L = Worklist.pop_back_val();
8640 
8641     // For the inner loops we actually process, form LCSSA to simplify the
8642     // transform.
8643     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
8644 
8645     Changed |= processLoop(L);
8646   }
8647 
8648   // Process each loop nest in the function.
8649   return Changed;
8650 }
8651 
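/// New pass manager entry point: gather the analyses runImpl needs, run the
/// vectorizer over the function, and report the preserved analyses.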
8652 PreservedAnalyses LoopVectorizePass::run(Function &F,
8653                                          FunctionAnalysisManager &AM) {
8654   auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
8655   auto &LI = AM.getResult<LoopAnalysis>(F);
8656   auto &TTI = AM.getResult<TargetIRAnalysis>(F);
8657   auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
8658   auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
8659   auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
8660   auto &AA = AM.getResult<AAManager>(F);
8661   auto &AC = AM.getResult<AssumptionAnalysis>(F);
8662   auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
8663   auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
8664 
8665   auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
8666   std::function<const LoopAccessInfo &(Loop &)> GetLAA =
8667       [&](Loop &L) -> const LoopAccessInfo & {
8668     LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, nullptr};
8669     return LAM.getResult<LoopAccessAnalysis>(L, AR);
8670   };
8671   bool Changed =
8672       runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE);
8673   if (!Changed)
8674     return PreservedAnalyses::all();
8675   PreservedAnalyses PA;
8676   PA.preserve<LoopAnalysis>();
8677   PA.preserve<DominatorTreeAnalysis>();
8678   PA.preserve<BasicAA>();
8679   PA.preserve<GlobalsAA>();
8680   return PA;
8681 }
8682