1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
17 //
// This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
// There is an ongoing development effort to migrate the loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlanHCFGBuilder.h"
60 #include "VPlanHCFGTransforms.h"
61 #include "VPlanPredicator.h"
62 #include "llvm/ADT/APInt.h"
63 #include "llvm/ADT/ArrayRef.h"
64 #include "llvm/ADT/DenseMap.h"
65 #include "llvm/ADT/DenseMapInfo.h"
66 #include "llvm/ADT/Hashing.h"
67 #include "llvm/ADT/MapVector.h"
68 #include "llvm/ADT/None.h"
69 #include "llvm/ADT/Optional.h"
70 #include "llvm/ADT/STLExtras.h"
71 #include "llvm/ADT/SetVector.h"
72 #include "llvm/ADT/SmallPtrSet.h"
73 #include "llvm/ADT/SmallVector.h"
74 #include "llvm/ADT/Statistic.h"
75 #include "llvm/ADT/StringRef.h"
76 #include "llvm/ADT/Twine.h"
77 #include "llvm/ADT/iterator_range.h"
78 #include "llvm/Analysis/AssumptionCache.h"
79 #include "llvm/Analysis/BasicAliasAnalysis.h"
80 #include "llvm/Analysis/BlockFrequencyInfo.h"
81 #include "llvm/Analysis/CFG.h"
82 #include "llvm/Analysis/CodeMetrics.h"
83 #include "llvm/Analysis/DemandedBits.h"
84 #include "llvm/Analysis/GlobalsModRef.h"
85 #include "llvm/Analysis/LoopAccessAnalysis.h"
86 #include "llvm/Analysis/LoopAnalysisManager.h"
87 #include "llvm/Analysis/LoopInfo.h"
88 #include "llvm/Analysis/LoopIterator.h"
89 #include "llvm/Analysis/MemorySSA.h"
90 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
91 #include "llvm/Analysis/ScalarEvolution.h"
92 #include "llvm/Analysis/ScalarEvolutionExpander.h"
93 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
94 #include "llvm/Analysis/TargetLibraryInfo.h"
95 #include "llvm/Analysis/TargetTransformInfo.h"
96 #include "llvm/Analysis/VectorUtils.h"
97 #include "llvm/IR/Attributes.h"
98 #include "llvm/IR/BasicBlock.h"
99 #include "llvm/IR/CFG.h"
100 #include "llvm/IR/Constant.h"
101 #include "llvm/IR/Constants.h"
102 #include "llvm/IR/DataLayout.h"
103 #include "llvm/IR/DebugInfoMetadata.h"
104 #include "llvm/IR/DebugLoc.h"
105 #include "llvm/IR/DerivedTypes.h"
106 #include "llvm/IR/DiagnosticInfo.h"
107 #include "llvm/IR/Dominators.h"
108 #include "llvm/IR/Function.h"
109 #include "llvm/IR/IRBuilder.h"
110 #include "llvm/IR/InstrTypes.h"
111 #include "llvm/IR/Instruction.h"
112 #include "llvm/IR/Instructions.h"
113 #include "llvm/IR/IntrinsicInst.h"
114 #include "llvm/IR/Intrinsics.h"
115 #include "llvm/IR/LLVMContext.h"
116 #include "llvm/IR/Metadata.h"
117 #include "llvm/IR/Module.h"
118 #include "llvm/IR/Operator.h"
119 #include "llvm/IR/Type.h"
120 #include "llvm/IR/Use.h"
121 #include "llvm/IR/User.h"
122 #include "llvm/IR/Value.h"
123 #include "llvm/IR/ValueHandle.h"
124 #include "llvm/IR/Verifier.h"
125 #include "llvm/Pass.h"
126 #include "llvm/Support/Casting.h"
127 #include "llvm/Support/CommandLine.h"
128 #include "llvm/Support/Compiler.h"
129 #include "llvm/Support/Debug.h"
130 #include "llvm/Support/ErrorHandling.h"
131 #include "llvm/Support/MathExtras.h"
132 #include "llvm/Support/raw_ostream.h"
133 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
134 #include "llvm/Transforms/Utils/LoopSimplify.h"
135 #include "llvm/Transforms/Utils/LoopUtils.h"
136 #include "llvm/Transforms/Utils/LoopVersioning.h"
137 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
138 #include <algorithm>
139 #include <cassert>
140 #include <cstdint>
141 #include <cstdlib>
142 #include <functional>
143 #include <iterator>
144 #include <limits>
145 #include <memory>
146 #include <string>
147 #include <tuple>
148 #include <utility>
149 #include <vector>
150 
151 using namespace llvm;
152 
153 #define LV_NAME "loop-vectorize"
154 #define DEBUG_TYPE LV_NAME
155 
156 /// @{
157 /// Metadata attribute names
158 static const char *const LLVMLoopVectorizeFollowupAll =
159     "llvm.loop.vectorize.followup_all";
160 static const char *const LLVMLoopVectorizeFollowupVectorized =
161     "llvm.loop.vectorize.followup_vectorized";
162 static const char *const LLVMLoopVectorizeFollowupEpilogue =
163     "llvm.loop.vectorize.followup_epilogue";
164 /// @}
165 
166 STATISTIC(LoopsVectorized, "Number of loops vectorized");
167 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
168 
169 /// Loops with a known constant trip count below this number are vectorized only
170 /// if no scalar iteration overheads are incurred.
171 static cl::opt<unsigned> TinyTripCountVectorThreshold(
172     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
173     cl::desc("Loops with a constant trip count that is smaller than this "
174              "value are vectorized only if no scalar iteration overheads "
175              "are incurred."));
176 
177 static cl::opt<bool> MaximizeBandwidth(
178     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
179     cl::desc("Maximize bandwidth when selecting vectorization factor which "
180              "will be determined by the smallest type in loop."));
181 
182 static cl::opt<bool> EnableInterleavedMemAccesses(
183     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
184     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
185 
186 /// An interleave-group may need masking if it resides in a block that needs
187 /// predication, or in order to mask away gaps.
188 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
189     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
190     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
191 
192 /// We don't interleave loops with a known constant trip count below this
193 /// number.
194 static const unsigned TinyTripCountInterleaveThreshold = 128;
195 
196 static cl::opt<unsigned> ForceTargetNumScalarRegs(
197     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
198     cl::desc("A flag that overrides the target's number of scalar registers."));
199 
200 static cl::opt<unsigned> ForceTargetNumVectorRegs(
201     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
202     cl::desc("A flag that overrides the target's number of vector registers."));
203 
204 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
205     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
206     cl::desc("A flag that overrides the target's max interleave factor for "
207              "scalar loops."));
208 
209 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
210     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
211     cl::desc("A flag that overrides the target's max interleave factor for "
212              "vectorized loops."));
213 
214 static cl::opt<unsigned> ForceTargetInstructionCost(
215     "force-target-instruction-cost", cl::init(0), cl::Hidden,
216     cl::desc("A flag that overrides the target's expected cost for "
217              "an instruction to a single constant value. Mostly "
218              "useful for getting consistent testing."));
219 
220 static cl::opt<unsigned> SmallLoopCost(
221     "small-loop-cost", cl::init(20), cl::Hidden,
222     cl::desc(
223         "The cost of a loop that is considered 'small' by the interleaver."));
224 
225 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
226     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
227     cl::desc("Enable the use of the block frequency analysis to access PGO "
228              "heuristics minimizing code growth in cold regions and being more "
229              "aggressive in hot regions."));
230 
231 // Runtime interleave loops for load/store throughput.
232 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
233     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
234     cl::desc(
235         "Enable runtime interleaving until load/store ports are saturated"));
236 
237 /// The number of stores in a loop that are allowed to need predication.
238 static cl::opt<unsigned> NumberOfStoresToPredicate(
239     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
240     cl::desc("Max number of stores to be predicated behind an if."));
241 
242 static cl::opt<bool> EnableIndVarRegisterHeur(
243     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
244     cl::desc("Count the induction variable only once when interleaving"));
245 
246 static cl::opt<bool> EnableCondStoresVectorization(
247     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
248     cl::desc("Enable if predication of stores during vectorization."));
249 
250 static cl::opt<unsigned> MaxNestedScalarReductionIC(
251     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
252     cl::desc("The maximum interleave count to use when interleaving a scalar "
253              "reduction in a nested loop."));
254 
255 cl::opt<bool> EnableVPlanNativePath(
256     "enable-vplan-native-path", cl::init(false), cl::Hidden,
257     cl::desc("Enable VPlan-native vectorization path with "
258              "support for outer loop vectorization."));
259 
260 // FIXME: Remove this switch once we have divergence analysis. Currently we
261 // assume divergent non-backedge branches when this switch is true.
262 cl::opt<bool> EnableVPlanPredication(
263     "enable-vplan-predication", cl::init(false), cl::Hidden,
264     cl::desc("Enable VPlan-native vectorization path predicator with "
265              "support for outer loop vectorization."));
266 
267 // This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
269 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
270 // verification of the H-CFGs built.
271 static cl::opt<bool> VPlanBuildStressTest(
272     "vplan-build-stress-test", cl::init(false), cl::Hidden,
273     cl::desc(
274         "Build VPlan for every supported loop nest in the function and bail "
275         "out right after the build (stress test the VPlan H-CFG construction "
276         "in the VPlan-native vectorization path)."));
277 
278 /// A helper function for converting Scalar types to vector types.
279 /// If the incoming type is void, we return void. If the VF is 1, we return
280 /// the scalar type.
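/// For example, ToVectorTy(i32, 4) returns <4 x i32>, while
/// ToVectorTy(i32, 1) returns i32 itself.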
281 static Type *ToVectorTy(Type *Scalar, unsigned VF) {
282   if (Scalar->isVoidTy() || VF == 1)
283     return Scalar;
284   return VectorType::get(Scalar, VF);
285 }
286 
/// A helper function that returns the type of a loaded or stored value.
288 static Type *getMemInstValueType(Value *I) {
289   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
290          "Expected Load or Store instruction");
291   if (auto *LI = dyn_cast<LoadInst>(I))
292     return LI->getType();
293   return cast<StoreInst>(I)->getValueOperand()->getType();
294 }
295 
296 /// A helper function that returns true if the given type is irregular. The
297 /// type is irregular if its allocated size doesn't equal the store size of an
298 /// element of the corresponding vector type at the given vectorization factor.
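/// For example, in typical data layouts i1 is irregular for VF = 4: four
/// scalar i1 values are allocated in four separate bytes, but a <4 x i1>
/// vector has a store size of a single byte.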
299 static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
300   // Determine if an array of VF elements of type Ty is "bitcast compatible"
301   // with a <VF x Ty> vector.
302   if (VF > 1) {
303     auto *VectorTy = VectorType::get(Ty, VF);
304     return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
305   }
306 
307   // If the vectorization factor is one, we just check if an array of type Ty
308   // requires padding between elements.
309   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
310 }
311 
312 /// A helper function that returns the reciprocal of the block probability of
313 /// predicated blocks. If we return X, we are assuming the predicated block
314 /// will execute once for every X iterations of the loop header.
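/// For example, the cost of an instruction in a predicated block is typically
/// divided by this value when accumulating the expected cost of one
/// iteration of the loop header.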
315 ///
316 /// TODO: We should use actual block probability here, if available. Currently,
317 ///       we always assume predicated blocks have a 50% chance of executing.
318 static unsigned getReciprocalPredBlockProb() { return 2; }
319 
320 /// A helper function that adds a 'fast' flag to floating-point operations.
321 static Value *addFastMathFlag(Value *V) {
322   if (isa<FPMathOperator>(V))
323     cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
324   return V;
325 }
326 
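/// A helper function that applies the fast-math flags \p FMF to a
/// floating-point operation.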
327 static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
328   if (isa<FPMathOperator>(V))
329     cast<Instruction>(V)->setFastMathFlags(FMF);
330   return V;
331 }
332 
333 /// A helper function that returns an integer or floating-point constant with
334 /// value C.
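/// For example, for an i32 type and C = -1 this returns i32 -1, while for a
/// float type it returns the floating-point constant -1.0.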
335 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
336   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
337                            : ConstantFP::get(Ty, C);
338 }
339 
340 namespace llvm {
341 
342 /// InnerLoopVectorizer vectorizes loops which contain only one basic
343 /// block to a specified vectorization factor (VF).
344 /// This class performs the widening of scalars into vectors, or multiple
345 /// scalars. This class also implements the following features:
346 /// * It inserts an epilogue loop for handling loops that don't have iteration
347 ///   counts that are known to be a multiple of the vectorization factor.
348 /// * It handles the code generation for reduction variables.
349 /// * Scalarization (implementation using scalars) of un-vectorizable
350 ///   instructions.
351 /// InnerLoopVectorizer does not perform any vectorization-legality
352 /// checks, and relies on the caller to check for the different legality
353 /// aspects. The InnerLoopVectorizer relies on the
354 /// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found for the given vectorization factor.
356 class InnerLoopVectorizer {
357 public:
358   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
359                       LoopInfo *LI, DominatorTree *DT,
360                       const TargetLibraryInfo *TLI,
361                       const TargetTransformInfo *TTI, AssumptionCache *AC,
362                       OptimizationRemarkEmitter *ORE, unsigned VecWidth,
363                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
364                       LoopVectorizationCostModel *CM)
365       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
366         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
367         Builder(PSE.getSE()->getContext()),
368         VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
369   virtual ~InnerLoopVectorizer() = default;
370 
371   /// Create a new empty loop. Unlink the old loop and connect the new one.
372   /// Return the pre-header block of the new loop.
373   BasicBlock *createVectorizedLoopSkeleton();
374 
375   /// Widen a single instruction within the innermost loop.
376   void widenInstruction(Instruction &I);
377 
378   /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
379   void fixVectorizedLoop();
380 
381   // Return true if any runtime check is added.
382   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
383 
384   /// A type for vectorized values in the new loop. Each value from the
385   /// original loop, when vectorized, is represented by UF vector values in the
386   /// new unrolled loop, where UF is the unroll factor.
387   using VectorParts = SmallVector<Value *, 2>;
388 
389   /// Vectorize a single PHINode in a block. This method handles the induction
390   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
391   /// arbitrary length vectors.
392   void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
393 
394   /// A helper function to scalarize a single Instruction in the innermost loop.
395   /// Generates a sequence of scalar instances for each lane between \p MinLane
396   /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
  /// inclusive.
398   void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
399                             bool IfPredicateInstr);
400 
401   /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
402   /// is provided, the integer induction variable will first be truncated to
403   /// the corresponding type.
404   void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
405 
406   /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
407   /// vector or scalar value on-demand if one is not yet available. When
408   /// vectorizing a loop, we visit the definition of an instruction before its
409   /// uses. When visiting the definition, we either vectorize or scalarize the
410   /// instruction, creating an entry for it in the corresponding map. (In some
411   /// cases, such as induction variables, we will create both vector and scalar
412   /// entries.) Then, as we encounter uses of the definition, we derive values
413   /// for each scalar or vector use unless such a value is already available.
414   /// For example, if we scalarize a definition and one of its uses is vector,
415   /// we build the required vector on-demand with an insertelement sequence
416   /// when visiting the use. Otherwise, if the use is scalar, we can use the
417   /// existing scalar definition.
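  /// For example, with UF = 2 and VF = 4, a scalarized definition is stored
  /// as 2 x 4 scalar values; a vector use at unroll part 1 is then satisfied
  /// by inserting the four part-1 scalars into a new vector with an
  /// insertelement sequence.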
418   ///
419   /// Return a value in the new loop corresponding to \p V from the original
420   /// loop at unroll index \p Part. If the value has already been vectorized,
421   /// the corresponding vector entry in VectorLoopValueMap is returned. If,
422   /// however, the value has a scalar entry in VectorLoopValueMap, we construct
423   /// a new vector value on-demand by inserting the scalar values into a vector
424   /// with an insertelement sequence. If the value has been neither vectorized
425   /// nor scalarized, it must be loop invariant, so we simply broadcast the
426   /// value into a vector.
427   Value *getOrCreateVectorValue(Value *V, unsigned Part);
428 
429   /// Return a value in the new loop corresponding to \p V from the original
430   /// loop at unroll and vector indices \p Instance. If the value has been
431   /// vectorized but not scalarized, the necessary extractelement instruction
432   /// will be generated.
433   Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
434 
435   /// Construct the vector value of a scalarized value \p V one lane at a time.
436   void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
437 
438   /// Try to vectorize the interleaved access group that \p Instr belongs to,
439   /// optionally masking the vector operations if \p BlockInMask is non-null.
440   void vectorizeInterleaveGroup(Instruction *Instr,
441                                 VectorParts *BlockInMask = nullptr);
442 
443   /// Vectorize Load and Store instructions, optionally masking the vector
444   /// operations if \p BlockInMask is non-null.
445   void vectorizeMemoryInstruction(Instruction *Instr,
446                                   VectorParts *BlockInMask = nullptr);
447 
448   /// Set the debug location in the builder using the debug location in
449   /// the instruction.
450   void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
451 
452   /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs();
454 
455 protected:
456   friend class LoopVectorizationPlanner;
457 
458   /// A small list of PHINodes.
459   using PhiVector = SmallVector<PHINode *, 4>;
460 
461   /// A type for scalarized values in the new loop. Each value from the
462   /// original loop, when scalarized, is represented by UF x VF scalar values
463   /// in the new unrolled loop, where UF is the unroll factor and VF is the
464   /// vectorization factor.
465   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
466 
467   /// Set up the values of the IVs correctly when exiting the vector loop.
468   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
469                     Value *CountRoundDown, Value *EndValue,
470                     BasicBlock *MiddleBlock);
471 
472   /// Create a new induction variable inside L.
473   PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
474                                    Value *Step, Instruction *DL);
475 
476   /// Handle all cross-iteration phis in the header.
477   void fixCrossIterationPHIs();
478 
479   /// Fix a first-order recurrence. This is the second phase of vectorizing
480   /// this phi node.
481   void fixFirstOrderRecurrence(PHINode *Phi);
482 
483   /// Fix a reduction cross-iteration phi. This is the second phase of
484   /// vectorizing this phi node.
485   void fixReduction(PHINode *Phi);
486 
  /// The Loop exit block may have single value PHI nodes with some
  /// incoming value. While vectorizing we only handled real values
  /// that were defined inside the loop, and we should have one value for
  /// each predecessor of its parent basic block. See PR14725.
491   void fixLCSSAPHIs();
492 
493   /// Iteratively sink the scalarized operands of a predicated instruction into
494   /// the block that was created for it.
495   void sinkScalarOperands(Instruction *PredInst);
496 
497   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
498   /// represented as.
499   void truncateToMinimalBitwidths();
500 
  /// Insert the new loop into the loop hierarchy and pass manager
502   /// and update the analysis passes.
503   void updateAnalysis();
504 
505   /// Create a broadcast instruction. This method generates a broadcast
506   /// instruction (shuffle) for loop invariant values and for the induction
507   /// value. If this is the induction variable then we extend it to N, N+1, ...
508   /// this is needed because each iteration in the loop corresponds to a SIMD
509   /// element.
510   virtual Value *getBroadcastInstrs(Value *V);
511 
512   /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
  /// to each vector element of Val. The sequence starts at StartIdx.
514   /// \p Opcode is relevant for FP induction variable.
515   virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
516                                Instruction::BinaryOps Opcode =
517                                Instruction::BinaryOpsEnd);
518 
519   /// Compute scalar induction steps. \p ScalarIV is the scalar induction
520   /// variable on which to base the steps, \p Step is the size of the step, and
521   /// \p EntryVal is the value from the original loop that maps to the steps.
522   /// Note that \p EntryVal doesn't have to be an induction variable - it
523   /// can also be a truncate instruction.
524   void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
525                         const InductionDescriptor &ID);
526 
527   /// Create a vector induction phi node based on an existing scalar one. \p
528   /// EntryVal is the value from the original loop that maps to the vector phi
529   /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
530   /// truncate instruction, instead of widening the original IV, we widen a
531   /// version of the IV truncated to \p EntryVal's type.
532   void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
533                                        Value *Step, Instruction *EntryVal);
534 
535   /// Returns true if an instruction \p I should be scalarized instead of
536   /// vectorized for the chosen vectorization factor.
537   bool shouldScalarizeInstruction(Instruction *I) const;
538 
539   /// Returns true if we should generate a scalar version of \p IV.
540   bool needsScalarInduction(Instruction *IV) const;
541 
542   /// If there is a cast involved in the induction variable \p ID, which should
543   /// be ignored in the vectorized loop body, this function records the
544   /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
545   /// cast. We had already proved that the casted Phi is equal to the uncasted
546   /// Phi in the vectorized loop (under a runtime guard), and therefore
547   /// there is no need to vectorize the cast - the same value can be used in the
548   /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified;
  /// otherwise, \p VectorLoopValue is a widened/vectorized value.
551   ///
552   /// \p EntryVal is the value from the original loop that maps to the vector
553   /// phi node and is used to distinguish what is the IV currently being
554   /// processed - original one (if \p EntryVal is a phi corresponding to the
555   /// original IV) or the "newly-created" one based on the proof mentioned above
  /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In the
557   /// latter case \p EntryVal is a TruncInst and we must not record anything for
558   /// that IV, but it's error-prone to expect callers of this routine to care
559   /// about that, hence this explicit parameter.
560   void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
561                                              const Instruction *EntryVal,
562                                              Value *VectorLoopValue,
563                                              unsigned Part,
564                                              unsigned Lane = UINT_MAX);
565 
566   /// Generate a shuffle sequence that will reverse the vector Vec.
567   virtual Value *reverseVector(Value *Vec);
568 
569   /// Returns (and creates if needed) the original loop trip count.
570   Value *getOrCreateTripCount(Loop *NewLoop);
571 
572   /// Returns (and creates if needed) the trip count of the widened loop.
573   Value *getOrCreateVectorTripCount(Loop *NewLoop);
574 
575   /// Returns a bitcasted value to the requested vector type.
576   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
577   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
578                                 const DataLayout &DL);
579 
580   /// Emit a bypass check to see if the vector trip count is zero, including if
581   /// it overflows.
582   void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
583 
584   /// Emit a bypass check to see if all of the SCEV assumptions we've
585   /// had to make are correct.
586   void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
587 
588   /// Emit bypass checks to check any memory assumptions we may have made.
589   void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
590 
591   /// Compute the transformed value of Index at offset StartValue using step
592   /// StepValue.
593   /// For integer induction, returns StartValue + Index * StepValue.
594   /// For pointer induction, returns StartValue[Index * StepValue].
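  /// For example, an integer induction with StartValue 5 and StepValue 3
  /// transforms Index 2 into 5 + 2 * 3 = 11.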
595   /// FIXME: The newly created binary instructions should contain nsw/nuw
596   /// flags, which can be found from the original scalar operations.
597   Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
598                               const DataLayout &DL,
599                               const InductionDescriptor &ID) const;
600 
601   /// Add additional metadata to \p To that was not present on \p Orig.
602   ///
603   /// Currently this is used to add the noalias annotations based on the
604   /// inserted memchecks.  Use this for instructions that are *cloned* into the
605   /// vector loop.
606   void addNewMetadata(Instruction *To, const Instruction *Orig);
607 
608   /// Add metadata from one instruction to another.
609   ///
610   /// This includes both the original MDs from \p From and additional ones (\see
611   /// addNewMetadata).  Use this for *newly created* instructions in the vector
612   /// loop.
613   void addMetadata(Instruction *To, Instruction *From);
614 
615   /// Similar to the previous function but it adds the metadata to a
616   /// vector of instructions.
617   void addMetadata(ArrayRef<Value *> To, Instruction *From);
618 
619   /// The original loop.
620   Loop *OrigLoop;
621 
622   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
623   /// dynamic knowledge to simplify SCEV expressions and converts them to a
624   /// more usable form.
625   PredicatedScalarEvolution &PSE;
626 
627   /// Loop Info.
628   LoopInfo *LI;
629 
630   /// Dominator Tree.
631   DominatorTree *DT;
632 
633   /// Alias Analysis.
634   AliasAnalysis *AA;
635 
636   /// Target Library Info.
637   const TargetLibraryInfo *TLI;
638 
639   /// Target Transform Info.
640   const TargetTransformInfo *TTI;
641 
642   /// Assumption Cache.
643   AssumptionCache *AC;
644 
645   /// Interface to emit optimization remarks.
646   OptimizationRemarkEmitter *ORE;
647 
648   /// LoopVersioning.  It's only set up (non-null) if memchecks were
649   /// used.
650   ///
651   /// This is currently only used to add no-alias metadata based on the
  /// memchecks.  The actual versioning is performed manually.
653   std::unique_ptr<LoopVersioning> LVer;
654 
655   /// The vectorization SIMD factor to use. Each vector will have this many
656   /// vector elements.
657   unsigned VF;
658 
659   /// The vectorization unroll factor to use. Each scalar is vectorized to this
660   /// many different vector instructions.
661   unsigned UF;
662 
663   /// The builder that we use
664   IRBuilder<> Builder;
665 
666   // --- Vectorization state ---
667 
668   /// The vector-loop preheader.
669   BasicBlock *LoopVectorPreHeader;
670 
671   /// The scalar-loop preheader.
672   BasicBlock *LoopScalarPreHeader;
673 
674   /// Middle Block between the vector and the scalar.
675   BasicBlock *LoopMiddleBlock;
676 
677   /// The ExitBlock of the scalar loop.
678   BasicBlock *LoopExitBlock;
679 
680   /// The vector loop body.
681   BasicBlock *LoopVectorBody;
682 
683   /// The scalar loop body.
684   BasicBlock *LoopScalarBody;
685 
686   /// A list of all bypass blocks. The first block is the entry of the loop.
687   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
688 
689   /// The new Induction variable which was added to the new block.
690   PHINode *Induction = nullptr;
691 
692   /// The induction variable of the old basic block.
693   PHINode *OldInduction = nullptr;
694 
695   /// Maps values from the original loop to their corresponding values in the
696   /// vectorized loop. A key value can map to either vector values, scalar
697   /// values or both kinds of values, depending on whether the key was
698   /// vectorized and scalarized.
699   VectorizerValueMap VectorLoopValueMap;
700 
701   /// Store instructions that were predicated.
702   SmallVector<Instruction *, 4> PredicatedInstructions;
703 
704   /// Trip count of the original loop.
705   Value *TripCount = nullptr;
706 
707   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
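  /// For example, a trip count of 10 with VF = 4 and UF = 2 gives a vector
  /// trip count of 8.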
708   Value *VectorTripCount = nullptr;
709 
710   /// The legality analysis.
711   LoopVectorizationLegality *Legal;
712 
  /// The profitability analysis.
714   LoopVectorizationCostModel *Cost;
715 
716   // Record whether runtime checks are added.
717   bool AddedSafetyChecks = false;
718 
719   // Holds the end values for each induction variable. We save the end values
720   // so we can later fix-up the external users of the induction variables.
721   DenseMap<PHINode *, Value *> IVEndValues;
722 
723   // Vector of original scalar PHIs whose corresponding widened PHIs need to be
724   // fixed up at the end of vector code generation.
725   SmallVector<PHINode *, 8> OrigPHIsToFix;
726 };
727 
728 class InnerLoopUnroller : public InnerLoopVectorizer {
729 public:
730   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
731                     LoopInfo *LI, DominatorTree *DT,
732                     const TargetLibraryInfo *TLI,
733                     const TargetTransformInfo *TTI, AssumptionCache *AC,
734                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
735                     LoopVectorizationLegality *LVL,
736                     LoopVectorizationCostModel *CM)
737       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
738                             UnrollFactor, LVL, CM) {}
739 
740 private:
741   Value *getBroadcastInstrs(Value *V) override;
742   Value *getStepVector(Value *Val, int StartIdx, Value *Step,
743                        Instruction::BinaryOps Opcode =
744                        Instruction::BinaryOpsEnd) override;
745   Value *reverseVector(Value *Vec) override;
746 };
747 
748 } // end namespace llvm
749 
/// Look for a meaningful debug location on the instruction or its
/// operands.
752 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
753   if (!I)
754     return I;
755 
756   DebugLoc Empty;
757   if (I->getDebugLoc() != Empty)
758     return I;
759 
760   for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
761     if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
762       if (OpInst->getDebugLoc() != Empty)
763         return OpInst;
764   }
765 
766   return I;
767 }
768 
769 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
770   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
771     const DILocation *DIL = Inst->getDebugLoc();
772     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
773         !isa<DbgInfoIntrinsic>(Inst)) {
774       auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
775       if (NewDIL)
776         B.SetCurrentDebugLocation(NewDIL.getValue());
777       else
778         LLVM_DEBUG(dbgs()
779                    << "Failed to create new discriminator: "
780                    << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B.SetCurrentDebugLocation(DIL);
784   } else
785     B.SetCurrentDebugLocation(DebugLoc());
786 }
787 
788 #ifndef NDEBUG
789 /// \return string containing a file name and a line # for the given loop.
790 static std::string getDebugLocString(const Loop *L) {
791   std::string Result;
792   if (L) {
793     raw_string_ostream OS(Result);
794     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
795       LoopDbgLoc.print(OS);
796     else
797       // Just print the module name.
798       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
799     OS.flush();
800   }
801   return Result;
802 }
803 #endif
804 
805 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
806                                          const Instruction *Orig) {
807   // If the loop was versioned with memchecks, add the corresponding no-alias
808   // metadata.
809   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
810     LVer->annotateInstWithNoAlias(To, Orig);
811 }
812 
813 void InnerLoopVectorizer::addMetadata(Instruction *To,
814                                       Instruction *From) {
815   propagateMetadata(To, From);
816   addNewMetadata(To, From);
817 }
818 
819 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
820                                       Instruction *From) {
821   for (Value *V : To) {
822     if (Instruction *I = dyn_cast<Instruction>(V))
823       addMetadata(I, From);
824   }
825 }
826 
827 namespace llvm {
828 
829 /// LoopVectorizationCostModel - estimates the expected speedups due to
830 /// vectorization.
/// In many cases vectorization is not profitable. This can happen for a
/// number of reasons. In this class we mainly attempt to predict the
/// expected speedups/slowdowns due to the supported instruction set. We use the
834 /// TargetTransformInfo to query the different backends for the cost of
835 /// different operations.
836 class LoopVectorizationCostModel {
837 public:
838   LoopVectorizationCostModel(Loop *L, PredicatedScalarEvolution &PSE,
839                              LoopInfo *LI, LoopVectorizationLegality *Legal,
840                              const TargetTransformInfo &TTI,
841                              const TargetLibraryInfo *TLI, DemandedBits *DB,
842                              AssumptionCache *AC,
843                              OptimizationRemarkEmitter *ORE, const Function *F,
844                              const LoopVectorizeHints *Hints,
845                              InterleavedAccessInfo &IAI)
      : TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB),
        AC(AC), ORE(ORE), TheFunction(F), Hints(Hints), InterleaveInfo(IAI) {}
848 
849   /// \return An upper bound for the vectorization factor, or None if
850   /// vectorization and interleaving should be avoided up front.
851   Optional<unsigned> computeMaxVF(bool OptForSize);
852 
853   /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not
  /// zero, then that vectorization factor will be selected if vectorization
  /// is possible.
857   VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
858 
  /// Set up cost-based decisions for the user vectorization factor.
860   void selectUserVectorizationFactor(unsigned UserVF) {
861     collectUniformsAndScalars(UserVF);
862     collectInstsToScalarize(UserVF);
863   }
864 
865   /// \return The size (in bits) of the smallest and widest types in the code
866   /// that needs to be vectorized. We ignore values that remain scalar such as
867   /// 64 bit loop indices.
868   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
869 
870   /// \return The desired interleave count.
871   /// If interleave count has been specified by metadata it will be returned.
872   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
873   /// are the selected vectorization factor and the cost of the selected VF.
874   unsigned selectInterleaveCount(bool OptForSize, unsigned VF,
875                                  unsigned LoopCost);
876 
  /// A memory access instruction may be vectorized in more than one way.
  /// The form of the instruction after vectorization depends on its cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
884   void setCostBasedWideningDecision(unsigned VF);
885 
886   /// A struct that represents some properties of the register usage
887   /// of a loop.
888   struct RegisterUsage {
889     /// Holds the number of loop invariant values that are used in the loop.
890     unsigned LoopInvariantRegs;
891 
892     /// Holds the maximum number of concurrent live intervals in the loop.
893     unsigned MaxLocalUsers;
894   };
895 
896   /// \return Returns information about the register usages of the loop for the
897   /// given vectorization factors.
898   SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);
899 
900   /// Collect values we want to ignore in the cost model.
901   void collectValuesToIgnore();
902 
903   /// \returns The smallest bitwidth each instruction can be represented with.
904   /// The vector equivalents of these instructions should be truncated to this
905   /// type.
906   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
907     return MinBWs;
908   }
909 
910   /// \returns True if it is more profitable to scalarize instruction \p I for
911   /// vectorization factor \p VF.
912   bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
913     assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");
914 
915     // Cost model is not run in the VPlan-native path - return conservative
916     // result until this changes.
917     if (EnableVPlanNativePath)
918       return false;
919 
920     auto Scalars = InstsToScalarize.find(VF);
921     assert(Scalars != InstsToScalarize.end() &&
922            "VF not yet analyzed for scalarization profitability");
923     return Scalars->second.find(I) != Scalars->second.end();
924   }
925 
926   /// Returns true if \p I is known to be uniform after vectorization.
927   bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
928     if (VF == 1)
929       return true;
930 
931     // Cost model is not run in the VPlan-native path - return conservative
932     // result until this changes.
933     if (EnableVPlanNativePath)
934       return false;
935 
936     auto UniformsPerVF = Uniforms.find(VF);
937     assert(UniformsPerVF != Uniforms.end() &&
938            "VF not yet analyzed for uniformity");
939     return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
940   }
941 
942   /// Returns true if \p I is known to be scalar after vectorization.
943   bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
944     if (VF == 1)
945       return true;
946 
947     // Cost model is not run in the VPlan-native path - return conservative
948     // result until this changes.
949     if (EnableVPlanNativePath)
950       return false;
951 
952     auto ScalarsPerVF = Scalars.find(VF);
953     assert(ScalarsPerVF != Scalars.end() &&
954            "Scalar values are not calculated for VF");
955     return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
956   }
957 
958   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
959   /// for vectorization factor \p VF.
960   bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
961     return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
962            !isProfitableToScalarize(I, VF) &&
963            !isScalarAfterVectorization(I, VF);
964   }
965 
966   /// Decision that was taken during cost calculation for memory instruction.
967   enum InstWidening {
968     CM_Unknown,
969     CM_Widen,         // For consecutive accesses with stride +1.
970     CM_Widen_Reverse, // For consecutive accesses with stride -1.
971     CM_Interleave,
972     CM_GatherScatter,
973     CM_Scalarize
974   };
975 
976   /// Save vectorization decision \p W and \p Cost taken by the cost model for
977   /// instruction \p I and vector width \p VF.
978   void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
979                            unsigned Cost) {
980     assert(VF >= 2 && "Expected VF >=2");
981     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
982   }
983 
984   /// Save vectorization decision \p W and \p Cost taken by the cost model for
985   /// interleaving group \p Grp and vector width \p VF.
986   void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
987                            InstWidening W, unsigned Cost) {
988     assert(VF >= 2 && "Expected VF >=2");
    // Broadcast this decision to all instructions inside the group.
    // But the cost will be assigned to one instruction only.
991     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
992       if (auto *I = Grp->getMember(i)) {
993         if (Grp->getInsertPos() == I)
994           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
995         else
996           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
997       }
998     }
999   }
1000 
1001   /// Return the cost model decision for the given instruction \p I and vector
1002   /// width \p VF. Return CM_Unknown if this instruction did not pass
1003   /// through the cost modeling.
1004   InstWidening getWideningDecision(Instruction *I, unsigned VF) {
1005     assert(VF >= 2 && "Expected VF >=2");
1006 
1007     // Cost model is not run in the VPlan-native path - return conservative
1008     // result until this changes.
1009     if (EnableVPlanNativePath)
1010       return CM_GatherScatter;
1011 
1012     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1013     auto Itr = WideningDecisions.find(InstOnVF);
1014     if (Itr == WideningDecisions.end())
1015       return CM_Unknown;
1016     return Itr->second.first;
1017   }
1018 
1019   /// Return the vectorization cost for the given instruction \p I and vector
1020   /// width \p VF.
1021   unsigned getWideningCost(Instruction *I, unsigned VF) {
1022     assert(VF >= 2 && "Expected VF >=2");
1023     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1024     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1025            "The cost is not calculated");
1026     return WideningDecisions[InstOnVF].second;
1027   }
1028 
1029   /// Return True if instruction \p I is an optimizable truncate whose operand
1030   /// is an induction variable. Such a truncate will be removed by adding a new
1031   /// induction variable with the destination type.
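  /// For example, a trunc of an i64 induction variable to i32 can be
  /// replaced by a new i32 induction variable that produces the truncated
  /// values directly.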
1032   bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
1033     // If the instruction is not a truncate, return false.
1034     auto *Trunc = dyn_cast<TruncInst>(I);
1035     if (!Trunc)
1036       return false;
1037 
1038     // Get the source and destination types of the truncate.
1039     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1040     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1041 
1042     // If the truncate is free for the given types, return false. Replacing a
1043     // free truncate with an induction variable would add an induction variable
1044     // update instruction to each iteration of the loop. We exclude from this
1045     // check the primary induction variable since it will need an update
1046     // instruction regardless.
1047     Value *Op = Trunc->getOperand(0);
1048     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1049       return false;
1050 
1051     // If the truncated value is not an induction variable, return false.
1052     return Legal->isInductionPhi(Op);
1053   }
1054 
1055   /// Collects the instructions to scalarize for each predicated instruction in
1056   /// the loop.
1057   void collectInstsToScalarize(unsigned VF);
1058 
1059   /// Collect Uniform and Scalar values for the given \p VF.
1060   /// The sets depend on CM decision for Load/Store instructions
1061   /// that may be vectorized as interleave, gather-scatter or scalarized.
1062   void collectUniformsAndScalars(unsigned VF) {
1063     // Do the analysis once.
1064     if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
1065       return;
1066     setCostBasedWideningDecision(VF);
1067     collectLoopUniforms(VF);
1068     collectLoopScalars(VF);
1069   }
1070 
1071   /// Returns true if the target machine supports masked store operation
1072   /// for the given \p DataType and kind of access to \p Ptr.
1073   bool isLegalMaskedStore(Type *DataType, Value *Ptr) {
1074     return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedStore(DataType);
1075   }
1076 
1077   /// Returns true if the target machine supports masked load operation
1078   /// for the given \p DataType and kind of access to \p Ptr.
1079   bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {
1080     return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedLoad(DataType);
1081   }
1082 
1083   /// Returns true if the target machine supports masked scatter operation
1084   /// for the given \p DataType.
1085   bool isLegalMaskedScatter(Type *DataType) {
1086     return TTI.isLegalMaskedScatter(DataType);
1087   }
1088 
1089   /// Returns true if the target machine supports masked gather operation
1090   /// for the given \p DataType.
1091   bool isLegalMaskedGather(Type *DataType) {
1092     return TTI.isLegalMaskedGather(DataType);
1093   }
1094 
1095   /// Returns true if the target machine can represent \p V as a masked gather
1096   /// or scatter operation.
1097   bool isLegalGatherOrScatter(Value *V) {
1098     bool LI = isa<LoadInst>(V);
1099     bool SI = isa<StoreInst>(V);
1100     if (!LI && !SI)
1101       return false;
1102     auto *Ty = getMemInstValueType(V);
1103     return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty));
1104   }
1105 
1106   /// Returns true if \p I is an instruction that will be scalarized with
1107   /// predication. Such instructions include conditional stores and
1108   /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
1111   bool isScalarWithPredication(Instruction *I, unsigned VF = 1);
1112 
1113   // Returns true if \p I is an instruction that will be predicated either
1114   // through scalar predication or masked load/store or masked gather/scatter.
1115   // Superset of instructions that return true for isScalarWithPredication.
1116   bool isPredicatedInst(Instruction *I) {
1117     if (!blockNeedsPredication(I->getParent()))
1118       return false;
1119     // Loads and stores that need some form of masked operation are predicated
1120     // instructions.
1121     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1122       return Legal->isMaskRequired(I);
1123     return isScalarWithPredication(I);
1124   }
1125 
1126   /// Returns true if \p I is a memory instruction with consecutive memory
1127   /// access that can be widened.
1128   bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
1129 
1130   /// Returns true if \p I is a memory instruction in an interleaved-group
1131   /// of memory accesses that can be vectorized with wide vector loads/stores
1132   /// and shuffles.
1133   bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);
1134 
1135   /// Check if \p Instr belongs to any interleaved access group.
1136   bool isAccessInterleaved(Instruction *Instr) {
1137     return InterleaveInfo.isInterleaved(Instr);
1138   }
1139 
1140   /// Get the interleaved access group that \p Instr belongs to.
1141   const InterleaveGroup<Instruction> *
1142   getInterleavedAccessGroup(Instruction *Instr) {
1143     return InterleaveInfo.getInterleaveGroup(Instr);
1144   }
1145 
1146   /// Returns true if an interleaved group requires a scalar iteration
1147   /// to handle accesses with gaps, and there is nothing preventing us from
1148   /// creating a scalar epilogue.
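  /// For example, when the last member of an interleave group is not
  /// accessed, the widened load of the group could touch memory past the
  /// elements the scalar loop would access, so the final iterations are
  /// peeled off into a scalar epilogue.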
1149   bool requiresScalarEpilogue() const {
1150     return IsScalarEpilogueAllowed && InterleaveInfo.requiresScalarEpilogue();
1151   }
1152 
1153   /// Returns true if a scalar epilogue is not allowed due to optsize.
1154   bool isScalarEpilogueAllowed() const { return IsScalarEpilogueAllowed; }
1155 
1156   /// Returns true if all loop blocks should be masked to fold tail loop.
1157   bool foldTailByMasking() const { return FoldTailByMasking; }
1158 
1159   bool blockNeedsPredication(BasicBlock *BB) {
1160     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1161   }
1162 
1163 private:
1164   unsigned NumPredStores = 0;
1165 
1166   /// \return An upper bound for the vectorization factor, larger than zero.
1167   /// One is returned if vectorization should best be avoided due to cost.
1168   unsigned computeFeasibleMaxVF(bool OptForSize, unsigned ConstTripCount);
1169 
  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
1177   using VectorizationCostTy = std::pair<unsigned, bool>;
1178 
1179   /// Returns the expected execution cost. The unit of the cost does
1180   /// not matter because we use the 'cost' units to compare different
1181   /// vector widths. The cost that is returned is *not* normalized by
1182   /// the factor width.
1183   VectorizationCostTy expectedCost(unsigned VF);
1184 
1185   /// Returns the execution time cost of an instruction for a given vector
1186   /// width. Vector width of one means scalar.
1187   VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
1188 
1189   /// The cost-computation logic from getInstructionCost which provides
1190   /// the vector type as an output parameter.
1191   unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
1192 
1193   /// Calculate vectorization cost of memory instruction \p I.
1194   unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);
1195 
1196   /// The cost computation for scalarized memory instruction.
1197   unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
1198 
1199   /// The cost computation for interleaving group of memory instructions.
1200   unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
1201 
1202   /// The cost computation for Gather/Scatter instruction.
1203   unsigned getGatherScatterCost(Instruction *I, unsigned VF);
1204 
1205   /// The cost computation for widening instruction \p I with consecutive
1206   /// memory access.
1207   unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
1208 
1209   /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1210   /// Load: scalar load + broadcast.
1211   /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1212   /// element)
1213   unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
1214 
  /// Returns whether the instruction is a load or store and will be
  /// emitted as a vector operation.
1217   bool isConsecutiveLoadOrStore(Instruction *I);
1218 
1219   /// Returns true if an artificially high cost for emulated masked memrefs
1220   /// should be used.
1221   bool useEmulatedMaskMemRefHack(Instruction *I);
1222 
1223   /// Create an analysis remark that explains why vectorization failed
1224   ///
1225   /// \p RemarkName is the identifier for the remark.  \return the remark object
1226   /// that can be streamed to.
1227   OptimizationRemarkAnalysis createMissedAnalysis(StringRef RemarkName) {
1228     return createLVMissedAnalysis(Hints->vectorizeAnalysisPassName(),
1229                                   RemarkName, TheLoop);
1230   }
1231 
1232   /// Map of scalar integer values to the smallest bitwidth they can be legally
1233   /// represented as. The vector equivalents of these values should be truncated
1234   /// to this type.
1235   MapVector<Instruction *, uint64_t> MinBWs;
1236 
1237   /// A type representing the costs for instructions if they were to be
1238   /// scalarized rather than vectorized. The entries are Instruction-Cost
1239   /// pairs.
1240   using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1241 
  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
1244   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1245 
1246   /// Records whether it is allowed to have the original scalar loop execute at
1247   /// least once. This may be needed as a fallback loop in case runtime
1248   /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or is not a multiple of the VF,
1250   /// or as a peel-loop to handle gaps in interleave-groups.
1251   /// Under optsize and when the trip count is very small we don't allow any
1252   /// iterations to execute in the scalar loop.
1253   bool IsScalarEpilogueAllowed = true;
1254 
1255   /// All blocks of loop are to be masked to fold tail of scalar iterations.
1256   bool FoldTailByMasking = false;
1257 
1258   /// A map holding scalar costs for different vectorization factors. The
1259   /// presence of a cost for an instruction in the mapping indicates that the
1260   /// instruction will be scalarized when vectorizing with the associated
1261   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1262   DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
1263 
1264   /// Holds the instructions known to be uniform after vectorization.
1265   /// The data is collected per VF.
1266   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;
1267 
1268   /// Holds the instructions known to be scalar after vectorization.
1269   /// The data is collected per VF.
1270   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;
1271 
1272   /// Holds the instructions (address computations) that are forced to be
1273   /// scalarized.
1274   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1275 
1276   /// Returns the expected difference in cost from scalarizing the expression
1277   /// feeding a predicated instruction \p PredInst. The instructions to
1278   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1279   /// non-negative return value implies the expression will be scalarized.
1280   /// Currently, only single-use chains are considered for scalarization.
1281   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1282                               unsigned VF);
1283 
1284   /// Collect the instructions that are uniform after vectorization. An
1285   /// instruction is uniform if we represent it with a single scalar value in
1286   /// the vectorized loop corresponding to each vector iteration. Examples of
1287   /// uniform instructions include pointer operands of consecutive or
1288   /// interleaved memory accesses. Note that although uniformity implies an
1289   /// instruction will be scalar, the reverse is not true. In general, a
1290   /// scalarized instruction will be represented by VF scalar values in the
1291   /// vectorized loop, each corresponding to an iteration of the original
1292   /// scalar loop.
1293   void collectLoopUniforms(unsigned VF);
1294 
1295   /// Collect the instructions that are scalar after vectorization. An
1296   /// instruction is scalar if it is known to be uniform or will be scalarized
1297   /// during vectorization. Non-uniform scalarized instructions will be
1298   /// represented by VF values in the vectorized loop, each corresponding to an
1299   /// iteration of the original scalar loop.
1300   void collectLoopScalars(unsigned VF);
1301 
  /// Keeps the cost-model vectorization decision and cost for each instruction.
1303   /// Right now it is used for memory instructions only.
1304   using DecisionList = DenseMap<std::pair<Instruction *, unsigned>,
1305                                 std::pair<InstWidening, unsigned>>;
1306 
1307   DecisionList WideningDecisions;
1308 
1309 public:
1310   /// The loop that we evaluate.
1311   Loop *TheLoop;
1312 
1313   /// Predicated scalar evolution analysis.
1314   PredicatedScalarEvolution &PSE;
1315 
1316   /// Loop Info analysis.
1317   LoopInfo *LI;
1318 
1319   /// Vectorization legality.
1320   LoopVectorizationLegality *Legal;
1321 
1322   /// Vector target information.
1323   const TargetTransformInfo &TTI;
1324 
1325   /// Target Library Info.
1326   const TargetLibraryInfo *TLI;
1327 
1328   /// Demanded bits analysis.
1329   DemandedBits *DB;
1330 
1331   /// Assumption cache.
1332   AssumptionCache *AC;
1333 
1334   /// Interface to emit optimization remarks.
1335   OptimizationRemarkEmitter *ORE;
1336 
1337   const Function *TheFunction;
1338 
1339   /// Loop Vectorize Hint.
1340   const LoopVectorizeHints *Hints;
1341 
  /// The interleaved access information contains groups of interleaved
  /// accesses with the same stride that are close to each other.
1344   InterleavedAccessInfo &InterleaveInfo;
1345 
1346   /// Values to ignore in the cost model.
1347   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1348 
1349   /// Values to ignore in the cost model when VF > 1.
1350   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1351 };
1352 
1353 } // end namespace llvm
1354 
1355 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
1356 // vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If
// the vector length information is not provided, vectorization is not
// considered explicit. Interleave hints are not allowed either. These
// limitations will be relaxed in the future.
// Please note that we are currently forced to abuse the pragma 'clang
1362 // vectorize' semantics. This pragma provides *auto-vectorization hints*
1363 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1364 // provides *explicit vectorization hints* (LV can bypass legal checks and
1365 // assume that vectorization is legal). However, both hints are implemented
1366 // using the same metadata (llvm.loop.vectorize, processed by
1367 // LoopVectorizeHints). This will be fixed in the future when the native IR
1368 // representation for pragma 'omp simd' is introduced.
1369 static bool isExplicitVecOuterLoop(Loop *OuterLp,
1370                                    OptimizationRemarkEmitter *ORE) {
1371   assert(!OuterLp->empty() && "This is not an outer loop");
1372   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1373 
1374   // Only outer loops with an explicit vectorization hint are supported.
1375   // Unannotated outer loops are ignored.
1376   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
1377     return false;
1378 
1379   Function *Fn = OuterLp->getHeader()->getParent();
1380   if (!Hints.allowVectorization(Fn, OuterLp,
1381                                 true /*VectorizeOnlyWhenForced*/)) {
1382     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1383     return false;
1384   }
1385 
1386   if (!Hints.getWidth()) {
1387     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: No user vector width.\n");
1388     Hints.emitRemarkWithHints();
1389     return false;
1390   }
1391 
1392   if (Hints.getInterleave() > 1) {
1393     // TODO: Interleave support is future work.
1394     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1395                          "outer loops.\n");
1396     Hints.emitRemarkWithHints();
1397     return false;
1398   }
1399 
1400   return true;
1401 }
1402 
1403 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
1404                                   OptimizationRemarkEmitter *ORE,
1405                                   SmallVectorImpl<Loop *> &V) {
1406   // Collect inner loops and outer loops without irreducible control flow. For
1407   // now, only collect outer loops that have explicit vectorization hints. If we
1408   // are stress testing the VPlan H-CFG construction, we collect the outermost
1409   // loop of every loop nest.
1410   if (L.empty() || VPlanBuildStressTest ||
1411       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
1412     LoopBlocksRPO RPOT(&L);
1413     RPOT.perform(LI);
1414     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1415       V.push_back(&L);
1416       // TODO: Collect inner loops inside marked outer loops in case
1417       // vectorization fails for the outer loop. Do not invoke
1418       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1419       // already known to be reducible. We can use an inherited attribute for
1420       // that.
1421       return;
1422     }
1423   }
1424   for (Loop *InnerL : L)
1425     collectSupportedLoops(*InnerL, LI, ORE, V);
1426 }
1427 
1428 namespace {
1429 
1430 /// The LoopVectorize Pass.
1431 struct LoopVectorize : public FunctionPass {
1432   /// Pass identification, replacement for typeid
1433   static char ID;
1434 
1435   LoopVectorizePass Impl;
1436 
1437   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
1438                          bool VectorizeOnlyWhenForced = false)
1439       : FunctionPass(ID) {
1440     Impl.InterleaveOnlyWhenForced = InterleaveOnlyWhenForced;
1441     Impl.VectorizeOnlyWhenForced = VectorizeOnlyWhenForced;
1442     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
1443   }
1444 
1445   bool runOnFunction(Function &F) override {
1446     if (skipFunction(F))
1447       return false;
1448 
1449     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1450     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1451     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1452     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1453     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
1454     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
1455     auto *TLI = TLIP ? &TLIP->getTLI() : nullptr;
1456     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1457     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1458     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
1459     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
1460     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1461 
1462     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
1463         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
1464 
1465     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
1466                         GetLAA, *ORE);
1467   }
1468 
1469   void getAnalysisUsage(AnalysisUsage &AU) const override {
1470     AU.addRequired<AssumptionCacheTracker>();
1471     AU.addRequired<BlockFrequencyInfoWrapperPass>();
1472     AU.addRequired<DominatorTreeWrapperPass>();
1473     AU.addRequired<LoopInfoWrapperPass>();
1474     AU.addRequired<ScalarEvolutionWrapperPass>();
1475     AU.addRequired<TargetTransformInfoWrapperPass>();
1476     AU.addRequired<AAResultsWrapperPass>();
1477     AU.addRequired<LoopAccessLegacyAnalysis>();
1478     AU.addRequired<DemandedBitsWrapperPass>();
1479     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
1480 
1481     // We currently do not preserve loopinfo/dominator analyses with outer loop
1482     // vectorization. Until this is addressed, mark these analyses as preserved
1483     // only for non-VPlan-native path.
1484     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
1485     if (!EnableVPlanNativePath) {
1486       AU.addPreserved<LoopInfoWrapperPass>();
1487       AU.addPreserved<DominatorTreeWrapperPass>();
1488     }
1489 
1490     AU.addPreserved<BasicAAWrapperPass>();
1491     AU.addPreserved<GlobalsAAWrapperPass>();
1492   }
1493 };
1494 
1495 } // end anonymous namespace
1496 
1497 //===----------------------------------------------------------------------===//
1498 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1499 // LoopVectorizationCostModel and LoopVectorizationPlanner.
1500 //===----------------------------------------------------------------------===//
1501 
1502 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
  // We need to place the broadcast of invariant variables outside the loop,
  // but only if it's proven safe to do so. Otherwise, the broadcast will be
  // placed inside the vector loop body.
1506   Instruction *Instr = dyn_cast<Instruction>(V);
1507   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
1508                      (!Instr ||
1509                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
1510   // Place the code for broadcasting invariant variables in the new preheader.
1511   IRBuilder<>::InsertPointGuard Guard(Builder);
1512   if (SafeToHoist)
1513     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1514 
1515   // Broadcast the scalar into all locations in the vector.
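  // For illustration only: with VF = 4, the splat created below is roughly
  //   %b.ins = insertelement <4 x Ty> undef, Ty %V, i32 0
  //   %b     = shufflevector <4 x Ty> %b.ins, <4 x Ty> undef, zeroinitializer
  // (%b.ins and %b are illustrative names; IRBuilder picks the actual names).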
1516   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
1517 
1518   return Shuf;
1519 }
1520 
1521 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
1522     const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
1523   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1524          "Expected either an induction phi-node or a truncate of it!");
1525   Value *Start = II.getStartValue();
1526 
1527   // Construct the initial value of the vector IV in the vector loop preheader
1528   auto CurrIP = Builder.saveIP();
1529   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1530   if (isa<TruncInst>(EntryVal)) {
1531     assert(Start->getType()->isIntegerTy() &&
1532            "Truncation requires an integer type");
1533     auto *TruncType = cast<IntegerType>(EntryVal->getType());
1534     Step = Builder.CreateTrunc(Step, TruncType);
1535     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
1536   }
1537   Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
1538   Value *SteppedStart =
1539       getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
1540 
1541   // We create vector phi nodes for both integer and floating-point induction
1542   // variables. Here, we determine the kind of arithmetic we will perform.
1543   Instruction::BinaryOps AddOp;
1544   Instruction::BinaryOps MulOp;
1545   if (Step->getType()->isIntegerTy()) {
1546     AddOp = Instruction::Add;
1547     MulOp = Instruction::Mul;
1548   } else {
1549     AddOp = II.getInductionOpcode();
1550     MulOp = Instruction::FMul;
1551   }
1552 
1553   // Multiply the vectorization factor by the step using integer or
1554   // floating-point arithmetic as appropriate.
1555   Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
1556   Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
1557 
1558   // Create a vector splat to use in the induction update.
1559   //
1560   // FIXME: If the step is non-constant, we create the vector splat with
1561   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
1562   //        handle a constant vector splat.
1563   Value *SplatVF = isa<Constant>(Mul)
1564                        ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
1565                        : Builder.CreateVectorSplat(VF, Mul);
1566   Builder.restoreIP(CurrIP);
1567 
1568   // We may need to add the step a number of times, depending on the unroll
1569   // factor. The last of those goes into the PHI.
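  // For illustration only (assuming an integer IV that starts at 0 with step
  // 1, VF = 4 and UF = 2), the code below produces roughly:
  //   %vec.ind      = phi <4 x i32> [ <0, 1, 2, 3>, %vector.ph ],
  //                                 [ %vec.ind.next, %latch ]
  //   %step.add     = add <4 x i32> %vec.ind,  <4, 4, 4, 4>   ; part 1 value
  //   %vec.ind.next = add <4 x i32> %step.add, <4, 4, 4, 4>   ; in the latch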
1570   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
1571                                     &*LoopVectorBody->getFirstInsertionPt());
1572   VecInd->setDebugLoc(EntryVal->getDebugLoc());
1573   Instruction *LastInduction = VecInd;
1574   for (unsigned Part = 0; Part < UF; ++Part) {
1575     VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
1576 
1577     if (isa<TruncInst>(EntryVal))
1578       addMetadata(LastInduction, EntryVal);
1579     recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
1580 
1581     LastInduction = cast<Instruction>(addFastMathFlag(
1582         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
1583     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
1584   }
1585 
1586   // Move the last step to the end of the latch block. This ensures consistent
1587   // placement of all induction updates.
1588   auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
1589   auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
1590   auto *ICmp = cast<Instruction>(Br->getCondition());
1591   LastInduction->moveBefore(ICmp);
1592   LastInduction->setName("vec.ind.next");
1593 
1594   VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
1595   VecInd->addIncoming(LastInduction, LoopVectorLatch);
1596 }
1597 
1598 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
1599   return Cost->isScalarAfterVectorization(I, VF) ||
1600          Cost->isProfitableToScalarize(I, VF);
1601 }
1602 
1603 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
1604   if (shouldScalarizeInstruction(IV))
1605     return true;
1606   auto isScalarInst = [&](User *U) -> bool {
1607     auto *I = cast<Instruction>(U);
1608     return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
1609   };
1610   return llvm::any_of(IV->users(), isScalarInst);
1611 }
1612 
1613 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
1614     const InductionDescriptor &ID, const Instruction *EntryVal,
1615     Value *VectorLoopVal, unsigned Part, unsigned Lane) {
1616   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1617          "Expected either an induction phi-node or a truncate of it!");
1618 
  // This induction variable is not the phi from the original loop but the
  // newly-created IV, based on the proof that the casted Phi is equal to the
  // uncasted Phi in the vectorized loop (possibly under a runtime guard). It
  // re-uses the same InductionDescriptor as the original IV, but we don't
  // have to do any recording in this case - that is done when the original
  // IV is processed.
1625   if (isa<TruncInst>(EntryVal))
1626     return;
1627 
1628   const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
1629   if (Casts.empty())
1630     return;
1631   // Only the first Cast instruction in the Casts vector is of interest.
  // The rest of the Casts (if they exist) have no uses outside the
1633   // induction update chain itself.
1634   Instruction *CastInst = *Casts.begin();
1635   if (Lane < UINT_MAX)
1636     VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
1637   else
1638     VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
1639 }
1640 
1641 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
1642   assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
1643          "Primary induction variable must have an integer type");
1644 
1645   auto II = Legal->getInductionVars()->find(IV);
1646   assert(II != Legal->getInductionVars()->end() && "IV is not an induction");
1647 
1648   auto ID = II->second;
1649   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
1650 
1651   // The scalar value to broadcast. This will be derived from the canonical
1652   // induction variable.
1653   Value *ScalarIV = nullptr;
1654 
1655   // The value from the original loop to which we are mapping the new induction
1656   // variable.
1657   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
1658 
1659   // True if we have vectorized the induction variable.
1660   auto VectorizedIV = false;
1661 
1662   // Determine if we want a scalar version of the induction variable. This is
1663   // true if the induction variable itself is not widened, or if it has at
1664   // least one user in the loop that is not widened.
1665   auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal);
1666 
  // Generate code for the induction step. Note that induction steps are
  // required to be loop-invariant.
1669   assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) &&
1670          "Induction step should be loop invariant");
1671   auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
1672   Value *Step = nullptr;
1673   if (PSE.getSE()->isSCEVable(IV->getType())) {
1674     SCEVExpander Exp(*PSE.getSE(), DL, "induction");
1675     Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
1676                              LoopVectorPreHeader->getTerminator());
1677   } else {
1678     Step = cast<SCEVUnknown>(ID.getStep())->getValue();
1679   }
1680 
1681   // Try to create a new independent vector induction variable. If we can't
1682   // create the phi node, we will splat the scalar induction variable in each
1683   // loop iteration.
1684   if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) {
1685     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
1686     VectorizedIV = true;
1687   }
1688 
1689   // If we haven't yet vectorized the induction variable, or if we will create
1690   // a scalar one, we need to define the scalar induction variable and step
1691   // values. If we were given a truncation type, truncate the canonical
1692   // induction variable and step. Otherwise, derive these values from the
1693   // induction descriptor.
1694   if (!VectorizedIV || NeedsScalarIV) {
1695     ScalarIV = Induction;
1696     if (IV != OldInduction) {
1697       ScalarIV = IV->getType()->isIntegerTy()
1698                      ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
1699                      : Builder.CreateCast(Instruction::SIToFP, Induction,
1700                                           IV->getType());
1701       ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
1702       ScalarIV->setName("offset.idx");
1703     }
1704     if (Trunc) {
1705       auto *TruncType = cast<IntegerType>(Trunc->getType());
1706       assert(Step->getType()->isIntegerTy() &&
1707              "Truncation requires an integer step");
1708       ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
1709       Step = Builder.CreateTrunc(Step, TruncType);
1710     }
1711   }
1712 
1713   // If we haven't yet vectorized the induction variable, splat the scalar
1714   // induction variable, and build the necessary step vectors.
1715   // TODO: Don't do it unless the vectorized IV is really required.
1716   if (!VectorizedIV) {
1717     Value *Broadcasted = getBroadcastInstrs(ScalarIV);
1718     for (unsigned Part = 0; Part < UF; ++Part) {
1719       Value *EntryPart =
1720           getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
1721       VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
1722       if (Trunc)
1723         addMetadata(EntryPart, Trunc);
1724       recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
1725     }
1726   }
1727 
1728   // If an induction variable is only used for counting loop iterations or
1729   // calculating addresses, it doesn't need to be widened. Create scalar steps
1730   // that can be used by instructions we will later scalarize. Note that the
1731   // addition of the scalar steps will not increase the number of instructions
1732   // in the loop in the common case prior to InstCombine. We will be trading
1733   // one vector extract for each scalar step.
1734   if (NeedsScalarIV)
1735     buildScalarSteps(ScalarIV, Step, EntryVal, ID);
1736 }
1737 
1738 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
1739                                           Instruction::BinaryOps BinOp) {
1740   // Create and check the types.
1741   assert(Val->getType()->isVectorTy() && "Must be a vector");
1742   int VLen = Val->getType()->getVectorNumElements();
1743 
1744   Type *STy = Val->getType()->getScalarType();
1745   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
1746          "Induction Step must be an integer or FP");
1747   assert(Step->getType() == STy && "Step has wrong type");
1748 
1749   SmallVector<Constant *, 8> Indices;
1750 
1751   if (STy->isIntegerTy()) {
1752     // Create a vector of consecutive numbers from zero to VF.
1753     for (int i = 0; i < VLen; ++i)
1754       Indices.push_back(ConstantInt::get(STy, StartIdx + i));
1755 
1756     // Add the consecutive indices to the vector value.
1757     Constant *Cv = ConstantVector::get(Indices);
1758     assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
1759     Step = Builder.CreateVectorSplat(VLen, Step);
1760     assert(Step->getType() == Val->getType() && "Invalid step vec");
1761     // FIXME: The newly created binary instructions should contain nsw/nuw flags,
1762     // which can be found from the original scalar operations.
1763     Step = Builder.CreateMul(Cv, Step);
1764     return Builder.CreateAdd(Val, Step, "induction");
1765   }
1766 
1767   // Floating point induction.
1768   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
1769          "Binary Opcode should be specified for FP induction");
1770   // Create a vector of consecutive numbers from zero to VF.
1771   for (int i = 0; i < VLen; ++i)
1772     Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
1773 
1774   // Add the consecutive indices to the vector value.
1775   Constant *Cv = ConstantVector::get(Indices);
1776 
1777   Step = Builder.CreateVectorSplat(VLen, Step);
1778 
1779   // Floating point operations had to be 'fast' to enable the induction.
1780   FastMathFlags Flags;
1781   Flags.setFast();
1782 
1783   Value *MulOp = Builder.CreateFMul(Cv, Step);
1784   if (isa<Instruction>(MulOp))
    // Have to check: MulOp may have been folded to a constant.
1786     cast<Instruction>(MulOp)->setFastMathFlags(Flags);
1787 
1788   Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
1789   if (isa<Instruction>(BOp))
1790     cast<Instruction>(BOp)->setFastMathFlags(Flags);
1791   return BOp;
1792 }
1793 
1794 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
1795                                            Instruction *EntryVal,
1796                                            const InductionDescriptor &ID) {
1797   // We shouldn't have to build scalar steps if we aren't vectorizing.
1798   assert(VF > 1 && "VF should be greater than one");
1799 
1800   // Get the value type and ensure it and the step have the same integer type.
1801   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
1802   assert(ScalarIVTy == Step->getType() &&
1803          "Val and Step should have the same type");
1804 
1805   // We build scalar steps for both integer and floating-point induction
1806   // variables. Here, we determine the kind of arithmetic we will perform.
1807   Instruction::BinaryOps AddOp;
1808   Instruction::BinaryOps MulOp;
1809   if (ScalarIVTy->isIntegerTy()) {
1810     AddOp = Instruction::Add;
1811     MulOp = Instruction::Mul;
1812   } else {
1813     AddOp = ID.getInductionOpcode();
1814     MulOp = Instruction::FMul;
1815   }
1816 
1817   // Determine the number of scalars we need to generate for each unroll
1818   // iteration. If EntryVal is uniform, we only need to generate the first
1819   // lane. Otherwise, we generate all VF values.
1820   unsigned Lanes =
1821       Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1
1822                                                                          : VF;
1823   // Compute the scalar steps and save the results in VectorLoopValueMap.
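  // For example, with VF = 4 and UF = 2 the value for (Part, Lane) is
  // ScalarIV + (Part * 4 + Lane) * Step, i.e. offsets 0..3 for part 0 and
  // 4..7 for part 1 (only lane 0 is generated when EntryVal is uniform).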
1824   for (unsigned Part = 0; Part < UF; ++Part) {
1825     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
1826       auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
1827       auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
1828       auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
1829       VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
1830       recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
1831     }
1832   }
1833 }
1834 
1835 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
1836   assert(V != Induction && "The new induction variable should not be used.");
1837   assert(!V->getType()->isVectorTy() && "Can't widen a vector");
1838   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
1839 
1840   // If we have a stride that is replaced by one, do it here. Defer this for
1841   // the VPlan-native path until we start running Legal checks in that path.
1842   if (!EnableVPlanNativePath && Legal->hasStride(V))
1843     V = ConstantInt::get(V->getType(), 1);
1844 
1845   // If we have a vector mapped to this value, return it.
1846   if (VectorLoopValueMap.hasVectorValue(V, Part))
1847     return VectorLoopValueMap.getVectorValue(V, Part);
1848 
1849   // If the value has not been vectorized, check if it has been scalarized
1850   // instead. If it has been scalarized, and we actually need the value in
1851   // vector form, we will construct the vector values on demand.
1852   if (VectorLoopValueMap.hasAnyScalarValue(V)) {
1853     Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
1854 
1855     // If we've scalarized a value, that value should be an instruction.
1856     auto *I = cast<Instruction>(V);
1857 
1858     // If we aren't vectorizing, we can just copy the scalar map values over to
1859     // the vector map.
1860     if (VF == 1) {
1861       VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
1862       return ScalarValue;
1863     }
1864 
1865     // Get the last scalar instruction we generated for V and Part. If the value
1866     // is known to be uniform after vectorization, this corresponds to lane zero
1867     // of the Part unroll iteration. Otherwise, the last instruction is the one
1868     // we created for the last vector lane of the Part unroll iteration.
1869     unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
1870     auto *LastInst = cast<Instruction>(
1871         VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
1872 
1873     // Set the insert point after the last scalarized instruction. This ensures
1874     // the insertelement sequence will directly follow the scalar definitions.
1875     auto OldIP = Builder.saveIP();
1876     auto NewIP = std::next(BasicBlock::iterator(LastInst));
1877     Builder.SetInsertPoint(&*NewIP);
1878 
1879     // However, if we are vectorizing, we need to construct the vector values.
1880     // If the value is known to be uniform after vectorization, we can just
1881     // broadcast the scalar value corresponding to lane zero for each unroll
1882     // iteration. Otherwise, we construct the vector values using insertelement
1883     // instructions. Since the resulting vectors are stored in
1884     // VectorLoopValueMap, we will only generate the insertelements once.
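    // For illustration only, packing a non-uniform value with VF = 4 looks
    // roughly like:
    //   %v0 = insertelement <4 x Ty> undef, Ty %s0, i32 0
    //   %v1 = insertelement <4 x Ty> %v0,   Ty %s1, i32 1
    //   ... one insertelement per lane (see packScalarIntoVectorValue).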
1885     Value *VectorValue = nullptr;
1886     if (Cost->isUniformAfterVectorization(I, VF)) {
1887       VectorValue = getBroadcastInstrs(ScalarValue);
1888       VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
1889     } else {
1890       // Initialize packing with insertelements to start from undef.
1891       Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
1892       VectorLoopValueMap.setVectorValue(V, Part, Undef);
1893       for (unsigned Lane = 0; Lane < VF; ++Lane)
1894         packScalarIntoVectorValue(V, {Part, Lane});
1895       VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
1896     }
1897     Builder.restoreIP(OldIP);
1898     return VectorValue;
1899   }
1900 
1901   // If this scalar is unknown, assume that it is a constant or that it is
1902   // loop invariant. Broadcast V and save the value for future uses.
1903   Value *B = getBroadcastInstrs(V);
1904   VectorLoopValueMap.setVectorValue(V, Part, B);
1905   return B;
1906 }
1907 
1908 Value *
1909 InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
1910                                             const VPIteration &Instance) {
1911   // If the value is not an instruction contained in the loop, it should
1912   // already be scalar.
1913   if (OrigLoop->isLoopInvariant(V))
1914     return V;
1915 
1916   assert(Instance.Lane > 0
1917              ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
1918              : true && "Uniform values only have lane zero");
1919 
1920   // If the value from the original loop has not been vectorized, it is
1921   // represented by UF x VF scalar values in the new loop. Return the requested
1922   // scalar value.
1923   if (VectorLoopValueMap.hasScalarValue(V, Instance))
1924     return VectorLoopValueMap.getScalarValue(V, Instance);
1925 
1926   // If the value has not been scalarized, get its entry in VectorLoopValueMap
1927   // for the given unroll part. If this entry is not a vector type (i.e., the
1928   // vectorization factor is one), there is no need to generate an
1929   // extractelement instruction.
1930   auto *U = getOrCreateVectorValue(V, Instance.Part);
1931   if (!U->getType()->isVectorTy()) {
1932     assert(VF == 1 && "Value not scalarized has non-vector type");
1933     return U;
1934   }
1935 
1936   // Otherwise, the value from the original loop has been vectorized and is
1937   // represented by UF vector values. Extract and return the requested scalar
1938   // value from the appropriate vector lane.
1939   return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
1940 }
1941 
1942 void InnerLoopVectorizer::packScalarIntoVectorValue(
1943     Value *V, const VPIteration &Instance) {
1944   assert(V != Induction && "The new induction variable should not be used.");
1945   assert(!V->getType()->isVectorTy() && "Can't pack a vector");
1946   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
1947 
1948   Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
1949   Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
1950   VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
1951                                             Builder.getInt32(Instance.Lane));
1952   VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
1953 }
1954 
1955 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
1956   assert(Vec->getType()->isVectorTy() && "Invalid type");
1957   SmallVector<Constant *, 8> ShuffleMask;
1958   for (unsigned i = 0; i < VF; ++i)
1959     ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
1960 
1961   return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
1962                                      ConstantVector::get(ShuffleMask),
1963                                      "reverse");
1964 }
1965 
1966 // Return whether we allow using masked interleave-groups (for dealing with
1967 // strided loads/stores that reside in predicated blocks, or for dealing
1968 // with gaps).
1969 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
1970   // If an override option has been passed in for interleaved accesses, use it.
1971   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
1972     return EnableMaskedInterleavedMemAccesses;
1973 
1974   return TTI.enableMaskedInterleavedAccessVectorization();
1975 }
1976 
1977 // Try to vectorize the interleave group that \p Instr belongs to.
1978 //
1979 // E.g. Translate following interleaved load group (factor = 3):
1980 //   for (i = 0; i < N; i+=3) {
1981 //     R = Pic[i];             // Member of index 0
1982 //     G = Pic[i+1];           // Member of index 1
1983 //     B = Pic[i+2];           // Member of index 2
1984 //     ... // do something to R, G, B
1985 //   }
1986 // To:
1987 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
1988 //   %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9>   ; R elements
1989 //   %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10>  ; G elements
1990 //   %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11>  ; B elements
1991 //
1992 // Or translate following interleaved store group (factor = 3):
1993 //   for (i = 0; i < N; i+=3) {
1994 //     ... do something to R, G, B
1995 //     Pic[i]   = R;           // Member of index 0
1996 //     Pic[i+1] = G;           // Member of index 1
1997 //     Pic[i+2] = B;           // Member of index 2
1998 //   }
1999 // To:
2000 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2001 //   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2002 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2003 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2004 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2005 void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
2006                                                    VectorParts *BlockInMask) {
2007   const InterleaveGroup<Instruction> *Group =
2008       Cost->getInterleavedAccessGroup(Instr);
2009   assert(Group && "Fail to get an interleaved access group.");
2010 
2011   // Skip if current instruction is not the insert position.
2012   if (Instr != Group->getInsertPos())
2013     return;
2014 
2015   const DataLayout &DL = Instr->getModule()->getDataLayout();
2016   Value *Ptr = getLoadStorePointerOperand(Instr);
2017 
2018   // Prepare for the vector type of the interleaved load/store.
2019   Type *ScalarTy = getMemInstValueType(Instr);
2020   unsigned InterleaveFactor = Group->getFactor();
2021   Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
2022   Type *PtrTy = VecTy->getPointerTo(getLoadStoreAddressSpace(Instr));
2023 
2024   // Prepare for the new pointers.
2025   setDebugLocFromInst(Builder, Ptr);
2026   SmallVector<Value *, 2> NewPtrs;
2027   unsigned Index = Group->getIndex(Instr);
2028 
2029   VectorParts Mask;
2030   bool IsMaskForCondRequired = BlockInMask;
2031   if (IsMaskForCondRequired) {
2032     Mask = *BlockInMask;
2033     // TODO: extend the masked interleaved-group support to reversed access.
2034     assert(!Group->isReverse() && "Reversed masked interleave-group "
2035                                   "not supported.");
2036   }
2037 
2038   // If the group is reverse, adjust the index to refer to the last vector lane
2039   // instead of the first. We adjust the index from the first vector lane,
2040   // rather than directly getting the pointer for lane VF - 1, because the
2041   // pointer operand of the interleaved access is supposed to be uniform. For
2042   // uniform instructions, we're only required to generate a value for the
2043   // first vector lane in each unroll iteration.
2044   if (Group->isReverse())
2045     Index += (VF - 1) * Group->getFactor();
2046 
2047   bool InBounds = false;
2048   if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2049     InBounds = gep->isInBounds();
2050 
2051   for (unsigned Part = 0; Part < UF; Part++) {
2052     Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0});
2053 
    // Note that the current instruction could be a member at any index; we
    // need to adjust the address to that of the member at index 0.
    //
    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
    //       b = A[i];       // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
    //
    // E.g.  A[i+1] = a;     // Member of index 1
    //       A[i]   = b;     // Member of index 0
    //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].
2065     NewPtr = Builder.CreateGEP(ScalarTy, NewPtr, Builder.getInt32(-Index));
2066     if (InBounds)
2067       cast<GetElementPtrInst>(NewPtr)->setIsInBounds(true);
2068 
2069     // Cast to the vector pointer type.
2070     NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy));
2071   }
2072 
2073   setDebugLocFromInst(Builder, Instr);
2074   Value *UndefVec = UndefValue::get(VecTy);
2075 
2076   Value *MaskForGaps = nullptr;
2077   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2078     MaskForGaps = createBitMaskForGaps(Builder, VF, *Group);
2079     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2080   }
2081 
2082   // Vectorize the interleaved load group.
2083   if (isa<LoadInst>(Instr)) {
2084     // For each unroll part, create a wide load for the group.
2085     SmallVector<Value *, 2> NewLoads;
2086     for (unsigned Part = 0; Part < UF; Part++) {
2087       Instruction *NewLoad;
2088       if (IsMaskForCondRequired || MaskForGaps) {
2089         assert(useMaskedInterleavedAccesses(*TTI) &&
2090                "masked interleaved groups are not allowed.");
2091         Value *GroupMask = MaskForGaps;
2092         if (IsMaskForCondRequired) {
2093           auto *Undefs = UndefValue::get(Mask[Part]->getType());
2094           auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2095           Value *ShuffledMask = Builder.CreateShuffleVector(
2096               Mask[Part], Undefs, RepMask, "interleaved.mask");
2097           GroupMask = MaskForGaps
2098                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2099                                                 MaskForGaps)
2100                           : ShuffledMask;
2101         }
2102         NewLoad =
2103             Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(),
2104                                      GroupMask, UndefVec, "wide.masked.vec");
2105       }
2106       else
2107         NewLoad = Builder.CreateAlignedLoad(VecTy, NewPtrs[Part],
2108                                             Group->getAlignment(), "wide.vec");
2109       Group->addMetadata(NewLoad);
2110       NewLoads.push_back(NewLoad);
2111     }
2112 
2113     // For each member in the group, shuffle out the appropriate data from the
2114     // wide loads.
2115     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2116       Instruction *Member = Group->getMember(I);
2117 
2118       // Skip the gaps in the group.
2119       if (!Member)
2120         continue;
2121 
2122       Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF);
2123       for (unsigned Part = 0; Part < UF; Part++) {
2124         Value *StridedVec = Builder.CreateShuffleVector(
2125             NewLoads[Part], UndefVec, StrideMask, "strided.vec");
2126 
        // If this member has a different type, cast the result to that type.
2128         if (Member->getType() != ScalarTy) {
2129           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2130           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2131         }
2132 
2133         if (Group->isReverse())
2134           StridedVec = reverseVector(StridedVec);
2135 
2136         VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
2137       }
2138     }
2139     return;
2140   }
2141 
2142   // The sub vector type for current instruction.
2143   VectorType *SubVT = VectorType::get(ScalarTy, VF);
2144 
2145   // Vectorize the interleaved store group.
2146   for (unsigned Part = 0; Part < UF; Part++) {
2147     // Collect the stored vector from each member.
2148     SmallVector<Value *, 4> StoredVecs;
2149     for (unsigned i = 0; i < InterleaveFactor; i++) {
      // An interleaved store group doesn't allow a gap, so each index has a
      // member.
2151       Instruction *Member = Group->getMember(i);
2152       assert(Member && "Fail to get a member from an interleaved store group");
2153 
2154       Value *StoredVec = getOrCreateVectorValue(
2155           cast<StoreInst>(Member)->getValueOperand(), Part);
2156       if (Group->isReverse())
2157         StoredVec = reverseVector(StoredVec);
2158 
      // If this member has a different type, cast it to a unified type.
      if (StoredVec->getType() != SubVT)
2162         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2163 
2164       StoredVecs.push_back(StoredVec);
2165     }
2166 
2167     // Concatenate all vectors into a wide vector.
2168     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2169 
2170     // Interleave the elements in the wide vector.
2171     Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor);
2172     Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
2173                                               "interleaved.vec");
2174 
2175     Instruction *NewStoreInstr;
2176     if (IsMaskForCondRequired) {
2177       auto *Undefs = UndefValue::get(Mask[Part]->getType());
2178       auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2179       Value *ShuffledMask = Builder.CreateShuffleVector(
2180           Mask[Part], Undefs, RepMask, "interleaved.mask");
2181       NewStoreInstr = Builder.CreateMaskedStore(
2182           IVec, NewPtrs[Part], Group->getAlignment(), ShuffledMask);
2183     }
2184     else
2185       NewStoreInstr = Builder.CreateAlignedStore(IVec, NewPtrs[Part],
2186         Group->getAlignment());
2187 
2188     Group->addMetadata(NewStoreInstr);
2189   }
2190 }
2191 
2192 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
2193                                                      VectorParts *BlockInMask) {
2194   // Attempt to issue a wide load.
2195   LoadInst *LI = dyn_cast<LoadInst>(Instr);
2196   StoreInst *SI = dyn_cast<StoreInst>(Instr);
2197 
2198   assert((LI || SI) && "Invalid Load/Store instruction");
2199 
2200   LoopVectorizationCostModel::InstWidening Decision =
2201       Cost->getWideningDecision(Instr, VF);
2202   assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
2203          "CM decision should be taken at this point");
2204   if (Decision == LoopVectorizationCostModel::CM_Interleave)
2205     return vectorizeInterleaveGroup(Instr);
2206 
2207   Type *ScalarDataTy = getMemInstValueType(Instr);
2208   Type *DataTy = VectorType::get(ScalarDataTy, VF);
2209   Value *Ptr = getLoadStorePointerOperand(Instr);
2210   unsigned Alignment = getLoadStoreAlignment(Instr);
  // An alignment of 0 means target ABI alignment. We need to use the scalar's
  // target ABI alignment in such a case.
2213   const DataLayout &DL = Instr->getModule()->getDataLayout();
2214   if (!Alignment)
2215     Alignment = DL.getABITypeAlignment(ScalarDataTy);
2216   unsigned AddressSpace = getLoadStoreAddressSpace(Instr);
2217 
2218   // Determine if the pointer operand of the access is either consecutive or
2219   // reverse consecutive.
2220   bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2221   bool ConsecutiveStride =
2222       Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2223   bool CreateGatherScatter =
2224       (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2225 
2226   // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2227   // gather/scatter. Otherwise Decision should have been to Scalarize.
2228   assert((ConsecutiveStride || CreateGatherScatter) &&
2229          "The instruction should be scalarized");
2230 
2231   // Handle consecutive loads/stores.
2232   if (ConsecutiveStride)
2233     Ptr = getOrCreateScalarValue(Ptr, {0, 0});
2234 
2235   VectorParts Mask;
2236   bool isMaskRequired = BlockInMask;
2237   if (isMaskRequired)
2238     Mask = *BlockInMask;
2239 
2240   bool InBounds = false;
2241   if (auto *gep = dyn_cast<GetElementPtrInst>(
2242           getLoadStorePointerOperand(Instr)->stripPointerCasts()))
2243     InBounds = gep->isInBounds();
2244 
2245   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2246     // Calculate the pointer for the specific unroll-part.
2247     GetElementPtrInst *PartPtr = nullptr;
2248 
2249     if (Reverse) {
      // If the address is consecutive but reversed, then the
      // wide load/store needs to start at the last vector element.
2252       PartPtr = cast<GetElementPtrInst>(
2253           Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF)));
2254       PartPtr->setIsInBounds(InBounds);
2255       PartPtr = cast<GetElementPtrInst>(
2256           Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF)));
2257       PartPtr->setIsInBounds(InBounds);
2258       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2259         Mask[Part] = reverseVector(Mask[Part]);
2260     } else {
2261       PartPtr = cast<GetElementPtrInst>(
2262           Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF)));
2263       PartPtr->setIsInBounds(InBounds);
2264     }
2265 
2266     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2267   };
2268 
2269   // Handle Stores:
2270   if (SI) {
2271     setDebugLocFromInst(Builder, SI);
2272 
2273     for (unsigned Part = 0; Part < UF; ++Part) {
2274       Instruction *NewSI = nullptr;
2275       Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part);
2276       if (CreateGatherScatter) {
2277         Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
2278         Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
2279         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
2280                                             MaskPart);
2281       } else {
2282         if (Reverse) {
2283           // If we store to reverse consecutive memory locations, then we need
2284           // to reverse the order of elements in the stored value.
2285           StoredVal = reverseVector(StoredVal);
2286           // We don't want to update the value in the map as it might be used in
2287           // another expression. So don't call resetVectorValue(StoredVal).
2288         }
2289         auto *VecPtr = CreateVecPtr(Part, Ptr);
2290         if (isMaskRequired)
2291           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
2292                                             Mask[Part]);
2293         else
2294           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
2295       }
2296       addMetadata(NewSI, SI);
2297     }
2298     return;
2299   }
2300 
2301   // Handle loads.
2302   assert(LI && "Must have a load instruction");
2303   setDebugLocFromInst(Builder, LI);
2304   for (unsigned Part = 0; Part < UF; ++Part) {
2305     Value *NewLI;
2306     if (CreateGatherScatter) {
2307       Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
2308       Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
2309       NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
2310                                          nullptr, "wide.masked.gather");
2311       addMetadata(NewLI, LI);
2312     } else {
2313       auto *VecPtr = CreateVecPtr(Part, Ptr);
2314       if (isMaskRequired)
2315         NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part],
2316                                          UndefValue::get(DataTy),
2317                                          "wide.masked.load");
2318       else
2319         NewLI =
2320             Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
2321 
2322       // Add metadata to the load, but setVectorValue to the reverse shuffle.
2323       addMetadata(NewLI, LI);
2324       if (Reverse)
2325         NewLI = reverseVector(NewLI);
2326     }
2327     VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
2328   }
2329 }
2330 
2331 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
2332                                                const VPIteration &Instance,
2333                                                bool IfPredicateInstr) {
2334   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2335 
2336   setDebugLocFromInst(Builder, Instr);
2337 
  // Does this instruction return a value?
2339   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2340 
2341   Instruction *Cloned = Instr->clone();
2342   if (!IsVoidRetTy)
2343     Cloned->setName(Instr->getName() + ".cloned");
2344 
2345   // Replace the operands of the cloned instructions with their scalar
2346   // equivalents in the new loop.
2347   for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
2348     auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance);
2349     Cloned->setOperand(op, NewOp);
2350   }
2351   addNewMetadata(Cloned, Instr);
2352 
2353   // Place the cloned scalar in the new loop.
2354   Builder.Insert(Cloned);
2355 
2356   // Add the cloned scalar to the scalar map entry.
2357   VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2358 
  // If we just cloned a new assumption, add it to the assumption cache.
2360   if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2361     if (II->getIntrinsicID() == Intrinsic::assume)
2362       AC->registerAssumption(II);
2363 
2364   // End if-block.
2365   if (IfPredicateInstr)
2366     PredicatedInstructions.push_back(Cloned);
2367 }
2368 
2369 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2370                                                       Value *End, Value *Step,
2371                                                       Instruction *DL) {
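  // Illustrative shape of the loop-control code created below (names are
  // approximate):
  //   %index      = phi [ %Start, %preheader ], [ %index.next, %latch ]
  //   ...
  //   %index.next = add %index, %Step
  //   %cmp        = icmp eq %index.next, %End
  //   br i1 %cmp, label %exit, label %header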
2372   BasicBlock *Header = L->getHeader();
2373   BasicBlock *Latch = L->getLoopLatch();
  // As we're just creating this loop, it's possible that no latch exists yet.
  // If so, use the header, as this will be a single-block loop.
2376   if (!Latch)
2377     Latch = Header;
2378 
2379   IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2380   Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2381   setDebugLocFromInst(Builder, OldInst);
2382   auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2383 
2384   Builder.SetInsertPoint(Latch->getTerminator());
2385   setDebugLocFromInst(Builder, OldInst);
2386 
2387   // Create i+1 and fill the PHINode.
2388   Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2389   Induction->addIncoming(Start, L->getLoopPreheader());
2390   Induction->addIncoming(Next, Latch);
2391   // Create the compare.
2392   Value *ICmp = Builder.CreateICmpEQ(Next, End);
2393   Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
2394 
2395   // Now we have two terminators. Remove the old one from the block.
2396   Latch->getTerminator()->eraseFromParent();
2397 
2398   return Induction;
2399 }
2400 
2401 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2402   if (TripCount)
2403     return TripCount;
2404 
2405   assert(L && "Create Trip Count for null loop.");
2406   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2407   // Find the loop boundaries.
2408   ScalarEvolution *SE = PSE.getSE();
2409   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2410   assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
2411          "Invalid loop count");
2412 
2413   Type *IdxTy = Legal->getWidestInductionType();
2414   assert(IdxTy && "No type for induction");
2415 
  // The exit count might have the type i64 while the phi is i32. This can
  // happen if we have an induction variable that is sign-extended before the
  // compare. The only way that we get a backedge-taken count is that the
  // induction variable was signed and as such will not overflow. In such a
  // case truncation is legal.
2421   if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() >
2422       IdxTy->getPrimitiveSizeInBits())
2423     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2424   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2425 
2426   // Get the total trip count from the count by adding 1.
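  // For example, for the loop 'for (i = 0; i < n; ++i)' the backedge-taken
  // count is n - 1 and the trip count is n.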
2427   const SCEV *ExitCount = SE->getAddExpr(
2428       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
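  // For example, a loop whose backedge-taken count is 7 has a trip count
  // of 8.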
2429 
2430   const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
2431 
2432   // Expand the trip count and place the new instructions in the preheader.
2433   // Notice that the pre-header does not change, only the loop body.
2434   SCEVExpander Exp(*SE, DL, "induction");
2435 
2436   // Count holds the overall loop count (N).
2437   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2438                                 L->getLoopPreheader()->getTerminator());
2439 
2440   if (TripCount->getType()->isPointerTy())
2441     TripCount =
2442         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2443                                     L->getLoopPreheader()->getTerminator());
2444 
2445   return TripCount;
2446 }
2447 
2448 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
2449   if (VectorTripCount)
2450     return VectorTripCount;
2451 
2452   Value *TC = getOrCreateTripCount(L);
2453   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2454 
2455   Type *Ty = TC->getType();
2456   Constant *Step = ConstantInt::get(Ty, VF * UF);
2457 
2458   // If the tail is to be folded by masking, round the number of iterations N
2459   // up to a multiple of Step instead of rounding down. This is done by first
2460   // adding Step-1 and then rounding down. Note that it's ok if this addition
2461   // overflows: the vector induction variable will eventually wrap to zero given
2462   // that it starts at zero and its Step is a power of two; the loop will then
2463   // exit, with the last early-exit vector comparison also producing all-true.
2464   if (Cost->foldTailByMasking()) {
2465     assert(isPowerOf2_32(VF * UF) &&
2466            "VF*UF must be a power of 2 when folding tail by masking");
2467     TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up");
2468   }
2469 
2470   // Now we need to generate the expression for the part of the loop that the
2471   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2472   // iterations are not required for correctness, or N - Step, otherwise. Step
2473   // is equal to the vectorization factor (number of SIMD elements) times the
2474   // unroll factor (number of SIMD instructions).
2475   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2476 
2477   // If there is a non-reversed interleaved group that may speculatively access
2478   // memory out-of-bounds, we need to ensure that there will be at least one
2479   // iteration of the scalar epilogue loop. Thus, if the step evenly divides
2480   // the trip count, we set the remainder to be equal to the step. If the step
2481   // does not evenly divide the trip count, no adjustment is necessary since
2482   // there will already be scalar iterations. Note that the minimum iterations
2483   // check ensures that N >= Step.
2484   if (VF > 1 && Cost->requiresScalarEpilogue()) {
2485     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2486     R = Builder.CreateSelect(IsZero, Step, R);
2487   }
2488 
2489   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
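  // For illustration only: with Step = VF * UF = 4 and a trip count N = 10,
  // R = 10 % 4 = 2 and n.vec = 8, leaving two iterations for the scalar
  // remainder. With tail folding, N is first rounded up to 13, so n.vec = 12
  // and the last vector iteration runs with its final two lanes masked off.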
2490 
2491   return VectorTripCount;
2492 }
2493 
2494 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2495                                                    const DataLayout &DL) {
  // Verify that V is a vector type with the same number of elements as DstVTy.
2497   unsigned VF = DstVTy->getNumElements();
2498   VectorType *SrcVecTy = cast<VectorType>(V->getType());
  assert((VF == SrcVecTy->getNumElements()) &&
         "Vector dimensions do not match");
2500   Type *SrcElemTy = SrcVecTy->getElementType();
2501   Type *DstElemTy = DstVTy->getElementType();
2502   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2503          "Vector elements must have same size");
2504 
2505   // Do a direct cast if element types are castable.
2506   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2507     return Builder.CreateBitOrPointerCast(V, DstVTy);
2508   }
2509   // V cannot be directly casted to desired vector type.
2510   // May happen when V is a floating point vector but DstVTy is a vector of
2511   // pointers or vice-versa. Handle this using a two-step bitcast using an
2512   // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
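  // For example (illustrative element types, assuming 32-bit pointers),
  // casting <4 x float> to <4 x i32*> is performed as
  // <4 x float> -> <4 x i32> -> <4 x i32*>.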
2513   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2514          "Only one type should be a pointer type");
2515   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2516          "Only one type should be a floating point type");
2517   Type *IntTy =
2518       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2519   VectorType *VecIntTy = VectorType::get(IntTy, VF);
2520   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2521   return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
2522 }
2523 
2524 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
2525                                                          BasicBlock *Bypass) {
2526   Value *Count = getOrCreateTripCount(L);
2527   BasicBlock *BB = L->getLoopPreheader();
2528   IRBuilder<> Builder(BB->getTerminator());
2529 
2530   // Generate code to check if the loop's trip count is less than VF * UF, or
2531   // equal to it in case a scalar epilogue is required; this implies that the
2532   // vector trip count is zero. This check also covers the case where adding one
2533   // to the backedge-taken count overflowed leading to an incorrect trip count
2534   // of zero. In this case we will also jump to the scalar loop.
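  // In shorthand IR (illustrative names, assuming no tail folding and no
  // required scalar epilogue), the guard generated below looks like:
  //
  //   %min.iters.check = icmp ult i64 %trip.count, <VF * UF>
  //   br i1 %min.iters.check, label %scalar.ph, label %vector.ph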
2535   auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
2536                                           : ICmpInst::ICMP_ULT;
2537 
2538   // If tail is to be folded, vector loop takes care of all iterations.
2539   Value *CheckMinIters = Builder.getFalse();
2540   if (!Cost->foldTailByMasking())
2541     CheckMinIters = Builder.CreateICmp(
2542         P, Count, ConstantInt::get(Count->getType(), VF * UF),
2543         "min.iters.check");
2544 
2545   BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2546   // Update dominator tree immediately if the generated block is a
2547   // LoopBypassBlock because SCEV expansions to generate loop bypass
2548   // checks may query it before the current function is finished.
2549   DT->addNewBlock(NewBB, BB);
2550   if (L->getParentLoop())
2551     L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2552   ReplaceInstWithInst(BB->getTerminator(),
2553                       BranchInst::Create(Bypass, NewBB, CheckMinIters));
2554   LoopBypassBlocks.push_back(BB);
2555 }
2556 
2557 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
2558   BasicBlock *BB = L->getLoopPreheader();
2559 
  // Generate the code that checks the SCEV assumptions we have made.
2561   // We want the new basic block to start at the first instruction in a
2562   // sequence of instructions that form a check.
2563   SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
2564                    "scev.check");
2565   Value *SCEVCheck =
2566       Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator());
2567 
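  // If the expanded check folded to a constant false, the SCEV assumptions
  // can never fail at runtime, so no bypass block is needed.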
2568   if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
2569     if (C->isZero())
2570       return;
2571 
2572   assert(!Cost->foldTailByMasking() &&
2573          "Cannot SCEV check stride or overflow when folding tail");
2574   // Create a new block containing the stride check.
2575   BB->setName("vector.scevcheck");
2576   auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2577   // Update dominator tree immediately if the generated block is a
2578   // LoopBypassBlock because SCEV expansions to generate loop bypass
2579   // checks may query it before the current function is finished.
2580   DT->addNewBlock(NewBB, BB);
2581   if (L->getParentLoop())
2582     L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2583   ReplaceInstWithInst(BB->getTerminator(),
2584                       BranchInst::Create(Bypass, NewBB, SCEVCheck));
2585   LoopBypassBlocks.push_back(BB);
2586   AddedSafetyChecks = true;
2587 }
2588 
2589 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
2590   // VPlan-native path does not do any analysis for runtime checks currently.
2591   if (EnableVPlanNativePath)
2592     return;
2593 
2594   BasicBlock *BB = L->getLoopPreheader();
2595 
  // Generate the code that checks at runtime whether arrays overlap. We put
  // the checks into a separate block to make the more common case of few
  // elements faster.
2599   Instruction *FirstCheckInst;
2600   Instruction *MemRuntimeCheck;
2601   std::tie(FirstCheckInst, MemRuntimeCheck) =
2602       Legal->getLAI()->addRuntimeChecks(BB->getTerminator());
2603   if (!MemRuntimeCheck)
2604     return;
2605 
2606   assert(!Cost->foldTailByMasking() && "Cannot check memory when folding tail");
2607   // Create a new block containing the memory check.
2608   BB->setName("vector.memcheck");
2609   auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2610   // Update dominator tree immediately if the generated block is a
2611   // LoopBypassBlock because SCEV expansions to generate loop bypass
2612   // checks may query it before the current function is finished.
2613   DT->addNewBlock(NewBB, BB);
2614   if (L->getParentLoop())
2615     L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2616   ReplaceInstWithInst(BB->getTerminator(),
2617                       BranchInst::Create(Bypass, NewBB, MemRuntimeCheck));
2618   LoopBypassBlocks.push_back(BB);
2619   AddedSafetyChecks = true;
2620 
2621   // We currently don't use LoopVersioning for the actual loop cloning but we
2622   // still use it to add the noalias metadata.
2623   LVer = llvm::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
2624                                            PSE.getSE());
2625   LVer->prepareNoAliasMetadata();
2626 }
2627 
2628 Value *InnerLoopVectorizer::emitTransformedIndex(
2629     IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
2630     const InductionDescriptor &ID) const {
2631 
2632   SCEVExpander Exp(*SE, DL, "induction");
2633   auto Step = ID.getStep();
2634   auto StartValue = ID.getStartValue();
2635   assert(Index->getType() == Step->getType() &&
2636          "Index type does not match StepValue type");
2637 
  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // more optimal code. Unfortunately, attempting to do so on invalid IR may
  // lead to various SCEV crashes. So all we can do is use the builder and
  // rely on InstCombine for future simplifications. Here we handle only some
  // trivial cases.
2644   auto CreateAdd = [&B](Value *X, Value *Y) {
2645     assert(X->getType() == Y->getType() && "Types don't match!");
2646     if (auto *CX = dyn_cast<ConstantInt>(X))
2647       if (CX->isZero())
2648         return Y;
2649     if (auto *CY = dyn_cast<ConstantInt>(Y))
2650       if (CY->isZero())
2651         return X;
2652     return B.CreateAdd(X, Y);
2653   };
2654 
2655   auto CreateMul = [&B](Value *X, Value *Y) {
2656     assert(X->getType() == Y->getType() && "Types don't match!");
2657     if (auto *CX = dyn_cast<ConstantInt>(X))
2658       if (CX->isOne())
2659         return Y;
2660     if (auto *CY = dyn_cast<ConstantInt>(Y))
2661       if (CY->isOne())
2662         return X;
2663     return B.CreateMul(X, Y);
2664   };
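  // For example, CreateAdd(0, %x) folds to %x and CreateMul(1, %x) folds to
  // %x, so trivially redundant instructions are not emitted here.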
2665 
2666   switch (ID.getKind()) {
2667   case InductionDescriptor::IK_IntInduction: {
2668     assert(Index->getType() == StartValue->getType() &&
2669            "Index type does not match StartValue type");
2670     if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
2671       return B.CreateSub(StartValue, Index);
2672     auto *Offset = CreateMul(
2673         Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint()));
2674     return CreateAdd(StartValue, Offset);
2675   }
2676   case InductionDescriptor::IK_PtrInduction: {
2677     assert(isa<SCEVConstant>(Step) &&
2678            "Expected constant step for pointer induction");
2679     return B.CreateGEP(
2680         StartValue->getType()->getPointerElementType(), StartValue,
2681         CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(),
2682                                            &*B.GetInsertPoint())));
2683   }
2684   case InductionDescriptor::IK_FpInduction: {
2685     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2686     auto InductionBinOp = ID.getInductionBinOp();
2687     assert(InductionBinOp &&
2688            (InductionBinOp->getOpcode() == Instruction::FAdd ||
2689             InductionBinOp->getOpcode() == Instruction::FSub) &&
2690            "Original bin op should be defined for FP induction");
2691 
2692     Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
2693 
2694     // Floating point operations had to be 'fast' to enable the induction.
2695     FastMathFlags Flags;
2696     Flags.setFast();
2697 
2698     Value *MulExp = B.CreateFMul(StepValue, Index);
2699     if (isa<Instruction>(MulExp))
      // We have to check because MulExp may be a constant.
2701       cast<Instruction>(MulExp)->setFastMathFlags(Flags);
2702 
2703     Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2704                                "induction");
2705     if (isa<Instruction>(BOp))
2706       cast<Instruction>(BOp)->setFastMathFlags(Flags);
2707 
2708     return BOp;
2709   }
2710   case InductionDescriptor::IK_NoInduction:
2711     return nullptr;
2712   }
2713   llvm_unreachable("invalid enum");
2714 }
2715 
2716 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
2717   /*
2718    In this function we generate a new loop. The new loop will contain
2719    the vectorized instructions while the old loop will continue to run the
2720    scalar remainder.
2721 
2722        [ ] <-- loop iteration number check.
2723     /   |
2724    /    v
2725   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
2726   |  /  |
2727   | /   v
2728   ||   [ ]     <-- vector pre header.
2729   |/    |
2730   |     v
2731   |    [  ] \
2732   |    [  ]_|   <-- vector loop.
2733   |     |
2734   |     v
2735   |   -[ ]   <--- middle-block.
2736   |  /  |
2737   | /   v
2738   -|- >[ ]     <--- new preheader.
2739    |    |
2740    |    v
2741    |   [ ] \
2742    |   [ ]_|   <-- old scalar loop to handle remainder.
2743     \   |
2744      \  v
2745       >[ ]     <-- exit block.
2746    ...
2747    */
2748 
2749   BasicBlock *OldBasicBlock = OrigLoop->getHeader();
2750   BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
2751   BasicBlock *ExitBlock = OrigLoop->getExitBlock();
2752   MDNode *OrigLoopID = OrigLoop->getLoopID();
2753   assert(VectorPH && "Invalid loop structure");
2754   assert(ExitBlock && "Must have an exit block");
2755 
2756   // Some loops have a single integer induction variable, while other loops
  // don't. One example is C++ iterators, which often have multiple pointer
2758   // induction variables. In the code below we also support a case where we
2759   // don't have a single induction variable.
2760   //
  // We try as hard as possible to obtain an induction variable from the
  // original loop. However, if we don't find one that:
2763   //   - is an integer
2764   //   - counts from zero, stepping by one
2765   //   - is the size of the widest induction variable type
2766   // then we create a new one.
2767   OldInduction = Legal->getPrimaryInduction();
2768   Type *IdxTy = Legal->getWidestInductionType();
2769 
  // Split the single-block loop into the two-loop structure described above.
2771   BasicBlock *VecBody =
2772       VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
2773   BasicBlock *MiddleBlock =
2774       VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
2775   BasicBlock *ScalarPH =
2776       MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
2777 
2778   // Create and register the new vector loop.
2779   Loop *Lp = LI->AllocateLoop();
2780   Loop *ParentLoop = OrigLoop->getParentLoop();
2781 
2782   // Insert the new loop into the loop nest and register the new basic blocks
2783   // before calling any utilities such as SCEV that require valid LoopInfo.
2784   if (ParentLoop) {
2785     ParentLoop->addChildLoop(Lp);
2786     ParentLoop->addBasicBlockToLoop(ScalarPH, *LI);
2787     ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI);
2788   } else {
2789     LI->addTopLevelLoop(Lp);
2790   }
2791   Lp->addBasicBlockToLoop(VecBody, *LI);
2792 
2793   // Find the loop boundaries.
2794   Value *Count = getOrCreateTripCount(Lp);
2795 
2796   Value *StartIdx = ConstantInt::get(IdxTy, 0);
2797 
2798   // Now, compare the new count to zero. If it is zero skip the vector loop and
2799   // jump to the scalar loop. This check also covers the case where the
2800   // backedge-taken count is uint##_max: adding one to it will overflow leading
2801   // to an incorrect trip count of zero. In this (rare) case we will also jump
2802   // to the scalar loop.
2803   emitMinimumIterationCountCheck(Lp, ScalarPH);
2804 
2805   // Generate the code to check any assumptions that we've made for SCEV
2806   // expressions.
2807   emitSCEVChecks(Lp, ScalarPH);
2808 
  // Generate the code that checks at runtime whether arrays overlap. We put
  // the checks into a separate block to make the more common case of few
  // elements faster.
2812   emitMemRuntimeChecks(Lp, ScalarPH);
2813 
2814   // Generate the induction variable.
2815   // The loop step is equal to the vectorization factor (num of SIMD elements)
2816   // times the unroll factor (num of SIMD instructions).
2817   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
2818   Constant *Step = ConstantInt::get(IdxTy, VF * UF);
2819   Induction =
2820       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
2821                               getDebugLocFromInstOrOperands(OldInduction));
2822 
2823   // We are going to resume the execution of the scalar loop.
2824   // Go over all of the induction variables that we found and fix the
2825   // PHIs that are left in the scalar version of the loop.
2826   // The starting values of PHI nodes depend on the counter of the last
2827   // iteration in the vectorized loop.
2828   // If we come from a bypass edge then we need to start from the original
2829   // start value.
2830 
2831   // This variable saves the new starting index for the scalar loop. It is used
2832   // to test if there are any tail iterations left once the vector loop has
2833   // completed.
2834   LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
2835   for (auto &InductionEntry : *List) {
2836     PHINode *OrigPhi = InductionEntry.first;
2837     InductionDescriptor II = InductionEntry.second;
2838 
    // Create phi nodes to merge from the backedge-taken check block.
2840     PHINode *BCResumeVal = PHINode::Create(
2841         OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator());
2842     // Copy original phi DL over to the new one.
2843     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
2844     Value *&EndValue = IVEndValues[OrigPhi];
2845     if (OrigPhi == OldInduction) {
2846       // We know what the end value is.
2847       EndValue = CountRoundDown;
2848     } else {
2849       IRBuilder<> B(Lp->getLoopPreheader()->getTerminator());
2850       Type *StepType = II.getStep()->getType();
2851       Instruction::CastOps CastOp =
2852         CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
2853       Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
2854       const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
2855       EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
2856       EndValue->setName("ind.end");
2857     }
2858 
2859     // The new PHI merges the original incoming value, in case of a bypass,
2860     // or the value at the end of the vectorized loop.
2861     BCResumeVal->addIncoming(EndValue, MiddleBlock);
2862 
2863     // Fix the scalar body counter (PHI node).
2864     unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH);
2865 
2866     // The old induction's phi node in the scalar body needs the truncated
2867     // value.
2868     for (BasicBlock *BB : LoopBypassBlocks)
2869       BCResumeVal->addIncoming(II.getStartValue(), BB);
2870     OrigPhi->setIncomingValue(BlockIdx, BCResumeVal);
2871   }
2872 
2873   // Add a check in the middle block to see if we have completed
2874   // all of the iterations in the first vector loop.
2875   // If (N - N%VF) == N, then we *don't* need to run the remainder.
2876   // If tail is to be folded, we know we don't need to run the remainder.
2877   Value *CmpN = Builder.getTrue();
2878   if (!Cost->foldTailByMasking())
2879     CmpN =
2880         CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
2881                         CountRoundDown, "cmp.n", MiddleBlock->getTerminator());
2882   ReplaceInstWithInst(MiddleBlock->getTerminator(),
2883                       BranchInst::Create(ExitBlock, ScalarPH, CmpN));
2884 
2885   // Get ready to start creating new instructions into the vectorized body.
2886   Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt());
2887 
2888   // Save the state.
2889   LoopVectorPreHeader = Lp->getLoopPreheader();
2890   LoopScalarPreHeader = ScalarPH;
2891   LoopMiddleBlock = MiddleBlock;
2892   LoopExitBlock = ExitBlock;
2893   LoopVectorBody = VecBody;
2894   LoopScalarBody = OldBasicBlock;
2895 
2896   Optional<MDNode *> VectorizedLoopID =
2897       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
2898                                       LLVMLoopVectorizeFollowupVectorized});
2899   if (VectorizedLoopID.hasValue()) {
2900     Lp->setLoopID(VectorizedLoopID.getValue());
2901 
2902     // Do not setAlreadyVectorized if loop attributes have been defined
2903     // explicitly.
2904     return LoopVectorPreHeader;
2905   }
2906 
2907   // Keep all loop hints from the original loop on the vector loop (we'll
2908   // replace the vectorizer-specific hints below).
2909   if (MDNode *LID = OrigLoop->getLoopID())
2910     Lp->setLoopID(LID);
2911 
2912   LoopVectorizeHints Hints(Lp, true, *ORE);
2913   Hints.setAlreadyVectorized();
2914 
2915   return LoopVectorPreHeader;
2916 }
2917 
2918 // Fix up external users of the induction variable. At this point, we are
2919 // in LCSSA form, with all external PHIs that use the IV having one input value,
2920 // coming from the remainder loop. We need those PHIs to also have a correct
2921 // value for the IV when arriving directly from the middle block.
2922 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
2923                                        const InductionDescriptor &II,
2924                                        Value *CountRoundDown, Value *EndValue,
2925                                        BasicBlock *MiddleBlock) {
2926   // There are two kinds of external IV usages - those that use the value
2927   // computed in the last iteration (the PHI) and those that use the penultimate
2928   // value (the value that feeds into the phi from the loop latch).
2929   // We allow both, but they, obviously, have different values.
2930 
2931   assert(OrigLoop->getExitBlock() && "Expected a single exit block");
2932 
2933   DenseMap<Value *, Value *> MissingVals;
2934 
2935   // An external user of the last iteration's value should see the value that
2936   // the remainder loop uses to initialize its own IV.
2937   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
2938   for (User *U : PostInc->users()) {
2939     Instruction *UI = cast<Instruction>(U);
2940     if (!OrigLoop->contains(UI)) {
2941       assert(isa<PHINode>(UI) && "Expected LCSSA form");
2942       MissingVals[UI] = EndValue;
2943     }
2944   }
2945 
  // An external user of the penultimate value needs to see EndValue - Step.
2947   // The simplest way to get this is to recompute it from the constituent SCEVs,
2948   // that is Start + (Step * (CRD - 1)).
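  // For illustration only: for an IV {0,+,2} with CountRoundDown = 8, the
  // resume value fed to the remainder loop is 0 + 2 * 8 = 16, while an
  // external user of the phi itself expects the penultimate value
  // 0 + 2 * (8 - 1) = 14.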
2949   for (User *U : OrigPhi->users()) {
2950     auto *UI = cast<Instruction>(U);
2951     if (!OrigLoop->contains(UI)) {
2952       const DataLayout &DL =
2953           OrigLoop->getHeader()->getModule()->getDataLayout();
2954       assert(isa<PHINode>(UI) && "Expected LCSSA form");
2955 
2956       IRBuilder<> B(MiddleBlock->getTerminator());
2957       Value *CountMinusOne = B.CreateSub(
2958           CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
2959       Value *CMO =
2960           !II.getStep()->getType()->isIntegerTy()
2961               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
2962                              II.getStep()->getType())
2963               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
2964       CMO->setName("cast.cmo");
2965       Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
2966       Escape->setName("ind.escape");
2967       MissingVals[UI] = Escape;
2968     }
2969   }
2970 
2971   for (auto &I : MissingVals) {
2972     PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each other,
    // that is, %IV2 = phi [...], [ %IV1, %latch ].
2975     // In this case, if IV1 has an external use, we need to avoid adding both
2976     // "last value of IV1" and "penultimate value of IV2". So, verify that we
2977     // don't already have an incoming value for the middle block.
2978     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
2979       PHI->addIncoming(I.second, MiddleBlock);
2980   }
2981 }
2982 
2983 namespace {
2984 
2985 struct CSEDenseMapInfo {
2986   static bool canHandle(const Instruction *I) {
2987     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
2988            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
2989   }
2990 
2991   static inline Instruction *getEmptyKey() {
2992     return DenseMapInfo<Instruction *>::getEmptyKey();
2993   }
2994 
2995   static inline Instruction *getTombstoneKey() {
2996     return DenseMapInfo<Instruction *>::getTombstoneKey();
2997   }
2998 
2999   static unsigned getHashValue(const Instruction *I) {
3000     assert(canHandle(I) && "Unknown instruction!");
3001     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3002                                                            I->value_op_end()));
3003   }
3004 
3005   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3006     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3007         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3008       return LHS == RHS;
3009     return LHS->isIdenticalTo(RHS);
3010   }
3011 };
3012 
3013 } // end anonymous namespace
3014 
/// Perform CSE of induction variable instructions.
3016 static void cse(BasicBlock *BB) {
3017   // Perform simple cse.
3018   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3019   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3020     Instruction *In = &*I++;
3021 
3022     if (!CSEDenseMapInfo::canHandle(In))
3023       continue;
3024 
3025     // Check if we can replace this instruction with any of the
3026     // visited instructions.
3027     if (Instruction *V = CSEMap.lookup(In)) {
3028       In->replaceAllUsesWith(V);
3029       In->eraseFromParent();
3030       continue;
3031     }
3032 
3033     CSEMap[In] = In;
3034   }
3035 }
3036 
3037 /// Estimate the overhead of scalarizing an instruction. This is a
3038 /// convenience wrapper for the type-based getScalarizationOverhead API.
3039 static unsigned getScalarizationOverhead(Instruction *I, unsigned VF,
3040                                          const TargetTransformInfo &TTI) {
3041   if (VF == 1)
3042     return 0;
3043 
3044   unsigned Cost = 0;
3045   Type *RetTy = ToVectorTy(I->getType(), VF);
3046   if (!RetTy->isVoidTy() &&
3047       (!isa<LoadInst>(I) ||
3048        !TTI.supportsEfficientVectorElementLoadStore()))
3049     Cost += TTI.getScalarizationOverhead(RetTy, true, false);
3050 
3051   // Some targets keep addresses scalar.
3052   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
3053     return Cost;
3054 
3055   if (CallInst *CI = dyn_cast<CallInst>(I)) {
3056     SmallVector<const Value *, 4> Operands(CI->arg_operands());
3057     Cost += TTI.getOperandsScalarizationOverhead(Operands, VF);
  } else if (!isa<StoreInst>(I) ||
             !TTI.supportsEfficientVectorElementLoadStore()) {
3061     SmallVector<const Value *, 4> Operands(I->operand_values());
3062     Cost += TTI.getOperandsScalarizationOverhead(Operands, VF);
3063   }
3064 
3065   return Cost;
3066 }
3067 
3068 // Estimate cost of a call instruction CI if it were vectorized with factor VF.
3069 // Return the cost of the instruction, including scalarization overhead if it's
3070 // needed. The flag NeedToScalarize shows if the call needs to be scalarized -
// i.e., either a vector version isn't available or it is too expensive.
3072 static unsigned getVectorCallCost(CallInst *CI, unsigned VF,
3073                                   const TargetTransformInfo &TTI,
3074                                   const TargetLibraryInfo *TLI,
3075                                   bool &NeedToScalarize) {
3076   Function *F = CI->getCalledFunction();
3077   StringRef FnName = CI->getCalledFunction()->getName();
3078   Type *ScalarRetTy = CI->getType();
3079   SmallVector<Type *, 4> Tys, ScalarTys;
3080   for (auto &ArgOp : CI->arg_operands())
3081     ScalarTys.push_back(ArgOp->getType());
3082 
3083   // Estimate cost of scalarized vector call. The source operands are assumed
3084   // to be vectors, so we need to extract individual elements from there,
3085   // execute VF scalar calls, and then gather the result into the vector return
3086   // value.
3087   unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
3088   if (VF == 1)
3089     return ScalarCallCost;
3090 
3091   // Compute corresponding vector type for return value and arguments.
3092   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3093   for (Type *ScalarTy : ScalarTys)
3094     Tys.push_back(ToVectorTy(ScalarTy, VF));
3095 
3096   // Compute costs of unpacking argument values for the scalar calls and
3097   // packing the return values to a vector.
3098   unsigned ScalarizationCost = getScalarizationOverhead(CI, VF, TTI);
3099 
3100   unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
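  // For illustration only: with VF = 4, a scalar call cost of 10 and a
  // scalarization overhead of 6, the scalarized cost is 4 * 10 + 6 = 46;
  // this is what we return unless a cheaper vector variant is found below.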
3101 
3102   // If we can't emit a vector call for this function, then the currently found
3103   // cost is the cost we need to return.
3104   NeedToScalarize = true;
3105   if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin())
3106     return Cost;
3107 
3108   // If the corresponding vector cost is cheaper, return its cost.
3109   unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
3110   if (VectorCallCost < Cost) {
3111     NeedToScalarize = false;
3112     return VectorCallCost;
3113   }
3114   return Cost;
3115 }
3116 
3117 // Estimate cost of an intrinsic call instruction CI if it were vectorized with
3118 // factor VF.  Return the cost of the instruction, including scalarization
3119 // overhead if it's needed.
3120 static unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF,
3121                                        const TargetTransformInfo &TTI,
3122                                        const TargetLibraryInfo *TLI) {
3123   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3124   assert(ID && "Expected intrinsic call!");
3125 
3126   FastMathFlags FMF;
3127   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3128     FMF = FPMO->getFastMathFlags();
3129 
3130   SmallVector<Value *, 4> Operands(CI->arg_operands());
3131   return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF);
3132 }
3133 
3134 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3135   auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3136   auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3137   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3138 }
3139 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3140   auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3141   auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3142   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3143 }
3144 
3145 void InnerLoopVectorizer::truncateToMinimalBitwidths() {
3146   // For every instruction `I` in MinBWs, truncate the operands, create a
3147   // truncated version of `I` and reextend its result. InstCombine runs
3148   // later and will remove any ext/trunc pairs.
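  // For example (shorthand IR, assuming a minimal bit width of 8 for a value
  // vectorized as <4 x i32>):
  //
  //   %a = add <4 x i32> %x, %y
  //
  // becomes
  //
  //   %x.tr = trunc <4 x i32> %x to <4 x i8>
  //   %y.tr = trunc <4 x i32> %y to <4 x i8>
  //   %a.tr = add <4 x i8> %x.tr, %y.tr
  //   %a.ex = zext <4 x i8> %a.tr to <4 x i32>
  //
  // and all uses of %a are rewritten to use %a.ex.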
3149   SmallPtrSet<Value *, 4> Erased;
3150   for (const auto &KV : Cost->getMinimalBitwidths()) {
3151     // If the value wasn't vectorized, we must maintain the original scalar
3152     // type. The absence of the value from VectorLoopValueMap indicates that it
3153     // wasn't vectorized.
3154     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3155       continue;
3156     for (unsigned Part = 0; Part < UF; ++Part) {
3157       Value *I = getOrCreateVectorValue(KV.first, Part);
3158       if (Erased.find(I) != Erased.end() || I->use_empty() ||
3159           !isa<Instruction>(I))
3160         continue;
3161       Type *OriginalTy = I->getType();
3162       Type *ScalarTruncatedTy =
3163           IntegerType::get(OriginalTy->getContext(), KV.second);
3164       Type *TruncatedTy = VectorType::get(ScalarTruncatedTy,
3165                                           OriginalTy->getVectorNumElements());
3166       if (TruncatedTy == OriginalTy)
3167         continue;
3168 
3169       IRBuilder<> B(cast<Instruction>(I));
3170       auto ShrinkOperand = [&](Value *V) -> Value * {
3171         if (auto *ZI = dyn_cast<ZExtInst>(V))
3172           if (ZI->getSrcTy() == TruncatedTy)
3173             return ZI->getOperand(0);
3174         return B.CreateZExtOrTrunc(V, TruncatedTy);
3175       };
3176 
3177       // The actual instruction modification depends on the instruction type,
3178       // unfortunately.
3179       Value *NewI = nullptr;
3180       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3181         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3182                              ShrinkOperand(BO->getOperand(1)));
3183 
3184         // Any wrapping introduced by shrinking this operation shouldn't be
3185         // considered undefined behavior. So, we can't unconditionally copy
3186         // arithmetic wrapping flags to NewI.
3187         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3188       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3189         NewI =
3190             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3191                          ShrinkOperand(CI->getOperand(1)));
3192       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3193         NewI = B.CreateSelect(SI->getCondition(),
3194                               ShrinkOperand(SI->getTrueValue()),
3195                               ShrinkOperand(SI->getFalseValue()));
3196       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3197         switch (CI->getOpcode()) {
3198         default:
3199           llvm_unreachable("Unhandled cast!");
3200         case Instruction::Trunc:
3201           NewI = ShrinkOperand(CI->getOperand(0));
3202           break;
3203         case Instruction::SExt:
3204           NewI = B.CreateSExtOrTrunc(
3205               CI->getOperand(0),
3206               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3207           break;
3208         case Instruction::ZExt:
3209           NewI = B.CreateZExtOrTrunc(
3210               CI->getOperand(0),
3211               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3212           break;
3213         }
3214       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3215         auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements();
3216         auto *O0 = B.CreateZExtOrTrunc(
3217             SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3218         auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements();
3219         auto *O1 = B.CreateZExtOrTrunc(
3220             SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3221 
3222         NewI = B.CreateShuffleVector(O0, O1, SI->getMask());
3223       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3224         // Don't do anything with the operands, just extend the result.
3225         continue;
3226       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3227         auto Elements = IE->getOperand(0)->getType()->getVectorNumElements();
3228         auto *O0 = B.CreateZExtOrTrunc(
3229             IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3230         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3231         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3232       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3233         auto Elements = EE->getOperand(0)->getType()->getVectorNumElements();
3234         auto *O0 = B.CreateZExtOrTrunc(
3235             EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3236         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3237       } else {
3238         // If we don't know what to do, be conservative and don't do anything.
3239         continue;
3240       }
3241 
3242       // Lastly, extend the result.
3243       NewI->takeName(cast<Instruction>(I));
3244       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3245       I->replaceAllUsesWith(Res);
3246       cast<Instruction>(I)->eraseFromParent();
3247       Erased.insert(I);
3248       VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3249     }
3250   }
3251 
3252   // We'll have created a bunch of ZExts that are now parentless. Clean up.
3253   for (const auto &KV : Cost->getMinimalBitwidths()) {
3254     // If the value wasn't vectorized, we must maintain the original scalar
3255     // type. The absence of the value from VectorLoopValueMap indicates that it
3256     // wasn't vectorized.
3257     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3258       continue;
3259     for (unsigned Part = 0; Part < UF; ++Part) {
3260       Value *I = getOrCreateVectorValue(KV.first, Part);
3261       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3262       if (Inst && Inst->use_empty()) {
3263         Value *NewI = Inst->getOperand(0);
3264         Inst->eraseFromParent();
3265         VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3266       }
3267     }
3268   }
3269 }
3270 
3271 void InnerLoopVectorizer::fixVectorizedLoop() {
3272   // Insert truncates and extends for any truncated instructions as hints to
3273   // InstCombine.
3274   if (VF > 1)
3275     truncateToMinimalBitwidths();
3276 
3277   // Fix widened non-induction PHIs by setting up the PHI operands.
3278   if (OrigPHIsToFix.size()) {
3279     assert(EnableVPlanNativePath &&
3280            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3281     fixNonInductionPHIs();
3282   }
3283 
3284   // At this point every instruction in the original loop is widened to a
3285   // vector form. Now we need to fix the recurrences in the loop. These PHI
3286   // nodes are currently empty because we did not want to introduce cycles.
3287   // This is the second stage of vectorizing recurrences.
3288   fixCrossIterationPHIs();
3289 
3290   // Update the dominator tree.
3291   //
3292   // FIXME: After creating the structure of the new loop, the dominator tree is
3293   //        no longer up-to-date, and it remains that way until we update it
3294   //        here. An out-of-date dominator tree is problematic for SCEV,
3295   //        because SCEVExpander uses it to guide code generation. The
  //        vectorizer uses SCEVExpanders in several places. Instead, we should
3297   //        keep the dominator tree up-to-date as we go.
3298   updateAnalysis();
3299 
3300   // Fix-up external users of the induction variables.
3301   for (auto &Entry : *Legal->getInductionVars())
3302     fixupIVUsers(Entry.first, Entry.second,
3303                  getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3304                  IVEndValues[Entry.first], LoopMiddleBlock);
3305 
3306   fixLCSSAPHIs();
3307   for (Instruction *PI : PredicatedInstructions)
3308     sinkScalarOperands(&*PI);
3309 
3310   // Remove redundant induction instructions.
3311   cse(LoopVectorBody);
3312 }
3313 
3314 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3315   // In order to support recurrences we need to be able to vectorize Phi nodes.
3316   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3317   // stage #2: We now need to fix the recurrences by adding incoming edges to
3318   // the currently empty PHI nodes. At this point every instruction in the
3319   // original loop is widened to a vector form so we can use them to construct
3320   // the incoming edges.
3321   for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3322     // Handle first-order recurrences and reductions that need to be fixed.
3323     if (Legal->isFirstOrderRecurrence(&Phi))
3324       fixFirstOrderRecurrence(&Phi);
3325     else if (Legal->isReductionVariable(&Phi))
3326       fixReduction(&Phi);
3327   }
3328 }
3329 
3330 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3331   // This is the second phase of vectorizing first-order recurrences. An
3332   // overview of the transformation is described below. Suppose we have the
3333   // following loop.
3334   //
3335   //   for (int i = 0; i < n; ++i)
3336   //     b[i] = a[i] - a[i - 1];
3337   //
3338   // There is a first-order recurrence on "a". For this loop, the shorthand
3339   // scalar IR looks like:
3340   //
3341   //   scalar.ph:
3342   //     s_init = a[-1]
3343   //     br scalar.body
3344   //
3345   //   scalar.body:
3346   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3347   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3348   //     s2 = a[i]
3349   //     b[i] = s2 - s1
3350   //     br cond, scalar.body, ...
3351   //
  // In this example, s1 is a recurrence because its value depends on the
3353   // previous iteration. In the first phase of vectorization, we created a
3354   // temporary value for s1. We now complete the vectorization and produce the
3355   // shorthand vector IR shown below (for VF = 4, UF = 1).
3356   //
3357   //   vector.ph:
3358   //     v_init = vector(..., ..., ..., a[-1])
3359   //     br vector.body
3360   //
3361   //   vector.body
3362   //     i = phi [0, vector.ph], [i+4, vector.body]
3363   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3364   //     v2 = a[i, i+1, i+2, i+3];
3365   //     v3 = vector(v1(3), v2(0, 1, 2))
3366   //     b[i, i+1, i+2, i+3] = v2 - v3
3367   //     br cond, vector.body, middle.block
3368   //
3369   //   middle.block:
3370   //     x = v2(3)
3371   //     br scalar.ph
3372   //
3373   //   scalar.ph:
3374   //     s_init = phi [x, middle.block], [a[-1], otherwise]
3375   //     br scalar.body
3376   //
3377   // After execution completes the vector loop, we extract the next value of
3378   // the recurrence (x) to use as the initial value in the scalar loop.
3379 
3380   // Get the original loop preheader and single loop latch.
3381   auto *Preheader = OrigLoop->getLoopPreheader();
3382   auto *Latch = OrigLoop->getLoopLatch();
3383 
3384   // Get the initial and previous values of the scalar recurrence.
3385   auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3386   auto *Previous = Phi->getIncomingValueForBlock(Latch);
3387 
3388   // Create a vector from the initial value.
3389   auto *VectorInit = ScalarInit;
3390   if (VF > 1) {
3391     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3392     VectorInit = Builder.CreateInsertElement(
3393         UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
3394         Builder.getInt32(VF - 1), "vector.recur.init");
3395   }
3396 
3397   // We constructed a temporary phi node in the first phase of vectorization.
3398   // This phi node will eventually be deleted.
3399   Builder.SetInsertPoint(
3400       cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
3401 
3402   // Create a phi node for the new recurrence. The current value will either be
3403   // the initial value inserted into a vector or loop-varying vector value.
3404   auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
3405   VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
3406 
3407   // Get the vectorized previous value of the last part UF - 1. It appears last
3408   // among all unrolled iterations, due to the order of their construction.
3409   Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
3410 
3411   // Set the insertion point after the previous value if it is an instruction.
3412   // Note that the previous value may have been constant-folded so it is not
3413   // guaranteed to be an instruction in the vector loop. Also, if the previous
3414   // value is a phi node, we should insert after all the phi nodes to avoid
3415   // breaking basic block verification.
3416   if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart) ||
3417       isa<PHINode>(PreviousLastPart))
3418     Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3419   else
3420     Builder.SetInsertPoint(
3421         &*++BasicBlock::iterator(cast<Instruction>(PreviousLastPart)));
3422 
3423   // We will construct a vector for the recurrence by combining the values for
3424   // the current and previous iterations. This is the required shuffle mask.
3425   SmallVector<Constant *, 8> ShuffleMask(VF);
3426   ShuffleMask[0] = Builder.getInt32(VF - 1);
3427   for (unsigned I = 1; I < VF; ++I)
3428     ShuffleMask[I] = Builder.getInt32(I + VF - 1);
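  // For example, with VF = 4 the mask is <3, 4, 5, 6>, selecting the last
  // element of the incoming vector followed by the first three elements of
  // the vector holding the previous value.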
3429 
3430   // The vector from which to take the initial value for the current iteration
3431   // (actual or unrolled). Initially, this is the vector phi node.
3432   Value *Incoming = VecPhi;
3433 
3434   // Shuffle the current and previous vector and update the vector parts.
3435   for (unsigned Part = 0; Part < UF; ++Part) {
3436     Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
3437     Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
3438     auto *Shuffle =
3439         VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
3440                                              ConstantVector::get(ShuffleMask))
3441                : Incoming;
3442     PhiPart->replaceAllUsesWith(Shuffle);
3443     cast<Instruction>(PhiPart)->eraseFromParent();
3444     VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
3445     Incoming = PreviousPart;
3446   }
3447 
3448   // Fix the latch value of the new recurrence in the vector loop.
3449   VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3450 
3451   // Extract the last vector element in the middle block. This will be the
3452   // initial value for the recurrence when jumping to the scalar loop.
3453   auto *ExtractForScalar = Incoming;
3454   if (VF > 1) {
3455     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3456     ExtractForScalar = Builder.CreateExtractElement(
3457         ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
3458   }
  // Extract the second-to-last element in the middle block if the
3460   // Phi is used outside the loop. We need to extract the phi itself
3461   // and not the last element (the phi update in the current iteration). This
3462   // will be the value when jumping to the exit block from the LoopMiddleBlock,
3463   // when the scalar loop is not run at all.
3464   Value *ExtractForPhiUsedOutsideLoop = nullptr;
3465   if (VF > 1)
3466     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3467         Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
  // When the loop is unrolled without vectorizing, initialize
  // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled
  // value of `Incoming`. This is analogous to the vectorized case above:
  // extracting the second-to-last element when VF > 1.
3472   else if (UF > 1)
3473     ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
3474 
3475   // Fix the initial value of the original recurrence in the scalar loop.
3476   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3477   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3478   for (auto *BB : predecessors(LoopScalarPreHeader)) {
3479     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3480     Start->addIncoming(Incoming, BB);
3481   }
3482 
3483   Phi->setIncomingValue(Phi->getBasicBlockIndex(LoopScalarPreHeader), Start);
3484   Phi->setName("scalar.recur");
3485 
3486   // Finally, fix users of the recurrence outside the loop. The users will need
3487   // either the last value of the scalar recurrence or the last value of the
3488   // vector recurrence we extracted in the middle block. Since the loop is in
3489   // LCSSA form, we just need to find all the phi nodes for the original scalar
3490   // recurrence in the exit block, and then add an edge for the middle block.
3491   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3492     if (LCSSAPhi.getIncomingValue(0) == Phi) {
3493       LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3494     }
3495   }
3496 }
3497 
3498 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
3499   Constant *Zero = Builder.getInt32(0);
3500 
  // Get its reduction variable descriptor.
3502   assert(Legal->isReductionVariable(Phi) &&
3503          "Unable to find the reduction variable");
3504   RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi];
3505 
3506   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3507   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3508   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3509   RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3510     RdxDesc.getMinMaxRecurrenceKind();
3511   setDebugLocFromInst(Builder, ReductionStartValue);
3512 
3513   // We need to generate a reduction vector from the incoming scalar.
3514   // To do so, we need to generate the 'identity' vector and override
3515   // one of the elements with the incoming scalar reduction. We need
3516   // to do it in the vector-loop preheader.
3517   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3518 
3519   // This is the vector-clone of the value that leaves the loop.
3520   Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3521 
  // Find the reduction identity value: zero for addition, or, and xor;
  // one for multiplication; and -1 for 'and'.
3524   Value *Identity;
3525   Value *VectorStart;
3526   if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3527       RK == RecurrenceDescriptor::RK_FloatMinMax) {
    // MinMax reductions have the start value as their identity.
3529     if (VF == 1) {
3530       VectorStart = Identity = ReductionStartValue;
3531     } else {
3532       VectorStart = Identity =
3533         Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3534     }
3535   } else {
3536     // Handle other reduction kinds:
3537     Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
3538         RK, VecTy->getScalarType());
3539     if (VF == 1) {
3540       Identity = Iden;
3541       // This vector is the Identity vector where the first element is the
3542       // incoming scalar reduction.
3543       VectorStart = ReductionStartValue;
3544     } else {
3545       Identity = ConstantVector::getSplat(VF, Iden);
3546 
3547       // This vector is the Identity vector where the first element is the
3548       // incoming scalar reduction.
3549       VectorStart =
3550         Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3551     }
3552   }
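  // For illustration only: for an integer add reduction with VF = 4 and
  // start value %init, Identity is <0, 0, 0, 0> and VectorStart is
  // <%init, 0, 0, 0>.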
3553 
3554   // Fix the vector-loop phi.
3555 
  // Reductions do not have to start at zero. They can start with
  // any loop-invariant value.
3558   BasicBlock *Latch = OrigLoop->getLoopLatch();
3559   Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
3560   for (unsigned Part = 0; Part < UF; ++Part) {
3561     Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
3562     Value *Val = getOrCreateVectorValue(LoopVal, Part);
    // Make sure to add the reduction start value only to the
    // first unroll part.
3565     Value *StartVal = (Part == 0) ? VectorStart : Identity;
3566     cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
3567     cast<PHINode>(VecRdxPhi)
3568       ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3569   }
3570 
3571   // Before each round, move the insertion point right between
3572   // the PHIs and the values we are going to write.
3573   // This allows us to write both PHINodes and the extractelement
3574   // instructions.
3575   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3576 
3577   setDebugLocFromInst(Builder, LoopExitInst);
3578 
3579   // If the vector reduction can be performed in a smaller type, we truncate
3580   // then extend the loop exit value to enable InstCombine to evaluate the
3581   // entire expression in the smaller type.
3582   if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
3583     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3584     Builder.SetInsertPoint(
3585         LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
3586     VectorParts RdxParts(UF);
3587     for (unsigned Part = 0; Part < UF; ++Part) {
3588       RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3589       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3590       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3591                                         : Builder.CreateZExt(Trunc, VecTy);
3592       for (Value::user_iterator UI = RdxParts[Part]->user_begin();
3593            UI != RdxParts[Part]->user_end();)
3594         if (*UI != Trunc) {
3595           (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
3596           RdxParts[Part] = Extnd;
3597         } else {
3598           ++UI;
3599         }
3600     }
3601     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3602     for (unsigned Part = 0; Part < UF; ++Part) {
3603       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3604       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
3605     }
3606   }
3607 
3608   // Reduce all of the unrolled parts into a single vector.
3609   Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
3610   unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
3611   setDebugLocFromInst(Builder, ReducedPartRdx);
3612   for (unsigned Part = 1; Part < UF; ++Part) {
3613     Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3614     if (Op != Instruction::ICmp && Op != Instruction::FCmp)
3615       // Floating point operations had to be 'fast' to enable the reduction.
3616       ReducedPartRdx = addFastMathFlag(
3617           Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
3618                               ReducedPartRdx, "bin.rdx"),
3619           RdxDesc.getFastMathFlags());
3620     else
3621       ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
3622                                       RdxPart);
3623   }
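  // For example, with UF = 2 the two unrolled parts are combined by a single
  // 'bin.rdx' operation (or a min/max), before the reduction across vector
  // lanes below (when VF > 1).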
3624 
3625   if (VF > 1) {
3626     bool NoNaN = Legal->hasFunNoNaNAttr();
3627     ReducedPartRdx =
3628         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
3629     // If the reduction can be performed in a smaller type, we need to extend
3630     // the reduction to the wider type before we branch to the original loop.
3631     if (Phi->getType() != RdxDesc.getRecurrenceType())
3632       ReducedPartRdx =
3633         RdxDesc.isSigned()
3634         ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
3635         : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
3636   }
3637 
3638   // Create a phi node that merges control-flow from the backedge-taken check
3639   // block and the middle block.
3640   PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
3641                                         LoopScalarPreHeader->getTerminator());
3642   for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
3643     BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
3644   BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
3645 
3646   // Now, we need to fix the users of the reduction variable
3647   // inside and outside of the scalar remainder loop.
3648   // We know that the loop is in LCSSA form. We need to update the
3649   // PHI nodes in the exit blocks.
3650   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3651     // All PHINodes need to have a single entry edge, or two if
3652     // we already fixed them.
3653     assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
3654 
3655     // We found a reduction value exit-PHI. Update it with the
3656     // incoming bypass edge.
3657     if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
3658       LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
3659   } // end of the LCSSA phi scan.
3660 
  // Fix the scalar loop reduction variable with the incoming reduction sum
  // from the vector body and from the backedge value.
3663   int IncomingEdgeBlockIdx =
3664     Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
3665   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
3666   // Pick the other block.
3667   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
3668   Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
3669   Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
3670 }
3671 
3672 void InnerLoopVectorizer::fixLCSSAPHIs() {
3673   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
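    // An LCSSA phi that still has a single incoming value has not yet been
    // given a value from the vector middle block; fix it up here.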
3674     if (LCSSAPhi.getNumIncomingValues() == 1) {
3675       auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
      // Non-instruction incoming values (e.g. constants) have only a single
      // scalar value, so lane zero is used.
3677       unsigned LastLane = 0;
3678       if (isa<Instruction>(IncomingValue))
3679           LastLane = Cost->isUniformAfterVectorization(
3680                          cast<Instruction>(IncomingValue), VF)
3681                          ? 0
3682                          : VF - 1;
3683       // Can be a loop invariant incoming value or the last scalar value to be
3684       // extracted from the vectorized loop.
3685       Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3686       Value *lastIncomingValue =
3687           getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
3688       LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
3689     }
3690   }
3691 }
3692 
3693 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
3694   // The basic block and loop containing the predicated instruction.
3695   auto *PredBB = PredInst->getParent();
3696   auto *VectorLoop = LI->getLoopFor(PredBB);
3697 
3698   // Initialize a worklist with the operands of the predicated instruction.
3699   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3700 
3701   // Holds instructions that we need to analyze again. An instruction may be
3702   // reanalyzed if we don't yet know if we can sink it or not.
3703   SmallVector<Instruction *, 8> InstsToReanalyze;
3704 
3705   // Returns true if a given use occurs in the predicated block. Phi nodes use
3706   // their operands in their corresponding predecessor blocks.
3707   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3708     auto *I = cast<Instruction>(U.getUser());
3709     BasicBlock *BB = I->getParent();
3710     if (auto *Phi = dyn_cast<PHINode>(I))
3711       BB = Phi->getIncomingBlock(
3712           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3713     return BB == PredBB;
3714   };
3715 
  // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, its
  // operands are then added to the worklist. The algorithm ends when a full
  // pass through the worklist fails to sink a single instruction.
3720   bool Changed;
3721   do {
3722     // Add the instructions that need to be reanalyzed to the worklist, and
3723     // reset the changed indicator.
3724     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3725     InstsToReanalyze.clear();
3726     Changed = false;
3727 
3728     while (!Worklist.empty()) {
3729       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3730 
3731       // We can't sink an instruction if it is a phi node, is already in the
3732       // predicated block, is not in the loop, or may have side effects.
3733       if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
3734           !VectorLoop->contains(I) || I->mayHaveSideEffects())
3735         continue;
3736 
3737       // It's legal to sink the instruction if all its uses occur in the
3738       // predicated block. Otherwise, there's nothing to do yet, and we may
3739       // need to reanalyze the instruction.
3740       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
3741         InstsToReanalyze.push_back(I);
3742         continue;
3743       }
3744 
      // Move the instruction to the beginning of the predicated block, and add
      // its operands to the worklist.
3747       I->moveBefore(&*PredBB->getFirstInsertionPt());
3748       Worklist.insert(I->op_begin(), I->op_end());
3749 
3750       // The sinking may have enabled other instructions to be sunk, so we will
3751       // need to iterate.
3752       Changed = true;
3753     }
3754   } while (Changed);
3755 }
3756 
3757 void InnerLoopVectorizer::fixNonInductionPHIs() {
3758   for (PHINode *OrigPhi : OrigPHIsToFix) {
3759     PHINode *NewPhi =
3760         cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
3761     unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
3762 
3763     SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
3764         predecessors(OrigPhi->getParent()));
3765     SmallVector<BasicBlock *, 2> VectorBBPredecessors(
3766         predecessors(NewPhi->getParent()));
3767     assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
3768            "Scalar and Vector BB should have the same number of predecessors");
3769 
3770     // The insertion point in Builder may be invalidated by the time we get
3771     // here. Force the Builder insertion point to something valid so that we do
3772     // not run into issues during insertion point restore in
3773     // getOrCreateVectorValue calls below.
3774     Builder.SetInsertPoint(NewPhi);
3775 
3776     // The predecessor order is preserved and we can rely on mapping between
3777     // scalar and vector block predecessors.
3778     for (unsigned i = 0; i < NumIncomingValues; ++i) {
3779       BasicBlock *NewPredBB = VectorBBPredecessors[i];
3780 
3781       // When looking up the new scalar/vector values to fix up, use incoming
3782       // values from original phi.
3783       Value *ScIncV =
3784           OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
3785 
3786       // Scalar incoming value may need a broadcast
3787       Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
3788       NewPhi->addIncoming(NewIncV, NewPredBB);
3789     }
3790   }
3791 }
3792 
3793 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
3794                                               unsigned VF) {
3795   PHINode *P = cast<PHINode>(PN);
3796   if (EnableVPlanNativePath) {
3797     // Currently we enter here in the VPlan-native path for non-induction
3798     // PHIs where all control flow is uniform. We simply widen these PHIs.
3799     // Create a vector phi with no operands - the vector phi operands will be
3800     // set at the end of vector code generation.
3801     Type *VecTy =
3802         (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
3803     Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
3804     VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
3805     OrigPHIsToFix.push_back(P);
3806 
3807     return;
3808   }
3809 
3810   assert(PN->getParent() == OrigLoop->getHeader() &&
3811          "Non-header phis should have been handled elsewhere");
3812 
3813   // In order to support recurrences we need to be able to vectorize Phi nodes.
3814   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3815   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
3816   // this value when we vectorize all of the instructions that use the PHI.
3817   if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
3818     for (unsigned Part = 0; Part < UF; ++Part) {
3819       // This is phase one of vectorizing PHIs.
3820       Type *VecTy =
3821           (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
3822       Value *EntryPart = PHINode::Create(
3823           VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
3824       VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
3825     }
3826     return;
3827   }
3828 
3829   setDebugLocFromInst(Builder, P);
3830 
3831   // This PHINode must be an induction variable.
3832   // Make sure that we know about it.
3833   assert(Legal->getInductionVars()->count(P) && "Not an induction variable");
3834 
3835   InductionDescriptor II = Legal->getInductionVars()->lookup(P);
3836   const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
3837 
3838   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
3839   // which can be found from the original scalar operations.
3840   switch (II.getKind()) {
3841   case InductionDescriptor::IK_NoInduction:
3842     llvm_unreachable("Unknown induction");
3843   case InductionDescriptor::IK_IntInduction:
3844   case InductionDescriptor::IK_FpInduction:
3845     llvm_unreachable("Integer/fp induction is handled elsewhere.");
3846   case InductionDescriptor::IK_PtrInduction: {
3847     // Handle the pointer induction variable case.
3848     assert(P->getType()->isPointerTy() && "Unexpected type.");
3849     // This is the normalized GEP that starts counting at zero.
3850     Value *PtrInd = Induction;
3851     PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
3852     // Determine the number of scalars we need to generate for each unroll
3853     // iteration. If the instruction is uniform, we only need to generate the
3854     // first lane. Otherwise, we generate all VF values.
3855     unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
3856     // These are the scalar results. Notice that we don't generate vector GEPs
3857     // because scalar GEPs result in better code.
3858     for (unsigned Part = 0; Part < UF; ++Part) {
3859       for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
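        // The scalar pointer for lane 'Lane' of unroll part 'Part' is the
        // transformed index of PtrInd + Part * VF + Lane.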
3860         Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
3861         Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
3862         Value *SclrGep =
3863             emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
3864         SclrGep->setName("next.gep");
3865         VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
3866       }
3867     }
3868     return;
3869   }
3870   }
3871 }
3872 
3873 /// A helper function for checking whether an integer division-related
3874 /// instruction may divide by zero (in which case it must be predicated if
3875 /// executed conditionally in the scalar code).
3876 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
3877 /// Non-zero divisors that are non compile-time constants will not be
3878 /// converted into multiplication, so we will still end up scalarizing
3879 /// the division, but can do so w/o predication.
3880 static bool mayDivideByZero(Instruction &I) {
3881   assert((I.getOpcode() == Instruction::UDiv ||
3882           I.getOpcode() == Instruction::SDiv ||
3883           I.getOpcode() == Instruction::URem ||
3884           I.getOpcode() == Instruction::SRem) &&
3885          "Unexpected instruction");
3886   Value *Divisor = I.getOperand(1);
3887   auto *CInt = dyn_cast<ConstantInt>(Divisor);
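  // A non-constant divisor, or a constant zero, may divide by zero.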
3888   return !CInt || CInt->isZero();
3889 }
3890 
3891 void InnerLoopVectorizer::widenInstruction(Instruction &I) {
3892   switch (I.getOpcode()) {
3893   case Instruction::Br:
3894   case Instruction::PHI:
3895     llvm_unreachable("This instruction is handled by a different recipe.");
3896   case Instruction::GetElementPtr: {
3897     // Construct a vector GEP by widening the operands of the scalar GEP as
3898     // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
3899     // results in a vector of pointers when at least one operand of the GEP
3900     // is vector-typed. Thus, to keep the representation compact, we only use
3901     // vector-typed operands for loop-varying values.
3902     auto *GEP = cast<GetElementPtrInst>(&I);
3903 
3904     if (VF > 1 && OrigLoop->hasLoopInvariantOperands(GEP)) {
3905       // If we are vectorizing, but the GEP has only loop-invariant operands,
3906       // the GEP we build (by only using vector-typed operands for
3907       // loop-varying values) would be a scalar pointer. Thus, to ensure we
3908       // produce a vector of pointers, we need to either arbitrarily pick an
3909       // operand to broadcast, or broadcast a clone of the original GEP.
3910       // Here, we broadcast a clone of the original.
3911       //
3912       // TODO: If at some point we decide to scalarize instructions having
3913       //       loop-invariant operands, this special case will no longer be
3914       //       required. We would add the scalarization decision to
3915       //       collectLoopScalars() and teach getVectorValue() to broadcast
3916       //       the lane-zero scalar value.
3917       auto *Clone = Builder.Insert(GEP->clone());
3918       for (unsigned Part = 0; Part < UF; ++Part) {
3919         Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
3920         VectorLoopValueMap.setVectorValue(&I, Part, EntryPart);
3921         addMetadata(EntryPart, GEP);
3922       }
3923     } else {
3924       // If the GEP has at least one loop-varying operand, we are sure to
3925       // produce a vector of pointers. But if we are only unrolling, we want
3926       // to produce a scalar GEP for each unroll part. Thus, the GEP we
3927       // produce with the code below will be scalar (if VF == 1) or vector
3928       // (otherwise). Note that for the unroll-only case, we still maintain
3929       // values in the vector mapping with initVector, as we do for other
3930       // instructions.
3931       for (unsigned Part = 0; Part < UF; ++Part) {
3932         // The pointer operand of the new GEP. If it's loop-invariant, we
3933         // won't broadcast it.
3934         auto *Ptr =
3935             OrigLoop->isLoopInvariant(GEP->getPointerOperand())
3936                 ? GEP->getPointerOperand()
3937                 : getOrCreateVectorValue(GEP->getPointerOperand(), Part);
3938 
3939         // Collect all the indices for the new GEP. If any index is
3940         // loop-invariant, we won't broadcast it.
3941         SmallVector<Value *, 4> Indices;
3942         for (auto &U : make_range(GEP->idx_begin(), GEP->idx_end())) {
3943           if (OrigLoop->isLoopInvariant(U.get()))
3944             Indices.push_back(U.get());
3945           else
3946             Indices.push_back(getOrCreateVectorValue(U.get(), Part));
3947         }
3948 
3949         // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
3950         // but it should be a vector, otherwise.
3951         auto *NewGEP =
3952             GEP->isInBounds()
3953                 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
3954                                             Indices)
3955                 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
3956         assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
3957                "NewGEP is not a pointer vector");
3958         VectorLoopValueMap.setVectorValue(&I, Part, NewGEP);
3959         addMetadata(NewGEP, GEP);
3960       }
3961     }
3962 
3963     break;
3964   }
3965   case Instruction::UDiv:
3966   case Instruction::SDiv:
3967   case Instruction::SRem:
3968   case Instruction::URem:
3969   case Instruction::Add:
3970   case Instruction::FAdd:
3971   case Instruction::Sub:
3972   case Instruction::FSub:
3973   case Instruction::Mul:
3974   case Instruction::FMul:
3975   case Instruction::FDiv:
3976   case Instruction::FRem:
3977   case Instruction::Shl:
3978   case Instruction::LShr:
3979   case Instruction::AShr:
3980   case Instruction::And:
3981   case Instruction::Or:
3982   case Instruction::Xor: {
3983     // Just widen binops.
3984     auto *BinOp = cast<BinaryOperator>(&I);
3985     setDebugLocFromInst(Builder, BinOp);
3986 
3987     for (unsigned Part = 0; Part < UF; ++Part) {
3988       Value *A = getOrCreateVectorValue(BinOp->getOperand(0), Part);
3989       Value *B = getOrCreateVectorValue(BinOp->getOperand(1), Part);
3990       Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A, B);
3991 
3992       if (BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V))
3993         VecOp->copyIRFlags(BinOp);
3994 
3995       // Use this vector value for all users of the original instruction.
3996       VectorLoopValueMap.setVectorValue(&I, Part, V);
3997       addMetadata(V, BinOp);
3998     }
3999 
4000     break;
4001   }
4002   case Instruction::Select: {
4003     // Widen selects.
4004     // If the selector is loop invariant we can create a select
4005     // instruction with a scalar condition. Otherwise, use vector-select.
4006     auto *SE = PSE.getSE();
4007     bool InvariantCond =
4008         SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop);
4009     setDebugLocFromInst(Builder, &I);
4010 
    // The condition can be loop invariant but still defined inside the
4012     // loop. This means that we can't just use the original 'cond' value.
4013     // We have to take the 'vectorized' value and pick the first lane.
4014     // Instcombine will make this a no-op.
4015 
4016     auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0});
4017 
4018     for (unsigned Part = 0; Part < UF; ++Part) {
4019       Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part);
4020       Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part);
4021       Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part);
4022       Value *Sel =
4023           Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1);
4024       VectorLoopValueMap.setVectorValue(&I, Part, Sel);
4025       addMetadata(Sel, &I);
4026     }
4027 
4028     break;
4029   }
4030 
4031   case Instruction::ICmp:
4032   case Instruction::FCmp: {
4033     // Widen compares. Generate vector compares.
4034     bool FCmp = (I.getOpcode() == Instruction::FCmp);
4035     auto *Cmp = dyn_cast<CmpInst>(&I);
4036     setDebugLocFromInst(Builder, Cmp);
4037     for (unsigned Part = 0; Part < UF; ++Part) {
4038       Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part);
4039       Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part);
4040       Value *C = nullptr;
4041       if (FCmp) {
4042         // Propagate fast math flags.
4043         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4044         Builder.setFastMathFlags(Cmp->getFastMathFlags());
4045         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4046       } else {
4047         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4048       }
4049       VectorLoopValueMap.setVectorValue(&I, Part, C);
4050       addMetadata(C, &I);
4051     }
4052 
4053     break;
4054   }
4055 
4056   case Instruction::ZExt:
4057   case Instruction::SExt:
4058   case Instruction::FPToUI:
4059   case Instruction::FPToSI:
4060   case Instruction::FPExt:
4061   case Instruction::PtrToInt:
4062   case Instruction::IntToPtr:
4063   case Instruction::SIToFP:
4064   case Instruction::UIToFP:
4065   case Instruction::Trunc:
4066   case Instruction::FPTrunc:
4067   case Instruction::BitCast: {
4068     auto *CI = dyn_cast<CastInst>(&I);
4069     setDebugLocFromInst(Builder, CI);
4070 
4071     /// Vectorize casts.
4072     Type *DestTy =
4073         (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
4074 
4075     for (unsigned Part = 0; Part < UF; ++Part) {
4076       Value *A = getOrCreateVectorValue(CI->getOperand(0), Part);
4077       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4078       VectorLoopValueMap.setVectorValue(&I, Part, Cast);
4079       addMetadata(Cast, &I);
4080     }
4081     break;
4082   }
4083 
4084   case Instruction::Call: {
4085     // Ignore dbg intrinsics.
4086     if (isa<DbgInfoIntrinsic>(I))
4087       break;
4088     setDebugLocFromInst(Builder, &I);
4089 
4090     Module *M = I.getParent()->getParent()->getParent();
4091     auto *CI = cast<CallInst>(&I);
4092 
4093     StringRef FnName = CI->getCalledFunction()->getName();
4094     Function *F = CI->getCalledFunction();
4095     Type *RetTy = ToVectorTy(CI->getType(), VF);
4096     SmallVector<Type *, 4> Tys;
4097     for (Value *ArgOperand : CI->arg_operands())
4098       Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
4099 
4100     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4101 
    // Decide whether to use a vector intrinsic or a vector library call for
    // the widened instruction: the intrinsic is preferred when it is no more
    // expensive than the library call.
4105     bool NeedToScalarize;
4106     unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize);
4107     bool UseVectorIntrinsic =
4108         ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost;
4109     assert((UseVectorIntrinsic || !NeedToScalarize) &&
4110            "Instruction should be scalarized elsewhere.");
4111 
4112     for (unsigned Part = 0; Part < UF; ++Part) {
4113       SmallVector<Value *, 4> Args;
4114       for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
4115         Value *Arg = CI->getArgOperand(i);
4116         // Some intrinsics have a scalar argument - don't replace it with a
4117         // vector.
4118         if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i))
4119           Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part);
4120         Args.push_back(Arg);
4121       }
4122 
4123       Function *VectorF;
4124       if (UseVectorIntrinsic) {
4125         // Use vector version of the intrinsic.
4126         Type *TysForDecl[] = {CI->getType()};
4127         if (VF > 1)
4128           TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4129         VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4130       } else {
4131         // Use vector version of the library call.
4132         StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
4133         assert(!VFnName.empty() && "Vector function name is empty.");
4134         VectorF = M->getFunction(VFnName);
4135         if (!VectorF) {
4136           // Generate a declaration
4137           FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
4138           VectorF =
4139               Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
4140           VectorF->copyAttributesFrom(F);
4141         }
4142       }
4143       assert(VectorF && "Can't create vector function.");
4144 
4145       SmallVector<OperandBundleDef, 1> OpBundles;
4146       CI->getOperandBundlesAsDefs(OpBundles);
4147       CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
4148 
4149       if (isa<FPMathOperator>(V))
4150         V->copyFastMathFlags(CI);
4151 
4152       VectorLoopValueMap.setVectorValue(&I, Part, V);
4153       addMetadata(V, &I);
4154     }
4155 
4156     break;
4157   }
4158 
4159   default:
4160     // This instruction is not vectorized by simple widening.
4161     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4162     llvm_unreachable("Unhandled instruction!");
4163   } // end of switch.
4164 }
4165 
4166 void InnerLoopVectorizer::updateAnalysis() {
4167   // Forget the original basic block.
4168   PSE.getSE()->forgetLoop(OrigLoop);
4169 
4170   // DT is not kept up-to-date for outer loop vectorization
4171   if (EnableVPlanNativePath)
4172     return;
4173 
4174   // Update the dominator tree information.
4175   assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
4176          "Entry does not dominate exit.");
4177 
4178   DT->addNewBlock(LoopMiddleBlock,
4179                   LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4180   DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
4181   DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
4182   DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
4183   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
4184 }
4185 
4186 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
4187   // We should not collect Scalars more than once per VF. Right now, this
4188   // function is called from collectUniformsAndScalars(), which already does
4189   // this check. Collecting Scalars for VF=1 does not make any sense.
4190   assert(VF >= 2 && Scalars.find(VF) == Scalars.end() &&
4191          "This function should not be visited twice for the same VF");
4192 
4193   SmallSetVector<Instruction *, 8> Worklist;
4194 
4195   // These sets are used to seed the analysis with pointers used by memory
4196   // accesses that will remain scalar.
4197   SmallSetVector<Instruction *, 8> ScalarPtrs;
4198   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4199 
4200   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4201   // The pointer operands of loads and stores will be scalar as long as the
4202   // memory access is not a gather or scatter operation. The value operand of a
4203   // store will remain scalar if the store is scalarized.
4204   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4205     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4206     assert(WideningDecision != CM_Unknown &&
4207            "Widening decision should be ready at this moment");
4208     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4209       if (Ptr == Store->getValueOperand())
4210         return WideningDecision == CM_Scalarize;
    assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
           "Ptr is neither a value nor a pointer operand");
4213     return WideningDecision != CM_GatherScatter;
4214   };
4215 
4216   // A helper that returns true if the given value is a bitcast or
4217   // getelementptr instruction contained in the loop.
4218   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4219     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4220             isa<GetElementPtrInst>(V)) &&
4221            !TheLoop->isLoopInvariant(V);
4222   };
4223 
4224   // A helper that evaluates a memory access's use of a pointer. If the use
4225   // will be a scalar use, and the pointer is only used by memory accesses, we
4226   // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4227   // PossibleNonScalarPtrs.
4228   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4229     // We only care about bitcast and getelementptr instructions contained in
4230     // the loop.
4231     if (!isLoopVaryingBitCastOrGEP(Ptr))
4232       return;
4233 
4234     // If the pointer has already been identified as scalar (e.g., if it was
4235     // also identified as uniform), there's nothing to do.
4236     auto *I = cast<Instruction>(Ptr);
4237     if (Worklist.count(I))
4238       return;
4239 
4240     // If the use of the pointer will be a scalar use, and all users of the
4241     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4242     // place the pointer in PossibleNonScalarPtrs.
4243     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4244           return isa<LoadInst>(U) || isa<StoreInst>(U);
4245         }))
4246       ScalarPtrs.insert(I);
4247     else
4248       PossibleNonScalarPtrs.insert(I);
4249   };
4250 
4251   // We seed the scalars analysis with three classes of instructions: (1)
4252   // instructions marked uniform-after-vectorization, (2) bitcast and
4253   // getelementptr instructions used by memory accesses requiring a scalar use,
4254   // and (3) pointer induction variables and their update instructions (we
4255   // currently only scalarize these).
4256   //
4257   // (1) Add to the worklist all instructions that have been identified as
4258   // uniform-after-vectorization.
4259   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4260 
4261   // (2) Add to the worklist all bitcast and getelementptr instructions used by
4262   // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory access is not a gather or
4264   // scatter operation. The value operand of a store will remain scalar if the
4265   // store is scalarized.
4266   for (auto *BB : TheLoop->blocks())
4267     for (auto &I : *BB) {
4268       if (auto *Load = dyn_cast<LoadInst>(&I)) {
4269         evaluatePtrUse(Load, Load->getPointerOperand());
4270       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4271         evaluatePtrUse(Store, Store->getPointerOperand());
4272         evaluatePtrUse(Store, Store->getValueOperand());
4273       }
4274     }
4275   for (auto *I : ScalarPtrs)
4276     if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) {
4277       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4278       Worklist.insert(I);
4279     }
4280 
4281   // (3) Add to the worklist all pointer induction variables and their update
4282   // instructions.
4283   //
4284   // TODO: Once we are able to vectorize pointer induction variables we should
4285   //       no longer insert them into the worklist here.
4286   auto *Latch = TheLoop->getLoopLatch();
4287   for (auto &Induction : *Legal->getInductionVars()) {
4288     auto *Ind = Induction.first;
4289     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4290     if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
4291       continue;
4292     Worklist.insert(Ind);
4293     Worklist.insert(IndUpdate);
4294     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4295     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4296                       << "\n");
4297   }
4298 
4299   // Insert the forced scalars.
4300   // FIXME: Currently widenPHIInstruction() often creates a dead vector
4301   // induction variable when the PHI user is scalarized.
4302   auto ForcedScalar = ForcedScalars.find(VF);
4303   if (ForcedScalar != ForcedScalars.end())
4304     for (auto *I : ForcedScalar->second)
4305       Worklist.insert(I);
4306 
4307   // Expand the worklist by looking through any bitcasts and getelementptr
4308   // instructions we've already identified as scalar. This is similar to the
4309   // expansion step in collectLoopUniforms(); however, here we're only
4310   // expanding to include additional bitcasts and getelementptr instructions.
4311   unsigned Idx = 0;
4312   while (Idx != Worklist.size()) {
4313     Instruction *Dst = Worklist[Idx++];
4314     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4315       continue;
4316     auto *Src = cast<Instruction>(Dst->getOperand(0));
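    // Src can be considered scalar only if each of its in-loop users is
    // already known to be scalar or is a memory access that uses Src as a
    // scalar pointer operand.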
4317     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4318           auto *J = cast<Instruction>(U);
4319           return !TheLoop->contains(J) || Worklist.count(J) ||
4320                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4321                   isScalarUse(J, Src));
4322         })) {
4323       Worklist.insert(Src);
4324       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4325     }
4326   }
4327 
4328   // An induction variable will remain scalar if all users of the induction
4329   // variable and induction variable update remain scalar.
4330   for (auto &Induction : *Legal->getInductionVars()) {
4331     auto *Ind = Induction.first;
4332     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4333 
4334     // We already considered pointer induction variables, so there's no reason
4335     // to look at their users again.
4336     //
4337     // TODO: Once we are able to vectorize pointer induction variables we
4338     //       should no longer skip over them here.
4339     if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
4340       continue;
4341 
4342     // Determine if all users of the induction variable are scalar after
4343     // vectorization.
4344     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4345       auto *I = cast<Instruction>(U);
4346       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
4347     });
4348     if (!ScalarInd)
4349       continue;
4350 
4351     // Determine if all users of the induction variable update instruction are
4352     // scalar after vectorization.
4353     auto ScalarIndUpdate =
4354         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4355           auto *I = cast<Instruction>(U);
4356           return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
4357         });
4358     if (!ScalarIndUpdate)
4359       continue;
4360 
4361     // The induction variable and its update instruction will remain scalar.
4362     Worklist.insert(Ind);
4363     Worklist.insert(IndUpdate);
4364     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4365     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4366                       << "\n");
4367   }
4368 
4369   Scalars[VF].insert(Worklist.begin(), Worklist.end());
4370 }
4371 
4372 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) {
4373   if (!blockNeedsPredication(I->getParent()))
4374     return false;
4375   switch(I->getOpcode()) {
4376   default:
4377     break;
4378   case Instruction::Load:
4379   case Instruction::Store: {
4380     if (!Legal->isMaskRequired(I))
4381       return false;
4382     auto *Ptr = getLoadStorePointerOperand(I);
4383     auto *Ty = getMemInstValueType(I);
4384     // We have already decided how to vectorize this instruction, get that
4385     // result.
4386     if (VF > 1) {
4387       InstWidening WideningDecision = getWideningDecision(I, VF);
4388       assert(WideningDecision != CM_Unknown &&
4389              "Widening decision should be ready at this moment");
4390       return WideningDecision == CM_Scalarize;
4391     }
4392     return isa<LoadInst>(I) ?
4393         !(isLegalMaskedLoad(Ty, Ptr)  || isLegalMaskedGather(Ty))
4394       : !(isLegalMaskedStore(Ty, Ptr) || isLegalMaskedScatter(Ty));
4395   }
4396   case Instruction::UDiv:
4397   case Instruction::SDiv:
4398   case Instruction::SRem:
4399   case Instruction::URem:
4400     return mayDivideByZero(*I);
4401   }
4402   return false;
4403 }
4404 
4405 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
4406                                                                unsigned VF) {
4407   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4408   assert(getWideningDecision(I, VF) == CM_Unknown &&
4409          "Decision should not be set yet.");
4410   auto *Group = getInterleavedAccessGroup(I);
4411   assert(Group && "Must have a group.");
4412 
4413   // Check if masking is required.
4414   // A Group may need masking for one of two reasons: it resides in a block that
4415   // needs predication, or it was decided to use masking to deal with gaps.
4416   bool PredicatedAccessRequiresMasking =
4417       Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
4418   bool AccessWithGapsRequiresMasking =
4419       Group->requiresScalarEpilogue() && !IsScalarEpilogueAllowed;
4420   if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
4421     return true;
4422 
4423   // If masked interleaving is required, we expect that the user/target had
4424   // enabled it, because otherwise it either wouldn't have been created or
4425   // it should have been invalidated by the CostModel.
4426   assert(useMaskedInterleavedAccesses(TTI) &&
4427          "Masked interleave-groups for predicated accesses are not enabled.");
4428 
4429   auto *Ty = getMemInstValueType(I);
4430   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty)
4431                           : TTI.isLegalMaskedStore(Ty);
4432 }
4433 
4434 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
4435                                                                unsigned VF) {
4436   // Get and ensure we have a valid memory instruction.
4437   LoadInst *LI = dyn_cast<LoadInst>(I);
4438   StoreInst *SI = dyn_cast<StoreInst>(I);
4439   assert((LI || SI) && "Invalid memory instruction");
4440 
4441   auto *Ptr = getLoadStorePointerOperand(I);
4442 
4443   // In order to be widened, the pointer should be consecutive, first of all.
4444   if (!Legal->isConsecutivePtr(Ptr))
4445     return false;
4446 
  // If the instruction will be scalarized with predication (e.g. a store in
  // a predicated block), it cannot be widened.
4449   if (isScalarWithPredication(I))
4450     return false;
4451 
  // If the instruction's allocated size doesn't equal its type size, it
4453   // requires padding and will be scalarized.
4454   auto &DL = I->getModule()->getDataLayout();
4455   auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
4456   if (hasIrregularType(ScalarTy, DL, VF))
4457     return false;
4458 
4459   return true;
4460 }
4461 
4462 void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
4463   // We should not collect Uniforms more than once per VF. Right now,
4464   // this function is called from collectUniformsAndScalars(), which
4465   // already does this check. Collecting Uniforms for VF=1 does not make any
4466   // sense.
4467 
4468   assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() &&
4469          "This function should not be visited twice for the same VF");
4470 
  // Visit the list of Uniforms. If we find no uniform value, we will not
  // analyze it again; Uniforms.count(VF) will still return 1.
4473   Uniforms[VF].clear();
4474 
4475   // We now know that the loop is vectorizable!
4476   // Collect instructions inside the loop that will remain uniform after
4477   // vectorization.
4478 
4479   // Global values, params and instructions outside of current loop are out of
4480   // scope.
4481   auto isOutOfScope = [&](Value *V) -> bool {
4482     Instruction *I = dyn_cast<Instruction>(V);
4483     return (!I || !TheLoop->contains(I));
4484   };
4485 
4486   SetVector<Instruction *> Worklist;
4487   BasicBlock *Latch = TheLoop->getLoopLatch();
4488 
4489   // Start with the conditional branch. If the branch condition is an
4490   // instruction contained in the loop that is only used by the branch, it is
4491   // uniform.
4492   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4493   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) {
4494     Worklist.insert(Cmp);
4495     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n");
4496   }
4497 
4498   // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
4499   // are pointers that are treated like consecutive pointers during
4500   // vectorization. The pointer operands of interleaved accesses are an
4501   // example.
4502   SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;
4503 
4504   // Holds pointer operands of instructions that are possibly non-uniform.
4505   SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;
4506 
4507   auto isUniformDecision = [&](Instruction *I, unsigned VF) {
4508     InstWidening WideningDecision = getWideningDecision(I, VF);
4509     assert(WideningDecision != CM_Unknown &&
4510            "Widening decision should be ready at this moment");
4511 
4512     return (WideningDecision == CM_Widen ||
4513             WideningDecision == CM_Widen_Reverse ||
4514             WideningDecision == CM_Interleave);
4515   };
4516   // Iterate over the instructions in the loop, and collect all
4517   // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
4518   // that a consecutive-like pointer operand will be scalarized, we collect it
4519   // in PossibleNonUniformPtrs instead. We use two sets here because a single
4520   // getelementptr instruction can be used by both vectorized and scalarized
4521   // memory instructions. For example, if a loop loads and stores from the same
4522   // location, but the store is conditional, the store will be scalarized, and
4523   // the getelementptr won't remain uniform.
4524   for (auto *BB : TheLoop->blocks())
4525     for (auto &I : *BB) {
4526       // If there's no pointer operand, there's nothing to do.
4527       auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
4528       if (!Ptr)
4529         continue;
4530 
4531       // True if all users of Ptr are memory accesses that have Ptr as their
4532       // pointer operand.
4533       auto UsersAreMemAccesses =
4534           llvm::all_of(Ptr->users(), [&](User *U) -> bool {
4535             return getLoadStorePointerOperand(U) == Ptr;
4536           });
4537 
4538       // Ensure the memory instruction will not be scalarized or used by
4539       // gather/scatter, making its pointer operand non-uniform. If the pointer
4540       // operand is used by any instruction other than a memory access, we
4541       // conservatively assume the pointer operand may be non-uniform.
4542       if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
4543         PossibleNonUniformPtrs.insert(Ptr);
4544 
4545       // If the memory instruction will be vectorized and its pointer operand
4546       // is consecutive-like, or interleaving - the pointer operand should
4547       // remain uniform.
4548       else
4549         ConsecutiveLikePtrs.insert(Ptr);
4550     }
4551 
4552   // Add to the Worklist all consecutive and consecutive-like pointers that
4553   // aren't also identified as possibly non-uniform.
4554   for (auto *V : ConsecutiveLikePtrs)
4555     if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) {
4556       LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n");
4557       Worklist.insert(V);
4558     }
4559 
  // Expand Worklist in topological order: whenever a new instruction
  // is added, its users should already be inside Worklist. This ensures
  // that a uniform instruction will only be used by uniform instructions.
4563   unsigned idx = 0;
4564   while (idx != Worklist.size()) {
4565     Instruction *I = Worklist[idx++];
4566 
4567     for (auto OV : I->operand_values()) {
4568       // isOutOfScope operands cannot be uniform instructions.
4569       if (isOutOfScope(OV))
4570         continue;
4571       // First order recurrence Phi's should typically be considered
4572       // non-uniform.
4573       auto *OP = dyn_cast<PHINode>(OV);
4574       if (OP && Legal->isFirstOrderRecurrence(OP))
4575         continue;
4576       // If all the users of the operand are uniform, then add the
4577       // operand into the uniform worklist.
4578       auto *OI = cast<Instruction>(OV);
4579       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4580             auto *J = cast<Instruction>(U);
4581             return Worklist.count(J) ||
4582                    (OI == getLoadStorePointerOperand(J) &&
4583                     isUniformDecision(J, VF));
4584           })) {
4585         Worklist.insert(OI);
4586         LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n");
4587       }
4588     }
4589   }
4590 
4591   // Returns true if Ptr is the pointer operand of a memory access instruction
4592   // I, and I is known to not require scalarization.
4593   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4594     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
4595   };
4596 
4597   // For an instruction to be added into Worklist above, all its users inside
4598   // the loop should also be in Worklist. However, this condition cannot be
4599   // true for phi nodes that form a cyclic dependence. We must process phi
4600   // nodes separately. An induction variable will remain uniform if all users
4601   // of the induction variable and induction variable update remain uniform.
4602   // The code below handles both pointer and non-pointer induction variables.
4603   for (auto &Induction : *Legal->getInductionVars()) {
4604     auto *Ind = Induction.first;
4605     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4606 
4607     // Determine if all users of the induction variable are uniform after
4608     // vectorization.
4609     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4610       auto *I = cast<Instruction>(U);
4611       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4612              isVectorizedMemAccessUse(I, Ind);
4613     });
4614     if (!UniformInd)
4615       continue;
4616 
4617     // Determine if all users of the induction variable update instruction are
4618     // uniform after vectorization.
4619     auto UniformIndUpdate =
4620         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4621           auto *I = cast<Instruction>(U);
4622           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4623                  isVectorizedMemAccessUse(I, IndUpdate);
4624         });
4625     if (!UniformIndUpdate)
4626       continue;
4627 
4628     // The induction variable and its update instruction will remain uniform.
4629     Worklist.insert(Ind);
4630     Worklist.insert(IndUpdate);
4631     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ind << "\n");
4632     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate
4633                       << "\n");
4634   }
4635 
4636   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4637 }
4638 
4639 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
4640   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to emit the runtime check anyway, since it is
    // still likely to be dynamically uniform if the target can skip it.
4643     LLVM_DEBUG(
4644         dbgs() << "LV: Not inserting runtime ptr check for divergent target");
4645 
4646     ORE->emit(
4647       createMissedAnalysis("CantVersionLoopWithDivergentTarget")
4648       << "runtime pointer checks needed. Not enabled for divergent target");
4649 
4650     return None;
4651   }
4652 
4653   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
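  // Note that a trip count of zero means SCEV could not compute a constant
  // trip count for this loop.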
4654   if (!OptForSize) // Remaining checks deal with scalar loop when OptForSize.
4655     return computeFeasibleMaxVF(OptForSize, TC);
4656 
4657   if (Legal->getRuntimePointerChecking()->Need) {
4658     ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize")
4659               << "runtime pointer checks needed. Enable vectorization of this "
4660                  "loop with '#pragma clang loop vectorize(enable)' when "
4661                  "compiling with -Os/-Oz");
4662     LLVM_DEBUG(
4663         dbgs()
4664         << "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n");
4665     return None;
4666   }
4667 
4668   if (!PSE.getUnionPredicate().getPredicates().empty()) {
4669     ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize")
4670               << "runtime SCEV checks needed. Enable vectorization of this "
4671                  "loop with '#pragma clang loop vectorize(enable)' when "
4672                  "compiling with -Os/-Oz");
4673     LLVM_DEBUG(
4674         dbgs()
4675         << "LV: Aborting. Runtime SCEV check is required with -Os/-Oz.\n");
4676     return None;
4677   }
4678 
4679   // FIXME: Avoid specializing for stride==1 instead of bailing out.
4680   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4681     ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize")
4682               << "runtime stride == 1 checks needed. Enable vectorization of "
4683                  "this loop with '#pragma clang loop vectorize(enable)' when "
4684                  "compiling with -Os/-Oz");
4685     LLVM_DEBUG(
4686         dbgs()
4687         << "LV: Aborting. Runtime stride check is required with -Os/-Oz.\n");
4688     return None;
4689   }
4690 
4691   // If we optimize the program for size, avoid creating the tail loop.
4692   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4693 
4694   if (TC == 1) {
4695     ORE->emit(createMissedAnalysis("SingleIterationLoop")
4696               << "loop trip count is one, irrelevant for vectorization");
4697     LLVM_DEBUG(dbgs() << "LV: Aborting, single iteration (non) loop.\n");
4698     return None;
4699   }
4700 
4701   // Record that scalar epilogue is not allowed.
4702   LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4703 
4704   IsScalarEpilogueAllowed = !OptForSize;
4705 
4706   // We don't create an epilogue when optimizing for size.
4707   // Invalidate interleave groups that require an epilogue if we can't mask
4708   // the interleave-group.
4709   if (!useMaskedInterleavedAccesses(TTI))
4710     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
4711 
4712   unsigned MaxVF = computeFeasibleMaxVF(OptForSize, TC);
4713 
4714   if (TC > 0 && TC % MaxVF == 0) {
4715     LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4716     return MaxVF;
4717   }
4718 
4719   // If we don't know the precise trip count, or if the trip count that we
4720   // found modulo the vectorization factor is not zero, try to fold the tail
4721   // by masking.
4722   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
4723   if (Legal->canFoldTailByMasking()) {
4724     FoldTailByMasking = true;
4725     return MaxVF;
4726   }
4727 
4728   if (TC == 0) {
4729     ORE->emit(
4730         createMissedAnalysis("UnknownLoopCountComplexCFG")
4731         << "unable to calculate the loop count due to complex control flow");
4732     return None;
4733   }
4734 
4735   ORE->emit(createMissedAnalysis("NoTailLoopWithOptForSize")
4736             << "cannot optimize for size and vectorize at the same time. "
4737                "Enable vectorization of this loop with '#pragma clang loop "
4738                "vectorize(enable)' when compiling with -Os/-Oz");
4739   return None;
4740 }
4741 
4742 unsigned
4743 LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize,
4744                                                  unsigned ConstTripCount) {
4745   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
4746   unsigned SmallestType, WidestType;
4747   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
4748   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
4749 
4750   // Get the maximum safe dependence distance in bits computed by LAA.
4751   // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
4752   // the memory accesses that is most restrictive (involved in the smallest
4753   // dependence distance).
4754   unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();
4755 
4756   WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
4757 
4758   unsigned MaxVectorSize = WidestRegister / WidestType;
4759 
4760   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
4761                     << " / " << WidestType << " bits.\n");
4762   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
4763                     << WidestRegister << " bits.\n");
4764 
4765   assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"
4766                                  " into one vector!");
4767   if (MaxVectorSize == 0) {
4768     LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
4769     MaxVectorSize = 1;
4770     return MaxVectorSize;
4771   } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
4772              isPowerOf2_32(ConstTripCount)) {
4773     // We need to clamp the VF to be the ConstTripCount. There is no point in
4774     // choosing a higher viable VF as done in the loop below.
4775     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
4776                       << ConstTripCount << "\n");
4777     MaxVectorSize = ConstTripCount;
4778     return MaxVectorSize;
4779   }
4780 
4781   unsigned MaxVF = MaxVectorSize;
4782   if (TTI.shouldMaximizeVectorBandwidth(OptForSize) ||
4783       (MaximizeBandwidth && !OptForSize)) {
4784     // Collect all viable vectorization factors larger than the default MaxVF
4785     // (i.e. MaxVectorSize).
4786     SmallVector<unsigned, 8> VFs;
4787     unsigned NewMaxVectorSize = WidestRegister / SmallestType;
4788     for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
4789       VFs.push_back(VS);
4790 
4791     // For each VF calculate its register usage.
4792     auto RUs = calculateRegisterUsage(VFs);
4793 
4794     // Select the largest VF which doesn't require more registers than existing
4795     // ones.
4796     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true);
4797     for (int i = RUs.size() - 1; i >= 0; --i) {
4798       if (RUs[i].MaxLocalUsers <= TargetNumRegisters) {
4799         MaxVF = VFs[i];
4800         break;
4801       }
4802     }
4803     if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
4804       if (MaxVF < MinVF) {
4805         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
4806                           << ") with target's minimum: " << MinVF << '\n');
4807         MaxVF = MinVF;
4808       }
4809     }
4810   }
4811   return MaxVF;
4812 }
4813 
4814 VectorizationFactor
4815 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
4816   float Cost = expectedCost(1).first;
4817   const float ScalarCost = Cost;
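  // Remember the scalar cost so we can fall back to it and compare the
  // chosen vector cost against it below.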
4818   unsigned Width = 1;
4819   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
4820 
4821   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
4822   if (ForceVectorization && MaxVF > 1) {
4823     // Ignore scalar width, because the user explicitly wants vectorization.
4824     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
4825     // evaluation.
4826     Cost = std::numeric_limits<float>::max();
4827   }
4828 
4829   for (unsigned i = 2; i <= MaxVF; i *= 2) {
    // Notice that the vector loop executes fewer times, so we divide the
    // cost of the vector loop by the width of the vector elements.
4833     VectorizationCostTy C = expectedCost(i);
4834     float VectorCost = C.first / (float)i;
4835     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
4836                       << " costs: " << (int)VectorCost << ".\n");
4837     if (!C.second && !ForceVectorization) {
4838       LLVM_DEBUG(
4839           dbgs() << "LV: Not considering vector loop of width " << i
4840                  << " because it will not generate any vector instructions.\n");
4841       continue;
4842     }
4843     if (VectorCost < Cost) {
4844       Cost = VectorCost;
4845       Width = i;
4846     }
4847   }
4848 
4849   if (!EnableCondStoresVectorization && NumPredStores) {
4850     ORE->emit(createMissedAnalysis("ConditionalStore")
4851               << "store that is conditionally executed prevents vectorization");
4852     LLVM_DEBUG(
4853         dbgs() << "LV: No vectorization. There are conditional stores.\n");
4854     Width = 1;
4855     Cost = ScalarCost;
4856   }
4857 
4858   LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
4859              << "LV: Vectorization seems to be not beneficial, "
4860              << "but was forced by a user.\n");
4861   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
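  // At this point Cost holds the per-lane cost of the selected width, so
  // Width * Cost recovers (up to truncation) the expected cost of the chosen
  // loop body. For example (illustrative numbers), a scalar cost of 8 and a
  // vector body cost of 6 at width 4 give Cost = 1.5 and a returned cost of 6.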
4862   VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)};
4863   return Factor;
4864 }
4865 
4866 std::pair<unsigned, unsigned>
4867 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
4868   unsigned MinWidth = -1U;
4869   unsigned MaxWidth = 8;
4870   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
4871 
4872   // For each block.
4873   for (BasicBlock *BB : TheLoop->blocks()) {
4874     // For each instruction in the loop.
4875     for (Instruction &I : BB->instructionsWithoutDebug()) {
4876       Type *T = I.getType();
4877 
4878       // Skip ignored values.
4879       if (ValuesToIgnore.find(&I) != ValuesToIgnore.end())
4880         continue;
4881 
4882       // Only examine Loads, Stores and PHINodes.
4883       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
4884         continue;
4885 
4886       // Examine PHI nodes that are reduction variables. Update the type to
4887       // account for the recurrence type.
4888       if (auto *PN = dyn_cast<PHINode>(&I)) {
4889         if (!Legal->isReductionVariable(PN))
4890           continue;
4891         RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN];
4892         T = RdxDesc.getRecurrenceType();
4893       }
4894 
4895       // Examine the stored values.
4896       if (auto *ST = dyn_cast<StoreInst>(&I))
4897         T = ST->getValueOperand()->getType();
4898 
4899       // Ignore loaded pointer types and stored pointer types that are not
4900       // vectorizable.
4901       //
4902       // FIXME: The check here attempts to predict whether a load or store will
4903       //        be vectorized. We only know this for certain after a VF has
4904       //        been selected. Here, we assume that if an access can be
4905       //        vectorized, it will be. We should also look at extending this
4906       //        optimization to non-pointer types.
4907       //
4908       if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
4909           !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
4910         continue;
4911 
4912       MinWidth = std::min(MinWidth,
4913                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
4914       MaxWidth = std::max(MaxWidth,
4915                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
4916     }
4917   }
4918 
4919   return {MinWidth, MaxWidth};
4920 }
4921 
4922 unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
4923                                                            unsigned VF,
4924                                                            unsigned LoopCost) {
4925   // -- The interleave heuristics --
4926   // We interleave the loop in order to expose ILP and reduce the loop overhead.
4927   // There are many micro-architectural considerations that we can't predict
4928   // at this level. For example, frontend pressure (on decode or fetch) due to
4929   // code size, or the number and capabilities of the execution ports.
4930   //
4931   // We use the following heuristics to select the interleave count:
4932   // 1. If the code has reductions, then we interleave to break the cross
4933   // iteration dependency.
4934   // 2. If the loop is really small, then we interleave to reduce the loop
4935   // overhead.
4936   // 3. We don't interleave if we think that we will spill registers to memory
4937   // due to the increased register pressure.
4938 
4939   // When we optimize for size, we don't interleave.
4940   if (OptForSize)
4941     return 1;
4942 
  // Do not interleave if there is a maximum safe dependence distance: the
  // distance was already used to bound the vectorization factor.
4944   if (Legal->getMaxSafeDepDistBytes() != -1U)
4945     return 1;
4946 
4947   // Do not interleave loops with a relatively small trip count.
4948   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4949   if (TC > 1 && TC < TinyTripCountInterleaveThreshold)
4950     return 1;
4951 
4952   unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1);
4953   LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
4954                     << " registers\n");
4955 
4956   if (VF == 1) {
4957     if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
4958       TargetNumRegisters = ForceTargetNumScalarRegs;
4959   } else {
4960     if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
4961       TargetNumRegisters = ForceTargetNumVectorRegs;
4962   }
4963 
4964   RegisterUsage R = calculateRegisterUsage({VF})[0];
  // We divide by MaxLocalUsers below, so make sure it is at least one, i.e.
  // assume that at least one instruction uses at least one register.
4967   R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);
4968 
4969   // We calculate the interleave count using the following formula.
4970   // Subtract the number of loop invariants from the number of available
4971   // registers. These registers are used by all of the interleaved instances.
4972   // Next, divide the remaining registers by the number of registers that is
4973   // required by the loop, in order to estimate how many parallel instances
4974   // fit without causing spills. All of this is rounded down if necessary to be
4975   // a power of two. We want power of two interleave count to simplify any
4976   // addressing operations or alignment considerations.
4977   // We also want power of two interleave counts to ensure that the induction
4978   // variable of the vector loop wraps to zero, when tail is folded by masking;
4979   // this currently happens when OptForSize, in which case IC is set to 1 above.
4980   unsigned IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) /
4981                               R.MaxLocalUsers);
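  // For example (illustrative numbers): with 16 target registers, 2 loop
  // invariant registers and a maximum local usage of 3 registers,
  // IC = PowerOf2Floor((16 - 2) / 3) = PowerOf2Floor(4) = 4.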
4982 
4983   // Don't count the induction variable as interleaved.
4984   if (EnableIndVarRegisterHeur)
4985     IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) /
4986                        std::max(1U, (R.MaxLocalUsers - 1)));
4987 
4988   // Clamp the interleave ranges to reasonable counts.
4989   unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
4990 
4991   // Check if the user has overridden the max.
4992   if (VF == 1) {
4993     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
4994       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
4995   } else {
4996     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
4997       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
4998   }
4999 
5000   // If we did not calculate the cost for VF (because the user selected the VF)
5001   // then we calculate the cost of VF here.
5002   if (LoopCost == 0)
5003     LoopCost = expectedCost(VF).first;
5004 
  // Clamp the calculated IC to be between 1 and the max interleave count that
  // the target allows.
5007   if (IC > MaxInterleaveCount)
5008     IC = MaxInterleaveCount;
5009   else if (IC < 1)
5010     IC = 1;
5011 
5012   // Interleave if we vectorized this loop and there is a reduction that could
5013   // benefit from interleaving.
5014   if (VF > 1 && !Legal->getReductionVars()->empty()) {
5015     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5016     return IC;
5017   }
5018 
5019   // Note that if we've already vectorized the loop we will have done the
5020   // runtime check and so interleaving won't require further checks.
5021   bool InterleavingRequiresRuntimePointerCheck =
5022       (VF == 1 && Legal->getRuntimePointerChecking()->Need);
5023 
5024   // We want to interleave small loops in order to reduce the loop overhead and
5025   // potentially expose ILP opportunities.
5026   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
5027   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
    // Assume that the loop overhead has a cost of 1. Using the cost model's
    // estimate of the loop cost, interleave until the loop overhead is about
    // 5% of the total cost of the loop.
5031     unsigned SmallIC =
5032         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
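    // For example (illustrative numbers): with SmallLoopCost = 20 and an
    // estimated loop cost of 6, SmallIC is capped at PowerOf2Floor(20 / 6) = 2.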
5033 
5034     // Interleave until store/load ports (estimated by max interleave count) are
5035     // saturated.
5036     unsigned NumStores = Legal->getNumStores();
5037     unsigned NumLoads = Legal->getNumLoads();
5038     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5039     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
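    // For example (illustrative numbers): with IC = 8, two stores and one
    // load in the loop, StoresIC = 8 / 2 = 4 and LoadsIC = 8 / 1 = 8.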
5040 
5041     // If we have a scalar reduction (vector reductions are already dealt with
5042     // by this point), we can increase the critical path length if the loop
    // we're interleaving is inside another loop. Limit this, by default, to 2,
    // so the critical path only gets increased by one reduction operation.
5045     if (!Legal->getReductionVars()->empty() && TheLoop->getLoopDepth() > 1) {
5046       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5047       SmallIC = std::min(SmallIC, F);
5048       StoresIC = std::min(StoresIC, F);
5049       LoadsIC = std::min(LoadsIC, F);
5050     }
5051 
5052     if (EnableLoadStoreRuntimeInterleave &&
5053         std::max(StoresIC, LoadsIC) > SmallIC) {
5054       LLVM_DEBUG(
5055           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5056       return std::max(StoresIC, LoadsIC);
5057     }
5058 
5059     LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5060     return SmallIC;
5061   }
5062 
5063   // Interleave if this is a large loop (small loops are already dealt with by
5064   // this point) that could benefit from interleaving.
5065   bool HasReductions = !Legal->getReductionVars()->empty();
5066   if (TTI.enableAggressiveInterleaving(HasReductions)) {
5067     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5068     return IC;
5069   }
5070 
5071   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5072   return 1;
5073 }
5074 
5075 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5076 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
  // This function calculates the register usage by measuring the highest
  // number of values that are alive at a single location. Obviously, this is
  // a very rough estimation. We scan the loop in topological order and assign
  // a number to each instruction. We use RPO to ensure that defs are met
  // before their users. We assume that each instruction that has in-loop
  // users starts an interval. We record every time that an in-loop value is
  // used, so we have a list of the first and last occurrences of each
  // instruction. Next, we transpose this data structure into a multi-map that
  // holds the list of intervals that *end* at a specific location. This
  // multi-map allows us to process the intervals in a single linear scan: we
  // walk the instructions in order and record each time a new interval
  // starts, by placing it in a set. When we find a value in the multi-map, we
  // remove it from the set. The max register usage is the maximum size of the
  // set. We also search for instructions that are defined outside the loop
  // but used inside it. We track this number separately from the max-interval
  // usage number because, when we unroll, loop-invariant values do not take
  // more registers.
5094   LoopBlocksDFS DFS(TheLoop);
5095   DFS.perform(LI);
5096 
5097   RegisterUsage RU;
5098 
5099   // Each 'key' in the map opens a new interval. The values
5100   // of the map are the index of the 'last seen' usage of the
5101   // instruction that is the key.
5102   using IntervalMap = DenseMap<Instruction *, unsigned>;
5103 
5104   // Maps instruction to its index.
5105   SmallVector<Instruction *, 64> IdxToInstr;
5106   // Marks the end of each interval.
5107   IntervalMap EndPoint;
  // Saves the set of instructions that have uses inside the loop.
5109   SmallPtrSet<Instruction *, 8> Ends;
  // Saves the set of instructions that are defined outside the loop but are
  // used inside it (non-instruction values such as arguments and constants
  // are ignored).
5112   SmallPtrSet<Value *, 8> LoopInvariants;
5113 
5114   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5115     for (Instruction &I : BB->instructionsWithoutDebug()) {
5116       IdxToInstr.push_back(&I);
5117 
5118       // Save the end location of each USE.
5119       for (Value *U : I.operands()) {
5120         auto *Instr = dyn_cast<Instruction>(U);
5121 
5122         // Ignore non-instruction values such as arguments, constants, etc.
5123         if (!Instr)
5124           continue;
5125 
5126         // If this instruction is outside the loop then record it and continue.
5127         if (!TheLoop->contains(Instr)) {
5128           LoopInvariants.insert(Instr);
5129           continue;
5130         }
5131 
5132         // Overwrite previous end points.
5133         EndPoint[Instr] = IdxToInstr.size();
5134         Ends.insert(Instr);
5135       }
5136     }
5137   }
5138 
5139   // Saves the list of intervals that end with the index in 'key'.
5140   using InstrList = SmallVector<Instruction *, 2>;
5141   DenseMap<unsigned, InstrList> TransposeEnds;
5142 
5143   // Transpose the EndPoints to a list of values that end at each index.
5144   for (auto &Interval : EndPoint)
5145     TransposeEnds[Interval.second].push_back(Interval.first);
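  // For example, an instruction whose last recorded in-loop use has end point
  // 7 ends up in TransposeEnds[7], so its interval is closed when the linear
  // scan below reaches index 7.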
5146 
5147   SmallPtrSet<Instruction *, 8> OpenIntervals;
5148 
5149   // Get the size of the widest register.
5150   unsigned MaxSafeDepDist = -1U;
5151   if (Legal->getMaxSafeDepDistBytes() != -1U)
5152     MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
5153   unsigned WidestRegister =
5154       std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
5155   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5156 
5157   SmallVector<RegisterUsage, 8> RUs(VFs.size());
5158   SmallVector<unsigned, 8> MaxUsages(VFs.size(), 0);
5159 
5160   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5161 
5162   // A lambda that gets the register usage for the given type and VF.
5163   auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
5164     if (Ty->isTokenTy())
5165       return 0U;
5166     unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
5167     return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
5168   };
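  // For example (illustrative numbers): with a 256-bit widest register, an
  // i32 value at VF = 16 occupies max(1, 16 * 32 / 256) = 2 registers, while
  // at VF = 4 it occupies max(1, 4 * 32 / 256) = 1 register.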
5169 
5170   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5171     Instruction *I = IdxToInstr[i];
5172 
5173     // Remove all of the instructions that end at this location.
5174     InstrList &List = TransposeEnds[i];
5175     for (Instruction *ToRemove : List)
5176       OpenIntervals.erase(ToRemove);
5177 
5178     // Ignore instructions that are never used within the loop.
5179     if (Ends.find(I) == Ends.end())
5180       continue;
5181 
5182     // Skip ignored values.
5183     if (ValuesToIgnore.find(I) != ValuesToIgnore.end())
5184       continue;
5185 
5186     // For each VF find the maximum usage of registers.
5187     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5188       if (VFs[j] == 1) {
5189         MaxUsages[j] = std::max(MaxUsages[j], OpenIntervals.size());
5190         continue;
5191       }
5192       collectUniformsAndScalars(VFs[j]);
5193       // Count the number of live intervals.
5194       unsigned RegUsage = 0;
5195       for (auto Inst : OpenIntervals) {
5196         // Skip ignored values for VF > 1.
5197         if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end() ||
5198             isScalarAfterVectorization(Inst, VFs[j]))
5199           continue;
5200         RegUsage += GetRegUsage(Inst->getType(), VFs[j]);
5201       }
5202       MaxUsages[j] = std::max(MaxUsages[j], RegUsage);
5203     }
5204 
5205     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5206                       << OpenIntervals.size() << '\n');
5207 
5208     // Add the current instruction to the list of open intervals.
5209     OpenIntervals.insert(I);
5210   }
5211 
5212   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5213     unsigned Invariant = 0;
5214     if (VFs[i] == 1)
5215       Invariant = LoopInvariants.size();
5216     else {
5217       for (auto Inst : LoopInvariants)
5218         Invariant += GetRegUsage(Inst->getType(), VFs[i]);
5219     }
5220 
5221     LLVM_DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n');
5222     LLVM_DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n');
5223     LLVM_DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant
5224                       << '\n');
5225 
5226     RU.LoopInvariantRegs = Invariant;
5227     RU.MaxLocalUsers = MaxUsages[i];
5228     RUs[i] = RU;
5229   }
5230 
5231   return RUs;
5232 }
5233 
bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I) {
5235   // TODO: Cost model for emulated masked load/store is completely
5236   // broken. This hack guides the cost model to use an artificially
5237   // high enough value to practically disable vectorization with such
5238   // operations, except where previously deployed legality hack allowed
5239   // using very low cost values. This is to avoid regressions coming simply
5240   // from moving "masked load/store" check from legality to cost model.
  // Masked load/gather emulation was previously never allowed. A limited
  // amount of masked store/scatter emulation was allowed.
5243   assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
5244   return isa<LoadInst>(I) ||
5245          (isa<StoreInst>(I) &&
5246           NumPredStores > NumberOfStoresToPredicate);
5247 }
5248 
5249 void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
5250   // If we aren't vectorizing the loop, or if we've already collected the
5251   // instructions to scalarize, there's nothing to do. Collection may already
5252   // have occurred if we have a user-selected VF and are now computing the
5253   // expected cost for interleaving.
5254   if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end())
5255     return;
5256 
  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5258   // not profitable to scalarize any instructions, the presence of VF in the
5259   // map will indicate that we've analyzed it already.
5260   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5261 
5262   // Find all the instructions that are scalar with predication in the loop and
5263   // determine if it would be better to not if-convert the blocks they are in.
5264   // If so, we also record the instructions to scalarize.
5265   for (BasicBlock *BB : TheLoop->blocks()) {
5266     if (!blockNeedsPredication(BB))
5267       continue;
5268     for (Instruction &I : *BB)
5269       if (isScalarWithPredication(&I)) {
5270         ScalarCostsTy ScalarCosts;
5271         // Do not apply discount logic if hacked cost is needed
5272         // for emulated masked memrefs.
5273         if (!useEmulatedMaskMemRefHack(&I) &&
5274             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5275           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5276         // Remember that BB will remain after vectorization.
5277         PredicatedBBsAfterVectorization.insert(BB);
5278       }
5279   }
5280 }
5281 
5282 int LoopVectorizationCostModel::computePredInstDiscount(
5283     Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
5284     unsigned VF) {
5285   assert(!isUniformAfterVectorization(PredInst, VF) &&
5286          "Instruction marked uniform-after-vectorization will be predicated");
5287 
5288   // Initialize the discount to zero, meaning that the scalar version and the
5289   // vector version cost the same.
5290   int Discount = 0;
5291 
5292   // Holds instructions to analyze. The instructions we visit are mapped in
5293   // ScalarCosts. Those instructions are the ones that would be scalarized if
5294   // we find that the scalar version costs less.
5295   SmallVector<Instruction *, 8> Worklist;
5296 
5297   // Returns true if the given instruction can be scalarized.
5298   auto canBeScalarized = [&](Instruction *I) -> bool {
5299     // We only attempt to scalarize instructions forming a single-use chain
5300     // from the original predicated block that would otherwise be vectorized.
5301     // Although not strictly necessary, we give up on instructions we know will
5302     // already be scalar to avoid traversing chains that are unlikely to be
5303     // beneficial.
5304     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5305         isScalarAfterVectorization(I, VF))
5306       return false;
5307 
5308     // If the instruction is scalar with predication, it will be analyzed
5309     // separately. We ignore it within the context of PredInst.
5310     if (isScalarWithPredication(I))
5311       return false;
5312 
5313     // If any of the instruction's operands are uniform after vectorization,
5314     // the instruction cannot be scalarized. This prevents, for example, a
5315     // masked load from being scalarized.
5316     //
5317     // We assume we will only emit a value for lane zero of an instruction
5318     // marked uniform after vectorization, rather than VF identical values.
5319     // Thus, if we scalarize an instruction that uses a uniform, we would
5320     // create uses of values corresponding to the lanes we aren't emitting code
5321     // for. This behavior can be changed by allowing getScalarValue to clone
5322     // the lane zero values for uniforms rather than asserting.
5323     for (Use &U : I->operands())
5324       if (auto *J = dyn_cast<Instruction>(U.get()))
5325         if (isUniformAfterVectorization(J, VF))
5326           return false;
5327 
5328     // Otherwise, we can scalarize the instruction.
5329     return true;
5330   };
5331 
5332   // Returns true if an operand that cannot be scalarized must be extracted
5333   // from a vector. We will account for this scalarization overhead below. Note
5334   // that the non-void predicated instructions are placed in their own blocks,
5335   // and their return values are inserted into vectors. Thus, an extract would
5336   // still be required.
5337   auto needsExtract = [&](Instruction *I) -> bool {
5338     return TheLoop->contains(I) && !isScalarAfterVectorization(I, VF);
5339   };
5340 
5341   // Compute the expected cost discount from scalarizing the entire expression
5342   // feeding the predicated instruction. We currently only consider expressions
5343   // that are single-use instruction chains.
5344   Worklist.push_back(PredInst);
5345   while (!Worklist.empty()) {
5346     Instruction *I = Worklist.pop_back_val();
5347 
5348     // If we've already analyzed the instruction, there's nothing to do.
5349     if (ScalarCosts.find(I) != ScalarCosts.end())
5350       continue;
5351 
5352     // Compute the cost of the vector instruction. Note that this cost already
5353     // includes the scalarization overhead of the predicated instruction.
5354     unsigned VectorCost = getInstructionCost(I, VF).first;
5355 
5356     // Compute the cost of the scalarized instruction. This cost is the cost of
5357     // the instruction as if it wasn't if-converted and instead remained in the
5358     // predicated block. We will scale this cost by block probability after
5359     // computing the scalarization overhead.
5360     unsigned ScalarCost = VF * getInstructionCost(I, 1).first;
5361 
5362     // Compute the scalarization overhead of needed insertelement instructions
5363     // and phi nodes.
5364     if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
5365       ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF),
5366                                                  true, false);
5367       ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
5368     }
5369 
5370     // Compute the scalarization overhead of needed extractelement
5371     // instructions. For each of the instruction's operands, if the operand can
5372     // be scalarized, add it to the worklist; otherwise, account for the
5373     // overhead.
5374     for (Use &U : I->operands())
5375       if (auto *J = dyn_cast<Instruction>(U.get())) {
5376         assert(VectorType::isValidElementType(J->getType()) &&
5377                "Instruction has non-scalar type");
5378         if (canBeScalarized(J))
5379           Worklist.push_back(J);
5380         else if (needsExtract(J))
          ScalarCost += TTI.getScalarizationOverhead(
              ToVectorTy(J->getType(), VF), false, true);
5383       }
5384 
5385     // Scale the total scalar cost by block probability.
5386     ScalarCost /= getReciprocalPredBlockProb();
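    // For example, if getReciprocalPredBlockProb() is 2 (the predicated block
    // is assumed to execute on every other iteration), a raw scalar cost of
    // 10 becomes 5 after scaling.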
5387 
5388     // Compute the discount. A non-negative discount means the vector version
5389     // of the instruction costs more, and scalarizing would be beneficial.
5390     Discount += VectorCost - ScalarCost;
5391     ScalarCosts[I] = ScalarCost;
5392   }
5393 
5394   return Discount;
5395 }
5396 
5397 LoopVectorizationCostModel::VectorizationCostTy
5398 LoopVectorizationCostModel::expectedCost(unsigned VF) {
5399   VectorizationCostTy Cost;
5400 
5401   // For each block.
5402   for (BasicBlock *BB : TheLoop->blocks()) {
5403     VectorizationCostTy BlockCost;
5404 
5405     // For each instruction in the old loop.
5406     for (Instruction &I : BB->instructionsWithoutDebug()) {
5407       // Skip ignored values.
5408       if (ValuesToIgnore.find(&I) != ValuesToIgnore.end() ||
5409           (VF > 1 && VecValuesToIgnore.find(&I) != VecValuesToIgnore.end()))
5410         continue;
5411 
5412       VectorizationCostTy C = getInstructionCost(&I, VF);
5413 
5414       // Check if we should override the cost.
5415       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
5416         C.first = ForceTargetInstructionCost;
5417 
5418       BlockCost.first += C.first;
5419       BlockCost.second |= C.second;
5420       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
5421                         << " for VF " << VF << " For instruction: " << I
5422                         << '\n');
5423     }
5424 
5425     // If we are vectorizing a predicated block, it will have been
5426     // if-converted. This means that the block's instructions (aside from
5427     // stores and instructions that may divide by zero) will now be
5428     // unconditionally executed. For the scalar case, we may not always execute
5429     // the predicated block. Thus, scale the block's cost by the probability of
5430     // executing it.
5431     if (VF == 1 && blockNeedsPredication(BB))
5432       BlockCost.first /= getReciprocalPredBlockProb();
5433 
5434     Cost.first += BlockCost.first;
5435     Cost.second |= BlockCost.second;
5436   }
5437 
5438   return Cost;
5439 }
5440 
/// Gets the address access SCEV after verifying that the access pattern is
/// loop invariant except for the induction variable dependence.
5443 ///
5444 /// This SCEV can be sent to the Target in order to estimate the address
5445 /// calculation cost.
5446 static const SCEV *getAddressAccessSCEV(
5447               Value *Ptr,
5448               LoopVectorizationLegality *Legal,
5449               PredicatedScalarEvolution &PSE,
5450               const Loop *TheLoop) {
5451 
5452   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
5453   if (!Gep)
5454     return nullptr;
5455 
5456   // We are looking for a gep with all loop invariant indices except for one
5457   // which should be an induction variable.
5458   auto SE = PSE.getSE();
5459   unsigned NumOperands = Gep->getNumOperands();
5460   for (unsigned i = 1; i < NumOperands; ++i) {
5461     Value *Opd = Gep->getOperand(i);
5462     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
5463         !Legal->isInductionVariable(Opd))
5464       return nullptr;
5465   }
5466 
  // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
5468   return PSE.getSCEV(Ptr);
5469 }
5470 
5471 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
5472   return Legal->hasStride(I->getOperand(0)) ||
5473          Legal->hasStride(I->getOperand(1));
5474 }
5475 
5476 unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5477                                                                  unsigned VF) {
5478   assert(VF > 1 && "Scalarization cost of instruction implies vectorization.");
5479   Type *ValTy = getMemInstValueType(I);
5480   auto SE = PSE.getSE();
5481 
5482   unsigned Alignment = getLoadStoreAlignment(I);
5483   unsigned AS = getLoadStoreAddressSpace(I);
5484   Value *Ptr = getLoadStorePointerOperand(I);
5485   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
5486 
  // Figure out whether the access is strided and get the stride value
  // if it's known at compile time.
5489   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
5490 
5491   // Get the cost of the scalar memory instruction and address computation.
5492   unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
5493 
5494   // Don't pass *I here, since it is scalar but will actually be part of a
5495   // vectorized loop where the user of it is a vectorized instruction.
5496   Cost += VF *
5497           TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
5498                               AS);
5499 
5500   // Get the overhead of the extractelement and insertelement instructions
5501   // we might create due to scalarization.
5502   Cost += getScalarizationOverhead(I, VF, TTI);
5503 
5504   // If we have a predicated store, it may not be executed for each vector
5505   // lane. Scale the cost by the probability of executing the predicated
5506   // block.
5507   if (isPredicatedInst(I)) {
5508     Cost /= getReciprocalPredBlockProb();
5509 
5510     if (useEmulatedMaskMemRefHack(I))
5511       // Artificially setting to a high enough value to practically disable
5512       // vectorization with such operations.
5513       Cost = 3000000;
5514   }
5515 
5516   return Cost;
5517 }
5518 
5519 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5520                                                              unsigned VF) {
5521   Type *ValTy = getMemInstValueType(I);
5522   Type *VectorTy = ToVectorTy(ValTy, VF);
5523   unsigned Alignment = getLoadStoreAlignment(I);
5524   Value *Ptr = getLoadStorePointerOperand(I);
5525   unsigned AS = getLoadStoreAddressSpace(I);
5526   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
5527 
5528   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5529          "Stride should be 1 or -1 for consecutive memory access");
5530   unsigned Cost = 0;
5531   if (Legal->isMaskRequired(I))
5532     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
5533   else
5534     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I);
5535 
5536   bool Reverse = ConsecutiveStride < 0;
5537   if (Reverse)
5538     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5539   return Cost;
5540 }
5541 
5542 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5543                                                          unsigned VF) {
5544   Type *ValTy = getMemInstValueType(I);
5545   Type *VectorTy = ToVectorTy(ValTy, VF);
5546   unsigned Alignment = getLoadStoreAlignment(I);
5547   unsigned AS = getLoadStoreAddressSpace(I);
5548   if (isa<LoadInst>(I)) {
5549     return TTI.getAddressComputationCost(ValTy) +
5550            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) +
5551            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
5552   }
5553   StoreInst *SI = cast<StoreInst>(I);
5554 
5555   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
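  // If the stored value is loop varying, only the value of the last vector
  // lane needs to be stored to the invariant address, hence the single
  // extract of lane VF - 1 accounted for below.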
5556   return TTI.getAddressComputationCost(ValTy) +
5557          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) +
5558          (isLoopInvariantStoreValue ? 0 : TTI.getVectorInstrCost(
5559                                                Instruction::ExtractElement,
5560                                                VectorTy, VF - 1));
5561 }
5562 
5563 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5564                                                           unsigned VF) {
5565   Type *ValTy = getMemInstValueType(I);
5566   Type *VectorTy = ToVectorTy(ValTy, VF);
5567   unsigned Alignment = getLoadStoreAlignment(I);
5568   Value *Ptr = getLoadStorePointerOperand(I);
5569 
5570   return TTI.getAddressComputationCost(VectorTy) +
5571          TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
5572                                     Legal->isMaskRequired(I), Alignment);
5573 }
5574 
5575 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
5576                                                             unsigned VF) {
5577   Type *ValTy = getMemInstValueType(I);
5578   Type *VectorTy = ToVectorTy(ValTy, VF);
5579   unsigned AS = getLoadStoreAddressSpace(I);
5580 
5581   auto Group = getInterleavedAccessGroup(I);
5582   assert(Group && "Fail to get an interleaved access group.");
5583 
5584   unsigned InterleaveFactor = Group->getFactor();
5585   Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
5586 
5587   // Holds the indices of existing members in an interleaved load group.
5588   // An interleaved store group doesn't need this as it doesn't allow gaps.
5589   SmallVector<unsigned, 4> Indices;
5590   if (isa<LoadInst>(I)) {
5591     for (unsigned i = 0; i < InterleaveFactor; i++)
5592       if (Group->getMember(i))
5593         Indices.push_back(i);
5594   }
5595 
5596   // Calculate the cost of the whole interleaved group.
5597   bool UseMaskForGaps =
5598       Group->requiresScalarEpilogue() && !IsScalarEpilogueAllowed;
5599   unsigned Cost = TTI.getInterleavedMemoryOpCost(
5600       I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
5601       Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps);
5602 
5603   if (Group->isReverse()) {
5604     // TODO: Add support for reversed masked interleaved access.
5605     assert(!Legal->isMaskRequired(I) &&
5606            "Reverse masked interleaved access not supported.");
5607     Cost += Group->getNumMembers() *
5608             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5609   }
5610   return Cost;
5611 }
5612 
5613 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
5614                                                               unsigned VF) {
  // Calculate the scalar cost only. The vectorization cost should already
  // have been computed at this point.
5617   if (VF == 1) {
5618     Type *ValTy = getMemInstValueType(I);
5619     unsigned Alignment = getLoadStoreAlignment(I);
5620     unsigned AS = getLoadStoreAddressSpace(I);
5621 
5622     return TTI.getAddressComputationCost(ValTy) +
5623            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I);
5624   }
5625   return getWideningCost(I, VF);
5626 }
5627 
5628 LoopVectorizationCostModel::VectorizationCostTy
5629 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
5630   // If we know that this instruction will remain uniform, check the cost of
5631   // the scalar version.
5632   if (isUniformAfterVectorization(I, VF))
5633     VF = 1;
5634 
5635   if (VF > 1 && isProfitableToScalarize(I, VF))
5636     return VectorizationCostTy(InstsToScalarize[VF][I], false);
5637 
5638   // Forced scalars do not have any scalarization overhead.
5639   auto ForcedScalar = ForcedScalars.find(VF);
5640   if (VF > 1 && ForcedScalar != ForcedScalars.end()) {
5641     auto InstSet = ForcedScalar->second;
5642     if (InstSet.find(I) != InstSet.end())
5643       return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false);
5644   }
5645 
5646   Type *VectorTy;
5647   unsigned C = getInstructionCost(I, VF, VectorTy);
5648 
5649   bool TypeNotScalarized =
5650       VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF;
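  // For example, a <4 x i32> value on a target with 128-bit vector registers
  // legalizes to a single part, so 1 < VF = 4 and the type counts as not
  // scalarized; a type needing VF or more parts is effectively scalarized.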
5651   return VectorizationCostTy(C, TypeNotScalarized);
5652 }
5653 
5654 void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
5655   if (VF == 1)
5656     return;
5657   NumPredStores = 0;
5658   for (BasicBlock *BB : TheLoop->blocks()) {
5659     // For each instruction in the old loop.
5660     for (Instruction &I : *BB) {
      Value *Ptr = getLoadStorePointerOperand(&I);
5662       if (!Ptr)
5663         continue;
5664 
5665       // TODO: We should generate better code and update the cost model for
5666       // predicated uniform stores. Today they are treated as any other
5667       // predicated store (see added test cases in
5668       // invariant-store-vectorization.ll).
5669       if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
5670         NumPredStores++;
5671 
5672       if (Legal->isUniform(Ptr) &&
5673           // Conditional loads and stores should be scalarized and predicated.
5674           // isScalarWithPredication cannot be used here since masked
5675           // gather/scatters are not considered scalar with predication.
5676           !Legal->blockNeedsPredication(I.getParent())) {
5677         // TODO: Avoid replicating loads and stores instead of
5678         // relying on instcombine to remove them.
5679         // Load: Scalar load + broadcast
5680         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
5681         unsigned Cost = getUniformMemOpCost(&I, VF);
5682         setWideningDecision(&I, VF, CM_Scalarize, Cost);
5683         continue;
5684       }
5685 
5686       // We assume that widening is the best solution when possible.
5687       if (memoryInstructionCanBeWidened(&I, VF)) {
5688         unsigned Cost = getConsecutiveMemOpCost(&I, VF);
5689         int ConsecutiveStride =
5690                Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
5691         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5692                "Expected consecutive stride.");
5693         InstWidening Decision =
5694             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
5695         setWideningDecision(&I, VF, Decision, Cost);
5696         continue;
5697       }
5698 
5699       // Choose between Interleaving, Gather/Scatter or Scalarization.
5700       unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
5701       unsigned NumAccesses = 1;
5702       if (isAccessInterleaved(&I)) {
5703         auto Group = getInterleavedAccessGroup(&I);
5704         assert(Group && "Fail to get an interleaved access group.");
5705 
5706         // Make one decision for the whole group.
5707         if (getWideningDecision(&I, VF) != CM_Unknown)
5708           continue;
5709 
5710         NumAccesses = Group->getNumMembers();
5711         if (interleavedAccessCanBeWidened(&I, VF))
5712           InterleaveCost = getInterleaveGroupCost(&I, VF);
5713       }
5714 
5715       unsigned GatherScatterCost =
5716           isLegalGatherOrScatter(&I)
5717               ? getGatherScatterCost(&I, VF) * NumAccesses
5718               : std::numeric_limits<unsigned>::max();
5719 
5720       unsigned ScalarizationCost =
5721           getMemInstScalarizationCost(&I, VF) * NumAccesses;
5722 
      // Choose the best option for the current VF, record this decision, and
      // use it during vectorization.
5725       unsigned Cost;
5726       InstWidening Decision;
5727       if (InterleaveCost <= GatherScatterCost &&
5728           InterleaveCost < ScalarizationCost) {
5729         Decision = CM_Interleave;
5730         Cost = InterleaveCost;
5731       } else if (GatherScatterCost < ScalarizationCost) {
5732         Decision = CM_GatherScatter;
5733         Cost = GatherScatterCost;
5734       } else {
5735         Decision = CM_Scalarize;
5736         Cost = ScalarizationCost;
5737       }
      // If the instruction belongs to an interleave group, the whole group
      // receives the same decision. The cost is computed for the whole group,
      // but it will actually be assigned to a single member instruction.
5741       if (auto Group = getInterleavedAccessGroup(&I))
5742         setWideningDecision(Group, VF, Decision, Cost);
5743       else
5744         setWideningDecision(&I, VF, Decision, Cost);
5745     }
5746   }
5747 
5748   // Make sure that any load of address and any other address computation
5749   // remains scalar unless there is gather/scatter support. This avoids
5750   // inevitable extracts into address registers, and also has the benefit of
5751   // activating LSR more, since that pass can't optimize vectorized
5752   // addresses.
5753   if (TTI.prefersVectorizedAddressing())
5754     return;
5755 
5756   // Start with all scalar pointer uses.
5757   SmallPtrSet<Instruction *, 8> AddrDefs;
5758   for (BasicBlock *BB : TheLoop->blocks())
5759     for (Instruction &I : *BB) {
5760       Instruction *PtrDef =
5761         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
5762       if (PtrDef && TheLoop->contains(PtrDef) &&
5763           getWideningDecision(&I, VF) != CM_GatherScatter)
5764         AddrDefs.insert(PtrDef);
5765     }
5766 
5767   // Add all instructions used to generate the addresses.
5768   SmallVector<Instruction *, 4> Worklist;
5769   for (auto *I : AddrDefs)
5770     Worklist.push_back(I);
5771   while (!Worklist.empty()) {
5772     Instruction *I = Worklist.pop_back_val();
5773     for (auto &Op : I->operands())
5774       if (auto *InstOp = dyn_cast<Instruction>(Op))
5775         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
5776             AddrDefs.insert(InstOp).second)
5777           Worklist.push_back(InstOp);
5778   }
5779 
5780   for (auto *I : AddrDefs) {
5781     if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled by
      // the cost functions, but since this requires determining whether the
      // loaded value is involved in an address computation, it is instead
      // changed here when we know this is the case.
5786       InstWidening Decision = getWideningDecision(I, VF);
5787       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
5788         // Scalarize a widened load of address.
5789         setWideningDecision(I, VF, CM_Scalarize,
5790                             (VF * getMemoryInstructionCost(I, 1)));
5791       else if (auto Group = getInterleavedAccessGroup(I)) {
5792         // Scalarize an interleave group of address loads.
5793         for (unsigned I = 0; I < Group->getFactor(); ++I) {
5794           if (Instruction *Member = Group->getMember(I))
5795             setWideningDecision(Member, VF, CM_Scalarize,
5796                                 (VF * getMemoryInstructionCost(Member, 1)));
5797         }
5798       }
5799     } else
5800       // Make sure I gets scalarized and a cost estimate without
5801       // scalarization overhead.
5802       ForcedScalars[VF].insert(I);
5803   }
5804 }
5805 
5806 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
5807                                                         unsigned VF,
5808                                                         Type *&VectorTy) {
5809   Type *RetTy = I->getType();
5810   if (canTruncateToMinimalBitwidth(I, VF))
5811     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
5812   VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
5813   auto SE = PSE.getSE();
5814 
5815   // TODO: We need to estimate the cost of intrinsic calls.
5816   switch (I->getOpcode()) {
5817   case Instruction::GetElementPtr:
5818     // We mark this instruction as zero-cost because the cost of GEPs in
5819     // vectorized code depends on whether the corresponding memory instruction
5820     // is scalarized or not. Therefore, we handle GEPs with the memory
5821     // instruction cost.
5822     return 0;
5823   case Instruction::Br: {
    // In the case of scalarized and predicated instructions, there will be VF
    // predicated blocks in the vectorized loop. Each branch around these
    // blocks also requires an extract of its vector compare i1 element.
5827     bool ScalarPredicatedBB = false;
5828     BranchInst *BI = cast<BranchInst>(I);
5829     if (VF > 1 && BI->isConditional() &&
5830         (PredicatedBBsAfterVectorization.find(BI->getSuccessor(0)) !=
5831              PredicatedBBsAfterVectorization.end() ||
5832          PredicatedBBsAfterVectorization.find(BI->getSuccessor(1)) !=
5833              PredicatedBBsAfterVectorization.end()))
5834       ScalarPredicatedBB = true;
5835 
5836     if (ScalarPredicatedBB) {
5837       // Return cost for branches around scalarized and predicated blocks.
5838       Type *Vec_i1Ty =
5839           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
5840       return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) +
5841               (TTI.getCFInstrCost(Instruction::Br) * VF));
5842     } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1)
5843       // The back-edge branch will remain, as will all scalar branches.
5844       return TTI.getCFInstrCost(Instruction::Br);
5845     else
5846       // This branch will be eliminated by if-conversion.
5847       return 0;
5848     // Note: We currently assume zero cost for an unconditional branch inside
5849     // a predicated block since it will become a fall-through, although we
5850     // may decide in the future to call TTI for all branches.
5851   }
5852   case Instruction::PHI: {
5853     auto *Phi = cast<PHINode>(I);
5854 
5855     // First-order recurrences are replaced by vector shuffles inside the loop.
5856     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
5857     if (VF > 1 && Legal->isFirstOrderRecurrence(Phi))
5858       return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
5859                                 VectorTy, VF - 1, VectorType::get(RetTy, 1));
5860 
5861     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
5862     // converted into select instructions. We require N - 1 selects per phi
5863     // node, where N is the number of incoming values.
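    // For example, a phi in a non-header block with three incoming values is
    // lowered to two vector selects, so it costs twice the select cost.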
5864     if (VF > 1 && Phi->getParent() != TheLoop->getHeader())
5865       return (Phi->getNumIncomingValues() - 1) *
5866              TTI.getCmpSelInstrCost(
5867                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
5868                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF));
5869 
5870     return TTI.getCFInstrCost(Instruction::PHI);
5871   }
5872   case Instruction::UDiv:
5873   case Instruction::SDiv:
5874   case Instruction::URem:
5875   case Instruction::SRem:
5876     // If we have a predicated instruction, it may not be executed for each
5877     // vector lane. Get the scalarization cost and scale this amount by the
5878     // probability of executing the predicated block. If the instruction is not
5879     // predicated, we fall through to the next case.
5880     if (VF > 1 && isScalarWithPredication(I)) {
5881       unsigned Cost = 0;
5882 
5883       // These instructions have a non-void type, so account for the phi nodes
5884       // that we will create. This cost is likely to be zero. The phi node
5885       // cost, if any, should be scaled by the block probability because it
5886       // models a copy at the end of each predicated block.
5887       Cost += VF * TTI.getCFInstrCost(Instruction::PHI);
5888 
5889       // The cost of the non-predicated instruction.
5890       Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy);
5891 
5892       // The cost of insertelement and extractelement instructions needed for
5893       // scalarization.
5894       Cost += getScalarizationOverhead(I, VF, TTI);
5895 
5896       // Scale the cost by the probability of executing the predicated blocks.
5897       // This assumes the predicated block for each vector lane is equally
5898       // likely.
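      // For example (illustrative numbers): at VF = 4 with a PHI cost of 0,
      // an sdiv cost of 5 per lane and a scalarization overhead of 6, the
      // predicated cost is (4 * 5 + 6) / 2 = 13, assuming a reciprocal block
      // probability of 2.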
5899       return Cost / getReciprocalPredBlockProb();
5900     }
5901     LLVM_FALLTHROUGH;
5902   case Instruction::Add:
5903   case Instruction::FAdd:
5904   case Instruction::Sub:
5905   case Instruction::FSub:
5906   case Instruction::Mul:
5907   case Instruction::FMul:
5908   case Instruction::FDiv:
5909   case Instruction::FRem:
5910   case Instruction::Shl:
5911   case Instruction::LShr:
5912   case Instruction::AShr:
5913   case Instruction::And:
5914   case Instruction::Or:
5915   case Instruction::Xor: {
    // Since we will replace the stride by 1, the multiplication should go away.
5917     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
5918       return 0;
    // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this is shifts on x86.
5921     Value *Op2 = I->getOperand(1);
5922     TargetTransformInfo::OperandValueProperties Op2VP;
5923     TargetTransformInfo::OperandValueKind Op2VK =
5924         TTI.getOperandInfo(Op2, Op2VP);
5925     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
5926       Op2VK = TargetTransformInfo::OK_UniformValue;
5927 
5928     SmallVector<const Value *, 4> Operands(I->operand_values());
5929     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
5930     return N * TTI.getArithmeticInstrCost(
5931                    I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
5932                    Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands);
5933   }
5934   case Instruction::Select: {
5935     SelectInst *SI = cast<SelectInst>(I);
5936     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
5937     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
5938     Type *CondTy = SI->getCondition()->getType();
5939     if (!ScalarCond)
5940       CondTy = VectorType::get(CondTy, VF);
5941 
5942     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I);
5943   }
5944   case Instruction::ICmp:
5945   case Instruction::FCmp: {
5946     Type *ValTy = I->getOperand(0)->getType();
5947     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
5948     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
5949       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
5950     VectorTy = ToVectorTy(ValTy, VF);
5951     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I);
5952   }
5953   case Instruction::Store:
5954   case Instruction::Load: {
5955     unsigned Width = VF;
5956     if (Width > 1) {
5957       InstWidening Decision = getWideningDecision(I, Width);
5958       assert(Decision != CM_Unknown &&
5959              "CM decision should be taken at this point");
5960       if (Decision == CM_Scalarize)
5961         Width = 1;
5962     }
5963     VectorTy = ToVectorTy(getMemInstValueType(I), Width);
5964     return getMemoryInstructionCost(I, VF);
5965   }
5966   case Instruction::ZExt:
5967   case Instruction::SExt:
5968   case Instruction::FPToUI:
5969   case Instruction::FPToSI:
5970   case Instruction::FPExt:
5971   case Instruction::PtrToInt:
5972   case Instruction::IntToPtr:
5973   case Instruction::SIToFP:
5974   case Instruction::UIToFP:
5975   case Instruction::Trunc:
5976   case Instruction::FPTrunc:
5977   case Instruction::BitCast: {
5978     // We optimize the truncation of induction variables having constant
5979     // integer steps. The cost of these truncations is the same as the scalar
5980     // operation.
5981     if (isOptimizableIVTruncate(I, VF)) {
5982       auto *Trunc = cast<TruncInst>(I);
5983       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
5984                                   Trunc->getSrcTy(), Trunc);
5985     }
5986 
5987     Type *SrcScalarTy = I->getOperand(0)->getType();
5988     Type *SrcVecTy =
5989         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
5990     if (canTruncateToMinimalBitwidth(I, VF)) {
      // This cast is going to be shrunk. This may remove the cast or it might
      // turn it into a slightly different cast. For example, if MinBW == 16,
5993       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
5994       //
5995       // Calculate the modified src and dest types.
5996       Type *MinVecTy = VectorTy;
5997       if (I->getOpcode() == Instruction::Trunc) {
5998         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
5999         VectorTy =
6000             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6001       } else if (I->getOpcode() == Instruction::ZExt ||
6002                  I->getOpcode() == Instruction::SExt) {
6003         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
6004         VectorTy =
6005             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6006       }
6007     }
6008 
6009     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6010     return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I);
6011   }
6012   case Instruction::Call: {
6013     bool NeedToScalarize;
6014     CallInst *CI = cast<CallInst>(I);
6015     unsigned CallCost = getVectorCallCost(CI, VF, TTI, TLI, NeedToScalarize);
6016     if (getVectorIntrinsicIDForCall(CI, TLI))
6017       return std::min(CallCost, getVectorIntrinsicCost(CI, VF, TTI, TLI));
6018     return CallCost;
6019   }
6020   default:
6021     // The cost of executing VF copies of the scalar instruction. This opcode
6022     // is unknown. Assume that it is the same as 'mul'.
6023     return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) +
6024            getScalarizationOverhead(I, VF, TTI);
6025   } // end of switch.
6026 }
6027 
6028 char LoopVectorize::ID = 0;
6029 
6030 static const char lv_name[] = "Loop Vectorization";
6031 
6032 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
6033 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
6034 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
6035 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
6036 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
6037 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
6038 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
6039 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
6040 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
6041 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
6042 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
6043 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
6044 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
6045 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
6046 
6047 namespace llvm {
6048 
6049 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
6050                               bool VectorizeOnlyWhenForced) {
6051   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
6052 }
6053 
6054 } // end namespace llvm
6055 
6056 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
6057   // Check if the pointer operand of a load or store instruction is
6058   // consecutive.
6059   if (auto *Ptr = getLoadStorePointerOperand(Inst))
6060     return Legal->isConsecutivePtr(Ptr);
6061   return false;
6062 }
6063 
6064 void LoopVectorizationCostModel::collectValuesToIgnore() {
6065   // Ignore ephemeral values.
6066   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
6067 
6068   // Ignore type-promoting instructions we identified during reduction
6069   // detection.
6070   for (auto &Reduction : *Legal->getReductionVars()) {
6071     RecurrenceDescriptor &RedDes = Reduction.second;
6072     SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6073     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6074   }
6075   // Ignore type-casting instructions we identified during induction
6076   // detection.
6077   for (auto &Induction : *Legal->getInductionVars()) {
6078     InductionDescriptor &IndDes = Induction.second;
6079     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6080     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6081   }
6082 }
6083 
6084 VectorizationFactor
6085 LoopVectorizationPlanner::planInVPlanNativePath(bool OptForSize,
6086                                                 unsigned UserVF) {
  // Outer loop handling: outer loops may require CFG and instruction-level
  // transformations before even evaluating whether vectorization is
  // profitable. Since we cannot modify the incoming IR, we need to build VPlan
  // upfront in the vectorization pipeline.
6091   if (!OrigLoop->empty()) {
6092     // TODO: If UserVF is not provided, we set UserVF to 4 for stress testing.
6093     // This won't be necessary when UserVF is not required in the VPlan-native
6094     // path.
6095     if (VPlanBuildStressTest && !UserVF)
6096       UserVF = 4;
6097 
6098     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
6099     assert(UserVF && "Expected UserVF for outer loop vectorization.");
6100     assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
6101     LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
6102     buildVPlans(UserVF, UserVF);
6103 
6104     // For VPlan build stress testing, we bail out after VPlan construction.
6105     if (VPlanBuildStressTest)
6106       return VectorizationFactor::Disabled();
6107 
6108     return {UserVF, 0};
6109   }
6110 
6111   LLVM_DEBUG(
6112       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
6113                 "VPlan-native path.\n");
6114   return VectorizationFactor::Disabled();
6115 }
6116 
6117 Optional<VectorizationFactor> LoopVectorizationPlanner::plan(bool OptForSize,
6118                                                              unsigned UserVF) {
6119   assert(OrigLoop->empty() && "Inner loop expected.");
6120   Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(OptForSize);
  if (!MaybeMaxVF) // Cases that should not be vectorized or interleaved.
6122     return None;
6123 
  // Invalidate interleave groups if all blocks of the loop will be predicated.
6125   if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
6126       !useMaskedInterleavedAccesses(*TTI)) {
6127     LLVM_DEBUG(
6128         dbgs()
6129         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
6130            "which requires masked-interleaved support.\n");
6131     CM.InterleaveInfo.reset();
6132   }
6133 
6134   if (UserVF) {
6135     LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
6136     assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
6137     // Collect the instructions (and their associated costs) that will be more
6138     // profitable to scalarize.
6139     CM.selectUserVectorizationFactor(UserVF);
6140     buildVPlansWithVPRecipes(UserVF, UserVF);
6141     LLVM_DEBUG(printPlans(dbgs()));
6142     return {{UserVF, 0}};
6143   }
6144 
6145   unsigned MaxVF = MaybeMaxVF.getValue();
6146   assert(MaxVF != 0 && "MaxVF is zero.");
6147 
6148   for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
6149     // Collect Uniform and Scalar instructions after vectorization with VF.
6150     CM.collectUniformsAndScalars(VF);
6151 
6152     // Collect the instructions (and their associated costs) that will be more
6153     // profitable to scalarize.
6154     if (VF > 1)
6155       CM.collectInstsToScalarize(VF);
6156   }
6157 
6158   buildVPlansWithVPRecipes(1, MaxVF);
6159   LLVM_DEBUG(printPlans(dbgs()));
6160   if (MaxVF == 1)
6161     return VectorizationFactor::Disabled();
6162 
6163   // Select the optimal vectorization factor.
6164   return CM.selectVectorizationFactor(MaxVF);
6165 }
6166 
6167 void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) {
6168   LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
6169                     << '\n');
6170   BestVF = VF;
6171   BestUF = UF;
6172 
6173   erase_if(VPlans, [VF](const VPlanPtr &Plan) {
6174     return !Plan->hasVF(VF);
6175   });
  assert(VPlans.size() == 1 && "Best VF does not have a single VPlan.");
6177 }
6178 
6179 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
6180                                            DominatorTree *DT) {
6181   // Perform the actual loop transformation.
6182 
6183   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
6184   VPCallbackILV CallbackILV(ILV);
6185 
6186   VPTransformState State{BestVF, BestUF,      LI,
6187                          DT,     ILV.Builder, ILV.VectorLoopValueMap,
6188                          &ILV,   CallbackILV};
6189   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
6190   State.TripCount = ILV.getOrCreateTripCount(nullptr);
6191 
6192   //===------------------------------------------------===//
6193   //
  // Notice: any optimization or new instruction that goes
  // into the code below should also be implemented in
  // the cost model.
6197   //
6198   //===------------------------------------------------===//
6199 
6200   // 2. Copy and widen instructions from the old loop into the new loop.
6201   assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
6202   VPlans.front()->execute(&State);
6203 
6204   // 3. Fix the vectorized code: take care of header phi's, live-outs,
6205   //    predication, updating analyses.
6206   ILV.fixVectorizedLoop();
6207 }
6208 
6209 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
6210     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
6211   BasicBlock *Latch = OrigLoop->getLoopLatch();
6212 
6213   // We create new control-flow for the vectorized loop, so the original
6214   // condition will be dead after vectorization if it's only used by the
6215   // branch.
6216   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
6217   if (Cmp && Cmp->hasOneUse())
6218     DeadInstructions.insert(Cmp);
6219 
6220   // We create new "steps" for induction variable updates to which the original
6221   // induction variables map. An original update instruction will be dead if
6222   // all its users except the induction variable are dead.
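  // For example (illustrative names), with a latch of the form
  //   %i.next = add i64 %i, 1
  //   %cmp = icmp slt i64 %i.next, %n
  //   br i1 %cmp, ...
  // both %cmp (handled above) and %i.next (handled below) typically end up in
  // DeadInstructions, since the vector loop emits its own induction steps and
  // latch condition.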
6223   for (auto &Induction : *Legal->getInductionVars()) {
6224     PHINode *Ind = Induction.first;
6225     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
6226     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
6227           return U == Ind || DeadInstructions.find(cast<Instruction>(U)) !=
6228                                  DeadInstructions.end();
6229         }))
6230       DeadInstructions.insert(IndUpdate);
6231 
6232     // We record as "Dead" also the type-casting instructions we had identified
6233     // during induction analysis. We don't need any handling for them in the
6234     // vectorized loop because we have proven that, under a proper runtime
6235     // test guarding the vectorized loop, the value of the phi, and the casted
6236     // value of the phi, are the same. The last instruction in this casting chain
6237     // will get its scalar/vector/widened def from the scalar/vector/widened def
6238     // of the respective phi node. Any other casts in the induction def-use chain
6239     // have no other uses outside the phi update chain, and will be ignored.
6240     InductionDescriptor &IndDes = Induction.second;
6241     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6242     DeadInstructions.insert(Casts.begin(), Casts.end());
6243   }
6244 }
6245 
6246 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
6247 
6248 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
6249 
6250 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
6251                                         Instruction::BinaryOps BinOp) {
6252   // When unrolling and the VF is 1, we only need to add a simple scalar.
6253   Type *Ty = Val->getType();
6254   assert(!Ty->isVectorTy() && "Val must be a scalar");
6255 
6256   if (Ty->isFloatingPointTy()) {
6257     Constant *C = ConstantFP::get(Ty, (double)StartIdx);
6258 
6259     // Floating point operations had to be 'fast' to enable the unrolling.
6260     Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
6261     return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
6262   }
6263   Constant *C = ConstantInt::get(Ty, StartIdx);
6264   return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
6265 }
6266 
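// Illustrative shape of the loop metadata produced below (sketch only; any
// existing operands of the loop ID are preserved ahead of the added node):
//   br ... !llvm.loop !0
//   !0 = distinct !{!0, ..., !1}
//   !1 = !{!"llvm.loop.unroll.runtime.disable"}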
6267 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
6268   SmallVector<Metadata *, 4> MDs;
6269   // Reserve first location for self reference to the LoopID metadata node.
6270   MDs.push_back(nullptr);
6271   bool IsUnrollMetadata = false;
6272   MDNode *LoopID = L->getLoopID();
6273   if (LoopID) {
6274     // First find existing loop unrolling disable metadata.
6275     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
6276       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
6277       if (MD) {
6278         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
6279         IsUnrollMetadata =
6280             S && S->getString().startswith("llvm.loop.unroll.disable");
6281       }
6282       MDs.push_back(LoopID->getOperand(i));
6283     }
6284   }
6285 
6286   if (!IsUnrollMetadata) {
6287     // Add runtime unroll disable metadata.
6288     LLVMContext &Context = L->getHeader()->getContext();
6289     SmallVector<Metadata *, 1> DisableOperands;
6290     DisableOperands.push_back(
6291         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
6292     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
6293     MDs.push_back(DisableNode);
6294     MDNode *NewLoopID = MDNode::get(Context, MDs);
6295     // Set operand 0 to refer to the loop id itself.
6296     NewLoopID->replaceOperandWith(0, NewLoopID);
6297     L->setLoopID(NewLoopID);
6298   }
6299 }
6300 
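// Illustration of getDecisionAndClampRange (hypothetical numbers): with
// Range = {1, 9}, i.e. candidate VFs 1, 2, 4 and 8, if Predicate holds for
// VF 1 and 2 but not for VF 4, then Range.End is clamped to 4 and the decision
// taken at Range.Start (true) is returned for the clamped sub-range {1, 4}.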
6301 bool LoopVectorizationPlanner::getDecisionAndClampRange(
6302     const std::function<bool(unsigned)> &Predicate, VFRange &Range) {
6303   assert(Range.End > Range.Start && "Trying to test an empty VF range.");
6304   bool PredicateAtRangeStart = Predicate(Range.Start);
6305 
6306   for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2)
6307     if (Predicate(TmpVF) != PredicateAtRangeStart) {
6308       Range.End = TmpVF;
6309       break;
6310     }
6311 
6312   return PredicateAtRangeStart;
6313 }
6314 
6315 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
6316 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
6317 /// of VF's starting at a given VF and extending it as much as possible. Each
6318 /// vectorization decision can potentially shorten this sub-range during
6319 /// buildVPlan().
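/// For example (illustrative), with \p MinVF = 1 and \p MaxVF = 8 this may
/// produce VPlans covering sub-ranges such as {1}, {2, 4} and {8}, depending
/// on where the per-VF vectorization decisions change.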
6320 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {
6321   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
6322     VFRange SubRange = {VF, MaxVF + 1};
6323     VPlans.push_back(buildVPlan(SubRange));
6324     VF = SubRange.End;
6325   }
6326 }
6327 
6328 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
6329                                          VPlanPtr &Plan) {
6330   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
6331 
6332   // Look for cached value.
6333   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
6334   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
6335   if (ECEntryIt != EdgeMaskCache.end())
6336     return ECEntryIt->second;
6337 
6338   VPValue *SrcMask = createBlockInMask(Src, Plan);
6339 
6340   // The terminator has to be a branch inst!
6341   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
6342   assert(BI && "Unexpected terminator found");
6343 
6344   if (!BI->isConditional())
6345     return EdgeMaskCache[Edge] = SrcMask;
6346 
6347   VPValue *EdgeMask = Plan->getVPValue(BI->getCondition());
6348   assert(EdgeMask && "No Edge Mask found for condition");
6349 
6350   if (BI->getSuccessor(0) != Dst)
6351     EdgeMask = Builder.createNot(EdgeMask);
6352 
6353   if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
6354     EdgeMask = Builder.createAnd(EdgeMask, SrcMask);
6355 
6356   return EdgeMaskCache[Edge] = EdgeMask;
6357 }
6358 
6359 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
6360   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
6361 
6362   // Look for cached value.
6363   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
6364   if (BCEntryIt != BlockMaskCache.end())
6365     return BCEntryIt->second;
6366 
6367   // All-one mask is modelled as no-mask following the convention for masked
6368   // load/store/gather/scatter. Initialize BlockMask to no-mask.
6369   VPValue *BlockMask = nullptr;
6370 
6371   if (OrigLoop->getHeader() == BB) {
6372     if (!CM.blockNeedsPredication(BB))
6373       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
6374 
6375     // Introduce the early-exit compare IV <= BTC to form header block mask.
6376     // This is used instead of IV < TC because TC may wrap, unlike BTC.
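    // Illustrative shape of the resulting mask for VF = 4 (pseudo-IR sketch):
    //   %mask = icmp ule <4 x iN> <iv, iv+1, iv+2, iv+3>, splat(BTC)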
6377     VPValue *IV = Plan->getVPValue(Legal->getPrimaryInduction());
6378     VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
6379     BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
6380     return BlockMaskCache[BB] = BlockMask;
6381   }
6382 
6383   // This is the block mask. We OR all incoming edges.
6384   for (auto *Predecessor : predecessors(BB)) {
6385     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
6386     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
6387       return BlockMaskCache[BB] = EdgeMask;
6388 
6389     if (!BlockMask) { // BlockMask has its initialized nullptr value.
6390       BlockMask = EdgeMask;
6391       continue;
6392     }
6393 
6394     BlockMask = Builder.createOr(BlockMask, EdgeMask);
6395   }
6396 
6397   return BlockMaskCache[BB] = BlockMask;
6398 }
6399 
6400 VPInterleaveRecipe *VPRecipeBuilder::tryToInterleaveMemory(Instruction *I,
6401                                                            VFRange &Range,
6402                                                            VPlanPtr &Plan) {
6403   const InterleaveGroup<Instruction> *IG = CM.getInterleavedAccessGroup(I);
6404   if (!IG)
6405     return nullptr;
6406 
6407   // Now check if IG is relevant for VF's in the given range.
6408   auto isIGMember = [&](Instruction *I) -> std::function<bool(unsigned)> {
6409     return [=](unsigned VF) -> bool {
6410       return (VF >= 2 && // Query is illegal for VF == 1
6411               CM.getWideningDecision(I, VF) ==
6412                   LoopVectorizationCostModel::CM_Interleave);
6413     };
6414   };
6415   if (!LoopVectorizationPlanner::getDecisionAndClampRange(isIGMember(I), Range))
6416     return nullptr;
6417 
  // I is a member of an InterleaveGroup for VF's in the (possibly trimmed)
  // range. If it's the primary member of the IG, construct a
  // VPInterleaveRecipe. Otherwise, it's an adjunct member of the IG; do not
  // construct any Recipe for it.
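  // Illustrative example: for a factor-2 group of loads from A[2*i] and
  // A[2*i+1], only the group's insert position reaches this point and gets a
  // VPInterleaveRecipe; the other member was filtered out by the caller.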
6421   assert(I == IG->getInsertPos() &&
6422          "Generating a recipe for an adjunct member of an interleave group");
6423 
6424   VPValue *Mask = nullptr;
6425   if (Legal->isMaskRequired(I))
6426     Mask = createBlockInMask(I->getParent(), Plan);
6427 
6428   return new VPInterleaveRecipe(IG, Mask);
6429 }
6430 
6431 VPWidenMemoryInstructionRecipe *
6432 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
6433                                   VPlanPtr &Plan) {
6434   if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
6435     return nullptr;
6436 
6437   auto willWiden = [&](unsigned VF) -> bool {
6438     if (VF == 1)
6439       return false;
6440     if (CM.isScalarAfterVectorization(I, VF) ||
6441         CM.isProfitableToScalarize(I, VF))
6442       return false;
6443     LoopVectorizationCostModel::InstWidening Decision =
6444         CM.getWideningDecision(I, VF);
6445     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
6446            "CM decision should be taken at this point.");
6447     assert(Decision != LoopVectorizationCostModel::CM_Interleave &&
6448            "Interleave memory opportunity should be caught earlier.");
6449     return Decision != LoopVectorizationCostModel::CM_Scalarize;
6450   };
6451 
6452   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6453     return nullptr;
6454 
6455   VPValue *Mask = nullptr;
6456   if (Legal->isMaskRequired(I))
6457     Mask = createBlockInMask(I->getParent(), Plan);
6458 
6459   return new VPWidenMemoryInstructionRecipe(*I, Mask);
6460 }
6461 
6462 VPWidenIntOrFpInductionRecipe *
6463 VPRecipeBuilder::tryToOptimizeInduction(Instruction *I, VFRange &Range) {
6464   if (PHINode *Phi = dyn_cast<PHINode>(I)) {
6465     // Check if this is an integer or fp induction. If so, build the recipe that
6466     // produces its scalar and vector values.
6467     InductionDescriptor II = Legal->getInductionVars()->lookup(Phi);
6468     if (II.getKind() == InductionDescriptor::IK_IntInduction ||
6469         II.getKind() == InductionDescriptor::IK_FpInduction)
6470       return new VPWidenIntOrFpInductionRecipe(Phi);
6471 
6472     return nullptr;
6473   }
6474 
6475   // Optimize the special case where the source is a constant integer
6476   // induction variable. Notice that we can only optimize the 'trunc' case
6477   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
6478   // (c) other casts depend on pointer size.
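  // Illustrative example: "%t = trunc i64 %iv to i32", where %iv is an integer
  // induction, can be widened as a narrower i32 induction directly instead of
  // widening %iv and truncating every element.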
6479 
6480   // Determine whether \p K is a truncation based on an induction variable that
6481   // can be optimized.
6482   auto isOptimizableIVTruncate =
6483       [&](Instruction *K) -> std::function<bool(unsigned)> {
6484     return
6485         [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); };
6486   };
6487 
6488   if (isa<TruncInst>(I) && LoopVectorizationPlanner::getDecisionAndClampRange(
6489                                isOptimizableIVTruncate(I), Range))
6490     return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
6491                                              cast<TruncInst>(I));
6492   return nullptr;
6493 }
6494 
6495 VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) {
6496   PHINode *Phi = dyn_cast<PHINode>(I);
6497   if (!Phi || Phi->getParent() == OrigLoop->getHeader())
6498     return nullptr;
6499 
6500   // We know that all PHIs in non-header blocks are converted into selects, so
6501   // we don't have to worry about the insertion order and we can just use the
6502   // builder. At this point we generate the predication tree. There may be
6503   // duplications since this is a simple recursive scan, but future
6504   // optimizations will clean it up.
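  // Illustrative shape of the result: a phi [ %a, %bb1 ], [ %b, %bb2 ] becomes
  // roughly select(mask(%bb2 -> parent), %b, %a) once the blend recipe is
  // executed (see VPBlendRecipe::execute below).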
6505 
6506   SmallVector<VPValue *, 2> Masks;
6507   unsigned NumIncoming = Phi->getNumIncomingValues();
6508   for (unsigned In = 0; In < NumIncoming; In++) {
6509     VPValue *EdgeMask =
6510       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
6511     assert((EdgeMask || NumIncoming == 1) &&
6512            "Multiple predecessors with one having a full mask");
6513     if (EdgeMask)
6514       Masks.push_back(EdgeMask);
6515   }
6516   return new VPBlendRecipe(Phi, Masks);
6517 }
6518 
6519 bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
6520                                  VFRange &Range) {
6521 
6522   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
6523       [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
6524 
6525   if (IsPredicated)
6526     return false;
6527 
6528   auto IsVectorizableOpcode = [](unsigned Opcode) {
6529     switch (Opcode) {
6530     case Instruction::Add:
6531     case Instruction::And:
6532     case Instruction::AShr:
6533     case Instruction::BitCast:
6534     case Instruction::Br:
6535     case Instruction::Call:
6536     case Instruction::FAdd:
6537     case Instruction::FCmp:
6538     case Instruction::FDiv:
6539     case Instruction::FMul:
6540     case Instruction::FPExt:
6541     case Instruction::FPToSI:
6542     case Instruction::FPToUI:
6543     case Instruction::FPTrunc:
6544     case Instruction::FRem:
6545     case Instruction::FSub:
6546     case Instruction::GetElementPtr:
6547     case Instruction::ICmp:
6548     case Instruction::IntToPtr:
6549     case Instruction::Load:
6550     case Instruction::LShr:
6551     case Instruction::Mul:
6552     case Instruction::Or:
6553     case Instruction::PHI:
6554     case Instruction::PtrToInt:
6555     case Instruction::SDiv:
6556     case Instruction::Select:
6557     case Instruction::SExt:
6558     case Instruction::Shl:
6559     case Instruction::SIToFP:
6560     case Instruction::SRem:
6561     case Instruction::Store:
6562     case Instruction::Sub:
6563     case Instruction::Trunc:
6564     case Instruction::UDiv:
6565     case Instruction::UIToFP:
6566     case Instruction::URem:
6567     case Instruction::Xor:
6568     case Instruction::ZExt:
6569       return true;
6570     }
6571     return false;
6572   };
6573 
6574   if (!IsVectorizableOpcode(I->getOpcode()))
6575     return false;
6576 
6577   if (CallInst *CI = dyn_cast<CallInst>(I)) {
6578     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6579     if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
6580                ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
6581       return false;
6582   }
6583 
6584   auto willWiden = [&](unsigned VF) -> bool {
6585     if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) ||
6586                              CM.isProfitableToScalarize(I, VF)))
6587       return false;
6588     if (CallInst *CI = dyn_cast<CallInst>(I)) {
6589       Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
      // The following case may be scalarized depending on the VF.
      // The flag shows whether we use an intrinsic or a regular call for the
      // vectorized version of the instruction.
      // Is it beneficial to perform the intrinsic call compared to the library
      // call?
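      // Illustrative example: a call to llvm.sqrt.f32 may be widened either to
      // llvm.sqrt.v4f32 or to a vectorized library call; the cheaper of the
      // two, per the cost model, is used below.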
6594       bool NeedToScalarize;
6595       unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize);
6596       bool UseVectorIntrinsic =
6597           ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost;
6598       return UseVectorIntrinsic || !NeedToScalarize;
6599     }
6600     if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
6601       assert(CM.getWideningDecision(I, VF) ==
6602                  LoopVectorizationCostModel::CM_Scalarize &&
6603              "Memory widening decisions should have been taken care by now");
6604       return false;
6605     }
6606     return true;
6607   };
6608 
6609   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6610     return false;
6611 
6612   // Success: widen this instruction. We optimize the common case where
6613   // consecutive instructions can be represented by a single recipe.
6614   if (!VPBB->empty()) {
6615     VPWidenRecipe *LastWidenRecipe = dyn_cast<VPWidenRecipe>(&VPBB->back());
6616     if (LastWidenRecipe && LastWidenRecipe->appendInstruction(I))
6617       return true;
6618   }
6619 
6620   VPBB->appendRecipe(new VPWidenRecipe(I));
6621   return true;
6622 }
6623 
6624 VPBasicBlock *VPRecipeBuilder::handleReplication(
6625     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
6626     DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
6627     VPlanPtr &Plan) {
6628   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
6629       [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); },
6630       Range);
6631 
6632   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
6633       [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
6634 
6635   auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated);
6636 
  // Find out whether I uses a predicated instruction. If so, it will use its
  // scalar value. Avoid hoisting the insert-element which packs the scalar
  // value into a vector value, as that happens iff all users use the vector
  // value.
6640   for (auto &Op : I->operands())
6641     if (auto *PredInst = dyn_cast<Instruction>(Op))
6642       if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
6643         PredInst2Recipe[PredInst]->setAlsoPack(false);
6644 
6645   // Finalize the recipe for Instr, first if it is not predicated.
6646   if (!IsPredicated) {
6647     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
6648     VPBB->appendRecipe(Recipe);
6649     return VPBB;
6650   }
6651   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
6652   assert(VPBB->getSuccessors().empty() &&
6653          "VPBB has successors when handling predicated replication.");
6654   // Record predicated instructions for above packing optimizations.
6655   PredInst2Recipe[I] = Recipe;
6656   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
6657   VPBlockUtils::insertBlockAfter(Region, VPBB);
6658   auto *RegSucc = new VPBasicBlock();
6659   VPBlockUtils::insertBlockAfter(RegSucc, Region);
6660   return RegSucc;
6661 }
6662 
6663 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
6664                                                       VPRecipeBase *PredRecipe,
6665                                                       VPlanPtr &Plan) {
6666   // Instructions marked for predication are replicated and placed under an
6667   // if-then construct to prevent side-effects.
6668 
6669   // Generate recipes to compute the block mask for this region.
6670   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
6671 
6672   // Build the triangular if-then region.
6673   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
6674   assert(Instr->getParent() && "Predicated instruction not in any basic block");
6675   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
6676   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
6677   auto *PHIRecipe =
6678       Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr);
6679   auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
6680   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
6681   VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
6682 
6683   // Note: first set Entry as region entry and then connect successors starting
6684   // from it in order, to propagate the "parent" of each VPBasicBlock.
6685   VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
6686   VPBlockUtils::connectBlocks(Pred, Exit);
6687 
6688   return Region;
6689 }
6690 
6691 bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range,
6692                                         VPlanPtr &Plan, VPBasicBlock *VPBB) {
6693   VPRecipeBase *Recipe = nullptr;
6694   // Check if Instr should belong to an interleave memory recipe, or already
6695   // does. In the latter case Instr is irrelevant.
6696   if ((Recipe = tryToInterleaveMemory(Instr, Range, Plan))) {
6697     VPBB->appendRecipe(Recipe);
6698     return true;
6699   }
6700 
6701   // Check if Instr is a memory operation that should be widened.
6702   if ((Recipe = tryToWidenMemory(Instr, Range, Plan))) {
6703     VPBB->appendRecipe(Recipe);
6704     return true;
6705   }
6706 
6707   // Check if Instr should form some PHI recipe.
6708   if ((Recipe = tryToOptimizeInduction(Instr, Range))) {
6709     VPBB->appendRecipe(Recipe);
6710     return true;
6711   }
6712   if ((Recipe = tryToBlend(Instr, Plan))) {
6713     VPBB->appendRecipe(Recipe);
6714     return true;
6715   }
6716   if (PHINode *Phi = dyn_cast<PHINode>(Instr)) {
6717     VPBB->appendRecipe(new VPWidenPHIRecipe(Phi));
6718     return true;
6719   }
6720 
6721   // Check if Instr is to be widened by a general VPWidenRecipe, after
6722   // having first checked for specific widening recipes that deal with
6723   // Interleave Groups, Inductions and Phi nodes.
6724   if (tryToWiden(Instr, VPBB, Range))
6725     return true;
6726 
6727   return false;
6728 }
6729 
6730 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
6731                                                         unsigned MaxVF) {
6732   assert(OrigLoop->empty() && "Inner loop expected.");
6733 
6734   // Collect conditions feeding internal conditional branches; they need to be
6735   // represented in VPlan for it to model masking.
6736   SmallPtrSet<Value *, 1> NeedDef;
6737 
6738   auto *Latch = OrigLoop->getLoopLatch();
6739   for (BasicBlock *BB : OrigLoop->blocks()) {
6740     if (BB == Latch)
6741       continue;
6742     BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
6743     if (Branch && Branch->isConditional())
6744       NeedDef.insert(Branch->getCondition());
6745   }
6746 
6747   // If the tail is to be folded by masking, the primary induction variable
6748   // needs to be represented in VPlan for it to model early-exit masking.
6749   if (CM.foldTailByMasking())
6750     NeedDef.insert(Legal->getPrimaryInduction());
6751 
6752   // Collect instructions from the original loop that will become trivially dead
6753   // in the vectorized loop. We don't need to vectorize these instructions. For
6754   // example, original induction update instructions can become dead because we
6755   // separately emit induction "steps" when generating code for the new loop.
6756   // Similarly, we create a new latch condition when setting up the structure
6757   // of the new loop, so the old one can become dead.
6758   SmallPtrSet<Instruction *, 4> DeadInstructions;
6759   collectTriviallyDeadInstructions(DeadInstructions);
6760 
6761   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
6762     VFRange SubRange = {VF, MaxVF + 1};
6763     VPlans.push_back(
6764         buildVPlanWithVPRecipes(SubRange, NeedDef, DeadInstructions));
6765     VF = SubRange.End;
6766   }
6767 }
6768 
6769 LoopVectorizationPlanner::VPlanPtr
6770 LoopVectorizationPlanner::buildVPlanWithVPRecipes(
6771     VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
6772     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
  // Hold a mapping from predicated instructions to their recipes, in order to
  // fix their AlsoPack behavior if a user is determined to replicate and use a
  // scalar instead of a vector value.
6776   DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
6777 
6778   DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
6779   DenseMap<Instruction *, Instruction *> SinkAfterInverse;
6780 
6781   // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
6782   VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
6783   auto Plan = llvm::make_unique<VPlan>(VPBB);
6784 
6785   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, TTI, Legal, CM, Builder);
6786   // Represent values that will have defs inside VPlan.
6787   for (Value *V : NeedDef)
6788     Plan->addVPValue(V);
6789 
6790   // Scan the body of the loop in a topological order to visit each basic block
6791   // after having visited its predecessor basic blocks.
6792   LoopBlocksDFS DFS(OrigLoop);
6793   DFS.perform(LI);
6794 
6795   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    // Relevant instructions from basic block BB will be grouped into VPRecipe
    // ingredients and will fill a new VPBasicBlock.
6798     unsigned VPBBsForBB = 0;
6799     auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
6800     VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
6801     VPBB = FirstVPBBForBB;
6802     Builder.setInsertPoint(VPBB);
6803 
6804     std::vector<Instruction *> Ingredients;
6805 
6806     // Organize the ingredients to vectorize from current basic block in the
6807     // right order.
6808     for (Instruction &I : BB->instructionsWithoutDebug()) {
6809       Instruction *Instr = &I;
6810 
6811       // First filter out irrelevant instructions, to ensure no recipes are
6812       // built for them.
6813       if (isa<BranchInst>(Instr) ||
6814           DeadInstructions.find(Instr) != DeadInstructions.end())
6815         continue;
6816 
      // If Instr is an adjunct member of an InterleaveGroup for Range.Start
      // (i.e., not the group's insert position), do not construct any Recipe
      // for it.
6819       const InterleaveGroup<Instruction> *IG =
6820           CM.getInterleavedAccessGroup(Instr);
6821       if (IG && Instr != IG->getInsertPos() &&
6822           Range.Start >= 2 && // Query is illegal for VF == 1
6823           CM.getWideningDecision(Instr, Range.Start) ==
6824               LoopVectorizationCostModel::CM_Interleave) {
6825         auto SinkCandidate = SinkAfterInverse.find(Instr);
6826         if (SinkCandidate != SinkAfterInverse.end())
6827           Ingredients.push_back(SinkCandidate->second);
6828         continue;
6829       }
6830 
6831       // Move instructions to handle first-order recurrences, step 1: avoid
6832       // handling this instruction until after we've handled the instruction it
6833       // should follow.
6834       auto SAIt = SinkAfter.find(Instr);
6835       if (SAIt != SinkAfter.end()) {
6836         LLVM_DEBUG(dbgs() << "Sinking" << *SAIt->first << " after"
6837                           << *SAIt->second
6838                           << " to vectorize a 1st order recurrence.\n");
6839         SinkAfterInverse[SAIt->second] = Instr;
6840         continue;
6841       }
6842 
6843       Ingredients.push_back(Instr);
6844 
6845       // Move instructions to handle first-order recurrences, step 2: push the
6846       // instruction to be sunk at its insertion point.
6847       auto SAInvIt = SinkAfterInverse.find(Instr);
6848       if (SAInvIt != SinkAfterInverse.end())
6849         Ingredients.push_back(SAInvIt->second);
6850     }
6851 
6852     // Introduce each ingredient into VPlan.
6853     for (Instruction *Instr : Ingredients) {
6854       if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB))
6855         continue;
6856 
      // Otherwise, if all widening options failed, the instruction is to be
      // replicated. This may create a successor for VPBB.
6859       VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
6860           Instr, Range, VPBB, PredInst2Recipe, Plan);
6861       if (NextVPBB != VPBB) {
6862         VPBB = NextVPBB;
6863         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
6864                                     : "");
6865       }
6866     }
6867   }
6868 
  // Discard the empty dummy pre-entry VPBasicBlock. Note that other
  // VPBasicBlocks may also be empty, such as the last one (VPBB), reflecting
  // original basic blocks with no recipes.
6872   VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
6873   assert(PreEntry->empty() && "Expecting empty pre-entry block.");
6874   VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
6875   VPBlockUtils::disconnectBlocks(PreEntry, Entry);
6876   delete PreEntry;
6877 
6878   std::string PlanName;
6879   raw_string_ostream RSO(PlanName);
6880   unsigned VF = Range.Start;
6881   Plan->addVF(VF);
6882   RSO << "Initial VPlan for VF={" << VF;
6883   for (VF *= 2; VF < Range.End; VF *= 2) {
6884     Plan->addVF(VF);
6885     RSO << "," << VF;
6886   }
6887   RSO << "},UF>=1";
6888   RSO.flush();
6889   Plan->setName(PlanName);
6890 
6891   return Plan;
6892 }
6893 
6894 LoopVectorizationPlanner::VPlanPtr
6895 LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
  // Outer loop handling: outer loops may require CFG and instruction-level
  // transformations before even evaluating whether vectorization is
  // profitable. Since we cannot modify the incoming IR, we need to build VPlan
  // upfront in the vectorization pipeline.
6900   assert(!OrigLoop->empty());
6901   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
6902 
6903   // Create new empty VPlan
6904   auto Plan = llvm::make_unique<VPlan>();
6905 
6906   // Build hierarchical CFG
6907   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
6908   HCFGBuilder.buildHierarchicalCFG();
6909 
6910   for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
6911     Plan->addVF(VF);
6912 
6913   if (EnableVPlanPredication) {
6914     VPlanPredicator VPP(*Plan);
6915     VPP.predicate();
6916 
    // Avoid running the transformation to recipes until masked code generation
    // in the VPlan-native path is in place.
6919     return Plan;
6920   }
6921 
6922   SmallPtrSet<Instruction *, 1> DeadInstructions;
6923   VPlanHCFGTransforms::VPInstructionsToVPRecipes(
6924       Plan, Legal->getInductionVars(), DeadInstructions);
6925 
6926   return Plan;
6927 }
6928 
Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateVectorValues(
    Value *V, unsigned Part) {
  return ILV.getOrCreateVectorValue(V, Part);
}
6933 
6934 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const {
6935   O << " +\n"
6936     << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
6937   IG->getInsertPos()->printAsOperand(O, false);
6938   if (User) {
6939     O << ", ";
6940     User->getOperand(0)->printAsOperand(O);
6941   }
6942   O << "\\l\"";
6943   for (unsigned i = 0; i < IG->getFactor(); ++i)
6944     if (Instruction *I = IG->getMember(i))
6945       O << " +\n"
6946         << Indent << "\"  " << VPlanIngredient(I) << " " << i << "\\l\"";
6947 }
6948 
6949 void VPWidenRecipe::execute(VPTransformState &State) {
6950   for (auto &Instr : make_range(Begin, End))
6951     State.ILV->widenInstruction(Instr);
6952 }
6953 
6954 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
6955   assert(!State.Instance && "Int or FP induction being replicated.");
6956   State.ILV->widenIntOrFpInduction(IV, Trunc);
6957 }
6958 
6959 void VPWidenPHIRecipe::execute(VPTransformState &State) {
6960   State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
6961 }
6962 
6963 void VPBlendRecipe::execute(VPTransformState &State) {
6964   State.ILV->setDebugLocFromInst(State.Builder, Phi);
6965   // We know that all PHIs in non-header blocks are converted into
6966   // selects, so we don't have to worry about the insertion order and we
6967   // can just use the builder.
6968   // At this point we generate the predication tree. There may be
6969   // duplications since this is a simple recursive scan, but future
6970   // optimizations will clean it up.
6971 
6972   unsigned NumIncoming = Phi->getNumIncomingValues();
6973 
  assert((User || NumIncoming == 1) &&
         "Multiple predecessors with one having a full mask");
6976   // Generate a sequence of selects of the form:
6977   // SELECT(Mask3, In3,
6978   //      SELECT(Mask2, In2,
6979   //                   ( ...)))
6980   InnerLoopVectorizer::VectorParts Entry(State.UF);
6981   for (unsigned In = 0; In < NumIncoming; ++In) {
6982     for (unsigned Part = 0; Part < State.UF; ++Part) {
6983       // We might have single edge PHIs (blocks) - use an identity
6984       // 'select' for the first PHI operand.
6985       Value *In0 =
6986           State.ILV->getOrCreateVectorValue(Phi->getIncomingValue(In), Part);
6987       if (In == 0)
6988         Entry[Part] = In0; // Initialize with the first incoming value.
6989       else {
6990         // Select between the current value and the previous incoming edge
6991         // based on the incoming mask.
6992         Value *Cond = State.get(User->getOperand(In), Part);
6993         Entry[Part] =
6994             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
6995       }
6996     }
6997   }
6998   for (unsigned Part = 0; Part < State.UF; ++Part)
6999     State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
7000 }
7001 
7002 void VPInterleaveRecipe::execute(VPTransformState &State) {
7003   assert(!State.Instance && "Interleave group being replicated.");
7004   if (!User)
7005     return State.ILV->vectorizeInterleaveGroup(IG->getInsertPos());
7006 
7007   // Last (and currently only) operand is a mask.
7008   InnerLoopVectorizer::VectorParts MaskValues(State.UF);
7009   VPValue *Mask = User->getOperand(User->getNumOperands() - 1);
7010   for (unsigned Part = 0; Part < State.UF; ++Part)
7011     MaskValues[Part] = State.get(Mask, Part);
7012   State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), &MaskValues);
7013 }
7014 
7015 void VPReplicateRecipe::execute(VPTransformState &State) {
7016   if (State.Instance) { // Generate a single instance.
7017     State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated);
7018     // Insert scalar instance packing it into a vector.
7019     if (AlsoPack && State.VF > 1) {
7020       // If we're constructing lane 0, initialize to start from undef.
7021       if (State.Instance->Lane == 0) {
7022         Value *Undef =
7023             UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
7024         State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
7025       }
7026       State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
7027     }
7028     return;
7029   }
7030 
  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for
  // each of the UF parts.
7034   unsigned EndLane = IsUniform ? 1 : State.VF;
7035   for (unsigned Part = 0; Part < State.UF; ++Part)
7036     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
7037       State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated);
7038 }
7039 
7040 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
7041   assert(State.Instance && "Branch on Mask works only on single instance.");
7042 
7043   unsigned Part = State.Instance->Part;
7044   unsigned Lane = State.Instance->Lane;
7045 
7046   Value *ConditionBit = nullptr;
7047   if (!User) // Block in mask is all-one.
7048     ConditionBit = State.Builder.getTrue();
7049   else {
7050     VPValue *BlockInMask = User->getOperand(0);
7051     ConditionBit = State.get(BlockInMask, Part);
7052     if (ConditionBit->getType()->isVectorTy())
7053       ConditionBit = State.Builder.CreateExtractElement(
7054           ConditionBit, State.Builder.getInt32(Lane));
7055   }
7056 
7057   // Replace the temporary unreachable terminator with a new conditional branch,
7058   // whose two destinations will be set later when they are created.
7059   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
7060   assert(isa<UnreachableInst>(CurrentTerminator) &&
7061          "Expected to replace unreachable terminator with conditional branch.");
7062   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
7063   CondBr->setSuccessor(0, nullptr);
7064   ReplaceInstWithInst(CurrentTerminator, CondBr);
7065 }
7066 
7067 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
7068   assert(State.Instance && "Predicated instruction PHI works per instance.");
7069   Instruction *ScalarPredInst = cast<Instruction>(
7070       State.ValueMap.getScalarValue(PredInst, *State.Instance));
7071   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
7072   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
7073   assert(PredicatingBB && "Predicated block has no single predecessor.");
7074 
7075   // By current pack/unpack logic we need to generate only a single phi node: if
7076   // a vector value for the predicated instruction exists at this point it means
7077   // the instruction has vector users only, and a phi for the vector value is
7078   // needed. In this case the recipe of the predicated instruction is marked to
7079   // also do that packing, thereby "hoisting" the insert-element sequence.
7080   // Otherwise, a phi node for the scalar value is needed.
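  // Illustrative shape for the vector case (sketch only):
  //   %vphi = phi <VF x T> [ %unmodified.vector, %predicating.bb ],
  //                        [ %vector.with.inserted.element, %predicated.bb ]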
7081   unsigned Part = State.Instance->Part;
7082   if (State.ValueMap.hasVectorValue(PredInst, Part)) {
7083     Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
7084     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
7085     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
7086     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
7087     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
7088     State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
7089   } else {
7090     Type *PredInstType = PredInst->getType();
7091     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
7092     Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
7093     Phi->addIncoming(ScalarPredInst, PredicatedBB);
7094     State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
7095   }
7096 }
7097 
7098 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
7099   if (!User)
7100     return State.ILV->vectorizeMemoryInstruction(&Instr);
7101 
7102   // Last (and currently only) operand is a mask.
7103   InnerLoopVectorizer::VectorParts MaskValues(State.UF);
7104   VPValue *Mask = User->getOperand(User->getNumOperands() - 1);
7105   for (unsigned Part = 0; Part < State.UF; ++Part)
7106     MaskValues[Part] = State.get(Mask, Part);
7107   State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues);
7108 }
7109 
// Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows VPlan-to-VPlan
// transformations to be applied from the very beginning without modifying the
// input LLVM IR.
7114 static bool processLoopInVPlanNativePath(
7115     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
7116     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
7117     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
7118     OptimizationRemarkEmitter *ORE, LoopVectorizeHints &Hints) {
7119 
7120   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
7121   Function *F = L->getHeader()->getParent();
7122   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
7123   LoopVectorizationCostModel CM(L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
7124                                 &Hints, IAI);
7125   // Use the planner for outer loop vectorization.
7126   // TODO: CM is not used at this point inside the planner. Turn CM into an
7127   // optional argument if we don't need it in the future.
7128   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM);
7129 
7130   // Get user vectorization factor.
7131   unsigned UserVF = Hints.getWidth();
7132 
7133   // Check the function attributes to find out if this function should be
7134   // optimized for size.
7135   bool OptForSize =
7136       Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->optForSize();
7137 
7138   // Plan how to best vectorize, return the best VF and its cost.
7139   VectorizationFactor VF = LVP.planInVPlanNativePath(OptForSize, UserVF);
7140 
7141   // If we are stress testing VPlan builds, do not attempt to generate vector
7142   // code. Masked vector code generation support will follow soon.
7143   if (VPlanBuildStressTest || EnableVPlanPredication)
7144     return false;
7145 
7146   LVP.setBestPlan(VF.Width, 1);
7147 
7148   InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, UserVF, 1, LVL,
7149                          &CM);
7150   LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
7151                     << L->getHeader()->getParent()->getName() << "\"\n");
7152   LVP.executePlan(LB, DT);
7153 
7154   // Mark the loop as already vectorized to avoid vectorizing again.
7155   Hints.setAlreadyVectorized();
7156 
7157   LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
7158   return true;
7159 }
7160 
7161 bool LoopVectorizePass::processLoop(Loop *L) {
7162   assert((EnableVPlanNativePath || L->empty()) &&
7163          "VPlan-native path is not enabled. Only process inner loops.");
7164 
7165 #ifndef NDEBUG
7166   const std::string DebugLocStr = getDebugLocString(L);
7167 #endif /* NDEBUG */
7168 
7169   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
7170                     << L->getHeader()->getParent()->getName() << "\" from "
7171                     << DebugLocStr << "\n");
7172 
7173   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
7174 
7175   LLVM_DEBUG(
7176       dbgs() << "LV: Loop hints:"
7177              << " force="
7178              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
7179                      ? "disabled"
7180                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
7181                             ? "enabled"
7182                             : "?"))
7183              << " width=" << Hints.getWidth()
7184              << " unroll=" << Hints.getInterleave() << "\n");
7185 
7186   // Function containing loop
7187   Function *F = L->getHeader()->getParent();
7188 
7189   // Looking at the diagnostic output is the only way to determine if a loop
7190   // was vectorized (other than looking at the IR or machine code), so it
7191   // is important to generate an optimization remark for each loop. Most of
7192   // these messages are generated as OptimizationRemarkAnalysis. Remarks
  // generated as OptimizationRemark and OptimizationRemarkMissed are less
  // verbose, reporting vectorized loops and unvectorized loops that may
  // benefit from vectorization, respectively.
7196 
7197   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
7198     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
7199     return false;
7200   }
7201 
7202   PredicatedScalarEvolution PSE(*SE, *L);
7203 
7204   // Check if it is legal to vectorize the loop.
7205   LoopVectorizationRequirements Requirements(*ORE);
7206   LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, GetLAA, LI, ORE,
7207                                 &Requirements, &Hints, DB, AC);
7208   if (!LVL.canVectorize(EnableVPlanNativePath)) {
7209     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
7210     Hints.emitRemarkWithHints();
7211     return false;
7212   }
7213 
7214   // Check the function attributes to find out if this function should be
7215   // optimized for size.
7216   bool OptForSize =
7217       Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->optForSize();
7218 
7219   // Entrance to the VPlan-native vectorization path. Outer loops are processed
7220   // here. They may require CFG and instruction level transformations before
7221   // even evaluating whether vectorization is profitable. Since we cannot modify
7222   // the incoming IR, we need to build VPlan upfront in the vectorization
7223   // pipeline.
7224   if (!L->empty())
7225     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
7226                                         ORE, Hints);
7227 
7228   assert(L->empty() && "Inner loop expected.");
7229   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
7230   // count by optimizing for size, to minimize overheads.
  // Prefer a constant trip count over profile data, and profile data over an
  // upper-bound estimate.
7232   unsigned ExpectedTC = 0;
7233   bool HasExpectedTC = false;
7234   if (const SCEVConstant *ConstExits =
7235       dyn_cast<SCEVConstant>(SE->getBackedgeTakenCount(L))) {
7236     const APInt &ExitsCount = ConstExits->getAPInt();
    // We are interested in small values for ExpectedTC. Skip over those that
    // can't fit in an unsigned.
7239     if (ExitsCount.ult(std::numeric_limits<unsigned>::max())) {
7240       ExpectedTC = static_cast<unsigned>(ExitsCount.getZExtValue()) + 1;
7241       HasExpectedTC = true;
7242     }
7243   }
7244   // ExpectedTC may be large because it's bound by a variable. Check
7245   // profiling information to validate we should vectorize.
7246   if (!HasExpectedTC && LoopVectorizeWithBlockFrequency) {
7247     auto EstimatedTC = getLoopEstimatedTripCount(L);
7248     if (EstimatedTC) {
7249       ExpectedTC = *EstimatedTC;
7250       HasExpectedTC = true;
7251     }
7252   }
7253   if (!HasExpectedTC) {
7254     ExpectedTC = SE->getSmallConstantMaxTripCount(L);
7255     HasExpectedTC = (ExpectedTC > 0);
7256   }
7257 
7258   if (HasExpectedTC && ExpectedTC < TinyTripCountVectorThreshold) {
7259     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
7260                       << "This loop is worth vectorizing only if no scalar "
7261                       << "iteration overheads are incurred.");
7262     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
7263       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
7264     else {
7265       LLVM_DEBUG(dbgs() << "\n");
7266       // Loops with a very small trip count are considered for vectorization
7267       // under OptForSize, thereby making sure the cost of their loop body is
7268       // dominant, free of runtime guards and scalar iteration overheads.
7269       OptForSize = true;
7270     }
7271   }
7272 
7273   // Check the function attributes to see if implicit floats are allowed.
7274   // FIXME: This check doesn't seem possibly correct -- what if the loop is
7275   // an integer loop and the vector instructions selected are purely integer
7276   // vector instructions?
7277   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
7278     LLVM_DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat"
7279                          "attribute is used.\n");
7280     ORE->emit(createLVMissedAnalysis(Hints.vectorizeAnalysisPassName(),
7281                                      "NoImplicitFloat", L)
7282               << "loop not vectorized due to NoImplicitFloat attribute");
7283     Hints.emitRemarkWithHints();
7284     return false;
7285   }
7286 
7287   // Check if the target supports potentially unsafe FP vectorization.
7288   // FIXME: Add a check for the type of safety issue (denormal, signaling)
7289   // for the target we're vectorizing for, to make sure none of the
7290   // additional fp-math flags can help.
7291   if (Hints.isPotentiallyUnsafe() &&
7292       TTI->isFPVectorizationPotentiallyUnsafe()) {
7293     LLVM_DEBUG(
7294         dbgs() << "LV: Potentially unsafe FP op prevents vectorization.\n");
7295     ORE->emit(
7296         createLVMissedAnalysis(Hints.vectorizeAnalysisPassName(), "UnsafeFP", L)
7297         << "loop not vectorized due to unsafe FP support.");
7298     Hints.emitRemarkWithHints();
7299     return false;
7300   }
7301 
7302   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
7303   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
7304 
7305   // If an override option has been passed in for interleaved accesses, use it.
7306   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
7307     UseInterleaved = EnableInterleavedMemAccesses;
7308 
7309   // Analyze interleaved memory accesses.
7310   if (UseInterleaved) {
7311     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
7312   }
7313 
7314   // Use the cost model.
7315   LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, F,
7316                                 &Hints, IAI);
7317   CM.collectValuesToIgnore();
7318 
7319   // Use the planner for vectorization.
7320   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM);
7321 
7322   // Get user vectorization factor.
7323   unsigned UserVF = Hints.getWidth();
7324 
7325   // Plan how to best vectorize, return the best VF and its cost.
7326   Optional<VectorizationFactor> MaybeVF = LVP.plan(OptForSize, UserVF);
7327 
7328   VectorizationFactor VF = VectorizationFactor::Disabled();
7329   unsigned IC = 1;
7330   unsigned UserIC = Hints.getInterleave();
7331 
7332   if (MaybeVF) {
7333     VF = *MaybeVF;
7334     // Select the interleave count.
7335     IC = CM.selectInterleaveCount(OptForSize, VF.Width, VF.Cost);
7336   }
7337 
7338   // Identify the diagnostic messages that should be produced.
7339   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
7340   bool VectorizeLoop = true, InterleaveLoop = true;
7341   if (Requirements.doesNotMeet(F, L, Hints)) {
7342     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
7343                          "requirements.\n");
7344     Hints.emitRemarkWithHints();
7345     return false;
7346   }
7347 
7348   if (VF.Width == 1) {
7349     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
7350     VecDiagMsg = std::make_pair(
7351         "VectorizationNotBeneficial",
7352         "the cost-model indicates that vectorization is not beneficial");
7353     VectorizeLoop = false;
7354   }
7355 
7356   if (!MaybeVF && UserIC > 1) {
7357     // Tell the user interleaving was avoided up-front, despite being explicitly
7358     // requested.
7359     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
7360                          "interleaving should be avoided up front\n");
7361     IntDiagMsg = std::make_pair(
7362         "InterleavingAvoided",
7363         "Ignoring UserIC, because interleaving was avoided up front");
7364     InterleaveLoop = false;
7365   } else if (IC == 1 && UserIC <= 1) {
7366     // Tell the user interleaving is not beneficial.
7367     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
7368     IntDiagMsg = std::make_pair(
7369         "InterleavingNotBeneficial",
7370         "the cost-model indicates that interleaving is not beneficial");
7371     InterleaveLoop = false;
7372     if (UserIC == 1) {
7373       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
7374       IntDiagMsg.second +=
7375           " and is explicitly disabled or interleave count is set to 1";
7376     }
7377   } else if (IC > 1 && UserIC == 1) {
7378     // Tell the user interleaving is beneficial, but explicitly disabled.
7379     LLVM_DEBUG(dbgs() << "LV: Interleaving is beneficial but is explicitly "
7380                          "disabled.\n");
7381     IntDiagMsg = std::make_pair(
7382         "InterleavingBeneficialButDisabled",
7383         "the cost-model indicates that interleaving is beneficial "
7384         "but is explicitly disabled or interleave count is set to 1");
7385     InterleaveLoop = false;
7386   }
7387 
7388   // Override IC if user provided an interleave count.
7389   IC = UserIC > 0 ? UserIC : IC;
7390 
7391   // Emit diagnostic messages, if any.
7392   const char *VAPassName = Hints.vectorizeAnalysisPassName();
7393   if (!VectorizeLoop && !InterleaveLoop) {
7394     // Do not vectorize or interleave the loop.
7395     ORE->emit([&]() {
7396       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
7397                                       L->getStartLoc(), L->getHeader())
7398              << VecDiagMsg.second;
7399     });
7400     ORE->emit([&]() {
7401       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
7402                                       L->getStartLoc(), L->getHeader())
7403              << IntDiagMsg.second;
7404     });
7405     return false;
7406   } else if (!VectorizeLoop && InterleaveLoop) {
7407     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7408     ORE->emit([&]() {
7409       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
7410                                         L->getStartLoc(), L->getHeader())
7411              << VecDiagMsg.second;
7412     });
7413   } else if (VectorizeLoop && !InterleaveLoop) {
7414     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7415                       << ") in " << DebugLocStr << '\n');
7416     ORE->emit([&]() {
7417       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
7418                                         L->getStartLoc(), L->getHeader())
7419              << IntDiagMsg.second;
7420     });
7421   } else if (VectorizeLoop && InterleaveLoop) {
7422     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7423                       << ") in " << DebugLocStr << '\n');
7424     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7425   }
7426 
7427   LVP.setBestPlan(VF.Width, IC);
7428 
7429   using namespace ore;
7430   bool DisableRuntimeUnroll = false;
7431   MDNode *OrigLoopID = L->getLoopID();
7432 
7433   if (!VectorizeLoop) {
7434     assert(IC > 1 && "interleave count should not be 1 or 0");
7435     // Vectorization was deemed not beneficial (the chosen VF is 1), so
7436     // interleave the loop instead.
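    // InnerLoopUnroller, defined earlier in this file, is an
    // InnerLoopVectorizer fixed to VF = 1, so executing the plan with it only
    // replicates the scalar loop body IC times.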
7437     InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
7438                                &CM);
7439     LVP.executePlan(Unroller, DT);
7440 
7441     ORE->emit([&]() {
7442       return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
7443                                 L->getHeader())
7444              << "interleaved loop (interleaved count: "
7445              << NV("InterleaveCount", IC) << ")";
7446     });
7447   } else {
7448     // Vectorization is both legal and profitable, so do it.
7449     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
7450                            &LVL, &CM);
7451     LVP.executePlan(LB, DT);
7452     ++LoopsVectorized;
7453 
7454     // Add metadata to disable runtime unrolling of the scalar loop when
7455     // there are no runtime checks about strides and memory. A scalar loop
7456     // that is rarely run is not worth unrolling.
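    // (When DisableRuntimeUnroll is set, the llvm.loop.unroll.runtime.disable
    // metadata is attached to the remainder loop further down, via
    // AddRuntimeUnrollDisableMetaData.)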
7457     if (!LB.areSafetyChecksAdded())
7458       DisableRuntimeUnroll = true;
7459 
7460     // Report the vectorization decision.
7461     ORE->emit([&]() {
7462       return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
7463                                 L->getHeader())
7464              << "vectorized loop (vectorization width: "
7465              << NV("VectorizationFactor", VF.Width)
7466              << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
7467     });
7468   }
7469 
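  // If the user attached followup metadata to the original loop (the
  // llvm.loop.vectorize.followup_all and .followup_epilogue properties of
  // !llvm.loop), it is transferred to the scalar remainder loop; otherwise
  // the default markers below are added.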
7470   Optional<MDNode *> RemainderLoopID =
7471       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7472                                       LLVMLoopVectorizeFollowupEpilogue});
7473   if (RemainderLoopID.hasValue()) {
7474     L->setLoopID(RemainderLoopID.getValue());
7475   } else {
7476     if (DisableRuntimeUnroll)
7477       AddRuntimeUnrollDisableMetaData(L);
7478 
7479     // Mark the loop as already vectorized to avoid vectorizing again.
7480     Hints.setAlreadyVectorized();
7481   }
7482 
7483   LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
7484   return true;
7485 }
7486 
7487 bool LoopVectorizePass::runImpl(
7488     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
7489     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
7490     DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
7491     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
7492     OptimizationRemarkEmitter &ORE_) {
7493   SE = &SE_;
7494   LI = &LI_;
7495   TTI = &TTI_;
7496   DT = &DT_;
7497   BFI = &BFI_;
7498   TLI = TLI_;
7499   AA = &AA_;
7500   AC = &AC_;
7501   GetLAA = &GetLAA_;
7502   DB = &DB_;
7503   ORE = &ORE_;
7504 
7505   // Don't attempt if
7506   // 1. the target claims to have no vector registers, and
7507   // 2. interleaving won't help ILP.
7508   //
7509   // The second condition is necessary because, even if the target has no
7510   // vector registers, loop vectorization may still enable scalar
7511   // interleaving.
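  // (getNumberOfRegisters(true) queries the number of vector registers;
  // getMaxInterleaveFactor(1) asks about interleaving at VF = 1, i.e. scalar
  // interleaving.)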
7512   if (!TTI->getNumberOfRegisters(true) && TTI->getMaxInterleaveFactor(1) < 2)
7513     return false;
7514 
7515   bool Changed = false;
7516 
7517   // The vectorizer requires loops to be in simplified form.
7518   // Since simplification may add new inner loops, it has to run before the
7519   // legality and profitability checks. This means running the loop vectorizer
7520   // will simplify all loops, regardless of whether anything ends up being
7521   // vectorized.
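  // Loop-simplify form requires a preheader, a single backedge, and
  // dedicated exit blocks.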
7522   for (auto &L : *LI)
7523     Changed |= simplifyLoop(L, DT, LI, SE, AC, false /* PreserveLCSSA */);
7524 
7525   // Build up a worklist of inner-loops to vectorize. This is necessary as
7526   // the act of vectorizing or partially unrolling a loop creates new loops
7527   // and can invalidate iterators across the loops.
7528   SmallVector<Loop *, 8> Worklist;
7529 
7530   for (Loop *L : *LI)
7531     collectSupportedLoops(*L, LI, ORE, Worklist);
7532 
7533   LoopsAnalyzed += Worklist.size();
7534 
7535   // Now walk the identified inner loops.
7536   while (!Worklist.empty()) {
7537     Loop *L = Worklist.pop_back_val();
7538 
7539     // For the inner loops we actually process, form LCSSA to simplify the
7540     // transform.
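    // (LCSSA guarantees that every value defined in the loop and used outside
    // of it is reached through a PHI node in an exit block.)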
7541     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
7542 
7543     Changed |= processLoop(L);
7544   }
7545 
7546   // Process each loop nest in the function.
7547   return Changed;
7548 }
7549 
7550 PreservedAnalyses LoopVectorizePass::run(Function &F,
7551                                          FunctionAnalysisManager &AM) {
7552   auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
7553   auto &LI = AM.getResult<LoopAnalysis>(F);
7554   auto &TTI = AM.getResult<TargetIRAnalysis>(F);
7555   auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
7556   auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
7557   auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
7558   auto &AA = AM.getResult<AAManager>(F);
7559   auto &AC = AM.getResult<AssumptionAnalysis>(F);
7560   auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
7561   auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
7562   MemorySSA *MSSA = EnableMSSALoopDependency
7563                         ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
7564                         : nullptr;
7565
7566   auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
7567   std::function<const LoopAccessInfo &(Loop &)> GetLAA =
7568       [&](Loop &L) -> const LoopAccessInfo & {
7569     LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA};
7570     return LAM.getResult<LoopAccessAnalysis>(L, AR);
7571   };
7572   bool Changed =
7573       runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE);
7574   if (!Changed)
7575     return PreservedAnalyses::all();
7576   PreservedAnalyses PA;
7577
7578   // We currently do not preserve loopinfo/dominator analyses with outer loop
7579   // vectorization. Until this is addressed, mark these analyses as preserved
7580   // only for the non-VPlan-native path.
7581   // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
7582   if (!EnableVPlanNativePath) {
7583     PA.preserve<LoopAnalysis>();
7584     PA.preserve<DominatorTreeAnalysis>();
7585   }
7586   PA.preserve<BasicAA>();
7587   PA.preserve<GlobalsAA>();
7588   return PA;
7589 }
7590