1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
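//
// For example (an illustrative sketch, not actual pass output), a scalar loop
// such as
//   for (i = 0; i < n; i += 1) a[i] = b[i] + c[i];
// is conceptually rewritten, for a vectorization factor of 4, into
//   for (i = 0; i < n; i += 4) a[i:i+3] = b[i:i+3] + c[i:i+3];
// with any remaining iterations handled by a scalar epilogue loop.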
17 //
18 // This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
28 // There is an ongoing development effort to migrate the loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD.
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanPredicator.h"
62 #include "VPlanTransforms.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SetVector.h"
73 #include "llvm/ADT/SmallPtrSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/MemorySSA.h"
91 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
92 #include "llvm/Analysis/ProfileSummaryInfo.h"
93 #include "llvm/Analysis/ScalarEvolution.h"
94 #include "llvm/Analysis/ScalarEvolutionExpander.h"
95 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
96 #include "llvm/Analysis/TargetLibraryInfo.h"
97 #include "llvm/Analysis/TargetTransformInfo.h"
98 #include "llvm/Analysis/VectorUtils.h"
99 #include "llvm/IR/Attributes.h"
100 #include "llvm/IR/BasicBlock.h"
101 #include "llvm/IR/CFG.h"
102 #include "llvm/IR/Constant.h"
103 #include "llvm/IR/Constants.h"
104 #include "llvm/IR/DataLayout.h"
105 #include "llvm/IR/DebugInfoMetadata.h"
106 #include "llvm/IR/DebugLoc.h"
107 #include "llvm/IR/DerivedTypes.h"
108 #include "llvm/IR/DiagnosticInfo.h"
109 #include "llvm/IR/Dominators.h"
110 #include "llvm/IR/Function.h"
111 #include "llvm/IR/IRBuilder.h"
112 #include "llvm/IR/InstrTypes.h"
113 #include "llvm/IR/Instruction.h"
114 #include "llvm/IR/Instructions.h"
115 #include "llvm/IR/IntrinsicInst.h"
116 #include "llvm/IR/Intrinsics.h"
117 #include "llvm/IR/LLVMContext.h"
118 #include "llvm/IR/Metadata.h"
119 #include "llvm/IR/Module.h"
120 #include "llvm/IR/Operator.h"
121 #include "llvm/IR/Type.h"
122 #include "llvm/IR/Use.h"
123 #include "llvm/IR/User.h"
124 #include "llvm/IR/Value.h"
125 #include "llvm/IR/ValueHandle.h"
126 #include "llvm/IR/Verifier.h"
127 #include "llvm/InitializePasses.h"
128 #include "llvm/Pass.h"
129 #include "llvm/Support/Casting.h"
130 #include "llvm/Support/CommandLine.h"
131 #include "llvm/Support/Compiler.h"
132 #include "llvm/Support/Debug.h"
133 #include "llvm/Support/ErrorHandling.h"
134 #include "llvm/Support/MathExtras.h"
135 #include "llvm/Support/raw_ostream.h"
136 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
137 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
138 #include "llvm/Transforms/Utils/LoopSimplify.h"
139 #include "llvm/Transforms/Utils/LoopUtils.h"
140 #include "llvm/Transforms/Utils/LoopVersioning.h"
141 #include "llvm/Transforms/Utils/SizeOpts.h"
142 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
143 #include <algorithm>
144 #include <cassert>
145 #include <cstdint>
146 #include <cstdlib>
147 #include <functional>
148 #include <iterator>
149 #include <limits>
150 #include <memory>
151 #include <string>
152 #include <tuple>
153 #include <utility>
154 
155 using namespace llvm;
156 
157 #define LV_NAME "loop-vectorize"
158 #define DEBUG_TYPE LV_NAME
159 
160 /// @{
161 /// Metadata attribute names
162 static const char *const LLVMLoopVectorizeFollowupAll =
163     "llvm.loop.vectorize.followup_all";
164 static const char *const LLVMLoopVectorizeFollowupVectorized =
165     "llvm.loop.vectorize.followup_vectorized";
166 static const char *const LLVMLoopVectorizeFollowupEpilogue =
167     "llvm.loop.vectorize.followup_epilogue";
168 /// @}
169 
170 STATISTIC(LoopsVectorized, "Number of loops vectorized");
171 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
172 
173 /// Loops with a known constant trip count below this number are vectorized only
174 /// if no scalar iteration overheads are incurred.
175 static cl::opt<unsigned> TinyTripCountVectorThreshold(
176     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
177     cl::desc("Loops with a constant trip count that is smaller than this "
178              "value are vectorized only if no scalar iteration overheads "
179              "are incurred."));
180 
181 // Indicates that an epilogue is undesired; predication is preferred.
182 // This means that the vectorizer will try to fold the loop-tail (epilogue)
183 // into the loop and predicate the loop body accordingly.
184 static cl::opt<bool> PreferPredicateOverEpilog(
185     "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
186     cl::desc("Indicate that an epilogue is undesired, predication should be "
187              "used instead."));
188 
189 static cl::opt<bool> MaximizeBandwidth(
190     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
191     cl::desc("Maximize bandwidth when selecting the vectorization factor, "
192              "which will be determined by the smallest type in the loop."));
193 
194 static cl::opt<bool> EnableInterleavedMemAccesses(
195     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
196     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
197 
198 /// An interleave-group may need masking if it resides in a block that needs
199 /// predication, or in order to mask away gaps.
200 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
201     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
202     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
203 
204 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
205     "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
206     cl::desc("We don't interleave loops with an estimated constant trip count "
207              "below this number"));
208 
209 static cl::opt<unsigned> ForceTargetNumScalarRegs(
210     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
211     cl::desc("A flag that overrides the target's number of scalar registers."));
212 
213 static cl::opt<unsigned> ForceTargetNumVectorRegs(
214     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
215     cl::desc("A flag that overrides the target's number of vector registers."));
216 
217 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
218     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
219     cl::desc("A flag that overrides the target's max interleave factor for "
220              "scalar loops."));
221 
222 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
223     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
224     cl::desc("A flag that overrides the target's max interleave factor for "
225              "vectorized loops."));
226 
227 static cl::opt<unsigned> ForceTargetInstructionCost(
228     "force-target-instruction-cost", cl::init(0), cl::Hidden,
229     cl::desc("A flag that overrides the target's expected cost for "
230              "an instruction to a single constant value. Mostly "
231              "useful for getting consistent testing."));
232 
233 static cl::opt<unsigned> SmallLoopCost(
234     "small-loop-cost", cl::init(20), cl::Hidden,
235     cl::desc(
236         "The cost of a loop that is considered 'small' by the interleaver."));
237 
238 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
239     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
240     cl::desc("Enable the use of the block frequency analysis to access PGO "
241              "heuristics minimizing code growth in cold regions and being more "
242              "aggressive in hot regions."));
243 
244 // Runtime interleave loops for load/store throughput.
245 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
246     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
247     cl::desc(
248         "Enable runtime interleaving until load/store ports are saturated"));
249 
250 /// The number of stores in a loop that are allowed to need predication.
251 static cl::opt<unsigned> NumberOfStoresToPredicate(
252     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
253     cl::desc("Max number of stores to be predicated behind an if."));
254 
255 static cl::opt<bool> EnableIndVarRegisterHeur(
256     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
257     cl::desc("Count the induction variable only once when interleaving"));
258 
259 static cl::opt<bool> EnableCondStoresVectorization(
260     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
261     cl::desc("Enable if-predication of stores during vectorization."));
262 
263 static cl::opt<unsigned> MaxNestedScalarReductionIC(
264     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
265     cl::desc("The maximum interleave count to use when interleaving a scalar "
266              "reduction in a nested loop."));
267 
268 cl::opt<bool> EnableVPlanNativePath(
269     "enable-vplan-native-path", cl::init(false), cl::Hidden,
270     cl::desc("Enable VPlan-native vectorization path with "
271              "support for outer loop vectorization."));
272 
273 // FIXME: Remove this switch once we have divergence analysis. Currently we
274 // assume divergent non-backedge branches when this switch is true.
275 cl::opt<bool> EnableVPlanPredication(
276     "enable-vplan-predication", cl::init(false), cl::Hidden,
277     cl::desc("Enable VPlan-native vectorization path predicator with "
278              "support for outer loop vectorization."));
279 
280 // This flag enables the stress testing of the VPlan H-CFG construction in the
281 // VPlan-native vectorization path. It must be used in conjunction with
282 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
283 // verification of the H-CFGs built.
284 static cl::opt<bool> VPlanBuildStressTest(
285     "vplan-build-stress-test", cl::init(false), cl::Hidden,
286     cl::desc(
287         "Build VPlan for every supported loop nest in the function and bail "
288         "out right after the build (stress test the VPlan H-CFG construction "
289         "in the VPlan-native vectorization path)."));
290 
291 cl::opt<bool> llvm::EnableLoopInterleaving(
292     "interleave-loops", cl::init(true), cl::Hidden,
293     cl::desc("Enable loop interleaving in Loop vectorization passes"));
294 cl::opt<bool> llvm::EnableLoopVectorization(
295     "vectorize-loops", cl::init(true), cl::Hidden,
296     cl::desc("Run the Loop vectorization passes"));
297 
298 /// A helper function that returns the type of loaded or stored value.
299 static Type *getMemInstValueType(Value *I) {
300   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
301          "Expected Load or Store instruction");
302   if (auto *LI = dyn_cast<LoadInst>(I))
303     return LI->getType();
304   return cast<StoreInst>(I)->getValueOperand()->getType();
305 }
306 
307 /// A helper function that returns true if the given type is irregular. The
308 /// type is irregular if its allocated size doesn't equal the store size of an
309 /// element of the corresponding vector type at the given vectorization factor.
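/// For example (illustrative, assuming a typical data layout), i1 has an
/// allocation size of 8 bits but a type size of 1 bit, so it is irregular even
/// at VF = 1, whereas i32 (32 bits for both) is regular.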
310 static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
311   // Determine if an array of VF elements of type Ty is "bitcast compatible"
312   // with a <VF x Ty> vector.
313   if (VF > 1) {
314     auto *VectorTy = VectorType::get(Ty, VF);
315     return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
316   }
317 
318   // If the vectorization factor is one, we just check if an array of type Ty
319   // requires padding between elements.
320   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
321 }
322 
323 /// A helper function that returns the reciprocal of the block probability of
324 /// predicated blocks. If we return X, we are assuming the predicated block
325 /// will execute once for every X iterations of the loop header.
326 ///
327 /// TODO: We should use actual block probability here, if available. Currently,
328 ///       we always assume predicated blocks have a 50% chance of executing.
329 static unsigned getReciprocalPredBlockProb() { return 2; }
330 
331 /// A helper function that adds a 'fast' flag to floating-point operations.
332 static Value *addFastMathFlag(Value *V) {
333   if (isa<FPMathOperator>(V))
334     cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
335   return V;
336 }
337 
338 static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
339   if (isa<FPMathOperator>(V))
340     cast<Instruction>(V)->setFastMathFlags(FMF);
341   return V;
342 }
343 
344 /// A helper function that returns an integer or floating-point constant with
345 /// value C.
346 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
347   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
348                            : ConstantFP::get(Ty, C);
349 }
350 
351 /// Returns "best known" trip count for the specified loop \p L as defined by
352 /// the following procedure:
353 ///   1) Returns exact trip count if it is known.
354 ///   2) Returns expected trip count according to profile data if any.
355 ///   3) Returns upper bound estimate if it is known.
356 ///   4) Returns None if all of the above failed.
357 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
358   // Check if exact trip count is known.
359   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
360     return ExpectedTC;
361 
362   // Check if there is an expected trip count available from profile data.
363   if (LoopVectorizeWithBlockFrequency)
364     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
365       return EstimatedTC;
366 
367   // Check if upper bound estimate is known.
368   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
369     return ExpectedTC;
370 
371   return None;
372 }
373 
374 namespace llvm {
375 
376 /// InnerLoopVectorizer vectorizes loops which contain only one basic
377 /// block to a specified vectorization factor (VF).
378 /// This class performs the widening of scalars into vectors, or multiple
379 /// scalars. This class also implements the following features:
380 /// * It inserts an epilogue loop for handling loops whose iteration counts
381 ///   are not known to be a multiple of the vectorization factor.
382 /// * It handles the code generation for reduction variables.
383 /// * Scalarization (implementation using scalars) of un-vectorizable
384 ///   instructions.
385 /// InnerLoopVectorizer does not perform any vectorization-legality
386 /// checks, and relies on the caller to check for the different legality
387 /// aspects. The InnerLoopVectorizer relies on the
388 /// LoopVectorizationLegality class to provide information about the induction
389 /// and reduction variables that were found for a given vectorization factor.
390 class InnerLoopVectorizer {
391 public:
392   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
393                       LoopInfo *LI, DominatorTree *DT,
394                       const TargetLibraryInfo *TLI,
395                       const TargetTransformInfo *TTI, AssumptionCache *AC,
396                       OptimizationRemarkEmitter *ORE, unsigned VecWidth,
397                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
398                       LoopVectorizationCostModel *CM)
399       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
400         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
401         Builder(PSE.getSE()->getContext()),
402         VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
403   virtual ~InnerLoopVectorizer() = default;
404 
405   /// Create a new empty loop. Unlink the old loop and connect the new one.
406   /// Return the pre-header block of the new loop.
407   BasicBlock *createVectorizedLoopSkeleton();
408 
409   /// Widen a single instruction within the innermost loop.
410   void widenInstruction(Instruction &I, VPUser &Operands,
411                         VPTransformState &State);
412 
413   /// Widen a single call instruction within the innermost loop.
414   void widenCallInstruction(CallInst &I, VPUser &ArgOperands,
415                             VPTransformState &State);
416 
417   /// Widen a single select instruction within the innermost loop.
418   void widenSelectInstruction(SelectInst &I, bool InvariantCond);
419 
420   /// Fix the vectorized code, taking care of header phis, live-outs, and more.
421   void fixVectorizedLoop();
422 
423   /// Return true if any runtime check is added.
424   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
425 
426   /// A type for vectorized values in the new loop. Each value from the
427   /// original loop, when vectorized, is represented by UF vector values in the
428   /// new unrolled loop, where UF is the unroll factor.
429   using VectorParts = SmallVector<Value *, 2>;
430 
431   /// Vectorize a single GetElementPtrInst based on information gathered and
432   /// decisions taken during planning.
433   void widenGEP(GetElementPtrInst *GEP, unsigned UF, unsigned VF,
434                 bool IsPtrLoopInvariant, SmallBitVector &IsIndexLoopInvariant);
435 
436   /// Vectorize a single PHINode in a block. This method handles the induction
437   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
438   /// arbitrary length vectors.
439   void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
440 
441   /// A helper function to scalarize a single Instruction in the innermost loop.
442   /// Generates a sequence of scalar instances for each lane between \p MinLane
443   /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
444   /// inclusive.
445   void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
446                             bool IfPredicateInstr);
447 
448   /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
449   /// is provided, the integer induction variable will first be truncated to
450   /// the corresponding type.
451   void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
452 
453   /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
454   /// vector or scalar value on-demand if one is not yet available. When
455   /// vectorizing a loop, we visit the definition of an instruction before its
456   /// uses. When visiting the definition, we either vectorize or scalarize the
457   /// instruction, creating an entry for it in the corresponding map. (In some
458   /// cases, such as induction variables, we will create both vector and scalar
459   /// entries.) Then, as we encounter uses of the definition, we derive values
460   /// for each scalar or vector use unless such a value is already available.
461   /// For example, if we scalarize a definition and one of its uses is vector,
462   /// we build the required vector on-demand with an insertelement sequence
463   /// when visiting the use. Otherwise, if the use is scalar, we can use the
464   /// existing scalar definition.
465   ///
466   /// Return a value in the new loop corresponding to \p V from the original
467   /// loop at unroll index \p Part. If the value has already been vectorized,
468   /// the corresponding vector entry in VectorLoopValueMap is returned. If,
469   /// however, the value has a scalar entry in VectorLoopValueMap, we construct
470   /// a new vector value on-demand by inserting the scalar values into a vector
471   /// with an insertelement sequence. If the value has been neither vectorized
472   /// nor scalarized, it must be loop invariant, so we simply broadcast the
473   /// value into a vector.
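  /// For example (illustrative), if a definition was scalarized for UF = 2 and
  /// VF = 4 and a later use needs it as a vector, getOrCreateVectorValue
  /// assembles the four scalar lanes of the requested part with an
  /// insertelement sequence.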
474   Value *getOrCreateVectorValue(Value *V, unsigned Part);
475 
476   /// Return a value in the new loop corresponding to \p V from the original
477   /// loop at unroll and vector indices \p Instance. If the value has been
478   /// vectorized but not scalarized, the necessary extractelement instruction
479   /// will be generated.
480   Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
481 
482   /// Construct the vector value of a scalarized value \p V one lane at a time.
483   void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
484 
485   /// Try to vectorize interleaved access group \p Group with the base address
486   /// given in \p Addr, optionally masking the vector operations if \p
487   /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
488   /// values in the vectorized loop.
489   void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
490                                 VPTransformState &State, VPValue *Addr,
491                                 VPValue *BlockInMask = nullptr);
492 
493   /// Vectorize Load and Store instructions with the base address given in \p
494   /// Addr, optionally masking the vector operations if \p BlockInMask is
495   /// non-null. Use \p State to translate given VPValues to IR values in the
496   /// vectorized loop.
497   void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
498                                   VPValue *Addr, VPValue *StoredValue,
499                                   VPValue *BlockInMask);
500 
501   /// Set the debug location in the builder using the debug location in
502   /// the instruction.
503   void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
504 
505   /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
506   void fixNonInductionPHIs();
507 
508 protected:
509   friend class LoopVectorizationPlanner;
510 
511   /// A small list of PHINodes.
512   using PhiVector = SmallVector<PHINode *, 4>;
513 
514   /// A type for scalarized values in the new loop. Each value from the
515   /// original loop, when scalarized, is represented by UF x VF scalar values
516   /// in the new unrolled loop, where UF is the unroll factor and VF is the
517   /// vectorization factor.
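  /// For example (illustrative), with UF = 2 and VF = 4, each original scalar
  /// value is represented by 2 parts of 4 lanes each, i.e. 8 scalar values.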
518   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
519 
520   /// Set up the values of the IVs correctly when exiting the vector loop.
521   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
522                     Value *CountRoundDown, Value *EndValue,
523                     BasicBlock *MiddleBlock);
524 
525   /// Create a new induction variable inside L.
526   PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
527                                    Value *Step, Instruction *DL);
528 
529   /// Handle all cross-iteration phis in the header.
530   void fixCrossIterationPHIs();
531 
532   /// Fix a first-order recurrence. This is the second phase of vectorizing
533   /// this phi node.
534   void fixFirstOrderRecurrence(PHINode *Phi);
535 
536   /// Fix a reduction cross-iteration phi. This is the second phase of
537   /// vectorizing this phi node.
538   void fixReduction(PHINode *Phi);
539 
540   /// Clear NSW/NUW flags from reduction instructions if necessary.
541   void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);
542 
543   /// The loop exit block may have single-value PHI nodes with some
544   /// incoming value. While vectorizing, we only handle real values
545   /// that were defined inside the loop, and we should have one value for
546   /// each predecessor of its parent basic block. See PR14725.
547   void fixLCSSAPHIs();
548 
549   /// Iteratively sink the scalarized operands of a predicated instruction into
550   /// the block that was created for it.
551   void sinkScalarOperands(Instruction *PredInst);
552 
553   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
554   /// represented as.
555   void truncateToMinimalBitwidths();
556 
557   /// Create a broadcast instruction. This method generates a broadcast
558   /// instruction (shuffle) for loop invariant values and for the induction
559   /// value. If this is the induction variable then we extend it to N, N+1, ...;
560   /// this is needed because each iteration in the loop corresponds to a SIMD
561   /// element.
562   virtual Value *getBroadcastInstrs(Value *V);
563 
564   /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
565   /// to each vector element of Val. The sequence starts at StartIdx.
566   /// \p Opcode is relevant for FP induction variable.
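  /// For example (illustrative), with StartIdx = 0, Step = 1 and VF = 4, a
  /// splat <5, 5, 5, 5> becomes <5, 6, 7, 8>.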
567   virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
568                                Instruction::BinaryOps Opcode =
569                                Instruction::BinaryOpsEnd);
570 
571   /// Compute scalar induction steps. \p ScalarIV is the scalar induction
572   /// variable on which to base the steps, \p Step is the size of the step, and
573   /// \p EntryVal is the value from the original loop that maps to the steps.
574   /// Note that \p EntryVal doesn't have to be an induction variable - it
575   /// can also be a truncate instruction.
576   void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
577                         const InductionDescriptor &ID);
578 
579   /// Create a vector induction phi node based on an existing scalar one. \p
580   /// EntryVal is the value from the original loop that maps to the vector phi
581   /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
582   /// truncate instruction, instead of widening the original IV, we widen a
583   /// version of the IV truncated to \p EntryVal's type.
584   void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
585                                        Value *Step, Instruction *EntryVal);
586 
587   /// Returns true if an instruction \p I should be scalarized instead of
588   /// vectorized for the chosen vectorization factor.
589   bool shouldScalarizeInstruction(Instruction *I) const;
590 
591   /// Returns true if we should generate a scalar version of \p IV.
592   bool needsScalarInduction(Instruction *IV) const;
593 
594   /// If there is a cast involved in the induction variable \p ID, which should
595   /// be ignored in the vectorized loop body, this function records the
596   /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
597   /// cast. We had already proved that the casted Phi is equal to the uncasted
598   /// Phi in the vectorized loop (under a runtime guard), and therefore
599   /// there is no need to vectorize the cast - the same value can be used in the
600   /// vector loop for both the Phi and the cast.
601   /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
602   /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
603   ///
604   /// \p EntryVal is the value from the original loop that maps to the vector
605   /// phi node and is used to distinguish what is the IV currently being
606   /// processed - original one (if \p EntryVal is a phi corresponding to the
607   /// original IV) or the "newly-created" one based on the proof mentioned above
608   /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In the
609   /// latter case \p EntryVal is a TruncInst and we must not record anything for
610   /// that IV, but it's error-prone to expect callers of this routine to care
611   /// about that, hence this explicit parameter.
612   void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
613                                              const Instruction *EntryVal,
614                                              Value *VectorLoopValue,
615                                              unsigned Part,
616                                              unsigned Lane = UINT_MAX);
617 
618   /// Generate a shuffle sequence that will reverse the vector Vec.
619   virtual Value *reverseVector(Value *Vec);
620 
621   /// Returns (and creates if needed) the original loop trip count.
622   Value *getOrCreateTripCount(Loop *NewLoop);
623 
624   /// Returns (and creates if needed) the trip count of the widened loop.
625   Value *getOrCreateVectorTripCount(Loop *NewLoop);
626 
627   /// Returns a bitcasted value to the requested vector type.
628   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
629   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
630                                 const DataLayout &DL);
631 
632   /// Emit a bypass check to see if the vector trip count is zero, including if
633   /// it overflows.
634   void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
635 
636   /// Emit a bypass check to see if all of the SCEV assumptions we've
637   /// had to make are correct.
638   void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
639 
640   /// Emit bypass checks to check any memory assumptions we may have made.
641   void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
642 
643   /// Compute the transformed value of Index at offset StartValue using step
644   /// StepValue.
645   /// For integer induction, returns StartValue + Index * StepValue.
646   /// For pointer induction, returns StartValue[Index * StepValue].
647   /// FIXME: The newly created binary instructions should contain nsw/nuw
648   /// flags, which can be found from the original scalar operations.
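  /// For example (illustrative), an integer induction with StartValue 7 and
  /// StepValue 3 transforms Index = 4 into 7 + 4 * 3 = 19.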
649   Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
650                               const DataLayout &DL,
651                               const InductionDescriptor &ID) const;
652 
653   /// Add additional metadata to \p To that was not present on \p Orig.
654   ///
655   /// Currently this is used to add the noalias annotations based on the
656   /// inserted memchecks.  Use this for instructions that are *cloned* into the
657   /// vector loop.
658   void addNewMetadata(Instruction *To, const Instruction *Orig);
659 
660   /// Add metadata from one instruction to another.
661   ///
662   /// This includes both the original MDs from \p From and additional ones (\see
663   /// addNewMetadata).  Use this for *newly created* instructions in the vector
664   /// loop.
665   void addMetadata(Instruction *To, Instruction *From);
666 
667   /// Similar to the previous function but it adds the metadata to a
668   /// vector of instructions.
669   void addMetadata(ArrayRef<Value *> To, Instruction *From);
670 
671   /// The original loop.
672   Loop *OrigLoop;
673 
674   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
675   /// dynamic knowledge to simplify SCEV expressions and converts them to a
676   /// more usable form.
677   PredicatedScalarEvolution &PSE;
678 
679   /// Loop Info.
680   LoopInfo *LI;
681 
682   /// Dominator Tree.
683   DominatorTree *DT;
684 
685   /// Alias Analysis.
686   AliasAnalysis *AA;
687 
688   /// Target Library Info.
689   const TargetLibraryInfo *TLI;
690 
691   /// Target Transform Info.
692   const TargetTransformInfo *TTI;
693 
694   /// Assumption Cache.
695   AssumptionCache *AC;
696 
697   /// Interface to emit optimization remarks.
698   OptimizationRemarkEmitter *ORE;
699 
700   /// LoopVersioning.  It's only set up (non-null) if memchecks were
701   /// used.
702   ///
703   /// This is currently only used to add no-alias metadata based on the
704   /// memchecks.  The actual versioning is performed manually.
705   std::unique_ptr<LoopVersioning> LVer;
706 
707   /// The vectorization SIMD factor to use. Each vector will have this many
708   /// vector elements.
709   unsigned VF;
710 
711   /// The vectorization unroll factor to use. Each scalar is vectorized to this
712   /// many different vector instructions.
713   unsigned UF;
714 
715   /// The builder that we use.
716   IRBuilder<> Builder;
717 
718   // --- Vectorization state ---
719 
720   /// The vector-loop preheader.
721   BasicBlock *LoopVectorPreHeader;
722 
723   /// The scalar-loop preheader.
724   BasicBlock *LoopScalarPreHeader;
725 
726   /// Middle block between the vector loop and the scalar loop.
727   BasicBlock *LoopMiddleBlock;
728 
729   /// The ExitBlock of the scalar loop.
730   BasicBlock *LoopExitBlock;
731 
732   /// The vector loop body.
733   BasicBlock *LoopVectorBody;
734 
735   /// The scalar loop body.
736   BasicBlock *LoopScalarBody;
737 
738   /// A list of all bypass blocks. The first block is the entry of the loop.
739   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
740 
741   /// The new Induction variable which was added to the new block.
742   PHINode *Induction = nullptr;
743 
744   /// The induction variable of the old basic block.
745   PHINode *OldInduction = nullptr;
746 
747   /// Maps values from the original loop to their corresponding values in the
748   /// vectorized loop. A key value can map to either vector values, scalar
749   /// values or both kinds of values, depending on whether the key was
750   /// vectorized and scalarized.
751   VectorizerValueMap VectorLoopValueMap;
752 
753   /// Store instructions that were predicated.
754   SmallVector<Instruction *, 4> PredicatedInstructions;
755 
756   /// Trip count of the original loop.
757   Value *TripCount = nullptr;
758 
759   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
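  /// For example (illustrative), TripCount = 1003 with VF = 4 and UF = 2 gives
  /// VectorTripCount = 1003 - (1003 % 8) = 1000.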
760   Value *VectorTripCount = nullptr;
761 
762   /// The legality analysis.
763   LoopVectorizationLegality *Legal;
764 
765   /// The profitability analysis.
766   LoopVectorizationCostModel *Cost;
767 
768   // Record whether runtime checks are added.
769   bool AddedSafetyChecks = false;
770 
771   // Holds the end values for each induction variable. We save the end values
772   // so we can later fix-up the external users of the induction variables.
773   DenseMap<PHINode *, Value *> IVEndValues;
774 
775   // Vector of original scalar PHIs whose corresponding widened PHIs need to be
776   // fixed up at the end of vector code generation.
777   SmallVector<PHINode *, 8> OrigPHIsToFix;
778 };
779 
780 class InnerLoopUnroller : public InnerLoopVectorizer {
781 public:
782   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
783                     LoopInfo *LI, DominatorTree *DT,
784                     const TargetLibraryInfo *TLI,
785                     const TargetTransformInfo *TTI, AssumptionCache *AC,
786                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
787                     LoopVectorizationLegality *LVL,
788                     LoopVectorizationCostModel *CM)
789       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
790                             UnrollFactor, LVL, CM) {}
791 
792 private:
793   Value *getBroadcastInstrs(Value *V) override;
794   Value *getStepVector(Value *Val, int StartIdx, Value *Step,
795                        Instruction::BinaryOps Opcode =
796                        Instruction::BinaryOpsEnd) override;
797   Value *reverseVector(Value *Vec) override;
798 };
799 
800 } // end namespace llvm
801 
802 /// Look for a meaningful debug location on the instruction or its
803 /// operands.
804 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
805   if (!I)
806     return I;
807 
808   DebugLoc Empty;
809   if (I->getDebugLoc() != Empty)
810     return I;
811 
812   for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
813     if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
814       if (OpInst->getDebugLoc() != Empty)
815         return OpInst;
816   }
817 
818   return I;
819 }
820 
821 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
822   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
823     const DILocation *DIL = Inst->getDebugLoc();
824     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
825         !isa<DbgInfoIntrinsic>(Inst)) {
826       auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
827       if (NewDIL)
828         B.SetCurrentDebugLocation(NewDIL.getValue());
829       else
830         LLVM_DEBUG(dbgs()
831                    << "Failed to create new discriminator: "
832                    << DIL->getFilename() << " Line: " << DIL->getLine());
833     }
834     else
835       B.SetCurrentDebugLocation(DIL);
836   } else
837     B.SetCurrentDebugLocation(DebugLoc());
838 }
839 
840 /// Write a record \p DebugMsg about vectorization failure to the debug
841 /// output stream. If \p I is passed, it is an instruction that prevents
842 /// vectorization.
843 #ifndef NDEBUG
844 static void debugVectorizationFailure(const StringRef DebugMsg,
845     Instruction *I) {
846   dbgs() << "LV: Not vectorizing: " << DebugMsg;
847   if (I != nullptr)
848     dbgs() << " " << *I;
849   else
850     dbgs() << '.';
851   dbgs() << '\n';
852 }
853 #endif
854 
855 /// Create an analysis remark that explains why vectorization failed
856 ///
857 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
858 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
859 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
860 /// the location of the remark.  \return the remark object that can be
861 /// streamed to.
862 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
863     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
864   Value *CodeRegion = TheLoop->getHeader();
865   DebugLoc DL = TheLoop->getStartLoc();
866 
867   if (I) {
868     CodeRegion = I->getParent();
869     // If there is no debug location attached to the instruction, fall back to
870     // using the loop's.
871     if (I->getDebugLoc())
872       DL = I->getDebugLoc();
873   }
874 
875   OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
876   R << "loop not vectorized: ";
877   return R;
878 }
879 
880 namespace llvm {
881 
882 void reportVectorizationFailure(const StringRef DebugMsg,
883     const StringRef OREMsg, const StringRef ORETag,
884     OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
885   LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
886   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
887   ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
888                 ORETag, TheLoop, I) << OREMsg);
889 }
890 
891 } // end namespace llvm
892 
893 #ifndef NDEBUG
894 /// \return string containing a file name and a line # for the given loop.
895 static std::string getDebugLocString(const Loop *L) {
896   std::string Result;
897   if (L) {
898     raw_string_ostream OS(Result);
899     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
900       LoopDbgLoc.print(OS);
901     else
902       // Just print the module name.
903       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
904     OS.flush();
905   }
906   return Result;
907 }
908 #endif
909 
910 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
911                                          const Instruction *Orig) {
912   // If the loop was versioned with memchecks, add the corresponding no-alias
913   // metadata.
914   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
915     LVer->annotateInstWithNoAlias(To, Orig);
916 }
917 
918 void InnerLoopVectorizer::addMetadata(Instruction *To,
919                                       Instruction *From) {
920   propagateMetadata(To, From);
921   addNewMetadata(To, From);
922 }
923 
924 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
925                                       Instruction *From) {
926   for (Value *V : To) {
927     if (Instruction *I = dyn_cast<Instruction>(V))
928       addMetadata(I, From);
929   }
930 }
931 
932 namespace llvm {
933 
934 // Loop vectorization cost-model hints for how the scalar epilogue loop
935 // should be lowered.
936 enum ScalarEpilogueLowering {
937 
938   // The default: allowing scalar epilogues.
939   CM_ScalarEpilogueAllowed,
940 
941   // Vectorization with OptForSize: don't allow epilogues.
942   CM_ScalarEpilogueNotAllowedOptSize,
943 
944   // A special case of vectorization with OptForSize: loops with a very small
945   // trip count are considered for vectorization under OptForSize, thereby
946   // making sure the cost of their loop body is dominant, free of runtime
947   // guards and scalar iteration overheads.
948   CM_ScalarEpilogueNotAllowedLowTripLoop,
949 
950   // Loop hint predicate indicating an epilogue is undesired.
951   CM_ScalarEpilogueNotNeededUsePredicate
952 };
953 
954 /// LoopVectorizationCostModel - estimates the expected speedups due to
955 /// vectorization.
956 /// In many cases vectorization is not profitable. This can happen because of
957 /// a number of reasons. In this class we mainly attempt to predict the
958 /// expected speedup/slowdowns due to the supported instruction set. We use the
959 /// TargetTransformInfo to query the different backends for the cost of
960 /// different operations.
961 class LoopVectorizationCostModel {
962 public:
963   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
964                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
965                              LoopVectorizationLegality *Legal,
966                              const TargetTransformInfo &TTI,
967                              const TargetLibraryInfo *TLI, DemandedBits *DB,
968                              AssumptionCache *AC,
969                              OptimizationRemarkEmitter *ORE, const Function *F,
970                              const LoopVectorizeHints *Hints,
971                              InterleavedAccessInfo &IAI)
972       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
973         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
974         Hints(Hints), InterleaveInfo(IAI) {}
975 
976   /// \return An upper bound for the vectorization factor, or None if
977   /// vectorization and interleaving should be avoided up front.
978   Optional<unsigned> computeMaxVF();
979 
980   /// \return True if runtime checks are required for vectorization, and false
981   /// otherwise.
982   bool runtimeChecksRequired();
983 
984   /// \return The most profitable vectorization factor and the cost of that VF.
985   /// This method checks every power of two up to MaxVF. If UserVF is not zero,
986   /// then this vectorization factor will be selected if vectorization is
987   /// possible.
988   VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
989 
990   /// Setup cost-based decisions for user vectorization factor.
991   void selectUserVectorizationFactor(unsigned UserVF) {
992     collectUniformsAndScalars(UserVF);
993     collectInstsToScalarize(UserVF);
994   }
995 
996   /// \return The size (in bits) of the smallest and widest types in the code
997   /// that needs to be vectorized. We ignore values that remain scalar such as
998   /// 64 bit loop indices.
999   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1000 
1001   /// \return The desired interleave count.
1002   /// If interleave count has been specified by metadata it will be returned.
1003   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1004   /// are the selected vectorization factor and the cost of the selected VF.
1005   unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);
1006 
1007   /// A memory access instruction may be vectorized in more than one way.
1008   /// The form of the instruction after vectorization depends on its cost.
1009   /// This function takes cost-based decisions for Load/Store instructions
1010   /// and collects them in a map. This decision map is used for building
1011   /// the lists of loop-uniform and loop-scalar instructions.
1012   /// The calculated cost is saved with the widening decision in order to
1013   /// avoid redundant calculations.
1014   void setCostBasedWideningDecision(unsigned VF);
1015 
1016   /// A struct that represents some properties of the register usage
1017   /// of a loop.
1018   struct RegisterUsage {
1019     /// Holds the number of loop invariant values that are used in the loop.
1020     /// The key is ClassID of target-provided register class.
1021     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1022     /// Holds the maximum number of concurrent live intervals in the loop.
1023     /// The key is ClassID of target-provided register class.
1024     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1025   };
1026 
1027   /// \return Returns information about the register usages of the loop for the
1028   /// given vectorization factors.
1029   SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);
1030 
1031   /// Collect values we want to ignore in the cost model.
1032   void collectValuesToIgnore();
1033 
1034   /// \returns The smallest bitwidth each instruction can be represented with.
1035   /// The vector equivalents of these instructions should be truncated to this
1036   /// type.
1037   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1038     return MinBWs;
1039   }
1040 
1041   /// \returns True if it is more profitable to scalarize instruction \p I for
1042   /// vectorization factor \p VF.
1043   bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
1044     assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");
1045 
1046     // Cost model is not run in the VPlan-native path - return conservative
1047     // result until this changes.
1048     if (EnableVPlanNativePath)
1049       return false;
1050 
1051     auto Scalars = InstsToScalarize.find(VF);
1052     assert(Scalars != InstsToScalarize.end() &&
1053            "VF not yet analyzed for scalarization profitability");
1054     return Scalars->second.find(I) != Scalars->second.end();
1055   }
1056 
1057   /// Returns true if \p I is known to be uniform after vectorization.
1058   bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
1059     if (VF == 1)
1060       return true;
1061 
1062     // Cost model is not run in the VPlan-native path - return conservative
1063     // result until this changes.
1064     if (EnableVPlanNativePath)
1065       return false;
1066 
1067     auto UniformsPerVF = Uniforms.find(VF);
1068     assert(UniformsPerVF != Uniforms.end() &&
1069            "VF not yet analyzed for uniformity");
1070     return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
1071   }
1072 
1073   /// Returns true if \p I is known to be scalar after vectorization.
1074   bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
1075     if (VF == 1)
1076       return true;
1077 
1078     // Cost model is not run in the VPlan-native path - return conservative
1079     // result until this changes.
1080     if (EnableVPlanNativePath)
1081       return false;
1082 
1083     auto ScalarsPerVF = Scalars.find(VF);
1084     assert(ScalarsPerVF != Scalars.end() &&
1085            "Scalar values are not calculated for VF");
1086     return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
1087   }
1088 
1089   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1090   /// for vectorization factor \p VF.
1091   bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
1092     return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
1093            !isProfitableToScalarize(I, VF) &&
1094            !isScalarAfterVectorization(I, VF);
1095   }
1096 
1097   /// Decision that was taken during cost calculation for memory instruction.
1098   enum InstWidening {
1099     CM_Unknown,
1100     CM_Widen,         // For consecutive accesses with stride +1.
1101     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1102     CM_Interleave,
1103     CM_GatherScatter,
1104     CM_Scalarize
1105   };
1106 
1107   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1108   /// instruction \p I and vector width \p VF.
1109   void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
1110                            unsigned Cost) {
1111     assert(VF >= 2 && "Expected VF >=2");
1112     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1113   }
1114 
1115   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1116   /// interleaving group \p Grp and vector width \p VF.
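  /// For example (illustrative), for a group with factor 2 whose insert
  /// position is member A, A records (W, Cost) while the other member records
  /// (W, 0), so the group's cost is counted exactly once.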
1117   void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
1118                            InstWidening W, unsigned Cost) {
1119     assert(VF >= 2 && "Expected VF >=2");
1120     /// Broadcast this decision to all instructions inside the group.
1121     /// But the cost will be assigned to one instruction only.
1122     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1123       if (auto *I = Grp->getMember(i)) {
1124         if (Grp->getInsertPos() == I)
1125           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1126         else
1127           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1128       }
1129     }
1130   }
1131 
1132   /// Return the cost model decision for the given instruction \p I and vector
1133   /// width \p VF. Return CM_Unknown if this instruction did not pass
1134   /// through the cost modeling.
1135   InstWidening getWideningDecision(Instruction *I, unsigned VF) {
1136     assert(VF >= 2 && "Expected VF >=2");
1137 
1138     // Cost model is not run in the VPlan-native path - return conservative
1139     // result until this changes.
1140     if (EnableVPlanNativePath)
1141       return CM_GatherScatter;
1142 
1143     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1144     auto Itr = WideningDecisions.find(InstOnVF);
1145     if (Itr == WideningDecisions.end())
1146       return CM_Unknown;
1147     return Itr->second.first;
1148   }
1149 
1150   /// Return the vectorization cost for the given instruction \p I and vector
1151   /// width \p VF.
1152   unsigned getWideningCost(Instruction *I, unsigned VF) {
1153     assert(VF >= 2 && "Expected VF >=2");
1154     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1155     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1156            "The cost is not calculated");
1157     return WideningDecisions[InstOnVF].second;
1158   }
1159 
1160   /// Return True if instruction \p I is an optimizable truncate whose operand
1161   /// is an induction variable. Such a truncate will be removed by adding a new
1162   /// induction variable with the destination type.
1163   bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
1164     // If the instruction is not a truncate, return false.
1165     auto *Trunc = dyn_cast<TruncInst>(I);
1166     if (!Trunc)
1167       return false;
1168 
1169     // Get the source and destination types of the truncate.
1170     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1171     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1172 
1173     // If the truncate is free for the given types, return false. Replacing a
1174     // free truncate with an induction variable would add an induction variable
1175     // update instruction to each iteration of the loop. We exclude from this
1176     // check the primary induction variable since it will need an update
1177     // instruction regardless.
1178     Value *Op = Trunc->getOperand(0);
1179     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1180       return false;
1181 
1182     // If the truncated value is not an induction variable, return false.
1183     return Legal->isInductionPhi(Op);
1184   }
1185 
1186   /// Collects the instructions to scalarize for each predicated instruction in
1187   /// the loop.
1188   void collectInstsToScalarize(unsigned VF);
1189 
1190   /// Collect Uniform and Scalar values for the given \p VF.
1191   /// The sets depend on CM decision for Load/Store instructions
1192   /// that may be vectorized as interleave, gather-scatter or scalarized.
1193   void collectUniformsAndScalars(unsigned VF) {
1194     // Do the analysis once.
1195     if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
1196       return;
1197     setCostBasedWideningDecision(VF);
1198     collectLoopUniforms(VF);
1199     collectLoopScalars(VF);
1200   }
1201 
1202   /// Returns true if the target machine supports masked store operation
1203   /// for the given \p DataType and kind of access to \p Ptr.
1204   bool isLegalMaskedStore(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
1205     return Legal->isConsecutivePtr(Ptr) &&
1206            TTI.isLegalMaskedStore(DataType, Alignment);
1207   }
1208 
1209   /// Returns true if the target machine supports masked load operation
1210   /// for the given \p DataType and kind of access to \p Ptr.
1211   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
1212     return Legal->isConsecutivePtr(Ptr) &&
1213            TTI.isLegalMaskedLoad(DataType, Alignment);
1214   }
1215 
1216   /// Returns true if the target machine supports masked scatter operation
1217   /// for the given \p DataType.
1218   bool isLegalMaskedScatter(Type *DataType, MaybeAlign Alignment) {
1219     return TTI.isLegalMaskedScatter(DataType, Alignment);
1220   }
1221 
1222   /// Returns true if the target machine supports masked gather operation
1223   /// for the given \p DataType.
1224   bool isLegalMaskedGather(Type *DataType, MaybeAlign Alignment) {
1225     return TTI.isLegalMaskedGather(DataType, Alignment);
1226   }
1227 
1228   /// Returns true if the target machine can represent \p V as a masked gather
1229   /// or scatter operation.
1230   bool isLegalGatherOrScatter(Value *V) {
1231     bool LI = isa<LoadInst>(V);
1232     bool SI = isa<StoreInst>(V);
1233     if (!LI && !SI)
1234       return false;
1235     auto *Ty = getMemInstValueType(V);
1236     MaybeAlign Align = getLoadStoreAlignment(V);
1237     return (LI && isLegalMaskedGather(Ty, Align)) ||
1238            (SI && isLegalMaskedScatter(Ty, Align));
1239   }
1240 
1241   /// Returns true if \p I is an instruction that will be scalarized with
1242   /// predication. Such instructions include conditional stores and
1243   /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
1246   bool isScalarWithPredication(Instruction *I, unsigned VF = 1);
1247 
1248   // Returns true if \p I is an instruction that will be predicated either
1249   // through scalar predication or masked load/store or masked gather/scatter.
1250   // Superset of instructions that return true for isScalarWithPredication.
1251   bool isPredicatedInst(Instruction *I) {
1252     if (!blockNeedsPredication(I->getParent()))
1253       return false;
1254     // Loads and stores that need some form of masked operation are predicated
1255     // instructions.
1256     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1257       return Legal->isMaskRequired(I);
1258     return isScalarWithPredication(I);
1259   }
1260 
1261   /// Returns true if \p I is a memory instruction with consecutive memory
1262   /// access that can be widened.
1263   bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
1264 
1265   /// Returns true if \p I is a memory instruction in an interleaved-group
1266   /// of memory accesses that can be vectorized with wide vector loads/stores
1267   /// and shuffles.
1268   bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);
1269 
1270   /// Check if \p Instr belongs to any interleaved access group.
1271   bool isAccessInterleaved(Instruction *Instr) {
1272     return InterleaveInfo.isInterleaved(Instr);
1273   }
1274 
1275   /// Get the interleaved access group that \p Instr belongs to.
1276   const InterleaveGroup<Instruction> *
1277   getInterleavedAccessGroup(Instruction *Instr) {
1278     return InterleaveInfo.getInterleaveGroup(Instr);
1279   }
1280 
1281   /// Returns true if an interleaved group requires a scalar iteration
1282   /// to handle accesses with gaps, and there is nothing preventing us from
1283   /// creating a scalar epilogue.
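  ///
  /// For example (illustrative only), a load group that accesses A[3*i] and
  /// A[3*i+1] but not A[3*i+2] has a gap; the wide load emitted for the last
  /// vector iteration could read past the last element the scalar loop would
  /// access, so the final iterations are kept in a scalar epilogue.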
1284   bool requiresScalarEpilogue() const {
1285     return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
1286   }
1287 
  /// Returns true if a scalar epilogue is allowed, i.e. it has not been
  /// disallowed due to optsize or a loop hint annotation.
1290   bool isScalarEpilogueAllowed() const {
1291     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1292   }
1293 
  /// Returns true if all loop blocks should be masked to fold the loop tail.
1295   bool foldTailByMasking() const { return FoldTailByMasking; }
1296 
1297   bool blockNeedsPredication(BasicBlock *BB) {
1298     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1299   }
1300 
1301   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1302   /// with factor VF.  Return the cost of the instruction, including
1303   /// scalarization overhead if it's needed.
1304   unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);
1305 
1306   /// Estimate cost of a call instruction CI if it were vectorized with factor
1307   /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized - i.e. either a vector version isn't available, or it is too
  /// expensive.
1311   unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);
1312 
1313   /// Invalidates decisions already taken by the cost model.
1314   void invalidateCostModelingDecisions() {
1315     WideningDecisions.clear();
1316     Uniforms.clear();
1317     Scalars.clear();
1318   }
1319 
1320 private:
1321   unsigned NumPredStores = 0;
1322 
1323   /// \return An upper bound for the vectorization factor, larger than zero.
1324   /// One is returned if vectorization should best be avoided due to cost.
1325   unsigned computeFeasibleMaxVF(unsigned ConstTripCount);
1326 
  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
1334   using VectorizationCostTy = std::pair<unsigned, bool>;
1335 
1336   /// Returns the expected execution cost. The unit of the cost does
1337   /// not matter because we use the 'cost' units to compare different
1338   /// vector widths. The cost that is returned is *not* normalized by
1339   /// the factor width.
1340   VectorizationCostTy expectedCost(unsigned VF);
1341 
1342   /// Returns the execution time cost of an instruction for a given vector
1343   /// width. Vector width of one means scalar.
1344   VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
1345 
1346   /// The cost-computation logic from getInstructionCost which provides
1347   /// the vector type as an output parameter.
1348   unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
1349 
1350   /// Calculate vectorization cost of memory instruction \p I.
1351   unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);
1352 
1353   /// The cost computation for scalarized memory instruction.
1354   unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
1355 
1356   /// The cost computation for interleaving group of memory instructions.
1357   unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
1358 
1359   /// The cost computation for Gather/Scatter instruction.
1360   unsigned getGatherScatterCost(Instruction *I, unsigned VF);
1361 
1362   /// The cost computation for widening instruction \p I with consecutive
1363   /// memory access.
1364   unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
1365 
  /// The cost calculation for Load/Store instruction \p I with uniform
  /// pointer -
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
  /// element).
1370   unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
1371 
1372   /// Estimate the overhead of scalarizing an instruction. This is a
1373   /// convenience wrapper for the type-based getScalarizationOverhead API.
1374   unsigned getScalarizationOverhead(Instruction *I, unsigned VF);
1375 
  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
1378   bool isConsecutiveLoadOrStore(Instruction *I);
1379 
1380   /// Returns true if an artificially high cost for emulated masked memrefs
1381   /// should be used.
1382   bool useEmulatedMaskMemRefHack(Instruction *I);
1383 
1384   /// Map of scalar integer values to the smallest bitwidth they can be legally
1385   /// represented as. The vector equivalents of these values should be truncated
1386   /// to this type.
1387   MapVector<Instruction *, uint64_t> MinBWs;
1388 
1389   /// A type representing the costs for instructions if they were to be
1390   /// scalarized rather than vectorized. The entries are Instruction-Cost
1391   /// pairs.
1392   using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1393 
  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
1396   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1397 
1398   /// Records whether it is allowed to have the original scalar loop execute at
1399   /// least once. This may be needed as a fallback loop in case runtime
1400   /// aliasing/dependence checks fail, or to handle the tail/remainder
1401   /// iterations when the trip count is unknown or doesn't divide by the VF,
1402   /// or as a peel-loop to handle gaps in interleave-groups.
  /// Under optsize and when the trip count is very small, we don't allow any
1404   /// iterations to execute in the scalar loop.
1405   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1406 
  /// All blocks of the loop are to be masked to fold the tail of the scalar
  /// iterations.
1408   bool FoldTailByMasking = false;
1409 
1410   /// A map holding scalar costs for different vectorization factors. The
1411   /// presence of a cost for an instruction in the mapping indicates that the
1412   /// instruction will be scalarized when vectorizing with the associated
1413   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1414   DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
1415 
1416   /// Holds the instructions known to be uniform after vectorization.
1417   /// The data is collected per VF.
1418   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;
1419 
1420   /// Holds the instructions known to be scalar after vectorization.
1421   /// The data is collected per VF.
1422   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;
1423 
1424   /// Holds the instructions (address computations) that are forced to be
1425   /// scalarized.
1426   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1427 
1428   /// Returns the expected difference in cost from scalarizing the expression
1429   /// feeding a predicated instruction \p PredInst. The instructions to
1430   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1431   /// non-negative return value implies the expression will be scalarized.
1432   /// Currently, only single-use chains are considered for scalarization.
1433   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1434                               unsigned VF);
1435 
1436   /// Collect the instructions that are uniform after vectorization. An
1437   /// instruction is uniform if we represent it with a single scalar value in
1438   /// the vectorized loop corresponding to each vector iteration. Examples of
1439   /// uniform instructions include pointer operands of consecutive or
1440   /// interleaved memory accesses. Note that although uniformity implies an
1441   /// instruction will be scalar, the reverse is not true. In general, a
1442   /// scalarized instruction will be represented by VF scalar values in the
1443   /// vectorized loop, each corresponding to an iteration of the original
1444   /// scalar loop.
1445   void collectLoopUniforms(unsigned VF);
1446 
1447   /// Collect the instructions that are scalar after vectorization. An
1448   /// instruction is scalar if it is known to be uniform or will be scalarized
1449   /// during vectorization. Non-uniform scalarized instructions will be
1450   /// represented by VF values in the vectorized loop, each corresponding to an
1451   /// iteration of the original scalar loop.
1452   void collectLoopScalars(unsigned VF);
1453 
  /// Keeps cost model vectorization decisions and costs for instructions.
  /// Right now it is used for memory instructions only.
1456   using DecisionList = DenseMap<std::pair<Instruction *, unsigned>,
1457                                 std::pair<InstWidening, unsigned>>;
1458 
1459   DecisionList WideningDecisions;
1460 
1461   /// Returns true if \p V is expected to be vectorized and it needs to be
1462   /// extracted.
1463   bool needsExtract(Value *V, unsigned VF) const {
1464     Instruction *I = dyn_cast<Instruction>(V);
1465     if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I))
1466       return false;
1467 
1468     // Assume we can vectorize V (and hence we need extraction) if the
    // scalars are not computed yet. This can happen because it is called
1470     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1471     // the scalars are collected. That should be a safe assumption in most
1472     // cases, because we check if the operands have vectorizable types
1473     // beforehand in LoopVectorizationLegality.
1474     return Scalars.find(VF) == Scalars.end() ||
1475            !isScalarAfterVectorization(I, VF);
1476   };
1477 
1478   /// Returns a range containing only operands needing to be extracted.
1479   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1480                                                    unsigned VF) {
1481     return SmallVector<Value *, 4>(make_filter_range(
1482         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1483   }
1484 
1485 public:
1486   /// The loop that we evaluate.
1487   Loop *TheLoop;
1488 
1489   /// Predicated scalar evolution analysis.
1490   PredicatedScalarEvolution &PSE;
1491 
1492   /// Loop Info analysis.
1493   LoopInfo *LI;
1494 
1495   /// Vectorization legality.
1496   LoopVectorizationLegality *Legal;
1497 
1498   /// Vector target information.
1499   const TargetTransformInfo &TTI;
1500 
1501   /// Target Library Info.
1502   const TargetLibraryInfo *TLI;
1503 
1504   /// Demanded bits analysis.
1505   DemandedBits *DB;
1506 
1507   /// Assumption cache.
1508   AssumptionCache *AC;
1509 
1510   /// Interface to emit optimization remarks.
1511   OptimizationRemarkEmitter *ORE;
1512 
1513   const Function *TheFunction;
1514 
1515   /// Loop Vectorize Hint.
1516   const LoopVectorizeHints *Hints;
1517 
1518   /// The interleave access information contains groups of interleaved accesses
1519   /// with the same stride and close to each other.
1520   InterleavedAccessInfo &InterleaveInfo;
1521 
1522   /// Values to ignore in the cost model.
1523   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1524 
1525   /// Values to ignore in the cost model when VF > 1.
1526   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1527 };
1528 
1529 } // end namespace llvm
1530 
1531 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
1532 // vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If
// the vector length information is not provided, vectorization is not
// considered explicit. Interleave hints are not allowed either. These
// limitations will be relaxed in the future.
// Please note that we are currently forced to abuse the pragma 'clang
1538 // vectorize' semantics. This pragma provides *auto-vectorization hints*
1539 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1540 // provides *explicit vectorization hints* (LV can bypass legal checks and
1541 // assume that vectorization is legal). However, both hints are implemented
1542 // using the same metadata (llvm.loop.vectorize, processed by
1543 // LoopVectorizeHints). This will be fixed in the future when the native IR
1544 // representation for pragma 'omp simd' is introduced.
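//
// For example (an illustrative sketch), an outer loop such as
//   #pragma omp simd simdlen(4)
//   for (int i = 0; i < N; ++i)
//     for (int j = 0; j < M; ++j)
//       A[i][j] += B[i][j];
// is considered an explicitly vectorizable outer loop by this function.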
1545 static bool isExplicitVecOuterLoop(Loop *OuterLp,
1546                                    OptimizationRemarkEmitter *ORE) {
1547   assert(!OuterLp->empty() && "This is not an outer loop");
1548   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1549 
1550   // Only outer loops with an explicit vectorization hint are supported.
1551   // Unannotated outer loops are ignored.
1552   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
1553     return false;
1554 
1555   Function *Fn = OuterLp->getHeader()->getParent();
1556   if (!Hints.allowVectorization(Fn, OuterLp,
1557                                 true /*VectorizeOnlyWhenForced*/)) {
1558     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1559     return false;
1560   }
1561 
1562   if (Hints.getInterleave() > 1) {
1563     // TODO: Interleave support is future work.
1564     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1565                          "outer loops.\n");
1566     Hints.emitRemarkWithHints();
1567     return false;
1568   }
1569 
1570   return true;
1571 }
1572 
1573 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
1574                                   OptimizationRemarkEmitter *ORE,
1575                                   SmallVectorImpl<Loop *> &V) {
1576   // Collect inner loops and outer loops without irreducible control flow. For
1577   // now, only collect outer loops that have explicit vectorization hints. If we
1578   // are stress testing the VPlan H-CFG construction, we collect the outermost
1579   // loop of every loop nest.
1580   if (L.empty() || VPlanBuildStressTest ||
1581       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
1582     LoopBlocksRPO RPOT(&L);
1583     RPOT.perform(LI);
1584     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1585       V.push_back(&L);
1586       // TODO: Collect inner loops inside marked outer loops in case
1587       // vectorization fails for the outer loop. Do not invoke
1588       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1589       // already known to be reducible. We can use an inherited attribute for
1590       // that.
1591       return;
1592     }
1593   }
1594   for (Loop *InnerL : L)
1595     collectSupportedLoops(*InnerL, LI, ORE, V);
1596 }
1597 
1598 namespace {
1599 
1600 /// The LoopVectorize Pass.
1601 struct LoopVectorize : public FunctionPass {
1602   /// Pass identification, replacement for typeid
1603   static char ID;
1604 
1605   LoopVectorizePass Impl;
1606 
1607   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
1608                          bool VectorizeOnlyWhenForced = false)
1609       : FunctionPass(ID),
1610         Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
1611     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
1612   }
1613 
1614   bool runOnFunction(Function &F) override {
1615     if (skipFunction(F))
1616       return false;
1617 
1618     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1619     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1620     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1621     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1622     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
1623     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
1624     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
1625     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1626     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1627     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
1628     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
1629     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1630     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
1631 
1632     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
1633         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
1634 
1635     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
1636                         GetLAA, *ORE, PSI).MadeAnyChange;
1637   }
1638 
1639   void getAnalysisUsage(AnalysisUsage &AU) const override {
1640     AU.addRequired<AssumptionCacheTracker>();
1641     AU.addRequired<BlockFrequencyInfoWrapperPass>();
1642     AU.addRequired<DominatorTreeWrapperPass>();
1643     AU.addRequired<LoopInfoWrapperPass>();
1644     AU.addRequired<ScalarEvolutionWrapperPass>();
1645     AU.addRequired<TargetTransformInfoWrapperPass>();
1646     AU.addRequired<AAResultsWrapperPass>();
1647     AU.addRequired<LoopAccessLegacyAnalysis>();
1648     AU.addRequired<DemandedBitsWrapperPass>();
1649     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
1650     AU.addRequired<InjectTLIMappingsLegacy>();
1651 
1652     // We currently do not preserve loopinfo/dominator analyses with outer loop
1653     // vectorization. Until this is addressed, mark these analyses as preserved
1654     // only for non-VPlan-native path.
1655     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
1656     if (!EnableVPlanNativePath) {
1657       AU.addPreserved<LoopInfoWrapperPass>();
1658       AU.addPreserved<DominatorTreeWrapperPass>();
1659     }
1660 
1661     AU.addPreserved<BasicAAWrapperPass>();
1662     AU.addPreserved<GlobalsAAWrapperPass>();
1663     AU.addRequired<ProfileSummaryInfoWrapperPass>();
1664   }
1665 };
1666 
1667 } // end anonymous namespace
1668 
1669 //===----------------------------------------------------------------------===//
1670 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1671 // LoopVectorizationCostModel and LoopVectorizationPlanner.
1672 //===----------------------------------------------------------------------===//
1673 
1674 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
1675   // We need to place the broadcast of invariant variables outside the loop,
  // but only if it's proven safe to do so. Otherwise, the broadcast will be
  // placed inside the vector loop body.
1678   Instruction *Instr = dyn_cast<Instruction>(V);
1679   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
1680                      (!Instr ||
1681                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
1682   // Place the code for broadcasting invariant variables in the new preheader.
1683   IRBuilder<>::InsertPointGuard Guard(Builder);
1684   if (SafeToHoist)
1685     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1686 
1687   // Broadcast the scalar into all locations in the vector.
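  // For example (a rough sketch with VF = 4 and an i32 value %v; the names
  // are illustrative), the splat is emitted as:
  //   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %v, i32 0
  //   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
  //                      <4 x i32> undef, <4 x i32> zeroinitializer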
1688   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
1689 
1690   return Shuf;
1691 }
1692 
1693 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
1694     const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
1695   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1696          "Expected either an induction phi-node or a truncate of it!");
1697   Value *Start = II.getStartValue();
1698 
1699   // Construct the initial value of the vector IV in the vector loop preheader
1700   auto CurrIP = Builder.saveIP();
1701   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1702   if (isa<TruncInst>(EntryVal)) {
1703     assert(Start->getType()->isIntegerTy() &&
1704            "Truncation requires an integer type");
1705     auto *TruncType = cast<IntegerType>(EntryVal->getType());
1706     Step = Builder.CreateTrunc(Step, TruncType);
1707     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
1708   }
1709   Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
1710   Value *SteppedStart =
1711       getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
1712 
1713   // We create vector phi nodes for both integer and floating-point induction
1714   // variables. Here, we determine the kind of arithmetic we will perform.
1715   Instruction::BinaryOps AddOp;
1716   Instruction::BinaryOps MulOp;
1717   if (Step->getType()->isIntegerTy()) {
1718     AddOp = Instruction::Add;
1719     MulOp = Instruction::Mul;
1720   } else {
1721     AddOp = II.getInductionOpcode();
1722     MulOp = Instruction::FMul;
1723   }
1724 
1725   // Multiply the vectorization factor by the step using integer or
1726   // floating-point arithmetic as appropriate.
1727   Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
1728   Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
1729 
1730   // Create a vector splat to use in the induction update.
1731   //
1732   // FIXME: If the step is non-constant, we create the vector splat with
1733   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
1734   //        handle a constant vector splat.
1735   Value *SplatVF =
1736       isa<Constant>(Mul)
1737           ? ConstantVector::getSplat({VF, false}, cast<Constant>(Mul))
1738           : Builder.CreateVectorSplat(VF, Mul);
1739   Builder.restoreIP(CurrIP);
1740 
1741   // We may need to add the step a number of times, depending on the unroll
1742   // factor. The last of those goes into the PHI.
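  //
  // For example (a rough sketch with VF = 4, UF = 2 and an integer step of 1;
  // names are illustrative), the vector loop ends up with:
  //   %vec.ind = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %preheader ],
  //                            [ %vec.ind.next, %vector.body ]
  //   %step.add = add <4 x i64> %vec.ind, <i64 4, i64 4, i64 4, i64 4>
  //   %vec.ind.next = add <4 x i64> %step.add, <i64 4, i64 4, i64 4, i64 4>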
1743   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
1744                                     &*LoopVectorBody->getFirstInsertionPt());
1745   VecInd->setDebugLoc(EntryVal->getDebugLoc());
1746   Instruction *LastInduction = VecInd;
1747   for (unsigned Part = 0; Part < UF; ++Part) {
1748     VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
1749 
1750     if (isa<TruncInst>(EntryVal))
1751       addMetadata(LastInduction, EntryVal);
1752     recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
1753 
1754     LastInduction = cast<Instruction>(addFastMathFlag(
1755         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
1756     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
1757   }
1758 
1759   // Move the last step to the end of the latch block. This ensures consistent
1760   // placement of all induction updates.
1761   auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
1762   auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
1763   auto *ICmp = cast<Instruction>(Br->getCondition());
1764   LastInduction->moveBefore(ICmp);
1765   LastInduction->setName("vec.ind.next");
1766 
1767   VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
1768   VecInd->addIncoming(LastInduction, LoopVectorLatch);
1769 }
1770 
1771 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
1772   return Cost->isScalarAfterVectorization(I, VF) ||
1773          Cost->isProfitableToScalarize(I, VF);
1774 }
1775 
1776 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
1777   if (shouldScalarizeInstruction(IV))
1778     return true;
1779   auto isScalarInst = [&](User *U) -> bool {
1780     auto *I = cast<Instruction>(U);
1781     return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
1782   };
1783   return llvm::any_of(IV->users(), isScalarInst);
1784 }
1785 
1786 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
1787     const InductionDescriptor &ID, const Instruction *EntryVal,
1788     Value *VectorLoopVal, unsigned Part, unsigned Lane) {
1789   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1790          "Expected either an induction phi-node or a truncate of it!");
1791 
  // This induction variable is not the phi from the original loop but the
  // newly-created IV, based on the proof that the casted Phi is equal to the
  // uncasted Phi in the vectorized loop (possibly under a runtime guard). It
  // reuses the same InductionDescriptor that the original IV uses, but we
  // don't have to do any recording in this case - that is done when the
  // original IV is processed.
1798   if (isa<TruncInst>(EntryVal))
1799     return;
1800 
1801   const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
1802   if (Casts.empty())
1803     return;
1804   // Only the first Cast instruction in the Casts vector is of interest.
  // The rest of the Casts (if they exist) have no uses outside the
1806   // induction update chain itself.
1807   Instruction *CastInst = *Casts.begin();
1808   if (Lane < UINT_MAX)
1809     VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
1810   else
1811     VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
1812 }
1813 
1814 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
1815   assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
1816          "Primary induction variable must have an integer type");
1817 
1818   auto II = Legal->getInductionVars().find(IV);
1819   assert(II != Legal->getInductionVars().end() && "IV is not an induction");
1820 
1821   auto ID = II->second;
1822   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
1823 
1824   // The value from the original loop to which we are mapping the new induction
1825   // variable.
1826   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
1827 
1828   auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
1829 
1830   // Generate code for the induction step. Note that induction steps are
  // required to be loop-invariant.
1832   auto CreateStepValue = [&](const SCEV *Step) -> Value * {
1833     assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
1834            "Induction step should be loop invariant");
1835     if (PSE.getSE()->isSCEVable(IV->getType())) {
1836       SCEVExpander Exp(*PSE.getSE(), DL, "induction");
1837       return Exp.expandCodeFor(Step, Step->getType(),
1838                                LoopVectorPreHeader->getTerminator());
1839     }
1840     return cast<SCEVUnknown>(Step)->getValue();
1841   };
1842 
1843   // The scalar value to broadcast. This is derived from the canonical
1844   // induction variable. If a truncation type is given, truncate the canonical
1845   // induction variable and step. Otherwise, derive these values from the
1846   // induction descriptor.
1847   auto CreateScalarIV = [&](Value *&Step) -> Value * {
1848     Value *ScalarIV = Induction;
1849     if (IV != OldInduction) {
1850       ScalarIV = IV->getType()->isIntegerTy()
1851                      ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
1852                      : Builder.CreateCast(Instruction::SIToFP, Induction,
1853                                           IV->getType());
1854       ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
1855       ScalarIV->setName("offset.idx");
1856     }
1857     if (Trunc) {
1858       auto *TruncType = cast<IntegerType>(Trunc->getType());
1859       assert(Step->getType()->isIntegerTy() &&
1860              "Truncation requires an integer step");
1861       ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
1862       Step = Builder.CreateTrunc(Step, TruncType);
1863     }
1864     return ScalarIV;
1865   };
1866 
1867   // Create the vector values from the scalar IV, in the absence of creating a
1868   // vector IV.
1869   auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
1870     Value *Broadcasted = getBroadcastInstrs(ScalarIV);
1871     for (unsigned Part = 0; Part < UF; ++Part) {
1872       Value *EntryPart =
1873           getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
1874       VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
1875       if (Trunc)
1876         addMetadata(EntryPart, Trunc);
1877       recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
1878     }
1879   };
1880 
1881   // Now do the actual transformations, and start with creating the step value.
1882   Value *Step = CreateStepValue(ID.getStep());
1883   if (VF <= 1) {
1884     Value *ScalarIV = CreateScalarIV(Step);
1885     CreateSplatIV(ScalarIV, Step);
1886     return;
1887   }
1888 
1889   // Determine if we want a scalar version of the induction variable. This is
1890   // true if the induction variable itself is not widened, or if it has at
1891   // least one user in the loop that is not widened.
1892   auto NeedsScalarIV = needsScalarInduction(EntryVal);
1893   if (!NeedsScalarIV) {
1894     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
1895     return;
1896   }
1897 
1898   // Try to create a new independent vector induction variable. If we can't
1899   // create the phi node, we will splat the scalar induction variable in each
1900   // loop iteration.
1901   if (!shouldScalarizeInstruction(EntryVal)) {
1902     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
1903     Value *ScalarIV = CreateScalarIV(Step);
1904     // Create scalar steps that can be used by instructions we will later
1905     // scalarize. Note that the addition of the scalar steps will not increase
1906     // the number of instructions in the loop in the common case prior to
1907     // InstCombine. We will be trading one vector extract for each scalar step.
1908     buildScalarSteps(ScalarIV, Step, EntryVal, ID);
1909     return;
1910   }
1911 
1912   // All IV users are scalar instructions, so only emit a scalar IV, not a
1913   // vectorised IV. Except when we tail-fold, then the splat IV feeds the
1914   // predicate used by the masked loads/stores.
1915   Value *ScalarIV = CreateScalarIV(Step);
1916   if (!Cost->isScalarEpilogueAllowed())
1917     CreateSplatIV(ScalarIV, Step);
1918   buildScalarSteps(ScalarIV, Step, EntryVal, ID);
1919 }
1920 
1921 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
1922                                           Instruction::BinaryOps BinOp) {
1923   // Create and check the types.
1924   auto *ValVTy = cast<VectorType>(Val->getType());
1925   int VLen = ValVTy->getNumElements();
1926 
1927   Type *STy = Val->getType()->getScalarType();
1928   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
1929          "Induction Step must be an integer or FP");
1930   assert(Step->getType() == STy && "Step has wrong type");
1931 
1932   SmallVector<Constant *, 8> Indices;
1933 
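  // For example (a rough sketch with VLen = 4, StartIdx = 0 and an integer
  // step), the result is computed as:
  //   %induction = Val + <0, 1, 2, 3> * splat(Step)
  // i.e. each lane holds Val[lane] + (StartIdx + lane) * Step.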
1934   if (STy->isIntegerTy()) {
1935     // Create a vector of consecutive numbers from zero to VF.
1936     for (int i = 0; i < VLen; ++i)
1937       Indices.push_back(ConstantInt::get(STy, StartIdx + i));
1938 
1939     // Add the consecutive indices to the vector value.
1940     Constant *Cv = ConstantVector::get(Indices);
1941     assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
1942     Step = Builder.CreateVectorSplat(VLen, Step);
1943     assert(Step->getType() == Val->getType() && "Invalid step vec");
    // FIXME: The newly created binary instructions should contain nsw/nuw
    //        flags, which can be found from the original scalar operations.
1946     Step = Builder.CreateMul(Cv, Step);
1947     return Builder.CreateAdd(Val, Step, "induction");
1948   }
1949 
1950   // Floating point induction.
1951   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
1952          "Binary Opcode should be specified for FP induction");
1953   // Create a vector of consecutive numbers from zero to VF.
1954   for (int i = 0; i < VLen; ++i)
1955     Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
1956 
1957   // Add the consecutive indices to the vector value.
1958   Constant *Cv = ConstantVector::get(Indices);
1959 
1960   Step = Builder.CreateVectorSplat(VLen, Step);
1961 
1962   // Floating point operations had to be 'fast' to enable the induction.
1963   FastMathFlags Flags;
1964   Flags.setFast();
1965 
1966   Value *MulOp = Builder.CreateFMul(Cv, Step);
1967   if (isa<Instruction>(MulOp))
1968     // Have to check, MulOp may be a constant
1969     cast<Instruction>(MulOp)->setFastMathFlags(Flags);
1970 
1971   Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
1972   if (isa<Instruction>(BOp))
1973     cast<Instruction>(BOp)->setFastMathFlags(Flags);
1974   return BOp;
1975 }
1976 
1977 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
1978                                            Instruction *EntryVal,
1979                                            const InductionDescriptor &ID) {
1980   // We shouldn't have to build scalar steps if we aren't vectorizing.
1981   assert(VF > 1 && "VF should be greater than one");
1982 
1983   // Get the value type and ensure it and the step have the same integer type.
1984   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
1985   assert(ScalarIVTy == Step->getType() &&
1986          "Val and Step should have the same type");
1987 
1988   // We build scalar steps for both integer and floating-point induction
1989   // variables. Here, we determine the kind of arithmetic we will perform.
1990   Instruction::BinaryOps AddOp;
1991   Instruction::BinaryOps MulOp;
1992   if (ScalarIVTy->isIntegerTy()) {
1993     AddOp = Instruction::Add;
1994     MulOp = Instruction::Mul;
1995   } else {
1996     AddOp = ID.getInductionOpcode();
1997     MulOp = Instruction::FMul;
1998   }
1999 
2000   // Determine the number of scalars we need to generate for each unroll
2001   // iteration. If EntryVal is uniform, we only need to generate the first
2002   // lane. Otherwise, we generate all VF values.
2003   unsigned Lanes =
2004       Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1
2005                                                                          : VF;
2006   // Compute the scalar steps and save the results in VectorLoopValueMap.
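  //
  // For example (a rough sketch with VF = 4, UF = 1, an integer step of 1 and
  // a non-uniform EntryVal; names are illustrative), this emits roughly:
  //   %step0 = add i64 %scalar.iv, 0
  //   %step1 = add i64 %scalar.iv, 1
  //   %step2 = add i64 %scalar.iv, 2
  //   %step3 = add i64 %scalar.iv, 3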
2007   for (unsigned Part = 0; Part < UF; ++Part) {
2008     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2009       auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
2010       auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
2011       auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
2012       VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
2013       recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
2014     }
2015   }
2016 }
2017 
2018 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
2019   assert(V != Induction && "The new induction variable should not be used.");
2020   assert(!V->getType()->isVectorTy() && "Can't widen a vector");
2021   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2022 
2023   // If we have a stride that is replaced by one, do it here. Defer this for
2024   // the VPlan-native path until we start running Legal checks in that path.
2025   if (!EnableVPlanNativePath && Legal->hasStride(V))
2026     V = ConstantInt::get(V->getType(), 1);
2027 
2028   // If we have a vector mapped to this value, return it.
2029   if (VectorLoopValueMap.hasVectorValue(V, Part))
2030     return VectorLoopValueMap.getVectorValue(V, Part);
2031 
2032   // If the value has not been vectorized, check if it has been scalarized
2033   // instead. If it has been scalarized, and we actually need the value in
2034   // vector form, we will construct the vector values on demand.
2035   if (VectorLoopValueMap.hasAnyScalarValue(V)) {
2036     Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
2037 
2038     // If we've scalarized a value, that value should be an instruction.
2039     auto *I = cast<Instruction>(V);
2040 
2041     // If we aren't vectorizing, we can just copy the scalar map values over to
2042     // the vector map.
2043     if (VF == 1) {
2044       VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
2045       return ScalarValue;
2046     }
2047 
2048     // Get the last scalar instruction we generated for V and Part. If the value
2049     // is known to be uniform after vectorization, this corresponds to lane zero
2050     // of the Part unroll iteration. Otherwise, the last instruction is the one
2051     // we created for the last vector lane of the Part unroll iteration.
2052     unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
2053     auto *LastInst = cast<Instruction>(
2054         VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
2055 
2056     // Set the insert point after the last scalarized instruction. This ensures
2057     // the insertelement sequence will directly follow the scalar definitions.
2058     auto OldIP = Builder.saveIP();
2059     auto NewIP = std::next(BasicBlock::iterator(LastInst));
2060     Builder.SetInsertPoint(&*NewIP);
2061 
2062     // However, if we are vectorizing, we need to construct the vector values.
2063     // If the value is known to be uniform after vectorization, we can just
2064     // broadcast the scalar value corresponding to lane zero for each unroll
2065     // iteration. Otherwise, we construct the vector values using insertelement
2066     // instructions. Since the resulting vectors are stored in
2067     // VectorLoopValueMap, we will only generate the insertelements once.
2068     Value *VectorValue = nullptr;
2069     if (Cost->isUniformAfterVectorization(I, VF)) {
2070       VectorValue = getBroadcastInstrs(ScalarValue);
2071       VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
2072     } else {
2073       // Initialize packing with insertelements to start from undef.
2074       Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
2075       VectorLoopValueMap.setVectorValue(V, Part, Undef);
2076       for (unsigned Lane = 0; Lane < VF; ++Lane)
2077         packScalarIntoVectorValue(V, {Part, Lane});
2078       VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
2079     }
2080     Builder.restoreIP(OldIP);
2081     return VectorValue;
2082   }
2083 
2084   // If this scalar is unknown, assume that it is a constant or that it is
2085   // loop invariant. Broadcast V and save the value for future uses.
2086   Value *B = getBroadcastInstrs(V);
2087   VectorLoopValueMap.setVectorValue(V, Part, B);
2088   return B;
2089 }
2090 
2091 Value *
2092 InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
2093                                             const VPIteration &Instance) {
2094   // If the value is not an instruction contained in the loop, it should
2095   // already be scalar.
2096   if (OrigLoop->isLoopInvariant(V))
2097     return V;
2098 
2099   assert(Instance.Lane > 0
2100              ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
2101              : true && "Uniform values only have lane zero");
2102 
2103   // If the value from the original loop has not been vectorized, it is
2104   // represented by UF x VF scalar values in the new loop. Return the requested
2105   // scalar value.
2106   if (VectorLoopValueMap.hasScalarValue(V, Instance))
2107     return VectorLoopValueMap.getScalarValue(V, Instance);
2108 
2109   // If the value has not been scalarized, get its entry in VectorLoopValueMap
2110   // for the given unroll part. If this entry is not a vector type (i.e., the
2111   // vectorization factor is one), there is no need to generate an
2112   // extractelement instruction.
2113   auto *U = getOrCreateVectorValue(V, Instance.Part);
2114   if (!U->getType()->isVectorTy()) {
2115     assert(VF == 1 && "Value not scalarized has non-vector type");
2116     return U;
2117   }
2118 
2119   // Otherwise, the value from the original loop has been vectorized and is
2120   // represented by UF vector values. Extract and return the requested scalar
2121   // value from the appropriate vector lane.
2122   return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
2123 }
2124 
2125 void InnerLoopVectorizer::packScalarIntoVectorValue(
2126     Value *V, const VPIteration &Instance) {
2127   assert(V != Induction && "The new induction variable should not be used.");
2128   assert(!V->getType()->isVectorTy() && "Can't pack a vector");
2129   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2130 
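  // For example (a rough sketch with VF = 4 and Lane = 2; names are
  // illustrative), this emits:
  //   %vec.new = insertelement <4 x i32> %vec, i32 %scalar, i32 2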
2131   Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
2132   Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
2133   VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
2134                                             Builder.getInt32(Instance.Lane));
2135   VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
2136 }
2137 
2138 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2139   assert(Vec->getType()->isVectorTy() && "Invalid type");
2140   SmallVector<int, 8> ShuffleMask;
2141   for (unsigned i = 0; i < VF; ++i)
2142     ShuffleMask.push_back(VF - i - 1);
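  // For example, with VF = 4 the shuffle mask is <3, 2, 1, 0>, which reverses
  // the order of the vector elements.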
2143 
2144   return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
2145                                      ShuffleMask, "reverse");
2146 }
2147 
2148 // Return whether we allow using masked interleave-groups (for dealing with
2149 // strided loads/stores that reside in predicated blocks, or for dealing
2150 // with gaps).
2151 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2152   // If an override option has been passed in for interleaved accesses, use it.
2153   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2154     return EnableMaskedInterleavedMemAccesses;
2155 
2156   return TTI.enableMaskedInterleavedAccessVectorization();
2157 }
2158 
2159 // Try to vectorize the interleave group that \p Instr belongs to.
2160 //
2161 // E.g. Translate following interleaved load group (factor = 3):
2162 //   for (i = 0; i < N; i+=3) {
2163 //     R = Pic[i];             // Member of index 0
2164 //     G = Pic[i+1];           // Member of index 1
2165 //     B = Pic[i+2];           // Member of index 2
2166 //     ... // do something to R, G, B
2167 //   }
2168 // To:
2169 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2170 //   %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9>   ; R elements
2171 //   %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10>  ; G elements
2172 //   %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11>  ; B elements
2173 //
2174 // Or translate following interleaved store group (factor = 3):
2175 //   for (i = 0; i < N; i+=3) {
2176 //     ... do something to R, G, B
2177 //     Pic[i]   = R;           // Member of index 0
2178 //     Pic[i+1] = G;           // Member of index 1
2179 //     Pic[i+2] = B;           // Member of index 2
2180 //   }
2181 // To:
2182 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2183 //   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2184 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2185 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2186 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2187 void InnerLoopVectorizer::vectorizeInterleaveGroup(
2188     const InterleaveGroup<Instruction> *Group, VPTransformState &State,
2189     VPValue *Addr, VPValue *BlockInMask) {
2190   Instruction *Instr = Group->getInsertPos();
2191   const DataLayout &DL = Instr->getModule()->getDataLayout();
2192 
2193   // Prepare for the vector type of the interleaved load/store.
2194   Type *ScalarTy = getMemInstValueType(Instr);
2195   unsigned InterleaveFactor = Group->getFactor();
2196   Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
2197 
2198   // Prepare for the new pointers.
2199   SmallVector<Value *, 2> AddrParts;
2200   unsigned Index = Group->getIndex(Instr);
2201 
2202   // TODO: extend the masked interleaved-group support to reversed access.
2203   assert((!BlockInMask || !Group->isReverse()) &&
2204          "Reversed masked interleave-group not supported.");
2205 
2206   // If the group is reverse, adjust the index to refer to the last vector lane
2207   // instead of the first. We adjust the index from the first vector lane,
2208   // rather than directly getting the pointer for lane VF - 1, because the
2209   // pointer operand of the interleaved access is supposed to be uniform. For
2210   // uniform instructions, we're only required to generate a value for the
2211   // first vector lane in each unroll iteration.
2212   if (Group->isReverse())
2213     Index += (VF - 1) * Group->getFactor();
2214 
2215   for (unsigned Part = 0; Part < UF; Part++) {
2216     Value *AddrPart = State.get(Addr, {Part, 0});
2217     setDebugLocFromInst(Builder, AddrPart);
2218 
    // Notice that the current instruction could be at any index. We need to
    // adjust the address to the member of index 0.
2221     //
2222     // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
2223     //       b = A[i];       // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
2225     //
2226     // E.g.  A[i+1] = a;     // Member of index 1
2227     //       A[i]   = b;     // Member of index 0
2228     //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].
2230 
2231     bool InBounds = false;
2232     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2233       InBounds = gep->isInBounds();
2234     AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2235     cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2236 
2237     // Cast to the vector pointer type.
2238     unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2239     Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2240     AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2241   }
2242 
2243   setDebugLocFromInst(Builder, Instr);
2244   Value *UndefVec = UndefValue::get(VecTy);
2245 
2246   Value *MaskForGaps = nullptr;
2247   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2248     MaskForGaps = createBitMaskForGaps(Builder, VF, *Group);
2249     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2250   }
2251 
2252   // Vectorize the interleaved load group.
2253   if (isa<LoadInst>(Instr)) {
2254     // For each unroll part, create a wide load for the group.
2255     SmallVector<Value *, 2> NewLoads;
2256     for (unsigned Part = 0; Part < UF; Part++) {
2257       Instruction *NewLoad;
2258       if (BlockInMask || MaskForGaps) {
2259         assert(useMaskedInterleavedAccesses(*TTI) &&
2260                "masked interleaved groups are not allowed.");
2261         Value *GroupMask = MaskForGaps;
2262         if (BlockInMask) {
2263           Value *BlockInMaskPart = State.get(BlockInMask, Part);
2264           auto *Undefs = UndefValue::get(BlockInMaskPart->getType());
2265           Value *ShuffledMask = Builder.CreateShuffleVector(
2266               BlockInMaskPart, Undefs,
2267               createReplicatedMask(InterleaveFactor, VF), "interleaved.mask");
2268           GroupMask = MaskForGaps
2269                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2270                                                 MaskForGaps)
2271                           : ShuffledMask;
2272         }
2273         NewLoad =
2274             Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(),
2275                                      GroupMask, UndefVec, "wide.masked.vec");
      } else
2278         NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2279                                             Group->getAlign(), "wide.vec");
2280       Group->addMetadata(NewLoad);
2281       NewLoads.push_back(NewLoad);
2282     }
2283 
2284     // For each member in the group, shuffle out the appropriate data from the
2285     // wide loads.
2286     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2287       Instruction *Member = Group->getMember(I);
2288 
2289       // Skip the gaps in the group.
2290       if (!Member)
2291         continue;
2292 
2293       auto StrideMask = createStrideMask(I, InterleaveFactor, VF);
2294       for (unsigned Part = 0; Part < UF; Part++) {
2295         Value *StridedVec = Builder.CreateShuffleVector(
2296             NewLoads[Part], UndefVec, StrideMask, "strided.vec");
2297 
        // If this member has a different type, cast the result type.
2299         if (Member->getType() != ScalarTy) {
2300           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2301           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2302         }
2303 
2304         if (Group->isReverse())
2305           StridedVec = reverseVector(StridedVec);
2306 
2307         VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
2308       }
2309     }
2310     return;
2311   }
2312 
  // The sub vector type for the current instruction.
2314   VectorType *SubVT = VectorType::get(ScalarTy, VF);
2315 
2316   // Vectorize the interleaved store group.
2317   for (unsigned Part = 0; Part < UF; Part++) {
2318     // Collect the stored vector from each member.
2319     SmallVector<Value *, 4> StoredVecs;
2320     for (unsigned i = 0; i < InterleaveFactor; i++) {
      // An interleaved store group doesn't allow a gap, so each index has a
      // member.
2322       Instruction *Member = Group->getMember(i);
2323       assert(Member && "Fail to get a member from an interleaved store group");
2324 
2325       Value *StoredVec = getOrCreateVectorValue(
2326           cast<StoreInst>(Member)->getValueOperand(), Part);
2327       if (Group->isReverse())
2328         StoredVec = reverseVector(StoredVec);
2329 
      // If this member has a different type, cast it to a unified type.
      if (StoredVec->getType() != SubVT)
2333         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2334 
2335       StoredVecs.push_back(StoredVec);
2336     }
2337 
2338     // Concatenate all vectors into a wide vector.
2339     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2340 
2341     // Interleave the elements in the wide vector.
2342     Value *IVec = Builder.CreateShuffleVector(
2343         WideVec, UndefVec, createInterleaveMask(VF, InterleaveFactor),
2344         "interleaved.vec");
2345 
2346     Instruction *NewStoreInstr;
2347     if (BlockInMask) {
2348       Value *BlockInMaskPart = State.get(BlockInMask, Part);
2349       auto *Undefs = UndefValue::get(BlockInMaskPart->getType());
2350       Value *ShuffledMask = Builder.CreateShuffleVector(
2351           BlockInMaskPart, Undefs, createReplicatedMask(InterleaveFactor, VF),
2352           "interleaved.mask");
2353       NewStoreInstr = Builder.CreateMaskedStore(
2354           IVec, AddrParts[Part], Group->getAlign(), ShuffledMask);
    } else
2357       NewStoreInstr =
2358           Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2359 
2360     Group->addMetadata(NewStoreInstr);
2361   }
2362 }
2363 
2364 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
2365                                                      VPTransformState &State,
2366                                                      VPValue *Addr,
2367                                                      VPValue *StoredValue,
2368                                                      VPValue *BlockInMask) {
2369   // Attempt to issue a wide load.
2370   LoadInst *LI = dyn_cast<LoadInst>(Instr);
2371   StoreInst *SI = dyn_cast<StoreInst>(Instr);
2372 
2373   assert((LI || SI) && "Invalid Load/Store instruction");
2374   assert((!SI || StoredValue) && "No stored value provided for widened store");
2375   assert((!LI || !StoredValue) && "Stored value provided for widened load");
2376 
2377   LoopVectorizationCostModel::InstWidening Decision =
2378       Cost->getWideningDecision(Instr, VF);
2379   assert((Decision == LoopVectorizationCostModel::CM_Widen ||
2380           Decision == LoopVectorizationCostModel::CM_Widen_Reverse ||
2381           Decision == LoopVectorizationCostModel::CM_GatherScatter) &&
2382          "CM decision is not to widen the memory instruction");
2383 
2384   Type *ScalarDataTy = getMemInstValueType(Instr);
2385   Type *DataTy = VectorType::get(ScalarDataTy, VF);
  // An alignment of 0 means target ABI alignment. We need to use the scalar's
  // target ABI alignment in such a case.
2388   const DataLayout &DL = Instr->getModule()->getDataLayout();
2389   const Align Alignment =
2390       DL.getValueOrABITypeAlignment(getLoadStoreAlignment(Instr), ScalarDataTy);
2391 
2392   // Determine if the pointer operand of the access is either consecutive or
2393   // reverse consecutive.
2394   bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2395   bool ConsecutiveStride =
2396       Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2397   bool CreateGatherScatter =
2398       (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2399 
2400   // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2401   // gather/scatter. Otherwise Decision should have been to Scalarize.
2402   assert((ConsecutiveStride || CreateGatherScatter) &&
2403          "The instruction should be scalarized");
2404   (void)ConsecutiveStride;
2405 
2406   VectorParts BlockInMaskParts(UF);
2407   bool isMaskRequired = BlockInMask;
2408   if (isMaskRequired)
2409     for (unsigned Part = 0; Part < UF; ++Part)
2410       BlockInMaskParts[Part] = State.get(BlockInMask, Part);
2411 
2412   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2413     // Calculate the pointer for the specific unroll-part.
2414     GetElementPtrInst *PartPtr = nullptr;
2415 
2416     bool InBounds = false;
2417     if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2418       InBounds = gep->isInBounds();
2419 
2420     if (Reverse) {
2421       // If the address is consecutive but reversed, then the
2422       // wide store needs to start at the last vector element.
2423       PartPtr = cast<GetElementPtrInst>(
2424           Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF)));
2425       PartPtr->setIsInBounds(InBounds);
2426       PartPtr = cast<GetElementPtrInst>(
2427           Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF)));
2428       PartPtr->setIsInBounds(InBounds);
2429       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2430         BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
2431     } else {
2432       PartPtr = cast<GetElementPtrInst>(
2433           Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF)));
2434       PartPtr->setIsInBounds(InBounds);
2435     }
2436 
2437     unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
2438     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2439   };
2440 
2441   // Handle Stores:
2442   if (SI) {
2443     setDebugLocFromInst(Builder, SI);
2444 
2445     for (unsigned Part = 0; Part < UF; ++Part) {
2446       Instruction *NewSI = nullptr;
2447       Value *StoredVal = State.get(StoredValue, Part);
2448       if (CreateGatherScatter) {
2449         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2450         Value *VectorGep = State.get(Addr, Part);
2451         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
2452                                             MaskPart);
2453       } else {
2454         if (Reverse) {
2455           // If we store to reverse consecutive memory locations, then we need
2456           // to reverse the order of elements in the stored value.
2457           StoredVal = reverseVector(StoredVal);
2458           // We don't want to update the value in the map as it might be used in
2459           // another expression. So don't call resetVectorValue(StoredVal).
2460         }
2461         auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2462         if (isMaskRequired)
2463           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
2464                                             BlockInMaskParts[Part]);
2465         else
2466           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
2467       }
2468       addMetadata(NewSI, SI);
2469     }
2470     return;
2471   }
2472 
2473   // Handle loads.
2474   assert(LI && "Must have a load instruction");
2475   setDebugLocFromInst(Builder, LI);
2476   for (unsigned Part = 0; Part < UF; ++Part) {
2477     Value *NewLI;
2478     if (CreateGatherScatter) {
2479       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2480       Value *VectorGep = State.get(Addr, Part);
2481       NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
2482                                          nullptr, "wide.masked.gather");
2483       addMetadata(NewLI, LI);
2484     } else {
2485       auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2486       if (isMaskRequired)
2487         NewLI = Builder.CreateMaskedLoad(
2488             VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy),
2489             "wide.masked.load");
2490       else
2491         NewLI =
2492             Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
2493 
2494       // Add metadata to the load, but setVectorValue to the reverse shuffle.
2495       addMetadata(NewLI, LI);
2496       if (Reverse)
2497         NewLI = reverseVector(NewLI);
2498     }
2499     VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
2500   }
2501 }
2502 
2503 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
2504                                                const VPIteration &Instance,
2505                                                bool IfPredicateInstr) {
2506   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2507 
2508   setDebugLocFromInst(Builder, Instr);
2509 
  // Does this instruction return a value?
2511   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2512 
2513   Instruction *Cloned = Instr->clone();
2514   if (!IsVoidRetTy)
2515     Cloned->setName(Instr->getName() + ".cloned");
2516 
2517   // Replace the operands of the cloned instructions with their scalar
2518   // equivalents in the new loop.
2519   for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
2520     auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance);
2521     Cloned->setOperand(op, NewOp);
2522   }
2523   addNewMetadata(Cloned, Instr);
2524 
2525   // Place the cloned scalar in the new loop.
2526   Builder.Insert(Cloned);
2527 
2528   // Add the cloned scalar to the scalar map entry.
2529   VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2530 
2531   // If we just cloned a new assumption, add it the assumption cache.
2532   if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2533     if (II->getIntrinsicID() == Intrinsic::assume)
2534       AC->registerAssumption(II);
2535 
2536   // End if-block.
2537   if (IfPredicateInstr)
2538     PredicatedInstructions.push_back(Cloned);
2539 }
2540 
2541 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2542                                                       Value *End, Value *Step,
2543                                                       Instruction *DL) {
2544   BasicBlock *Header = L->getHeader();
2545   BasicBlock *Latch = L->getLoopLatch();
2546   // As we're just creating this loop, it's possible no latch exists
2547   // yet. If so, use the header as this will be a single block loop.
2548   if (!Latch)
2549     Latch = Header;
2550 
2551   IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2552   Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2553   setDebugLocFromInst(Builder, OldInst);
2554   auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2555 
2556   Builder.SetInsertPoint(Latch->getTerminator());
2557   setDebugLocFromInst(Builder, OldInst);
2558 
2559   // Create i+1 and fill the PHINode.
2560   Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2561   Induction->addIncoming(Start, L->getLoopPreheader());
2562   Induction->addIncoming(Next, Latch);
2563   // Create the compare.
2564   Value *ICmp = Builder.CreateICmpEQ(Next, End);
2565   Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
2566 
2567   // Now we have two terminators. Remove the old one from the block.
2568   Latch->getTerminator()->eraseFromParent();
2569 
2570   return Induction;
2571 }
2572 
2573 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2574   if (TripCount)
2575     return TripCount;
2576 
2577   assert(L && "Create Trip Count for null loop.");
2578   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2579   // Find the loop boundaries.
2580   ScalarEvolution *SE = PSE.getSE();
2581   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2582   assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
2583          "Invalid loop count");
2584 
2585   Type *IdxTy = Legal->getWidestInductionType();
2586   assert(IdxTy && "No type for induction");
2587 
  // The exit count might have the type of i64 while the phi is i32. This can
  // happen if we have an induction variable that is sign-extended before the
  // compare. The only way we can get a backedge-taken count in that case is if
  // the induction variable is signed and hence does not overflow, so the
  // truncation is legal.
2593   if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
2594       IdxTy->getPrimitiveSizeInBits())
2595     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2596   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2597 
2598   // Get the total trip count from the count by adding 1.
2599   const SCEV *ExitCount = SE->getAddExpr(
2600       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
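  // For example, a loop 'for (i = 0; i < n; ++i)' with n > 0 has a
  // backedge-taken count of n - 1 and a trip count of n.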
2601 
2602   const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
2603 
2604   // Expand the trip count and place the new instructions in the preheader.
2605   // Notice that the pre-header does not change, only the loop body.
2606   SCEVExpander Exp(*SE, DL, "induction");
2607 
2608   // Count holds the overall loop count (N).
2609   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2610                                 L->getLoopPreheader()->getTerminator());
2611 
2612   if (TripCount->getType()->isPointerTy())
2613     TripCount =
2614         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2615                                     L->getLoopPreheader()->getTerminator());
2616 
2617   return TripCount;
2618 }
2619 
2620 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
2621   if (VectorTripCount)
2622     return VectorTripCount;
2623 
2624   Value *TC = getOrCreateTripCount(L);
2625   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2626 
2627   Type *Ty = TC->getType();
2628   Constant *Step = ConstantInt::get(Ty, VF * UF);
2629 
2630   // If the tail is to be folded by masking, round the number of iterations N
2631   // up to a multiple of Step instead of rounding down. This is done by first
2632   // adding Step-1 and then rounding down. Note that it's ok if this addition
2633   // overflows: the vector induction variable will eventually wrap to zero given
2634   // that it starts at zero and its Step is a power of two; the loop will then
2635   // exit, with the last early-exit vector comparison also producing all-true.
2636   if (Cost->foldTailByMasking()) {
2637     assert(isPowerOf2_32(VF * UF) &&
2638            "VF*UF must be a power of 2 when folding tail by masking");
2639     TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up");
2640   }
2641 
2642   // Now we need to generate the expression for the part of the loop that the
2643   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2644   // iterations are not required for correctness, or N - Step, otherwise. Step
2645   // is equal to the vectorization factor (number of SIMD elements) times the
2646   // unroll factor (number of SIMD instructions).
2647   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2648 
2649   // If there is a non-reversed interleaved group that may speculatively access
2650   // memory out-of-bounds, we need to ensure that there will be at least one
2651   // iteration of the scalar epilogue loop. Thus, if the step evenly divides
2652   // the trip count, we set the remainder to be equal to the step. If the step
2653   // does not evenly divide the trip count, no adjustment is necessary since
2654   // there will already be scalar iterations. Note that the minimum iterations
2655   // check ensures that N >= Step.
2656   if (VF > 1 && Cost->requiresScalarEpilogue()) {
2657     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2658     R = Builder.CreateSelect(IsZero, Step, R);
2659   }
2660 
2661   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2662 
2663   return VectorTripCount;
2664 }
2665 
2666 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2667                                                    const DataLayout &DL) {
2668   // Verify that V is a vector type with same number of elements as DstVTy.
2669   unsigned VF = DstVTy->getNumElements();
2670   VectorType *SrcVecTy = cast<VectorType>(V->getType());
  assert((VF == SrcVecTy->getNumElements()) &&
         "Vector dimensions do not match");
2672   Type *SrcElemTy = SrcVecTy->getElementType();
2673   Type *DstElemTy = DstVTy->getElementType();
2674   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2675          "Vector elements must have same size");
2676 
2677   // Do a direct cast if element types are castable.
2678   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2679     return Builder.CreateBitOrPointerCast(V, DstVTy);
2680   }
  // V cannot be directly cast to the desired vector type.
2682   // May happen when V is a floating point vector but DstVTy is a vector of
2683   // pointers or vice-versa. Handle this using a two-step bitcast using an
2684   // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
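  // For example, assuming 64-bit pointers, casting <2 x double> to <2 x i64*>
  // goes through <2 x i64>: a bitcast followed by an inttoptr.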
2685   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2686          "Only one type should be a pointer type");
2687   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2688          "Only one type should be a floating point type");
2689   Type *IntTy =
2690       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2691   VectorType *VecIntTy = VectorType::get(IntTy, VF);
2692   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2693   return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
2694 }
2695 
2696 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
2697                                                          BasicBlock *Bypass) {
2698   Value *Count = getOrCreateTripCount(L);
2699   // Reuse existing vector loop preheader for TC checks.
2700   // Note that new preheader block is generated for vector loop.
2701   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2702   IRBuilder<> Builder(TCCheckBlock->getTerminator());
2703 
2704   // Generate code to check if the loop's trip count is less than VF * UF, or
2705   // equal to it in case a scalar epilogue is required; this implies that the
2706   // vector trip count is zero. This check also covers the case where adding one
2707   // to the backedge-taken count overflowed leading to an incorrect trip count
2708   // of zero. In this case we will also jump to the scalar loop.
2709   auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
2710                                           : ICmpInst::ICMP_ULT;
2711 
2712   // If tail is to be folded, vector loop takes care of all iterations.
2713   Value *CheckMinIters = Builder.getFalse();
2714   if (!Cost->foldTailByMasking())
2715     CheckMinIters = Builder.CreateICmp(
2716         P, Count, ConstantInt::get(Count->getType(), VF * UF),
2717         "min.iters.check");
2718 
2719   // Create new preheader for vector loop.
2720   LoopVectorPreHeader =
2721       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2722                  "vector.ph");
2723 
2724   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2725                                DT->getNode(Bypass)->getIDom()) &&
2726          "TC check is expected to dominate Bypass");
2727 
2728   // Update dominator for Bypass & LoopExit.
2729   DT->changeImmediateDominator(Bypass, TCCheckBlock);
2730   DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
2731 
2732   ReplaceInstWithInst(
2733       TCCheckBlock->getTerminator(),
2734       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
2735   LoopBypassBlocks.push_back(TCCheckBlock);
2736 }
2737 
2738 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
2739   // Reuse existing vector loop preheader for SCEV checks.
2740   // Note that new preheader block is generated for vector loop.
2741   BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader;
2742 
  // Generate the code to check the SCEV assumptions that we made.
2744   // We want the new basic block to start at the first instruction in a
2745   // sequence of instructions that form a check.
2746   SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
2747                    "scev.check");
2748   Value *SCEVCheck = Exp.expandCodeForPredicate(
2749       &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator());
2750 
2751   if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
2752     if (C->isZero())
2753       return;
2754 
2755   assert(!SCEVCheckBlock->getParent()->hasOptSize() &&
2756          "Cannot SCEV check stride or overflow when optimizing for size");
2757 
2758   SCEVCheckBlock->setName("vector.scevcheck");
2759   // Create new preheader for vector loop.
2760   LoopVectorPreHeader =
2761       SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI,
2762                  nullptr, "vector.ph");
2763 
  // Update dominator only if this is the first RT check.
2765   if (LoopBypassBlocks.empty()) {
2766     DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
2767     DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
2768   }
2769 
2770   ReplaceInstWithInst(
2771       SCEVCheckBlock->getTerminator(),
2772       BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck));
2773   LoopBypassBlocks.push_back(SCEVCheckBlock);
2774   AddedSafetyChecks = true;
2775 }
2776 
2777 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
2778   // VPlan-native path does not do any analysis for runtime checks currently.
2779   if (EnableVPlanNativePath)
2780     return;
2781 
2782   // Reuse existing vector loop preheader for runtime memory checks.
2783   // Note that new preheader block is generated for vector loop.
2784   BasicBlock *const MemCheckBlock = L->getLoopPreheader();
2785 
2786   // Generate the code that checks in runtime if arrays overlap. We put the
2787   // checks into a separate block to make the more common case of few elements
2788   // faster.
2789   auto *LAI = Legal->getLAI();
2790   const auto &RtPtrChecking = *LAI->getRuntimePointerChecking();
2791   if (!RtPtrChecking.Need)
2792     return;
2793   Instruction *FirstCheckInst;
2794   Instruction *MemRuntimeCheck;
2795   std::tie(FirstCheckInst, MemRuntimeCheck) =
2796       addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop,
2797                        RtPtrChecking.getChecks(), RtPtrChecking.getSE());
2798   if (!MemRuntimeCheck)
2799     return;
2800 
2801   if (MemCheckBlock->getParent()->hasOptSize()) {
2802     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2803            "Cannot emit memory checks when optimizing for size, unless forced "
2804            "to vectorize.");
2805     ORE->emit([&]() {
2806       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2807                                         L->getStartLoc(), L->getHeader())
2808              << "Code-size may be reduced by not forcing "
2809                 "vectorization, or by source-code modifications "
2810                 "eliminating the need for runtime checks "
2811                 "(e.g., adding 'restrict').";
2812     });
2813   }
2814 
2815   MemCheckBlock->setName("vector.memcheck");
2816   // Create new preheader for vector loop.
2817   LoopVectorPreHeader =
2818       SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr,
2819                  "vector.ph");
2820 
  // Update dominator only if this is the first RT check.
2822   if (LoopBypassBlocks.empty()) {
2823     DT->changeImmediateDominator(Bypass, MemCheckBlock);
2824     DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock);
2825   }
2826 
2827   ReplaceInstWithInst(
2828       MemCheckBlock->getTerminator(),
2829       BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheck));
2830   LoopBypassBlocks.push_back(MemCheckBlock);
2831   AddedSafetyChecks = true;
2832 
2833   // We currently don't use LoopVersioning for the actual loop cloning but we
2834   // still use it to add the noalias metadata.
2835   LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
2836                                           PSE.getSE());
2837   LVer->prepareNoAliasMetadata();
2838 }
2839 
2840 Value *InnerLoopVectorizer::emitTransformedIndex(
2841     IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
2842     const InductionDescriptor &ID) const {
2843 
2844   SCEVExpander Exp(*SE, DL, "induction");
2845   auto Step = ID.getStep();
2846   auto StartValue = ID.getStartValue();
2847   assert(Index->getType() == Step->getType() &&
2848          "Index type does not match StepValue type");
2849 
  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // more optimal code. Unfortunately, attempting to do so on invalid IR may
  // lead to various SCEV crashes. So all we can do is use the builder and rely
  // on InstCombine for future simplifications. Here we handle some trivial
  // cases only.
2856   auto CreateAdd = [&B](Value *X, Value *Y) {
2857     assert(X->getType() == Y->getType() && "Types don't match!");
2858     if (auto *CX = dyn_cast<ConstantInt>(X))
2859       if (CX->isZero())
2860         return Y;
2861     if (auto *CY = dyn_cast<ConstantInt>(Y))
2862       if (CY->isZero())
2863         return X;
2864     return B.CreateAdd(X, Y);
2865   };
2866 
2867   auto CreateMul = [&B](Value *X, Value *Y) {
2868     assert(X->getType() == Y->getType() && "Types don't match!");
2869     if (auto *CX = dyn_cast<ConstantInt>(X))
2870       if (CX->isOne())
2871         return Y;
2872     if (auto *CY = dyn_cast<ConstantInt>(Y))
2873       if (CY->isOne())
2874         return X;
2875     return B.CreateMul(X, Y);
2876   };
2877 
2878   switch (ID.getKind()) {
2879   case InductionDescriptor::IK_IntInduction: {
2880     assert(Index->getType() == StartValue->getType() &&
2881            "Index type does not match StartValue type");
2882     if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
2883       return B.CreateSub(StartValue, Index);
2884     auto *Offset = CreateMul(
2885         Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint()));
2886     return CreateAdd(StartValue, Offset);
2887   }
2888   case InductionDescriptor::IK_PtrInduction: {
2889     assert(isa<SCEVConstant>(Step) &&
2890            "Expected constant step for pointer induction");
2891     return B.CreateGEP(
2892         StartValue->getType()->getPointerElementType(), StartValue,
2893         CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(),
2894                                            &*B.GetInsertPoint())));
2895   }
2896   case InductionDescriptor::IK_FpInduction: {
2897     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2898     auto InductionBinOp = ID.getInductionBinOp();
2899     assert(InductionBinOp &&
2900            (InductionBinOp->getOpcode() == Instruction::FAdd ||
2901             InductionBinOp->getOpcode() == Instruction::FSub) &&
2902            "Original bin op should be defined for FP induction");
2903 
2904     Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
2905 
2906     // Floating point operations had to be 'fast' to enable the induction.
2907     FastMathFlags Flags;
2908     Flags.setFast();
2909 
2910     Value *MulExp = B.CreateFMul(StepValue, Index);
2911     if (isa<Instruction>(MulExp))
      // We have to check because MulExp may be a constant.
2913       cast<Instruction>(MulExp)->setFastMathFlags(Flags);
2914 
2915     Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2916                                "induction");
2917     if (isa<Instruction>(BOp))
2918       cast<Instruction>(BOp)->setFastMathFlags(Flags);
2919 
2920     return BOp;
2921   }
2922   case InductionDescriptor::IK_NoInduction:
2923     return nullptr;
2924   }
2925   llvm_unreachable("invalid enum");
2926 }
2927 
2928 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
2929   /*
2930    In this function we generate a new loop. The new loop will contain
2931    the vectorized instructions while the old loop will continue to run the
2932    scalar remainder.
2933 
2934        [ ] <-- loop iteration number check.
2935     /   |
2936    /    v
2937   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
2938   |  /  |
2939   | /   v
2940   ||   [ ]     <-- vector pre header.
2941   |/    |
2942   |     v
2943   |    [  ] \
2944   |    [  ]_|   <-- vector loop.
2945   |     |
2946   |     v
2947   |   -[ ]   <--- middle-block.
2948   |  /  |
2949   | /   v
2950   -|- >[ ]     <--- new preheader.
2951    |    |
2952    |    v
2953    |   [ ] \
2954    |   [ ]_|   <-- old scalar loop to handle remainder.
2955     \   |
2956      \  v
2957       >[ ]     <-- exit block.
2958    ...
2959    */
2960 
2961   MDNode *OrigLoopID = OrigLoop->getLoopID();
2962 
  // Some loops have a single integer induction variable, while other loops
  // don't. One example is C++ iterators, which often have multiple pointer
  // induction variables. In the code below we also support the case where we
  // don't have a single induction variable.
2967   //
2968   // We try to obtain an induction variable from the original loop as hard
2969   // as possible. However if we don't find one that:
2970   //   - is an integer
2971   //   - counts from zero, stepping by one
2972   //   - is the size of the widest induction variable type
2973   // then we create a new one.
2974   OldInduction = Legal->getPrimaryInduction();
2975   Type *IdxTy = Legal->getWidestInductionType();
2976 
2977   // Split the single block loop into the two loop structure described above.
2978   LoopScalarBody = OrigLoop->getHeader();
2979   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
2980   LoopExitBlock = OrigLoop->getExitBlock();
2981   assert(LoopExitBlock && "Must have an exit block");
2982   assert(LoopVectorPreHeader && "Invalid loop structure");
2983 
2984   LoopMiddleBlock =
2985       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
2986                  LI, nullptr, "middle.block");
2987   LoopScalarPreHeader =
2988       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
2989                  nullptr, "scalar.ph");
  // We intentionally don't let SplitBlock update LoopInfo since LoopVectorBody
  // should belong to a different loop than LoopVectorPreHeader. LoopVectorBody
  // is explicitly added to the correct place a few lines later.
2993   LoopVectorBody =
2994       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
2995                  nullptr, nullptr, "vector.body");
2996 
2997   // Update dominator for loop exit.
2998   DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
2999 
3000   // Create and register the new vector loop.
3001   Loop *Lp = LI->AllocateLoop();
3002   Loop *ParentLoop = OrigLoop->getParentLoop();
3003 
3004   // Insert the new loop into the loop nest and register the new basic blocks
3005   // before calling any utilities such as SCEV that require valid LoopInfo.
3006   if (ParentLoop) {
3007     ParentLoop->addChildLoop(Lp);
3008   } else {
3009     LI->addTopLevelLoop(Lp);
3010   }
3011   Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
3012 
3013   // Find the loop boundaries.
3014   Value *Count = getOrCreateTripCount(Lp);
3015 
3016   Value *StartIdx = ConstantInt::get(IdxTy, 0);
3017 
3018   // Now, compare the new count to zero. If it is zero skip the vector loop and
3019   // jump to the scalar loop. This check also covers the case where the
3020   // backedge-taken count is uint##_max: adding one to it will overflow leading
3021   // to an incorrect trip count of zero. In this (rare) case we will also jump
3022   // to the scalar loop.
3023   emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
3024 
3025   // Generate the code to check any assumptions that we've made for SCEV
3026   // expressions.
3027   emitSCEVChecks(Lp, LoopScalarPreHeader);
3028 
3029   // Generate the code that checks in runtime if arrays overlap. We put the
3030   // checks into a separate block to make the more common case of few elements
3031   // faster.
3032   emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
3033 
3034   // Generate the induction variable.
3035   // The loop step is equal to the vectorization factor (num of SIMD elements)
3036   // times the unroll factor (num of SIMD instructions).
3037   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
3038   Constant *Step = ConstantInt::get(IdxTy, VF * UF);
3039   Induction =
3040       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
3041                               getDebugLocFromInstOrOperands(OldInduction));
3042 
3043   // We are going to resume the execution of the scalar loop.
3044   // Go over all of the induction variables that we found and fix the
3045   // PHIs that are left in the scalar version of the loop.
3046   // The starting values of PHI nodes depend on the counter of the last
3047   // iteration in the vectorized loop.
3048   // If we come from a bypass edge then we need to start from the original
3049   // start value.
3050 
3051   // This variable saves the new starting index for the scalar loop. It is used
3052   // to test if there are any tail iterations left once the vector loop has
3053   // completed.
3054   for (auto &InductionEntry : Legal->getInductionVars()) {
3055     PHINode *OrigPhi = InductionEntry.first;
3056     InductionDescriptor II = InductionEntry.second;
3057 
    // Create phi nodes to merge from the backedge-taken check block.
3059     PHINode *BCResumeVal =
3060         PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3061                         LoopScalarPreHeader->getTerminator());
3062     // Copy original phi DL over to the new one.
3063     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3064     Value *&EndValue = IVEndValues[OrigPhi];
3065     if (OrigPhi == OldInduction) {
3066       // We know what the end value is.
3067       EndValue = CountRoundDown;
3068     } else {
3069       IRBuilder<> B(Lp->getLoopPreheader()->getTerminator());
3070       Type *StepType = II.getStep()->getType();
3071       Instruction::CastOps CastOp =
3072           CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
3073       Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
3074       const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
3075       EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3076       EndValue->setName("ind.end");
3077     }
3078 
3079     // The new PHI merges the original incoming value, in case of a bypass,
3080     // or the value at the end of the vectorized loop.
3081     BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3082 
3083     // Fix the scalar body counter (PHI node).
3084     // The old induction's phi node in the scalar body needs the truncated
3085     // value.
3086     for (BasicBlock *BB : LoopBypassBlocks)
3087       BCResumeVal->addIncoming(II.getStartValue(), BB);
3088     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3089   }
3090 
3091   // We need the OrigLoop (scalar loop part) latch terminator to help
3092   // produce correct debug info for the middle block BB instructions.
3093   // The legality check stage guarantees that the loop will have a single
3094   // latch.
3095   assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) &&
3096          "Scalar loop latch terminator isn't a branch");
3097   BranchInst *ScalarLatchBr =
3098       cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());
3099 
3100   // Add a check in the middle block to see if we have completed
3101   // all of the iterations in the first vector loop.
3102   // If (N - N%VF) == N, then we *don't* need to run the remainder.
3103   // If tail is to be folded, we know we don't need to run the remainder.
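  // For example, with N = 10 and VF * UF = 4 (and no tail folding),
  // CountRoundDown is 8, so CmpN is false and the remaining 2 iterations run
  // in the scalar loop.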
3104   Value *CmpN = Builder.getTrue();
3105   if (!Cost->foldTailByMasking()) {
3106     CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
3107                            CountRoundDown, "cmp.n",
3108                            LoopMiddleBlock->getTerminator());
3109 
3110     // Here we use the same DebugLoc as the scalar loop latch branch instead
3111     // of the corresponding compare because they may have ended up with
3112     // different line numbers and we want to avoid awkward line stepping while
    // debugging, e.g. if the compare has a line number inside the loop.
3114     cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
3115   }
3116 
3117   BranchInst *BrInst =
3118       BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN);
3119   BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
3120   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3121 
3122   // Get ready to start creating new instructions into the vectorized body.
3123   assert(LoopVectorPreHeader == Lp->getLoopPreheader() &&
3124          "Inconsistent vector loop preheader");
3125   Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3126 
3127   Optional<MDNode *> VectorizedLoopID =
3128       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3129                                       LLVMLoopVectorizeFollowupVectorized});
3130   if (VectorizedLoopID.hasValue()) {
3131     Lp->setLoopID(VectorizedLoopID.getValue());
3132 
3133     // Do not setAlreadyVectorized if loop attributes have been defined
3134     // explicitly.
3135     return LoopVectorPreHeader;
3136   }
3137 
3138   // Keep all loop hints from the original loop on the vector loop (we'll
3139   // replace the vectorizer-specific hints below).
3140   if (MDNode *LID = OrigLoop->getLoopID())
3141     Lp->setLoopID(LID);
3142 
3143   LoopVectorizeHints Hints(Lp, true, *ORE);
3144   Hints.setAlreadyVectorized();
3145 
3146 #ifdef EXPENSIVE_CHECKS
3147   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3148   LI->verify(*DT);
3149 #endif
3150 
3151   return LoopVectorPreHeader;
3152 }
3153 
3154 // Fix up external users of the induction variable. At this point, we are
3155 // in LCSSA form, with all external PHIs that use the IV having one input value,
3156 // coming from the remainder loop. We need those PHIs to also have a correct
3157 // value for the IV when arriving directly from the middle block.
3158 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3159                                        const InductionDescriptor &II,
3160                                        Value *CountRoundDown, Value *EndValue,
3161                                        BasicBlock *MiddleBlock) {
3162   // There are two kinds of external IV usages - those that use the value
3163   // computed in the last iteration (the PHI) and those that use the penultimate
3164   // value (the value that feeds into the phi from the loop latch).
  // We allow both, but they obviously have different values.
3166 
3167   assert(OrigLoop->getExitBlock() && "Expected a single exit block");
3168 
3169   DenseMap<Value *, Value *> MissingVals;
3170 
3171   // An external user of the last iteration's value should see the value that
3172   // the remainder loop uses to initialize its own IV.
3173   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3174   for (User *U : PostInc->users()) {
3175     Instruction *UI = cast<Instruction>(U);
3176     if (!OrigLoop->contains(UI)) {
3177       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3178       MissingVals[UI] = EndValue;
3179     }
3180   }
3181 
  // An external user of the penultimate value needs to see EndValue - Step.
3183   // The simplest way to get this is to recompute it from the constituent SCEVs,
3184   // that is Start + (Step * (CRD - 1)).
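  // For example, for an IV starting at 0 with step 1 and CRD = 8, a user of
  // the post-increment value sees EndValue = 8, while a user of the phi itself
  // sees Start + Step * (CRD - 1) = 7.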
3185   for (User *U : OrigPhi->users()) {
3186     auto *UI = cast<Instruction>(U);
3187     if (!OrigLoop->contains(UI)) {
3188       const DataLayout &DL =
3189           OrigLoop->getHeader()->getModule()->getDataLayout();
3190       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3191 
3192       IRBuilder<> B(MiddleBlock->getTerminator());
3193       Value *CountMinusOne = B.CreateSub(
3194           CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3195       Value *CMO =
3196           !II.getStep()->getType()->isIntegerTy()
3197               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3198                              II.getStep()->getType())
3199               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3200       CMO->setName("cast.cmo");
3201       Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3202       Escape->setName("ind.escape");
3203       MissingVals[UI] = Escape;
3204     }
3205   }
3206 
3207   for (auto &I : MissingVals) {
3208     PHINode *PHI = cast<PHINode>(I.first);
3209     // One corner case we have to handle is two IVs "chasing" each-other,
3210     // that is %IV2 = phi [...], [ %IV1, %latch ]
3211     // In this case, if IV1 has an external use, we need to avoid adding both
3212     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3213     // don't already have an incoming value for the middle block.
3214     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3215       PHI->addIncoming(I.second, MiddleBlock);
3216   }
3217 }
3218 
3219 namespace {
3220 
3221 struct CSEDenseMapInfo {
3222   static bool canHandle(const Instruction *I) {
3223     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3224            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3225   }
3226 
3227   static inline Instruction *getEmptyKey() {
3228     return DenseMapInfo<Instruction *>::getEmptyKey();
3229   }
3230 
3231   static inline Instruction *getTombstoneKey() {
3232     return DenseMapInfo<Instruction *>::getTombstoneKey();
3233   }
3234 
3235   static unsigned getHashValue(const Instruction *I) {
3236     assert(canHandle(I) && "Unknown instruction!");
3237     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3238                                                            I->value_op_end()));
3239   }
3240 
3241   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3242     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3243         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3244       return LHS == RHS;
3245     return LHS->isIdenticalTo(RHS);
3246   }
3247 };
3248 
3249 } // end anonymous namespace
3250 
/// Perform CSE of induction variable instructions.
3252 static void cse(BasicBlock *BB) {
3253   // Perform simple cse.
3254   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3255   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3256     Instruction *In = &*I++;
3257 
3258     if (!CSEDenseMapInfo::canHandle(In))
3259       continue;
3260 
3261     // Check if we can replace this instruction with any of the
3262     // visited instructions.
3263     if (Instruction *V = CSEMap.lookup(In)) {
3264       In->replaceAllUsesWith(V);
3265       In->eraseFromParent();
3266       continue;
3267     }
3268 
3269     CSEMap[In] = In;
3270   }
3271 }
3272 
3273 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
3274                                                        unsigned VF,
3275                                                        bool &NeedToScalarize) {
3276   Function *F = CI->getCalledFunction();
3277   Type *ScalarRetTy = CI->getType();
3278   SmallVector<Type *, 4> Tys, ScalarTys;
3279   for (auto &ArgOp : CI->arg_operands())
3280     ScalarTys.push_back(ArgOp->getType());
3281 
3282   // Estimate cost of scalarized vector call. The source operands are assumed
3283   // to be vectors, so we need to extract individual elements from there,
3284   // execute VF scalar calls, and then gather the result into the vector return
3285   // value.
3286   unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys,
3287                                                  TTI::TCK_RecipThroughput);
3288   if (VF == 1)
3289     return ScalarCallCost;
3290 
3291   // Compute corresponding vector type for return value and arguments.
3292   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3293   for (Type *ScalarTy : ScalarTys)
3294     Tys.push_back(ToVectorTy(ScalarTy, VF));
3295 
3296   // Compute costs of unpacking argument values for the scalar calls and
3297   // packing the return values to a vector.
3298   unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
3299 
3300   unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
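  // For example (illustrative numbers), with VF = 4, a scalar call cost of 10
  // and a scalarization overhead of 6, the scalarized cost is 46; a vector
  // variant is used below only if its call cost beats that.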
3301 
3302   // If we can't emit a vector call for this function, then the currently found
3303   // cost is the cost we need to return.
3304   NeedToScalarize = true;
3305   VFShape Shape = VFShape::get(*CI, {VF, false}, false /*HasGlobalPred*/);
3306   Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3307 
3308   if (!TLI || CI->isNoBuiltin() || !VecFunc)
3309     return Cost;
3310 
3311   // If the corresponding vector cost is cheaper, return its cost.
3312   unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys,
3313                                                  TTI::TCK_RecipThroughput);
3314   if (VectorCallCost < Cost) {
3315     NeedToScalarize = false;
3316     return VectorCallCost;
3317   }
3318   return Cost;
3319 }
3320 
3321 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3322                                                             unsigned VF) {
3323   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3324   assert(ID && "Expected intrinsic call!");
3325 
3326   FastMathFlags FMF;
3327   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3328     FMF = FPMO->getFastMathFlags();
3329 
3330   SmallVector<Value *, 4> Operands(CI->arg_operands());
3331   return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF,
3332                                    TargetTransformInfo::TCK_RecipThroughput,
3333                                    CI);
3334 }
3335 
3336 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3337   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3338   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3339   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3340 }
3341 
3342 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3343   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3344   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3345   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3346 }
3347 
3348 void InnerLoopVectorizer::truncateToMinimalBitwidths() {
3349   // For every instruction `I` in MinBWs, truncate the operands, create a
3350   // truncated version of `I` and reextend its result. InstCombine runs
3351   // later and will remove any ext/trunc pairs.
3352   SmallPtrSet<Value *, 4> Erased;
3353   for (const auto &KV : Cost->getMinimalBitwidths()) {
3354     // If the value wasn't vectorized, we must maintain the original scalar
3355     // type. The absence of the value from VectorLoopValueMap indicates that it
3356     // wasn't vectorized.
3357     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3358       continue;
3359     for (unsigned Part = 0; Part < UF; ++Part) {
3360       Value *I = getOrCreateVectorValue(KV.first, Part);
3361       if (Erased.find(I) != Erased.end() || I->use_empty() ||
3362           !isa<Instruction>(I))
3363         continue;
3364       Type *OriginalTy = I->getType();
3365       Type *ScalarTruncatedTy =
3366           IntegerType::get(OriginalTy->getContext(), KV.second);
3367       Type *TruncatedTy = VectorType::get(
3368           ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getNumElements());
3369       if (TruncatedTy == OriginalTy)
3370         continue;
3371 
3372       IRBuilder<> B(cast<Instruction>(I));
3373       auto ShrinkOperand = [&](Value *V) -> Value * {
3374         if (auto *ZI = dyn_cast<ZExtInst>(V))
3375           if (ZI->getSrcTy() == TruncatedTy)
3376             return ZI->getOperand(0);
3377         return B.CreateZExtOrTrunc(V, TruncatedTy);
3378       };
3379 
3380       // The actual instruction modification depends on the instruction type,
3381       // unfortunately.
3382       Value *NewI = nullptr;
3383       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3384         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3385                              ShrinkOperand(BO->getOperand(1)));
3386 
3387         // Any wrapping introduced by shrinking this operation shouldn't be
3388         // considered undefined behavior. So, we can't unconditionally copy
3389         // arithmetic wrapping flags to NewI.
3390         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3391       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3392         NewI =
3393             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3394                          ShrinkOperand(CI->getOperand(1)));
3395       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3396         NewI = B.CreateSelect(SI->getCondition(),
3397                               ShrinkOperand(SI->getTrueValue()),
3398                               ShrinkOperand(SI->getFalseValue()));
3399       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3400         switch (CI->getOpcode()) {
3401         default:
3402           llvm_unreachable("Unhandled cast!");
3403         case Instruction::Trunc:
3404           NewI = ShrinkOperand(CI->getOperand(0));
3405           break;
3406         case Instruction::SExt:
3407           NewI = B.CreateSExtOrTrunc(
3408               CI->getOperand(0),
3409               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3410           break;
3411         case Instruction::ZExt:
3412           NewI = B.CreateZExtOrTrunc(
3413               CI->getOperand(0),
3414               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3415           break;
3416         }
3417       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3418         auto Elements0 =
3419             cast<VectorType>(SI->getOperand(0)->getType())->getNumElements();
3420         auto *O0 = B.CreateZExtOrTrunc(
3421             SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3422         auto Elements1 =
3423             cast<VectorType>(SI->getOperand(1)->getType())->getNumElements();
3424         auto *O1 = B.CreateZExtOrTrunc(
3425             SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3426 
3427         NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
3428       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3429         // Don't do anything with the operands, just extend the result.
3430         continue;
3431       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3432         auto Elements =
3433             cast<VectorType>(IE->getOperand(0)->getType())->getNumElements();
3434         auto *O0 = B.CreateZExtOrTrunc(
3435             IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3436         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3437         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3438       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3439         auto Elements =
3440             cast<VectorType>(EE->getOperand(0)->getType())->getNumElements();
3441         auto *O0 = B.CreateZExtOrTrunc(
3442             EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3443         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3444       } else {
3445         // If we don't know what to do, be conservative and don't do anything.
3446         continue;
3447       }
3448 
3449       // Lastly, extend the result.
3450       NewI->takeName(cast<Instruction>(I));
3451       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3452       I->replaceAllUsesWith(Res);
3453       cast<Instruction>(I)->eraseFromParent();
3454       Erased.insert(I);
3455       VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3456     }
3457   }
3458 
3459   // We'll have created a bunch of ZExts that are now parentless. Clean up.
3460   for (const auto &KV : Cost->getMinimalBitwidths()) {
3461     // If the value wasn't vectorized, we must maintain the original scalar
3462     // type. The absence of the value from VectorLoopValueMap indicates that it
3463     // wasn't vectorized.
3464     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3465       continue;
3466     for (unsigned Part = 0; Part < UF; ++Part) {
3467       Value *I = getOrCreateVectorValue(KV.first, Part);
3468       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3469       if (Inst && Inst->use_empty()) {
3470         Value *NewI = Inst->getOperand(0);
3471         Inst->eraseFromParent();
3472         VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3473       }
3474     }
3475   }
3476 }
3477 
3478 void InnerLoopVectorizer::fixVectorizedLoop() {
3479   // Insert truncates and extends for any truncated instructions as hints to
3480   // InstCombine.
3481   if (VF > 1)
3482     truncateToMinimalBitwidths();
3483 
3484   // Fix widened non-induction PHIs by setting up the PHI operands.
3485   if (OrigPHIsToFix.size()) {
3486     assert(EnableVPlanNativePath &&
3487            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3488     fixNonInductionPHIs();
3489   }
3490 
3491   // At this point every instruction in the original loop is widened to a
3492   // vector form. Now we need to fix the recurrences in the loop. These PHI
3493   // nodes are currently empty because we did not want to introduce cycles.
3494   // This is the second stage of vectorizing recurrences.
3495   fixCrossIterationPHIs();
3496 
3497   // Forget the original basic block.
3498   PSE.getSE()->forgetLoop(OrigLoop);
3499 
3500   // Fix-up external users of the induction variables.
3501   for (auto &Entry : Legal->getInductionVars())
3502     fixupIVUsers(Entry.first, Entry.second,
3503                  getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3504                  IVEndValues[Entry.first], LoopMiddleBlock);
3505 
3506   fixLCSSAPHIs();
3507   for (Instruction *PI : PredicatedInstructions)
3508     sinkScalarOperands(&*PI);
3509 
3510   // Remove redundant induction instructions.
3511   cse(LoopVectorBody);
3512 
3513   // Set/update profile weights for the vector and remainder loops as original
3514   // loop iterations are now distributed among them. Note that original loop
3515   // represented by LoopScalarBody becomes remainder loop after vectorization.
3516   //
  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
  // end up with slightly imprecise weights, but that should be OK since the
  // profile is not inherently precise anyway. Note also that a possible bypass
  // of the vector code due to legality checks is ignored, optimistically
  // assigning all the weight to the vector loop.
3522   setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody),
3523                                LI->getLoopFor(LoopVectorBody),
3524                                LI->getLoopFor(LoopScalarBody), VF * UF);
3525 }
3526 
3527 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3528   // In order to support recurrences we need to be able to vectorize Phi nodes.
3529   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3530   // stage #2: We now need to fix the recurrences by adding incoming edges to
3531   // the currently empty PHI nodes. At this point every instruction in the
3532   // original loop is widened to a vector form so we can use them to construct
3533   // the incoming edges.
3534   for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3535     // Handle first-order recurrences and reductions that need to be fixed.
3536     if (Legal->isFirstOrderRecurrence(&Phi))
3537       fixFirstOrderRecurrence(&Phi);
3538     else if (Legal->isReductionVariable(&Phi))
3539       fixReduction(&Phi);
3540   }
3541 }
3542 
3543 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3544   // This is the second phase of vectorizing first-order recurrences. An
3545   // overview of the transformation is described below. Suppose we have the
3546   // following loop.
3547   //
3548   //   for (int i = 0; i < n; ++i)
3549   //     b[i] = a[i] - a[i - 1];
3550   //
3551   // There is a first-order recurrence on "a". For this loop, the shorthand
3552   // scalar IR looks like:
3553   //
3554   //   scalar.ph:
3555   //     s_init = a[-1]
3556   //     br scalar.body
3557   //
3558   //   scalar.body:
3559   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3560   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3561   //     s2 = a[i]
3562   //     b[i] = s2 - s1
3563   //     br cond, scalar.body, ...
3564   //
  // In this example, s1 is a recurrence because its value depends on the
3566   // previous iteration. In the first phase of vectorization, we created a
3567   // temporary value for s1. We now complete the vectorization and produce the
3568   // shorthand vector IR shown below (for VF = 4, UF = 1).
3569   //
3570   //   vector.ph:
3571   //     v_init = vector(..., ..., ..., a[-1])
3572   //     br vector.body
3573   //
3574   //   vector.body
3575   //     i = phi [0, vector.ph], [i+4, vector.body]
3576   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3577   //     v2 = a[i, i+1, i+2, i+3];
3578   //     v3 = vector(v1(3), v2(0, 1, 2))
3579   //     b[i, i+1, i+2, i+3] = v2 - v3
3580   //     br cond, vector.body, middle.block
3581   //
3582   //   middle.block:
3583   //     x = v2(3)
3584   //     br scalar.ph
3585   //
3586   //   scalar.ph:
3587   //     s_init = phi [x, middle.block], [a[-1], otherwise]
3588   //     br scalar.body
3589   //
  // After the vector loop completes execution, we extract the next value of
3591   // the recurrence (x) to use as the initial value in the scalar loop.
3592 
3593   // Get the original loop preheader and single loop latch.
3594   auto *Preheader = OrigLoop->getLoopPreheader();
3595   auto *Latch = OrigLoop->getLoopLatch();
3596 
3597   // Get the initial and previous values of the scalar recurrence.
3598   auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3599   auto *Previous = Phi->getIncomingValueForBlock(Latch);
3600 
3601   // Create a vector from the initial value.
3602   auto *VectorInit = ScalarInit;
3603   if (VF > 1) {
3604     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3605     VectorInit = Builder.CreateInsertElement(
3606         UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
3607         Builder.getInt32(VF - 1), "vector.recur.init");
3608   }
3609 
3610   // We constructed a temporary phi node in the first phase of vectorization.
3611   // This phi node will eventually be deleted.
3612   Builder.SetInsertPoint(
3613       cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
3614 
3615   // Create a phi node for the new recurrence. The current value will either be
3616   // the initial value inserted into a vector or loop-varying vector value.
3617   auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
3618   VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
3619 
3620   // Get the vectorized previous value of the last part UF - 1. It appears last
3621   // among all unrolled iterations, due to the order of their construction.
3622   Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
3623 
3624   // Find and set the insertion point after the previous value if it is an
3625   // instruction.
3626   BasicBlock::iterator InsertPt;
3627   // Note that the previous value may have been constant-folded so it is not
3628   // guaranteed to be an instruction in the vector loop.
3629   // FIXME: Loop invariant values do not form recurrences. We should deal with
3630   //        them earlier.
3631   if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart))
3632     InsertPt = LoopVectorBody->getFirstInsertionPt();
3633   else {
3634     Instruction *PreviousInst = cast<Instruction>(PreviousLastPart);
3635     if (isa<PHINode>(PreviousLastPart))
3636       // If the previous value is a phi node, we should insert after all the phi
3637       // nodes in the block containing the PHI to avoid breaking basic block
3638       // verification. Note that the basic block may be different to
3639       // LoopVectorBody, in case we predicate the loop.
3640       InsertPt = PreviousInst->getParent()->getFirstInsertionPt();
3641     else
3642       InsertPt = ++PreviousInst->getIterator();
3643   }
3644   Builder.SetInsertPoint(&*InsertPt);
3645 
3646   // We will construct a vector for the recurrence by combining the values for
3647   // the current and previous iterations. This is the required shuffle mask.
3648   SmallVector<int, 8> ShuffleMask(VF);
3649   ShuffleMask[0] = VF - 1;
3650   for (unsigned I = 1; I < VF; ++I)
3651     ShuffleMask[I] = I + VF - 1;
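  // For instance, with VF = 4 the mask is <3, 4, 5, 6>: lane 0 takes the last
  // element of the incoming (previous) vector, and lanes 1-3 take the first
  // three elements of the current vector, matching the
  // v3 = vector(v1(3), v2(0, 1, 2)) step in the sketch above.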
3652 
3653   // The vector from which to take the initial value for the current iteration
3654   // (actual or unrolled). Initially, this is the vector phi node.
3655   Value *Incoming = VecPhi;
3656 
3657   // Shuffle the current and previous vector and update the vector parts.
3658   for (unsigned Part = 0; Part < UF; ++Part) {
3659     Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
3660     Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
3661     auto *Shuffle = VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
3662                                                          ShuffleMask)
3663                            : Incoming;
3664     PhiPart->replaceAllUsesWith(Shuffle);
3665     cast<Instruction>(PhiPart)->eraseFromParent();
3666     VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
3667     Incoming = PreviousPart;
3668   }
3669 
3670   // Fix the latch value of the new recurrence in the vector loop.
3671   VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3672 
3673   // Extract the last vector element in the middle block. This will be the
3674   // initial value for the recurrence when jumping to the scalar loop.
3675   auto *ExtractForScalar = Incoming;
3676   if (VF > 1) {
3677     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3678     ExtractForScalar = Builder.CreateExtractElement(
3679         ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
3680   }
3681   // Extract the second-to-last element in the middle block if the phi is used
3682   // outside the loop. We need the value of the phi itself, not the last
3683   // element (which is the phi's update in the final iteration). This is the
3684   // value used when jumping from LoopMiddleBlock to the exit block, i.e.,
3685   // when the scalar loop is not run at all.
3686   Value *ExtractForPhiUsedOutsideLoop = nullptr;
3687   if (VF > 1)
3688     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3689         Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
3690   // When the loop is unrolled without vectorizing, initialize
3691   // ExtractForPhiUsedOutsideLoop with the unrolled value just prior to
3692   // `Incoming`. This is analogous to the vectorized case above: extracting the
3693   // second-to-last element when VF > 1.
3694   else if (UF > 1)
3695     ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
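  // To illustrate, with VF = 4 the value handed to out-of-loop users is lane 2
  // (the second-to-last element) of the final vector, i.e. the scalar phi's
  // value in the last iteration rather than its update; with VF = 1 and
  // UF > 1 the analogous value is the part UF - 2 copy of the previous value.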
3696 
3697   // Fix the initial value of the original recurrence in the scalar loop.
3698   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3699   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3700   for (auto *BB : predecessors(LoopScalarPreHeader)) {
3701     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3702     Start->addIncoming(Incoming, BB);
3703   }
3704 
3705   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3706   Phi->setName("scalar.recur");
3707 
3708   // Finally, fix users of the recurrence outside the loop. The users will need
3709   // either the last value of the scalar recurrence or the last value of the
3710   // vector recurrence we extracted in the middle block. Since the loop is in
3711   // LCSSA form, we just need to find all the phi nodes for the original scalar
3712   // recurrence in the exit block, and then add an edge for the middle block.
3713   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3714     if (LCSSAPhi.getIncomingValue(0) == Phi) {
3715       LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3716     }
3717   }
3718 }
3719 
3720 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
3721   Constant *Zero = Builder.getInt32(0);
3722 
3723   // Get its reduction variable descriptor.
3724   assert(Legal->isReductionVariable(Phi) &&
3725          "Unable to find the reduction variable");
3726   RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
3727 
3728   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3729   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3730   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3731   RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3732     RdxDesc.getMinMaxRecurrenceKind();
3733   setDebugLocFromInst(Builder, ReductionStartValue);
3734 
3735   // We need to generate a reduction vector from the incoming scalar.
3736   // To do so, we need to generate the 'identity' vector and override
3737   // one of the elements with the incoming scalar reduction. We need
3738   // to do it in the vector-loop preheader.
3739   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3740 
3741   // This is the vector-clone of the value that leaves the loop.
3742   Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3743 
3744   // Find the reduction identity value: zero for addition, 'or' and 'xor';
3745   // one for multiplication; all ones (-1) for 'and'.
3746   Value *Identity;
3747   Value *VectorStart;
3748   if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3749       RK == RecurrenceDescriptor::RK_FloatMinMax) {
3750     // MinMax reductions have the start value as their identity.
3751     if (VF == 1) {
3752       VectorStart = Identity = ReductionStartValue;
3753     } else {
3754       VectorStart = Identity =
3755         Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3756     }
3757   } else {
3758     // Handle other reduction kinds:
3759     Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
3760         RK, VecTy->getScalarType());
3761     if (VF == 1) {
3762       Identity = Iden;
3763       // With VF == 1 there is no vector to build; the start value is simply
3764       // the incoming scalar reduction value.
3765       VectorStart = ReductionStartValue;
3766     } else {
3767       Identity = ConstantVector::getSplat({VF, false}, Iden);
3768 
3769       // This vector is the Identity vector where the first element is the
3770       // incoming scalar reduction.
3771       VectorStart =
3772         Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3773     }
3774   }
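  // As a concrete illustration, for an integer add reduction with VF = 4 and a
  // start value %rdx.start (the name is illustrative), Identity is
  // <0, 0, 0, 0> and VectorStart is <%rdx.start, 0, 0, 0>, so summing the
  // lanes of the final vector gives the same result as the scalar reduction.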
3775 
3776   // Wrap flags are in general invalid after vectorization, clear them.
3777   clearReductionWrapFlags(RdxDesc);
3778 
3779   // Fix the vector-loop phi.
3780 
3781   // Reductions do not have to start at zero. They can start with
3782   // any loop-invariant value.
3783   BasicBlock *Latch = OrigLoop->getLoopLatch();
3784   Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
3785 
3786   for (unsigned Part = 0; Part < UF; ++Part) {
3787     Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
3788     Value *Val = getOrCreateVectorValue(LoopVal, Part);
3789     // Make sure to add the reduction start value only to the
3790     // first unroll part.
3791     Value *StartVal = (Part == 0) ? VectorStart : Identity;
3792     cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
3793     cast<PHINode>(VecRdxPhi)
3794       ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3795   }
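  // For example, when UF = 2 the part-0 phi starts at VectorStart while the
  // part-1 phi starts at Identity, so the start value contributes to the
  // combined reduction exactly once.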
3796 
3797   // Move the insertion point right between the PHIs and the values we are
3798   // going to write.
3799   // This allows us to write both PHINodes and the extractelement
3800   // instructions.
3801   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3802 
3803   setDebugLocFromInst(Builder, LoopExitInst);
3804 
3805   // If the tail is folded by masking, the vector value to leave the loop should
3806   // be a select choosing between the vectorized LoopExitInst and the vectorized
3807   // Phi, instead of the LoopExitInst itself.
3808   if (Cost->foldTailByMasking()) {
3809     for (unsigned Part = 0; Part < UF; ++Part) {
3810       Value *VecLoopExitInst =
3811           VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3812       Value *Sel = nullptr;
3813       for (User *U : VecLoopExitInst->users()) {
3814         if (isa<SelectInst>(U)) {
3815           assert(!Sel && "Reduction exit feeding two selects");
3816           Sel = U;
3817         } else
3818           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
3819       }
3820       assert(Sel && "Reduction exit feeds no select");
3821       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
3822     }
3823   }
3824 
3825   // If the vector reduction can be performed in a smaller type, we truncate
3826   // then extend the loop exit value to enable InstCombine to evaluate the
3827   // entire expression in the smaller type.
3828   if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
3829     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3830     Builder.SetInsertPoint(
3831         LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
3832     VectorParts RdxParts(UF);
3833     for (unsigned Part = 0; Part < UF; ++Part) {
3834       RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3835       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3836       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3837                                         : Builder.CreateZExt(Trunc, VecTy);
3838       for (Value::user_iterator UI = RdxParts[Part]->user_begin();
3839            UI != RdxParts[Part]->user_end();)
3840         if (*UI != Trunc) {
3841           (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
3842           RdxParts[Part] = Extnd;
3843         } else {
3844           ++UI;
3845         }
3846     }
3847     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3848     for (unsigned Part = 0; Part < UF; ++Part) {
3849       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3850       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
3851     }
3852   }
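  // For instance, if the recurrence type is i8 while the phi type is i32, each
  // part is truncated to <VF x i8> and immediately re-extended to <VF x i32>
  // inside the loop, and the truncated <VF x i8> copies are what reach the
  // middle block, letting InstCombine evaluate the chain in i8.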
3853 
3854   // Reduce all of the unrolled parts into a single vector.
3855   Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
3856   unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
3857 
3858   // The middle block terminator has already been assigned a DebugLoc here (the
3859   // OrigLoop's single latch terminator). We want the whole middle block to
3860   // appear to execute on this line because: (a) it is all compiler generated,
3861   // (b) these instructions are always executed after evaluating the latch
3862   // conditional branch, and (c) other passes may add new predecessors which
3863   // terminate on this line. This is the easiest way to ensure we don't
3864   // accidentally cause an extra step back into the loop while debugging.
3865   setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
3866   for (unsigned Part = 1; Part < UF; ++Part) {
3867     Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3868     if (Op != Instruction::ICmp && Op != Instruction::FCmp)
3869       // Floating point operations had to be 'fast' to enable the reduction.
3870       ReducedPartRdx = addFastMathFlag(
3871           Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
3872                               ReducedPartRdx, "bin.rdx"),
3873           RdxDesc.getFastMathFlags());
3874     else
3875       ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
3876                                       RdxPart);
3877   }
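  // As an example, with UF = 2 and an integer add reduction the loop above
  // emits a single bin.rdx add combining part 1 into part 0; min/max
  // reductions go through createMinMaxOp instead. The result still has VF
  // lanes and is reduced to a scalar below when VF > 1.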
3878 
3879   if (VF > 1) {
3880     bool NoNaN = Legal->hasFunNoNaNAttr();
3881     ReducedPartRdx =
3882         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
3883     // If the reduction can be performed in a smaller type, we need to extend
3884     // the reduction to the wider type before we branch to the original loop.
3885     if (Phi->getType() != RdxDesc.getRecurrenceType())
3886       ReducedPartRdx =
3887         RdxDesc.isSigned()
3888         ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
3889         : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
3890   }
3891 
3892   // Create a phi node that merges control-flow from the backedge-taken check
3893   // block and the middle block.
3894   PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
3895                                         LoopScalarPreHeader->getTerminator());
3896   for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
3897     BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
3898   BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
3899 
3900   // Now, we need to fix the users of the reduction variable
3901   // inside and outside of the scalar remainder loop.
3902   // We know that the loop is in LCSSA form. We need to update the
3903   // PHI nodes in the exit blocks.
3904   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3905     // All PHINodes need to have a single entry edge, or two if
3906     // we already fixed them.
3907     assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
3908 
3909     // We found a reduction value exit-PHI. Update it with the
3910     // incoming bypass edge.
3911     if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
3912       LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
3913   } // end of the LCSSA phi scan.
3914 
3915   // Fix the scalar loop reduction variable with the incoming reduction sum
3916   // from the vector body and from the backedge value.
3917   int IncomingEdgeBlockIdx =
3918     Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
3919   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
3920   // Pick the other block.
3921   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
3922   Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
3923   Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
3924 }
3925 
3926 void InnerLoopVectorizer::clearReductionWrapFlags(
3927     RecurrenceDescriptor &RdxDesc) {
3928   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3929   if (RK != RecurrenceDescriptor::RK_IntegerAdd &&
3930       RK != RecurrenceDescriptor::RK_IntegerMult)
3931     return;
3932 
3933   Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
3934   assert(LoopExitInstr && "null loop exit instruction");
3935   SmallVector<Instruction *, 8> Worklist;
3936   SmallPtrSet<Instruction *, 8> Visited;
3937   Worklist.push_back(LoopExitInstr);
3938   Visited.insert(LoopExitInstr);
3939 
3940   while (!Worklist.empty()) {
3941     Instruction *Cur = Worklist.pop_back_val();
3942     if (isa<OverflowingBinaryOperator>(Cur))
3943       for (unsigned Part = 0; Part < UF; ++Part) {
3944         Value *V = getOrCreateVectorValue(Cur, Part);
3945         cast<Instruction>(V)->dropPoisonGeneratingFlags();
3946       }
3947 
3948     for (User *U : Cur->users()) {
3949       Instruction *UI = cast<Instruction>(U);
3950       if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
3951           Visited.insert(UI).second)
3952         Worklist.push_back(UI);
3953     }
3954   }
3955 }
3956 
3957 void InnerLoopVectorizer::fixLCSSAPHIs() {
3958   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3959     if (LCSSAPhi.getNumIncomingValues() == 1) {
3960       auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
3961       // Non-instruction incoming values have only one value, so lane zero is used.
3962       unsigned LastLane = 0;
3963       if (isa<Instruction>(IncomingValue))
3964           LastLane = Cost->isUniformAfterVectorization(
3965                          cast<Instruction>(IncomingValue), VF)
3966                          ? 0
3967                          : VF - 1;
3968       // Can be a loop invariant incoming value or the last scalar value to be
3969       // extracted from the vectorized loop.
3970       Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3971       Value *lastIncomingValue =
3972           getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
3973       LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
3974     }
3975   }
3976 }
3977 
3978 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
3979   // The basic block and loop containing the predicated instruction.
3980   auto *PredBB = PredInst->getParent();
3981   auto *VectorLoop = LI->getLoopFor(PredBB);
3982 
3983   // Initialize a worklist with the operands of the predicated instruction.
3984   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3985 
3986   // Holds instructions that we need to analyze again. An instruction may be
3987   // reanalyzed if we don't yet know if we can sink it or not.
3988   SmallVector<Instruction *, 8> InstsToReanalyze;
3989 
3990   // Returns true if a given use occurs in the predicated block. Phi nodes use
3991   // their operands in their corresponding predecessor blocks.
3992   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3993     auto *I = cast<Instruction>(U.getUser());
3994     BasicBlock *BB = I->getParent();
3995     if (auto *Phi = dyn_cast<PHINode>(I))
3996       BB = Phi->getIncomingBlock(
3997           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3998     return BB == PredBB;
3999   };
4000 
4001   // Iteratively sink the scalarized operands of the predicated instruction
4002   // into the block we created for it. When an instruction is sunk, its
4003   // operands are then added to the worklist. The algorithm ends when a pass
4004   // through the worklist fails to sink a single instruction.
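  //
  // As a simple illustration, if the predicated instruction is a scalarized
  // store, a scalar address computation (say, a GEP) used only by that store
  // is moved into the predicated block first; the GEP's own operands then
  // become sinking candidates on the next pass.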
4005   bool Changed;
4006   do {
4007     // Add the instructions that need to be reanalyzed to the worklist, and
4008     // reset the changed indicator.
4009     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4010     InstsToReanalyze.clear();
4011     Changed = false;
4012 
4013     while (!Worklist.empty()) {
4014       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4015 
4016       // We can't sink an instruction if it is a phi node, is already in the
4017       // predicated block, is not in the loop, or may have side effects.
4018       if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
4019           !VectorLoop->contains(I) || I->mayHaveSideEffects())
4020         continue;
4021 
4022       // It's legal to sink the instruction if all its uses occur in the
4023       // predicated block. Otherwise, there's nothing to do yet, and we may
4024       // need to reanalyze the instruction.
4025       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4026         InstsToReanalyze.push_back(I);
4027         continue;
4028       }
4029 
4030       // Move the instruction to the beginning of the predicated block, and add
4031       // its operands to the worklist.
4032       I->moveBefore(&*PredBB->getFirstInsertionPt());
4033       Worklist.insert(I->op_begin(), I->op_end());
4034 
4035       // The sinking may have enabled other instructions to be sunk, so we will
4036       // need to iterate.
4037       Changed = true;
4038     }
4039   } while (Changed);
4040 }
4041 
4042 void InnerLoopVectorizer::fixNonInductionPHIs() {
4043   for (PHINode *OrigPhi : OrigPHIsToFix) {
4044     PHINode *NewPhi =
4045         cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
4046     unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
4047 
4048     SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
4049         predecessors(OrigPhi->getParent()));
4050     SmallVector<BasicBlock *, 2> VectorBBPredecessors(
4051         predecessors(NewPhi->getParent()));
4052     assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
4053            "Scalar and Vector BB should have the same number of predecessors");
4054 
4055     // The insertion point in Builder may be invalidated by the time we get
4056     // here. Force the Builder insertion point to something valid so that we do
4057     // not run into issues during insertion point restore in
4058     // getOrCreateVectorValue calls below.
4059     Builder.SetInsertPoint(NewPhi);
4060 
4061     // The predecessor order is preserved and we can rely on mapping between
4062     // scalar and vector block predecessors.
4063     for (unsigned i = 0; i < NumIncomingValues; ++i) {
4064       BasicBlock *NewPredBB = VectorBBPredecessors[i];
4065 
4066       // When looking up the new scalar/vector values to fix up, use incoming
4067       // values from original phi.
4068       Value *ScIncV =
4069           OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
4070 
4071       // The scalar incoming value may need a broadcast.
4072       Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
4073       NewPhi->addIncoming(NewIncV, NewPredBB);
4074     }
4075   }
4076 }
4077 
4078 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, unsigned UF,
4079                                    unsigned VF, bool IsPtrLoopInvariant,
4080                                    SmallBitVector &IsIndexLoopInvariant) {
4081   // Construct a vector GEP by widening the operands of the scalar GEP as
4082   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4083   // results in a vector of pointers when at least one operand of the GEP
4084   // is vector-typed. Thus, to keep the representation compact, we only use
4085   // vector-typed operands for loop-varying values.
4086 
4087   if (VF > 1 && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
4088     // If we are vectorizing, but the GEP has only loop-invariant operands,
4089     // the GEP we build (by only using vector-typed operands for
4090     // loop-varying values) would be a scalar pointer. Thus, to ensure we
4091     // produce a vector of pointers, we need to either arbitrarily pick an
4092     // operand to broadcast, or broadcast a clone of the original GEP.
4093     // Here, we broadcast a clone of the original.
4094     //
4095     // TODO: If at some point we decide to scalarize instructions having
4096     //       loop-invariant operands, this special case will no longer be
4097     //       required. We would add the scalarization decision to
4098     //       collectLoopScalars() and teach getVectorValue() to broadcast
4099     //       the lane-zero scalar value.
4100     auto *Clone = Builder.Insert(GEP->clone());
4101     for (unsigned Part = 0; Part < UF; ++Part) {
4102       Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4103       VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart);
4104       addMetadata(EntryPart, GEP);
4105     }
4106   } else {
4107     // If the GEP has at least one loop-varying operand, we are sure to
4108     // produce a vector of pointers. But if we are only unrolling, we want
4109     // to produce a scalar GEP for each unroll part. Thus, the GEP we
4110     // produce with the code below will be scalar (if VF == 1) or vector
4111     // (otherwise). Note that for the unroll-only case, we still maintain
4112     // values in the vector mapping with initVector, as we do for other
4113     // instructions.
4114     for (unsigned Part = 0; Part < UF; ++Part) {
4115       // The pointer operand of the new GEP. If it's loop-invariant, we
4116       // won't broadcast it.
4117       auto *Ptr = IsPtrLoopInvariant
4118                       ? GEP->getPointerOperand()
4119                       : getOrCreateVectorValue(GEP->getPointerOperand(), Part);
4120 
4121       // Collect all the indices for the new GEP. If any index is
4122       // loop-invariant, we won't broadcast it.
4123       SmallVector<Value *, 4> Indices;
4124       for (auto Index : enumerate(GEP->indices())) {
4125         Value *User = Index.value().get();
4126         if (IsIndexLoopInvariant[Index.index()])
4127           Indices.push_back(User);
4128         else
4129           Indices.push_back(getOrCreateVectorValue(User, Part));
4130       }
4131 
4132       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4133       // but it should be a vector, otherwise.
4134       auto *NewGEP =
4135           GEP->isInBounds()
4136               ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4137                                           Indices)
4138               : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4139       assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
4140              "NewGEP is not a pointer vector");
4141       VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP);
4142       addMetadata(NewGEP, GEP);
4143     }
4144   }
4145 }
4146 
4147 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
4148                                               unsigned VF) {
4149   PHINode *P = cast<PHINode>(PN);
4150   if (EnableVPlanNativePath) {
4151     // Currently we enter here in the VPlan-native path for non-induction
4152     // PHIs where all control flow is uniform. We simply widen these PHIs.
4153     // Create a vector phi with no operands - the vector phi operands will be
4154     // set at the end of vector code generation.
4155     Type *VecTy =
4156         (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
4157     Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4158     VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
4159     OrigPHIsToFix.push_back(P);
4160 
4161     return;
4162   }
4163 
4164   assert(PN->getParent() == OrigLoop->getHeader() &&
4165          "Non-header phis should have been handled elsewhere");
4166 
4167   // In order to support recurrences we need to be able to vectorize Phi nodes.
4168   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4169   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4170   // this value when we vectorize all of the instructions that use the PHI.
4171   if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
4172     for (unsigned Part = 0; Part < UF; ++Part) {
4173       // This is phase one of vectorizing PHIs.
4174       Type *VecTy =
4175           (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
4176       Value *EntryPart = PHINode::Create(
4177           VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4178       VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
4179     }
4180     return;
4181   }
4182 
4183   setDebugLocFromInst(Builder, P);
4184 
4185   // This PHINode must be an induction variable.
4186   // Make sure that we know about it.
4187   assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4188 
4189   InductionDescriptor II = Legal->getInductionVars().lookup(P);
4190   const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4191 
4192   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4193   // which can be found from the original scalar operations.
4194   switch (II.getKind()) {
4195   case InductionDescriptor::IK_NoInduction:
4196     llvm_unreachable("Unknown induction");
4197   case InductionDescriptor::IK_IntInduction:
4198   case InductionDescriptor::IK_FpInduction:
4199     llvm_unreachable("Integer/fp induction is handled elsewhere.");
4200   case InductionDescriptor::IK_PtrInduction: {
4201     // Handle the pointer induction variable case.
4202     assert(P->getType()->isPointerTy() && "Unexpected type.");
4203     // This is the normalized GEP that starts counting at zero.
4204     Value *PtrInd = Induction;
4205     PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
4206     // Determine the number of scalars we need to generate for each unroll
4207     // iteration. If the instruction is uniform, we only need to generate the
4208     // first lane. Otherwise, we generate all VF values.
4209     unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
4210     // These are the scalar results. Notice that we don't generate vector GEPs
4211     // because scalar GEPs result in better code.
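    // For illustration, with VF = 4 and UF = 2 (and all lanes needed), part 1
    // lane 2 computes its next.gep from the transformed index PtrInd + 6
    // (i.e. Lane + Part * VF), so across both parts the scalar next.gep values
    // cover offsets 0 .. 7 from the current normalized induction PtrInd.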
4212     for (unsigned Part = 0; Part < UF; ++Part) {
4213       for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4214         Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
4215         Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4216         Value *SclrGep =
4217             emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
4218         SclrGep->setName("next.gep");
4219         VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
4220       }
4221     }
4222     return;
4223   }
4224   }
4225 }
4226 
4227 /// A helper function for checking whether an integer division-related
4228 /// instruction may divide by zero (in which case it must be predicated if
4229 /// executed conditionally in the scalar code).
4230 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
4231 /// Non-zero divisors that are not compile-time constants will not be
4232 /// converted into multiplication, so we will still end up scalarizing
4233 /// the division, but can do so without predication.
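/// For example, in scalar code like
///   %q = udiv i32 %x, 3      ; non-zero constant divisor
///   %r = udiv i32 %x, %y     ; divisor not a known constant
/// the first division needs no predication while the second one does, since
/// %y might be zero at run time.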
4234 static bool mayDivideByZero(Instruction &I) {
4235   assert((I.getOpcode() == Instruction::UDiv ||
4236           I.getOpcode() == Instruction::SDiv ||
4237           I.getOpcode() == Instruction::URem ||
4238           I.getOpcode() == Instruction::SRem) &&
4239          "Unexpected instruction");
4240   Value *Divisor = I.getOperand(1);
4241   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4242   return !CInt || CInt->isZero();
4243 }
4244 
4245 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPUser &User,
4246                                            VPTransformState &State) {
4247   switch (I.getOpcode()) {
4248   case Instruction::Call:
4249   case Instruction::Br:
4250   case Instruction::PHI:
4251   case Instruction::GetElementPtr:
4252   case Instruction::Select:
4253     llvm_unreachable("This instruction is handled by a different recipe.");
4254   case Instruction::UDiv:
4255   case Instruction::SDiv:
4256   case Instruction::SRem:
4257   case Instruction::URem:
4258   case Instruction::Add:
4259   case Instruction::FAdd:
4260   case Instruction::Sub:
4261   case Instruction::FSub:
4262   case Instruction::FNeg:
4263   case Instruction::Mul:
4264   case Instruction::FMul:
4265   case Instruction::FDiv:
4266   case Instruction::FRem:
4267   case Instruction::Shl:
4268   case Instruction::LShr:
4269   case Instruction::AShr:
4270   case Instruction::And:
4271   case Instruction::Or:
4272   case Instruction::Xor: {
4273     // Just widen unops and binops.
4274     setDebugLocFromInst(Builder, &I);
4275 
4276     for (unsigned Part = 0; Part < UF; ++Part) {
4277       SmallVector<Value *, 2> Ops;
4278       for (VPValue *VPOp : User.operands())
4279         Ops.push_back(State.get(VPOp, Part));
4280 
4281       Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4282 
4283       if (auto *VecOp = dyn_cast<Instruction>(V))
4284         VecOp->copyIRFlags(&I);
4285 
4286       // Use this vector value for all users of the original instruction.
4287       VectorLoopValueMap.setVectorValue(&I, Part, V);
4288       addMetadata(V, &I);
4289     }
4290 
4291     break;
4292   }
4293   case Instruction::ICmp:
4294   case Instruction::FCmp: {
4295     // Widen compares. Generate vector compares.
4296     bool FCmp = (I.getOpcode() == Instruction::FCmp);
4297     auto *Cmp = cast<CmpInst>(&I);
4298     setDebugLocFromInst(Builder, Cmp);
4299     for (unsigned Part = 0; Part < UF; ++Part) {
4300       Value *A = State.get(User.getOperand(0), Part);
4301       Value *B = State.get(User.getOperand(1), Part);
4302       Value *C = nullptr;
4303       if (FCmp) {
4304         // Propagate fast math flags.
4305         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4306         Builder.setFastMathFlags(Cmp->getFastMathFlags());
4307         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4308       } else {
4309         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4310       }
4311       VectorLoopValueMap.setVectorValue(&I, Part, C);
4312       addMetadata(C, &I);
4313     }
4314 
4315     break;
4316   }
4317 
4318   case Instruction::ZExt:
4319   case Instruction::SExt:
4320   case Instruction::FPToUI:
4321   case Instruction::FPToSI:
4322   case Instruction::FPExt:
4323   case Instruction::PtrToInt:
4324   case Instruction::IntToPtr:
4325   case Instruction::SIToFP:
4326   case Instruction::UIToFP:
4327   case Instruction::Trunc:
4328   case Instruction::FPTrunc:
4329   case Instruction::BitCast: {
4330     auto *CI = cast<CastInst>(&I);
4331     setDebugLocFromInst(Builder, CI);
4332 
4333     /// Vectorize casts.
4334     Type *DestTy =
4335         (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
4336 
4337     for (unsigned Part = 0; Part < UF; ++Part) {
4338       Value *A = State.get(User.getOperand(0), Part);
4339       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4340       VectorLoopValueMap.setVectorValue(&I, Part, Cast);
4341       addMetadata(Cast, &I);
4342     }
4343     break;
4344   }
4345   default:
4346     // This instruction is not vectorized by simple widening.
4347     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4348     llvm_unreachable("Unhandled instruction!");
4349   } // end of switch.
4350 }
4351 
4352 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands,
4353                                                VPTransformState &State) {
4354   assert(!isa<DbgInfoIntrinsic>(I) &&
4355          "DbgInfoIntrinsic should have been dropped during VPlan construction");
4356   setDebugLocFromInst(Builder, &I);
4357 
4358   Module *M = I.getParent()->getParent()->getParent();
4359   auto *CI = cast<CallInst>(&I);
4360 
4361   SmallVector<Type *, 4> Tys;
4362   for (Value *ArgOperand : CI->arg_operands())
4363     Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
4364 
4365   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4366 
4367   // This flag shows whether we use an intrinsic or a plain call for the
4368   // vectorized version of the instruction, i.e. whether an intrinsic call is
4369   // more beneficial than a library call.
4370   bool NeedToScalarize = false;
4371   unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4372   bool UseVectorIntrinsic =
4373       ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
4374   assert((UseVectorIntrinsic || !NeedToScalarize) &&
4375          "Instruction should be scalarized elsewhere.");
4376 
4377   for (unsigned Part = 0; Part < UF; ++Part) {
4378     SmallVector<Value *, 4> Args;
4379     for (auto &I : enumerate(ArgOperands.operands())) {
4380       // Some intrinsics have a scalar argument - don't replace it with a
4381       // vector.
4382       Value *Arg;
4383       if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
4384         Arg = State.get(I.value(), Part);
4385       else
4386         Arg = State.get(I.value(), {0, 0});
4387       Args.push_back(Arg);
4388     }
4389 
4390     Function *VectorF;
4391     if (UseVectorIntrinsic) {
4392       // Use vector version of the intrinsic.
4393       Type *TysForDecl[] = {CI->getType()};
4394       if (VF > 1)
4395         TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4396       VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4397       assert(VectorF && "Can't retrieve vector intrinsic.");
4398     } else {
4399       // Use vector version of the function call.
4400       const VFShape Shape =
4401           VFShape::get(*CI, {VF, false} /*EC*/, false /*HasGlobalPred*/);
4402 #ifndef NDEBUG
4403       assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
4404              "Can't create vector function.");
4405 #endif
4406       VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
4407     }
4408     SmallVector<OperandBundleDef, 1> OpBundles;
4409     CI->getOperandBundlesAsDefs(OpBundles);
4410     CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
4411
4412     if (isa<FPMathOperator>(V))
4413       V->copyFastMathFlags(CI);
4414
4415     VectorLoopValueMap.setVectorValue(&I, Part, V);
4416     addMetadata(V, &I);
4417   }
4418 }
4419 
4420 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I,
4421                                                  bool InvariantCond) {
4422   setDebugLocFromInst(Builder, &I);
4423 
4424   // The condition can be loop invariant but still defined inside the
4425   // loop. This means that we can't just use the original 'cond' value.
4426   // We have to take the 'vectorized' value and pick the first lane.
4427   // Instcombine will make this a no-op.
4428 
4429   auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0});
4430 
4431   for (unsigned Part = 0; Part < UF; ++Part) {
4432     Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part);
4433     Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part);
4434     Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part);
4435     Value *Sel =
4436         Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1);
4437     VectorLoopValueMap.setVectorValue(&I, Part, Sel);
4438     addMetadata(Sel, &I);
4439   }
4440 }
4441 
4442 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
4443   // We should not collect Scalars more than once per VF. Right now, this
4444   // function is called from collectUniformsAndScalars(), which already does
4445   // this check. Collecting Scalars for VF=1 does not make any sense.
4446   assert(VF >= 2 && Scalars.find(VF) == Scalars.end() &&
4447          "This function should not be visited twice for the same VF");
4448 
4449   SmallSetVector<Instruction *, 8> Worklist;
4450 
4451   // These sets are used to seed the analysis with pointers used by memory
4452   // accesses that will remain scalar.
4453   SmallSetVector<Instruction *, 8> ScalarPtrs;
4454   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4455 
4456   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4457   // The pointer operands of loads and stores will be scalar as long as the
4458   // memory access is not a gather or scatter operation. The value operand of a
4459   // store will remain scalar if the store is scalarized.
4460   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4461     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4462     assert(WideningDecision != CM_Unknown &&
4463            "Widening decision should be ready at this moment");
4464     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4465       if (Ptr == Store->getValueOperand())
4466         return WideningDecision == CM_Scalarize;
4467     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4468            "Ptr is neither a value nor a pointer operand");
4469     return WideningDecision != CM_GatherScatter;
4470   };
4471 
4472   // A helper that returns true if the given value is a bitcast or
4473   // getelementptr instruction contained in the loop.
4474   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4475     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4476             isa<GetElementPtrInst>(V)) &&
4477            !TheLoop->isLoopInvariant(V);
4478   };
4479 
4480   // A helper that evaluates a memory access's use of a pointer. If the use
4481   // will be a scalar use, and the pointer is only used by memory accesses, we
4482   // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4483   // PossibleNonScalarPtrs.
4484   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4485     // We only care about bitcast and getelementptr instructions contained in
4486     // the loop.
4487     if (!isLoopVaryingBitCastOrGEP(Ptr))
4488       return;
4489 
4490     // If the pointer has already been identified as scalar (e.g., if it was
4491     // also identified as uniform), there's nothing to do.
4492     auto *I = cast<Instruction>(Ptr);
4493     if (Worklist.count(I))
4494       return;
4495 
4496     // If the use of the pointer will be a scalar use, and all users of the
4497     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4498     // place the pointer in PossibleNonScalarPtrs.
4499     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4500           return isa<LoadInst>(U) || isa<StoreInst>(U);
4501         }))
4502       ScalarPtrs.insert(I);
4503     else
4504       PossibleNonScalarPtrs.insert(I);
4505   };
4506 
4507   // We seed the scalars analysis with three classes of instructions: (1)
4508   // instructions marked uniform-after-vectorization, (2) bitcast and
4509   // getelementptr instructions used by memory accesses requiring a scalar use,
4510   // and (3) pointer induction variables and their update instructions (we
4511   // currently only scalarize these).
4512   //
4513   // (1) Add to the worklist all instructions that have been identified as
4514   // uniform-after-vectorization.
4515   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4516 
4517   // (2) Add to the worklist all bitcast and getelementptr instructions used by
4518   // memory accesses requiring a scalar use. The pointer operands of loads and
4519   // stores will be scalar as long as the memory access is not a gather or
4520   // scatter operation. The value operand of a store will remain scalar if the
4521   // store is scalarized.
4522   for (auto *BB : TheLoop->blocks())
4523     for (auto &I : *BB) {
4524       if (auto *Load = dyn_cast<LoadInst>(&I)) {
4525         evaluatePtrUse(Load, Load->getPointerOperand());
4526       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4527         evaluatePtrUse(Store, Store->getPointerOperand());
4528         evaluatePtrUse(Store, Store->getValueOperand());
4529       }
4530     }
4531   for (auto *I : ScalarPtrs)
4532     if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) {
4533       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4534       Worklist.insert(I);
4535     }
4536 
4537   // (3) Add to the worklist all pointer induction variables and their update
4538   // instructions.
4539   //
4540   // TODO: Once we are able to vectorize pointer induction variables we should
4541   //       no longer insert them into the worklist here.
4542   auto *Latch = TheLoop->getLoopLatch();
4543   for (auto &Induction : Legal->getInductionVars()) {
4544     auto *Ind = Induction.first;
4545     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4546     if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
4547       continue;
4548     Worklist.insert(Ind);
4549     Worklist.insert(IndUpdate);
4550     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4551     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4552                       << "\n");
4553   }
4554 
4555   // Insert the forced scalars.
4556   // FIXME: Currently widenPHIInstruction() often creates a dead vector
4557   // induction variable when the PHI user is scalarized.
4558   auto ForcedScalar = ForcedScalars.find(VF);
4559   if (ForcedScalar != ForcedScalars.end())
4560     for (auto *I : ForcedScalar->second)
4561       Worklist.insert(I);
4562 
4563   // Expand the worklist by looking through any bitcasts and getelementptr
4564   // instructions we've already identified as scalar. This is similar to the
4565   // expansion step in collectLoopUniforms(); however, here we're only
4566   // expanding to include additional bitcasts and getelementptr instructions.
4567   unsigned Idx = 0;
4568   while (Idx != Worklist.size()) {
4569     Instruction *Dst = Worklist[Idx++];
4570     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4571       continue;
4572     auto *Src = cast<Instruction>(Dst->getOperand(0));
4573     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4574           auto *J = cast<Instruction>(U);
4575           return !TheLoop->contains(J) || Worklist.count(J) ||
4576                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4577                   isScalarUse(J, Src));
4578         })) {
4579       Worklist.insert(Src);
4580       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4581     }
4582   }
4583 
4584   // An induction variable will remain scalar if all users of the induction
4585   // variable and induction variable update remain scalar.
4586   for (auto &Induction : Legal->getInductionVars()) {
4587     auto *Ind = Induction.first;
4588     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4589 
4590     // We already considered pointer induction variables, so there's no reason
4591     // to look at their users again.
4592     //
4593     // TODO: Once we are able to vectorize pointer induction variables we
4594     //       should no longer skip over them here.
4595     if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
4596       continue;
4597 
4598     // If tail-folding is applied, the primary induction variable will be used
4599     // to feed a vector compare.
4600     if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
4601       continue;
4602 
4603     // Determine if all users of the induction variable are scalar after
4604     // vectorization.
4605     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4606       auto *I = cast<Instruction>(U);
4607       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
4608     });
4609     if (!ScalarInd)
4610       continue;
4611 
4612     // Determine if all users of the induction variable update instruction are
4613     // scalar after vectorization.
4614     auto ScalarIndUpdate =
4615         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4616           auto *I = cast<Instruction>(U);
4617           return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
4618         });
4619     if (!ScalarIndUpdate)
4620       continue;
4621 
4622     // The induction variable and its update instruction will remain scalar.
4623     Worklist.insert(Ind);
4624     Worklist.insert(IndUpdate);
4625     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4626     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4627                       << "\n");
4628   }
4629 
4630   Scalars[VF].insert(Worklist.begin(), Worklist.end());
4631 }
4632 
4633 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) {
4634   if (!blockNeedsPredication(I->getParent()))
4635     return false;
4636   switch(I->getOpcode()) {
4637   default:
4638     break;
4639   case Instruction::Load:
4640   case Instruction::Store: {
4641     if (!Legal->isMaskRequired(I))
4642       return false;
4643     auto *Ptr = getLoadStorePointerOperand(I);
4644     auto *Ty = getMemInstValueType(I);
4645     // We have already decided how to vectorize this instruction, get that
4646     // result.
4647     if (VF > 1) {
4648       InstWidening WideningDecision = getWideningDecision(I, VF);
4649       assert(WideningDecision != CM_Unknown &&
4650              "Widening decision should be ready at this moment");
4651       return WideningDecision == CM_Scalarize;
4652     }
4653     const MaybeAlign Alignment = getLoadStoreAlignment(I);
4654     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
4655                                 isLegalMaskedGather(Ty, Alignment))
4656                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
4657                                 isLegalMaskedScatter(Ty, Alignment));
4658   }
4659   case Instruction::UDiv:
4660   case Instruction::SDiv:
4661   case Instruction::SRem:
4662   case Instruction::URem:
4663     return mayDivideByZero(*I);
4664   }
4665   return false;
4666 }
4667 
4668 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
4669                                                                unsigned VF) {
4670   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4671   assert(getWideningDecision(I, VF) == CM_Unknown &&
4672          "Decision should not be set yet.");
4673   auto *Group = getInterleavedAccessGroup(I);
4674   assert(Group && "Must have a group.");
4675 
4676   // If the instruction's allocated size doesn't equal its type size, it
4677   // requires padding and will be scalarized.
4678   auto &DL = I->getModule()->getDataLayout();
4679   auto *ScalarTy = getMemInstValueType(I);
4680   if (hasIrregularType(ScalarTy, DL, VF))
4681     return false;
4682 
4683   // Check if masking is required.
4684   // A Group may need masking for one of two reasons: it resides in a block that
4685   // needs predication, or it was decided to use masking to deal with gaps.
4686   bool PredicatedAccessRequiresMasking =
4687       Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
4688   bool AccessWithGapsRequiresMasking =
4689       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
4690   if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
4691     return true;
4692 
4693   // If masked interleaving is required, we expect that the user/target had
4694   // enabled it, because otherwise it either wouldn't have been created or
4695   // it should have been invalidated by the CostModel.
4696   assert(useMaskedInterleavedAccesses(TTI) &&
4697          "Masked interleave-groups for predicated accesses are not enabled.");
4698 
4699   auto *Ty = getMemInstValueType(I);
4700   const MaybeAlign Alignment = getLoadStoreAlignment(I);
4701   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4702                           : TTI.isLegalMaskedStore(Ty, Alignment);
4703 }
4704 
4705 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
4706                                                                unsigned VF) {
4707   // Get and ensure we have a valid memory instruction.
4708   LoadInst *LI = dyn_cast<LoadInst>(I);
4709   StoreInst *SI = dyn_cast<StoreInst>(I);
4710   assert((LI || SI) && "Invalid memory instruction");
4711 
4712   auto *Ptr = getLoadStorePointerOperand(I);
4713 
4714   // In order to be widened, the pointer should be consecutive, first of all.
4715   if (!Legal->isConsecutivePtr(Ptr))
4716     return false;
4717 
4718   // If the instruction is a store located in a predicated block, it will be
4719   // scalarized.
4720   if (isScalarWithPredication(I))
4721     return false;
4722 
4723   // If the instruction's allocated size doesn't equal its type size, it
4724   // requires padding and will be scalarized.
4725   auto &DL = I->getModule()->getDataLayout();
4726   auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
4727   if (hasIrregularType(ScalarTy, DL, VF))
4728     return false;
4729 
4730   return true;
4731 }
4732 
4733 void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
4734   // We should not collect Uniforms more than once per VF. Right now,
4735   // this function is called from collectUniformsAndScalars(), which
4736   // already does this check. Collecting Uniforms for VF=1 does not make any
4737   // sense.
4738 
4739   assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() &&
4740          "This function should not be visited twice for the same VF");
4741 
4742   // Initialize the entry for this VF. Even if we find no uniform value, the
4743   // entry ensures we do not analyze this VF again: Uniforms.count(VF) returns 1.
4744   Uniforms[VF].clear();
4745 
4746   // We now know that the loop is vectorizable!
4747   // Collect instructions inside the loop that will remain uniform after
4748   // vectorization.
4749 
4750   // Global values, params and instructions outside of the current loop are
4751   // out of scope.
4752   auto isOutOfScope = [&](Value *V) -> bool {
4753     Instruction *I = dyn_cast<Instruction>(V);
4754     return (!I || !TheLoop->contains(I));
4755   };
4756 
4757   SetVector<Instruction *> Worklist;
4758   BasicBlock *Latch = TheLoop->getLoopLatch();
4759 
4760   // Instructions that are scalar with predication must not be considered
4761   // uniform after vectorization, because that would create an erroneous
4762   // replicating region where only a single instance out of VF should be formed.
4763   // TODO: optimize such seldom cases if found important, see PR40816.
4764   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
4765     if (isScalarWithPredication(I, VF)) {
4766       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
4767                         << *I << "\n");
4768       return;
4769     }
4770     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
4771     Worklist.insert(I);
4772   };
4773 
4774   // Start with the conditional branch. If the branch condition is an
4775   // instruction contained in the loop that is only used by the branch, it is
4776   // uniform.
4777   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4778   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
4779     addToWorklistIfAllowed(Cmp);
4780 
4781   // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
4782   // are pointers that are treated like consecutive pointers during
4783   // vectorization. The pointer operands of interleaved accesses are an
4784   // example.
4785   SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;
4786 
4787   // Holds pointer operands of instructions that are possibly non-uniform.
4788   SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;
4789 
4790   auto isUniformDecision = [&](Instruction *I, unsigned VF) {
4791     InstWidening WideningDecision = getWideningDecision(I, VF);
4792     assert(WideningDecision != CM_Unknown &&
4793            "Widening decision should be ready at this moment");
4794 
4795     return (WideningDecision == CM_Widen ||
4796             WideningDecision == CM_Widen_Reverse ||
4797             WideningDecision == CM_Interleave);
4798   };
4799   // Iterate over the instructions in the loop, and collect all
4800   // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
4801   // that a consecutive-like pointer operand will be scalarized, we collect it
4802   // in PossibleNonUniformPtrs instead. We use two sets here because a single
4803   // getelementptr instruction can be used by both vectorized and scalarized
4804   // memory instructions. For example, if a loop loads and stores from the same
4805   // location, but the store is conditional, the store will be scalarized, and
4806   // the getelementptr won't remain uniform.
4807   for (auto *BB : TheLoop->blocks())
4808     for (auto &I : *BB) {
4809       // If there's no pointer operand, there's nothing to do.
4810       auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
4811       if (!Ptr)
4812         continue;
4813 
4814       // True if all users of Ptr are memory accesses that have Ptr as their
4815       // pointer operand.
4816       auto UsersAreMemAccesses =
4817           llvm::all_of(Ptr->users(), [&](User *U) -> bool {
4818             return getLoadStorePointerOperand(U) == Ptr;
4819           });
4820 
4821       // Ensure the memory instruction will not be scalarized or used by
4822       // gather/scatter, making its pointer operand non-uniform. If the pointer
4823       // operand is used by any instruction other than a memory access, we
4824       // conservatively assume the pointer operand may be non-uniform.
4825       if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
4826         PossibleNonUniformPtrs.insert(Ptr);
4827 
4828       // If the memory instruction will be vectorized and its pointer operand
4829       // is consecutive-like or part of an interleave group, the pointer
4830       // operand should remain uniform.
4831       else
4832         ConsecutiveLikePtrs.insert(Ptr);
4833     }
4834 
4835   // Add to the Worklist all consecutive and consecutive-like pointers that
4836   // aren't also identified as possibly non-uniform.
4837   for (auto *V : ConsecutiveLikePtrs)
4838     if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end())
4839       addToWorklistIfAllowed(V);
4840 
  // Expand Worklist in topological order: whenever a new instruction
  // is added, its users should already be inside the Worklist. This ensures
  // that a uniform instruction will only be used by uniform instructions.
4844   unsigned idx = 0;
4845   while (idx != Worklist.size()) {
4846     Instruction *I = Worklist[idx++];
4847 
4848     for (auto OV : I->operand_values()) {
4849       // isOutOfScope operands cannot be uniform instructions.
4850       if (isOutOfScope(OV))
4851         continue;
      // First-order recurrence phis should typically be considered
      // non-uniform.
4854       auto *OP = dyn_cast<PHINode>(OV);
4855       if (OP && Legal->isFirstOrderRecurrence(OP))
4856         continue;
4857       // If all the users of the operand are uniform, then add the
4858       // operand into the uniform worklist.
4859       auto *OI = cast<Instruction>(OV);
4860       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4861             auto *J = cast<Instruction>(U);
4862             return Worklist.count(J) ||
4863                    (OI == getLoadStorePointerOperand(J) &&
4864                     isUniformDecision(J, VF));
4865           }))
4866         addToWorklistIfAllowed(OI);
4867     }
4868   }
4869 
4870   // Returns true if Ptr is the pointer operand of a memory access instruction
4871   // I, and I is known to not require scalarization.
4872   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4873     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
4874   };
4875 
4876   // For an instruction to be added into Worklist above, all its users inside
4877   // the loop should also be in Worklist. However, this condition cannot be
4878   // true for phi nodes that form a cyclic dependence. We must process phi
4879   // nodes separately. An induction variable will remain uniform if all users
4880   // of the induction variable and induction variable update remain uniform.
4881   // The code below handles both pointer and non-pointer induction variables.
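  // For illustration (hypothetical IR): with an induction phi %i and its
  // update %i.next = add i64 %i, 1, if %i only feeds the address of a
  // consecutively-widened access and %i.next, and %i.next only feeds %i and
  // the latch compare, then both %i and %i.next remain uniform.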
4882   for (auto &Induction : Legal->getInductionVars()) {
4883     auto *Ind = Induction.first;
4884     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4885 
4886     // Determine if all users of the induction variable are uniform after
4887     // vectorization.
4888     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4889       auto *I = cast<Instruction>(U);
4890       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4891              isVectorizedMemAccessUse(I, Ind);
4892     });
4893     if (!UniformInd)
4894       continue;
4895 
4896     // Determine if all users of the induction variable update instruction are
4897     // uniform after vectorization.
4898     auto UniformIndUpdate =
4899         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4900           auto *I = cast<Instruction>(U);
4901           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4902                  isVectorizedMemAccessUse(I, IndUpdate);
4903         });
4904     if (!UniformIndUpdate)
4905       continue;
4906 
4907     // The induction variable and its update instruction will remain uniform.
4908     addToWorklistIfAllowed(Ind);
4909     addToWorklistIfAllowed(IndUpdate);
4910   }
4911 
4912   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4913 }
4914 
4915 bool LoopVectorizationCostModel::runtimeChecksRequired() {
4916   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4917 
4918   if (Legal->getRuntimePointerChecking()->Need) {
4919     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4920         "runtime pointer checks needed. Enable vectorization of this "
4921         "loop with '#pragma clang loop vectorize(enable)' when "
4922         "compiling with -Os/-Oz",
4923         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4924     return true;
4925   }
4926 
4927   if (!PSE.getUnionPredicate().getPredicates().empty()) {
4928     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4929         "runtime SCEV checks needed. Enable vectorization of this "
4930         "loop with '#pragma clang loop vectorize(enable)' when "
4931         "compiling with -Os/-Oz",
4932         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4933     return true;
4934   }
4935 
4936   // FIXME: Avoid specializing for stride==1 instead of bailing out.
4937   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4938     reportVectorizationFailure("Runtime stride check is required with -Os/-Oz",
4939         "runtime stride == 1 checks needed. Enable vectorization of "
4940         "this loop with '#pragma clang loop vectorize(enable)' when "
4941         "compiling with -Os/-Oz",
4942         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4943     return true;
4944   }
4945 
4946   return false;
4947 }
4948 
4949 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {
4950   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do this, since the check is still likely to
    // be dynamically uniform if the target can skip it.
4953     reportVectorizationFailure(
4954         "Not inserting runtime ptr check for divergent target",
4955         "runtime pointer checks needed. Not enabled for divergent target",
4956         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4957     return None;
4958   }
4959 
4960   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4961   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4962   if (TC == 1) {
4963     reportVectorizationFailure("Single iteration (non) loop",
4964         "loop trip count is one, irrelevant for vectorization",
4965         "SingleIterationLoop", ORE, TheLoop);
4966     return None;
4967   }
4968 
4969   switch (ScalarEpilogueStatus) {
4970   case CM_ScalarEpilogueAllowed:
4971     return computeFeasibleMaxVF(TC);
4972   case CM_ScalarEpilogueNotNeededUsePredicate:
4973     LLVM_DEBUG(
4974         dbgs() << "LV: vector predicate hint/switch found.\n"
4975                << "LV: Not allowing scalar epilogue, creating predicated "
4976                << "vector loop.\n");
4977     break;
4978   case CM_ScalarEpilogueNotAllowedLowTripLoop:
4979     // fallthrough as a special case of OptForSize
4980   case CM_ScalarEpilogueNotAllowedOptSize:
4981     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
4982       LLVM_DEBUG(
4983           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4984     else
4985       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
4986                         << "count.\n");
4987 
4988     // Bail if runtime checks are required, which are not good when optimising
4989     // for size.
4990     if (runtimeChecksRequired())
4991       return None;
4992     break;
4993   }
4994 
  // Now try folding the tail by masking.
4996 
4997   // Invalidate interleave groups that require an epilogue if we can't mask
4998   // the interleave-group.
4999   if (!useMaskedInterleavedAccesses(TTI)) {
5000     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5001            "No decisions should have been taken at this point");
    // Note: There is no need to invalidate any cost modeling decisions here,
    // as none were taken so far.
5004     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5005   }
5006 
5007   unsigned MaxVF = computeFeasibleMaxVF(TC);
5008   if (TC > 0 && TC % MaxVF == 0) {
5009     // Accept MaxVF if we do not have a tail.
5010     LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5011     return MaxVF;
5012   }
5013 
5014   // If we don't know the precise trip count, or if the trip count that we
5015   // found modulo the vectorization factor is not zero, try to fold the tail
5016   // by masking.
5017   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5018   if (Legal->prepareToFoldTailByMasking()) {
5019     FoldTailByMasking = true;
5020     return MaxVF;
5021   }
5022 
5023   if (TC == 0) {
5024     reportVectorizationFailure(
5025         "Unable to calculate the loop count due to complex control flow",
5026         "unable to calculate the loop count due to complex control flow",
5027         "UnknownLoopCountComplexCFG", ORE, TheLoop);
5028     return None;
5029   }
5030 
5031   reportVectorizationFailure(
5032       "Cannot optimize for size and vectorize at the same time.",
5033       "cannot optimize for size and vectorize at the same time. "
5034       "Enable vectorization of this loop with '#pragma clang loop "
5035       "vectorize(enable)' when compiling with -Os/-Oz",
5036       "NoTailLoopWithOptForSize", ORE, TheLoop);
5037   return None;
5038 }
5039 
5040 unsigned
5041 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
5042   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
5043   unsigned SmallestType, WidestType;
5044   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
5045   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
5046 
5047   // Get the maximum safe dependence distance in bits computed by LAA.
  // It is computed as MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory access that is most restrictive (involved in the smallest
5050   // dependence distance).
5051   unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();
5052 
5053   WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
5054 
5055   unsigned MaxVectorSize = WidestRegister / WidestType;
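  // For illustration (hypothetical values): a 16-byte maximum safe dependence
  // distance caps MaxSafeRegisterWidth at 128 bits, and with a widest loop
  // type of 32 bits this gives MaxVectorSize = 128 / 32 = 4.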
5056 
5057   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5058                     << " / " << WidestType << " bits.\n");
5059   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5060                     << WidestRegister << " bits.\n");
5061 
5062   assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"
5063                                  " into one vector!");
5064   if (MaxVectorSize == 0) {
5065     LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
5066     MaxVectorSize = 1;
5067     return MaxVectorSize;
5068   } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
5069              isPowerOf2_32(ConstTripCount)) {
    // We need to clamp the VF to ConstTripCount. There is no point in
    // choosing a higher viable VF as done in the loop below.
5072     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
5073                       << ConstTripCount << "\n");
5074     MaxVectorSize = ConstTripCount;
5075     return MaxVectorSize;
5076   }
5077 
5078   unsigned MaxVF = MaxVectorSize;
5079   if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
5080       (MaximizeBandwidth && isScalarEpilogueAllowed())) {
5081     // Collect all viable vectorization factors larger than the default MaxVF
5082     // (i.e. MaxVectorSize).
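    // For illustration (hypothetical target): with 256-bit registers, an i64
    // widest type and an i8 smallest type, MaxVectorSize is 4, and the
    // candidate VFs collected here are 8, 16 and 32.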
5083     SmallVector<unsigned, 8> VFs;
5084     unsigned NewMaxVectorSize = WidestRegister / SmallestType;
5085     for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
5086       VFs.push_back(VS);
5087 
5088     // For each VF calculate its register usage.
5089     auto RUs = calculateRegisterUsage(VFs);
5090 
5091     // Select the largest VF which doesn't require more registers than existing
5092     // ones.
5093     for (int i = RUs.size() - 1; i >= 0; --i) {
5094       bool Selected = true;
5095       for (auto& pair : RUs[i].MaxLocalUsers) {
5096         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5097         if (pair.second > TargetNumRegisters)
5098           Selected = false;
5099       }
5100       if (Selected) {
5101         MaxVF = VFs[i];
5102         break;
5103       }
5104     }
5105     if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
5106       if (MaxVF < MinVF) {
5107         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5108                           << ") with target's minimum: " << MinVF << '\n');
5109         MaxVF = MinVF;
5110       }
5111     }
5112   }
5113   return MaxVF;
5114 }
5115 
5116 VectorizationFactor
5117 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
5118   float Cost = expectedCost(1).first;
5119   const float ScalarCost = Cost;
5120   unsigned Width = 1;
5121   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
5122 
5123   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5124   if (ForceVectorization && MaxVF > 1) {
5125     // Ignore scalar width, because the user explicitly wants vectorization.
5126     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5127     // evaluation.
5128     Cost = std::numeric_limits<float>::max();
5129   }
5130 
5131   for (unsigned i = 2; i <= MaxVF; i *= 2) {
    // Notice that the vector loop needs to be executed fewer times, so
    // we need to divide the cost of the vector loop by the width of
    // the vector elements.
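    // For illustration (hypothetical costs): a scalar loop cost of 8 and a
    // VF=4 loop cost of 20 give a per-lane vector cost of 20 / 4 = 5, which
    // beats the scalar cost, so VF=4 would be preferred so far.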
5135     VectorizationCostTy C = expectedCost(i);
5136     float VectorCost = C.first / (float)i;
5137     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5138                       << " costs: " << (int)VectorCost << ".\n");
5139     if (!C.second && !ForceVectorization) {
5140       LLVM_DEBUG(
5141           dbgs() << "LV: Not considering vector loop of width " << i
5142                  << " because it will not generate any vector instructions.\n");
5143       continue;
5144     }
5145     if (VectorCost < Cost) {
5146       Cost = VectorCost;
5147       Width = i;
5148     }
5149   }
5150 
5151   if (!EnableCondStoresVectorization && NumPredStores) {
5152     reportVectorizationFailure("There are conditional stores.",
5153         "store that is conditionally executed prevents vectorization",
5154         "ConditionalStore", ORE, TheLoop);
5155     Width = 1;
5156     Cost = ScalarCost;
5157   }
5158 
5159   LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
5160              << "LV: Vectorization seems to be not beneficial, "
5161              << "but was forced by a user.\n");
5162   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
5163   VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)};
5164   return Factor;
5165 }
5166 
5167 std::pair<unsigned, unsigned>
5168 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5169   unsigned MinWidth = -1U;
5170   unsigned MaxWidth = 8;
5171   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5172 
5173   // For each block.
5174   for (BasicBlock *BB : TheLoop->blocks()) {
5175     // For each instruction in the loop.
5176     for (Instruction &I : BB->instructionsWithoutDebug()) {
5177       Type *T = I.getType();
5178 
5179       // Skip ignored values.
5180       if (ValuesToIgnore.find(&I) != ValuesToIgnore.end())
5181         continue;
5182 
5183       // Only examine Loads, Stores and PHINodes.
5184       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5185         continue;
5186 
5187       // Examine PHI nodes that are reduction variables. Update the type to
5188       // account for the recurrence type.
5189       if (auto *PN = dyn_cast<PHINode>(&I)) {
5190         if (!Legal->isReductionVariable(PN))
5191           continue;
5192         RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN];
5193         T = RdxDesc.getRecurrenceType();
5194       }
5195 
5196       // Examine the stored values.
5197       if (auto *ST = dyn_cast<StoreInst>(&I))
5198         T = ST->getValueOperand()->getType();
5199 
5200       // Ignore loaded pointer types and stored pointer types that are not
5201       // vectorizable.
5202       //
5203       // FIXME: The check here attempts to predict whether a load or store will
5204       //        be vectorized. We only know this for certain after a VF has
5205       //        been selected. Here, we assume that if an access can be
5206       //        vectorized, it will be. We should also look at extending this
5207       //        optimization to non-pointer types.
5208       //
5209       if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
5210           !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
5211         continue;
5212 
5213       MinWidth = std::min(MinWidth,
5214                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5215       MaxWidth = std::max(MaxWidth,
5216                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5217     }
5218   }
5219 
5220   return {MinWidth, MaxWidth};
5221 }
5222 
5223 unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
5224                                                            unsigned LoopCost) {
5225   // -- The interleave heuristics --
5226   // We interleave the loop in order to expose ILP and reduce the loop overhead.
5227   // There are many micro-architectural considerations that we can't predict
5228   // at this level. For example, frontend pressure (on decode or fetch) due to
5229   // code size, or the number and capabilities of the execution ports.
5230   //
5231   // We use the following heuristics to select the interleave count:
5232   // 1. If the code has reductions, then we interleave to break the cross
5233   // iteration dependency.
5234   // 2. If the loop is really small, then we interleave to reduce the loop
5235   // overhead.
5236   // 3. We don't interleave if we think that we will spill registers to memory
5237   // due to the increased register pressure.
5238 
5239   if (!isScalarEpilogueAllowed())
5240     return 1;
5241 
  // The maximum safe dependence distance was already used to bound the VF;
  // do not interleave, as that could exceed the safe distance.
5243   if (Legal->getMaxSafeDepDistBytes() != -1U)
5244     return 1;
5245 
5246   // Do not interleave loops with a relatively small known or estimated trip
5247   // count.
5248   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5249   if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold)
5250     return 1;
5251 
5252   RegisterUsage R = calculateRegisterUsage({VF})[0];
  // We divide by these values below, so make sure each register class reports
  // at least one register in use.
5255   for (auto& pair : R.MaxLocalUsers) {
5256     pair.second = std::max(pair.second, 1U);
5257   }
5258 
5259   // We calculate the interleave count using the following formula.
5260   // Subtract the number of loop invariants from the number of available
5261   // registers. These registers are used by all of the interleaved instances.
5262   // Next, divide the remaining registers by the number of registers that is
5263   // required by the loop, in order to estimate how many parallel instances
5264   // fit without causing spills. All of this is rounded down if necessary to be
5265   // a power of two. We want power of two interleave count to simplify any
5266   // addressing operations or alignment considerations.
  // We also want power-of-two interleave counts to ensure that the induction
  // variable of the vector loop wraps to zero when the tail is folded by
  // masking; this currently happens when optimizing for size, in which case
  // we already returned an interleave count of 1 above.
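  // For illustration (hypothetical numbers, ignoring the induction-variable
  // adjustment below): 32 registers in a class, 2 of which hold loop-invariant
  // values, and a per-instance pressure of 6 registers give
  // PowerOf2Floor((32 - 2) / 6) = PowerOf2Floor(5) = 4 interleaved instances.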
5270   unsigned IC = UINT_MAX;
5271 
5272   for (auto& pair : R.MaxLocalUsers) {
5273     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5274     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5275                       << " registers of "
                      << TTI.getRegisterClassName(pair.first)
                      << " register class\n");
5277     if (VF == 1) {
5278       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5279         TargetNumRegisters = ForceTargetNumScalarRegs;
5280     } else {
5281       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5282         TargetNumRegisters = ForceTargetNumVectorRegs;
5283     }
5284     unsigned MaxLocalUsers = pair.second;
5285     unsigned LoopInvariantRegs = 0;
5286     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5287       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5288 
    unsigned TmpIC =
        PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
5290     // Don't count the induction variable as interleaved.
5291     if (EnableIndVarRegisterHeur) {
5292       TmpIC =
5293           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5294                         std::max(1U, (MaxLocalUsers - 1)));
5295     }
5296 
5297     IC = std::min(IC, TmpIC);
5298   }
5299 
5300   // Clamp the interleave ranges to reasonable counts.
5301   unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
5302 
5303   // Check if the user has overridden the max.
5304   if (VF == 1) {
5305     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5306       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5307   } else {
5308     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5309       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5310   }
5311 
  // If the trip count is a known or estimated compile-time constant, limit the
  // interleave count to at most the trip count divided by VF.
5314   if (BestKnownTC) {
5315     MaxInterleaveCount = std::min(*BestKnownTC / VF, MaxInterleaveCount);
5316   }
5317 
5318   // If we did not calculate the cost for VF (because the user selected the VF)
5319   // then we calculate the cost of VF here.
5320   if (LoopCost == 0)
5321     LoopCost = expectedCost(VF).first;
5322 
5323   assert(LoopCost && "Non-zero loop cost expected");
5324 
  // Clamp the calculated IC to be between 1 and the max interleave count
  // that the target and trip count allow.
5327   if (IC > MaxInterleaveCount)
5328     IC = MaxInterleaveCount;
5329   else if (IC < 1)
5330     IC = 1;
5331 
5332   // Interleave if we vectorized this loop and there is a reduction that could
5333   // benefit from interleaving.
5334   if (VF > 1 && !Legal->getReductionVars().empty()) {
5335     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5336     return IC;
5337   }
5338 
5339   // Note that if we've already vectorized the loop we will have done the
5340   // runtime check and so interleaving won't require further checks.
5341   bool InterleavingRequiresRuntimePointerCheck =
5342       (VF == 1 && Legal->getRuntimePointerChecking()->Need);
5343 
5344   // We want to interleave small loops in order to reduce the loop overhead and
5345   // potentially expose ILP opportunities.
5346   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
5347   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
5348     // We assume that the cost overhead is 1 and we use the cost model
5349     // to estimate the cost of the loop and interleave until the cost of the
5350     // loop overhead is about 5% of the cost of the loop.
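    // For illustration, assuming a SmallLoopCost of 20 and a LoopCost of 6:
    // SmallIC = min(IC, PowerOf2Floor(20 / 6)) = min(IC, 2).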
5351     unsigned SmallIC =
5352         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
5353 
5354     // Interleave until store/load ports (estimated by max interleave count) are
5355     // saturated.
5356     unsigned NumStores = Legal->getNumStores();
5357     unsigned NumLoads = Legal->getNumLoads();
5358     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5359     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5360 
5361     // If we have a scalar reduction (vector reductions are already dealt with
5362     // by this point), we can increase the critical path length if the loop
5363     // we're interleaving is inside another loop. Limit, by default to 2, so the
5364     // critical path only gets increased by one reduction operation.
5365     if (!Legal->getReductionVars().empty() && TheLoop->getLoopDepth() > 1) {
5366       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5367       SmallIC = std::min(SmallIC, F);
5368       StoresIC = std::min(StoresIC, F);
5369       LoadsIC = std::min(LoadsIC, F);
5370     }
5371 
5372     if (EnableLoadStoreRuntimeInterleave &&
5373         std::max(StoresIC, LoadsIC) > SmallIC) {
5374       LLVM_DEBUG(
5375           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5376       return std::max(StoresIC, LoadsIC);
5377     }
5378 
5379     LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5380     return SmallIC;
5381   }
5382 
5383   // Interleave if this is a large loop (small loops are already dealt with by
5384   // this point) that could benefit from interleaving.
5385   bool HasReductions = !Legal->getReductionVars().empty();
5386   if (TTI.enableAggressiveInterleaving(HasReductions)) {
5387     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5388     return IC;
5389   }
5390 
5391   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5392   return 1;
5393 }
5394 
5395 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5396 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
5397   // This function calculates the register usage by measuring the highest number
5398   // of values that are alive at a single location. Obviously, this is a very
  // rough estimation. We scan the loop in topological order and
5400   // assign a number to each instruction. We use RPO to ensure that defs are
5401   // met before their users. We assume that each instruction that has in-loop
5402   // users starts an interval. We record every time that an in-loop value is
5403   // used, so we have a list of the first and last occurrences of each
5404   // instruction. Next, we transpose this data structure into a multi map that
5405   // holds the list of intervals that *end* at a specific location. This multi
5406   // map allows us to perform a linear search. We scan the instructions linearly
5407   // and record each time that a new interval starts, by placing it in a set.
5408   // If we find this value in the multi-map then we remove it from the set.
5409   // The max register usage is the maximum size of the set.
5410   // We also search for instructions that are defined outside the loop, but are
5411   // used inside the loop. We need this number separately from the max-interval
5412   // usage number because when we unroll, loop-invariant values do not take
  // more registers.
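  // For illustration (hypothetical instructions): if I0 and I1 are both still
  // live while I2 is being defined, the open-interval set contains {I0, I1} at
  // that point, so the maximum usage for their register class is at least 2.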
5414   LoopBlocksDFS DFS(TheLoop);
5415   DFS.perform(LI);
5416 
5417   RegisterUsage RU;
5418 
5419   // Each 'key' in the map opens a new interval. The values
5420   // of the map are the index of the 'last seen' usage of the
5421   // instruction that is the key.
5422   using IntervalMap = DenseMap<Instruction *, unsigned>;
5423 
5424   // Maps instruction to its index.
5425   SmallVector<Instruction *, 64> IdxToInstr;
5426   // Marks the end of each interval.
5427   IntervalMap EndPoint;
5428   // Saves the list of instruction indices that are used in the loop.
5429   SmallPtrSet<Instruction *, 8> Ends;
5430   // Saves the list of values that are used in the loop but are
5431   // defined outside the loop, such as arguments and constants.
5432   SmallPtrSet<Value *, 8> LoopInvariants;
5433 
5434   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5435     for (Instruction &I : BB->instructionsWithoutDebug()) {
5436       IdxToInstr.push_back(&I);
5437 
5438       // Save the end location of each USE.
5439       for (Value *U : I.operands()) {
5440         auto *Instr = dyn_cast<Instruction>(U);
5441 
5442         // Ignore non-instruction values such as arguments, constants, etc.
5443         if (!Instr)
5444           continue;
5445 
5446         // If this instruction is outside the loop then record it and continue.
5447         if (!TheLoop->contains(Instr)) {
5448           LoopInvariants.insert(Instr);
5449           continue;
5450         }
5451 
5452         // Overwrite previous end points.
5453         EndPoint[Instr] = IdxToInstr.size();
5454         Ends.insert(Instr);
5455       }
5456     }
5457   }
5458 
5459   // Saves the list of intervals that end with the index in 'key'.
5460   using InstrList = SmallVector<Instruction *, 2>;
5461   DenseMap<unsigned, InstrList> TransposeEnds;
5462 
5463   // Transpose the EndPoints to a list of values that end at each index.
5464   for (auto &Interval : EndPoint)
5465     TransposeEnds[Interval.second].push_back(Interval.first);
5466 
5467   SmallPtrSet<Instruction *, 8> OpenIntervals;
5468 
5469   // Get the size of the widest register.
5470   unsigned MaxSafeDepDist = -1U;
5471   if (Legal->getMaxSafeDepDistBytes() != -1U)
5472     MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
5473   unsigned WidestRegister =
5474       std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
5475   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5476 
5477   SmallVector<RegisterUsage, 8> RUs(VFs.size());
5478   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
5479 
5480   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5481 
5482   // A lambda that gets the register usage for the given type and VF.
5483   auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
5484     if (Ty->isTokenTy())
5485       return 0U;
5486     unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
5487     return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
5488   };
5489 
5490   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5491     Instruction *I = IdxToInstr[i];
5492 
5493     // Remove all of the instructions that end at this location.
5494     InstrList &List = TransposeEnds[i];
5495     for (Instruction *ToRemove : List)
5496       OpenIntervals.erase(ToRemove);
5497 
5498     // Ignore instructions that are never used within the loop.
5499     if (Ends.find(I) == Ends.end())
5500       continue;
5501 
5502     // Skip ignored values.
5503     if (ValuesToIgnore.find(I) != ValuesToIgnore.end())
5504       continue;
5505 
5506     // For each VF find the maximum usage of registers.
5507     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5508       // Count the number of live intervals.
5509       SmallMapVector<unsigned, unsigned, 4> RegUsage;
5510 
5511       if (VFs[j] == 1) {
5512         for (auto Inst : OpenIntervals) {
5513           unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
5514           if (RegUsage.find(ClassID) == RegUsage.end())
5515             RegUsage[ClassID] = 1;
5516           else
5517             RegUsage[ClassID] += 1;
5518         }
5519       } else {
5520         collectUniformsAndScalars(VFs[j]);
5521         for (auto Inst : OpenIntervals) {
5522           // Skip ignored values for VF > 1.
5523           if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end())
5524             continue;
5525           if (isScalarAfterVectorization(Inst, VFs[j])) {
5526             unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
5527             if (RegUsage.find(ClassID) == RegUsage.end())
5528               RegUsage[ClassID] = 1;
5529             else
5530               RegUsage[ClassID] += 1;
5531           } else {
5532             unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType());
5533             if (RegUsage.find(ClassID) == RegUsage.end())
5534               RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
5535             else
5536               RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
5537           }
5538         }
5539       }
5540 
5541       for (auto& pair : RegUsage) {
5542         if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
          MaxUsages[j][pair.first] =
              std::max(MaxUsages[j][pair.first], pair.second);
5544         else
5545           MaxUsages[j][pair.first] = pair.second;
5546       }
5547     }
5548 
5549     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5550                       << OpenIntervals.size() << '\n');
5551 
5552     // Add the current instruction to the list of open intervals.
5553     OpenIntervals.insert(I);
5554   }
5555 
5556   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5557     SmallMapVector<unsigned, unsigned, 4> Invariant;
5558 
5559     for (auto Inst : LoopInvariants) {
5560       unsigned Usage = VFs[i] == 1 ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
5561       unsigned ClassID = TTI.getRegisterClassForType(VFs[i] > 1, Inst->getType());
5562       if (Invariant.find(ClassID) == Invariant.end())
5563         Invariant[ClassID] = Usage;
5564       else
5565         Invariant[ClassID] += Usage;
5566     }
5567 
5568     LLVM_DEBUG({
5569       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
5570       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
5571              << " item\n";
5572       for (const auto &pair : MaxUsages[i]) {
5573         dbgs() << "LV(REG): RegisterClass: "
5574                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5575                << " registers\n";
5576       }
5577       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5578              << " item\n";
5579       for (const auto &pair : Invariant) {
5580         dbgs() << "LV(REG): RegisterClass: "
5581                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5582                << " registers\n";
5583       }
5584     });
5585 
5586     RU.LoopInvariantRegs = Invariant;
5587     RU.MaxLocalUsers = MaxUsages[i];
5588     RUs[i] = RU;
5589   }
5590 
5591   return RUs;
5592 }
5593 
bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I) {
5595   // TODO: Cost model for emulated masked load/store is completely
5596   // broken. This hack guides the cost model to use an artificially
5597   // high enough value to practically disable vectorization with such
5598   // operations, except where previously deployed legality hack allowed
5599   // using very low cost values. This is to avoid regressions coming simply
5600   // from moving "masked load/store" check from legality to cost model.
  // Masked Load/Gather emulation was previously never allowed.
  // Emulation of a limited number of Masked Stores/Scatters was allowed.
5603   assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
5604   return isa<LoadInst>(I) ||
5605          (isa<StoreInst>(I) &&
5606           NumPredStores > NumberOfStoresToPredicate);
5607 }
5608 
5609 void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
5610   // If we aren't vectorizing the loop, or if we've already collected the
5611   // instructions to scalarize, there's nothing to do. Collection may already
5612   // have occurred if we have a user-selected VF and are now computing the
5613   // expected cost for interleaving.
5614   if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end())
5615     return;
5616 
  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5618   // not profitable to scalarize any instructions, the presence of VF in the
5619   // map will indicate that we've analyzed it already.
5620   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5621 
5622   // Find all the instructions that are scalar with predication in the loop and
5623   // determine if it would be better to not if-convert the blocks they are in.
5624   // If so, we also record the instructions to scalarize.
5625   for (BasicBlock *BB : TheLoop->blocks()) {
5626     if (!blockNeedsPredication(BB))
5627       continue;
5628     for (Instruction &I : *BB)
5629       if (isScalarWithPredication(&I)) {
5630         ScalarCostsTy ScalarCosts;
5631         // Do not apply discount logic if hacked cost is needed
5632         // for emulated masked memrefs.
5633         if (!useEmulatedMaskMemRefHack(&I) &&
5634             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5635           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5636         // Remember that BB will remain after vectorization.
5637         PredicatedBBsAfterVectorization.insert(BB);
5638       }
5639   }
5640 }
5641 
5642 int LoopVectorizationCostModel::computePredInstDiscount(
5643     Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
5644     unsigned VF) {
5645   assert(!isUniformAfterVectorization(PredInst, VF) &&
5646          "Instruction marked uniform-after-vectorization will be predicated");
5647 
5648   // Initialize the discount to zero, meaning that the scalar version and the
5649   // vector version cost the same.
5650   int Discount = 0;
5651 
5652   // Holds instructions to analyze. The instructions we visit are mapped in
5653   // ScalarCosts. Those instructions are the ones that would be scalarized if
5654   // we find that the scalar version costs less.
5655   SmallVector<Instruction *, 8> Worklist;
5656 
5657   // Returns true if the given instruction can be scalarized.
5658   auto canBeScalarized = [&](Instruction *I) -> bool {
5659     // We only attempt to scalarize instructions forming a single-use chain
5660     // from the original predicated block that would otherwise be vectorized.
5661     // Although not strictly necessary, we give up on instructions we know will
5662     // already be scalar to avoid traversing chains that are unlikely to be
5663     // beneficial.
5664     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5665         isScalarAfterVectorization(I, VF))
5666       return false;
5667 
5668     // If the instruction is scalar with predication, it will be analyzed
5669     // separately. We ignore it within the context of PredInst.
5670     if (isScalarWithPredication(I))
5671       return false;
5672 
5673     // If any of the instruction's operands are uniform after vectorization,
5674     // the instruction cannot be scalarized. This prevents, for example, a
5675     // masked load from being scalarized.
5676     //
5677     // We assume we will only emit a value for lane zero of an instruction
5678     // marked uniform after vectorization, rather than VF identical values.
5679     // Thus, if we scalarize an instruction that uses a uniform, we would
5680     // create uses of values corresponding to the lanes we aren't emitting code
5681     // for. This behavior can be changed by allowing getScalarValue to clone
5682     // the lane zero values for uniforms rather than asserting.
5683     for (Use &U : I->operands())
5684       if (auto *J = dyn_cast<Instruction>(U.get()))
5685         if (isUniformAfterVectorization(J, VF))
5686           return false;
5687 
5688     // Otherwise, we can scalarize the instruction.
5689     return true;
5690   };
5691 
5692   // Compute the expected cost discount from scalarizing the entire expression
5693   // feeding the predicated instruction. We currently only consider expressions
5694   // that are single-use instruction chains.
5695   Worklist.push_back(PredInst);
5696   while (!Worklist.empty()) {
5697     Instruction *I = Worklist.pop_back_val();
5698 
5699     // If we've already analyzed the instruction, there's nothing to do.
5700     if (ScalarCosts.find(I) != ScalarCosts.end())
5701       continue;
5702 
5703     // Compute the cost of the vector instruction. Note that this cost already
5704     // includes the scalarization overhead of the predicated instruction.
5705     unsigned VectorCost = getInstructionCost(I, VF).first;
5706 
5707     // Compute the cost of the scalarized instruction. This cost is the cost of
5708     // the instruction as if it wasn't if-converted and instead remained in the
5709     // predicated block. We will scale this cost by block probability after
5710     // computing the scalarization overhead.
5711     unsigned ScalarCost = VF * getInstructionCost(I, 1).first;
5712 
5713     // Compute the scalarization overhead of needed insertelement instructions
5714     // and phi nodes.
5715     if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
5716       ScalarCost += TTI.getScalarizationOverhead(
5717           cast<VectorType>(ToVectorTy(I->getType(), VF)),
5718           APInt::getAllOnesValue(VF), true, false);
5719       ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
5720     }
5721 
5722     // Compute the scalarization overhead of needed extractelement
5723     // instructions. For each of the instruction's operands, if the operand can
5724     // be scalarized, add it to the worklist; otherwise, account for the
5725     // overhead.
5726     for (Use &U : I->operands())
5727       if (auto *J = dyn_cast<Instruction>(U.get())) {
5728         assert(VectorType::isValidElementType(J->getType()) &&
5729                "Instruction has non-scalar type");
5730         if (canBeScalarized(J))
5731           Worklist.push_back(J);
5732         else if (needsExtract(J, VF))
5733           ScalarCost += TTI.getScalarizationOverhead(
5734               cast<VectorType>(ToVectorTy(J->getType(), VF)),
5735               APInt::getAllOnesValue(VF), false, true);
5736       }
5737 
5738     // Scale the total scalar cost by block probability.
5739     ScalarCost /= getReciprocalPredBlockProb();
5740 
5741     // Compute the discount. A non-negative discount means the vector version
5742     // of the instruction costs more, and scalarizing would be beneficial.
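    // For illustration (hypothetical costs): VectorCost = 10 and
    // ScalarCost = 6 contribute +4 to the discount, favoring scalarization.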
5743     Discount += VectorCost - ScalarCost;
5744     ScalarCosts[I] = ScalarCost;
5745   }
5746 
5747   return Discount;
5748 }
5749 
5750 LoopVectorizationCostModel::VectorizationCostTy
5751 LoopVectorizationCostModel::expectedCost(unsigned VF) {
5752   VectorizationCostTy Cost;
5753 
5754   // For each block.
5755   for (BasicBlock *BB : TheLoop->blocks()) {
5756     VectorizationCostTy BlockCost;
5757 
5758     // For each instruction in the old loop.
5759     for (Instruction &I : BB->instructionsWithoutDebug()) {
5760       // Skip ignored values.
5761       if (ValuesToIgnore.find(&I) != ValuesToIgnore.end() ||
5762           (VF > 1 && VecValuesToIgnore.find(&I) != VecValuesToIgnore.end()))
5763         continue;
5764 
5765       VectorizationCostTy C = getInstructionCost(&I, VF);
5766 
5767       // Check if we should override the cost.
5768       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
5769         C.first = ForceTargetInstructionCost;
5770 
5771       BlockCost.first += C.first;
5772       BlockCost.second |= C.second;
5773       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
5774                         << " for VF " << VF << " For instruction: " << I
5775                         << '\n');
5776     }
5777 
5778     // If we are vectorizing a predicated block, it will have been
5779     // if-converted. This means that the block's instructions (aside from
5780     // stores and instructions that may divide by zero) will now be
5781     // unconditionally executed. For the scalar case, we may not always execute
5782     // the predicated block. Thus, scale the block's cost by the probability of
5783     // executing it.
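    // For illustration, assuming a reciprocal block probability of 2 (i.e. the
    // block executes roughly half the time): a predicated block whose
    // instructions cost 8 contributes 8 / 2 = 4 to the scalar loop cost.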
5784     if (VF == 1 && blockNeedsPredication(BB))
5785       BlockCost.first /= getReciprocalPredBlockProb();
5786 
5787     Cost.first += BlockCost.first;
5788     Cost.second |= BlockCost.second;
5789   }
5790 
5791   return Cost;
5792 }
5793 
5794 /// Gets Address Access SCEV after verifying that the access pattern
/// is loop invariant except for the induction variable dependence.
5796 ///
5797 /// This SCEV can be sent to the Target in order to estimate the address
5798 /// calculation cost.
static const SCEV *getAddressAccessSCEV(Value *Ptr,
                                        LoopVectorizationLegality *Legal,
                                        PredicatedScalarEvolution &PSE,
                                        const Loop *TheLoop) {
5805   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
5806   if (!Gep)
5807     return nullptr;
5808 
5809   // We are looking for a gep with all loop invariant indices except for one
5810   // which should be an induction variable.
5811   auto SE = PSE.getSE();
5812   unsigned NumOperands = Gep->getNumOperands();
5813   for (unsigned i = 1; i < NumOperands; ++i) {
5814     Value *Opd = Gep->getOperand(i);
5815     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
5816         !Legal->isInductionVariable(Opd))
5817       return nullptr;
5818   }
5819 
  // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
5821   return PSE.getSCEV(Ptr);
5822 }
5823 
5824 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
5825   return Legal->hasStride(I->getOperand(0)) ||
5826          Legal->hasStride(I->getOperand(1));
5827 }
5828 
5829 unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5830                                                                  unsigned VF) {
5831   assert(VF > 1 && "Scalarization cost of instruction implies vectorization.");
5832   Type *ValTy = getMemInstValueType(I);
5833   auto SE = PSE.getSE();
5834 
5835   unsigned AS = getLoadStoreAddressSpace(I);
5836   Value *Ptr = getLoadStorePointerOperand(I);
5837   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
5838 
  // Figure out whether the access is strided and get the stride value
  // if it's known at compile time.
5841   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
5842 
5843   // Get the cost of the scalar memory instruction and address computation.
5844   unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
5845 
5846   // Don't pass *I here, since it is scalar but will actually be part of a
5847   // vectorized loop where the user of it is a vectorized instruction.
5848   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5849   Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
5850                                    Alignment, AS,
5851                                    TTI::TCK_RecipThroughput);
5852 
5853   // Get the overhead of the extractelement and insertelement instructions
5854   // we might create due to scalarization.
5855   Cost += getScalarizationOverhead(I, VF);
5856 
5857   // If we have a predicated store, it may not be executed for each vector
5858   // lane. Scale the cost by the probability of executing the predicated
5859   // block.
5860   if (isPredicatedInst(I)) {
5861     Cost /= getReciprocalPredBlockProb();
5862 
5863     if (useEmulatedMaskMemRefHack(I))
5864       // Artificially setting to a high enough value to practically disable
5865       // vectorization with such operations.
5866       Cost = 3000000;
5867   }
5868 
5869   return Cost;
5870 }
5871 
5872 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5873                                                              unsigned VF) {
5874   Type *ValTy = getMemInstValueType(I);
5875   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
5876   Value *Ptr = getLoadStorePointerOperand(I);
5877   unsigned AS = getLoadStoreAddressSpace(I);
5878   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
5879   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
5880 
5881   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5882          "Stride should be 1 or -1 for consecutive memory access");
5883   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5884   unsigned Cost = 0;
5885   if (Legal->isMaskRequired(I))
5886     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy,
5887                                       Alignment ? Alignment->value() : 0, AS,
5888                                       CostKind);
5889   else
5890     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
5891                                 CostKind, I);
5892 
5893   bool Reverse = ConsecutiveStride < 0;
5894   if (Reverse)
5895     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5896   return Cost;
5897 }
5898 
5899 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5900                                                          unsigned VF) {
5901   Type *ValTy = getMemInstValueType(I);
5902   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
5903   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5904   unsigned AS = getLoadStoreAddressSpace(I);
5905   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
5906   if (isa<LoadInst>(I)) {
5907     return TTI.getAddressComputationCost(ValTy) +
5908            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
5909                                CostKind) +
5910            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
5911   }
5912   StoreInst *SI = cast<StoreInst>(I);
5913 
5914   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
5915   return TTI.getAddressComputationCost(ValTy) +
5916          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
5917                              CostKind) +
5918          (isLoopInvariantStoreValue
5919               ? 0
5920               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
5921                                        VF - 1));
5922 }
5923 
5924 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5925                                                           unsigned VF) {
5926   Type *ValTy = getMemInstValueType(I);
5927   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
5928   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5929   Value *Ptr = getLoadStorePointerOperand(I);
5930 
5931   return TTI.getAddressComputationCost(VectorTy) +
5932          TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
5933                                     Legal->isMaskRequired(I),
5934                                     Alignment ? Alignment->value() : 0,
5935                                     TargetTransformInfo::TCK_RecipThroughput,
5936                                     I);
5937 }
5938 
5939 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
5940                                                             unsigned VF) {
5941   Type *ValTy = getMemInstValueType(I);
5942   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
5943   unsigned AS = getLoadStoreAddressSpace(I);
5944 
5945   auto Group = getInterleavedAccessGroup(I);
5946   assert(Group && "Fail to get an interleaved access group.");
5947 
5948   unsigned InterleaveFactor = Group->getFactor();
5949   VectorType *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
5950 
5951   // Holds the indices of existing members in an interleaved load group.
5952   // An interleaved store group doesn't need this as it doesn't allow gaps.
5953   SmallVector<unsigned, 4> Indices;
5954   if (isa<LoadInst>(I)) {
5955     for (unsigned i = 0; i < InterleaveFactor; i++)
5956       if (Group->getMember(i))
5957         Indices.push_back(i);
5958   }
5959 
5960   // Calculate the cost of the whole interleaved group.
5961   bool UseMaskForGaps =
5962       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
5963   unsigned Cost = TTI.getInterleavedMemoryOpCost(
5964       I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
5965       Group->getAlign().value(), AS, TTI::TCK_RecipThroughput,
5966       Legal->isMaskRequired(I), UseMaskForGaps);
5967 
5968   if (Group->isReverse()) {
5969     // TODO: Add support for reversed masked interleaved access.
5970     assert(!Legal->isMaskRequired(I) &&
5971            "Reverse masked interleaved access not supported.");
5972     Cost += Group->getNumMembers() *
5973             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5974   }
5975   return Cost;
5976 }
5977 
5978 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
5979                                                               unsigned VF) {
5980   // Calculate scalar cost only. Vectorization cost should be ready at this
5981   // moment.
5982   if (VF == 1) {
5983     Type *ValTy = getMemInstValueType(I);
5984     const MaybeAlign Alignment = getLoadStoreAlignment(I);
5985     unsigned AS = getLoadStoreAddressSpace(I);
5986 
5987     return TTI.getAddressComputationCost(ValTy) +
5988            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
5989                                TTI::TCK_RecipThroughput, I);
5990   }
5991   return getWideningCost(I, VF);
5992 }
5993 
5994 LoopVectorizationCostModel::VectorizationCostTy
5995 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
5996   // If we know that this instruction will remain uniform, check the cost of
5997   // the scalar version.
5998   if (isUniformAfterVectorization(I, VF))
5999     VF = 1;
6000 
6001   if (VF > 1 && isProfitableToScalarize(I, VF))
6002     return VectorizationCostTy(InstsToScalarize[VF][I], false);
6003 
6004   // Forced scalars do not have any scalarization overhead.
6005   auto ForcedScalar = ForcedScalars.find(VF);
6006   if (VF > 1 && ForcedScalar != ForcedScalars.end()) {
6007     auto InstSet = ForcedScalar->second;
6008     if (InstSet.find(I) != InstSet.end())
6009       return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false);
6010   }
6011 
6012   Type *VectorTy;
6013   unsigned C = getInstructionCost(I, VF, VectorTy);
6014 
6015   bool TypeNotScalarized =
6016       VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF;
6017   return VectorizationCostTy(C, TypeNotScalarized);
6018 }
6019 
6020 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
6021                                                               unsigned VF) {
6022 
6023   if (VF == 1)
6024     return 0;
6025 
6026   unsigned Cost = 0;
6027   Type *RetTy = ToVectorTy(I->getType(), VF);
6028   if (!RetTy->isVoidTy() &&
6029       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
6030     Cost += TTI.getScalarizationOverhead(
6031         cast<VectorType>(RetTy), APInt::getAllOnesValue(VF), true, false);
6032 
6033   // Some targets keep addresses scalar.
6034   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
6035     return Cost;
6036 
6037   // Some targets support efficient element stores.
6038   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
6039     return Cost;
6040 
6041   // Collect operands to consider.
6042   CallInst *CI = dyn_cast<CallInst>(I);
6043   Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
6044 
6045   // Skip operands that do not require extraction/scalarization and do not incur
6046   // any overhead.
6047   return Cost + TTI.getOperandsScalarizationOverhead(
6048                     filterExtractingOperands(Ops, VF), VF);
6049 }
6050 
6051 void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
6052   if (VF == 1)
6053     return;
6054   NumPredStores = 0;
6055   for (BasicBlock *BB : TheLoop->blocks()) {
6056     // For each instruction in the old loop.
6057     for (Instruction &I : *BB) {
      Value *Ptr = getLoadStorePointerOperand(&I);
6059       if (!Ptr)
6060         continue;
6061 
6062       // TODO: We should generate better code and update the cost model for
6063       // predicated uniform stores. Today they are treated as any other
6064       // predicated store (see added test cases in
6065       // invariant-store-vectorization.ll).
6066       if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
6067         NumPredStores++;
6068 
6069       if (Legal->isUniform(Ptr) &&
6070           // Conditional loads and stores should be scalarized and predicated.
6071           // isScalarWithPredication cannot be used here since masked
6072           // gather/scatters are not considered scalar with predication.
6073           !Legal->blockNeedsPredication(I.getParent())) {
6074         // TODO: Avoid replicating loads and stores instead of
6075         // relying on instcombine to remove them.
6076         // Load: Scalar load + broadcast
6077         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6078         unsigned Cost = getUniformMemOpCost(&I, VF);
6079         setWideningDecision(&I, VF, CM_Scalarize, Cost);
6080         continue;
6081       }
6082 
6083       // We assume that widening is the best solution when possible.
6084       if (memoryInstructionCanBeWidened(&I, VF)) {
6085         unsigned Cost = getConsecutiveMemOpCost(&I, VF);
6086         int ConsecutiveStride =
6087                Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
6088         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6089                "Expected consecutive stride.");
6090         InstWidening Decision =
6091             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6092         setWideningDecision(&I, VF, Decision, Cost);
6093         continue;
6094       }
6095 
6096       // Choose between Interleaving, Gather/Scatter or Scalarization.
6097       unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
6098       unsigned NumAccesses = 1;
6099       if (isAccessInterleaved(&I)) {
6100         auto Group = getInterleavedAccessGroup(&I);
6101         assert(Group && "Fail to get an interleaved access group.");
6102 
6103         // Make one decision for the whole group.
6104         if (getWideningDecision(&I, VF) != CM_Unknown)
6105           continue;
6106 
6107         NumAccesses = Group->getNumMembers();
6108         if (interleavedAccessCanBeWidened(&I, VF))
6109           InterleaveCost = getInterleaveGroupCost(&I, VF);
6110       }
6111 
6112       unsigned GatherScatterCost =
6113           isLegalGatherOrScatter(&I)
6114               ? getGatherScatterCost(&I, VF) * NumAccesses
6115               : std::numeric_limits<unsigned>::max();
6116 
6117       unsigned ScalarizationCost =
6118           getMemInstScalarizationCost(&I, VF) * NumAccesses;
6119 
6120       // Choose better solution for the current VF,
6121       // write down this decision and use it during vectorization.
6122       unsigned Cost;
6123       InstWidening Decision;
6124       if (InterleaveCost <= GatherScatterCost &&
6125           InterleaveCost < ScalarizationCost) {
6126         Decision = CM_Interleave;
6127         Cost = InterleaveCost;
6128       } else if (GatherScatterCost < ScalarizationCost) {
6129         Decision = CM_GatherScatter;
6130         Cost = GatherScatterCost;
6131       } else {
6132         Decision = CM_Scalarize;
6133         Cost = ScalarizationCost;
6134       }
      // If the instruction belongs to an interleave group, the whole group
      // receives the same decision. The whole group receives the cost, but
      // the cost will actually be assigned to one instruction.
6138       if (auto Group = getInterleavedAccessGroup(&I))
6139         setWideningDecision(Group, VF, Decision, Cost);
6140       else
6141         setWideningDecision(&I, VF, Decision, Cost);
6142     }
6143   }
6144 
6145   // Make sure that any load of address and any other address computation
6146   // remains scalar unless there is gather/scatter support. This avoids
6147   // inevitable extracts into address registers, and also has the benefit of
6148   // activating LSR more, since that pass can't optimize vectorized
6149   // addresses.
6150   if (TTI.prefersVectorizedAddressing())
6151     return;
6152 
6153   // Start with all scalar pointer uses.
6154   SmallPtrSet<Instruction *, 8> AddrDefs;
6155   for (BasicBlock *BB : TheLoop->blocks())
6156     for (Instruction &I : *BB) {
6157       Instruction *PtrDef =
6158         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6159       if (PtrDef && TheLoop->contains(PtrDef) &&
6160           getWideningDecision(&I, VF) != CM_GatherScatter)
6161         AddrDefs.insert(PtrDef);
6162     }
6163 
6164   // Add all instructions used to generate the addresses.
6165   SmallVector<Instruction *, 4> Worklist;
6166   for (auto *I : AddrDefs)
6167     Worklist.push_back(I);
6168   while (!Worklist.empty()) {
6169     Instruction *I = Worklist.pop_back_val();
6170     for (auto &Op : I->operands())
6171       if (auto *InstOp = dyn_cast<Instruction>(Op))
6172         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6173             AddrDefs.insert(InstOp).second)
6174           Worklist.push_back(InstOp);
6175   }
6176 
6177   for (auto *I : AddrDefs) {
6178     if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled by
      // cost functions, but since this involves the task of finding out
      // if the loaded register is involved in an address computation, it is
      // instead changed here when we know this is the case.
6183       InstWidening Decision = getWideningDecision(I, VF);
6184       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6185         // Scalarize a widened load of address.
6186         setWideningDecision(I, VF, CM_Scalarize,
6187                             (VF * getMemoryInstructionCost(I, 1)));
6188       else if (auto Group = getInterleavedAccessGroup(I)) {
6189         // Scalarize an interleave group of address loads.
        for (unsigned Idx = 0; Idx < Group->getFactor(); ++Idx) {
          if (Instruction *Member = Group->getMember(Idx))
6192             setWideningDecision(Member, VF, CM_Scalarize,
6193                                 (VF * getMemoryInstructionCost(Member, 1)));
6194         }
6195       }
6196     } else
6197       // Make sure I gets scalarized and a cost estimate without
6198       // scalarization overhead.
6199       ForcedScalars[VF].insert(I);
6200   }
6201 }
6202 
6203 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6204                                                         unsigned VF,
6205                                                         Type *&VectorTy) {
6206   Type *RetTy = I->getType();
6207   if (canTruncateToMinimalBitwidth(I, VF))
6208     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6209   VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
6210   auto SE = PSE.getSE();
6211   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6212 
6213   // TODO: We need to estimate the cost of intrinsic calls.
6214   switch (I->getOpcode()) {
6215   case Instruction::GetElementPtr:
6216     // We mark this instruction as zero-cost because the cost of GEPs in
6217     // vectorized code depends on whether the corresponding memory instruction
6218     // is scalarized or not. Therefore, we handle GEPs with the memory
6219     // instruction cost.
6220     return 0;
6221   case Instruction::Br: {
    // In cases of scalarized and predicated instructions, there will be VF
    // predicated blocks in the vectorized loop. Each branch around these
    // blocks also requires an extract of its vector compare i1 element.
6225     bool ScalarPredicatedBB = false;
6226     BranchInst *BI = cast<BranchInst>(I);
6227     if (VF > 1 && BI->isConditional() &&
6228         (PredicatedBBsAfterVectorization.find(BI->getSuccessor(0)) !=
6229              PredicatedBBsAfterVectorization.end() ||
6230          PredicatedBBsAfterVectorization.find(BI->getSuccessor(1)) !=
6231              PredicatedBBsAfterVectorization.end()))
6232       ScalarPredicatedBB = true;
6233 
6234     if (ScalarPredicatedBB) {
6235       // Return cost for branches around scalarized and predicated blocks.
6236       VectorType *Vec_i1Ty =
6237           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6238       return (TTI.getScalarizationOverhead(Vec_i1Ty, APInt::getAllOnesValue(VF),
6239                                            false, true) +
6240               (TTI.getCFInstrCost(Instruction::Br) * VF));
6241     } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1)
6242       // The back-edge branch will remain, as will all scalar branches.
6243       return TTI.getCFInstrCost(Instruction::Br);
6244     else
6245       // This branch will be eliminated by if-conversion.
6246       return 0;
6247     // Note: We currently assume zero cost for an unconditional branch inside
6248     // a predicated block since it will become a fall-through, although we
6249     // may decide in the future to call TTI for all branches.
6250   }
6251   case Instruction::PHI: {
6252     auto *Phi = cast<PHINode>(I);
6253 
6254     // First-order recurrences are replaced by vector shuffles inside the loop.
6255     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
6256     if (VF > 1 && Legal->isFirstOrderRecurrence(Phi))
6257       return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
6258                                 cast<VectorType>(VectorTy), VF - 1,
6259                                 VectorType::get(RetTy, 1));
6260 
6261     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6262     // converted into select instructions. We require N - 1 selects per phi
6263     // node, where N is the number of incoming values.
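    // For example, a phi with three incoming values is costed as two vector
    // selects.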
6264     if (VF > 1 && Phi->getParent() != TheLoop->getHeader())
6265       return (Phi->getNumIncomingValues() - 1) *
6266              TTI.getCmpSelInstrCost(
6267                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
6268                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
6269                  CostKind);
6270 
6271     return TTI.getCFInstrCost(Instruction::PHI);
6272   }
6273   case Instruction::UDiv:
6274   case Instruction::SDiv:
6275   case Instruction::URem:
6276   case Instruction::SRem:
6277     // If we have a predicated instruction, it may not be executed for each
6278     // vector lane. Get the scalarization cost and scale this amount by the
6279     // probability of executing the predicated block. If the instruction is not
6280     // predicated, we fall through to the next case.
6281     if (VF > 1 && isScalarWithPredication(I)) {
6282       unsigned Cost = 0;
6283 
6284       // These instructions have a non-void type, so account for the phi nodes
6285       // that we will create. This cost is likely to be zero. The phi node
6286       // cost, if any, should be scaled by the block probability because it
6287       // models a copy at the end of each predicated block.
6288       Cost += VF * TTI.getCFInstrCost(Instruction::PHI);
6289 
6290       // The cost of the non-predicated instruction.
6291       Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);
6292 
6293       // The cost of insertelement and extractelement instructions needed for
6294       // scalarization.
6295       Cost += getScalarizationOverhead(I, VF);
6296 
6297       // Scale the cost by the probability of executing the predicated blocks.
6298       // This assumes the predicated block for each vector lane is equally
6299       // likely.
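      //
      // For example (an illustrative sketch): with VF = 4, a scalar div cost
      // of 1, a negligible PHI cost and a scalarization overhead of 8, the
      // unscaled cost is 4 * 1 + 8 = 12; assuming the default reciprocal
      // block probability of 2 (i.e. a 50% chance of executing the block),
      // the returned cost is 6.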
6300       return Cost / getReciprocalPredBlockProb();
6301     }
6302     LLVM_FALLTHROUGH;
6303   case Instruction::Add:
6304   case Instruction::FAdd:
6305   case Instruction::Sub:
6306   case Instruction::FSub:
6307   case Instruction::Mul:
6308   case Instruction::FMul:
6309   case Instruction::FDiv:
6310   case Instruction::FRem:
6311   case Instruction::Shl:
6312   case Instruction::LShr:
6313   case Instruction::AShr:
6314   case Instruction::And:
6315   case Instruction::Or:
6316   case Instruction::Xor: {
    // Since we will replace the stride by 1, the multiplication should go
    // away.
6318     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
6319       return 0;
    // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this is shifts on x86.
6322     Value *Op2 = I->getOperand(1);
6323     TargetTransformInfo::OperandValueProperties Op2VP;
6324     TargetTransformInfo::OperandValueKind Op2VK =
6325         TTI.getOperandInfo(Op2, Op2VP);
6326     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
6327       Op2VK = TargetTransformInfo::OK_UniformValue;
6328 
6329     SmallVector<const Value *, 4> Operands(I->operand_values());
6330     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6331     return N * TTI.getArithmeticInstrCost(
6332                    I->getOpcode(), VectorTy, CostKind,
6333                    TargetTransformInfo::OK_AnyValue,
6334                    Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
6335   }
6336   case Instruction::FNeg: {
6337     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6338     return N * TTI.getArithmeticInstrCost(
6339                    I->getOpcode(), VectorTy, CostKind,
6340                    TargetTransformInfo::OK_AnyValue,
6341                    TargetTransformInfo::OK_AnyValue,
6342                    TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
6343                    I->getOperand(0), I);
6344   }
6345   case Instruction::Select: {
6346     SelectInst *SI = cast<SelectInst>(I);
6347     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6348     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6349     Type *CondTy = SI->getCondition()->getType();
6350     if (!ScalarCond)
6351       CondTy = VectorType::get(CondTy, VF);
6352 
6353     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy,
6354                                   CostKind, I);
6355   }
6356   case Instruction::ICmp:
6357   case Instruction::FCmp: {
6358     Type *ValTy = I->getOperand(0)->getType();
6359     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6360     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6361       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
6362     VectorTy = ToVectorTy(ValTy, VF);
6363     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, CostKind,
6364                                   I);
6365   }
6366   case Instruction::Store:
6367   case Instruction::Load: {
6368     unsigned Width = VF;
6369     if (Width > 1) {
6370       InstWidening Decision = getWideningDecision(I, Width);
6371       assert(Decision != CM_Unknown &&
6372              "CM decision should be taken at this point");
6373       if (Decision == CM_Scalarize)
6374         Width = 1;
6375     }
6376     VectorTy = ToVectorTy(getMemInstValueType(I), Width);
6377     return getMemoryInstructionCost(I, VF);
6378   }
6379   case Instruction::ZExt:
6380   case Instruction::SExt:
6381   case Instruction::FPToUI:
6382   case Instruction::FPToSI:
6383   case Instruction::FPExt:
6384   case Instruction::PtrToInt:
6385   case Instruction::IntToPtr:
6386   case Instruction::SIToFP:
6387   case Instruction::UIToFP:
6388   case Instruction::Trunc:
6389   case Instruction::FPTrunc:
6390   case Instruction::BitCast: {
6391     // We optimize the truncation of induction variables having constant
6392     // integer steps. The cost of these truncations is the same as the scalar
6393     // operation.
6394     if (isOptimizableIVTruncate(I, VF)) {
6395       auto *Trunc = cast<TruncInst>(I);
6396       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6397                                   Trunc->getSrcTy(), CostKind, Trunc);
6398     }
6399 
6400     Type *SrcScalarTy = I->getOperand(0)->getType();
6401     Type *SrcVecTy =
6402         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6403     if (canTruncateToMinimalBitwidth(I, VF)) {
      // This cast is going to be shrunk. This may remove the cast or it might
      // turn it into a slightly different cast. For example, if MinBW == 16,
      // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
6407       //
6408       // Calculate the modified src and dest types.
6409       Type *MinVecTy = VectorTy;
6410       if (I->getOpcode() == Instruction::Trunc) {
6411         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
6412         VectorTy =
6413             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6414       } else if (I->getOpcode() == Instruction::ZExt ||
6415                  I->getOpcode() == Instruction::SExt) {
6416         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
6417         VectorTy =
6418             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6419       }
6420     }
6421 
6422     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6423     return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy,
6424                                     CostKind, I);
6425   }
6426   case Instruction::Call: {
6427     bool NeedToScalarize;
6428     CallInst *CI = cast<CallInst>(I);
6429     unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
6430     if (getVectorIntrinsicIDForCall(CI, TLI))
6431       return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
6432     return CallCost;
6433   }
6434   default:
6435     // The cost of executing VF copies of the scalar instruction. This opcode
6436     // is unknown. Assume that it is the same as 'mul'.
6437     return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy,
6438                                            CostKind) +
6439            getScalarizationOverhead(I, VF);
6440   } // end of switch.
6441 }
6442 
6443 char LoopVectorize::ID = 0;
6444 
6445 static const char lv_name[] = "Loop Vectorization";
6446 
6447 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
6448 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
6449 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
6450 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
6451 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
6452 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
6453 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
6454 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
6455 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
6456 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
6457 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
6458 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
6459 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
6460 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
6461 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
6462 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
6463 
6464 namespace llvm {
6465 
6466 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
6467 
6468 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
6469                               bool VectorizeOnlyWhenForced) {
6470   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
6471 }
6472 
6473 } // end namespace llvm
6474 
6475 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
6476   // Check if the pointer operand of a load or store instruction is
6477   // consecutive.
6478   if (auto *Ptr = getLoadStorePointerOperand(Inst))
6479     return Legal->isConsecutivePtr(Ptr);
6480   return false;
6481 }
6482 
6483 void LoopVectorizationCostModel::collectValuesToIgnore() {
6484   // Ignore ephemeral values.
6485   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
6486 
6487   // Ignore type-promoting instructions we identified during reduction
6488   // detection.
6489   for (auto &Reduction : Legal->getReductionVars()) {
6490     RecurrenceDescriptor &RedDes = Reduction.second;
6491     SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6492     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6493   }
6494   // Ignore type-casting instructions we identified during induction
6495   // detection.
6496   for (auto &Induction : Legal->getInductionVars()) {
6497     InductionDescriptor &IndDes = Induction.second;
6498     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6499     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6500   }
6501 }
6502 
6503 // TODO: we could return a pair of values that specify the max VF and
6504 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
// `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment
6506 // doesn't have a cost model that can choose which plan to execute if
6507 // more than one is generated.
6508 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
6509                                  LoopVectorizationCostModel &CM) {
6510   unsigned WidestType;
6511   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
6512   return WidestVectorRegBits / WidestType;
6513 }
6514 
6515 VectorizationFactor
6516 LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) {
6517   unsigned VF = UserVF;
  // Outer loop handling: outer loops may require CFG and instruction level
  // transformations before even evaluating whether vectorization is
  // profitable. Since we cannot modify the incoming IR, we need to build
  // VPlan upfront in the vectorization pipeline.
6522   if (!OrigLoop->empty()) {
6523     // If the user doesn't provide a vectorization factor, determine a
6524     // reasonable one.
6525     if (!UserVF) {
6526       VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM);
6527       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
6528 
6529       // Make sure we have a VF > 1 for stress testing.
6530       if (VPlanBuildStressTest && VF < 2) {
6531         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
6532                           << "overriding computed VF.\n");
6533         VF = 4;
6534       }
6535     }
6536     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
6537     assert(isPowerOf2_32(VF) && "VF needs to be a power of two");
6538     LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF
6539                       << " to build VPlans.\n");
6540     buildVPlans(VF, VF);
6541 
6542     // For VPlan build stress testing, we bail out after VPlan construction.
6543     if (VPlanBuildStressTest)
6544       return VectorizationFactor::Disabled();
6545 
6546     return {VF, 0};
6547   }
6548 
6549   LLVM_DEBUG(
6550       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
6551                 "VPlan-native path.\n");
6552   return VectorizationFactor::Disabled();
6553 }
6554 
6555 Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) {
6556   assert(OrigLoop->empty() && "Inner loop expected.");
6557   Optional<unsigned> MaybeMaxVF = CM.computeMaxVF();
  if (!MaybeMaxVF) // Cases that should not be vectorized nor interleaved.
6559     return None;
6560 
6561   // Invalidate interleave groups if all blocks of loop will be predicated.
6562   if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
6563       !useMaskedInterleavedAccesses(*TTI)) {
6564     LLVM_DEBUG(
6565         dbgs()
6566         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
6567            "which requires masked-interleaved support.\n");
6568     if (CM.InterleaveInfo.invalidateGroups())
6569       // Invalidating interleave groups also requires invalidating all decisions
6570       // based on them, which includes widening decisions and uniform and scalar
6571       // values.
6572       CM.invalidateCostModelingDecisions();
6573   }
6574 
6575   if (UserVF) {
6576     LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
6577     assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
6578     // Collect the instructions (and their associated costs) that will be more
6579     // profitable to scalarize.
6580     CM.selectUserVectorizationFactor(UserVF);
6581     buildVPlansWithVPRecipes(UserVF, UserVF);
6582     LLVM_DEBUG(printPlans(dbgs()));
6583     return {{UserVF, 0}};
6584   }
6585 
6586   unsigned MaxVF = MaybeMaxVF.getValue();
6587   assert(MaxVF != 0 && "MaxVF is zero.");
6588 
6589   for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
6590     // Collect Uniform and Scalar instructions after vectorization with VF.
6591     CM.collectUniformsAndScalars(VF);
6592 
6593     // Collect the instructions (and their associated costs) that will be more
6594     // profitable to scalarize.
6595     if (VF > 1)
6596       CM.collectInstsToScalarize(VF);
6597   }
6598 
6599   buildVPlansWithVPRecipes(1, MaxVF);
6600   LLVM_DEBUG(printPlans(dbgs()));
6601   if (MaxVF == 1)
6602     return VectorizationFactor::Disabled();
6603 
6604   // Select the optimal vectorization factor.
6605   return CM.selectVectorizationFactor(MaxVF);
6606 }
6607 
6608 void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) {
6609   LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
6610                     << '\n');
6611   BestVF = VF;
6612   BestUF = UF;
6613 
6614   erase_if(VPlans, [VF](const VPlanPtr &Plan) {
6615     return !Plan->hasVF(VF);
6616   });
  assert(VPlans.size() == 1 && "Best VF does not have a single VPlan.");
6618 }
6619 
6620 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
6621                                            DominatorTree *DT) {
6622   // Perform the actual loop transformation.
6623 
6624   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
6625   VPCallbackILV CallbackILV(ILV);
6626 
6627   VPTransformState State{BestVF, BestUF,      LI,
6628                          DT,     ILV.Builder, ILV.VectorLoopValueMap,
6629                          &ILV,   CallbackILV};
6630   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
6631   State.TripCount = ILV.getOrCreateTripCount(nullptr);
6632   State.CanonicalIV = ILV.Induction;
6633 
6634   //===------------------------------------------------===//
6635   //
  // Notice: any optimization or new instruction that goes
  // into the code below should also be implemented in
6638   // the cost-model.
6639   //
6640   //===------------------------------------------------===//
6641 
6642   // 2. Copy and widen instructions from the old loop into the new loop.
6643   assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
6644   VPlans.front()->execute(&State);
6645 
6646   // 3. Fix the vectorized code: take care of header phi's, live-outs,
6647   //    predication, updating analyses.
6648   ILV.fixVectorizedLoop();
6649 }
6650 
6651 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
6652     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
6653   BasicBlock *Latch = OrigLoop->getLoopLatch();
6654 
6655   // We create new control-flow for the vectorized loop, so the original
6656   // condition will be dead after vectorization if it's only used by the
6657   // branch.
6658   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
6659   if (Cmp && Cmp->hasOneUse())
6660     DeadInstructions.insert(Cmp);
6661 
6662   // We create new "steps" for induction variable updates to which the original
6663   // induction variables map. An original update instruction will be dead if
6664   // all its users except the induction variable are dead.
6665   for (auto &Induction : Legal->getInductionVars()) {
6666     PHINode *Ind = Induction.first;
6667     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
6668     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
6669           return U == Ind || DeadInstructions.find(cast<Instruction>(U)) !=
6670                                  DeadInstructions.end();
6671         }))
6672       DeadInstructions.insert(IndUpdate);
6673 
6674     // We record as "Dead" also the type-casting instructions we had identified
6675     // during induction analysis. We don't need any handling for them in the
6676     // vectorized loop because we have proven that, under a proper runtime
6677     // test guarding the vectorized loop, the value of the phi, and the casted
6678     // value of the phi, are the same. The last instruction in this casting chain
6679     // will get its scalar/vector/widened def from the scalar/vector/widened def
6680     // of the respective phi node. Any other casts in the induction def-use chain
6681     // have no other uses outside the phi update chain, and will be ignored.
6682     InductionDescriptor &IndDes = Induction.second;
6683     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6684     DeadInstructions.insert(Casts.begin(), Casts.end());
6685   }
6686 }
6687 
6688 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
6689 
6690 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
6691 
6692 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
6693                                         Instruction::BinaryOps BinOp) {
6694   // When unrolling and the VF is 1, we only need to add a simple scalar.
6695   Type *Ty = Val->getType();
6696   assert(!Ty->isVectorTy() && "Val must be a scalar");
6697 
6698   if (Ty->isFloatingPointTy()) {
6699     Constant *C = ConstantFP::get(Ty, (double)StartIdx);
6700 
6701     // Floating point operations had to be 'fast' to enable the unrolling.
6702     Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
6703     return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
6704   }
6705   Constant *C = ConstantInt::get(Ty, StartIdx);
6706   return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
6707 }
6708 
6709 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
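  // An illustrative sketch of the loop metadata this ends up attaching to the
  // loop's latch branch (the metadata numbering is hypothetical):
  //   br i1 %cond, label %loop, label %exit, !llvm.loop !0
  //   !0 = distinct !{!0, !1}
  //   !1 = !{!"llvm.loop.unroll.runtime.disable"}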
6710   SmallVector<Metadata *, 4> MDs;
6711   // Reserve first location for self reference to the LoopID metadata node.
6712   MDs.push_back(nullptr);
6713   bool IsUnrollMetadata = false;
6714   MDNode *LoopID = L->getLoopID();
6715   if (LoopID) {
6716     // First find existing loop unrolling disable metadata.
6717     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
6718       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
6719       if (MD) {
6720         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
6721         IsUnrollMetadata =
6722             S && S->getString().startswith("llvm.loop.unroll.disable");
6723       }
6724       MDs.push_back(LoopID->getOperand(i));
6725     }
6726   }
6727 
6728   if (!IsUnrollMetadata) {
6729     // Add runtime unroll disable metadata.
6730     LLVMContext &Context = L->getHeader()->getContext();
6731     SmallVector<Metadata *, 1> DisableOperands;
6732     DisableOperands.push_back(
6733         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
6734     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
6735     MDs.push_back(DisableNode);
6736     MDNode *NewLoopID = MDNode::get(Context, MDs);
6737     // Set operand 0 to refer to the loop id itself.
6738     NewLoopID->replaceOperandWith(0, NewLoopID);
6739     L->setLoopID(NewLoopID);
6740   }
6741 }
6742 
6743 bool LoopVectorizationPlanner::getDecisionAndClampRange(
6744     const std::function<bool(unsigned)> &Predicate, VFRange &Range) {
6745   assert(Range.End > Range.Start && "Trying to test an empty VF range.");
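  // Illustrative example (a sketch): for Range = {Start=1, End=9} and a
  // predicate that holds for VF=1,2 but not for VF=4, the loop below clamps
  // Range.End to 4 and returns the predicate's value at the range start,
  // i.e. true.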
6746   bool PredicateAtRangeStart = Predicate(Range.Start);
6747 
6748   for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2)
6749     if (Predicate(TmpVF) != PredicateAtRangeStart) {
6750       Range.End = TmpVF;
6751       break;
6752     }
6753 
6754   return PredicateAtRangeStart;
6755 }
6756 
6757 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
6758 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
6759 /// of VF's starting at a given VF and extending it as much as possible. Each
6760 /// vectorization decision can potentially shorten this sub-range during
6761 /// buildVPlan().
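/// For example (illustrative): with MinVF = 1 and MaxVF = 8, a first call to
/// buildVPlan may cover the sub-range {1,2} by clamping SubRange.End to 4; a
/// second call then covers {4,8}, producing two VPlans in total.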
6762 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {
6763   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
6764     VFRange SubRange = {VF, MaxVF + 1};
6765     VPlans.push_back(buildVPlan(SubRange));
6766     VF = SubRange.End;
6767   }
6768 }
6769 
6770 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
6771                                          VPlanPtr &Plan) {
6772   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
6773 
6774   // Look for cached value.
6775   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
6776   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
6777   if (ECEntryIt != EdgeMaskCache.end())
6778     return ECEntryIt->second;
6779 
6780   VPValue *SrcMask = createBlockInMask(Src, Plan);
6781 
6782   // The terminator has to be a branch inst!
6783   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
6784   assert(BI && "Unexpected terminator found");
6785 
6786   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
6787     return EdgeMaskCache[Edge] = SrcMask;
6788 
6789   VPValue *EdgeMask = Plan->getVPValue(BI->getCondition());
6790   assert(EdgeMask && "No Edge Mask found for condition");
6791 
6792   if (BI->getSuccessor(0) != Dst)
6793     EdgeMask = Builder.createNot(EdgeMask);
6794 
6795   if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
6796     EdgeMask = Builder.createAnd(EdgeMask, SrcMask);
6797 
6798   return EdgeMaskCache[Edge] = EdgeMask;
6799 }
6800 
6801 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
6802   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
6803 
6804   // Look for cached value.
6805   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
6806   if (BCEntryIt != BlockMaskCache.end())
6807     return BCEntryIt->second;
6808 
6809   // All-one mask is modelled as no-mask following the convention for masked
6810   // load/store/gather/scatter. Initialize BlockMask to no-mask.
6811   VPValue *BlockMask = nullptr;
6812 
6813   if (OrigLoop->getHeader() == BB) {
6814     if (!CM.blockNeedsPredication(BB))
6815       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
6816 
6817     // Introduce the early-exit compare IV <= BTC to form header block mask.
6818     // This is used instead of IV < TC because TC may wrap, unlike BTC.
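    // For example (illustrative): with a trip count of 10 (BTC = 9) and
    // VF = 4, the last vector iteration compares lanes {8,9,10,11} against 9,
    // yielding the mask <1,1,0,0> and disabling the two out-of-bounds lanes.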
6819     // Start by constructing the desired canonical IV.
6820     VPValue *IV = nullptr;
6821     if (Legal->getPrimaryInduction())
6822       IV = Plan->getVPValue(Legal->getPrimaryInduction());
6823     else {
6824       auto IVRecipe = new VPWidenCanonicalIVRecipe();
6825       Builder.getInsertBlock()->appendRecipe(IVRecipe);
6826       IV = IVRecipe->getVPValue();
6827     }
6828     VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
6829     BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
6830     return BlockMaskCache[BB] = BlockMask;
6831   }
6832 
6833   // This is the block mask. We OR all incoming edges.
6834   for (auto *Predecessor : predecessors(BB)) {
6835     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
6836     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
6837       return BlockMaskCache[BB] = EdgeMask;
6838 
6839     if (!BlockMask) { // BlockMask has its initialized nullptr value.
6840       BlockMask = EdgeMask;
6841       continue;
6842     }
6843 
6844     BlockMask = Builder.createOr(BlockMask, EdgeMask);
6845   }
6846 
6847   return BlockMaskCache[BB] = BlockMask;
6848 }
6849 
6850 VPWidenMemoryInstructionRecipe *
6851 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
6852                                   VPlanPtr &Plan) {
6853   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
6854          "Must be called with either a load or store");
6855 
6856   auto willWiden = [&](unsigned VF) -> bool {
6857     if (VF == 1)
6858       return false;
6859     LoopVectorizationCostModel::InstWidening Decision =
6860         CM.getWideningDecision(I, VF);
6861     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
6862            "CM decision should be taken at this point.");
6863     if (Decision == LoopVectorizationCostModel::CM_Interleave)
6864       return true;
6865     if (CM.isScalarAfterVectorization(I, VF) ||
6866         CM.isProfitableToScalarize(I, VF))
6867       return false;
6868     return Decision != LoopVectorizationCostModel::CM_Scalarize;
6869   };
6870 
6871   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6872     return nullptr;
6873 
6874   VPValue *Mask = nullptr;
6875   if (Legal->isMaskRequired(I))
6876     Mask = createBlockInMask(I->getParent(), Plan);
6877 
6878   VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I));
6879   if (LoadInst *Load = dyn_cast<LoadInst>(I))
6880     return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask);
6881 
6882   StoreInst *Store = cast<StoreInst>(I);
6883   VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand());
6884   return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask);
6885 }
6886 
6887 VPWidenIntOrFpInductionRecipe *
6888 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi) const {
6889   // Check if this is an integer or fp induction. If so, build the recipe that
6890   // produces its scalar and vector values.
6891   InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
6892   if (II.getKind() == InductionDescriptor::IK_IntInduction ||
6893       II.getKind() == InductionDescriptor::IK_FpInduction)
6894     return new VPWidenIntOrFpInductionRecipe(Phi);
6895 
6896   return nullptr;
6897 }
6898 
6899 VPWidenIntOrFpInductionRecipe *
6900 VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I,
6901                                                 VFRange &Range) const {
6902   // Optimize the special case where the source is a constant integer
6903   // induction variable. Notice that we can only optimize the 'trunc' case
6904   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
6905   // (c) other casts depend on pointer size.
6906 
6907   // Determine whether \p K is a truncation based on an induction variable that
6908   // can be optimized.
6909   auto isOptimizableIVTruncate =
6910       [&](Instruction *K) -> std::function<bool(unsigned)> {
6911     return
6912         [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); };
6913   };
6914 
6915   if (LoopVectorizationPlanner::getDecisionAndClampRange(
6916           isOptimizableIVTruncate(I), Range))
6917     return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
6918                                              I);
6919   return nullptr;
6920 }
6921 
6922 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) {
6923   // We know that all PHIs in non-header blocks are converted into selects, so
6924   // we don't have to worry about the insertion order and we can just use the
6925   // builder. At this point we generate the predication tree. There may be
6926   // duplications since this is a simple recursive scan, but future
6927   // optimizations will clean it up.
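  //
  // For illustration (a sketch): a phi with incoming values %a and %b ends up
  // with operands (%a, edge-mask-of-a, %b, edge-mask-of-b), which the blend
  // recipe later lowers to selects.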
6928 
6929   SmallVector<VPValue *, 2> Operands;
6930   unsigned NumIncoming = Phi->getNumIncomingValues();
6931   for (unsigned In = 0; In < NumIncoming; In++) {
6932     VPValue *EdgeMask =
6933       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
6934     assert((EdgeMask || NumIncoming == 1) &&
6935            "Multiple predecessors with one having a full mask");
6936     Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In)));
6937     if (EdgeMask)
6938       Operands.push_back(EdgeMask);
6939   }
6940   return new VPBlendRecipe(Phi, Operands);
6941 }
6942 
6943 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range,
6944                                                    VPlan &Plan) const {
6945 
6946   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
6947       [this, CI](unsigned VF) { return CM.isScalarWithPredication(CI, VF); },
6948       Range);
6949 
6950   if (IsPredicated)
6951     return nullptr;
6952 
6953   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6954   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
6955              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
6956     return nullptr;
6957 
6958   auto willWiden = [&](unsigned VF) -> bool {
6959     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
    // The following case may be scalarized depending on the VF.
    // The flag shows whether we use an intrinsic or a plain call for the
    // vectorized version of the instruction, i.e. whether it is beneficial
    // to perform an intrinsic call compared to a library call.
6964     bool NeedToScalarize = false;
6965     unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
6966     bool UseVectorIntrinsic =
6967         ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
6968     return UseVectorIntrinsic || !NeedToScalarize;
6969   };
6970 
6971   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6972     return nullptr;
6973 
6974   return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands()));
6975 }
6976 
6977 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
6978   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
6979          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
  // The instruction should be widened, unless it is scalar after
  // vectorization, scalarization is profitable, or it is predicated.
6982   auto WillScalarize = [this, I](unsigned VF) -> bool {
6983     return CM.isScalarAfterVectorization(I, VF) ||
6984            CM.isProfitableToScalarize(I, VF) ||
6985            CM.isScalarWithPredication(I, VF);
6986   };
6987   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
6988                                                              Range);
6989 }
6990 
6991 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const {
6992   auto IsVectorizableOpcode = [](unsigned Opcode) {
6993     switch (Opcode) {
6994     case Instruction::Add:
6995     case Instruction::And:
6996     case Instruction::AShr:
6997     case Instruction::BitCast:
6998     case Instruction::FAdd:
6999     case Instruction::FCmp:
7000     case Instruction::FDiv:
7001     case Instruction::FMul:
7002     case Instruction::FNeg:
7003     case Instruction::FPExt:
7004     case Instruction::FPToSI:
7005     case Instruction::FPToUI:
7006     case Instruction::FPTrunc:
7007     case Instruction::FRem:
7008     case Instruction::FSub:
7009     case Instruction::ICmp:
7010     case Instruction::IntToPtr:
7011     case Instruction::LShr:
7012     case Instruction::Mul:
7013     case Instruction::Or:
7014     case Instruction::PtrToInt:
7015     case Instruction::SDiv:
7016     case Instruction::Select:
7017     case Instruction::SExt:
7018     case Instruction::Shl:
7019     case Instruction::SIToFP:
7020     case Instruction::SRem:
7021     case Instruction::Sub:
7022     case Instruction::Trunc:
7023     case Instruction::UDiv:
7024     case Instruction::UIToFP:
7025     case Instruction::URem:
7026     case Instruction::Xor:
7027     case Instruction::ZExt:
7028       return true;
7029     }
7030     return false;
7031   };
7032 
7033   if (!IsVectorizableOpcode(I->getOpcode()))
7034     return nullptr;
7035 
7036   // Success: widen this instruction.
7037   return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands()));
7038 }
7039 
7040 VPBasicBlock *VPRecipeBuilder::handleReplication(
7041     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
7042     DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
7043     VPlanPtr &Plan) {
7044   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
7045       [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); },
7046       Range);
7047 
7048   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
7049       [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
7050 
7051   auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated);
7052   setRecipe(I, Recipe);
7053 
7054   // Find if I uses a predicated instruction. If so, it will use its scalar
7055   // value. Avoid hoisting the insert-element which packs the scalar value into
7056   // a vector value, as that happens iff all users use the vector value.
7057   for (auto &Op : I->operands())
7058     if (auto *PredInst = dyn_cast<Instruction>(Op))
7059       if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
7060         PredInst2Recipe[PredInst]->setAlsoPack(false);
7061 
7062   // Finalize the recipe for Instr, first if it is not predicated.
7063   if (!IsPredicated) {
7064     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
7065     VPBB->appendRecipe(Recipe);
7066     return VPBB;
7067   }
7068   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
7069   assert(VPBB->getSuccessors().empty() &&
7070          "VPBB has successors when handling predicated replication.");
7071   // Record predicated instructions for above packing optimizations.
7072   PredInst2Recipe[I] = Recipe;
7073   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
7074   VPBlockUtils::insertBlockAfter(Region, VPBB);
7075   auto *RegSucc = new VPBasicBlock();
7076   VPBlockUtils::insertBlockAfter(RegSucc, Region);
7077   return RegSucc;
7078 }
7079 
7080 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
7081                                                       VPRecipeBase *PredRecipe,
7082                                                       VPlanPtr &Plan) {
7083   // Instructions marked for predication are replicated and placed under an
7084   // if-then construct to prevent side-effects.
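  //
  // An illustrative sketch of the resulting region for a predicated load
  // (block names follow the RegionName scheme used below):
  //
  //   pred.load.entry     (branch on the block mask)
  //      |         \
  //      |      pred.load.if        (replicated instruction)
  //      |         /
  //   pred.load.continue  (phi for the produced value, if any)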
7085 
7086   // Generate recipes to compute the block mask for this region.
7087   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
7088 
7089   // Build the triangular if-then region.
7090   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
7091   assert(Instr->getParent() && "Predicated instruction not in any basic block");
7092   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
7093   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
7094   auto *PHIRecipe =
7095       Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr);
7096   auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
7097   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
7098   VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
7099 
7100   // Note: first set Entry as region entry and then connect successors starting
7101   // from it in order, to propagate the "parent" of each VPBasicBlock.
7102   VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
7103   VPBlockUtils::connectBlocks(Pred, Exit);
7104 
7105   return Region;
7106 }
7107 
7108 VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
7109                                                       VFRange &Range,
7110                                                       VPlanPtr &Plan) {
7111   // First, check for specific widening recipes that deal with calls, memory
7112   // operations, inductions and Phi nodes.
7113   if (auto *CI = dyn_cast<CallInst>(Instr))
7114     return tryToWidenCall(CI, Range, *Plan);
7115 
7116   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
7117     return tryToWidenMemory(Instr, Range, Plan);
7118 
7119   VPRecipeBase *Recipe;
7120   if (auto Phi = dyn_cast<PHINode>(Instr)) {
7121     if (Phi->getParent() != OrigLoop->getHeader())
7122       return tryToBlend(Phi, Plan);
7123     if ((Recipe = tryToOptimizeInductionPHI(Phi)))
7124       return Recipe;
    return new VPWidenPHIRecipe(Phi);
7127   }
7128 
7129   if (isa<TruncInst>(Instr) &&
7130       (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Range)))
7131     return Recipe;
7132 
7133   if (!shouldWiden(Instr, Range))
7134     return nullptr;
7135 
7136   if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
7137     return new VPWidenGEPRecipe(GEP, OrigLoop);
7138 
7139   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
7140     bool InvariantCond =
7141         PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
7142     return new VPWidenSelectRecipe(*SI, InvariantCond);
7143   }
7144 
7145   return tryToWiden(Instr, *Plan);
7146 }
7147 
7148 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
7149                                                         unsigned MaxVF) {
7150   assert(OrigLoop->empty() && "Inner loop expected.");
7151 
7152   // Collect conditions feeding internal conditional branches; they need to be
7153   // represented in VPlan for it to model masking.
7154   SmallPtrSet<Value *, 1> NeedDef;
7155 
7156   auto *Latch = OrigLoop->getLoopLatch();
7157   for (BasicBlock *BB : OrigLoop->blocks()) {
7158     if (BB == Latch)
7159       continue;
7160     BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
7161     if (Branch && Branch->isConditional())
7162       NeedDef.insert(Branch->getCondition());
7163   }
7164 
  // If the tail is to be folded by masking, the primary induction variable, if
  // it exists, needs to be represented in VPlan to model early-exit masking.
7167   // Also, both the Phi and the live-out instruction of each reduction are
7168   // required in order to introduce a select between them in VPlan.
7169   if (CM.foldTailByMasking()) {
7170     if (Legal->getPrimaryInduction())
7171       NeedDef.insert(Legal->getPrimaryInduction());
7172     for (auto &Reduction : Legal->getReductionVars()) {
7173       NeedDef.insert(Reduction.first);
7174       NeedDef.insert(Reduction.second.getLoopExitInstr());
7175     }
7176   }
7177 
7178   // Collect instructions from the original loop that will become trivially dead
7179   // in the vectorized loop. We don't need to vectorize these instructions. For
7180   // example, original induction update instructions can become dead because we
7181   // separately emit induction "steps" when generating code for the new loop.
7182   // Similarly, we create a new latch condition when setting up the structure
7183   // of the new loop, so the old one can become dead.
7184   SmallPtrSet<Instruction *, 4> DeadInstructions;
7185   collectTriviallyDeadInstructions(DeadInstructions);
7186 
7187   // Add assume instructions we need to drop to DeadInstructions, to prevent
7188   // them from being added to the VPlan.
  // TODO: We only need to drop assumes in blocks that get flattened. If the
7190   // control flow is preserved, we should keep them.
7191   auto &ConditionalAssumes = Legal->getConditionalAssumes();
7192   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
7193 
7194   DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
7195   // Dead instructions do not need sinking. Remove them from SinkAfter.
7196   for (Instruction *I : DeadInstructions)
7197     SinkAfter.erase(I);
7198 
7199   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
7200     VFRange SubRange = {VF, MaxVF + 1};
7201     VPlans.push_back(buildVPlanWithVPRecipes(SubRange, NeedDef,
7202                                              DeadInstructions, SinkAfter));
7203     VF = SubRange.End;
7204   }
7205 }
7206 
7207 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
7208     VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
7209     SmallPtrSetImpl<Instruction *> &DeadInstructions,
7210     const DenseMap<Instruction *, Instruction *> &SinkAfter) {
7211 
7212   // Hold a mapping from predicated instructions to their recipes, in order to
7213   // fix their AlsoPack behavior if a user is determined to replicate and use a
7214   // scalar instead of vector value.
7215   DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
7216 
7217   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
7218 
7219   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
7220 
7221   // ---------------------------------------------------------------------------
7222   // Pre-construction: record ingredients whose recipes we'll need to further
7223   // process after constructing the initial VPlan.
7224   // ---------------------------------------------------------------------------
7225 
7226   // Mark instructions we'll need to sink later and their targets as
7227   // ingredients whose recipe we'll need to record.
7228   for (auto &Entry : SinkAfter) {
7229     RecipeBuilder.recordRecipeOf(Entry.first);
7230     RecipeBuilder.recordRecipeOf(Entry.second);
7231   }
7232 
7233   // For each interleave group which is relevant for this (possibly trimmed)
7234   // Range, add it to the set of groups to be later applied to the VPlan and add
7235   // placeholders for its members' Recipes which we'll be replacing with a
7236   // single VPInterleaveRecipe.
7237   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
7238     auto applyIG = [IG, this](unsigned VF) -> bool {
7239       return (VF >= 2 && // Query is illegal for VF == 1
7240               CM.getWideningDecision(IG->getInsertPos(), VF) ==
7241                   LoopVectorizationCostModel::CM_Interleave);
7242     };
7243     if (!getDecisionAndClampRange(applyIG, Range))
7244       continue;
7245     InterleaveGroups.insert(IG);
7246     for (unsigned i = 0; i < IG->getFactor(); i++)
7247       if (Instruction *Member = IG->getMember(i))
7248         RecipeBuilder.recordRecipeOf(Member);
7249   };
7250 
7251   // ---------------------------------------------------------------------------
7252   // Build initial VPlan: Scan the body of the loop in a topological order to
7253   // visit each basic block after having visited its predecessor basic blocks.
7254   // ---------------------------------------------------------------------------
7255 
7256   // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
7257   auto Plan = std::make_unique<VPlan>();
7258   VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
7259   Plan->setEntry(VPBB);
7260 
7261   // Represent values that will have defs inside VPlan.
7262   for (Value *V : NeedDef)
7263     Plan->addVPValue(V);
7264 
7265   // Scan the body of the loop in a topological order to visit each basic block
7266   // after having visited its predecessor basic blocks.
7267   LoopBlocksDFS DFS(OrigLoop);
7268   DFS.perform(LI);
7269 
7270   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
7271     // Relevant instructions from basic block BB will be grouped into VPRecipe
7272     // ingredients and fill a new VPBasicBlock.
7273     unsigned VPBBsForBB = 0;
7274     auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
7275     VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
7276     VPBB = FirstVPBBForBB;
7277     Builder.setInsertPoint(VPBB);
7278 
7279     // Introduce each ingredient into VPlan.
    // TODO: Model and preserve debug intrinsics in VPlan.
7281     for (Instruction &I : BB->instructionsWithoutDebug()) {
7282       Instruction *Instr = &I;
7283 
7284       // First filter out irrelevant instructions, to ensure no recipes are
7285       // built for them.
7286       if (isa<BranchInst>(Instr) ||
7287           DeadInstructions.find(Instr) != DeadInstructions.end())
7288         continue;
7289 
7290       if (auto Recipe =
7291               RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) {
7292         RecipeBuilder.setRecipe(Instr, Recipe);
7293         VPBB->appendRecipe(Recipe);
7294         continue;
7295       }
7296 
      // Otherwise, if all widening options failed, the instruction is to be
      // replicated. This may create a successor for VPBB.
7299       VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
7300           Instr, Range, VPBB, PredInst2Recipe, Plan);
7301       if (NextVPBB != VPBB) {
7302         VPBB = NextVPBB;
7303         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
7304                                     : "");
7305       }
7306     }
7307   }
7308 
  // Discard the empty dummy pre-entry VPBasicBlock. Note that other
  // VPBasicBlocks may also be empty, such as the last one (VPBB), reflecting
  // original basic blocks with no recipes.
7312   VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
7313   assert(PreEntry->empty() && "Expecting empty pre-entry block.");
7314   VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
7315   VPBlockUtils::disconnectBlocks(PreEntry, Entry);
7316   delete PreEntry;
7317 
7318   // ---------------------------------------------------------------------------
7319   // Transform initial VPlan: Apply previously taken decisions, in order, to
7320   // bring the VPlan to its final state.
7321   // ---------------------------------------------------------------------------
7322 
7323   // Apply Sink-After legal constraints.
7324   for (auto &Entry : SinkAfter) {
7325     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
7326     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
7327     Sink->moveAfter(Target);
7328   }
7329 
7330   // Interleave memory: for each Interleave Group we marked earlier as relevant
7331   // for this VPlan, replace the Recipes widening its memory instructions with a
7332   // single VPInterleaveRecipe at its insertion point.
7333   for (auto IG : InterleaveGroups) {
7334     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
7335         RecipeBuilder.getRecipe(IG->getInsertPos()));
7336     (new VPInterleaveRecipe(IG, Recipe->getAddr(), Recipe->getMask()))
7337         ->insertBefore(Recipe);
7338 
7339     for (unsigned i = 0; i < IG->getFactor(); ++i)
7340       if (Instruction *Member = IG->getMember(i)) {
7341         RecipeBuilder.getRecipe(Member)->eraseFromParent();
7342       }
7343   }
7344 
7345   // Finally, if tail is folded by masking, introduce selects between the phi
7346   // and the live-out instruction of each reduction, at the end of the latch.
7347   if (CM.foldTailByMasking()) {
7348     Builder.setInsertPoint(VPBB);
7349     auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
7350     for (auto &Reduction : Legal->getReductionVars()) {
7351       VPValue *Phi = Plan->getVPValue(Reduction.first);
7352       VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr());
7353       Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
7354     }
7355   }
7356 
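  // Name the plan after the range of vectorization factors it covers, e.g.
  // "Initial VPlan for VF={4,8},UF>=1" (the VFs shown are illustrative).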
7357   std::string PlanName;
7358   raw_string_ostream RSO(PlanName);
7359   unsigned VF = Range.Start;
7360   Plan->addVF(VF);
7361   RSO << "Initial VPlan for VF={" << VF;
7362   for (VF *= 2; VF < Range.End; VF *= 2) {
7363     Plan->addVF(VF);
7364     RSO << "," << VF;
7365   }
7366   RSO << "},UF>=1";
7367   RSO.flush();
7368   Plan->setName(PlanName);
7369 
7370   return Plan;
7371 }
7372 
7373 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
  // Outer loop handling: outer loops may require CFG and instruction level
  // transformations before even evaluating whether vectorization is profitable.
7376   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7377   // the vectorization pipeline.
7378   assert(!OrigLoop->empty());
7379   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7380 
7381   // Create new empty VPlan
7382   auto Plan = std::make_unique<VPlan>();
7383 
7384   // Build hierarchical CFG
7385   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
7386   HCFGBuilder.buildHierarchicalCFG();
7387 
7388   for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
7389     Plan->addVF(VF);
7390 
7391   if (EnableVPlanPredication) {
7392     VPlanPredicator VPP(*Plan);
7393     VPP.predicate();
7394 
7395     // Avoid running transformation to recipes until masked code generation in
7396     // VPlan-native path is in place.
7397     return Plan;
7398   }
7399 
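  // Lower the VPInstructions of the plan to recipes. No instructions are known
  // to be dead at this point, so an empty set is passed.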
7400   SmallPtrSet<Instruction *, 1> DeadInstructions;
7401   VPlanTransforms::VPInstructionsToVPRecipes(
7402       OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
7403   return Plan;
7404 }
7405 
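// VPCallbackILV routes value queries issued during VPlan execution back to the
// InnerLoopVectorizer, which owns the vector and scalar value maps.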
Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateVectorValues(
    Value *V, unsigned Part) {
  return ILV.getOrCreateVectorValue(V, Part);
}
7410 
7411 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue(
7412     Value *V, const VPIteration &Instance) {
7413   return ILV.getOrCreateScalarValue(V, Instance);
7414 }
7415 
7416 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
7417                                VPSlotTracker &SlotTracker) const {
7418   O << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
7419   IG->getInsertPos()->printAsOperand(O, false);
7420   O << ", ";
7421   getAddr()->printAsOperand(O, SlotTracker);
7422   VPValue *Mask = getMask();
7423   if (Mask) {
7424     O << ", ";
7425     Mask->printAsOperand(O, SlotTracker);
7426   }
7427   for (unsigned i = 0; i < IG->getFactor(); ++i)
7428     if (Instruction *I = IG->getMember(i))
7429       O << "\\l\" +\n" << Indent << "\"  " << VPlanIngredient(I) << " " << i;
7430 }
7431 
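// The execute() implementations below mostly forward the widening of their
// ingredient to the corresponding InnerLoopVectorizer helper.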
7432 void VPWidenCallRecipe::execute(VPTransformState &State) {
7433   State.ILV->widenCallInstruction(Ingredient, User, State);
7434 }
7435 
7436 void VPWidenSelectRecipe::execute(VPTransformState &State) {
7437   State.ILV->widenSelectInstruction(Ingredient, InvariantCond);
7438 }
7439 
7440 void VPWidenRecipe::execute(VPTransformState &State) {
7441   State.ILV->widenInstruction(Ingredient, User, State);
7442 }
7443 
7444 void VPWidenGEPRecipe::execute(VPTransformState &State) {
7445   State.ILV->widenGEP(GEP, State.UF, State.VF, IsPtrLoopInvariant,
7446                       IsIndexLoopInvariant);
7447 }
7448 
7449 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
7450   assert(!State.Instance && "Int or FP induction being replicated.");
7451   State.ILV->widenIntOrFpInduction(IV, Trunc);
7452 }
7453 
7454 void VPWidenPHIRecipe::execute(VPTransformState &State) {
7455   State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
7456 }
7457 
7458 void VPBlendRecipe::execute(VPTransformState &State) {
7459   State.ILV->setDebugLocFromInst(State.Builder, Phi);
7460   // We know that all PHIs in non-header blocks are converted into
7461   // selects, so we don't have to worry about the insertion order and we
7462   // can just use the builder.
7463   // At this point we generate the predication tree. There may be
7464   // duplications since this is a simple recursive scan, but future
7465   // optimizations will clean it up.
7466 
7467   unsigned NumIncoming = getNumIncomingValues();
7468 
7469   // Generate a sequence of selects of the form:
7470   // SELECT(Mask3, In3,
7471   //        SELECT(Mask2, In2,
7472   //               SELECT(Mask1, In1,
7473   //                      In0)))
  // Note that Mask0 is never used: lanes for which no path reaches this phi,
  // and which are essentially undef, are taken from In0.
7476   InnerLoopVectorizer::VectorParts Entry(State.UF);
7477   for (unsigned In = 0; In < NumIncoming; ++In) {
7478     for (unsigned Part = 0; Part < State.UF; ++Part) {
7479       // We might have single edge PHIs (blocks) - use an identity
7480       // 'select' for the first PHI operand.
7481       Value *In0 = State.get(getIncomingValue(In), Part);
7482       if (In == 0)
7483         Entry[Part] = In0; // Initialize with the first incoming value.
7484       else {
7485         // Select between the current value and the previous incoming edge
7486         // based on the incoming mask.
7487         Value *Cond = State.get(getMask(In), Part);
7488         Entry[Part] =
7489             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
7490       }
7491     }
7492   }
7493   for (unsigned Part = 0; Part < State.UF; ++Part)
7494     State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
7495 }
7496 
7497 void VPInterleaveRecipe::execute(VPTransformState &State) {
7498   assert(!State.Instance && "Interleave group being replicated.");
7499   State.ILV->vectorizeInterleaveGroup(IG, State, getAddr(), getMask());
7500 }
7501 
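// Generate replicated (scalar) instances of the ingredient: either the single
// instance identified by State.Instance, or one instance per part and lane
// (only the first lane of each part when the instruction is uniform).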
7502 void VPReplicateRecipe::execute(VPTransformState &State) {
7503   if (State.Instance) { // Generate a single instance.
7504     State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated);
7505     // Insert scalar instance packing it into a vector.
7506     if (AlsoPack && State.VF > 1) {
7507       // If we're constructing lane 0, initialize to start from undef.
7508       if (State.Instance->Lane == 0) {
7509         Value *Undef =
7510             UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
7511         State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
7512       }
7513       State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
7514     }
7515     return;
7516   }
7517 
  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for
  // each of the UF parts.
7521   unsigned EndLane = IsUniform ? 1 : State.VF;
7522   for (unsigned Part = 0; Part < State.UF; ++Part)
7523     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
7524       State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated);
7525 }
7526 
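// Replace the temporary unreachable terminator of the current block with a
// conditional branch on the mask bit of the instance's lane, or on a true
// constant when the block-in mask is known to be all-one.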
7527 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
7528   assert(State.Instance && "Branch on Mask works only on single instance.");
7529 
7530   unsigned Part = State.Instance->Part;
7531   unsigned Lane = State.Instance->Lane;
7532 
7533   Value *ConditionBit = nullptr;
7534   if (!User) // Block in mask is all-one.
7535     ConditionBit = State.Builder.getTrue();
7536   else {
7537     VPValue *BlockInMask = User->getOperand(0);
7538     ConditionBit = State.get(BlockInMask, Part);
7539     if (ConditionBit->getType()->isVectorTy())
7540       ConditionBit = State.Builder.CreateExtractElement(
7541           ConditionBit, State.Builder.getInt32(Lane));
7542   }
7543 
7544   // Replace the temporary unreachable terminator with a new conditional branch,
7545   // whose two destinations will be set later when they are created.
7546   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
7547   assert(isa<UnreachableInst>(CurrentTerminator) &&
7548          "Expected to replace unreachable terminator with conditional branch.");
7549   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
7550   CondBr->setSuccessor(0, nullptr);
7551   ReplaceInstWithInst(CurrentTerminator, CondBr);
7552 }
7553 
7554 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
7555   assert(State.Instance && "Predicated instruction PHI works per instance.");
7556   Instruction *ScalarPredInst = cast<Instruction>(
7557       State.ValueMap.getScalarValue(PredInst, *State.Instance));
7558   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
7559   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
7560   assert(PredicatingBB && "Predicated block has no single predecessor.");
7561 
7562   // By current pack/unpack logic we need to generate only a single phi node: if
7563   // a vector value for the predicated instruction exists at this point it means
7564   // the instruction has vector users only, and a phi for the vector value is
7565   // needed. In this case the recipe of the predicated instruction is marked to
7566   // also do that packing, thereby "hoisting" the insert-element sequence.
7567   // Otherwise, a phi node for the scalar value is needed.
7568   unsigned Part = State.Instance->Part;
7569   if (State.ValueMap.hasVectorValue(PredInst, Part)) {
7570     Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
7571     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
7572     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
7573     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
7574     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
7575     State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
7576   } else {
7577     Type *PredInstType = PredInst->getType();
7578     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
7579     Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
7580     Phi->addIncoming(ScalarPredInst, PredicatedBB);
7581     State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
7582   }
7583 }
7584 
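// Widening of a memory access: stores provide the value being stored as an
// extra operand, while loads pass a null StoredValue.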
7585 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
7586   VPValue *StoredValue = isa<StoreInst>(Instr) ? getStoredValue() : nullptr;
7587   State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), StoredValue,
7588                                         getMask());
7589 }
7590 
// Determine how to lower the scalar epilogue, which depends on 1) optimizing
// for minimum code size, 2) predication compiler options, 3) loop hints
// forcing predication, and 4) a TTI hook that analyzes whether the loop is
// suitable for predication.
7595 static ScalarEpilogueLowering getScalarEpilogueLowering(
7596     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
7597     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
7598     AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
7599     LoopVectorizationLegality &LVL) {
7600   bool OptSize =
7601       F->hasOptSize() || llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
7602                                                      PGSOQueryType::IRPass);
7603   // 1) OptSize takes precedence over all other options, i.e. if this is set,
7604   // don't look at hints or options, and don't request a scalar epilogue.
7605   if (OptSize && Hints.getForce() != LoopVectorizeHints::FK_Enabled)
7606     return CM_ScalarEpilogueNotAllowedOptSize;
7607 
7608   bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() &&
7609                               !PreferPredicateOverEpilog;
7610 
7611   // 2) Next, if disabling predication is requested on the command line, honour
7612   // this and request a scalar epilogue.
7613   if (PredicateOptDisabled)
7614     return CM_ScalarEpilogueAllowed;
7615 
  // 3) and 4) Check if predication is requested on the command line or with a
  // loop hint, or if the TTI hook indicates it is profitable; if so, request
  // predication.
7619   if (PreferPredicateOverEpilog ||
7620       Hints.getPredicate() == LoopVectorizeHints::FK_Enabled ||
7621       (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
7622                                         LVL.getLAI()) &&
7623        Hints.getPredicate() != LoopVectorizeHints::FK_Disabled))
7624     return CM_ScalarEpilogueNotNeededUsePredicate;
7625 
7626   return CM_ScalarEpilogueAllowed;
7627 }
7628 
7629 // Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows applying
7631 // VPlan-to-VPlan transformations from the very beginning without modifying the
7632 // input LLVM IR.
7633 static bool processLoopInVPlanNativePath(
7634     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
7635     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
7636     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
7637     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
7638     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
7639 
7640   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
7641   Function *F = L->getHeader()->getParent();
7642   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
7643 
7644   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
7645       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
7646 
7647   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
7648                                 &Hints, IAI);
7649   // Use the planner for outer loop vectorization.
7650   // TODO: CM is not used at this point inside the planner. Turn CM into an
7651   // optional argument if we don't need it in the future.
7652   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE);
7653 
7654   // Get user vectorization factor.
7655   const unsigned UserVF = Hints.getWidth();
7656 
7657   // Plan how to best vectorize, return the best VF and its cost.
7658   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
7659 
7660   // If we are stress testing VPlan builds, do not attempt to generate vector
7661   // code. Masked vector code generation support will follow soon.
7662   // Also, do not attempt to vectorize if no vector code will be produced.
7663   if (VPlanBuildStressTest || EnableVPlanPredication ||
7664       VectorizationFactor::Disabled() == VF)
7665     return false;
7666 
7667   LVP.setBestPlan(VF.Width, 1);
7668 
7669   InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
7670                          &CM);
7671   LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
7672                     << L->getHeader()->getParent()->getName() << "\"\n");
7673   LVP.executePlan(LB, DT);
7674 
7675   // Mark the loop as already vectorized to avoid vectorizing again.
7676   Hints.setAlreadyVectorized();
7677 
7678   LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
7679   return true;
7680 }
7681 
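// Interleaving and vectorization are each restricted to forced loops when the
// pass options request it or when the corresponding feature is globally
// disabled.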
7682 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
7683     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
7684                                !EnableLoopInterleaving),
7685       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
7686                               !EnableLoopVectorization) {}
7687 
7688 bool LoopVectorizePass::processLoop(Loop *L) {
7689   assert((EnableVPlanNativePath || L->empty()) &&
7690          "VPlan-native path is not enabled. Only process inner loops.");
7691 
7692 #ifndef NDEBUG
7693   const std::string DebugLocStr = getDebugLocString(L);
7694 #endif /* NDEBUG */
7695 
7696   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
7697                     << L->getHeader()->getParent()->getName() << "\" from "
7698                     << DebugLocStr << "\n");
7699 
7700   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
7701 
7702   LLVM_DEBUG(
7703       dbgs() << "LV: Loop hints:"
7704              << " force="
7705              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
7706                      ? "disabled"
7707                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
7708                             ? "enabled"
7709                             : "?"))
7710              << " width=" << Hints.getWidth()
7711              << " unroll=" << Hints.getInterleave() << "\n");
7712 
7713   // Function containing loop
7714   Function *F = L->getHeader()->getParent();
7715 
7716   // Looking at the diagnostic output is the only way to determine if a loop
7717   // was vectorized (other than looking at the IR or machine code), so it
7718   // is important to generate an optimization remark for each loop. Most of
7719   // these messages are generated as OptimizationRemarkAnalysis. Remarks
  // generated as OptimizationRemark and OptimizationRemarkMissed are less
  // verbose, reporting vectorized loops and unvectorized loops that may
  // benefit from vectorization, respectively.
7723 
7724   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
7725     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
7726     return false;
7727   }
7728 
7729   PredicatedScalarEvolution PSE(*SE, *L);
7730 
7731   // Check if it is legal to vectorize the loop.
7732   LoopVectorizationRequirements Requirements(*ORE);
7733   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
7734                                 &Requirements, &Hints, DB, AC);
7735   if (!LVL.canVectorize(EnableVPlanNativePath)) {
7736     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
7737     Hints.emitRemarkWithHints();
7738     return false;
7739   }
7740 
7741   // Check the function attributes and profiles to find out if this function
7742   // should be optimized for size.
7743   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
7744       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
7745 
7746   // Entrance to the VPlan-native vectorization path. Outer loops are processed
7747   // here. They may require CFG and instruction level transformations before
7748   // even evaluating whether vectorization is profitable. Since we cannot modify
7749   // the incoming IR, we need to build VPlan upfront in the vectorization
7750   // pipeline.
7751   if (!L->empty())
7752     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
7753                                         ORE, BFI, PSI, Hints);
7754 
7755   assert(L->empty() && "Inner loop expected.");
7756 
7757   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
7758   // count by optimizing for size, to minimize overheads.
7759   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
7760   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
7761     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
7762                       << "This loop is worth vectorizing only if no scalar "
7763                       << "iteration overheads are incurred.");
7764     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
7765       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
7766     else {
7767       LLVM_DEBUG(dbgs() << "\n");
7768       SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
7769     }
7770   }
7771 
7772   // Check the function attributes to see if implicit floats are allowed.
7773   // FIXME: This check doesn't seem possibly correct -- what if the loop is
7774   // an integer loop and the vector instructions selected are purely integer
7775   // vector instructions?
7776   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
7777     reportVectorizationFailure(
7778         "Can't vectorize when the NoImplicitFloat attribute is used",
7779         "loop not vectorized due to NoImplicitFloat attribute",
7780         "NoImplicitFloat", ORE, L);
7781     Hints.emitRemarkWithHints();
7782     return false;
7783   }
7784 
7785   // Check if the target supports potentially unsafe FP vectorization.
7786   // FIXME: Add a check for the type of safety issue (denormal, signaling)
7787   // for the target we're vectorizing for, to make sure none of the
7788   // additional fp-math flags can help.
7789   if (Hints.isPotentiallyUnsafe() &&
7790       TTI->isFPVectorizationPotentiallyUnsafe()) {
7791     reportVectorizationFailure(
7792         "Potentially unsafe FP op prevents vectorization",
7793         "loop not vectorized due to unsafe FP support.",
7794         "UnsafeFP", ORE, L);
7795     Hints.emitRemarkWithHints();
7796     return false;
7797   }
7798 
7799   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
7800   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
7801 
7802   // If an override option has been passed in for interleaved accesses, use it.
7803   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
7804     UseInterleaved = EnableInterleavedMemAccesses;
7805 
7806   // Analyze interleaved memory accesses.
7807   if (UseInterleaved) {
7808     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
7809   }
7810 
7811   // Use the cost model.
7812   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
7813                                 F, &Hints, IAI);
7814   CM.collectValuesToIgnore();
7815 
7816   // Use the planner for vectorization.
7817   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE);
7818 
7819   // Get user vectorization factor.
7820   unsigned UserVF = Hints.getWidth();
7821 
7822   // Plan how to best vectorize, return the best VF and its cost.
7823   Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF);
7824 
7825   VectorizationFactor VF = VectorizationFactor::Disabled();
7826   unsigned IC = 1;
7827   unsigned UserIC = Hints.getInterleave();
7828 
7829   if (MaybeVF) {
7830     VF = *MaybeVF;
7831     // Select the interleave count.
7832     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
7833   }
7834 
7835   // Identify the diagnostic messages that should be produced.
7836   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
7837   bool VectorizeLoop = true, InterleaveLoop = true;
7838   if (Requirements.doesNotMeet(F, L, Hints)) {
7839     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
7840                          "requirements.\n");
7841     Hints.emitRemarkWithHints();
7842     return false;
7843   }
7844 
7845   if (VF.Width == 1) {
7846     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
7847     VecDiagMsg = std::make_pair(
7848         "VectorizationNotBeneficial",
7849         "the cost-model indicates that vectorization is not beneficial");
7850     VectorizeLoop = false;
7851   }
7852 
7853   if (!MaybeVF && UserIC > 1) {
7854     // Tell the user interleaving was avoided up-front, despite being explicitly
7855     // requested.
7856     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
7857                          "interleaving should be avoided up front\n");
7858     IntDiagMsg = std::make_pair(
7859         "InterleavingAvoided",
7860         "Ignoring UserIC, because interleaving was avoided up front");
7861     InterleaveLoop = false;
7862   } else if (IC == 1 && UserIC <= 1) {
7863     // Tell the user interleaving is not beneficial.
7864     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
7865     IntDiagMsg = std::make_pair(
7866         "InterleavingNotBeneficial",
7867         "the cost-model indicates that interleaving is not beneficial");
7868     InterleaveLoop = false;
7869     if (UserIC == 1) {
7870       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
7871       IntDiagMsg.second +=
7872           " and is explicitly disabled or interleave count is set to 1";
7873     }
7874   } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial but is explicitly disabled.
7876     LLVM_DEBUG(
7877         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
7878     IntDiagMsg = std::make_pair(
7879         "InterleavingBeneficialButDisabled",
7880         "the cost-model indicates that interleaving is beneficial "
7881         "but is explicitly disabled or interleave count is set to 1");
7882     InterleaveLoop = false;
7883   }
7884 
7885   // Override IC if user provided an interleave count.
7886   IC = UserIC > 0 ? UserIC : IC;
7887 
7888   // Emit diagnostic messages, if any.
7889   const char *VAPassName = Hints.vectorizeAnalysisPassName();
7890   if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
7892     ORE->emit([&]() {
7893       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
7894                                       L->getStartLoc(), L->getHeader())
7895              << VecDiagMsg.second;
7896     });
7897     ORE->emit([&]() {
7898       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
7899                                       L->getStartLoc(), L->getHeader())
7900              << IntDiagMsg.second;
7901     });
7902     return false;
7903   } else if (!VectorizeLoop && InterleaveLoop) {
7904     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7905     ORE->emit([&]() {
7906       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
7907                                         L->getStartLoc(), L->getHeader())
7908              << VecDiagMsg.second;
7909     });
7910   } else if (VectorizeLoop && !InterleaveLoop) {
7911     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7912                       << ") in " << DebugLocStr << '\n');
7913     ORE->emit([&]() {
7914       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
7915                                         L->getStartLoc(), L->getHeader())
7916              << IntDiagMsg.second;
7917     });
7918   } else if (VectorizeLoop && InterleaveLoop) {
7919     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7920                       << ") in " << DebugLocStr << '\n');
7921     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7922   }
7923 
7924   LVP.setBestPlan(VF.Width, IC);
7925 
7926   using namespace ore;
7927   bool DisableRuntimeUnroll = false;
7928   MDNode *OrigLoopID = L->getLoopID();
7929 
7930   if (!VectorizeLoop) {
7931     assert(IC > 1 && "interleave count should not be 1 or 0");
7932     // If we decided that it is not legal to vectorize the loop, then
7933     // interleave it.
7934     InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
7935                                &CM);
7936     LVP.executePlan(Unroller, DT);
7937 
7938     ORE->emit([&]() {
7939       return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
7940                                 L->getHeader())
7941              << "interleaved loop (interleaved count: "
7942              << NV("InterleaveCount", IC) << ")";
7943     });
7944   } else {
7945     // If we decided that it is *legal* to vectorize the loop, then do it.
7946     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
7947                            &LVL, &CM);
7948     LVP.executePlan(LB, DT);
7949     ++LoopsVectorized;
7950 
7951     // Add metadata to disable runtime unrolling a scalar loop when there are
7952     // no runtime checks about strides and memory. A scalar loop that is
7953     // rarely used is not worth unrolling.
7954     if (!LB.areSafetyChecksAdded())
7955       DisableRuntimeUnroll = true;
7956 
7957     // Report the vectorization decision.
7958     ORE->emit([&]() {
7959       return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
7960                                 L->getHeader())
7961              << "vectorized loop (vectorization width: "
7962              << NV("VectorizationFactor", VF.Width)
7963              << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
7964     });
7965   }
7966 
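  // Attach any follow-up metadata requested for the epilogue to the remaining
  // scalar loop; otherwise update the scalar loop's metadata directly below.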
7967   Optional<MDNode *> RemainderLoopID =
7968       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7969                                       LLVMLoopVectorizeFollowupEpilogue});
7970   if (RemainderLoopID.hasValue()) {
7971     L->setLoopID(RemainderLoopID.getValue());
7972   } else {
7973     if (DisableRuntimeUnroll)
7974       AddRuntimeUnrollDisableMetaData(L);
7975 
7976     // Mark the loop as already vectorized to avoid vectorizing again.
7977     Hints.setAlreadyVectorized();
7978   }
7979 
7980   LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
7981   return true;
7982 }
7983 
7984 LoopVectorizeResult LoopVectorizePass::runImpl(
7985     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
7986     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
7987     DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
7988     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
7989     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
7990   SE = &SE_;
7991   LI = &LI_;
7992   TTI = &TTI_;
7993   DT = &DT_;
7994   BFI = &BFI_;
7995   TLI = TLI_;
7996   AA = &AA_;
7997   AC = &AC_;
7998   GetLAA = &GetLAA_;
7999   DB = &DB_;
8000   ORE = &ORE_;
8001   PSI = PSI_;
8002 
8003   // Don't attempt if
8004   // 1. the target claims to have no vector registers, and
8005   // 2. interleaving won't help ILP.
8006   //
8007   // The second condition is necessary because, even if the target has no
8008   // vector registers, loop vectorization may still enable scalar
8009   // interleaving.
8010   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
8011       TTI->getMaxInterleaveFactor(1) < 2)
8012     return LoopVectorizeResult(false, false);
8013 
8014   bool Changed = false, CFGChanged = false;
8015 
8016   // The vectorizer requires loops to be in simplified form.
8017   // Since simplification may add new inner loops, it has to run before the
8018   // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
8020   // vectorized.
8021   for (auto &L : *LI)
8022     Changed |= CFGChanged |=
8023         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
8024 
8025   // Build up a worklist of inner-loops to vectorize. This is necessary as
8026   // the act of vectorizing or partially unrolling a loop creates new loops
8027   // and can invalidate iterators across the loops.
8028   SmallVector<Loop *, 8> Worklist;
8029 
8030   for (Loop *L : *LI)
8031     collectSupportedLoops(*L, LI, ORE, Worklist);
8032 
8033   LoopsAnalyzed += Worklist.size();
8034 
8035   // Now walk the identified inner loops.
8036   while (!Worklist.empty()) {
8037     Loop *L = Worklist.pop_back_val();
8038 
8039     // For the inner loops we actually process, form LCSSA to simplify the
8040     // transform.
8041     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
8042 
8043     Changed |= CFGChanged |= processLoop(L);
8044   }
8045 
8046   // Process each loop nest in the function.
8047   return LoopVectorizeResult(Changed, CFGChanged);
8048 }
8049 
8050 PreservedAnalyses LoopVectorizePass::run(Function &F,
8051                                          FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
                        : nullptr;
8065 
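  // LoopAccessInfo is computed lazily, per loop, through the inner loop
  // analysis manager.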
  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve loopinfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  PA.preserve<BasicAA>();
  PA.preserve<GlobalsAA>();
  if (!Result.MadeCFGChange)
    PA.preserveSet<CFGAnalyses>();
  return PA;
8094 }
8095