//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
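// For example, a scalar loop such as
//
//   for (i = 0; i < n; ++i)
//     A[i] = B[i] + 42;
//
// is conceptually rewritten (shown here as pseudo-code) so that each vector
// iteration processes VF consecutive elements:
//
//   for (i = 0; i + VF <= n; i += VF)
//     A[i:i+VF] = B[i:i+VF] + 42;
//
// with any remaining iterations handled by a scalar epilogue loop or by
// predication (tail folding).
//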
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
//  Vectorizing Compilers.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

#ifndef NDEBUG
const char VerboseDebug[] = DEBUG_TYPE "-verbose";
#endif

/// @{
/// Metadata attribute names
const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
const char LLVMLoopVectorizeFollowupVectorized[] =
    "llvm.loop.vectorize.followup_vectorized";
const char LLVMLoopVectorizeFollowupEpilogue[] =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");

static cl::opt<bool> EnableEpilogueVectorization(
    "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
    cl::desc("Enable vectorization of epilogue loops."));

static cl::opt<unsigned> EpilogueVectorizationForceVF(
    "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
    cl::desc("When epilogue vectorization is enabled, and a value greater than "
             "1 is specified, forces the given VF for all applicable epilogue "
             "loops."));

static cl::opt<unsigned> EpilogueVectorizationMinVF(
    "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
    cl::desc("Only loops with vectorization factor equal to or larger than "
             "the specified value are considered for epilogue vectorization."));

/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired
// and that predication is preferred; it lists the available options. I.e., the
// vectorizer will try to fold the tail-loop (epilogue) into the vector body
// and predicate the instructions accordingly. If tail-folding fails, there are
// different fallback strategies depending on these values:
namespace PreferPredicateTy {
  enum Option {
    ScalarEpilogue = 0,
    PredicateElseScalarEpilogue,
    PredicateOrDontVectorize
  };
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "Prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "Prefer tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// Interleave small loops with scalar reductions.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after-loop select."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

/// A helper function that returns the type of the loaded or stored value.
static Type *getMemInstValueType(Value *I) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Expected Load or Store instruction");
  if (auto *LI = dyn_cast<LoadInst>(I))
    return LI->getType();
  return cast<StoreInst>(I)->getValueOperand()->getType();
}

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type at the given vectorization factor.
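/// For example (assuming a typical data layout), i1 is irregular at VF = 4:
/// four i1 elements have a total allocated size of 4 bytes, while a <4 x i1>
/// vector has a store size of just 1 byte.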
static bool hasIrregularType(Type *Ty, const DataLayout &DL, ElementCount VF) {
  // Determine if an array of VF elements of type Ty is "bitcast compatible"
  // with a <VF x Ty> vector.
  if (VF.isVector()) {
    auto *VectorTy = VectorType::get(Ty, VF);
    return TypeSize::get(VF.getKnownMinValue() *
                             DL.getTypeAllocSize(Ty).getFixedValue(),
                         VF.isScalable()) != DL.getTypeStoreSize(VectorTy);
  }

  // If the vectorization factor is one, we just check if an array of type Ty
  // requires padding between elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
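/// For example, with the returned value of 2, the cost model roughly halves
/// the per-iteration cost attributed to a predicated block.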
///
/// TODO: We should use actual block probability here, if available. Currently,
///       we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that adds a 'fast' flag to floating-point operations.
static Value *addFastMathFlag(Value *V) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
  return V;
}

static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FMF);
  return V;
}

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found for a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()),
        VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM),
        BFI(BFI), PSI(PSI) {
    // Query this against the original loop and save it here because the profile
    // of the original loop header may change as the transformation happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop.
  /// In the case of epilogue vectorization, this function is overridden to
  /// handle the more complex control flow around the loops.
  virtual BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
                        VPTransformState &State);

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Widen a single select instruction within the innermost loop.
  void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands,
                              bool InvariantCond, VPTransformState &State);

  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
  void fixVectorizedLoop();

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
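  /// For example, with VF = 4 and UF = 2, a single add in the original loop
  /// is represented by two <4 x Ty> adds in the vector loop, one per unrolled
  /// part.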
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single GetElementPtrInst based on information gathered and
  /// decisions taken during planning.
  void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices,
                unsigned UF, ElementCount VF, bool IsPtrLoopInvariant,
                SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);

  /// Vectorize a single PHINode in a block. This method handles the induction
  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
  /// arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, unsigned UF, ElementCount VF);

  /// A helper function to scalarize a single Instruction in the innermost loop.
  /// Generates a sequence of scalar instances for each lane between \p MinLane
  /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
  /// inclusive. Uses the VPValue operands from \p Operands instead of \p
  /// Instr's operands.
  void scalarizeInstruction(Instruction *Instr, VPUser &Operands,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);

  /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
  /// vector or scalar value on-demand if one is not yet available. When
  /// vectorizing a loop, we visit the definition of an instruction before its
  /// uses. When visiting the definition, we either vectorize or scalarize the
  /// instruction, creating an entry for it in the corresponding map. (In some
  /// cases, such as induction variables, we will create both vector and scalar
  /// entries.) Then, as we encounter uses of the definition, we derive values
  /// for each scalar or vector use unless such a value is already available.
  /// For example, if we scalarize a definition and one of its uses is vector,
  /// we build the required vector on-demand with an insertelement sequence
  /// when visiting the use. Otherwise, if the use is scalar, we can use the
  /// existing scalar definition.
  ///
  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll index \p Part. If the value has already been vectorized,
  /// the corresponding vector entry in VectorLoopValueMap is returned. If,
  /// however, the value has a scalar entry in VectorLoopValueMap, we construct
  /// a new vector value on-demand by inserting the scalar values into a vector
  /// with an insertelement sequence. If the value has been neither vectorized
  /// nor scalarized, it must be loop invariant, so we simply broadcast the
  /// value into a vector.
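  /// For example, if a definition was scalarized into lanes %s0 ... %s3
  /// (illustrative names) and a vector use is encountered at VF = 4, the
  /// vector is assembled with a chain of four insertelement instructions.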
  Value *getOrCreateVectorValue(Value *V, unsigned Part);

  void setVectorValue(Value *Scalar, unsigned Part, Value *Vector) {
    VectorLoopValueMap.setVectorValue(Scalar, Part, Vector);
  }

  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll and vector indices \p Instance. If the value has been
  /// vectorized but not scalarized, the necessary extractelement instruction
  /// will be generated.
  Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                ArrayRef<VPValue *> VPDefs,
                                VPTransformState &State, VPValue *Addr,
                                ArrayRef<VPValue *> StoredValues,
                                VPValue *BlockInMask = nullptr);

  /// Vectorize Load and Store instructions with the base address given in \p
  /// Addr, optionally masking the vector operations if \p BlockInMask is
  /// non-null. Use \p State to translate given VPValues to IR values in the
  /// vectorized loop.
  void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
                                  VPValue *Def, VPValue *Addr,
                                  VPValue *StoredValue, VPValue *BlockInMask);

  /// Set the debug location in the builder using the debug location in
  /// the instruction.
  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(void);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs();

  /// Fix a first-order recurrence. This is the second phase of vectorizing
  /// this phi node.
  void fixFirstOrderRecurrence(PHINode *Phi);

  /// Fix a reduction cross-iteration phi. This is the second phase of
  /// vectorizing this phi node.
  void fixReduction(PHINode *Phi);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);

  /// The loop exit block may have single-value PHI nodes with some
  /// incoming value. While vectorizing, we only handle real values
  /// that were defined inside the loop, and we should have one value for
  /// each predecessor of its parent basic block. See PR14725.
  void fixLCSSAPHIs();

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths();

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...;
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
  /// to each vector element of Val. The sequence starts at StartIdx.
  /// \p Opcode is relevant for FP induction variables.
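  /// For example, for Val = <N, N, N, N>, StartIdx = 0 and Step = 1 this
  /// produces <N, N + 1, N + 2, N + 3>.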
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                               Instruction::BinaryOps Opcode =
                               Instruction::BinaryOpsEnd);

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Instruction *EntryVal);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in the
  /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned above
  /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In the
  /// latter case \p EntryVal is a TruncInst and we must not record anything for
  /// that IV, but it's error-prone to expect callers of this routine to care
  /// about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
                                             const Instruction *EntryVal,
                                             Value *VectorLoopValue,
                                             unsigned Part,
                                             unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct.
  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
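  /// For example, for an integer induction with StartValue 7 and StepValue 3,
  /// an Index of 4 is transformed into 7 + 4 * 3 = 19; for a pointer induction
  /// the result is a GEP of StartValue by Index * StepValue.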
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader. Also
  /// allocate a loop object for the new vector loop and return it.
  Loop *createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration count
  /// in the scalar epilogue, from where the vectorized loop left off (given by
  /// \p VectorTripCount).
  /// In cases where the loop skeleton is more complicated (e.g. epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
  void createInductionResumeValues(
      Loop *L, Value *VectorTripCount,
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Take in the vector loop \p L as argument, and return
  /// the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks.  Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones (\see
  /// addNewMetadata).  Use this for *newly created* instructions in the vector
  /// loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart() {}
  virtual void printDebugTracesAtEnd() {}

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning.  It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks.  The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The (unique) ExitBlock of the scalar loop.  Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Maps values from the original loop to their corresponding values in the
  /// vectorized loop. A key value can map to either vector values, scalar
  /// values or both kinds of values, depending on whether the key was
  /// vectorized and scalarized.
  VectorizerValueMap VectorLoopValueMap;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)).
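  /// For example, with an original trip count of 1003, VF = 4 and UF = 2,
  /// the vector trip count is 1003 - (1003 % 8) = 1000.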
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided size
  // optimizations.
  bool OptForSizeBasedOnProfile;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                       Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF,
                                unsigned EUF)
      : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF),
        EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};

/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, firstly to set up the
/// skeleton and vectorize the main loop, and secondly to complete the skeleton
/// from the first step and vectorize the epilogue.  This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops.
  BasicBlock *createVectorizedLoopSkeleton() final override {
    return createEpilogueVectorizedLoopSkeleton();
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e., the first pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
                                             bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *epilogue* loops in the process of vectorizing loops and
/// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE,
                    EpilogueLoopVectorizationInfo &EPI,
                    LoopVectorizationLegality *LVL,
                    llvm::LoopVectorizationCostModel *CM,
                    BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e., the second pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
                                                      BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
    if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
    const DILocation *DIL = Inst->getDebugLoc();
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst)) {
      assert(!VF.isScalable() && "scalable vectors not yet supported.");
      auto NewDIL =
          DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
      if (NewDIL)
        B.SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B.SetCurrentDebugLocation(DIL);
  } else
    B.SetCurrentDebugLocation(DebugLoc());
}

/// Write a record \p DebugMsg about vectorization failure to the debug
/// output stream. If \p I is passed, it is an instruction that prevents
/// vectorization.
#ifndef NDEBUG
static void debugVectorizationFailure(const StringRef DebugMsg,
    Instruction *I) {
  dbgs() << "LV: Not vectorizing: " << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
/// RemarkName is the identifier for the remark.  If \p I is passed it is an
/// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
/// the location of the remark.  \return the remark object that can be
/// streamed to.
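///
/// For example, a typical use streams the reason into the returned remark:
///   ORE->emit(createLVAnalysis(PassName, RemarkName, TheLoop, I)
///             << "reason the loop could not be vectorized");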
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
    StringRef RemarkName, Loop *TheLoop, Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, revert back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
  R << "loop not vectorized: ";
  return R;
}

/// Return a value for Step multiplied by VF.
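/// For example, for Step = 2 and a fixed VF of 4 this returns the constant 8;
/// for a scalable VF it returns (roughly) 8 * vscale.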
static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) {
  assert(isa<ConstantInt>(Step) && "Expected an integer step");
  Constant *StepVal = ConstantInt::get(
      Step->getType(),
      cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
}
1134 
1135 namespace llvm {
1136 
1137 void reportVectorizationFailure(const StringRef DebugMsg,
1138     const StringRef OREMsg, const StringRef ORETag,
1139     OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
1140   LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
1141   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1142   ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
1143                 ORETag, TheLoop, I) << OREMsg);
1144 }
1145 
1146 } // end namespace llvm
1147 
1148 #ifndef NDEBUG
1149 /// \return string containing a file name and a line # for the given loop.
1150 static std::string getDebugLocString(const Loop *L) {
1151   std::string Result;
1152   if (L) {
1153     raw_string_ostream OS(Result);
1154     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
1155       LoopDbgLoc.print(OS);
1156     else
1157       // Just print the module name.
1158       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
1159     OS.flush();
1160   }
1161   return Result;
1162 }
1163 #endif
1164 
1165 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
1166                                          const Instruction *Orig) {
1167   // If the loop was versioned with memchecks, add the corresponding no-alias
1168   // metadata.
1169   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
1170     LVer->annotateInstWithNoAlias(To, Orig);
1171 }
1172 
1173 void InnerLoopVectorizer::addMetadata(Instruction *To,
1174                                       Instruction *From) {
1175   propagateMetadata(To, From);
1176   addNewMetadata(To, From);
1177 }
1178 
1179 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
1180                                       Instruction *From) {
1181   for (Value *V : To) {
1182     if (Instruction *I = dyn_cast<Instruction>(V))
1183       addMetadata(I, From);
1184   }
1185 }
1186 
1187 namespace llvm {
1188 
// Hints for the loop vectorization cost model on how the scalar epilogue loop
// should be lowered.
1191 enum ScalarEpilogueLowering {
1192 
1193   // The default: allowing scalar epilogues.
1194   CM_ScalarEpilogueAllowed,
1195 
1196   // Vectorization with OptForSize: don't allow epilogues.
1197   CM_ScalarEpilogueNotAllowedOptSize,
1198 
  // A special case of vectorization with OptForSize: loops with a very small
1200   // trip count are considered for vectorization under OptForSize, thereby
1201   // making sure the cost of their loop body is dominant, free of runtime
1202   // guards and scalar iteration overheads.
1203   CM_ScalarEpilogueNotAllowedLowTripLoop,
1204 
1205   // Loop hint predicate indicating an epilogue is undesired.
1206   CM_ScalarEpilogueNotNeededUsePredicate,
1207 
  // Directive indicating we must either tail fold or not vectorize.
1209   CM_ScalarEpilogueNotAllowedUsePredicate
1210 };
1211 
1212 /// LoopVectorizationCostModel - estimates the expected speedups due to
1213 /// vectorization.
/// In many cases vectorization is not profitable. This can happen for a
/// number of reasons. In this class we mainly attempt to predict the
1216 /// expected speedup/slowdowns due to the supported instruction set. We use the
1217 /// TargetTransformInfo to query the different backends for the cost of
1218 /// different operations.
1219 class LoopVectorizationCostModel {
1220 public:
1221   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1222                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
1223                              LoopVectorizationLegality *Legal,
1224                              const TargetTransformInfo &TTI,
1225                              const TargetLibraryInfo *TLI, DemandedBits *DB,
1226                              AssumptionCache *AC,
1227                              OptimizationRemarkEmitter *ORE, const Function *F,
1228                              const LoopVectorizeHints *Hints,
1229                              InterleavedAccessInfo &IAI)
1230       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1231         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1232         Hints(Hints), InterleaveInfo(IAI) {}
1233 
1234   /// \return An upper bound for the vectorization factor, or None if
1235   /// vectorization and interleaving should be avoided up front.
1236   Optional<ElementCount> computeMaxVF(ElementCount UserVF, unsigned UserIC);
1237 
1238   /// \return True if runtime checks are required for vectorization, and false
1239   /// otherwise.
1240   bool runtimeChecksRequired();
1241 
1242   /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not
  /// zero, then this vectorization factor will be selected if vectorization
  /// is possible.
1246   VectorizationFactor selectVectorizationFactor(ElementCount MaxVF);
1247   VectorizationFactor
1248   selectEpilogueVectorizationFactor(const ElementCount MaxVF,
1249                                     const LoopVectorizationPlanner &LVP);
1250 
1251   /// Setup cost-based decisions for user vectorization factor.
1252   void selectUserVectorizationFactor(ElementCount UserVF) {
1253     collectUniformsAndScalars(UserVF);
1254     collectInstsToScalarize(UserVF);
1255   }
1256 
1257   /// \return The size (in bits) of the smallest and widest types in the code
1258   /// that needs to be vectorized. We ignore values that remain scalar such as
1259   /// 64 bit loop indices.
1260   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1261 
1262   /// \return The desired interleave count.
1263   /// If interleave count has been specified by metadata it will be returned.
1264   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1265   /// are the selected vectorization factor and the cost of the selected VF.
1266   unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);
1267 
  /// A memory access instruction may be vectorized in more than one way.
  /// The form of the instruction after vectorization depends on its cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
1275   void setCostBasedWideningDecision(ElementCount VF);
1276 
1277   /// A struct that represents some properties of the register usage
1278   /// of a loop.
1279   struct RegisterUsage {
1280     /// Holds the number of loop invariant values that are used in the loop.
1281     /// The key is ClassID of target-provided register class.
1282     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1283     /// Holds the maximum number of concurrent live intervals in the loop.
1284     /// The key is ClassID of target-provided register class.
1285     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1286   };
1287 
  /// \return Information about the register usage of the loop for the
  /// given vectorization factors.
1290   SmallVector<RegisterUsage, 8>
1291   calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1292 
1293   /// Collect values we want to ignore in the cost model.
1294   void collectValuesToIgnore();
1295 
1296   /// Split reductions into those that happen in the loop, and those that happen
  /// outside. In-loop reductions are collected into InLoopReductionChains.
1298   void collectInLoopReductions();
1299 
1300   /// \returns The smallest bitwidth each instruction can be represented with.
1301   /// The vector equivalents of these instructions should be truncated to this
1302   /// type.
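  /// For illustration, an i32 addition whose result is only ever truncated
  /// and stored as an i8 may be mapped to a width of 8 here, so its vector
  /// form can use <VF x i8> instead of <VF x i32>.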
1303   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1304     return MinBWs;
1305   }
1306 
1307   /// \returns True if it is more profitable to scalarize instruction \p I for
1308   /// vectorization factor \p VF.
1309   bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1310     assert(VF.isVector() &&
1311            "Profitable to scalarize relevant only for VF > 1.");
1312 
1313     // Cost model is not run in the VPlan-native path - return conservative
1314     // result until this changes.
1315     if (EnableVPlanNativePath)
1316       return false;
1317 
1318     auto Scalars = InstsToScalarize.find(VF);
1319     assert(Scalars != InstsToScalarize.end() &&
1320            "VF not yet analyzed for scalarization profitability");
1321     return Scalars->second.find(I) != Scalars->second.end();
1322   }
1323 
1324   /// Returns true if \p I is known to be uniform after vectorization.
1325   bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1326     if (VF.isScalar())
1327       return true;
1328 
1329     // Cost model is not run in the VPlan-native path - return conservative
1330     // result until this changes.
1331     if (EnableVPlanNativePath)
1332       return false;
1333 
1334     auto UniformsPerVF = Uniforms.find(VF);
1335     assert(UniformsPerVF != Uniforms.end() &&
1336            "VF not yet analyzed for uniformity");
1337     return UniformsPerVF->second.count(I);
1338   }
1339 
1340   /// Returns true if \p I is known to be scalar after vectorization.
1341   bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1342     if (VF.isScalar())
1343       return true;
1344 
1345     // Cost model is not run in the VPlan-native path - return conservative
1346     // result until this changes.
1347     if (EnableVPlanNativePath)
1348       return false;
1349 
1350     auto ScalarsPerVF = Scalars.find(VF);
1351     assert(ScalarsPerVF != Scalars.end() &&
1352            "Scalar values are not calculated for VF");
1353     return ScalarsPerVF->second.count(I);
1354   }
1355 
1356   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1357   /// for vectorization factor \p VF.
1358   bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1359     return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1360            !isProfitableToScalarize(I, VF) &&
1361            !isScalarAfterVectorization(I, VF);
1362   }
1363 
1364   /// Decision that was taken during cost calculation for memory instruction.
1365   enum InstWidening {
1366     CM_Unknown,
1367     CM_Widen,         // For consecutive accesses with stride +1.
1368     CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,    // For accesses that are part of an interleave group.
    CM_GatherScatter, // For accesses emitted as (masked) gather/scatter calls.
    CM_Scalarize      // For accesses that are scalarized (possibly predicated).
1372   };
1373 
1374   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1375   /// instruction \p I and vector width \p VF.
1376   void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1377                            unsigned Cost) {
1378     assert(VF.isVector() && "Expected VF >=2");
1379     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1380   }
1381 
1382   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1383   /// interleaving group \p Grp and vector width \p VF.
1384   void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1385                            ElementCount VF, InstWidening W, unsigned Cost) {
1386     assert(VF.isVector() && "Expected VF >=2");
    // Broadcast this decision to all instructions inside the group,
    // but the cost will be assigned to one instruction (the insert position)
    // only.
1389     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1390       if (auto *I = Grp->getMember(i)) {
1391         if (Grp->getInsertPos() == I)
1392           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1393         else
1394           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1395       }
1396     }
1397   }
1398 
1399   /// Return the cost model decision for the given instruction \p I and vector
1400   /// width \p VF. Return CM_Unknown if this instruction did not pass
1401   /// through the cost modeling.
1402   InstWidening getWideningDecision(Instruction *I, ElementCount VF) {
1403     assert(VF.isVector() && "Expected VF to be a vector VF");
1404     // Cost model is not run in the VPlan-native path - return conservative
1405     // result until this changes.
1406     if (EnableVPlanNativePath)
1407       return CM_GatherScatter;
1408 
1409     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1410     auto Itr = WideningDecisions.find(InstOnVF);
1411     if (Itr == WideningDecisions.end())
1412       return CM_Unknown;
1413     return Itr->second.first;
1414   }
1415 
1416   /// Return the vectorization cost for the given instruction \p I and vector
1417   /// width \p VF.
1418   unsigned getWideningCost(Instruction *I, ElementCount VF) {
1419     assert(VF.isVector() && "Expected VF >=2");
1420     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1421     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1422            "The cost is not calculated");
1423     return WideningDecisions[InstOnVF].second;
1424   }
1425 
1426   /// Return True if instruction \p I is an optimizable truncate whose operand
1427   /// is an induction variable. Such a truncate will be removed by adding a new
1428   /// induction variable with the destination type.
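  /// For illustration, given an i64 induction variable %iv, a use such as
  ///   %idx = trunc i64 %iv to i32
  /// can be optimized by introducing a separate i32 induction variable that
  /// makes the truncate redundant.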
1429   bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1430     // If the instruction is not a truncate, return false.
1431     auto *Trunc = dyn_cast<TruncInst>(I);
1432     if (!Trunc)
1433       return false;
1434 
1435     // Get the source and destination types of the truncate.
1436     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1437     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1438 
1439     // If the truncate is free for the given types, return false. Replacing a
1440     // free truncate with an induction variable would add an induction variable
1441     // update instruction to each iteration of the loop. We exclude from this
1442     // check the primary induction variable since it will need an update
1443     // instruction regardless.
1444     Value *Op = Trunc->getOperand(0);
1445     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1446       return false;
1447 
1448     // If the truncated value is not an induction variable, return false.
1449     return Legal->isInductionPhi(Op);
1450   }
1451 
1452   /// Collects the instructions to scalarize for each predicated instruction in
1453   /// the loop.
1454   void collectInstsToScalarize(ElementCount VF);
1455 
1456   /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on CM decisions for Load/Store instructions
  /// that may be vectorized as interleaved, gather/scatter or scalarized.
1459   void collectUniformsAndScalars(ElementCount VF) {
1460     // Do the analysis once.
1461     if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1462       return;
1463     setCostBasedWideningDecision(VF);
1464     collectLoopUniforms(VF);
1465     collectLoopScalars(VF);
1466   }
1467 
1468   /// Returns true if the target machine supports masked store operation
1469   /// for the given \p DataType and kind of access to \p Ptr.
1470   bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) {
1471     return Legal->isConsecutivePtr(Ptr) &&
1472            TTI.isLegalMaskedStore(DataType, Alignment);
1473   }
1474 
1475   /// Returns true if the target machine supports masked load operation
1476   /// for the given \p DataType and kind of access to \p Ptr.
1477   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) {
1478     return Legal->isConsecutivePtr(Ptr) &&
1479            TTI.isLegalMaskedLoad(DataType, Alignment);
1480   }
1481 
1482   /// Returns true if the target machine supports masked scatter operation
1483   /// for the given \p DataType.
1484   bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
1485     return TTI.isLegalMaskedScatter(DataType, Alignment);
1486   }
1487 
1488   /// Returns true if the target machine supports masked gather operation
1489   /// for the given \p DataType.
1490   bool isLegalMaskedGather(Type *DataType, Align Alignment) {
1491     return TTI.isLegalMaskedGather(DataType, Alignment);
1492   }
1493 
1494   /// Returns true if the target machine can represent \p V as a masked gather
1495   /// or scatter operation.
1496   bool isLegalGatherOrScatter(Value *V) {
1497     bool LI = isa<LoadInst>(V);
1498     bool SI = isa<StoreInst>(V);
1499     if (!LI && !SI)
1500       return false;
1501     auto *Ty = getMemInstValueType(V);
1502     Align Align = getLoadStoreAlignment(V);
1503     return (LI && isLegalMaskedGather(Ty, Align)) ||
1504            (SI && isLegalMaskedScatter(Ty, Align));
1505   }
1506 
1507   /// Returns true if \p I is an instruction that will be scalarized with
1508   /// predication. Such instructions include conditional stores and
1509   /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
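  /// For illustration, a udiv that only executes under a condition in the
  /// original loop must remain guarded after vectorization, so it is
  /// scalarized and each scalar copy is predicated on its lane of the mask.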
1512   bool isScalarWithPredication(Instruction *I,
1513                                ElementCount VF = ElementCount::getFixed(1));
1514 
  /// Returns true if \p I is an instruction that will be predicated either
  /// through scalar predication or masked load/store or masked gather/scatter.
  /// Superset of instructions that return true for isScalarWithPredication.
1518   bool isPredicatedInst(Instruction *I) {
1519     if (!blockNeedsPredication(I->getParent()))
1520       return false;
1521     // Loads and stores that need some form of masked operation are predicated
1522     // instructions.
1523     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1524       return Legal->isMaskRequired(I);
1525     return isScalarWithPredication(I);
1526   }
1527 
1528   /// Returns true if \p I is a memory instruction with consecutive memory
1529   /// access that can be widened.
1530   bool
1531   memoryInstructionCanBeWidened(Instruction *I,
1532                                 ElementCount VF = ElementCount::getFixed(1));
1533 
1534   /// Returns true if \p I is a memory instruction in an interleaved-group
1535   /// of memory accesses that can be vectorized with wide vector loads/stores
1536   /// and shuffles.
1537   bool
1538   interleavedAccessCanBeWidened(Instruction *I,
1539                                 ElementCount VF = ElementCount::getFixed(1));
1540 
1541   /// Check if \p Instr belongs to any interleaved access group.
1542   bool isAccessInterleaved(Instruction *Instr) {
1543     return InterleaveInfo.isInterleaved(Instr);
1544   }
1545 
1546   /// Get the interleaved access group that \p Instr belongs to.
1547   const InterleaveGroup<Instruction> *
1548   getInterleavedAccessGroup(Instruction *Instr) {
1549     return InterleaveInfo.getInterleaveGroup(Instr);
1550   }
1551 
1552   /// Returns true if we're required to use a scalar epilogue for at least
1553   /// the final iteration of the original loop.
1554   bool requiresScalarEpilogue() const {
1555     if (!isScalarEpilogueAllowed())
1556       return false;
    // If we might exit from anywhere but the latch, we must run the exiting
    // iteration in scalar form.
1559     if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
1560       return true;
1561     return InterleaveInfo.requiresScalarEpilogue();
1562   }
1563 
  /// Returns true if a scalar epilogue is allowed, i.e. it has not been
  /// disallowed due to optsize or a loop hint annotation.
1566   bool isScalarEpilogueAllowed() const {
1567     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1568   }
1569 
  /// Returns true if all loop blocks should be masked to fold the tail of the
  /// loop.
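  /// For illustration, with a trip count of 10 and a fixed VF of 4, folding
  /// the tail runs three masked vector iterations (the last with only two
  /// active lanes) instead of two vector iterations plus a scalar epilogue.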
1571   bool foldTailByMasking() const { return FoldTailByMasking; }
1572 
1573   bool blockNeedsPredication(BasicBlock *BB) {
1574     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1575   }
1576 
1577   /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1578   /// nodes to the chain of instructions representing the reductions. Uses a
1579   /// MapVector to ensure deterministic iteration order.
1580   using ReductionChainMap =
1581       SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1582 
  /// Return the chain of instructions representing an in-loop reduction.
1584   const ReductionChainMap &getInLoopReductionChains() const {
1585     return InLoopReductionChains;
1586   }
1587 
  /// Returns true if the Phi is part of an in-loop reduction.
1589   bool isInLoopReduction(PHINode *Phi) const {
1590     return InLoopReductionChains.count(Phi);
1591   }
1592 
1593   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1594   /// with factor VF.  Return the cost of the instruction, including
1595   /// scalarization overhead if it's needed.
1596   unsigned getVectorIntrinsicCost(CallInst *CI, ElementCount VF);
1597 
1598   /// Estimate cost of a call instruction CI if it were vectorized with factor
1599   /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e. either a vector version isn't available or it is too
  /// expensive.
1603   unsigned getVectorCallCost(CallInst *CI, ElementCount VF,
1604                              bool &NeedToScalarize);
1605 
1606   /// Invalidates decisions already taken by the cost model.
1607   void invalidateCostModelingDecisions() {
1608     WideningDecisions.clear();
1609     Uniforms.clear();
1610     Scalars.clear();
1611   }
1612 
1613 private:
1614   unsigned NumPredStores = 0;
1615 
1616   /// \return An upper bound for the vectorization factor, a power-of-2 larger
1617   /// than zero. One is returned if vectorization should best be avoided due
1618   /// to cost.
1619   ElementCount computeFeasibleMaxVF(unsigned ConstTripCount,
1620                                     ElementCount UserVF);
1621 
  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
1629   using VectorizationCostTy = std::pair<unsigned, bool>;
1630 
1631   /// Returns the expected execution cost. The unit of the cost does
1632   /// not matter because we use the 'cost' units to compare different
1633   /// vector widths. The cost that is returned is *not* normalized by
1634   /// the factor width.
1635   VectorizationCostTy expectedCost(ElementCount VF);
1636 
1637   /// Returns the execution time cost of an instruction for a given vector
1638   /// width. Vector width of one means scalar.
1639   VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1640 
1641   /// The cost-computation logic from getInstructionCost which provides
1642   /// the vector type as an output parameter.
1643   unsigned getInstructionCost(Instruction *I, ElementCount VF, Type *&VectorTy);
1644 
1645   /// Calculate vectorization cost of memory instruction \p I.
1646   unsigned getMemoryInstructionCost(Instruction *I, ElementCount VF);
1647 
  /// The cost computation for a scalarized memory instruction.
1649   unsigned getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1650 
  /// The cost computation for an interleaving group of memory instructions.
1652   unsigned getInterleaveGroupCost(Instruction *I, ElementCount VF);
1653 
  /// The cost computation for a Gather/Scatter instruction.
1655   unsigned getGatherScatterCost(Instruction *I, ElementCount VF);
1656 
1657   /// The cost computation for widening instruction \p I with consecutive
1658   /// memory access.
1659   unsigned getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1660 
  /// The cost calculation for Load/Store instruction \p I with a uniform
  /// pointer:
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop-invariant value stored ? 0 : extract of last
  /// element).
1665   unsigned getUniformMemOpCost(Instruction *I, ElementCount VF);
1666 
1667   /// Estimate the overhead of scalarizing an instruction. This is a
1668   /// convenience wrapper for the type-based getScalarizationOverhead API.
1669   unsigned getScalarizationOverhead(Instruction *I, ElementCount VF);
1670 
  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
1673   bool isConsecutiveLoadOrStore(Instruction *I);
1674 
1675   /// Returns true if an artificially high cost for emulated masked memrefs
1676   /// should be used.
1677   bool useEmulatedMaskMemRefHack(Instruction *I);
1678 
1679   /// Map of scalar integer values to the smallest bitwidth they can be legally
1680   /// represented as. The vector equivalents of these values should be truncated
1681   /// to this type.
1682   MapVector<Instruction *, uint64_t> MinBWs;
1683 
1684   /// A type representing the costs for instructions if they were to be
1685   /// scalarized rather than vectorized. The entries are Instruction-Cost
1686   /// pairs.
1687   using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1688 
  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
1691   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1692 
1693   /// Records whether it is allowed to have the original scalar loop execute at
1694   /// least once. This may be needed as a fallback loop in case runtime
1695   /// aliasing/dependence checks fail, or to handle the tail/remainder
1696   /// iterations when the trip count is unknown or doesn't divide by the VF,
1697   /// or as a peel-loop to handle gaps in interleave-groups.
1698   /// Under optsize and when the trip count is very small we don't allow any
1699   /// iterations to execute in the scalar loop.
1700   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1701 
  /// All blocks of the loop are to be masked to fold the tail of the scalar
  /// iterations.
1703   bool FoldTailByMasking = false;
1704 
1705   /// A map holding scalar costs for different vectorization factors. The
1706   /// presence of a cost for an instruction in the mapping indicates that the
1707   /// instruction will be scalarized when vectorizing with the associated
1708   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1709   DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1710 
1711   /// Holds the instructions known to be uniform after vectorization.
1712   /// The data is collected per VF.
1713   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1714 
1715   /// Holds the instructions known to be scalar after vectorization.
1716   /// The data is collected per VF.
1717   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1718 
1719   /// Holds the instructions (address computations) that are forced to be
1720   /// scalarized.
1721   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1722 
1723   /// PHINodes of the reductions that should be expanded in-loop along with
1724   /// their associated chains of reduction operations, in program order from top
  /// (PHI) to bottom.
1726   ReductionChainMap InLoopReductionChains;
1727 
1728   /// Returns the expected difference in cost from scalarizing the expression
1729   /// feeding a predicated instruction \p PredInst. The instructions to
1730   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1731   /// non-negative return value implies the expression will be scalarized.
1732   /// Currently, only single-use chains are considered for scalarization.
1733   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1734                               ElementCount VF);
1735 
1736   /// Collect the instructions that are uniform after vectorization. An
1737   /// instruction is uniform if we represent it with a single scalar value in
1738   /// the vectorized loop corresponding to each vector iteration. Examples of
1739   /// uniform instructions include pointer operands of consecutive or
1740   /// interleaved memory accesses. Note that although uniformity implies an
1741   /// instruction will be scalar, the reverse is not true. In general, a
1742   /// scalarized instruction will be represented by VF scalar values in the
1743   /// vectorized loop, each corresponding to an iteration of the original
1744   /// scalar loop.
1745   void collectLoopUniforms(ElementCount VF);
1746 
1747   /// Collect the instructions that are scalar after vectorization. An
1748   /// instruction is scalar if it is known to be uniform or will be scalarized
1749   /// during vectorization. Non-uniform scalarized instructions will be
1750   /// represented by VF values in the vectorized loop, each corresponding to an
1751   /// iteration of the original scalar loop.
1752   void collectLoopScalars(ElementCount VF);
1753 
  /// Keeps cost model vectorization decisions and costs for instructions.
1755   /// Right now it is used for memory instructions only.
1756   using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1757                                 std::pair<InstWidening, unsigned>>;
1758 
1759   DecisionList WideningDecisions;
1760 
1761   /// Returns true if \p V is expected to be vectorized and it needs to be
1762   /// extracted.
1763   bool needsExtract(Value *V, ElementCount VF) const {
1764     Instruction *I = dyn_cast<Instruction>(V);
1765     if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1766         TheLoop->isLoopInvariant(I))
1767       return false;
1768 
1769     // Assume we can vectorize V (and hence we need extraction) if the
1770     // scalars are not computed yet. This can happen, because it is called
1771     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1772     // the scalars are collected. That should be a safe assumption in most
1773     // cases, because we check if the operands have vectorizable types
1774     // beforehand in LoopVectorizationLegality.
1775     return Scalars.find(VF) == Scalars.end() ||
1776            !isScalarAfterVectorization(I, VF);
1777   };
1778 
1779   /// Returns a range containing only operands needing to be extracted.
1780   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1781                                                    ElementCount VF) {
1782     return SmallVector<Value *, 4>(make_filter_range(
1783         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1784   }
1785 
1786   /// Determines if we have the infrastructure to vectorize loop \p L and its
1787   /// epilogue, assuming the main loop is vectorized by \p VF.
1788   bool isCandidateForEpilogueVectorization(const Loop &L,
1789                                            const ElementCount VF) const;
1790 
1791   /// Returns true if epilogue vectorization is considered profitable, and
1792   /// false otherwise.
1793   /// \p VF is the vectorization factor chosen for the original loop.
1794   bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
1795 
1796 public:
1797   /// The loop that we evaluate.
1798   Loop *TheLoop;
1799 
1800   /// Predicated scalar evolution analysis.
1801   PredicatedScalarEvolution &PSE;
1802 
1803   /// Loop Info analysis.
1804   LoopInfo *LI;
1805 
1806   /// Vectorization legality.
1807   LoopVectorizationLegality *Legal;
1808 
1809   /// Vector target information.
1810   const TargetTransformInfo &TTI;
1811 
1812   /// Target Library Info.
1813   const TargetLibraryInfo *TLI;
1814 
1815   /// Demanded bits analysis.
1816   DemandedBits *DB;
1817 
1818   /// Assumption cache.
1819   AssumptionCache *AC;
1820 
1821   /// Interface to emit optimization remarks.
1822   OptimizationRemarkEmitter *ORE;
1823 
1824   const Function *TheFunction;
1825 
1826   /// Loop Vectorize Hint.
1827   const LoopVectorizeHints *Hints;
1828 
  /// The interleaved access information contains groups of interleaved
  /// accesses with the same stride that are close to each other.
1831   InterleavedAccessInfo &InterleaveInfo;
1832 
1833   /// Values to ignore in the cost model.
1834   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1835 
1836   /// Values to ignore in the cost model when VF > 1.
1837   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1838 
1839   /// Profitable vector factors.
1840   SmallVector<VectorizationFactor, 8> ProfitableVFs;
1841 };
1842 
1843 } // end namespace llvm
1844 
1845 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
// vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If
// the vector length information is not provided, vectorization is not
// considered explicit. Interleave hints are not allowed either. These
// limitations will be relaxed in the future.
// Please note that we are currently forced to abuse the pragma 'clang
1852 // vectorize' semantics. This pragma provides *auto-vectorization hints*
1853 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1854 // provides *explicit vectorization hints* (LV can bypass legal checks and
1855 // assume that vectorization is legal). However, both hints are implemented
1856 // using the same metadata (llvm.loop.vectorize, processed by
1857 // LoopVectorizeHints). This will be fixed in the future when the native IR
1858 // representation for pragma 'omp simd' is introduced.
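// For illustration, an outer loop annotated with
//   #pragma clang loop vectorize(enable) vectorize_width(4)
// is treated here as an explicit request to vectorize the outer loop with
// VF = 4.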
1859 static bool isExplicitVecOuterLoop(Loop *OuterLp,
1860                                    OptimizationRemarkEmitter *ORE) {
1861   assert(!OuterLp->isInnermost() && "This is not an outer loop");
1862   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1863 
1864   // Only outer loops with an explicit vectorization hint are supported.
1865   // Unannotated outer loops are ignored.
1866   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
1867     return false;
1868 
1869   Function *Fn = OuterLp->getHeader()->getParent();
1870   if (!Hints.allowVectorization(Fn, OuterLp,
1871                                 true /*VectorizeOnlyWhenForced*/)) {
1872     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1873     return false;
1874   }
1875 
1876   if (Hints.getInterleave() > 1) {
1877     // TODO: Interleave support is future work.
1878     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1879                          "outer loops.\n");
1880     Hints.emitRemarkWithHints();
1881     return false;
1882   }
1883 
1884   return true;
1885 }
1886 
1887 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
1888                                   OptimizationRemarkEmitter *ORE,
1889                                   SmallVectorImpl<Loop *> &V) {
1890   // Collect inner loops and outer loops without irreducible control flow. For
1891   // now, only collect outer loops that have explicit vectorization hints. If we
1892   // are stress testing the VPlan H-CFG construction, we collect the outermost
1893   // loop of every loop nest.
1894   if (L.isInnermost() || VPlanBuildStressTest ||
1895       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
1896     LoopBlocksRPO RPOT(&L);
1897     RPOT.perform(LI);
1898     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1899       V.push_back(&L);
1900       // TODO: Collect inner loops inside marked outer loops in case
1901       // vectorization fails for the outer loop. Do not invoke
1902       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1903       // already known to be reducible. We can use an inherited attribute for
1904       // that.
1905       return;
1906     }
1907   }
1908   for (Loop *InnerL : L)
1909     collectSupportedLoops(*InnerL, LI, ORE, V);
1910 }
1911 
1912 namespace {
1913 
1914 /// The LoopVectorize Pass.
1915 struct LoopVectorize : public FunctionPass {
1916   /// Pass identification, replacement for typeid
1917   static char ID;
1918 
1919   LoopVectorizePass Impl;
1920 
1921   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
1922                          bool VectorizeOnlyWhenForced = false)
1923       : FunctionPass(ID),
1924         Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
1925     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
1926   }
1927 
1928   bool runOnFunction(Function &F) override {
1929     if (skipFunction(F))
1930       return false;
1931 
1932     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1933     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1934     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1935     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1936     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
1937     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
1938     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
1939     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1940     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1941     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
1942     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
1943     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1944     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
1945 
1946     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
1947         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
1948 
1949     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
1950                         GetLAA, *ORE, PSI).MadeAnyChange;
1951   }
1952 
1953   void getAnalysisUsage(AnalysisUsage &AU) const override {
1954     AU.addRequired<AssumptionCacheTracker>();
1955     AU.addRequired<BlockFrequencyInfoWrapperPass>();
1956     AU.addRequired<DominatorTreeWrapperPass>();
1957     AU.addRequired<LoopInfoWrapperPass>();
1958     AU.addRequired<ScalarEvolutionWrapperPass>();
1959     AU.addRequired<TargetTransformInfoWrapperPass>();
1960     AU.addRequired<AAResultsWrapperPass>();
1961     AU.addRequired<LoopAccessLegacyAnalysis>();
1962     AU.addRequired<DemandedBitsWrapperPass>();
1963     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
1964     AU.addRequired<InjectTLIMappingsLegacy>();
1965 
1966     // We currently do not preserve loopinfo/dominator analyses with outer loop
1967     // vectorization. Until this is addressed, mark these analyses as preserved
1968     // only for non-VPlan-native path.
1969     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
1970     if (!EnableVPlanNativePath) {
1971       AU.addPreserved<LoopInfoWrapperPass>();
1972       AU.addPreserved<DominatorTreeWrapperPass>();
1973     }
1974 
1975     AU.addPreserved<BasicAAWrapperPass>();
1976     AU.addPreserved<GlobalsAAWrapperPass>();
1977     AU.addRequired<ProfileSummaryInfoWrapperPass>();
1978   }
1979 };
1980 
1981 } // end anonymous namespace
1982 
1983 //===----------------------------------------------------------------------===//
// Implementation of LoopVectorizationLegality, InnerLoopVectorizer,
// LoopVectorizationCostModel and LoopVectorizationPlanner.
1986 //===----------------------------------------------------------------------===//
1987 
1988 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
  // We need to place the broadcast of invariant variables outside the loop,
  // but only if it's proven safe to do so. Otherwise, the broadcast will be
  // placed inside the vector loop body.
1992   Instruction *Instr = dyn_cast<Instruction>(V);
1993   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
1994                      (!Instr ||
1995                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
1996   // Place the code for broadcasting invariant variables in the new preheader.
1997   IRBuilder<>::InsertPointGuard Guard(Builder);
1998   if (SafeToHoist)
1999     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2000 
2001   // Broadcast the scalar into all locations in the vector.
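  // For a fixed VF this typically lowers to an insertelement into lane 0
  // followed by a zero-mask shufflevector, roughly (names illustrative):
  //   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %v, i32 0
  //   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
  //                                    <4 x i32> undef, <4 x i32> zeroinitializer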
2002   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
2003 
2004   return Shuf;
2005 }
2006 
2007 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
2008     const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
2009   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
2010          "Expected either an induction phi-node or a truncate of it!");
2011   Value *Start = II.getStartValue();
2012 
2013   // Construct the initial value of the vector IV in the vector loop preheader
2014   auto CurrIP = Builder.saveIP();
2015   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2016   if (isa<TruncInst>(EntryVal)) {
2017     assert(Start->getType()->isIntegerTy() &&
2018            "Truncation requires an integer type");
2019     auto *TruncType = cast<IntegerType>(EntryVal->getType());
2020     Step = Builder.CreateTrunc(Step, TruncType);
2021     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
2022   }
2023   Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
2024   Value *SteppedStart =
2025       getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
2026 
2027   // We create vector phi nodes for both integer and floating-point induction
2028   // variables. Here, we determine the kind of arithmetic we will perform.
2029   Instruction::BinaryOps AddOp;
2030   Instruction::BinaryOps MulOp;
2031   if (Step->getType()->isIntegerTy()) {
2032     AddOp = Instruction::Add;
2033     MulOp = Instruction::Mul;
2034   } else {
2035     AddOp = II.getInductionOpcode();
2036     MulOp = Instruction::FMul;
2037   }
2038 
2039   // Multiply the vectorization factor by the step using integer or
2040   // floating-point arithmetic as appropriate.
2041   Value *ConstVF =
2042       getSignedIntOrFpConstant(Step->getType(), VF.getKnownMinValue());
2043   Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
2044 
2045   // Create a vector splat to use in the induction update.
2046   //
2047   // FIXME: If the step is non-constant, we create the vector splat with
2048   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
2049   //        handle a constant vector splat.
2050   assert(!VF.isScalable() && "scalable vectors not yet supported.");
2051   Value *SplatVF = isa<Constant>(Mul)
2052                        ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
2053                        : Builder.CreateVectorSplat(VF, Mul);
2054   Builder.restoreIP(CurrIP);
2055 
2056   // We may need to add the step a number of times, depending on the unroll
2057   // factor. The last of those goes into the PHI.
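  // For illustration, with a fixed VF of 4, UF of 1, an i32 IV starting at 0
  // and a step of 1, the recurrence created below looks roughly like
  // (block names illustrative):
  //   %vec.ind      = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %preheader ],
  //                                 [ %vec.ind.next, %latch ]
  //   %vec.ind.next = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>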
2058   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
2059                                     &*LoopVectorBody->getFirstInsertionPt());
2060   VecInd->setDebugLoc(EntryVal->getDebugLoc());
2061   Instruction *LastInduction = VecInd;
2062   for (unsigned Part = 0; Part < UF; ++Part) {
2063     VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
2064 
2065     if (isa<TruncInst>(EntryVal))
2066       addMetadata(LastInduction, EntryVal);
2067     recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
2068 
2069     LastInduction = cast<Instruction>(addFastMathFlag(
2070         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
2071     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
2072   }
2073 
2074   // Move the last step to the end of the latch block. This ensures consistent
2075   // placement of all induction updates.
2076   auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
2077   auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
2078   auto *ICmp = cast<Instruction>(Br->getCondition());
2079   LastInduction->moveBefore(ICmp);
2080   LastInduction->setName("vec.ind.next");
2081 
2082   VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
2083   VecInd->addIncoming(LastInduction, LoopVectorLatch);
2084 }
2085 
2086 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
2087   return Cost->isScalarAfterVectorization(I, VF) ||
2088          Cost->isProfitableToScalarize(I, VF);
2089 }
2090 
2091 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
2092   if (shouldScalarizeInstruction(IV))
2093     return true;
2094   auto isScalarInst = [&](User *U) -> bool {
2095     auto *I = cast<Instruction>(U);
2096     return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
2097   };
2098   return llvm::any_of(IV->users(), isScalarInst);
2099 }
2100 
2101 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
2102     const InductionDescriptor &ID, const Instruction *EntryVal,
2103     Value *VectorLoopVal, unsigned Part, unsigned Lane) {
2104   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
2105          "Expected either an induction phi-node or a truncate of it!");
2106 
  // This induction variable is not the phi from the original loop but the
  // newly-created IV, based on the proof that the casted Phi is equal to the
  // uncasted Phi in the vectorized loop (possibly under a runtime guard). It
  // re-uses the same InductionDescriptor that the original IV uses, but we
  // don't have to do any recording in this case - that is done when the
  // original IV is processed.
2113   if (isa<TruncInst>(EntryVal))
2114     return;
2115 
2116   const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
2117   if (Casts.empty())
2118     return;
2119   // Only the first Cast instruction in the Casts vector is of interest.
  // The rest of the Casts (if they exist) have no uses outside the
2121   // induction update chain itself.
2122   Instruction *CastInst = *Casts.begin();
2123   if (Lane < UINT_MAX)
2124     VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
2125   else
2126     VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
2127 }
2128 
2129 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
2130   assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
2131          "Primary induction variable must have an integer type");
2132 
2133   auto II = Legal->getInductionVars().find(IV);
2134   assert(II != Legal->getInductionVars().end() && "IV is not an induction");
2135 
2136   auto ID = II->second;
2137   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
2138 
2139   // The value from the original loop to which we are mapping the new induction
2140   // variable.
2141   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
2142 
2143   auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
2144 
2145   // Generate code for the induction step. Note that induction steps are
2146   // required to be loop-invariant
2147   auto CreateStepValue = [&](const SCEV *Step) -> Value * {
2148     assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
2149            "Induction step should be loop invariant");
2150     if (PSE.getSE()->isSCEVable(IV->getType())) {
2151       SCEVExpander Exp(*PSE.getSE(), DL, "induction");
2152       return Exp.expandCodeFor(Step, Step->getType(),
2153                                LoopVectorPreHeader->getTerminator());
2154     }
2155     return cast<SCEVUnknown>(Step)->getValue();
2156   };
2157 
2158   // The scalar value to broadcast. This is derived from the canonical
2159   // induction variable. If a truncation type is given, truncate the canonical
2160   // induction variable and step. Otherwise, derive these values from the
2161   // induction descriptor.
2162   auto CreateScalarIV = [&](Value *&Step) -> Value * {
2163     Value *ScalarIV = Induction;
2164     if (IV != OldInduction) {
2165       ScalarIV = IV->getType()->isIntegerTy()
2166                      ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
2167                      : Builder.CreateCast(Instruction::SIToFP, Induction,
2168                                           IV->getType());
2169       ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
2170       ScalarIV->setName("offset.idx");
2171     }
2172     if (Trunc) {
2173       auto *TruncType = cast<IntegerType>(Trunc->getType());
2174       assert(Step->getType()->isIntegerTy() &&
2175              "Truncation requires an integer step");
2176       ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
2177       Step = Builder.CreateTrunc(Step, TruncType);
2178     }
2179     return ScalarIV;
2180   };
2181 
  // Create the vector values from the scalar IV when we are not creating a
  // vector IV.
2184   auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
2185     Value *Broadcasted = getBroadcastInstrs(ScalarIV);
2186     for (unsigned Part = 0; Part < UF; ++Part) {
2187       assert(!VF.isScalable() && "scalable vectors not yet supported.");
2188       Value *EntryPart =
2189           getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step,
2190                         ID.getInductionOpcode());
2191       VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
2192       if (Trunc)
2193         addMetadata(EntryPart, Trunc);
2194       recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
2195     }
2196   };
2197 
2198   // Now do the actual transformations, and start with creating the step value.
2199   Value *Step = CreateStepValue(ID.getStep());
2200   if (VF.isZero() || VF.isScalar()) {
2201     Value *ScalarIV = CreateScalarIV(Step);
2202     CreateSplatIV(ScalarIV, Step);
2203     return;
2204   }
2205 
2206   // Determine if we want a scalar version of the induction variable. This is
2207   // true if the induction variable itself is not widened, or if it has at
2208   // least one user in the loop that is not widened.
2209   auto NeedsScalarIV = needsScalarInduction(EntryVal);
2210   if (!NeedsScalarIV) {
2211     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
2212     return;
2213   }
2214 
2215   // Try to create a new independent vector induction variable. If we can't
2216   // create the phi node, we will splat the scalar induction variable in each
2217   // loop iteration.
2218   if (!shouldScalarizeInstruction(EntryVal)) {
2219     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
2220     Value *ScalarIV = CreateScalarIV(Step);
2221     // Create scalar steps that can be used by instructions we will later
2222     // scalarize. Note that the addition of the scalar steps will not increase
2223     // the number of instructions in the loop in the common case prior to
2224     // InstCombine. We will be trading one vector extract for each scalar step.
2225     buildScalarSteps(ScalarIV, Step, EntryVal, ID);
2226     return;
2227   }
2228 
  // All IV users are scalar instructions, so only emit a scalar IV, not a
  // vectorized IV, except when we tail-fold, in which case the splat IV feeds
  // the predicate used by the masked loads/stores.
2232   Value *ScalarIV = CreateScalarIV(Step);
2233   if (!Cost->isScalarEpilogueAllowed())
2234     CreateSplatIV(ScalarIV, Step);
2235   buildScalarSteps(ScalarIV, Step, EntryVal, ID);
2236 }
2237 
2238 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
2239                                           Instruction::BinaryOps BinOp) {
2240   // Create and check the types.
2241   auto *ValVTy = cast<FixedVectorType>(Val->getType());
2242   int VLen = ValVTy->getNumElements();
2243 
2244   Type *STy = Val->getType()->getScalarType();
2245   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2246          "Induction Step must be an integer or FP");
2247   assert(Step->getType() == STy && "Step has wrong type");
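  // For illustration, in the integer case with StartIdx = 0, VLen = 4 and a
  // step %s, the code below computes roughly:
  //   %induction = add %Val, (<i32 0, i32 1, i32 2, i32 3> * splat(%s))
  // The FP case does the same using fmul and the requested FAdd/FSub opcode.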
2248 
2249   SmallVector<Constant *, 8> Indices;
2250 
2251   if (STy->isIntegerTy()) {
    // Create a vector of consecutive numbers starting from StartIdx.
2253     for (int i = 0; i < VLen; ++i)
2254       Indices.push_back(ConstantInt::get(STy, StartIdx + i));
2255 
2256     // Add the consecutive indices to the vector value.
2257     Constant *Cv = ConstantVector::get(Indices);
2258     assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
2259     Step = Builder.CreateVectorSplat(VLen, Step);
2260     assert(Step->getType() == Val->getType() && "Invalid step vec");
2261     // FIXME: The newly created binary instructions should contain nsw/nuw flags,
2262     // which can be found from the original scalar operations.
2263     Step = Builder.CreateMul(Cv, Step);
2264     return Builder.CreateAdd(Val, Step, "induction");
2265   }
2266 
2267   // Floating point induction.
2268   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2269          "Binary Opcode should be specified for FP induction");
  // Create a vector of consecutive numbers starting from StartIdx.
2271   for (int i = 0; i < VLen; ++i)
2272     Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
2273 
2274   // Add the consecutive indices to the vector value.
2275   Constant *Cv = ConstantVector::get(Indices);
2276 
2277   Step = Builder.CreateVectorSplat(VLen, Step);
2278 
2279   // Floating point operations had to be 'fast' to enable the induction.
2280   FastMathFlags Flags;
2281   Flags.setFast();
2282 
2283   Value *MulOp = Builder.CreateFMul(Cv, Step);
2284   if (isa<Instruction>(MulOp))
    // We have to check because MulOp may be folded to a constant.
2286     cast<Instruction>(MulOp)->setFastMathFlags(Flags);
2287 
2288   Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2289   if (isa<Instruction>(BOp))
2290     cast<Instruction>(BOp)->setFastMathFlags(Flags);
2291   return BOp;
2292 }
2293 
2294 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
2295                                            Instruction *EntryVal,
2296                                            const InductionDescriptor &ID) {
2297   // We shouldn't have to build scalar steps if we aren't vectorizing.
2298   assert(VF.isVector() && "VF should be greater than one");
  // Get the value type and ensure it and the step have the same type.
2300   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2301   assert(ScalarIVTy == Step->getType() &&
2302          "Val and Step should have the same type");
2303 
2304   // We build scalar steps for both integer and floating-point induction
2305   // variables. Here, we determine the kind of arithmetic we will perform.
2306   Instruction::BinaryOps AddOp;
2307   Instruction::BinaryOps MulOp;
2308   if (ScalarIVTy->isIntegerTy()) {
2309     AddOp = Instruction::Add;
2310     MulOp = Instruction::Mul;
2311   } else {
2312     AddOp = ID.getInductionOpcode();
2313     MulOp = Instruction::FMul;
2314   }
2315 
2316   // Determine the number of scalars we need to generate for each unroll
2317   // iteration. If EntryVal is uniform, we only need to generate the first
2318   // lane. Otherwise, we generate all VF values.
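  // For illustration, with UF = 2, a fixed VF of 4 and an integer IV, the
  // scalar value generated below for (Part, Lane) is roughly:
  //   ScalarIV + (Part * 4 + Lane) * Step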
2319   unsigned Lanes =
2320       Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF)
2321           ? 1
2322           : VF.getKnownMinValue();
2323   assert((!VF.isScalable() || Lanes == 1) &&
2324          "Should never scalarize a scalable vector");
2325   // Compute the scalar steps and save the results in VectorLoopValueMap.
2326   for (unsigned Part = 0; Part < UF; ++Part) {
2327     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2328       auto *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
2329                                          ScalarIVTy->getScalarSizeInBits());
2330       Value *StartIdx =
2331           createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF);
2332       if (ScalarIVTy->isFloatingPointTy())
2333         StartIdx = Builder.CreateSIToFP(StartIdx, ScalarIVTy);
2334       StartIdx = addFastMathFlag(Builder.CreateBinOp(
2335           AddOp, StartIdx, getSignedIntOrFpConstant(ScalarIVTy, Lane)));
2336       // The step returned by `createStepForVF` is a runtime-evaluated value
2337       // when VF is scalable. Otherwise, it should be folded into a Constant.
2338       assert((VF.isScalable() || isa<Constant>(StartIdx)) &&
2339              "Expected StartIdx to be folded to a constant when VF is not "
2340              "scalable");
2341       auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
2342       auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
2343       VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
2344       recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
2345     }
2346   }
2347 }
2348 
2349 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
2350   assert(V != Induction && "The new induction variable should not be used.");
2351   assert(!V->getType()->isVectorTy() && "Can't widen a vector");
2352   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2353 
2354   // If we have a stride that is replaced by one, do it here. Defer this for
2355   // the VPlan-native path until we start running Legal checks in that path.
2356   if (!EnableVPlanNativePath && Legal->hasStride(V))
2357     V = ConstantInt::get(V->getType(), 1);
2358 
2359   // If we have a vector mapped to this value, return it.
2360   if (VectorLoopValueMap.hasVectorValue(V, Part))
2361     return VectorLoopValueMap.getVectorValue(V, Part);
2362 
2363   // If the value has not been vectorized, check if it has been scalarized
2364   // instead. If it has been scalarized, and we actually need the value in
2365   // vector form, we will construct the vector values on demand.
2366   if (VectorLoopValueMap.hasAnyScalarValue(V)) {
2367     Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
2368 
2369     // If we've scalarized a value, that value should be an instruction.
2370     auto *I = cast<Instruction>(V);
2371 
2372     // If we aren't vectorizing, we can just copy the scalar map values over to
2373     // the vector map.
2374     if (VF.isScalar()) {
2375       VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
2376       return ScalarValue;
2377     }
2378 
2379     // Get the last scalar instruction we generated for V and Part. If the value
2380     // is known to be uniform after vectorization, this corresponds to lane zero
2381     // of the Part unroll iteration. Otherwise, the last instruction is the one
2382     // we created for the last vector lane of the Part unroll iteration.
2383     unsigned LastLane = Cost->isUniformAfterVectorization(I, VF)
2384                             ? 0
2385                             : VF.getKnownMinValue() - 1;
2386     assert((!VF.isScalable() || LastLane == 0) &&
2387            "Scalable vectorization can't lead to any scalarized values.");
2388     auto *LastInst = cast<Instruction>(
2389         VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
2390 
2391     // Set the insert point after the last scalarized instruction. This ensures
2392     // the insertelement sequence will directly follow the scalar definitions.
2393     auto OldIP = Builder.saveIP();
2394     auto NewIP = std::next(BasicBlock::iterator(LastInst));
2395     Builder.SetInsertPoint(&*NewIP);
2396 
2397     // However, if we are vectorizing, we need to construct the vector values.
2398     // If the value is known to be uniform after vectorization, we can just
2399     // broadcast the scalar value corresponding to lane zero for each unroll
2400     // iteration. Otherwise, we construct the vector values using insertelement
2401     // instructions. Since the resulting vectors are stored in
2402     // VectorLoopValueMap, we will only generate the insertelements once.
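    // For illustration, assuming VF = 4 and an i32 value with scalars
    // %s0..%s3 for this part, the packing below emits a chain like:
    //   %v0 = insertelement <4 x i32> undef, i32 %s0, i32 0
    //   %v1 = insertelement <4 x i32> %v0, i32 %s1, i32 1
    // and so on up to lane 3; the final insertelement is the recorded value.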
2403     Value *VectorValue = nullptr;
2404     if (Cost->isUniformAfterVectorization(I, VF)) {
2405       VectorValue = getBroadcastInstrs(ScalarValue);
2406       VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
2407     } else {
2408       // Initialize packing with insertelements to start from undef.
2409       assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2410       Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
2411       VectorLoopValueMap.setVectorValue(V, Part, Undef);
2412       for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
2413         packScalarIntoVectorValue(V, {Part, Lane});
2414       VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
2415     }
2416     Builder.restoreIP(OldIP);
2417     return VectorValue;
2418   }
2419 
2420   // If this scalar is unknown, assume that it is a constant or that it is
2421   // loop invariant. Broadcast V and save the value for future uses.
2422   Value *B = getBroadcastInstrs(V);
2423   VectorLoopValueMap.setVectorValue(V, Part, B);
2424   return B;
2425 }
2426 
2427 Value *
2428 InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
2429                                             const VPIteration &Instance) {
2430   // If the value is not an instruction contained in the loop, it should
2431   // already be scalar.
2432   if (OrigLoop->isLoopInvariant(V))
2433     return V;
2434 
  assert((Instance.Lane == 0 ||
          !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)) &&
         "Uniform values only have lane zero");
2438 
2439   // If the value from the original loop has not been vectorized, it is
2440   // represented by UF x VF scalar values in the new loop. Return the requested
2441   // scalar value.
2442   if (VectorLoopValueMap.hasScalarValue(V, Instance))
2443     return VectorLoopValueMap.getScalarValue(V, Instance);
2444 
2445   // If the value has not been scalarized, get its entry in VectorLoopValueMap
2446   // for the given unroll part. If this entry is not a vector type (i.e., the
2447   // vectorization factor is one), there is no need to generate an
2448   // extractelement instruction.
2449   auto *U = getOrCreateVectorValue(V, Instance.Part);
2450   if (!U->getType()->isVectorTy()) {
2451     assert(VF.isScalar() && "Value not scalarized has non-vector type");
2452     return U;
2453   }
2454 
2455   // Otherwise, the value from the original loop has been vectorized and is
2456   // represented by UF vector values. Extract and return the requested scalar
2457   // value from the appropriate vector lane.
2458   return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
2459 }
2460 
2461 void InnerLoopVectorizer::packScalarIntoVectorValue(
2462     Value *V, const VPIteration &Instance) {
2463   assert(V != Induction && "The new induction variable should not be used.");
2464   assert(!V->getType()->isVectorTy() && "Can't pack a vector");
2465   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2466 
2467   Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
2468   Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
2469   VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
2470                                             Builder.getInt32(Instance.Lane));
2471   VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
2472 }
2473 
2474 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2475   assert(Vec->getType()->isVectorTy() && "Invalid type");
2476   assert(!VF.isScalable() && "Cannot reverse scalable vectors");
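  // For example, with VF = 4 the mask built below is <3, 2, 1, 0>, so lane i
  // of the result takes lane VF - 1 - i of Vec.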
2477   SmallVector<int, 8> ShuffleMask;
2478   for (unsigned i = 0; i < VF.getKnownMinValue(); ++i)
2479     ShuffleMask.push_back(VF.getKnownMinValue() - i - 1);
2480 
2481   return Builder.CreateShuffleVector(Vec, ShuffleMask, "reverse");
2482 }
2483 
2484 // Return whether we allow using masked interleave-groups (for dealing with
2485 // strided loads/stores that reside in predicated blocks, or for dealing
2486 // with gaps).
2487 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2488   // If an override option has been passed in for interleaved accesses, use it.
2489   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2490     return EnableMaskedInterleavedMemAccesses;
2491 
2492   return TTI.enableMaskedInterleavedAccessVectorization();
2493 }
2494 
2495 // Try to vectorize the interleave group that \p Instr belongs to.
2496 //
// E.g. Translate the following interleaved load group (factor = 3):
2498 //   for (i = 0; i < N; i+=3) {
2499 //     R = Pic[i];             // Member of index 0
2500 //     G = Pic[i+1];           // Member of index 1
2501 //     B = Pic[i+2];           // Member of index 2
2502 //     ... // do something to R, G, B
2503 //   }
2504 // To:
2505 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2506 //   %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9>   ; R elements
2507 //   %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10>  ; G elements
2508 //   %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11>  ; B elements
2509 //
// Or translate the following interleaved store group (factor = 3):
2511 //   for (i = 0; i < N; i+=3) {
2512 //     ... do something to R, G, B
2513 //     Pic[i]   = R;           // Member of index 0
2514 //     Pic[i+1] = G;           // Member of index 1
2515 //     Pic[i+2] = B;           // Member of index 2
2516 //   }
2517 // To:
2518 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2519 //   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2520 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2521 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2522 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2523 void InnerLoopVectorizer::vectorizeInterleaveGroup(
2524     const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
2525     VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
2526     VPValue *BlockInMask) {
2527   Instruction *Instr = Group->getInsertPos();
2528   const DataLayout &DL = Instr->getModule()->getDataLayout();
2529 
2530   // Prepare for the vector type of the interleaved load/store.
2531   Type *ScalarTy = getMemInstValueType(Instr);
2532   unsigned InterleaveFactor = Group->getFactor();
2533   assert(!VF.isScalable() && "scalable vectors not yet supported.");
2534   auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2535 
2536   // Prepare for the new pointers.
2537   SmallVector<Value *, 2> AddrParts;
2538   unsigned Index = Group->getIndex(Instr);
2539 
2540   // TODO: extend the masked interleaved-group support to reversed access.
2541   assert((!BlockInMask || !Group->isReverse()) &&
2542          "Reversed masked interleave-group not supported.");
2543 
2544   // If the group is reverse, adjust the index to refer to the last vector lane
2545   // instead of the first. We adjust the index from the first vector lane,
2546   // rather than directly getting the pointer for lane VF - 1, because the
2547   // pointer operand of the interleaved access is supposed to be uniform. For
2548   // uniform instructions, we're only required to generate a value for the
2549   // first vector lane in each unroll iteration.
2550   assert(!VF.isScalable() &&
2551          "scalable vector reverse operation is not implemented");
2552   if (Group->isReverse())
2553     Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
2554 
2555   for (unsigned Part = 0; Part < UF; Part++) {
2556     Value *AddrPart = State.get(Addr, {Part, 0});
2557     setDebugLocFromInst(Builder, AddrPart);
2558 
    // Note that the current instruction could be any member of the group, so
    // we need to adjust the address to the member of index 0.
    //
    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
    //       b = A[i];       // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
    //
    // E.g.  A[i+1] = a;     // Member of index 1
    //       A[i]   = b;     // Member of index 0
    //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].
2570 
2571     bool InBounds = false;
2572     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2573       InBounds = gep->isInBounds();
2574     AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2575     cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2576 
2577     // Cast to the vector pointer type.
2578     unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2579     Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2580     AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2581   }
2582 
2583   setDebugLocFromInst(Builder, Instr);
2584   Value *UndefVec = UndefValue::get(VecTy);
2585 
2586   Value *MaskForGaps = nullptr;
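  // For illustration, for a factor-3 load group whose member at index 2 is
  // missing and VF = 4, the gap mask would be <1,1,0, 1,1,0, 1,1,0, 1,1,0>,
  // disabling the lanes of the wide load that would read the gap.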
2587   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2588     assert(!VF.isScalable() && "scalable vectors not yet supported.");
2589     MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2590     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2591   }
2592 
2593   // Vectorize the interleaved load group.
2594   if (isa<LoadInst>(Instr)) {
2595     // For each unroll part, create a wide load for the group.
2596     SmallVector<Value *, 2> NewLoads;
2597     for (unsigned Part = 0; Part < UF; Part++) {
2598       Instruction *NewLoad;
2599       if (BlockInMask || MaskForGaps) {
2600         assert(useMaskedInterleavedAccesses(*TTI) &&
2601                "masked interleaved groups are not allowed.");
2602         Value *GroupMask = MaskForGaps;
2603         if (BlockInMask) {
2604           Value *BlockInMaskPart = State.get(BlockInMask, Part);
2605           assert(!VF.isScalable() && "scalable vectors not yet supported.");
2606           Value *ShuffledMask = Builder.CreateShuffleVector(
2607               BlockInMaskPart,
2608               createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2609               "interleaved.mask");
2610           GroupMask = MaskForGaps
2611                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2612                                                 MaskForGaps)
2613                           : ShuffledMask;
2614         }
2615         NewLoad =
2616             Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(),
2617                                      GroupMask, UndefVec, "wide.masked.vec");
      } else
2620         NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2621                                             Group->getAlign(), "wide.vec");
2622       Group->addMetadata(NewLoad);
2623       NewLoads.push_back(NewLoad);
2624     }
2625 
2626     // For each member in the group, shuffle out the appropriate data from the
2627     // wide loads.
2628     unsigned J = 0;
2629     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2630       Instruction *Member = Group->getMember(I);
2631 
2632       // Skip the gaps in the group.
2633       if (!Member)
2634         continue;
2635 
2636       assert(!VF.isScalable() && "scalable vectors not yet supported.");
2637       auto StrideMask =
2638           createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2639       for (unsigned Part = 0; Part < UF; Part++) {
2640         Value *StridedVec = Builder.CreateShuffleVector(
2641             NewLoads[Part], StrideMask, "strided.vec");
2642 
        // If this member has a different type, cast the result to its type.
2644         if (Member->getType() != ScalarTy) {
2645           assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2646           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2647           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2648         }
2649 
2650         if (Group->isReverse())
2651           StridedVec = reverseVector(StridedVec);
2652 
2653         State.set(VPDefs[J], Member, StridedVec, Part);
2654       }
2655       ++J;
2656     }
2657     return;
2658   }
2659 
  // The sub-vector type for the current instruction.
2661   assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2662   auto *SubVT = VectorType::get(ScalarTy, VF);
2663 
2664   // Vectorize the interleaved store group.
2665   for (unsigned Part = 0; Part < UF; Part++) {
2666     // Collect the stored vector from each member.
2667     SmallVector<Value *, 4> StoredVecs;
2668     for (unsigned i = 0; i < InterleaveFactor; i++) {
      // Interleaved store groups don't allow gaps; each index has a member.
      assert(Group->getMember(i) &&
             "Failed to get a member from an interleaved store group");
2671 
2672       Value *StoredVec = State.get(StoredValues[i], Part);
2673 
2674       if (Group->isReverse())
2675         StoredVec = reverseVector(StoredVec);
2676 
      // If this member has a different type, cast it to the unified type.
2679       if (StoredVec->getType() != SubVT)
2680         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2681 
2682       StoredVecs.push_back(StoredVec);
2683     }
2684 
2685     // Concatenate all vectors into a wide vector.
2686     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2687 
2688     // Interleave the elements in the wide vector.
2689     assert(!VF.isScalable() && "scalable vectors not yet supported.");
2690     Value *IVec = Builder.CreateShuffleVector(
2691         WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
2692         "interleaved.vec");
2693 
2694     Instruction *NewStoreInstr;
2695     if (BlockInMask) {
2696       Value *BlockInMaskPart = State.get(BlockInMask, Part);
2697       Value *ShuffledMask = Builder.CreateShuffleVector(
2698           BlockInMaskPart,
2699           createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2700           "interleaved.mask");
2701       NewStoreInstr = Builder.CreateMaskedStore(
2702           IVec, AddrParts[Part], Group->getAlign(), ShuffledMask);
    } else
2705       NewStoreInstr =
2706           Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2707 
2708     Group->addMetadata(NewStoreInstr);
2709   }
2710 }
2711 
2712 void InnerLoopVectorizer::vectorizeMemoryInstruction(
2713     Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr,
2714     VPValue *StoredValue, VPValue *BlockInMask) {
2715   // Attempt to issue a wide load.
2716   LoadInst *LI = dyn_cast<LoadInst>(Instr);
2717   StoreInst *SI = dyn_cast<StoreInst>(Instr);
2718 
2719   assert((LI || SI) && "Invalid Load/Store instruction");
2720   assert((!SI || StoredValue) && "No stored value provided for widened store");
2721   assert((!LI || !StoredValue) && "Stored value provided for widened load");
2722 
2723   LoopVectorizationCostModel::InstWidening Decision =
2724       Cost->getWideningDecision(Instr, VF);
2725   assert((Decision == LoopVectorizationCostModel::CM_Widen ||
2726           Decision == LoopVectorizationCostModel::CM_Widen_Reverse ||
2727           Decision == LoopVectorizationCostModel::CM_GatherScatter) &&
2728          "CM decision is not to widen the memory instruction");
2729 
2730   Type *ScalarDataTy = getMemInstValueType(Instr);
2731 
2732   auto *DataTy = VectorType::get(ScalarDataTy, VF);
2733   const Align Alignment = getLoadStoreAlignment(Instr);
2734 
2735   // Determine if the pointer operand of the access is either consecutive or
2736   // reverse consecutive.
2737   bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2738   bool ConsecutiveStride =
2739       Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2740   bool CreateGatherScatter =
2741       (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2742 
2743   // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2744   // gather/scatter. Otherwise Decision should have been to Scalarize.
2745   assert((ConsecutiveStride || CreateGatherScatter) &&
2746          "The instruction should be scalarized");
2747   (void)ConsecutiveStride;
2748 
2749   VectorParts BlockInMaskParts(UF);
2750   bool isMaskRequired = BlockInMask;
2751   if (isMaskRequired)
2752     for (unsigned Part = 0; Part < UF; ++Part)
2753       BlockInMaskParts[Part] = State.get(BlockInMask, Part);
2754 
2755   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2756     // Calculate the pointer for the specific unroll-part.
2757     GetElementPtrInst *PartPtr = nullptr;
2758 
2759     bool InBounds = false;
2760     if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2761       InBounds = gep->isInBounds();
2762 
2763     if (Reverse) {
2764       assert(!VF.isScalable() &&
2765              "Reversing vectors is not yet supported for scalable vectors.");
2766 
2767       // If the address is consecutive but reversed, then the
2768       // wide store needs to start at the last vector element.
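      // For illustration, with VF = 4 and Part = 1 the two GEPs below step the
      // pointer back by 4 elements and then by another 3, so the wide access
      // covers, in reverse order, the 4 elements this part would touch.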
2769       PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
2770           ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.getKnownMinValue())));
2771       PartPtr->setIsInBounds(InBounds);
2772       PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
2773           ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.getKnownMinValue())));
2774       PartPtr->setIsInBounds(InBounds);
2775       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2776         BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
2777     } else {
2778       Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF);
2779       PartPtr = cast<GetElementPtrInst>(
2780           Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
2781       PartPtr->setIsInBounds(InBounds);
2782     }
2783 
2784     unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
2785     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2786   };
2787 
2788   // Handle Stores:
2789   if (SI) {
2790     setDebugLocFromInst(Builder, SI);
2791 
2792     for (unsigned Part = 0; Part < UF; ++Part) {
2793       Instruction *NewSI = nullptr;
2794       Value *StoredVal = State.get(StoredValue, Part);
2795       if (CreateGatherScatter) {
2796         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2797         Value *VectorGep = State.get(Addr, Part);
2798         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
2799                                             MaskPart);
2800       } else {
2801         if (Reverse) {
2802           // If we store to reverse consecutive memory locations, then we need
2803           // to reverse the order of elements in the stored value.
2804           StoredVal = reverseVector(StoredVal);
2805           // We don't want to update the value in the map as it might be used in
2806           // another expression. So don't call resetVectorValue(StoredVal).
2807         }
2808         auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2809         if (isMaskRequired)
2810           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
2811                                             BlockInMaskParts[Part]);
2812         else
2813           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
2814       }
2815       addMetadata(NewSI, SI);
2816     }
2817     return;
2818   }
2819 
2820   // Handle loads.
2821   assert(LI && "Must have a load instruction");
2822   setDebugLocFromInst(Builder, LI);
2823   for (unsigned Part = 0; Part < UF; ++Part) {
2824     Value *NewLI;
2825     if (CreateGatherScatter) {
2826       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2827       Value *VectorGep = State.get(Addr, Part);
2828       NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
2829                                          nullptr, "wide.masked.gather");
2830       addMetadata(NewLI, LI);
2831     } else {
2832       auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2833       if (isMaskRequired)
2834         NewLI = Builder.CreateMaskedLoad(
2835             VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy),
2836             "wide.masked.load");
2837       else
2838         NewLI =
2839             Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
2840 
      // Add metadata to the load itself; the value recorded for Def below is
      // the reverse shuffle when the access is reversed.
2842       addMetadata(NewLI, LI);
2843       if (Reverse)
2844         NewLI = reverseVector(NewLI);
2845     }
2846 
2847     State.set(Def, Instr, NewLI, Part);
2848   }
2849 }
2850 
2851 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User,
2852                                                const VPIteration &Instance,
2853                                                bool IfPredicateInstr,
2854                                                VPTransformState &State) {
2855   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2856 
2857   setDebugLocFromInst(Builder, Instr);
2858 
  // Does this instruction return a value?
2860   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2861 
2862   Instruction *Cloned = Instr->clone();
2863   if (!IsVoidRetTy)
2864     Cloned->setName(Instr->getName() + ".cloned");
2865 
  // Replace the operands of the cloned instruction with their scalar
  // equivalents in the new loop.
2868   for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) {
2869     auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op));
2870     auto InputInstance = Instance;
2871     if (!Operand || !OrigLoop->contains(Operand) ||
2872         (Cost->isUniformAfterVectorization(Operand, State.VF)))
2873       InputInstance.Lane = 0;
2874     auto *NewOp = State.get(User.getOperand(op), InputInstance);
2875     Cloned->setOperand(op, NewOp);
2876   }
2877   addNewMetadata(Cloned, Instr);
2878 
2879   // Place the cloned scalar in the new loop.
2880   Builder.Insert(Cloned);
2881 
  // TODO: Set result for VPValue of VPReplicateRecipe. This requires
2883   // representing scalar values in VPTransformState. Add the cloned scalar to
2884   // the scalar map entry.
2885   VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2886 
  // If we just cloned a new assumption, add it to the assumption cache.
2888   if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2889     if (II->getIntrinsicID() == Intrinsic::assume)
2890       AC->registerAssumption(II);
2891 
2892   // End if-block.
2893   if (IfPredicateInstr)
2894     PredicatedInstructions.push_back(Cloned);
2895 }
2896 
2897 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2898                                                       Value *End, Value *Step,
2899                                                       Instruction *DL) {
2900   BasicBlock *Header = L->getHeader();
2901   BasicBlock *Latch = L->getLoopLatch();
2902   // As we're just creating this loop, it's possible no latch exists
2903   // yet. If so, use the header as this will be a single block loop.
2904   if (!Latch)
2905     Latch = Header;
2906 
2907   IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2908   Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2909   setDebugLocFromInst(Builder, OldInst);
2910   auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2911 
2912   Builder.SetInsertPoint(Latch->getTerminator());
2913   setDebugLocFromInst(Builder, OldInst);
2914 
2915   // Create i+1 and fill the PHINode.
2916   Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2917   Induction->addIncoming(Start, L->getLoopPreheader());
2918   Induction->addIncoming(Next, Latch);
2919   // Create the compare.
2920   Value *ICmp = Builder.CreateICmpEQ(Next, End);
2921   Builder.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header);
2922 
2923   // Now we have two terminators. Remove the old one from the block.
2924   Latch->getTerminator()->eraseFromParent();
2925 
2926   return Induction;
2927 }
2928 
2929 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2930   if (TripCount)
2931     return TripCount;
2932 
2933   assert(L && "Create Trip Count for null loop.");
2934   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2935   // Find the loop boundaries.
2936   ScalarEvolution *SE = PSE.getSE();
2937   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2938   assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
2939          "Invalid loop count");
2940 
2941   Type *IdxTy = Legal->getWidestInductionType();
2942   assert(IdxTy && "No type for induction");
2943 
  // The exit count might have type i64 while the phi has type i32. This can
  // happen if we have an induction variable that is sign-extended before the
  // compare. The only way we can get a backedge-taken count in that case is if
  // the induction variable is signed and therefore will not overflow, so the
  // truncation is legal.
2949   if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
2950       IdxTy->getPrimitiveSizeInBits())
2951     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2952   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2953 
2954   // Get the total trip count from the count by adding 1.
2955   const SCEV *ExitCount = SE->getAddExpr(
2956       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2957 
2958   const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
2959 
2960   // Expand the trip count and place the new instructions in the preheader.
2961   // Notice that the pre-header does not change, only the loop body.
2962   SCEVExpander Exp(*SE, DL, "induction");
2963 
2964   // Count holds the overall loop count (N).
2965   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2966                                 L->getLoopPreheader()->getTerminator());
2967 
2968   if (TripCount->getType()->isPointerTy())
2969     TripCount =
2970         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2971                                     L->getLoopPreheader()->getTerminator());
2972 
2973   return TripCount;
2974 }
2975 
2976 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
2977   if (VectorTripCount)
2978     return VectorTripCount;
2979 
2980   Value *TC = getOrCreateTripCount(L);
2981   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2982 
2983   Type *Ty = TC->getType();
2984   // This is where we can make the step a runtime constant.
2985   Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF);
2986 
2987   // If the tail is to be folded by masking, round the number of iterations N
2988   // up to a multiple of Step instead of rounding down. This is done by first
2989   // adding Step-1 and then rounding down. Note that it's ok if this addition
2990   // overflows: the vector induction variable will eventually wrap to zero given
2991   // that it starts at zero and its Step is a power of two; the loop will then
2992   // exit, with the last early-exit vector comparison also producing all-true.
2993   if (Cost->foldTailByMasking()) {
2994     assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
2995            "VF*UF must be a power of 2 when folding tail by masking");
2996     assert(!VF.isScalable() &&
2997            "Tail folding not yet supported for scalable vectors");
2998     TC = Builder.CreateAdd(
2999         TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up");
3000   }
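  // For illustration, with VF * UF = 8 and an original trip count of 13, tail
  // folding rounds TC up to 20 and the computation below yields a vector trip
  // count of 20 - 20 % 8 = 16, i.e. two masked vector iterations covering all
  // 13 original iterations.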
3001 
3002   // Now we need to generate the expression for the part of the loop that the
3003   // vectorized body will execute. This is equal to N - (N % Step) if scalar
3004   // iterations are not required for correctness, or N - Step, otherwise. Step
3005   // is equal to the vectorization factor (number of SIMD elements) times the
3006   // unroll factor (number of SIMD instructions).
3007   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
3008 
3009   // There are two cases where we need to ensure (at least) the last iteration
3010   // runs in the scalar remainder loop. Thus, if the step evenly divides
3011   // the trip count, we set the remainder to be equal to the step. If the step
3012   // does not evenly divide the trip count, no adjustment is necessary since
3013   // there will already be scalar iterations. Note that the minimum iterations
3014   // check ensures that N >= Step. The cases are:
3015   // 1) If there is a non-reversed interleaved group that may speculatively
3016   //    access memory out-of-bounds.
3017   // 2) If any instruction may follow a conditionally taken exit. That is, if
3018   //    the loop contains multiple exiting blocks, or a single exiting block
3019   //    which is not the latch.
3020   if (VF.isVector() && Cost->requiresScalarEpilogue()) {
3021     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
3022     R = Builder.CreateSelect(IsZero, Step, R);
3023   }
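  // For illustration, with TC = 16 and Step = 8, R is 0 and is bumped to 8 by
  // the select above, so the vector loop covers 8 iterations and the
  // remaining 8 run in the scalar epilogue.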
3024 
3025   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
3026 
3027   return VectorTripCount;
3028 }
3029 
3030 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
3031                                                    const DataLayout &DL) {
  // Verify that V is a vector type with the same number of elements as DstVTy.
3033   auto *DstFVTy = cast<FixedVectorType>(DstVTy);
3034   unsigned VF = DstFVTy->getNumElements();
3035   auto *SrcVecTy = cast<FixedVectorType>(V->getType());
  assert((VF == SrcVecTy->getNumElements()) &&
         "Vector dimensions do not match");
3037   Type *SrcElemTy = SrcVecTy->getElementType();
3038   Type *DstElemTy = DstFVTy->getElementType();
3039   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
3040          "Vector elements must have same size");
3041 
3042   // Do a direct cast if element types are castable.
3043   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
3044     return Builder.CreateBitOrPointerCast(V, DstFVTy);
3045   }
  // V cannot be cast directly to the desired vector type. This may happen
  // when V is a floating-point vector but DstVTy is a vector of pointers, or
  // vice-versa. Handle this with a two-step cast through an intermediate
  // integer vector type, i.e. Ptr <-> Int <-> Float.
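  // For illustration, on a target with 64-bit pointers, casting <4 x double>
  // to <4 x i64*> is done as <4 x double> -> <4 x i64> -> <4 x i64*>.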
3050   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
3051          "Only one type should be a pointer type");
3052   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
3053          "Only one type should be a floating point type");
3054   Type *IntTy =
3055       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
3056   auto *VecIntTy = FixedVectorType::get(IntTy, VF);
3057   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
3058   return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
3059 }
3060 
3061 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
3062                                                          BasicBlock *Bypass) {
3063   Value *Count = getOrCreateTripCount(L);
3064   // Reuse existing vector loop preheader for TC checks.
3065   // Note that new preheader block is generated for vector loop.
3066   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
3067   IRBuilder<> Builder(TCCheckBlock->getTerminator());
3068 
3069   // Generate code to check if the loop's trip count is less than VF * UF, or
3070   // equal to it in case a scalar epilogue is required; this implies that the
3071   // vector trip count is zero. This check also covers the case where adding one
3072   // to the backedge-taken count overflowed leading to an incorrect trip count
3073   // of zero. In this case we will also jump to the scalar loop.
3074   auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
3075                                           : ICmpInst::ICMP_ULT;
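  // For illustration, with VF * UF = 8, a trip count of 8, and a required
  // scalar epilogue, the check below is "8 ule 8", so we branch to the scalar
  // loop because the vector trip count would be zero.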
3076 
3077   // If tail is to be folded, vector loop takes care of all iterations.
3078   Value *CheckMinIters = Builder.getFalse();
3079   if (!Cost->foldTailByMasking()) {
3080     Value *Step =
3081         createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF);
3082     CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
3083   }
3084   // Create new preheader for vector loop.
3085   LoopVectorPreHeader =
3086       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
3087                  "vector.ph");
3088 
3089   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
3090                                DT->getNode(Bypass)->getIDom()) &&
3091          "TC check is expected to dominate Bypass");
3092 
3093   // Update dominator for Bypass & LoopExit.
3094   DT->changeImmediateDominator(Bypass, TCCheckBlock);
3095   DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
3096 
3097   ReplaceInstWithInst(
3098       TCCheckBlock->getTerminator(),
3099       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
3100   LoopBypassBlocks.push_back(TCCheckBlock);
3101 }
3102 
3103 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
3104   // Reuse existing vector loop preheader for SCEV checks.
3105   // Note that new preheader block is generated for vector loop.
3106   BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader;
3107 
  // Generate the code to check the SCEV assumptions that we made.
3109   // We want the new basic block to start at the first instruction in a
3110   // sequence of instructions that form a check.
3111   SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
3112                    "scev.check");
3113   Value *SCEVCheck = Exp.expandCodeForPredicate(
3114       &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator());
3115 
3116   if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
3117     if (C->isZero())
3118       return;
3119 
3120   assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
3121            (OptForSizeBasedOnProfile &&
3122             Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
3123          "Cannot SCEV check stride or overflow when optimizing for size");
3124 
3125   SCEVCheckBlock->setName("vector.scevcheck");
3126   // Create new preheader for vector loop.
3127   LoopVectorPreHeader =
3128       SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI,
3129                  nullptr, "vector.ph");
3130 
  // Update dominator only if this is the first RT check.
3132   if (LoopBypassBlocks.empty()) {
3133     DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
3134     DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
3135   }
3136 
3137   ReplaceInstWithInst(
3138       SCEVCheckBlock->getTerminator(),
3139       BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck));
3140   LoopBypassBlocks.push_back(SCEVCheckBlock);
3141   AddedSafetyChecks = true;
3142 }
3143 
3144 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
3145   // VPlan-native path does not do any analysis for runtime checks currently.
3146   if (EnableVPlanNativePath)
3147     return;
3148 
3149   // Reuse existing vector loop preheader for runtime memory checks.
3150   // Note that new preheader block is generated for vector loop.
3151   BasicBlock *const MemCheckBlock = L->getLoopPreheader();
3152 
  // Generate the code that checks at runtime whether arrays overlap. We put
  // the checks into a separate block to make the more common case of few
  // elements faster.
3156   auto *LAI = Legal->getLAI();
3157   const auto &RtPtrChecking = *LAI->getRuntimePointerChecking();
3158   if (!RtPtrChecking.Need)
3159     return;
3160 
3161   if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
3162     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
3163            "Cannot emit memory checks when optimizing for size, unless forced "
3164            "to vectorize.");
3165     ORE->emit([&]() {
3166       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
3167                                         L->getStartLoc(), L->getHeader())
3168              << "Code-size may be reduced by not forcing "
3169                 "vectorization, or by source-code modifications "
3170                 "eliminating the need for runtime checks "
3171                 "(e.g., adding 'restrict').";
3172     });
3173   }
3174 
3175   MemCheckBlock->setName("vector.memcheck");
3176   // Create new preheader for vector loop.
3177   LoopVectorPreHeader =
3178       SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr,
3179                  "vector.ph");
3180 
3181   auto *CondBranch = cast<BranchInst>(
3182       Builder.CreateCondBr(Builder.getTrue(), Bypass, LoopVectorPreHeader));
3183   ReplaceInstWithInst(MemCheckBlock->getTerminator(), CondBranch);
3184   LoopBypassBlocks.push_back(MemCheckBlock);
3185   AddedSafetyChecks = true;
3186 
  // Update dominator only if this is the first RT check.
3188   if (LoopBypassBlocks.empty()) {
3189     DT->changeImmediateDominator(Bypass, MemCheckBlock);
3190     DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock);
3191   }
3192 
3193   Instruction *FirstCheckInst;
3194   Instruction *MemRuntimeCheck;
3195   std::tie(FirstCheckInst, MemRuntimeCheck) =
3196       addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop,
3197                        RtPtrChecking.getChecks(), RtPtrChecking.getSE());
3198   assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking "
3199                             "claimed checks are required");
3200   CondBranch->setCondition(MemRuntimeCheck);
3201 
3202   // We currently don't use LoopVersioning for the actual loop cloning but we
3203   // still use it to add the noalias metadata.
3204   LVer = std::make_unique<LoopVersioning>(
3205       *Legal->getLAI(),
3206       Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI,
3207       DT, PSE.getSE());
3208   LVer->prepareNoAliasMetadata();
3209 }
3210 
3211 Value *InnerLoopVectorizer::emitTransformedIndex(
3212     IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
3213     const InductionDescriptor &ID) const {
3214 
3215   SCEVExpander Exp(*SE, DL, "induction");
3216   auto Step = ID.getStep();
3217   auto StartValue = ID.getStartValue();
3218   assert(Index->getType() == Step->getType() &&
3219          "Index type does not match StepValue type");
3220 
  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // more optimal code. Unfortunately, attempting to do so on invalid IR may
  // lead to various SCEV crashes. So all we can do is use the builder and rely
  // on InstCombine for future simplifications. Here we handle only some
  // trivial cases.
3227   auto CreateAdd = [&B](Value *X, Value *Y) {
3228     assert(X->getType() == Y->getType() && "Types don't match!");
3229     if (auto *CX = dyn_cast<ConstantInt>(X))
3230       if (CX->isZero())
3231         return Y;
3232     if (auto *CY = dyn_cast<ConstantInt>(Y))
3233       if (CY->isZero())
3234         return X;
3235     return B.CreateAdd(X, Y);
3236   };
3237 
3238   auto CreateMul = [&B](Value *X, Value *Y) {
3239     assert(X->getType() == Y->getType() && "Types don't match!");
3240     if (auto *CX = dyn_cast<ConstantInt>(X))
3241       if (CX->isOne())
3242         return Y;
3243     if (auto *CY = dyn_cast<ConstantInt>(Y))
3244       if (CY->isOne())
3245         return X;
3246     return B.CreateMul(X, Y);
3247   };
3248 
3249   // Get a suitable insert point for SCEV expansion. For blocks in the vector
3250   // loop, choose the end of the vector loop header (=LoopVectorBody), because
3251   // the DomTree is not kept up-to-date for additional blocks generated in the
3252   // vector loop. By using the header as insertion point, we guarantee that the
3253   // expanded instructions dominate all their uses.
3254   auto GetInsertPoint = [this, &B]() {
3255     BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
3256     if (InsertBB != LoopVectorBody &&
3257         LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB))
3258       return LoopVectorBody->getTerminator();
3259     return &*B.GetInsertPoint();
3260   };
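  // In short, the cases below produce StartValue + Index * Step for integer
  // inductions, a GEP of StartValue by Index * Step elements for pointer
  // inductions, and StartValue fadd/fsub Index * Step for FP inductions.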
3261   switch (ID.getKind()) {
3262   case InductionDescriptor::IK_IntInduction: {
3263     assert(Index->getType() == StartValue->getType() &&
3264            "Index type does not match StartValue type");
3265     if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
3266       return B.CreateSub(StartValue, Index);
3267     auto *Offset = CreateMul(
3268         Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
3269     return CreateAdd(StartValue, Offset);
3270   }
3271   case InductionDescriptor::IK_PtrInduction: {
3272     assert(isa<SCEVConstant>(Step) &&
3273            "Expected constant step for pointer induction");
3274     return B.CreateGEP(
3275         StartValue->getType()->getPointerElementType(), StartValue,
3276         CreateMul(Index,
3277                   Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())));
3278   }
3279   case InductionDescriptor::IK_FpInduction: {
3280     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
3281     auto InductionBinOp = ID.getInductionBinOp();
3282     assert(InductionBinOp &&
3283            (InductionBinOp->getOpcode() == Instruction::FAdd ||
3284             InductionBinOp->getOpcode() == Instruction::FSub) &&
3285            "Original bin op should be defined for FP induction");
3286 
3287     Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
3288 
3289     // Floating point operations had to be 'fast' to enable the induction.
3290     FastMathFlags Flags;
3291     Flags.setFast();
3292 
3293     Value *MulExp = B.CreateFMul(StepValue, Index);
3294     if (isa<Instruction>(MulExp))
      // We have to check because MulExp may be a constant.
3296       cast<Instruction>(MulExp)->setFastMathFlags(Flags);
3297 
3298     Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
3299                                "induction");
3300     if (isa<Instruction>(BOp))
3301       cast<Instruction>(BOp)->setFastMathFlags(Flags);
3302 
3303     return BOp;
3304   }
3305   case InductionDescriptor::IK_NoInduction:
3306     return nullptr;
3307   }
3308   llvm_unreachable("invalid enum");
3309 }
3310 
3311 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3312   LoopScalarBody = OrigLoop->getHeader();
3313   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3314   LoopExitBlock = OrigLoop->getUniqueExitBlock();
3315   assert(LoopExitBlock && "Must have an exit block");
3316   assert(LoopVectorPreHeader && "Invalid loop structure");
3317 
3318   LoopMiddleBlock =
3319       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3320                  LI, nullptr, Twine(Prefix) + "middle.block");
3321   LoopScalarPreHeader =
3322       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3323                  nullptr, Twine(Prefix) + "scalar.ph");
3324 
3325   // Set up branch from middle block to the exit and scalar preheader blocks.
3326   // completeLoopSkeleton will update the condition to use an iteration check,
3327   // if required to decide whether to execute the remainder.
3328   BranchInst *BrInst =
3329       BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, Builder.getTrue());
3330   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3331   BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3332   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3333 
  // We intentionally don't let SplitBlock update LoopInfo, since
  // LoopVectorBody should belong to a different loop than LoopVectorPreHeader.
  // LoopVectorBody is explicitly added to the correct place a few lines later.
3337   LoopVectorBody =
3338       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3339                  nullptr, nullptr, Twine(Prefix) + "vector.body");
3340 
3341   // Update dominator for loop exit.
3342   DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3343 
3344   // Create and register the new vector loop.
3345   Loop *Lp = LI->AllocateLoop();
3346   Loop *ParentLoop = OrigLoop->getParentLoop();
3347 
3348   // Insert the new loop into the loop nest and register the new basic blocks
3349   // before calling any utilities such as SCEV that require valid LoopInfo.
3350   if (ParentLoop) {
3351     ParentLoop->addChildLoop(Lp);
3352   } else {
3353     LI->addTopLevelLoop(Lp);
3354   }
3355   Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
3356   return Lp;
3357 }
3358 
3359 void InnerLoopVectorizer::createInductionResumeValues(
3360     Loop *L, Value *VectorTripCount,
3361     std::pair<BasicBlock *, Value *> AdditionalBypass) {
3362   assert(VectorTripCount && L && "Expected valid arguments");
3363   assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3364           (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3365          "Inconsistent information about additional bypass.");
3366   // We are going to resume the execution of the scalar loop.
3367   // Go over all of the induction variables that we found and fix the
3368   // PHIs that are left in the scalar version of the loop.
3369   // The starting values of PHI nodes depend on the counter of the last
3370   // iteration in the vectorized loop.
3371   // If we come from a bypass edge then we need to start from the original
3372   // start value.
3373   for (auto &InductionEntry : Legal->getInductionVars()) {
3374     PHINode *OrigPhi = InductionEntry.first;
3375     InductionDescriptor II = InductionEntry.second;
3376 
    // Create phi nodes to merge from the backedge-taken check block.
3378     PHINode *BCResumeVal =
3379         PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3380                         LoopScalarPreHeader->getTerminator());
3381     // Copy original phi DL over to the new one.
3382     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3383     Value *&EndValue = IVEndValues[OrigPhi];
3384     Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3385     if (OrigPhi == OldInduction) {
3386       // We know what the end value is.
3387       EndValue = VectorTripCount;
3388     } else {
3389       IRBuilder<> B(L->getLoopPreheader()->getTerminator());
3390       Type *StepType = II.getStep()->getType();
3391       Instruction::CastOps CastOp =
3392           CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
3393       Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd");
3394       const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
3395       EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3396       EndValue->setName("ind.end");
3397 
3398       // Compute the end value for the additional bypass (if applicable).
3399       if (AdditionalBypass.first) {
3400         B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
3401         CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true,
3402                                          StepType, true);
3403         CRD =
3404             B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd");
3405         EndValueFromAdditionalBypass =
3406             emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3407         EndValueFromAdditionalBypass->setName("ind.end");
3408       }
3409     }
3410     // The new PHI merges the original incoming value, in case of a bypass,
3411     // or the value at the end of the vectorized loop.
3412     BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3413 
3414     // Fix the scalar body counter (PHI node).
3415     // The old induction's phi node in the scalar body needs the truncated
3416     // value.
3417     for (BasicBlock *BB : LoopBypassBlocks)
3418       BCResumeVal->addIncoming(II.getStartValue(), BB);
3419 
3420     if (AdditionalBypass.first)
3421       BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
3422                                             EndValueFromAdditionalBypass);
3423 
3424     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3425   }
3426 }
3427 
3428 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
3429                                                       MDNode *OrigLoopID) {
3430   assert(L && "Expected valid loop.");
3431 
3432   // The trip counts should be cached by now.
3433   Value *Count = getOrCreateTripCount(L);
3434   Value *VectorTripCount = getOrCreateVectorTripCount(L);
3435 
3436   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3437 
3438   // Add a check in the middle block to see if we have completed
3439   // all of the iterations in the first vector loop.
3440   // If (N - N%VF) == N, then we *don't* need to run the remainder.
3441   // If tail is to be folded, we know we don't need to run the remainder.
3442   if (!Cost->foldTailByMasking()) {
3443     Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
3444                                         Count, VectorTripCount, "cmp.n",
3445                                         LoopMiddleBlock->getTerminator());
3446 
3447     // Here we use the same DebugLoc as the scalar loop latch terminator instead
3448     // of the corresponding compare because they may have ended up with
3449     // different line numbers and we want to avoid awkward line stepping while
    // debugging. E.g., if the compare got a line number inside the loop.
3451     CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3452     cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN);
3453   }
3454 
3455   // Get ready to start creating new instructions into the vectorized body.
3456   assert(LoopVectorPreHeader == L->getLoopPreheader() &&
3457          "Inconsistent vector loop preheader");
3458   Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3459 
3460   Optional<MDNode *> VectorizedLoopID =
3461       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3462                                       LLVMLoopVectorizeFollowupVectorized});
3463   if (VectorizedLoopID.hasValue()) {
3464     L->setLoopID(VectorizedLoopID.getValue());
3465 
3466     // Do not setAlreadyVectorized if loop attributes have been defined
3467     // explicitly.
3468     return LoopVectorPreHeader;
3469   }
3470 
3471   // Keep all loop hints from the original loop on the vector loop (we'll
3472   // replace the vectorizer-specific hints below).
3473   if (MDNode *LID = OrigLoop->getLoopID())
3474     L->setLoopID(LID);
3475 
3476   LoopVectorizeHints Hints(L, true, *ORE);
3477   Hints.setAlreadyVectorized();
3478 
3479 #ifdef EXPENSIVE_CHECKS
3480   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3481   LI->verify(*DT);
3482 #endif
3483 
3484   return LoopVectorPreHeader;
3485 }
3486 
3487 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3488   /*
3489    In this function we generate a new loop. The new loop will contain
3490    the vectorized instructions while the old loop will continue to run the
3491    scalar remainder.
3492 
3493        [ ] <-- loop iteration number check.
3494     /   |
3495    /    v
3496   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
3497   |  /  |
3498   | /   v
3499   ||   [ ]     <-- vector pre header.
3500   |/    |
3501   |     v
3502   |    [  ] \
3503   |    [  ]_|   <-- vector loop.
3504   |     |
3505   |     v
3506   |   -[ ]   <--- middle-block.
3507   |  /  |
3508   | /   v
3509   -|- >[ ]     <--- new preheader.
3510    |    |
3511    |    v
3512    |   [ ] \
3513    |   [ ]_|   <-- old scalar loop to handle remainder.
3514     \   |
3515      \  v
3516       >[ ]     <-- exit block.
3517    ...
3518    */
3519 
3520   // Get the metadata of the original loop before it gets modified.
3521   MDNode *OrigLoopID = OrigLoop->getLoopID();
3522 
3523   // Create an empty vector loop, and prepare basic blocks for the runtime
3524   // checks.
3525   Loop *Lp = createVectorLoopSkeleton("");
3526 
3527   // Now, compare the new count to zero. If it is zero skip the vector loop and
3528   // jump to the scalar loop. This check also covers the case where the
3529   // backedge-taken count is uint##_max: adding one to it will overflow leading
3530   // to an incorrect trip count of zero. In this (rare) case we will also jump
3531   // to the scalar loop.
3532   emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
3533 
3534   // Generate the code to check any assumptions that we've made for SCEV
3535   // expressions.
3536   emitSCEVChecks(Lp, LoopScalarPreHeader);
3537 
  // Generate the code that checks at runtime whether arrays overlap. We put
  // the checks into a separate block to make the more common case of few
  // elements faster.
3541   emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
3542 
3543   // Some loops have a single integer induction variable, while other loops
  // don't. One example is C++ iterators, which often have multiple pointer
  // induction variables. In the code below we also support a case where we
3546   // don't have a single induction variable.
3547   //
3548   // We try to obtain an induction variable from the original loop as hard
3549   // as possible. However if we don't find one that:
3550   //   - is an integer
3551   //   - counts from zero, stepping by one
3552   //   - is the size of the widest induction variable type
3553   // then we create a new one.
3554   OldInduction = Legal->getPrimaryInduction();
3555   Type *IdxTy = Legal->getWidestInductionType();
3556   Value *StartIdx = ConstantInt::get(IdxTy, 0);
3557   // The loop step is equal to the vectorization factor (num of SIMD elements)
3558   // times the unroll factor (num of SIMD instructions).
3559   Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt());
3560   Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF);
3561   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
3562   Induction =
3563       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
3564                               getDebugLocFromInstOrOperands(OldInduction));
3565 
3566   // Emit phis for the new starting index of the scalar loop.
3567   createInductionResumeValues(Lp, CountRoundDown);
3568 
3569   return completeLoopSkeleton(Lp, OrigLoopID);
3570 }
3571 
3572 // Fix up external users of the induction variable. At this point, we are
3573 // in LCSSA form, with all external PHIs that use the IV having one input value,
3574 // coming from the remainder loop. We need those PHIs to also have a correct
3575 // value for the IV when arriving directly from the middle block.
3576 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3577                                        const InductionDescriptor &II,
3578                                        Value *CountRoundDown, Value *EndValue,
3579                                        BasicBlock *MiddleBlock) {
  // There are two kinds of external IV usages - those that use the value
  // computed in the last iteration (the PHI) and those that use the
  // penultimate value (the value that feeds into the phi from the loop
  // latch). We allow both, but they obviously have different values.
3584 
3585   assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
3586 
3587   DenseMap<Value *, Value *> MissingVals;
3588 
3589   // An external user of the last iteration's value should see the value that
3590   // the remainder loop uses to initialize its own IV.
3591   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3592   for (User *U : PostInc->users()) {
3593     Instruction *UI = cast<Instruction>(U);
3594     if (!OrigLoop->contains(UI)) {
3595       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3596       MissingVals[UI] = EndValue;
3597     }
3598   }
3599 
  // An external user of the penultimate value needs to see EndValue - Step.
  // The simplest way to get this is to recompute it from the constituent
  // SCEVs, that is Start + (Step * (CRD - 1)).
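  // For example, with Start = 0, Step = 1 and a vector trip count CRD = 8,
  // the last value of the IV is 8 (EndValue), while an external user of the
  // penultimate value must see 0 + 1 * (8 - 1) = 7.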
3603   for (User *U : OrigPhi->users()) {
3604     auto *UI = cast<Instruction>(U);
3605     if (!OrigLoop->contains(UI)) {
3606       const DataLayout &DL =
3607           OrigLoop->getHeader()->getModule()->getDataLayout();
3608       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3609 
3610       IRBuilder<> B(MiddleBlock->getTerminator());
3611       Value *CountMinusOne = B.CreateSub(
3612           CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3613       Value *CMO =
3614           !II.getStep()->getType()->isIntegerTy()
3615               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3616                              II.getStep()->getType())
3617               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3618       CMO->setName("cast.cmo");
3619       Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3620       Escape->setName("ind.escape");
3621       MissingVals[UI] = Escape;
3622     }
3623   }
3624 
3625   for (auto &I : MissingVals) {
3626     PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each other,
    // that is %IV2 = phi [...], [ %IV1, %latch ].
3629     // In this case, if IV1 has an external use, we need to avoid adding both
3630     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3631     // don't already have an incoming value for the middle block.
3632     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3633       PHI->addIncoming(I.second, MiddleBlock);
3634   }
3635 }
3636 
3637 namespace {
3638 
3639 struct CSEDenseMapInfo {
3640   static bool canHandle(const Instruction *I) {
3641     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3642            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3643   }
3644 
3645   static inline Instruction *getEmptyKey() {
3646     return DenseMapInfo<Instruction *>::getEmptyKey();
3647   }
3648 
3649   static inline Instruction *getTombstoneKey() {
3650     return DenseMapInfo<Instruction *>::getTombstoneKey();
3651   }
3652 
3653   static unsigned getHashValue(const Instruction *I) {
3654     assert(canHandle(I) && "Unknown instruction!");
3655     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3656                                                            I->value_op_end()));
3657   }
3658 
3659   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3660     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3661         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3662       return LHS == RHS;
3663     return LHS->isIdenticalTo(RHS);
3664   }
3665 };
3666 
3667 } // end anonymous namespace
3668 
/// Perform CSE of induction variable instructions.
3670 static void cse(BasicBlock *BB) {
  // Perform simple CSE.
3672   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3673   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3674     Instruction *In = &*I++;
3675 
3676     if (!CSEDenseMapInfo::canHandle(In))
3677       continue;
3678 
3679     // Check if we can replace this instruction with any of the
3680     // visited instructions.
3681     if (Instruction *V = CSEMap.lookup(In)) {
3682       In->replaceAllUsesWith(V);
3683       In->eraseFromParent();
3684       continue;
3685     }
3686 
3687     CSEMap[In] = In;
3688   }
3689 }
3690 
3691 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
3692                                                        ElementCount VF,
3693                                                        bool &NeedToScalarize) {
3694   assert(!VF.isScalable() && "scalable vectors not yet supported.");
3695   Function *F = CI->getCalledFunction();
3696   Type *ScalarRetTy = CI->getType();
3697   SmallVector<Type *, 4> Tys, ScalarTys;
3698   for (auto &ArgOp : CI->arg_operands())
3699     ScalarTys.push_back(ArgOp->getType());
3700 
  // Estimate the cost of a scalarized vector call. The source operands are
  // assumed to be vectors, so we need to extract individual elements from
  // them, execute VF scalar calls, and then gather the results into the
  // vector return value.
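  // For example, with VF = 4 this estimate is roughly 4 * ScalarCallCost plus
  // the scalarization overhead of extracting the four lanes of each vector
  // argument and inserting the four scalar results into the vector return
  // value.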
3705   unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys,
3706                                                  TTI::TCK_RecipThroughput);
3707   if (VF.isScalar())
3708     return ScalarCallCost;
3709 
3710   // Compute corresponding vector type for return value and arguments.
3711   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3712   for (Type *ScalarTy : ScalarTys)
3713     Tys.push_back(ToVectorTy(ScalarTy, VF));
3714 
3715   // Compute costs of unpacking argument values for the scalar calls and
3716   // packing the return values to a vector.
3717   unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
3718 
3719   unsigned Cost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
3720 
3721   // If we can't emit a vector call for this function, then the currently found
3722   // cost is the cost we need to return.
3723   NeedToScalarize = true;
3724   VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
3725   Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3726 
3727   if (!TLI || CI->isNoBuiltin() || !VecFunc)
3728     return Cost;
3729 
3730   // If the corresponding vector cost is cheaper, return its cost.
3731   unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys,
3732                                                  TTI::TCK_RecipThroughput);
3733   if (VectorCallCost < Cost) {
3734     NeedToScalarize = false;
3735     return VectorCallCost;
3736   }
3737   return Cost;
3738 }
3739 
3740 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3741                                                             ElementCount VF) {
3742   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3743   assert(ID && "Expected intrinsic call!");
3744 
3745   IntrinsicCostAttributes CostAttrs(ID, *CI, VF);
3746   return TTI.getIntrinsicInstrCost(CostAttrs,
3747                                    TargetTransformInfo::TCK_RecipThroughput);
3748 }
3749 
3750 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3751   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3752   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3753   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3754 }
3755 
3756 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3757   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3758   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3759   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3760 }
3761 
3762 void InnerLoopVectorizer::truncateToMinimalBitwidths() {
3763   // For every instruction `I` in MinBWs, truncate the operands, create a
3764   // truncated version of `I` and reextend its result. InstCombine runs
3765   // later and will remove any ext/trunc pairs.
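  //
  // For example, if MinBWs records that `%a = add i32 %x, %y` only needs 8
  // bits, each vectorized part is rewritten roughly as
  //   %t = add <VF x i8> (trunc of %x), (trunc of %y)
  //   %r = zext <VF x i8> %t to <VF x i32>
  // and the resulting trunc/ext pairs are expected to be folded away later.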
3766   SmallPtrSet<Value *, 4> Erased;
3767   for (const auto &KV : Cost->getMinimalBitwidths()) {
3768     // If the value wasn't vectorized, we must maintain the original scalar
3769     // type. The absence of the value from VectorLoopValueMap indicates that it
3770     // wasn't vectorized.
3771     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3772       continue;
3773     for (unsigned Part = 0; Part < UF; ++Part) {
3774       Value *I = getOrCreateVectorValue(KV.first, Part);
3775       if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3776         continue;
3777       Type *OriginalTy = I->getType();
3778       Type *ScalarTruncatedTy =
3779           IntegerType::get(OriginalTy->getContext(), KV.second);
3780       auto *TruncatedTy = FixedVectorType::get(
3781           ScalarTruncatedTy,
3782           cast<FixedVectorType>(OriginalTy)->getNumElements());
3783       if (TruncatedTy == OriginalTy)
3784         continue;
3785 
3786       IRBuilder<> B(cast<Instruction>(I));
3787       auto ShrinkOperand = [&](Value *V) -> Value * {
3788         if (auto *ZI = dyn_cast<ZExtInst>(V))
3789           if (ZI->getSrcTy() == TruncatedTy)
3790             return ZI->getOperand(0);
3791         return B.CreateZExtOrTrunc(V, TruncatedTy);
3792       };
3793 
3794       // The actual instruction modification depends on the instruction type,
3795       // unfortunately.
3796       Value *NewI = nullptr;
3797       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3798         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3799                              ShrinkOperand(BO->getOperand(1)));
3800 
3801         // Any wrapping introduced by shrinking this operation shouldn't be
3802         // considered undefined behavior. So, we can't unconditionally copy
3803         // arithmetic wrapping flags to NewI.
3804         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3805       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3806         NewI =
3807             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3808                          ShrinkOperand(CI->getOperand(1)));
3809       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3810         NewI = B.CreateSelect(SI->getCondition(),
3811                               ShrinkOperand(SI->getTrueValue()),
3812                               ShrinkOperand(SI->getFalseValue()));
3813       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3814         switch (CI->getOpcode()) {
3815         default:
3816           llvm_unreachable("Unhandled cast!");
3817         case Instruction::Trunc:
3818           NewI = ShrinkOperand(CI->getOperand(0));
3819           break;
3820         case Instruction::SExt:
3821           NewI = B.CreateSExtOrTrunc(
3822               CI->getOperand(0),
3823               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3824           break;
3825         case Instruction::ZExt:
3826           NewI = B.CreateZExtOrTrunc(
3827               CI->getOperand(0),
3828               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3829           break;
3830         }
3831       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3832         auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType())
3833                              ->getNumElements();
3834         auto *O0 = B.CreateZExtOrTrunc(
3835             SI->getOperand(0),
3836             FixedVectorType::get(ScalarTruncatedTy, Elements0));
3837         auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType())
3838                              ->getNumElements();
3839         auto *O1 = B.CreateZExtOrTrunc(
3840             SI->getOperand(1),
3841             FixedVectorType::get(ScalarTruncatedTy, Elements1));
3842 
3843         NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
3844       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3845         // Don't do anything with the operands, just extend the result.
3846         continue;
3847       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3848         auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType())
3849                             ->getNumElements();
3850         auto *O0 = B.CreateZExtOrTrunc(
3851             IE->getOperand(0),
3852             FixedVectorType::get(ScalarTruncatedTy, Elements));
3853         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3854         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3855       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3856         auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType())
3857                             ->getNumElements();
3858         auto *O0 = B.CreateZExtOrTrunc(
3859             EE->getOperand(0),
3860             FixedVectorType::get(ScalarTruncatedTy, Elements));
3861         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3862       } else {
3863         // If we don't know what to do, be conservative and don't do anything.
3864         continue;
3865       }
3866 
3867       // Lastly, extend the result.
3868       NewI->takeName(cast<Instruction>(I));
3869       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3870       I->replaceAllUsesWith(Res);
3871       cast<Instruction>(I)->eraseFromParent();
3872       Erased.insert(I);
3873       VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3874     }
3875   }
3876 
  // We'll have created a bunch of ZExts that are now unused. Clean them up.
3878   for (const auto &KV : Cost->getMinimalBitwidths()) {
3879     // If the value wasn't vectorized, we must maintain the original scalar
3880     // type. The absence of the value from VectorLoopValueMap indicates that it
3881     // wasn't vectorized.
3882     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3883       continue;
3884     for (unsigned Part = 0; Part < UF; ++Part) {
3885       Value *I = getOrCreateVectorValue(KV.first, Part);
3886       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3887       if (Inst && Inst->use_empty()) {
3888         Value *NewI = Inst->getOperand(0);
3889         Inst->eraseFromParent();
3890         VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3891       }
3892     }
3893   }
3894 }
3895 
3896 void InnerLoopVectorizer::fixVectorizedLoop() {
3897   // Insert truncates and extends for any truncated instructions as hints to
3898   // InstCombine.
3899   if (VF.isVector())
3900     truncateToMinimalBitwidths();
3901 
3902   // Fix widened non-induction PHIs by setting up the PHI operands.
3903   if (OrigPHIsToFix.size()) {
3904     assert(EnableVPlanNativePath &&
3905            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3906     fixNonInductionPHIs();
3907   }
3908 
3909   // At this point every instruction in the original loop is widened to a
3910   // vector form. Now we need to fix the recurrences in the loop. These PHI
3911   // nodes are currently empty because we did not want to introduce cycles.
3912   // This is the second stage of vectorizing recurrences.
3913   fixCrossIterationPHIs();
3914 
3915   // Forget the original basic block.
3916   PSE.getSE()->forgetLoop(OrigLoop);
3917 
3918   // Fix-up external users of the induction variables.
3919   for (auto &Entry : Legal->getInductionVars())
3920     fixupIVUsers(Entry.first, Entry.second,
3921                  getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3922                  IVEndValues[Entry.first], LoopMiddleBlock);
3923 
3924   fixLCSSAPHIs();
3925   for (Instruction *PI : PredicatedInstructions)
3926     sinkScalarOperands(&*PI);
3927 
3928   // Remove redundant induction instructions.
3929   cse(LoopVectorBody);
3930 
3931   // Set/update profile weights for the vector and remainder loops as original
3932   // loop iterations are now distributed among them. Note that original loop
3933   // represented by LoopScalarBody becomes remainder loop after vectorization.
3934   //
  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
  // end up with a slightly less accurate result, but that should be OK since
  // the profile is not inherently precise anyway. Note also that a possible
  // bypass of the vector code caused by legality checks is ignored,
  // optimistically assigning all the weight to the vector loop.
3940   //
  // For scalable vectorization we can't know at compile time how many
  // iterations of the loop are handled in one vector iteration, so instead
  // assume a pessimistic vscale of '1'.
3944   setProfileInfoAfterUnrolling(
3945       LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
3946       LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
3947 }
3948 
3949 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3950   // In order to support recurrences we need to be able to vectorize Phi nodes.
3951   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3952   // stage #2: We now need to fix the recurrences by adding incoming edges to
3953   // the currently empty PHI nodes. At this point every instruction in the
3954   // original loop is widened to a vector form so we can use them to construct
3955   // the incoming edges.
3956   for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3957     // Handle first-order recurrences and reductions that need to be fixed.
3958     if (Legal->isFirstOrderRecurrence(&Phi))
3959       fixFirstOrderRecurrence(&Phi);
3960     else if (Legal->isReductionVariable(&Phi))
3961       fixReduction(&Phi);
3962   }
3963 }
3964 
3965 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3966   // This is the second phase of vectorizing first-order recurrences. An
3967   // overview of the transformation is described below. Suppose we have the
3968   // following loop.
3969   //
3970   //   for (int i = 0; i < n; ++i)
3971   //     b[i] = a[i] - a[i - 1];
3972   //
3973   // There is a first-order recurrence on "a". For this loop, the shorthand
3974   // scalar IR looks like:
3975   //
3976   //   scalar.ph:
3977   //     s_init = a[-1]
3978   //     br scalar.body
3979   //
3980   //   scalar.body:
3981   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3982   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3983   //     s2 = a[i]
3984   //     b[i] = s2 - s1
3985   //     br cond, scalar.body, ...
3986   //
  // In this example, s1 is a recurrence because its value depends on the
3988   // previous iteration. In the first phase of vectorization, we created a
3989   // temporary value for s1. We now complete the vectorization and produce the
3990   // shorthand vector IR shown below (for VF = 4, UF = 1).
3991   //
3992   //   vector.ph:
3993   //     v_init = vector(..., ..., ..., a[-1])
3994   //     br vector.body
3995   //
3996   //   vector.body
3997   //     i = phi [0, vector.ph], [i+4, vector.body]
3998   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3999   //     v2 = a[i, i+1, i+2, i+3];
4000   //     v3 = vector(v1(3), v2(0, 1, 2))
4001   //     b[i, i+1, i+2, i+3] = v2 - v3
4002   //     br cond, vector.body, middle.block
4003   //
4004   //   middle.block:
4005   //     x = v2(3)
4006   //     br scalar.ph
4007   //
4008   //   scalar.ph:
4009   //     s_init = phi [x, middle.block], [a[-1], otherwise]
4010   //     br scalar.body
4011   //
  // After the vector loop completes execution, we extract the next value of
  // the recurrence (x) to use as the initial value in the scalar loop.
4014 
4015   // Get the original loop preheader and single loop latch.
4016   auto *Preheader = OrigLoop->getLoopPreheader();
4017   auto *Latch = OrigLoop->getLoopLatch();
4018 
4019   // Get the initial and previous values of the scalar recurrence.
4020   auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
4021   auto *Previous = Phi->getIncomingValueForBlock(Latch);
4022 
4023   // Create a vector from the initial value.
4024   auto *VectorInit = ScalarInit;
4025   if (VF.isVector()) {
4026     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4027     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
4028     VectorInit = Builder.CreateInsertElement(
4029         UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
4030         Builder.getInt32(VF.getKnownMinValue() - 1), "vector.recur.init");
4031   }
4032 
4033   // We constructed a temporary phi node in the first phase of vectorization.
4034   // This phi node will eventually be deleted.
4035   Builder.SetInsertPoint(
4036       cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
4037 
  // Create a phi node for the new recurrence. The current value will either
  // be the initial value inserted into a vector, or the loop-varying vector
  // value.
4040   auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
4041   VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
4042 
4043   // Get the vectorized previous value of the last part UF - 1. It appears last
4044   // among all unrolled iterations, due to the order of their construction.
4045   Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
4046 
4047   // Find and set the insertion point after the previous value if it is an
4048   // instruction.
4049   BasicBlock::iterator InsertPt;
4050   // Note that the previous value may have been constant-folded so it is not
4051   // guaranteed to be an instruction in the vector loop.
4052   // FIXME: Loop invariant values do not form recurrences. We should deal with
4053   //        them earlier.
4054   if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart))
4055     InsertPt = LoopVectorBody->getFirstInsertionPt();
4056   else {
4057     Instruction *PreviousInst = cast<Instruction>(PreviousLastPart);
4058     if (isa<PHINode>(PreviousLastPart))
      // If the previous value is a phi node, we should insert after all the
      // phi nodes in the block containing the PHI to avoid breaking basic
      // block verification. Note that the basic block may be different from
      // LoopVectorBody when we predicate the loop.
4063       InsertPt = PreviousInst->getParent()->getFirstInsertionPt();
4064     else
4065       InsertPt = ++PreviousInst->getIterator();
4066   }
4067   Builder.SetInsertPoint(&*InsertPt);
4068 
4069   // We will construct a vector for the recurrence by combining the values for
4070   // the current and previous iterations. This is the required shuffle mask.
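  // For example, for VF = 4 the mask is <3, 4, 5, 6>: element 3 comes from
  // the recurrence phi (the value carried over from the previous vector
  // iteration) and elements 4-6 select the first three lanes of the current
  // value.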
4071   assert(!VF.isScalable());
4072   SmallVector<int, 8> ShuffleMask(VF.getKnownMinValue());
4073   ShuffleMask[0] = VF.getKnownMinValue() - 1;
4074   for (unsigned I = 1; I < VF.getKnownMinValue(); ++I)
4075     ShuffleMask[I] = I + VF.getKnownMinValue() - 1;
4076 
4077   // The vector from which to take the initial value for the current iteration
4078   // (actual or unrolled). Initially, this is the vector phi node.
4079   Value *Incoming = VecPhi;
4080 
4081   // Shuffle the current and previous vector and update the vector parts.
4082   for (unsigned Part = 0; Part < UF; ++Part) {
4083     Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
4084     Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
4085     auto *Shuffle =
4086         VF.isVector()
4087             ? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask)
4088             : Incoming;
4089     PhiPart->replaceAllUsesWith(Shuffle);
4090     cast<Instruction>(PhiPart)->eraseFromParent();
4091     VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
4092     Incoming = PreviousPart;
4093   }
4094 
4095   // Fix the latch value of the new recurrence in the vector loop.
4096   VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4097 
4098   // Extract the last vector element in the middle block. This will be the
4099   // initial value for the recurrence when jumping to the scalar loop.
4100   auto *ExtractForScalar = Incoming;
4101   if (VF.isVector()) {
4102     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4103     ExtractForScalar = Builder.CreateExtractElement(
4104         ExtractForScalar, Builder.getInt32(VF.getKnownMinValue() - 1),
4105         "vector.recur.extract");
4106   }
  // Extract the second-to-last element in the middle block if the Phi is used
  // outside the loop. We need to extract the phi itself and not the last
  // element (the phi update in the current iteration). This will be the value
  // used when jumping to the exit block from LoopMiddleBlock, when the scalar
  // loop is not run at all.
4112   Value *ExtractForPhiUsedOutsideLoop = nullptr;
4113   if (VF.isVector())
4114     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
4115         Incoming, Builder.getInt32(VF.getKnownMinValue() - 2),
4116         "vector.recur.extract.for.phi");
  // When the loop is unrolled without vectorizing, initialize
  // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled
  // value of `Incoming`. This is analogous to the vectorized case above:
  // extracting the second-to-last element when VF > 1.
4121   else if (UF > 1)
4122     ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
4123 
4124   // Fix the initial value of the original recurrence in the scalar loop.
4125   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
4126   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
4127   for (auto *BB : predecessors(LoopScalarPreHeader)) {
4128     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
4129     Start->addIncoming(Incoming, BB);
4130   }
4131 
4132   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
4133   Phi->setName("scalar.recur");
4134 
4135   // Finally, fix users of the recurrence outside the loop. The users will need
4136   // either the last value of the scalar recurrence or the last value of the
4137   // vector recurrence we extracted in the middle block. Since the loop is in
4138   // LCSSA form, we just need to find all the phi nodes for the original scalar
4139   // recurrence in the exit block, and then add an edge for the middle block.
4140   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4141     if (LCSSAPhi.getIncomingValue(0) == Phi) {
4142       LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
4143     }
4144   }
4145 }
4146 
4147 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
4148   Constant *Zero = Builder.getInt32(0);
4149 
  // Get its reduction variable descriptor.
4151   assert(Legal->isReductionVariable(Phi) &&
4152          "Unable to find the reduction variable");
4153   RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
4154 
4155   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
4156   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
4157   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
4158   RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
4159     RdxDesc.getMinMaxRecurrenceKind();
4160   setDebugLocFromInst(Builder, ReductionStartValue);
4161   bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi);
4162 
4163   // We need to generate a reduction vector from the incoming scalar.
4164   // To do so, we need to generate the 'identity' vector and override
4165   // one of the elements with the incoming scalar reduction. We need
4166   // to do it in the vector-loop preheader.
4167   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4168 
4169   // This is the vector-clone of the value that leaves the loop.
4170   Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
4171 
  // Find the reduction identity value: zero for addition, or and xor; one for
  // multiplication; -1 for and.
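  // For example, for an i32 add reduction with VF = 4 and start value %s,
  // VectorStart is <%s, 0, 0, 0> and Identity is the splat <0, 0, 0, 0>.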
4174   Value *Identity;
4175   Value *VectorStart;
4176   if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
4177       RK == RecurrenceDescriptor::RK_FloatMinMax) {
    // MinMax reductions have the start value as their identity.
4179     if (VF.isScalar() || IsInLoopReductionPhi) {
4180       VectorStart = Identity = ReductionStartValue;
4181     } else {
4182       VectorStart = Identity =
4183         Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
4184     }
4185   } else {
4186     // Handle other reduction kinds:
4187     Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
4188         RK, MinMaxKind, VecTy->getScalarType());
4189     if (VF.isScalar() || IsInLoopReductionPhi) {
4190       Identity = Iden;
4191       // This vector is the Identity vector where the first element is the
4192       // incoming scalar reduction.
4193       VectorStart = ReductionStartValue;
4194     } else {
4195       Identity = ConstantVector::getSplat(VF, Iden);
4196 
4197       // This vector is the Identity vector where the first element is the
4198       // incoming scalar reduction.
4199       VectorStart =
4200         Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
4201     }
4202   }
4203 
  // Wrap flags are in general invalid after vectorization; clear them.
4205   clearReductionWrapFlags(RdxDesc);
4206 
4207   // Fix the vector-loop phi.
4208 
4209   // Reductions do not have to start at zero. They can start with
4210   // any loop invariant values.
4211   BasicBlock *Latch = OrigLoop->getLoopLatch();
4212   Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
4213 
4214   for (unsigned Part = 0; Part < UF; ++Part) {
4215     Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
4216     Value *Val = getOrCreateVectorValue(LoopVal, Part);
4217     // Make sure to add the reduction start value only to the
4218     // first unroll part.
4219     Value *StartVal = (Part == 0) ? VectorStart : Identity;
4220     cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
4221     cast<PHINode>(VecRdxPhi)
4222       ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4223   }
4224 
4225   // Before each round, move the insertion point right between
4226   // the PHIs and the values we are going to write.
4227   // This allows us to write both PHINodes and the extractelement
4228   // instructions.
4229   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4230 
4231   setDebugLocFromInst(Builder, LoopExitInst);
4232 
4233   // If tail is folded by masking, the vector value to leave the loop should be
4234   // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
4235   // instead of the former. For an inloop reduction the reduction will already
4236   // be predicated, and does not need to be handled here.
4237   if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) {
4238     for (unsigned Part = 0; Part < UF; ++Part) {
4239       Value *VecLoopExitInst =
4240           VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
4241       Value *Sel = nullptr;
4242       for (User *U : VecLoopExitInst->users()) {
4243         if (isa<SelectInst>(U)) {
4244           assert(!Sel && "Reduction exit feeding two selects");
4245           Sel = U;
4246         } else
4247           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
4248       }
4249       assert(Sel && "Reduction exit feeds no select");
4250       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
4251 
4252       // If the target can create a predicated operator for the reduction at no
4253       // extra cost in the loop (for example a predicated vadd), it can be
4254       // cheaper for the select to remain in the loop than be sunk out of it,
4255       // and so use the select value for the phi instead of the old
4256       // LoopExitValue.
4257       RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
4258       if (PreferPredicatedReductionSelect ||
4259           TTI->preferPredicatedReductionSelect(
4260               RdxDesc.getRecurrenceBinOp(), Phi->getType(),
4261               TargetTransformInfo::ReductionFlags())) {
4262         auto *VecRdxPhi = cast<PHINode>(getOrCreateVectorValue(Phi, Part));
4263         VecRdxPhi->setIncomingValueForBlock(
4264             LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel);
4265       }
4266     }
4267   }
4268 
4269   // If the vector reduction can be performed in a smaller type, we truncate
4270   // then extend the loop exit value to enable InstCombine to evaluate the
4271   // entire expression in the smaller type.
4272   if (VF.isVector() && Phi->getType() != RdxDesc.getRecurrenceType()) {
4273     assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!");
4274     assert(!VF.isScalable() && "scalable vectors not yet supported.");
4275     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
4276     Builder.SetInsertPoint(
4277         LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
4278     VectorParts RdxParts(UF);
4279     for (unsigned Part = 0; Part < UF; ++Part) {
4280       RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
4281       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4282       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
4283                                         : Builder.CreateZExt(Trunc, VecTy);
4284       for (Value::user_iterator UI = RdxParts[Part]->user_begin();
4285            UI != RdxParts[Part]->user_end();)
4286         if (*UI != Trunc) {
4287           (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
4288           RdxParts[Part] = Extnd;
4289         } else {
4290           ++UI;
4291         }
4292     }
4293     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4294     for (unsigned Part = 0; Part < UF; ++Part) {
4295       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4296       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
4297     }
4298   }
4299 
4300   // Reduce all of the unrolled parts into a single vector.
4301   Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
4302   unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
4303 
4304   // The middle block terminator has already been assigned a DebugLoc here (the
4305   // OrigLoop's single latch terminator). We want the whole middle block to
4306   // appear to execute on this line because: (a) it is all compiler generated,
4307   // (b) these instructions are always executed after evaluating the latch
4308   // conditional branch, and (c) other passes may add new predecessors which
4309   // terminate on this line. This is the easiest way to ensure we don't
4310   // accidentally cause an extra step back into the loop while debugging.
4311   setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
4312   for (unsigned Part = 1; Part < UF; ++Part) {
4313     Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
4314     if (Op != Instruction::ICmp && Op != Instruction::FCmp)
4315       // Floating point operations had to be 'fast' to enable the reduction.
4316       ReducedPartRdx = addFastMathFlag(
4317           Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
4318                               ReducedPartRdx, "bin.rdx"),
4319           RdxDesc.getFastMathFlags());
4320     else
4321       ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
4322                                       RdxPart);
4323   }
4324 
  // Create the reduction after the loop. Note that in-loop reductions create
  // the target reduction in the loop using a Reduction recipe.
4327   if (VF.isVector() && !IsInLoopReductionPhi) {
4328     bool NoNaN = Legal->hasFunNoNaNAttr();
4329     ReducedPartRdx =
4330         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
4331     // If the reduction can be performed in a smaller type, we need to extend
4332     // the reduction to the wider type before we branch to the original loop.
4333     if (Phi->getType() != RdxDesc.getRecurrenceType())
4334       ReducedPartRdx =
4335         RdxDesc.isSigned()
4336         ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
4337         : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
4338   }
4339 
4340   // Create a phi node that merges control-flow from the backedge-taken check
4341   // block and the middle block.
4342   PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
4343                                         LoopScalarPreHeader->getTerminator());
4344   for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
4345     BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
4346   BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
4347 
4348   // Now, we need to fix the users of the reduction variable
4349   // inside and outside of the scalar remainder loop.
4350   // We know that the loop is in LCSSA form. We need to update the
4351   // PHI nodes in the exit blocks.
4352   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4353     // All PHINodes need to have a single entry edge, or two if
4354     // we already fixed them.
4355     assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
4356 
4357     // We found a reduction value exit-PHI. Update it with the
4358     // incoming bypass edge.
4359     if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
4360       LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4361   } // end of the LCSSA phi scan.
4362 
  // Fix the scalar loop reduction variable with the incoming reduction sum
  // from the vector body and from the backedge value.
4365   int IncomingEdgeBlockIdx =
4366     Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4367   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4368   // Pick the other block.
4369   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4370   Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4371   Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4372 }
4373 
4374 void InnerLoopVectorizer::clearReductionWrapFlags(
4375     RecurrenceDescriptor &RdxDesc) {
4376   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
4377   if (RK != RecurrenceDescriptor::RK_IntegerAdd &&
4378       RK != RecurrenceDescriptor::RK_IntegerMult)
4379     return;
4380 
4381   Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
4382   assert(LoopExitInstr && "null loop exit instruction");
4383   SmallVector<Instruction *, 8> Worklist;
4384   SmallPtrSet<Instruction *, 8> Visited;
4385   Worklist.push_back(LoopExitInstr);
4386   Visited.insert(LoopExitInstr);
4387 
4388   while (!Worklist.empty()) {
4389     Instruction *Cur = Worklist.pop_back_val();
4390     if (isa<OverflowingBinaryOperator>(Cur))
4391       for (unsigned Part = 0; Part < UF; ++Part) {
4392         Value *V = getOrCreateVectorValue(Cur, Part);
4393         cast<Instruction>(V)->dropPoisonGeneratingFlags();
4394       }
4395 
4396     for (User *U : Cur->users()) {
4397       Instruction *UI = cast<Instruction>(U);
4398       if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
4399           Visited.insert(UI).second)
4400         Worklist.push_back(UI);
4401     }
4402   }
4403 }
4404 
4405 void InnerLoopVectorizer::fixLCSSAPHIs() {
4406   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4407     if (LCSSAPhi.getNumIncomingValues() == 1) {
4408       auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
4409       // Non-instruction incoming values will have only one value.
4410       unsigned LastLane = 0;
4411       if (isa<Instruction>(IncomingValue))
4412         LastLane = Cost->isUniformAfterVectorization(
4413                        cast<Instruction>(IncomingValue), VF)
4414                        ? 0
4415                        : VF.getKnownMinValue() - 1;
4416       assert((!VF.isScalable() || LastLane == 0) &&
4417              "scalable vectors dont support non-uniform scalars yet");
4418       // Can be a loop invariant incoming value or the last scalar value to be
4419       // extracted from the vectorized loop.
4420       Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4421       Value *lastIncomingValue =
4422           getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
4423       LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
4424     }
4425   }
4426 }
4427 
4428 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4429   // The basic block and loop containing the predicated instruction.
4430   auto *PredBB = PredInst->getParent();
4431   auto *VectorLoop = LI->getLoopFor(PredBB);
4432 
4433   // Initialize a worklist with the operands of the predicated instruction.
4434   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4435 
4436   // Holds instructions that we need to analyze again. An instruction may be
4437   // reanalyzed if we don't yet know if we can sink it or not.
4438   SmallVector<Instruction *, 8> InstsToReanalyze;
4439 
4440   // Returns true if a given use occurs in the predicated block. Phi nodes use
4441   // their operands in their corresponding predecessor blocks.
4442   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4443     auto *I = cast<Instruction>(U.getUser());
4444     BasicBlock *BB = I->getParent();
4445     if (auto *Phi = dyn_cast<PHINode>(I))
4446       BB = Phi->getIncomingBlock(
4447           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4448     return BB == PredBB;
4449   };
4450 
  // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, its
  // operands are then added to the worklist. The algorithm ends when a full
  // pass over the worklist fails to sink a single instruction.
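  // For example, if the predicated block contains `store %v, %p` and %v is a
  // scalarized add whose only use is that store, the add is sunk into the
  // block and its own operands are then reconsidered on the next pass.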
4455   bool Changed;
4456   do {
4457     // Add the instructions that need to be reanalyzed to the worklist, and
4458     // reset the changed indicator.
4459     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4460     InstsToReanalyze.clear();
4461     Changed = false;
4462 
4463     while (!Worklist.empty()) {
4464       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4465 
4466       // We can't sink an instruction if it is a phi node, is already in the
4467       // predicated block, is not in the loop, or may have side effects.
4468       if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
4469           !VectorLoop->contains(I) || I->mayHaveSideEffects())
4470         continue;
4471 
4472       // It's legal to sink the instruction if all its uses occur in the
4473       // predicated block. Otherwise, there's nothing to do yet, and we may
4474       // need to reanalyze the instruction.
4475       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4476         InstsToReanalyze.push_back(I);
4477         continue;
4478       }
4479 
      // Move the instruction to the beginning of the predicated block, and
      // add its operands to the worklist.
4482       I->moveBefore(&*PredBB->getFirstInsertionPt());
4483       Worklist.insert(I->op_begin(), I->op_end());
4484 
4485       // The sinking may have enabled other instructions to be sunk, so we will
4486       // need to iterate.
4487       Changed = true;
4488     }
4489   } while (Changed);
4490 }
4491 
4492 void InnerLoopVectorizer::fixNonInductionPHIs() {
4493   for (PHINode *OrigPhi : OrigPHIsToFix) {
4494     PHINode *NewPhi =
4495         cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
4496     unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
4497 
4498     SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
4499         predecessors(OrigPhi->getParent()));
4500     SmallVector<BasicBlock *, 2> VectorBBPredecessors(
4501         predecessors(NewPhi->getParent()));
4502     assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
4503            "Scalar and Vector BB should have the same number of predecessors");
4504 
4505     // The insertion point in Builder may be invalidated by the time we get
4506     // here. Force the Builder insertion point to something valid so that we do
4507     // not run into issues during insertion point restore in
4508     // getOrCreateVectorValue calls below.
4509     Builder.SetInsertPoint(NewPhi);
4510 
4511     // The predecessor order is preserved and we can rely on mapping between
4512     // scalar and vector block predecessors.
4513     for (unsigned i = 0; i < NumIncomingValues; ++i) {
4514       BasicBlock *NewPredBB = VectorBBPredecessors[i];
4515 
4516       // When looking up the new scalar/vector values to fix up, use incoming
4517       // values from original phi.
4518       Value *ScIncV =
4519           OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
4520 
      // The scalar incoming value may need a broadcast.
4522       Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
4523       NewPhi->addIncoming(NewIncV, NewPredBB);
4524     }
4525   }
4526 }
4527 
4528 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef,
4529                                    VPUser &Operands, unsigned UF,
4530                                    ElementCount VF, bool IsPtrLoopInvariant,
4531                                    SmallBitVector &IsIndexLoopInvariant,
4532                                    VPTransformState &State) {
4533   // Construct a vector GEP by widening the operands of the scalar GEP as
4534   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4535   // results in a vector of pointers when at least one operand of the GEP
4536   // is vector-typed. Thus, to keep the representation compact, we only use
4537   // vector-typed operands for loop-varying values.
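  //
  // For example, widening `getelementptr i32, i32* %base, i64 %i` with a
  // loop-varying %i produces a GEP with a vector index and therefore a
  // <VF x i32*> result, one address per lane.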
4538 
4539   if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
4540     // If we are vectorizing, but the GEP has only loop-invariant operands,
4541     // the GEP we build (by only using vector-typed operands for
4542     // loop-varying values) would be a scalar pointer. Thus, to ensure we
4543     // produce a vector of pointers, we need to either arbitrarily pick an
4544     // operand to broadcast, or broadcast a clone of the original GEP.
4545     // Here, we broadcast a clone of the original.
4546     //
4547     // TODO: If at some point we decide to scalarize instructions having
4548     //       loop-invariant operands, this special case will no longer be
4549     //       required. We would add the scalarization decision to
4550     //       collectLoopScalars() and teach getVectorValue() to broadcast
4551     //       the lane-zero scalar value.
4552     auto *Clone = Builder.Insert(GEP->clone());
4553     for (unsigned Part = 0; Part < UF; ++Part) {
4554       Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4555       State.set(VPDef, GEP, EntryPart, Part);
4556       addMetadata(EntryPart, GEP);
4557     }
4558   } else {
4559     // If the GEP has at least one loop-varying operand, we are sure to
4560     // produce a vector of pointers. But if we are only unrolling, we want
4561     // to produce a scalar GEP for each unroll part. Thus, the GEP we
4562     // produce with the code below will be scalar (if VF == 1) or vector
4563     // (otherwise). Note that for the unroll-only case, we still maintain
4564     // values in the vector mapping with initVector, as we do for other
4565     // instructions.
4566     for (unsigned Part = 0; Part < UF; ++Part) {
4567       // The pointer operand of the new GEP. If it's loop-invariant, we
4568       // won't broadcast it.
4569       auto *Ptr = IsPtrLoopInvariant ? State.get(Operands.getOperand(0), {0, 0})
4570                                      : State.get(Operands.getOperand(0), Part);
4571 
4572       // Collect all the indices for the new GEP. If any index is
4573       // loop-invariant, we won't broadcast it.
4574       SmallVector<Value *, 4> Indices;
4575       for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) {
4576         VPValue *Operand = Operands.getOperand(I);
4577         if (IsIndexLoopInvariant[I - 1])
4578           Indices.push_back(State.get(Operand, {0, 0}));
4579         else
4580           Indices.push_back(State.get(Operand, Part));
4581       }
4582 
4583       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4584       // but it should be a vector, otherwise.
4585       auto *NewGEP =
4586           GEP->isInBounds()
4587               ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4588                                           Indices)
4589               : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4590       assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
4591              "NewGEP is not a pointer vector");
4592       State.set(VPDef, GEP, NewGEP, Part);
4593       addMetadata(NewGEP, GEP);
4594     }
4595   }
4596 }
4597 
4598 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
4599                                               ElementCount VF) {
4600   assert(!VF.isScalable() && "scalable vectors not yet supported.");
4601   PHINode *P = cast<PHINode>(PN);
4602   if (EnableVPlanNativePath) {
4603     // Currently we enter here in the VPlan-native path for non-induction
4604     // PHIs where all control flow is uniform. We simply widen these PHIs.
4605     // Create a vector phi with no operands - the vector phi operands will be
4606     // set at the end of vector code generation.
4607     Type *VecTy =
4608         (VF.isScalar()) ? PN->getType() : VectorType::get(PN->getType(), VF);
4609     Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4610     VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
4611     OrigPHIsToFix.push_back(P);
4612 
4613     return;
4614   }
4615 
4616   assert(PN->getParent() == OrigLoop->getHeader() &&
4617          "Non-header phis should have been handled elsewhere");
4618 
4619   // In order to support recurrences we need to be able to vectorize Phi nodes.
4620   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4621   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4622   // this value when we vectorize all of the instructions that use the PHI.
4623   if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
4624     for (unsigned Part = 0; Part < UF; ++Part) {
4625       // This is phase one of vectorizing PHIs.
4626       bool ScalarPHI =
4627           (VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN));
4628       Type *VecTy =
4629           ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), VF);
4630       Value *EntryPart = PHINode::Create(
4631           VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4632       VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
4633     }
4634     return;
4635   }
4636 
4637   setDebugLocFromInst(Builder, P);
4638 
4639   // This PHINode must be an induction variable.
4640   // Make sure that we know about it.
4641   assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4642 
4643   InductionDescriptor II = Legal->getInductionVars().lookup(P);
4644   const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4645 
4646   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4647   // which can be found from the original scalar operations.
4648   switch (II.getKind()) {
4649   case InductionDescriptor::IK_NoInduction:
4650     llvm_unreachable("Unknown induction");
4651   case InductionDescriptor::IK_IntInduction:
4652   case InductionDescriptor::IK_FpInduction:
4653     llvm_unreachable("Integer/fp induction is handled elsewhere.");
4654   case InductionDescriptor::IK_PtrInduction: {
4655     // Handle the pointer induction variable case.
4656     assert(P->getType()->isPointerTy() && "Unexpected type.");
4657 
4658     if (Cost->isScalarAfterVectorization(P, VF)) {
4659       // This is the normalized GEP that starts counting at zero.
4660       Value *PtrInd =
4661           Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType());
4662       // Determine the number of scalars we need to generate for each unroll
4663       // iteration. If the instruction is uniform, we only need to generate the
4664       // first lane. Otherwise, we generate all VF values.
4665       unsigned Lanes =
4666           Cost->isUniformAfterVectorization(P, VF) ? 1 : VF.getKnownMinValue();
4667       for (unsigned Part = 0; Part < UF; ++Part) {
4668         for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4669           Constant *Idx = ConstantInt::get(PtrInd->getType(),
4670                                            Lane + Part * VF.getKnownMinValue());
4671           Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4672           Value *SclrGep =
4673               emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
4674           SclrGep->setName("next.gep");
4675           VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
4676         }
4677       }
4678       return;
4679     }
4680     assert(isa<SCEVConstant>(II.getStep()) &&
4681            "Induction step not a SCEV constant!");
4682     Type *PhiType = II.getStep()->getType();
4683 
4684     // Build a pointer phi
4685     Value *ScalarStartValue = II.getStartValue();
4686     Type *ScStValueType = ScalarStartValue->getType();
4687     PHINode *NewPointerPhi =
4688         PHINode::Create(ScStValueType, 2, "pointer.phi", Induction);
4689     NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader);
4690 
    // A pointer induction, advanced by a GEP in the loop latch.
4692     BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
4693     Instruction *InductionLoc = LoopLatch->getTerminator();
4694     const SCEV *ScalarStep = II.getStep();
4695     SCEVExpander Exp(*PSE.getSE(), DL, "induction");
4696     Value *ScalarStepValue =
4697         Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
4698     Value *InductionGEP = GetElementPtrInst::Create(
4699         ScStValueType->getPointerElementType(), NewPointerPhi,
4700         Builder.CreateMul(
4701             ScalarStepValue,
4702             ConstantInt::get(PhiType, VF.getKnownMinValue() * UF)),
4703         "ptr.ind", InductionLoc);
4704     NewPointerPhi->addIncoming(InductionGEP, LoopLatch);
4705 
    // Create UF actual address GEPs that use the pointer phi as base and a
    // vectorized version of the step value (<step*0, ..., step*N>) as offset.
4709     for (unsigned Part = 0; Part < UF; ++Part) {
4710       SmallVector<Constant *, 8> Indices;
4711       // Create a vector of consecutive numbers from zero to VF.
4712       for (unsigned i = 0; i < VF.getKnownMinValue(); ++i)
4713         Indices.push_back(
4714             ConstantInt::get(PhiType, i + Part * VF.getKnownMinValue()));
4715       Constant *StartOffset = ConstantVector::get(Indices);
4716 
4717       Value *GEP = Builder.CreateGEP(
4718           ScStValueType->getPointerElementType(), NewPointerPhi,
4719           Builder.CreateMul(
4720               StartOffset,
4721               Builder.CreateVectorSplat(VF.getKnownMinValue(), ScalarStepValue),
4722               "vector.gep"));
4723       VectorLoopValueMap.setVectorValue(P, Part, GEP);
4724     }
4725   }
4726   }
4727 }
4728 
4729 /// A helper function for checking whether an integer division-related
4730 /// instruction may divide by zero (in which case it must be predicated if
4731 /// executed conditionally in the scalar code).
4732 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
/// Non-zero divisors that are not compile-time constants will not be
/// converted into multiplication, so we will still end up scalarizing
/// the division, but can do so without predication.
4736 static bool mayDivideByZero(Instruction &I) {
4737   assert((I.getOpcode() == Instruction::UDiv ||
4738           I.getOpcode() == Instruction::SDiv ||
4739           I.getOpcode() == Instruction::URem ||
4740           I.getOpcode() == Instruction::SRem) &&
4741          "Unexpected instruction");
4742   Value *Divisor = I.getOperand(1);
4743   auto *CInt = dyn_cast<ConstantInt>(Divisor);
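  // A divisor that is not a compile-time constant, or that is the constant
  // zero, may divide by zero.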
4744   return !CInt || CInt->isZero();
4745 }
4746 
4747 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def,
4748                                            VPUser &User,
4749                                            VPTransformState &State) {
4750   switch (I.getOpcode()) {
4751   case Instruction::Call:
4752   case Instruction::Br:
4753   case Instruction::PHI:
4754   case Instruction::GetElementPtr:
4755   case Instruction::Select:
4756     llvm_unreachable("This instruction is handled by a different recipe.");
4757   case Instruction::UDiv:
4758   case Instruction::SDiv:
4759   case Instruction::SRem:
4760   case Instruction::URem:
4761   case Instruction::Add:
4762   case Instruction::FAdd:
4763   case Instruction::Sub:
4764   case Instruction::FSub:
4765   case Instruction::FNeg:
4766   case Instruction::Mul:
4767   case Instruction::FMul:
4768   case Instruction::FDiv:
4769   case Instruction::FRem:
4770   case Instruction::Shl:
4771   case Instruction::LShr:
4772   case Instruction::AShr:
4773   case Instruction::And:
4774   case Instruction::Or:
4775   case Instruction::Xor: {
4776     // Just widen unops and binops.
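    // For each unroll part, collect the already-widened operands from the
    // VPlan state, emit one wide operation, and copy the IR flags (nsw/nuw,
    // exact, fast-math) of the original instruction onto it.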
4777     setDebugLocFromInst(Builder, &I);
4778 
4779     for (unsigned Part = 0; Part < UF; ++Part) {
4780       SmallVector<Value *, 2> Ops;
4781       for (VPValue *VPOp : User.operands())
4782         Ops.push_back(State.get(VPOp, Part));
4783 
4784       Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4785 
4786       if (auto *VecOp = dyn_cast<Instruction>(V))
4787         VecOp->copyIRFlags(&I);
4788 
4789       // Use this vector value for all users of the original instruction.
4790       State.set(Def, &I, V, Part);
4791       addMetadata(V, &I);
4792     }
4793 
4794     break;
4795   }
4796   case Instruction::ICmp:
4797   case Instruction::FCmp: {
4798     // Widen compares. Generate vector compares.
4799     bool FCmp = (I.getOpcode() == Instruction::FCmp);
4800     auto *Cmp = cast<CmpInst>(&I);
4801     setDebugLocFromInst(Builder, Cmp);
4802     for (unsigned Part = 0; Part < UF; ++Part) {
4803       Value *A = State.get(User.getOperand(0), Part);
4804       Value *B = State.get(User.getOperand(1), Part);
4805       Value *C = nullptr;
4806       if (FCmp) {
4807         // Propagate fast math flags.
4808         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4809         Builder.setFastMathFlags(Cmp->getFastMathFlags());
4810         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4811       } else {
4812         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4813       }
4814       State.set(Def, &I, C, Part);
4815       addMetadata(C, &I);
4816     }
4817 
4818     break;
4819   }
4820 
4821   case Instruction::ZExt:
4822   case Instruction::SExt:
4823   case Instruction::FPToUI:
4824   case Instruction::FPToSI:
4825   case Instruction::FPExt:
4826   case Instruction::PtrToInt:
4827   case Instruction::IntToPtr:
4828   case Instruction::SIToFP:
4829   case Instruction::UIToFP:
4830   case Instruction::Trunc:
4831   case Instruction::FPTrunc:
4832   case Instruction::BitCast: {
4833     auto *CI = cast<CastInst>(&I);
4834     setDebugLocFromInst(Builder, CI);
4835 
    // Vectorize casts.
4837     Type *DestTy =
4838         (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF);
4839 
4840     for (unsigned Part = 0; Part < UF; ++Part) {
4841       Value *A = State.get(User.getOperand(0), Part);
4842       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4843       State.set(Def, &I, Cast, Part);
4844       addMetadata(Cast, &I);
4845     }
4846     break;
4847   }
4848   default:
4849     // This instruction is not vectorized by simple widening.
4850     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4851     llvm_unreachable("Unhandled instruction!");
4852   } // end of switch.
4853 }
4854 
4855 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
4856                                                VPUser &ArgOperands,
4857                                                VPTransformState &State) {
4858   assert(!isa<DbgInfoIntrinsic>(I) &&
4859          "DbgInfoIntrinsic should have been dropped during VPlan construction");
4860   setDebugLocFromInst(Builder, &I);
4861 
4862   Module *M = I.getParent()->getParent()->getParent();
4863   auto *CI = cast<CallInst>(&I);
4864 
4865   SmallVector<Type *, 4> Tys;
4866   for (Value *ArgOperand : CI->arg_operands())
4867     Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
4868 
4869   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4870 
  // The flag indicates whether we widen the call as an intrinsic or as a
  // plain call, i.e. whether the vector intrinsic is cheaper than the
  // vectorized library call.
4874   bool NeedToScalarize = false;
4875   unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4876   bool UseVectorIntrinsic =
4877       ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
4878   assert((UseVectorIntrinsic || !NeedToScalarize) &&
4879          "Instruction should be scalarized elsewhere.");
4880 
4881   for (unsigned Part = 0; Part < UF; ++Part) {
4882     SmallVector<Value *, 4> Args;
4883     for (auto &I : enumerate(ArgOperands.operands())) {
4884       // Some intrinsics have a scalar argument - don't replace it with a
4885       // vector.
4886       Value *Arg;
4887       if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
4888         Arg = State.get(I.value(), Part);
4889       else
4890         Arg = State.get(I.value(), {0, 0});
4891       Args.push_back(Arg);
4892     }
4893 
4894     Function *VectorF;
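    // Pick the callee for the widened call: either the vector intrinsic
    // declaration or a vectorized library function found via the VFDatabase.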
4895     if (UseVectorIntrinsic) {
4896       // Use vector version of the intrinsic.
4897       Type *TysForDecl[] = {CI->getType()};
4898       if (VF.isVector()) {
4899         assert(!VF.isScalable() && "VF is assumed to be non scalable.");
4900         TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4901       }
4902       VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4903       assert(VectorF && "Can't retrieve vector intrinsic.");
4904     } else {
4905       // Use vector version of the function call.
4906       const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
4907 #ifndef NDEBUG
4908       assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
4909              "Can't create vector function.");
4910 #endif
      VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
    }
    SmallVector<OperandBundleDef, 1> OpBundles;
    CI->getOperandBundlesAsDefs(OpBundles);
    CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);

    if (isa<FPMathOperator>(V))
      V->copyFastMathFlags(CI);

    State.set(Def, &I, V, Part);
    addMetadata(V, &I);
4922   }
4923 }
4924 
4925 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef,
4926                                                  VPUser &Operands,
4927                                                  bool InvariantCond,
4928                                                  VPTransformState &State) {
4929   setDebugLocFromInst(Builder, &I);
4930 
  // The condition can be loop invariant but still defined inside the
4932   // loop. This means that we can't just use the original 'cond' value.
4933   // We have to take the 'vectorized' value and pick the first lane.
4934   // Instcombine will make this a no-op.
4935   auto *InvarCond =
4936       InvariantCond ? State.get(Operands.getOperand(0), {0, 0}) : nullptr;
4937 
4938   for (unsigned Part = 0; Part < UF; ++Part) {
4939     Value *Cond =
4940         InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part);
4941     Value *Op0 = State.get(Operands.getOperand(1), Part);
4942     Value *Op1 = State.get(Operands.getOperand(2), Part);
4943     Value *Sel = Builder.CreateSelect(Cond, Op0, Op1);
4944     State.set(VPDef, &I, Sel, Part);
4945     addMetadata(Sel, &I);
4946   }
4947 }
4948 
4949 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
4950   // We should not collect Scalars more than once per VF. Right now, this
4951   // function is called from collectUniformsAndScalars(), which already does
4952   // this check. Collecting Scalars for VF=1 does not make any sense.
4953   assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
4954          "This function should not be visited twice for the same VF");
4955 
4956   SmallSetVector<Instruction *, 8> Worklist;
4957 
4958   // These sets are used to seed the analysis with pointers used by memory
4959   // accesses that will remain scalar.
4960   SmallSetVector<Instruction *, 8> ScalarPtrs;
4961   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4962   auto *Latch = TheLoop->getLoopLatch();
4963 
4964   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4965   // The pointer operands of loads and stores will be scalar as long as the
4966   // memory access is not a gather or scatter operation. The value operand of a
4967   // store will remain scalar if the store is scalarized.
4968   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4969     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4970     assert(WideningDecision != CM_Unknown &&
4971            "Widening decision should be ready at this moment");
4972     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4973       if (Ptr == Store->getValueOperand())
4974         return WideningDecision == CM_Scalarize;
    assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
           "Ptr is neither a value nor a pointer operand");
4977     return WideningDecision != CM_GatherScatter;
4978   };
4979 
4980   // A helper that returns true if the given value is a bitcast or
4981   // getelementptr instruction contained in the loop.
4982   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4983     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4984             isa<GetElementPtrInst>(V)) &&
4985            !TheLoop->isLoopInvariant(V);
4986   };
4987 
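  // A helper that returns true if Ptr is a pointer induction variable of the
  // loop whose use by MemAccess will be scalar (i.e. not a gather/scatter).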
4988   auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) {
4989     if (!isa<PHINode>(Ptr) ||
4990         !Legal->getInductionVars().count(cast<PHINode>(Ptr)))
4991       return false;
4992     auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)];
4993     if (Induction.getKind() != InductionDescriptor::IK_PtrInduction)
4994       return false;
4995     return isScalarUse(MemAccess, Ptr);
4996   };
4997 
  // A helper that evaluates a memory access's use of a pointer. If the
  // pointer is the pointer induction variable of the loop, it (and its
  // update) is inserted into Worklist. Otherwise, if the use will be a scalar
  // use and the pointer is only used by memory accesses, we place the pointer
  // in ScalarPtrs; if not, the pointer is placed in PossibleNonScalarPtrs.
5003   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
5004     if (isScalarPtrInduction(MemAccess, Ptr)) {
5005       Worklist.insert(cast<Instruction>(Ptr));
5006       Instruction *Update = cast<Instruction>(
5007           cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
5008       Worklist.insert(Update);
5009       LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
5010                         << "\n");
5011       LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update
5012                         << "\n");
5013       return;
5014     }
5015     // We only care about bitcast and getelementptr instructions contained in
5016     // the loop.
5017     if (!isLoopVaryingBitCastOrGEP(Ptr))
5018       return;
5019 
5020     // If the pointer has already been identified as scalar (e.g., if it was
5021     // also identified as uniform), there's nothing to do.
5022     auto *I = cast<Instruction>(Ptr);
5023     if (Worklist.count(I))
5024       return;
5025 
5026     // If the use of the pointer will be a scalar use, and all users of the
5027     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
5028     // place the pointer in PossibleNonScalarPtrs.
5029     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
5030           return isa<LoadInst>(U) || isa<StoreInst>(U);
5031         }))
5032       ScalarPtrs.insert(I);
5033     else
5034       PossibleNonScalarPtrs.insert(I);
5035   };
5036 
  // We seed the scalars analysis with two classes of instructions: (1)
  // instructions marked uniform-after-vectorization and (2) bitcast,
  // getelementptr and (pointer) phi instructions used by memory accesses
  // requiring a scalar use.
5041   //
5042   // (1) Add to the worklist all instructions that have been identified as
5043   // uniform-after-vectorization.
5044   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
5045 
5046   // (2) Add to the worklist all bitcast and getelementptr instructions used by
5047   // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory access is not a gather or
5049   // scatter operation. The value operand of a store will remain scalar if the
5050   // store is scalarized.
5051   for (auto *BB : TheLoop->blocks())
5052     for (auto &I : *BB) {
5053       if (auto *Load = dyn_cast<LoadInst>(&I)) {
5054         evaluatePtrUse(Load, Load->getPointerOperand());
5055       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
5056         evaluatePtrUse(Store, Store->getPointerOperand());
5057         evaluatePtrUse(Store, Store->getValueOperand());
5058       }
5059     }
5060   for (auto *I : ScalarPtrs)
5061     if (!PossibleNonScalarPtrs.count(I)) {
5062       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
5063       Worklist.insert(I);
5064     }
5065 
5066   // Insert the forced scalars.
5067   // FIXME: Currently widenPHIInstruction() often creates a dead vector
5068   // induction variable when the PHI user is scalarized.
5069   auto ForcedScalar = ForcedScalars.find(VF);
5070   if (ForcedScalar != ForcedScalars.end())
5071     for (auto *I : ForcedScalar->second)
5072       Worklist.insert(I);
5073 
5074   // Expand the worklist by looking through any bitcasts and getelementptr
5075   // instructions we've already identified as scalar. This is similar to the
5076   // expansion step in collectLoopUniforms(); however, here we're only
5077   // expanding to include additional bitcasts and getelementptr instructions.
5078   unsigned Idx = 0;
5079   while (Idx != Worklist.size()) {
5080     Instruction *Dst = Worklist[Idx++];
5081     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
5082       continue;
5083     auto *Src = cast<Instruction>(Dst->getOperand(0));
5084     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
5085           auto *J = cast<Instruction>(U);
5086           return !TheLoop->contains(J) || Worklist.count(J) ||
5087                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
5088                   isScalarUse(J, Src));
5089         })) {
5090       Worklist.insert(Src);
5091       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
5092     }
5093   }
5094 
5095   // An induction variable will remain scalar if all users of the induction
5096   // variable and induction variable update remain scalar.
5097   for (auto &Induction : Legal->getInductionVars()) {
5098     auto *Ind = Induction.first;
5099     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5100 
5101     // If tail-folding is applied, the primary induction variable will be used
5102     // to feed a vector compare.
5103     if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
5104       continue;
5105 
5106     // Determine if all users of the induction variable are scalar after
5107     // vectorization.
5108     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
5109       auto *I = cast<Instruction>(U);
5110       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
5111     });
5112     if (!ScalarInd)
5113       continue;
5114 
5115     // Determine if all users of the induction variable update instruction are
5116     // scalar after vectorization.
5117     auto ScalarIndUpdate =
5118         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5119           auto *I = cast<Instruction>(U);
5120           return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
5121         });
5122     if (!ScalarIndUpdate)
5123       continue;
5124 
5125     // The induction variable and its update instruction will remain scalar.
5126     Worklist.insert(Ind);
5127     Worklist.insert(IndUpdate);
5128     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
5129     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
5130                       << "\n");
5131   }
5132 
5133   Scalars[VF].insert(Worklist.begin(), Worklist.end());
5134 }
5135 
5136 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I,
5137                                                          ElementCount VF) {
5138   if (!blockNeedsPredication(I->getParent()))
5139     return false;
5140   switch(I->getOpcode()) {
5141   default:
5142     break;
5143   case Instruction::Load:
5144   case Instruction::Store: {
5145     if (!Legal->isMaskRequired(I))
5146       return false;
5147     auto *Ptr = getLoadStorePointerOperand(I);
5148     auto *Ty = getMemInstValueType(I);
5149     // We have already decided how to vectorize this instruction, get that
5150     // result.
5151     if (VF.isVector()) {
5152       InstWidening WideningDecision = getWideningDecision(I, VF);
5153       assert(WideningDecision != CM_Unknown &&
5154              "Widening decision should be ready at this moment");
5155       return WideningDecision == CM_Scalarize;
5156     }
5157     const Align Alignment = getLoadStoreAlignment(I);
5158     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
5159                                 isLegalMaskedGather(Ty, Alignment))
5160                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
5161                                 isLegalMaskedScatter(Ty, Alignment));
5162   }
5163   case Instruction::UDiv:
5164   case Instruction::SDiv:
5165   case Instruction::SRem:
5166   case Instruction::URem:
5167     return mayDivideByZero(*I);
5168   }
5169   return false;
5170 }
5171 
5172 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
5173     Instruction *I, ElementCount VF) {
5174   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
5175   assert(getWideningDecision(I, VF) == CM_Unknown &&
5176          "Decision should not be set yet.");
5177   auto *Group = getInterleavedAccessGroup(I);
5178   assert(Group && "Must have a group.");
5179 
  // If the instruction's allocated size doesn't equal its type size, it
  // requires padding and will be scalarized.
5182   auto &DL = I->getModule()->getDataLayout();
5183   auto *ScalarTy = getMemInstValueType(I);
5184   if (hasIrregularType(ScalarTy, DL, VF))
5185     return false;
5186 
5187   // Check if masking is required.
5188   // A Group may need masking for one of two reasons: it resides in a block that
5189   // needs predication, or it was decided to use masking to deal with gaps.
5190   bool PredicatedAccessRequiresMasking =
5191       Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
5192   bool AccessWithGapsRequiresMasking =
5193       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
5194   if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
5195     return true;
5196 
5197   // If masked interleaving is required, we expect that the user/target had
5198   // enabled it, because otherwise it either wouldn't have been created or
5199   // it should have been invalidated by the CostModel.
5200   assert(useMaskedInterleavedAccesses(TTI) &&
5201          "Masked interleave-groups for predicated accesses are not enabled.");
5202 
5203   auto *Ty = getMemInstValueType(I);
5204   const Align Alignment = getLoadStoreAlignment(I);
5205   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
5206                           : TTI.isLegalMaskedStore(Ty, Alignment);
5207 }
5208 
5209 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
5210     Instruction *I, ElementCount VF) {
5211   // Get and ensure we have a valid memory instruction.
5212   LoadInst *LI = dyn_cast<LoadInst>(I);
5213   StoreInst *SI = dyn_cast<StoreInst>(I);
5214   assert((LI || SI) && "Invalid memory instruction");
5215 
5216   auto *Ptr = getLoadStorePointerOperand(I);
5217 
  // First of all, to be widened the pointer must be consecutive.
5219   if (!Legal->isConsecutivePtr(Ptr))
5220     return false;
5221 
5222   // If the instruction is a store located in a predicated block, it will be
5223   // scalarized.
5224   if (isScalarWithPredication(I))
5225     return false;
5226 
  // If the instruction's allocated size doesn't equal its type size, it
  // requires padding and will be scalarized.
5229   auto &DL = I->getModule()->getDataLayout();
5230   auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
5231   if (hasIrregularType(ScalarTy, DL, VF))
5232     return false;
5233 
5234   return true;
5235 }
5236 
5237 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
5238   // We should not collect Uniforms more than once per VF. Right now,
5239   // this function is called from collectUniformsAndScalars(), which
5240   // already does this check. Collecting Uniforms for VF=1 does not make any
5241   // sense.
5242 
5243   assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
5244          "This function should not be visited twice for the same VF");
5245 
  // Initialize the entry for this VF so we do not analyze it again even if
  // no uniform value is found; Uniforms.count(VF) will then return 1.
5248   Uniforms[VF].clear();
5249 
5250   // We now know that the loop is vectorizable!
5251   // Collect instructions inside the loop that will remain uniform after
5252   // vectorization.
5253 
5254   // Global values, params and instructions outside of current loop are out of
5255   // scope.
5256   auto isOutOfScope = [&](Value *V) -> bool {
5257     Instruction *I = dyn_cast<Instruction>(V);
5258     return (!I || !TheLoop->contains(I));
5259   };
5260 
5261   SetVector<Instruction *> Worklist;
5262   BasicBlock *Latch = TheLoop->getLoopLatch();
5263 
  // Instructions that are scalar with predication must not be considered
  // uniform after vectorization, because that would create an erroneous
  // replicating region in which only a single instance, instead of VF, would
  // be formed.
  // TODO: optimize such rare cases if found important, see PR40816.
5268   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
5269     if (isOutOfScope(I)) {
5270       LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
5271                         << *I << "\n");
5272       return;
5273     }
5274     if (isScalarWithPredication(I, VF)) {
5275       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
5276                         << *I << "\n");
5277       return;
5278     }
5279     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
5280     Worklist.insert(I);
5281   };
5282 
5283   // Start with the conditional branch. If the branch condition is an
5284   // instruction contained in the loop that is only used by the branch, it is
5285   // uniform.
5286   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
5287   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
5288     addToWorklistIfAllowed(Cmp);
5289 
5290   auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
5291     InstWidening WideningDecision = getWideningDecision(I, VF);
5292     assert(WideningDecision != CM_Unknown &&
5293            "Widening decision should be ready at this moment");
5294 
5295     // A uniform memory op is itself uniform.  We exclude uniform stores
5296     // here as they demand the last lane, not the first one.
5297     if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) {
5298       assert(WideningDecision == CM_Scalarize);
5299       return true;
5300     }
5301 
5302     return (WideningDecision == CM_Widen ||
5303             WideningDecision == CM_Widen_Reverse ||
5304             WideningDecision == CM_Interleave);
5305   };
5306 
5307 
5308   // Returns true if Ptr is the pointer operand of a memory access instruction
5309   // I, and I is known to not require scalarization.
5310   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
5311     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
5312   };
5313 
5314   // Holds a list of values which are known to have at least one uniform use.
5315   // Note that there may be other uses which aren't uniform.  A "uniform use"
5316   // here is something which only demands lane 0 of the unrolled iterations;
  // it does not imply that all lanes produce the same value (i.e. this is not
  // the usual meaning of uniform).
5319   SmallPtrSet<Value *, 8> HasUniformUse;
5320 
5321   // Scan the loop for instructions which are either a) known to have only
5322   // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
5323   for (auto *BB : TheLoop->blocks())
5324     for (auto &I : *BB) {
5325       // If there's no pointer operand, there's nothing to do.
5326       auto *Ptr = getLoadStorePointerOperand(&I);
5327       if (!Ptr)
5328         continue;
5329 
5330       // A uniform memory op is itself uniform.  We exclude uniform stores
5331       // here as they demand the last lane, not the first one.
5332       if (isa<LoadInst>(I) && Legal->isUniformMemOp(I))
5333         addToWorklistIfAllowed(&I);
5334 
5335       if (isUniformDecision(&I, VF)) {
5336         assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check");
5337         HasUniformUse.insert(Ptr);
5338       }
5339     }
5340 
5341   // Add to the worklist any operands which have *only* uniform (e.g. lane 0
5342   // demanding) users.  Since loops are assumed to be in LCSSA form, this
5343   // disallows uses outside the loop as well.
5344   for (auto *V : HasUniformUse) {
5345     if (isOutOfScope(V))
5346       continue;
5347     auto *I = cast<Instruction>(V);
5348     auto UsersAreMemAccesses =
5349       llvm::all_of(I->users(), [&](User *U) -> bool {
5350         return isVectorizedMemAccessUse(cast<Instruction>(U), V);
5351       });
5352     if (UsersAreMemAccesses)
5353       addToWorklistIfAllowed(I);
5354   }
5355 
  // Expand Worklist in topological order: whenever a new instruction is
  // added, its users should already be inside Worklist. This ensures that a
  // uniform instruction will only be used by uniform instructions.
5359   unsigned idx = 0;
5360   while (idx != Worklist.size()) {
5361     Instruction *I = Worklist[idx++];
5362 
5363     for (auto OV : I->operand_values()) {
5364       // isOutOfScope operands cannot be uniform instructions.
5365       if (isOutOfScope(OV))
5366         continue;
5367       // First order recurrence Phi's should typically be considered
5368       // non-uniform.
5369       auto *OP = dyn_cast<PHINode>(OV);
5370       if (OP && Legal->isFirstOrderRecurrence(OP))
5371         continue;
5372       // If all the users of the operand are uniform, then add the
5373       // operand into the uniform worklist.
5374       auto *OI = cast<Instruction>(OV);
5375       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
5376             auto *J = cast<Instruction>(U);
5377             return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
5378           }))
5379         addToWorklistIfAllowed(OI);
5380     }
5381   }
5382 
5383   // For an instruction to be added into Worklist above, all its users inside
5384   // the loop should also be in Worklist. However, this condition cannot be
5385   // true for phi nodes that form a cyclic dependence. We must process phi
5386   // nodes separately. An induction variable will remain uniform if all users
5387   // of the induction variable and induction variable update remain uniform.
5388   // The code below handles both pointer and non-pointer induction variables.
5389   for (auto &Induction : Legal->getInductionVars()) {
5390     auto *Ind = Induction.first;
5391     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5392 
5393     // Determine if all users of the induction variable are uniform after
5394     // vectorization.
5395     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
5396       auto *I = cast<Instruction>(U);
5397       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
5398              isVectorizedMemAccessUse(I, Ind);
5399     });
5400     if (!UniformInd)
5401       continue;
5402 
5403     // Determine if all users of the induction variable update instruction are
5404     // uniform after vectorization.
5405     auto UniformIndUpdate =
5406         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5407           auto *I = cast<Instruction>(U);
5408           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
5409                  isVectorizedMemAccessUse(I, IndUpdate);
5410         });
5411     if (!UniformIndUpdate)
5412       continue;
5413 
5414     // The induction variable and its update instruction will remain uniform.
5415     addToWorklistIfAllowed(Ind);
5416     addToWorklistIfAllowed(IndUpdate);
5417   }
5418 
5419   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
5420 }
5421 
5422 bool LoopVectorizationCostModel::runtimeChecksRequired() {
5423   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
5424 
5425   if (Legal->getRuntimePointerChecking()->Need) {
5426     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
5427         "runtime pointer checks needed. Enable vectorization of this "
5428         "loop with '#pragma clang loop vectorize(enable)' when "
5429         "compiling with -Os/-Oz",
5430         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5431     return true;
5432   }
5433 
5434   if (!PSE.getUnionPredicate().getPredicates().empty()) {
5435     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
5436         "runtime SCEV checks needed. Enable vectorization of this "
5437         "loop with '#pragma clang loop vectorize(enable)' when "
5438         "compiling with -Os/-Oz",
5439         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5440     return true;
5441   }
5442 
5443   // FIXME: Avoid specializing for stride==1 instead of bailing out.
5444   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
5445     reportVectorizationFailure("Runtime stride check for small trip count",
5446         "runtime stride == 1 checks needed. Enable vectorization of "
5447         "this loop without such check by compiling with -Os/-Oz",
5448         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5449     return true;
5450   }
5451 
5452   return false;
5453 }
5454 
5455 Optional<ElementCount>
5456 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
5457   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do this anyway, since the check is still
    // likely to be dynamically uniform if the target can skip it.
5460     reportVectorizationFailure(
5461         "Not inserting runtime ptr check for divergent target",
5462         "runtime pointer checks needed. Not enabled for divergent target",
5463         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5464     return None;
5465   }
5466 
5467   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5468   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5469   if (TC == 1) {
5470     reportVectorizationFailure("Single iteration (non) loop",
5471         "loop trip count is one, irrelevant for vectorization",
5472         "SingleIterationLoop", ORE, TheLoop);
5473     return None;
5474   }
5475 
5476   ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF);
5477 
5478   switch (ScalarEpilogueStatus) {
5479   case CM_ScalarEpilogueAllowed:
5480     return MaxVF;
5481   case CM_ScalarEpilogueNotAllowedUsePredicate:
5482     LLVM_FALLTHROUGH;
5483   case CM_ScalarEpilogueNotNeededUsePredicate:
5484     LLVM_DEBUG(
5485         dbgs() << "LV: vector predicate hint/switch found.\n"
5486                << "LV: Not allowing scalar epilogue, creating predicated "
5487                << "vector loop.\n");
5488     break;
5489   case CM_ScalarEpilogueNotAllowedLowTripLoop:
5490     // fallthrough as a special case of OptForSize
5491   case CM_ScalarEpilogueNotAllowedOptSize:
5492     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5493       LLVM_DEBUG(
5494           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5495     else
5496       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5497                         << "count.\n");
5498 
5499     // Bail if runtime checks are required, which are not good when optimising
5500     // for size.
5501     if (runtimeChecksRequired())
5502       return None;
5503 
5504     break;
5505   }
5506 
  // The only loops we can vectorize without a scalar epilogue are loops with
5508   // a bottom-test and a single exiting block. We'd have to handle the fact
5509   // that not every instruction executes on the last iteration.  This will
5510   // require a lane mask which varies through the vector loop body.  (TODO)
5511   if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
5512     // If there was a tail-folding hint/switch, but we can't fold the tail by
5513     // masking, fallback to a vectorization with a scalar epilogue.
5514     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5515       LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5516                            "scalar epilogue instead.\n");
5517       ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5518       return MaxVF;
5519     }
5520     return None;
5521   }
5522 
5523   // Now try the tail folding
5524 
5525   // Invalidate interleave groups that require an epilogue if we can't mask
5526   // the interleave-group.
5527   if (!useMaskedInterleavedAccesses(TTI)) {
5528     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5529            "No decisions should have been taken at this point");
    // Note: There is no need to invalidate any cost modeling decisions here,
    // as none were taken so far.
5532     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5533   }
5534 
5535   assert(!MaxVF.isScalable() &&
5536          "Scalable vectors do not yet support tail folding");
5537   assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) &&
5538          "MaxVF must be a power of 2");
5539   unsigned MaxVFtimesIC =
5540       UserIC ? MaxVF.getFixedValue() * UserIC : MaxVF.getFixedValue();
5541   // Avoid tail folding if the trip count is known to be a multiple of any VF we
5542   // chose.
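  // I.e. check if (BackedgeTakenCount + 1) urem MaxVFtimesIC is zero.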
5543   ScalarEvolution *SE = PSE.getSE();
5544   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
5545   const SCEV *ExitCount = SE->getAddExpr(
5546       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
5547   const SCEV *Rem = SE->getURemExpr(
5548       ExitCount, SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
5549   if (Rem->isZero()) {
5550     // Accept MaxVF if we do not have a tail.
5551     LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5552     return MaxVF;
5553   }
5554 
5555   // If we don't know the precise trip count, or if the trip count that we
5556   // found modulo the vectorization factor is not zero, try to fold the tail
5557   // by masking.
5558   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5559   if (Legal->prepareToFoldTailByMasking()) {
5560     FoldTailByMasking = true;
5561     return MaxVF;
5562   }
5563 
5564   // If there was a tail-folding hint/switch, but we can't fold the tail by
5565   // masking, fallback to a vectorization with a scalar epilogue.
5566   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5567     LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5568                          "scalar epilogue instead.\n");
5569     ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5570     return MaxVF;
5571   }
5572 
5573   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
5574     LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
5575     return None;
5576   }
5577 
5578   if (TC == 0) {
5579     reportVectorizationFailure(
5580         "Unable to calculate the loop count due to complex control flow",
5581         "unable to calculate the loop count due to complex control flow",
5582         "UnknownLoopCountComplexCFG", ORE, TheLoop);
5583     return None;
5584   }
5585 
5586   reportVectorizationFailure(
5587       "Cannot optimize for size and vectorize at the same time.",
5588       "cannot optimize for size and vectorize at the same time. "
5589       "Enable vectorization of this loop with '#pragma clang loop "
5590       "vectorize(enable)' when compiling with -Os/-Oz",
5591       "NoTailLoopWithOptForSize", ORE, TheLoop);
5592   return None;
5593 }
5594 
5595 ElementCount
5596 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
5597                                                  ElementCount UserVF) {
5598   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
5599   unsigned SmallestType, WidestType;
5600   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
5601   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
5602 
5603   // Get the maximum safe dependence distance in bits computed by LAA.
5604   // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory access that is most restrictive (involved in the smallest
5606   // dependence distance).
5607   unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits();
5608 
5609   if (UserVF.isNonZero()) {
5610     // For now, don't verify legality of scalable vectors.
5611     // This will be addressed properly in https://reviews.llvm.org/D91718.
5612     if (UserVF.isScalable())
5613       return UserVF;
5614 
5615     // If legally unsafe, clamp the user vectorization factor to a safe value.
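    // MaxSafeVF is the largest power-of-two number of widest-type elements
    // that fits within the maximum safe dependence distance.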
5616     unsigned MaxSafeVF = PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType);
5617     if (UserVF.getFixedValue() <= MaxSafeVF)
5618       return UserVF;
5619 
5620     LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
5621                       << " is unsafe, clamping to max safe VF=" << MaxSafeVF
5622                       << ".\n");
5623     ORE->emit([&]() {
5624       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5625                                         TheLoop->getStartLoc(),
5626                                         TheLoop->getHeader())
5627              << "User-specified vectorization factor "
5628              << ore::NV("UserVectorizationFactor", UserVF)
5629              << " is unsafe, clamping to maximum safe vectorization factor "
5630              << ore::NV("VectorizationFactor", MaxSafeVF);
5631     });
5632     return ElementCount::getFixed(MaxSafeVF);
5633   }
5634 
5635   WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits);
5636 
5637   // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
  // Note that both WidestRegister and WidestType may not be powers of 2.
5639   unsigned MaxVectorSize = PowerOf2Floor(WidestRegister / WidestType);
5640 
5641   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5642                     << " / " << WidestType << " bits.\n");
5643   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5644                     << WidestRegister << " bits.\n");
5645 
5646   assert(MaxVectorSize <= WidestRegister &&
5647          "Did not expect to pack so many elements"
5648          " into one vector!");
5649   if (MaxVectorSize == 0) {
5650     LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
5651     MaxVectorSize = 1;
5652     return ElementCount::getFixed(MaxVectorSize);
5653   } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
5654              isPowerOf2_32(ConstTripCount)) {
5655     // We need to clamp the VF to be the ConstTripCount. There is no point in
5656     // choosing a higher viable VF as done in the loop below.
5657     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
5658                       << ConstTripCount << "\n");
5659     MaxVectorSize = ConstTripCount;
5660     return ElementCount::getFixed(MaxVectorSize);
5661   }
5662 
5663   unsigned MaxVF = MaxVectorSize;
5664   if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
5665       (MaximizeBandwidth && isScalarEpilogueAllowed())) {
5666     // Collect all viable vectorization factors larger than the default MaxVF
5667     // (i.e. MaxVectorSize).
5668     SmallVector<ElementCount, 8> VFs;
5669     unsigned NewMaxVectorSize = WidestRegister / SmallestType;
5670     for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
5671       VFs.push_back(ElementCount::getFixed(VS));
5672 
5673     // For each VF calculate its register usage.
5674     auto RUs = calculateRegisterUsage(VFs);
5675 
5676     // Select the largest VF which doesn't require more registers than existing
5677     // ones.
5678     for (int i = RUs.size() - 1; i >= 0; --i) {
5679       bool Selected = true;
5680       for (auto& pair : RUs[i].MaxLocalUsers) {
5681         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5682         if (pair.second > TargetNumRegisters)
5683           Selected = false;
5684       }
5685       if (Selected) {
5686         MaxVF = VFs[i].getKnownMinValue();
5687         break;
5688       }
5689     }
5690     if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
5691       if (MaxVF < MinVF) {
5692         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5693                           << ") with target's minimum: " << MinVF << '\n');
5694         MaxVF = MinVF;
5695       }
5696     }
5697   }
5698   return ElementCount::getFixed(MaxVF);
5699 }
5700 
5701 VectorizationFactor
5702 LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) {
5703   // FIXME: This can be fixed for scalable vectors later, because at this stage
5704   // the LoopVectorizer will only consider vectorizing a loop with scalable
5705   // vectors when the loop has a hint to enable vectorization for a given VF.
5706   assert(!MaxVF.isScalable() && "scalable vectors not yet supported");
5707 
5708   float Cost = expectedCost(ElementCount::getFixed(1)).first;
5709   const float ScalarCost = Cost;
5710   unsigned Width = 1;
5711   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
5712 
5713   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5714   if (ForceVectorization && MaxVF.isVector()) {
5715     // Ignore scalar width, because the user explicitly wants vectorization.
5716     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5717     // evaluation.
5718     Cost = std::numeric_limits<float>::max();
5719   }
5720 
5721   for (unsigned i = 2; i <= MaxVF.getFixedValue(); i *= 2) {
    // Notice that the vector loop needs to be executed fewer times, so
5723     // we need to divide the cost of the vector loops by the width of
5724     // the vector elements.
5725     VectorizationCostTy C = expectedCost(ElementCount::getFixed(i));
5726     float VectorCost = C.first / (float)i;
5727     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5728                       << " costs: " << (int)VectorCost << ".\n");
5729     if (!C.second && !ForceVectorization) {
5730       LLVM_DEBUG(
5731           dbgs() << "LV: Not considering vector loop of width " << i
5732                  << " because it will not generate any vector instructions.\n");
5733       continue;
5734     }
5735 
5736     // If profitable add it to ProfitableVF list.
5737     if (VectorCost < ScalarCost) {
5738       ProfitableVFs.push_back(VectorizationFactor(
5739           {ElementCount::getFixed(i), (unsigned)VectorCost}));
5740     }
5741 
5742     if (VectorCost < Cost) {
5743       Cost = VectorCost;
5744       Width = i;
5745     }
5746   }
5747 
5748   if (!EnableCondStoresVectorization && NumPredStores) {
5749     reportVectorizationFailure("There are conditional stores.",
5750         "store that is conditionally executed prevents vectorization",
5751         "ConditionalStore", ORE, TheLoop);
5752     Width = 1;
5753     Cost = ScalarCost;
5754   }
5755 
5756   LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
5757              << "LV: Vectorization seems to be not beneficial, "
5758              << "but was forced by a user.\n");
5759   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
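  // Cost holds the per-lane cost of the selected width, so scale it back up
  // to report an approximate cost of one iteration of the selected loop.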
5760   VectorizationFactor Factor = {ElementCount::getFixed(Width),
5761                                 (unsigned)(Width * Cost)};
5762   return Factor;
5763 }
5764 
5765 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
5766     const Loop &L, ElementCount VF) const {
5767   // Cross iteration phis such as reductions need special handling and are
5768   // currently unsupported.
5769   if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) {
5770         return Legal->isFirstOrderRecurrence(&Phi) ||
5771                Legal->isReductionVariable(&Phi);
5772       }))
5773     return false;
5774 
5775   // Phis with uses outside of the loop require special handling and are
5776   // currently unsupported.
5777   for (auto &Entry : Legal->getInductionVars()) {
5778     // Look for uses of the value of the induction at the last iteration.
5779     Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch());
5780     for (User *U : PostInc->users())
5781       if (!L.contains(cast<Instruction>(U)))
5782         return false;
5783     // Look for uses of penultimate value of the induction.
5784     for (User *U : Entry.first->users())
5785       if (!L.contains(cast<Instruction>(U)))
5786         return false;
5787   }
5788 
5789   // Induction variables that are widened require special handling that is
5790   // currently not supported.
5791   if (any_of(Legal->getInductionVars(), [&](auto &Entry) {
5792         return !(this->isScalarAfterVectorization(Entry.first, VF) ||
5793                  this->isProfitableToScalarize(Entry.first, VF));
5794       }))
5795     return false;
5796 
5797   return true;
5798 }
5799 
5800 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
5801     const ElementCount VF) const {
5802   // FIXME: We need a much better cost-model to take different parameters such
5803   // as register pressure, code size increase and cost of extra branches into
5804   // account. For now we apply a very crude heuristic and only consider loops
5805   // with vectorization factors larger than a certain value.
5806   // We also consider epilogue vectorization unprofitable for targets that don't
5807   // consider interleaving beneficial (eg. MVE).
5808   if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
5809     return false;
5810   if (VF.getFixedValue() >= EpilogueVectorizationMinVF)
5811     return true;
5812   return false;
5813 }
5814 
5815 VectorizationFactor
5816 LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
5817     const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
5818   VectorizationFactor Result = VectorizationFactor::Disabled();
5819   if (!EnableEpilogueVectorization) {
5820     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";);
5821     return Result;
5822   }
5823 
5824   if (!isScalarEpilogueAllowed()) {
5825     LLVM_DEBUG(
5826         dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
5827                   "allowed.\n";);
5828     return Result;
5829   }
5830 
5831   // FIXME: This can be fixed for scalable vectors later, because at this stage
5832   // the LoopVectorizer will only consider vectorizing a loop with scalable
5833   // vectors when the loop has a hint to enable vectorization for a given VF.
5834   if (MainLoopVF.isScalable()) {
5835     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization for scalable vectors not "
5836                          "yet supported.\n");
5837     return Result;
5838   }
5839 
5840   // Not really a cost consideration, but check for unsupported cases here to
5841   // simplify the logic.
5842   if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) {
5843     LLVM_DEBUG(
5844         dbgs() << "LEV: Unable to vectorize epilogue because the loop is "
5845                   "not a supported candidate.\n";);
5846     return Result;
5847   }
5848 
5849   if (EpilogueVectorizationForceVF > 1) {
5850     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
5851     if (LVP.hasPlanWithVFs(
5852             {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)}))
5853       return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0};
5854     else {
5855       LLVM_DEBUG(
5856           dbgs()
5857               << "LEV: Epilogue vectorization forced factor is not viable.\n";);
5858       return Result;
5859     }
5860   }
5861 
5862   if (TheLoop->getHeader()->getParent()->hasOptSize() ||
5863       TheLoop->getHeader()->getParent()->hasMinSize()) {
5864     LLVM_DEBUG(
5865         dbgs()
5866             << "LEV: Epilogue vectorization skipped due to opt for size.\n";);
5867     return Result;
5868   }
5869 
5870   if (!isEpilogueVectorizationProfitable(MainLoopVF))
5871     return Result;
5872 
5873   for (auto &NextVF : ProfitableVFs)
5874     if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) &&
5875         (Result.Width.getFixedValue() == 1 || NextVF.Cost < Result.Cost) &&
5876         LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width}))
5877       Result = NextVF;
5878 
5879   if (Result != VectorizationFactor::Disabled())
5880     LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
5881                       << Result.Width.getFixedValue() << "\n";);
5882   return Result;
5883 }
5884 
5885 std::pair<unsigned, unsigned>
5886 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
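  // MinWidth starts at UINT_MAX so the first type seen narrows it; MaxWidth
  // starts at 8 bits so the reported widest type is at least one byte wide.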
5887   unsigned MinWidth = -1U;
5888   unsigned MaxWidth = 8;
5889   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5890 
5891   // For each block.
5892   for (BasicBlock *BB : TheLoop->blocks()) {
5893     // For each instruction in the loop.
5894     for (Instruction &I : BB->instructionsWithoutDebug()) {
5895       Type *T = I.getType();
5896 
5897       // Skip ignored values.
5898       if (ValuesToIgnore.count(&I))
5899         continue;
5900 
5901       // Only examine Loads, Stores and PHINodes.
5902       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5903         continue;
5904 
5905       // Examine PHI nodes that are reduction variables. Update the type to
5906       // account for the recurrence type.
5907       if (auto *PN = dyn_cast<PHINode>(&I)) {
5908         if (!Legal->isReductionVariable(PN))
5909           continue;
5910         RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN];
5911         T = RdxDesc.getRecurrenceType();
5912       }
5913 
5914       // Examine the stored values.
5915       if (auto *ST = dyn_cast<StoreInst>(&I))
5916         T = ST->getValueOperand()->getType();
5917 
5918       // Ignore loaded pointer types and stored pointer types that are not
5919       // vectorizable.
5920       //
5921       // FIXME: The check here attempts to predict whether a load or store will
5922       //        be vectorized. We only know this for certain after a VF has
5923       //        been selected. Here, we assume that if an access can be
5924       //        vectorized, it will be. We should also look at extending this
5925       //        optimization to non-pointer types.
5926       //
5927       if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
5928           !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
5929         continue;
5930 
5931       MinWidth = std::min(MinWidth,
5932                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5933       MaxWidth = std::max(MaxWidth,
5934                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5935     }
5936   }
5937 
5938   return {MinWidth, MaxWidth};
5939 }
5940 
5941 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
5942                                                            unsigned LoopCost) {
5943   // -- The interleave heuristics --
5944   // We interleave the loop in order to expose ILP and reduce the loop overhead.
5945   // There are many micro-architectural considerations that we can't predict
5946   // at this level. For example, frontend pressure (on decode or fetch) due to
5947   // code size, or the number and capabilities of the execution ports.
5948   //
5949   // We use the following heuristics to select the interleave count:
5950   // 1. If the code has reductions, then we interleave to break the cross
5951   // iteration dependency.
5952   // 2. If the loop is really small, then we interleave to reduce the loop
5953   // overhead.
5954   // 3. We don't interleave if we think that we will spill registers to memory
5955   // due to the increased register pressure.
5956 
5957   if (!isScalarEpilogueAllowed())
5958     return 1;
5959 
5960   // We used the distance for the interleave count.
5961   if (Legal->getMaxSafeDepDistBytes() != -1U)
5962     return 1;
5963 
5964   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5965   const bool HasReductions = !Legal->getReductionVars().empty();
5966   // Do not interleave loops with a relatively small known or estimated trip
5967   // count. But we will interleave when InterleaveSmallLoopScalarReduction is
  // enabled and the code has scalar reductions (HasReductions && VF == 1),
5969   // because with the above conditions interleaving can expose ILP and break
5970   // cross iteration dependences for reductions.
5971   if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
5972       !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
5973     return 1;
5974 
5975   RegisterUsage R = calculateRegisterUsage({VF})[0];
5976   // We divide by these constants so assume that we have at least one
5977   // instruction that uses at least one register.
5978   for (auto& pair : R.MaxLocalUsers) {
5979     pair.second = std::max(pair.second, 1U);
5980   }
5981 
5982   // We calculate the interleave count using the following formula.
5983   // Subtract the number of loop invariants from the number of available
5984   // registers. These registers are used by all of the interleaved instances.
5985   // Next, divide the remaining registers by the number of registers that is
5986   // required by the loop, in order to estimate how many parallel instances
5987   // fit without causing spills. All of this is rounded down if necessary to be
5988   // a power of two. We want power of two interleave count to simplify any
5989   // addressing operations or alignment considerations.
  // We also want power of two interleave counts to ensure that the induction
  // variable of the vector loop wraps to zero, when tail is folded by masking;
  // this currently happens when optimizing for size, in which case we already
  // returned an interleave count of 1 above.
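  // For illustration only (hypothetical numbers): with 16 registers in a
  // class, 2 loop-invariant registers, and a maximum local usage of 3, the
  // basic estimate is PowerOf2Floor((16 - 2) / 3) = 4 interleaved instances
  // (the induction-variable heuristic below may adjust this).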
5993   unsigned IC = UINT_MAX;
5994 
5995   for (auto& pair : R.MaxLocalUsers) {
5996     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5997     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5998                       << " registers of "
5999                       << TTI.getRegisterClassName(pair.first) << " register class\n");
6000     if (VF.isScalar()) {
6001       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
6002         TargetNumRegisters = ForceTargetNumScalarRegs;
6003     } else {
6004       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
6005         TargetNumRegisters = ForceTargetNumVectorRegs;
6006     }
6007     unsigned MaxLocalUsers = pair.second;
6008     unsigned LoopInvariantRegs = 0;
6009     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
6010       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
6011 
    unsigned TmpIC =
        PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
6013     // Don't count the induction variable as interleaved.
6014     if (EnableIndVarRegisterHeur) {
6015       TmpIC =
6016           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
6017                         std::max(1U, (MaxLocalUsers - 1)));
6018     }
6019 
6020     IC = std::min(IC, TmpIC);
6021   }
6022 
6023   // Clamp the interleave ranges to reasonable counts.
6024   unsigned MaxInterleaveCount =
6025       TTI.getMaxInterleaveFactor(VF.getKnownMinValue());
6026 
6027   // Check if the user has overridden the max.
6028   if (VF.isScalar()) {
6029     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
6030       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
6031   } else {
6032     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
6033       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
6034   }
6035 
  // If the trip count is a known or estimated compile-time constant, limit the
  // interleave count to at most the trip count divided by VF, ensuring the
  // result is at least 1.
6039   //
  // For scalable vectors we can't know if interleaving is beneficial. It may
  // not be beneficial for small loops if none of the lanes in the second
  // vector iteration is enabled. However, for larger loops, there is likely to
  // be a similar benefit as for fixed-width vectors. For now, we choose to
  // leave the InterleaveCount as if vscale is '1', although if some information
  // about the vector is known (e.g. min vector size), we can make a better
  // decision.
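  // For illustration only (hypothetical numbers): with an estimated trip count
  // of 12 and VF = 4, the interleave count is clamped to at most 12 / 4 = 3.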
6046   if (BestKnownTC) {
6047     MaxInterleaveCount =
6048         std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
6049     // Make sure MaxInterleaveCount is greater than 0.
6050     MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
6051   }
6052 
6053   assert(MaxInterleaveCount > 0 &&
6054          "Maximum interleave count must be greater than 0");
6055 
  // Clamp the calculated IC to be between 1 and the maximum interleave count
  // that the target and trip count allow.
6058   if (IC > MaxInterleaveCount)
6059     IC = MaxInterleaveCount;
6060   else
6061     // Make sure IC is greater than 0.
6062     IC = std::max(1u, IC);
6063 
6064   assert(IC > 0 && "Interleave count must be greater than 0.");
6065 
6066   // If we did not calculate the cost for VF (because the user selected the VF)
6067   // then we calculate the cost of VF here.
6068   if (LoopCost == 0)
6069     LoopCost = expectedCost(VF).first;
6070 
6071   assert(LoopCost && "Non-zero loop cost expected");
6072 
6073   // Interleave if we vectorized this loop and there is a reduction that could
6074   // benefit from interleaving.
6075   if (VF.isVector() && HasReductions) {
6076     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
6077     return IC;
6078   }
6079 
6080   // Note that if we've already vectorized the loop we will have done the
6081   // runtime check and so interleaving won't require further checks.
6082   bool InterleavingRequiresRuntimePointerCheck =
6083       (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
6084 
6085   // We want to interleave small loops in order to reduce the loop overhead and
6086   // potentially expose ILP opportunities.
6087   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
6088                     << "LV: IC is " << IC << '\n'
6089                     << "LV: VF is " << VF << '\n');
6090   const bool AggressivelyInterleaveReductions =
6091       TTI.enableAggressiveInterleaving(HasReductions);
6092   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
    // We assume that the loop overhead costs 1, and use the cost model to
    // estimate the cost of the loop body; we then interleave until the loop
    // overhead is about 5% of the total cost of the loop.
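    // For illustration only (hypothetical numbers): with SmallLoopCost = 20
    // and LoopCost = 5, SmallIC = min(IC, PowerOf2Floor(20 / 5)) = min(IC, 4).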
6096     unsigned SmallIC =
6097         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
6098 
6099     // Interleave until store/load ports (estimated by max interleave count) are
6100     // saturated.
6101     unsigned NumStores = Legal->getNumStores();
6102     unsigned NumLoads = Legal->getNumLoads();
6103     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
6104     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
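    // For illustration only (hypothetical numbers): with IC = 8, two stores
    // and one load, StoresIC = 8 / 2 = 4 and LoadsIC = 8 / 1 = 8.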
6105 
    // If we have a scalar reduction (vector reductions are already dealt with
    // by this point), we can increase the critical path length if the loop
    // we're interleaving is inside another loop. Limit, by default, to 2 so
    // that the critical path only gets increased by one reduction operation.
6110     if (HasReductions && TheLoop->getLoopDepth() > 1) {
6111       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
6112       SmallIC = std::min(SmallIC, F);
6113       StoresIC = std::min(StoresIC, F);
6114       LoadsIC = std::min(LoadsIC, F);
6115     }
6116 
6117     if (EnableLoadStoreRuntimeInterleave &&
6118         std::max(StoresIC, LoadsIC) > SmallIC) {
6119       LLVM_DEBUG(
6120           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
6121       return std::max(StoresIC, LoadsIC);
6122     }
6123 
6124     // If there are scalar reductions and TTI has enabled aggressive
6125     // interleaving for reductions, we will interleave to expose ILP.
6126     if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
6127         AggressivelyInterleaveReductions) {
6128       LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
      // Interleave no less than SmallIC, but not as aggressively as the normal
      // IC, to handle the rare situation when resources are too limited.
6131       return std::max(IC / 2, SmallIC);
6132     } else {
6133       LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
6134       return SmallIC;
6135     }
6136   }
6137 
6138   // Interleave if this is a large loop (small loops are already dealt with by
6139   // this point) that could benefit from interleaving.
6140   if (AggressivelyInterleaveReductions) {
6141     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
6142     return IC;
6143   }
6144 
6145   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
6146   return 1;
6147 }
6148 
6149 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
6150 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
  // This function calculates the register usage by measuring the highest
  // number of values that are alive at a single location. Obviously, this is a
  // very rough estimation. We scan the loop in a topological order and
  // assign a number to each instruction. We use RPO to ensure that defs are
  // met before their users. We assume that each instruction that has in-loop
  // users starts an interval. We record every time that an in-loop value is
  // used, so we have a list of the first and last occurrences of each
  // instruction. Next, we transpose this data structure into a multi map that
  // holds the list of intervals that *end* at a specific location. This multi
  // map allows us to perform a linear search. We scan the instructions linearly
  // and record each time that a new interval starts, by placing it in a set.
  // If we find this value in the multi-map then we remove it from the set.
  // The max register usage is the maximum size of the set.
  // We also search for instructions that are defined outside the loop, but are
  // used inside the loop. We need this number separately from the max-interval
  // usage number because when we unroll, loop-invariant values do not take
  // more registers.
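  // For illustration only (hypothetical IR): for the in-loop sequence
  //   %a = load ...; %b = load ...; %c = add %a, %b; store %c, ...
  // both loads are still live when the add is reached, so the maximum number
  // of simultaneously open intervals (and thus the register estimate for the
  // scalar case) is 2.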
6168   LoopBlocksDFS DFS(TheLoop);
6169   DFS.perform(LI);
6170 
6171   RegisterUsage RU;
6172 
6173   // Each 'key' in the map opens a new interval. The values
6174   // of the map are the index of the 'last seen' usage of the
6175   // instruction that is the key.
6176   using IntervalMap = DenseMap<Instruction *, unsigned>;
6177 
6178   // Maps instruction to its index.
6179   SmallVector<Instruction *, 64> IdxToInstr;
6180   // Marks the end of each interval.
6181   IntervalMap EndPoint;
  // Saves the set of instructions that have at least one use inside the loop.
6183   SmallPtrSet<Instruction *, 8> Ends;
6184   // Saves the list of values that are used in the loop but are
6185   // defined outside the loop, such as arguments and constants.
6186   SmallPtrSet<Value *, 8> LoopInvariants;
6187 
6188   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
6189     for (Instruction &I : BB->instructionsWithoutDebug()) {
6190       IdxToInstr.push_back(&I);
6191 
6192       // Save the end location of each USE.
6193       for (Value *U : I.operands()) {
6194         auto *Instr = dyn_cast<Instruction>(U);
6195 
6196         // Ignore non-instruction values such as arguments, constants, etc.
6197         if (!Instr)
6198           continue;
6199 
6200         // If this instruction is outside the loop then record it and continue.
6201         if (!TheLoop->contains(Instr)) {
6202           LoopInvariants.insert(Instr);
6203           continue;
6204         }
6205 
6206         // Overwrite previous end points.
6207         EndPoint[Instr] = IdxToInstr.size();
6208         Ends.insert(Instr);
6209       }
6210     }
6211   }
6212 
6213   // Saves the list of intervals that end with the index in 'key'.
6214   using InstrList = SmallVector<Instruction *, 2>;
6215   DenseMap<unsigned, InstrList> TransposeEnds;
6216 
6217   // Transpose the EndPoints to a list of values that end at each index.
6218   for (auto &Interval : EndPoint)
6219     TransposeEnds[Interval.second].push_back(Interval.first);
6220 
6221   SmallPtrSet<Instruction *, 8> OpenIntervals;
6222   SmallVector<RegisterUsage, 8> RUs(VFs.size());
6223   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
6224 
6225   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
6226 
6227   // A lambda that gets the register usage for the given type and VF.
6228   const auto &TTICapture = TTI;
6229   auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) {
6230     if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
6231       return 0U;
6232     return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
6233   };
6234 
6235   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
6236     Instruction *I = IdxToInstr[i];
6237 
6238     // Remove all of the instructions that end at this location.
6239     InstrList &List = TransposeEnds[i];
6240     for (Instruction *ToRemove : List)
6241       OpenIntervals.erase(ToRemove);
6242 
6243     // Ignore instructions that are never used within the loop.
6244     if (!Ends.count(I))
6245       continue;
6246 
6247     // Skip ignored values.
6248     if (ValuesToIgnore.count(I))
6249       continue;
6250 
6251     // For each VF find the maximum usage of registers.
6252     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
6253       // Count the number of live intervals.
6254       SmallMapVector<unsigned, unsigned, 4> RegUsage;
6255 
      if (VFs[j].isScalar()) {
        for (auto Inst : OpenIntervals) {
          unsigned ClassID =
              TTI.getRegisterClassForType(false, Inst->getType());
          RegUsage[ClassID] += 1;
        }
      } else {
        collectUniformsAndScalars(VFs[j]);
        for (auto Inst : OpenIntervals) {
          // Skip ignored values for VF > 1.
          if (VecValuesToIgnore.count(Inst))
            continue;
          if (isScalarAfterVectorization(Inst, VFs[j])) {
            unsigned ClassID =
                TTI.getRegisterClassForType(false, Inst->getType());
            RegUsage[ClassID] += 1;
          } else {
            unsigned ClassID =
                TTI.getRegisterClassForType(true, Inst->getType());
            RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
          }
        }
      }
6285 
      for (auto &pair : RegUsage) {
        unsigned &Entry = MaxUsages[j][pair.first];
        Entry = std::max(Entry, pair.second);
      }
6292     }
6293 
6294     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
6295                       << OpenIntervals.size() << '\n');
6296 
6297     // Add the current instruction to the list of open intervals.
6298     OpenIntervals.insert(I);
6299   }
6300 
6301   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
6302     SmallMapVector<unsigned, unsigned, 4> Invariant;
6303 
6304     for (auto Inst : LoopInvariants) {
6305       unsigned Usage =
6306           VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
6307       unsigned ClassID =
6308           TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
6309       if (Invariant.find(ClassID) == Invariant.end())
6310         Invariant[ClassID] = Usage;
6311       else
6312         Invariant[ClassID] += Usage;
6313     }
6314 
6315     LLVM_DEBUG({
6316       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
6317       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
6318              << " item\n";
6319       for (const auto &pair : MaxUsages[i]) {
6320         dbgs() << "LV(REG): RegisterClass: "
6321                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6322                << " registers\n";
6323       }
6324       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
6325              << " item\n";
6326       for (const auto &pair : Invariant) {
6327         dbgs() << "LV(REG): RegisterClass: "
6328                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6329                << " registers\n";
6330       }
6331     });
6332 
6333     RU.LoopInvariantRegs = Invariant;
6334     RU.MaxLocalUsers = MaxUsages[i];
6335     RUs[i] = RU;
6336   }
6337 
6338   return RUs;
6339 }
6340 
bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I) {
  // TODO: Cost model for emulated masked load/store is completely
  // broken. This hack guides the cost model to use an artificially
  // high enough value to practically disable vectorization with such
  // operations, except where the previously deployed legality hack allowed
  // using very low cost values. This is to avoid regressions coming simply
  // from moving the "masked load/store" check from legality to the cost model.
  // Emulating masked loads/gathers was previously never allowed.
  // Emulating masked stores/scatters was allowed only in limited numbers.
6350   assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
6351   return isa<LoadInst>(I) ||
6352          (isa<StoreInst>(I) &&
6353           NumPredStores > NumberOfStoresToPredicate);
6354 }
6355 
6356 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
6357   // If we aren't vectorizing the loop, or if we've already collected the
6358   // instructions to scalarize, there's nothing to do. Collection may already
6359   // have occurred if we have a user-selected VF and are now computing the
6360   // expected cost for interleaving.
6361   if (VF.isScalar() || VF.isZero() ||
6362       InstsToScalarize.find(VF) != InstsToScalarize.end())
6363     return;
6364 
  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
6366   // not profitable to scalarize any instructions, the presence of VF in the
6367   // map will indicate that we've analyzed it already.
6368   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
6369 
6370   // Find all the instructions that are scalar with predication in the loop and
6371   // determine if it would be better to not if-convert the blocks they are in.
6372   // If so, we also record the instructions to scalarize.
6373   for (BasicBlock *BB : TheLoop->blocks()) {
6374     if (!blockNeedsPredication(BB))
6375       continue;
6376     for (Instruction &I : *BB)
6377       if (isScalarWithPredication(&I)) {
6378         ScalarCostsTy ScalarCosts;
6379         // Do not apply discount logic if hacked cost is needed
6380         // for emulated masked memrefs.
6381         if (!useEmulatedMaskMemRefHack(&I) &&
6382             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
6383           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
6384         // Remember that BB will remain after vectorization.
6385         PredicatedBBsAfterVectorization.insert(BB);
6386       }
6387   }
6388 }
6389 
6390 int LoopVectorizationCostModel::computePredInstDiscount(
6391     Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
6392     ElementCount VF) {
6393   assert(!isUniformAfterVectorization(PredInst, VF) &&
6394          "Instruction marked uniform-after-vectorization will be predicated");
6395 
6396   // Initialize the discount to zero, meaning that the scalar version and the
6397   // vector version cost the same.
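  // For illustration only (hypothetical costs): if the vector form of an
  // instruction costs 8 and its scalarized form costs 6, the discount grows
  // by 2, making scalarization look more attractive.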
6398   int Discount = 0;
6399 
6400   // Holds instructions to analyze. The instructions we visit are mapped in
6401   // ScalarCosts. Those instructions are the ones that would be scalarized if
6402   // we find that the scalar version costs less.
6403   SmallVector<Instruction *, 8> Worklist;
6404 
6405   // Returns true if the given instruction can be scalarized.
6406   auto canBeScalarized = [&](Instruction *I) -> bool {
6407     // We only attempt to scalarize instructions forming a single-use chain
6408     // from the original predicated block that would otherwise be vectorized.
6409     // Although not strictly necessary, we give up on instructions we know will
6410     // already be scalar to avoid traversing chains that are unlikely to be
6411     // beneficial.
6412     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
6413         isScalarAfterVectorization(I, VF))
6414       return false;
6415 
6416     // If the instruction is scalar with predication, it will be analyzed
6417     // separately. We ignore it within the context of PredInst.
6418     if (isScalarWithPredication(I))
6419       return false;
6420 
6421     // If any of the instruction's operands are uniform after vectorization,
6422     // the instruction cannot be scalarized. This prevents, for example, a
6423     // masked load from being scalarized.
6424     //
6425     // We assume we will only emit a value for lane zero of an instruction
6426     // marked uniform after vectorization, rather than VF identical values.
6427     // Thus, if we scalarize an instruction that uses a uniform, we would
6428     // create uses of values corresponding to the lanes we aren't emitting code
6429     // for. This behavior can be changed by allowing getScalarValue to clone
6430     // the lane zero values for uniforms rather than asserting.
6431     for (Use &U : I->operands())
6432       if (auto *J = dyn_cast<Instruction>(U.get()))
6433         if (isUniformAfterVectorization(J, VF))
6434           return false;
6435 
6436     // Otherwise, we can scalarize the instruction.
6437     return true;
6438   };
6439 
6440   // Compute the expected cost discount from scalarizing the entire expression
6441   // feeding the predicated instruction. We currently only consider expressions
6442   // that are single-use instruction chains.
6443   Worklist.push_back(PredInst);
6444   while (!Worklist.empty()) {
6445     Instruction *I = Worklist.pop_back_val();
6446 
6447     // If we've already analyzed the instruction, there's nothing to do.
6448     if (ScalarCosts.find(I) != ScalarCosts.end())
6449       continue;
6450 
6451     // Compute the cost of the vector instruction. Note that this cost already
6452     // includes the scalarization overhead of the predicated instruction.
6453     unsigned VectorCost = getInstructionCost(I, VF).first;
6454 
6455     // Compute the cost of the scalarized instruction. This cost is the cost of
6456     // the instruction as if it wasn't if-converted and instead remained in the
6457     // predicated block. We will scale this cost by block probability after
6458     // computing the scalarization overhead.
6459     assert(!VF.isScalable() && "scalable vectors not yet supported.");
6460     unsigned ScalarCost =
6461         VF.getKnownMinValue() *
6462         getInstructionCost(I, ElementCount::getFixed(1)).first;
6463 
6464     // Compute the scalarization overhead of needed insertelement instructions
6465     // and phi nodes.
6466     if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
6467       ScalarCost += TTI.getScalarizationOverhead(
6468           cast<VectorType>(ToVectorTy(I->getType(), VF)),
6469           APInt::getAllOnesValue(VF.getKnownMinValue()), true, false);
6470       assert(!VF.isScalable() && "scalable vectors not yet supported.");
6471       ScalarCost +=
6472           VF.getKnownMinValue() *
6473           TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
6474     }
6475 
6476     // Compute the scalarization overhead of needed extractelement
6477     // instructions. For each of the instruction's operands, if the operand can
6478     // be scalarized, add it to the worklist; otherwise, account for the
6479     // overhead.
6480     for (Use &U : I->operands())
6481       if (auto *J = dyn_cast<Instruction>(U.get())) {
6482         assert(VectorType::isValidElementType(J->getType()) &&
6483                "Instruction has non-scalar type");
6484         if (canBeScalarized(J))
6485           Worklist.push_back(J);
6486         else if (needsExtract(J, VF)) {
6487           assert(!VF.isScalable() && "scalable vectors not yet supported.");
6488           ScalarCost += TTI.getScalarizationOverhead(
6489               cast<VectorType>(ToVectorTy(J->getType(), VF)),
6490               APInt::getAllOnesValue(VF.getKnownMinValue()), false, true);
6491         }
6492       }
6493 
6494     // Scale the total scalar cost by block probability.
6495     ScalarCost /= getReciprocalPredBlockProb();
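    // For illustration only: assuming the predicated block executes on roughly
    // half of the iterations (reciprocal probability 2), a scalar cost of 10
    // is scaled down to 5.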
6496 
6497     // Compute the discount. A non-negative discount means the vector version
6498     // of the instruction costs more, and scalarizing would be beneficial.
6499     Discount += VectorCost - ScalarCost;
6500     ScalarCosts[I] = ScalarCost;
6501   }
6502 
6503   return Discount;
6504 }
6505 
6506 LoopVectorizationCostModel::VectorizationCostTy
6507 LoopVectorizationCostModel::expectedCost(ElementCount VF) {
6508   VectorizationCostTy Cost;
6509 
6510   // For each block.
6511   for (BasicBlock *BB : TheLoop->blocks()) {
6512     VectorizationCostTy BlockCost;
6513 
6514     // For each instruction in the old loop.
6515     for (Instruction &I : BB->instructionsWithoutDebug()) {
6516       // Skip ignored values.
6517       if (ValuesToIgnore.count(&I) ||
6518           (VF.isVector() && VecValuesToIgnore.count(&I)))
6519         continue;
6520 
6521       VectorizationCostTy C = getInstructionCost(&I, VF);
6522 
6523       // Check if we should override the cost.
6524       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
6525         C.first = ForceTargetInstructionCost;
6526 
6527       BlockCost.first += C.first;
6528       BlockCost.second |= C.second;
6529       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
6530                         << " for VF " << VF << " For instruction: " << I
6531                         << '\n');
6532     }
6533 
6534     // If we are vectorizing a predicated block, it will have been
6535     // if-converted. This means that the block's instructions (aside from
6536     // stores and instructions that may divide by zero) will now be
6537     // unconditionally executed. For the scalar case, we may not always execute
6538     // the predicated block, if it is an if-else block. Thus, scale the block's
6539     // cost by the probability of executing it. blockNeedsPredication from
6540     // Legal is used so as to not include all blocks in tail folded loops.
6541     if (VF.isScalar() && Legal->blockNeedsPredication(BB))
6542       BlockCost.first /= getReciprocalPredBlockProb();
6543 
6544     Cost.first += BlockCost.first;
6545     Cost.second |= BlockCost.second;
6546   }
6547 
6548   return Cost;
6549 }
6550 
6551 /// Gets Address Access SCEV after verifying that the access pattern
6552 /// is loop invariant except the induction variable dependence.
6553 ///
6554 /// This SCEV can be sent to the Target in order to estimate the address
6555 /// calculation cost.
6556 static const SCEV *getAddressAccessSCEV(
6557               Value *Ptr,
6558               LoopVectorizationLegality *Legal,
6559               PredicatedScalarEvolution &PSE,
6560               const Loop *TheLoop) {
6561 
6562   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
6563   if (!Gep)
6564     return nullptr;
6565 
6566   // We are looking for a gep with all loop invariant indices except for one
6567   // which should be an induction variable.
6568   auto SE = PSE.getSE();
6569   unsigned NumOperands = Gep->getNumOperands();
6570   for (unsigned i = 1; i < NumOperands; ++i) {
6571     Value *Opd = Gep->getOperand(i);
6572     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
6573         !Legal->isInductionVariable(Opd))
6574       return nullptr;
6575   }
6576 
  // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
6578   return PSE.getSCEV(Ptr);
6579 }
6580 
6581 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
6582   return Legal->hasStride(I->getOperand(0)) ||
6583          Legal->hasStride(I->getOperand(1));
6584 }
6585 
6586 unsigned
6587 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
6588                                                         ElementCount VF) {
6589   assert(VF.isVector() &&
6590          "Scalarization cost of instruction implies vectorization.");
6591   assert(!VF.isScalable() && "scalable vectors not yet supported.");
6592   Type *ValTy = getMemInstValueType(I);
6593   auto SE = PSE.getSE();
6594 
6595   unsigned AS = getLoadStoreAddressSpace(I);
6596   Value *Ptr = getLoadStorePointerOperand(I);
6597   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
6598 
  // Figure out whether the access is strided and get the stride value
  // if it's known at compile time.
6601   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
6602 
6603   // Get the cost of the scalar memory instruction and address computation.
6604   unsigned Cost =
6605       VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
6606 
6607   // Don't pass *I here, since it is scalar but will actually be part of a
6608   // vectorized loop where the user of it is a vectorized instruction.
6609   const Align Alignment = getLoadStoreAlignment(I);
6610   Cost += VF.getKnownMinValue() *
6611           TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
6612                               AS, TTI::TCK_RecipThroughput);
6613 
6614   // Get the overhead of the extractelement and insertelement instructions
6615   // we might create due to scalarization.
6616   Cost += getScalarizationOverhead(I, VF);
6617 
6618   // If we have a predicated store, it may not be executed for each vector
6619   // lane. Scale the cost by the probability of executing the predicated
6620   // block.
6621   if (isPredicatedInst(I)) {
6622     Cost /= getReciprocalPredBlockProb();
6623 
6624     if (useEmulatedMaskMemRefHack(I))
6625       // Artificially setting to a high enough value to practically disable
6626       // vectorization with such operations.
6627       Cost = 3000000;
6628   }
6629 
6630   return Cost;
6631 }
6632 
6633 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
6634                                                              ElementCount VF) {
6635   Type *ValTy = getMemInstValueType(I);
6636   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6637   Value *Ptr = getLoadStorePointerOperand(I);
6638   unsigned AS = getLoadStoreAddressSpace(I);
6639   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
6640   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6641 
6642   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6643          "Stride should be 1 or -1 for consecutive memory access");
6644   const Align Alignment = getLoadStoreAlignment(I);
6645   unsigned Cost = 0;
6646   if (Legal->isMaskRequired(I))
6647     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6648                                       CostKind);
6649   else
6650     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6651                                 CostKind, I);
6652 
6653   bool Reverse = ConsecutiveStride < 0;
6654   if (Reverse)
6655     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
6656   return Cost;
6657 }
6658 
6659 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
6660                                                          ElementCount VF) {
6661   assert(Legal->isUniformMemOp(*I));
6662 
6663   Type *ValTy = getMemInstValueType(I);
6664   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6665   const Align Alignment = getLoadStoreAlignment(I);
6666   unsigned AS = getLoadStoreAddressSpace(I);
6667   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6668   if (isa<LoadInst>(I)) {
6669     return TTI.getAddressComputationCost(ValTy) +
6670            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
6671                                CostKind) +
6672            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
6673   }
6674   StoreInst *SI = cast<StoreInst>(I);
6675 
6676   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
6677   return TTI.getAddressComputationCost(ValTy) +
6678          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
6679                              CostKind) +
6680          (isLoopInvariantStoreValue
6681               ? 0
6682               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
6683                                        VF.getKnownMinValue() - 1));
6684 }
6685 
6686 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6687                                                           ElementCount VF) {
6688   Type *ValTy = getMemInstValueType(I);
6689   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6690   const Align Alignment = getLoadStoreAlignment(I);
6691   const Value *Ptr = getLoadStorePointerOperand(I);
6692 
6693   return TTI.getAddressComputationCost(VectorTy) +
6694          TTI.getGatherScatterOpCost(
6695              I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6696              TargetTransformInfo::TCK_RecipThroughput, I);
6697 }
6698 
6699 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6700                                                             ElementCount VF) {
6701   Type *ValTy = getMemInstValueType(I);
6702   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6703   unsigned AS = getLoadStoreAddressSpace(I);
6704 
6705   auto Group = getInterleavedAccessGroup(I);
6706   assert(Group && "Fail to get an interleaved access group.");
6707 
6708   unsigned InterleaveFactor = Group->getFactor();
6709   assert(!VF.isScalable() && "scalable vectors not yet supported.");
6710   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
6711 
6712   // Holds the indices of existing members in an interleaved load group.
6713   // An interleaved store group doesn't need this as it doesn't allow gaps.
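  // For illustration only (hypothetical group): a load group with factor 4
  // whose members are present at positions 0 and 2 yields Indices = {0, 2}.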
6714   SmallVector<unsigned, 4> Indices;
6715   if (isa<LoadInst>(I)) {
6716     for (unsigned i = 0; i < InterleaveFactor; i++)
6717       if (Group->getMember(i))
6718         Indices.push_back(i);
6719   }
6720 
6721   // Calculate the cost of the whole interleaved group.
6722   bool UseMaskForGaps =
6723       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
6724   unsigned Cost = TTI.getInterleavedMemoryOpCost(
6725       I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6726       AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
6727 
6728   if (Group->isReverse()) {
6729     // TODO: Add support for reversed masked interleaved access.
6730     assert(!Legal->isMaskRequired(I) &&
6731            "Reverse masked interleaved access not supported.");
6732     Cost += Group->getNumMembers() *
6733             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
6734   }
6735   return Cost;
6736 }
6737 
6738 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
6739                                                               ElementCount VF) {
6740   // Calculate scalar cost only. Vectorization cost should be ready at this
6741   // moment.
6742   if (VF.isScalar()) {
6743     Type *ValTy = getMemInstValueType(I);
6744     const Align Alignment = getLoadStoreAlignment(I);
6745     unsigned AS = getLoadStoreAddressSpace(I);
6746 
6747     return TTI.getAddressComputationCost(ValTy) +
6748            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
6749                                TTI::TCK_RecipThroughput, I);
6750   }
6751   return getWideningCost(I, VF);
6752 }
6753 
6754 LoopVectorizationCostModel::VectorizationCostTy
6755 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6756                                                ElementCount VF) {
6757   // If we know that this instruction will remain uniform, check the cost of
6758   // the scalar version.
6759   if (isUniformAfterVectorization(I, VF))
6760     VF = ElementCount::getFixed(1);
6761 
6762   if (VF.isVector() && isProfitableToScalarize(I, VF))
6763     return VectorizationCostTy(InstsToScalarize[VF][I], false);
6764 
6765   // Forced scalars do not have any scalarization overhead.
6766   auto ForcedScalar = ForcedScalars.find(VF);
6767   if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6768     auto InstSet = ForcedScalar->second;
6769     if (InstSet.count(I))
6770       return VectorizationCostTy(
6771           (getInstructionCost(I, ElementCount::getFixed(1)).first *
6772            VF.getKnownMinValue()),
6773           false);
6774   }
6775 
6776   Type *VectorTy;
6777   unsigned C = getInstructionCost(I, VF, VectorTy);
6778 
6779   bool TypeNotScalarized =
6780       VF.isVector() && VectorTy->isVectorTy() &&
6781       TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue();
6782   return VectorizationCostTy(C, TypeNotScalarized);
6783 }
6784 
6785 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
6786                                                               ElementCount VF) {
6787 
6788   assert(!VF.isScalable() &&
6789          "cannot compute scalarization overhead for scalable vectorization");
6790   if (VF.isScalar())
6791     return 0;
6792 
6793   unsigned Cost = 0;
6794   Type *RetTy = ToVectorTy(I->getType(), VF);
6795   if (!RetTy->isVoidTy() &&
6796       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
6797     Cost += TTI.getScalarizationOverhead(
6798         cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()),
6799         true, false);
6800 
6801   // Some targets keep addresses scalar.
6802   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
6803     return Cost;
6804 
6805   // Some targets support efficient element stores.
6806   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
6807     return Cost;
6808 
6809   // Collect operands to consider.
6810   CallInst *CI = dyn_cast<CallInst>(I);
6811   Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
6812 
6813   // Skip operands that do not require extraction/scalarization and do not incur
6814   // any overhead.
6815   return Cost + TTI.getOperandsScalarizationOverhead(
6816                     filterExtractingOperands(Ops, VF), VF.getKnownMinValue());
6817 }
6818 
6819 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
6820   if (VF.isScalar())
6821     return;
6822   NumPredStores = 0;
6823   for (BasicBlock *BB : TheLoop->blocks()) {
6824     // For each instruction in the old loop.
6825     for (Instruction &I : *BB) {
6826       Value *Ptr =  getLoadStorePointerOperand(&I);
6827       if (!Ptr)
6828         continue;
6829 
6830       // TODO: We should generate better code and update the cost model for
6831       // predicated uniform stores. Today they are treated as any other
6832       // predicated store (see added test cases in
6833       // invariant-store-vectorization.ll).
6834       if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
6835         NumPredStores++;
6836 
6837       if (Legal->isUniformMemOp(I)) {
6838         // TODO: Avoid replicating loads and stores instead of
6839         // relying on instcombine to remove them.
6840         // Load: Scalar load + broadcast
6841         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6842         unsigned Cost = getUniformMemOpCost(&I, VF);
6843         setWideningDecision(&I, VF, CM_Scalarize, Cost);
6844         continue;
6845       }
6846 
6847       // We assume that widening is the best solution when possible.
6848       if (memoryInstructionCanBeWidened(&I, VF)) {
6849         unsigned Cost = getConsecutiveMemOpCost(&I, VF);
6850         int ConsecutiveStride =
6851                Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
6852         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6853                "Expected consecutive stride.");
6854         InstWidening Decision =
6855             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6856         setWideningDecision(&I, VF, Decision, Cost);
6857         continue;
6858       }
6859 
6860       // Choose between Interleaving, Gather/Scatter or Scalarization.
6861       unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
6862       unsigned NumAccesses = 1;
6863       if (isAccessInterleaved(&I)) {
6864         auto Group = getInterleavedAccessGroup(&I);
6865         assert(Group && "Fail to get an interleaved access group.");
6866 
6867         // Make one decision for the whole group.
6868         if (getWideningDecision(&I, VF) != CM_Unknown)
6869           continue;
6870 
6871         NumAccesses = Group->getNumMembers();
6872         if (interleavedAccessCanBeWidened(&I, VF))
6873           InterleaveCost = getInterleaveGroupCost(&I, VF);
6874       }
6875 
6876       unsigned GatherScatterCost =
6877           isLegalGatherOrScatter(&I)
6878               ? getGatherScatterCost(&I, VF) * NumAccesses
6879               : std::numeric_limits<unsigned>::max();
6880 
6881       unsigned ScalarizationCost =
6882           getMemInstScalarizationCost(&I, VF) * NumAccesses;
6883 
      // Choose the best solution for the current VF, record this decision,
      // and use it during vectorization.
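      // For illustration only (hypothetical costs): with InterleaveCost = 6,
      // GatherScatterCost = 8 and ScalarizationCost = 10, interleaving wins
      // and CM_Interleave is recorded with cost 6.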
6886       unsigned Cost;
6887       InstWidening Decision;
6888       if (InterleaveCost <= GatherScatterCost &&
6889           InterleaveCost < ScalarizationCost) {
6890         Decision = CM_Interleave;
6891         Cost = InterleaveCost;
6892       } else if (GatherScatterCost < ScalarizationCost) {
6893         Decision = CM_GatherScatter;
6894         Cost = GatherScatterCost;
6895       } else {
6896         Decision = CM_Scalarize;
6897         Cost = ScalarizationCost;
6898       }
      // If the instruction belongs to an interleave group, the whole group
      // receives the same decision. The whole group receives the cost, but
      // the cost will actually be assigned to one instruction.
6902       if (auto Group = getInterleavedAccessGroup(&I))
6903         setWideningDecision(Group, VF, Decision, Cost);
6904       else
6905         setWideningDecision(&I, VF, Decision, Cost);
6906     }
6907   }
6908 
6909   // Make sure that any load of address and any other address computation
6910   // remains scalar unless there is gather/scatter support. This avoids
6911   // inevitable extracts into address registers, and also has the benefit of
6912   // activating LSR more, since that pass can't optimize vectorized
6913   // addresses.
6914   if (TTI.prefersVectorizedAddressing())
6915     return;
6916 
6917   // Start with all scalar pointer uses.
6918   SmallPtrSet<Instruction *, 8> AddrDefs;
6919   for (BasicBlock *BB : TheLoop->blocks())
6920     for (Instruction &I : *BB) {
6921       Instruction *PtrDef =
6922         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6923       if (PtrDef && TheLoop->contains(PtrDef) &&
6924           getWideningDecision(&I, VF) != CM_GatherScatter)
6925         AddrDefs.insert(PtrDef);
6926     }
6927 
6928   // Add all instructions used to generate the addresses.
6929   SmallVector<Instruction *, 4> Worklist;
6930   for (auto *I : AddrDefs)
6931     Worklist.push_back(I);
6932   while (!Worklist.empty()) {
6933     Instruction *I = Worklist.pop_back_val();
6934     for (auto &Op : I->operands())
6935       if (auto *InstOp = dyn_cast<Instruction>(Op))
6936         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6937             AddrDefs.insert(InstOp).second)
6938           Worklist.push_back(InstOp);
6939   }
6940 
6941   for (auto *I : AddrDefs) {
6942     if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled by
      // the cost functions, but since this involves the task of finding out
      // if the loaded register is involved in an address computation, it is
      // instead changed here when we know this is the case.
6947       InstWidening Decision = getWideningDecision(I, VF);
6948       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6949         // Scalarize a widened load of address.
6950         setWideningDecision(
6951             I, VF, CM_Scalarize,
6952             (VF.getKnownMinValue() *
6953              getMemoryInstructionCost(I, ElementCount::getFixed(1))));
6954       else if (auto Group = getInterleavedAccessGroup(I)) {
6955         // Scalarize an interleave group of address loads.
6956         for (unsigned I = 0; I < Group->getFactor(); ++I) {
6957           if (Instruction *Member = Group->getMember(I))
6958             setWideningDecision(
6959                 Member, VF, CM_Scalarize,
6960                 (VF.getKnownMinValue() *
6961                  getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
6962         }
6963       }
6964     } else
6965       // Make sure I gets scalarized and a cost estimate without
6966       // scalarization overhead.
6967       ForcedScalars[VF].insert(I);
6968   }
6969 }
6970 
6971 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6972                                                         ElementCount VF,
6973                                                         Type *&VectorTy) {
6974   Type *RetTy = I->getType();
6975   if (canTruncateToMinimalBitwidth(I, VF))
6976     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6977   VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
6978   auto SE = PSE.getSE();
6979   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6980 
6981   // TODO: We need to estimate the cost of intrinsic calls.
6982   switch (I->getOpcode()) {
6983   case Instruction::GetElementPtr:
6984     // We mark this instruction as zero-cost because the cost of GEPs in
6985     // vectorized code depends on whether the corresponding memory instruction
6986     // is scalarized or not. Therefore, we handle GEPs with the memory
6987     // instruction cost.
6988     return 0;
6989   case Instruction::Br: {
    // In cases of scalarized and predicated instructions, there will be VF
    // predicated blocks in the vectorized loop. Each branch around these
    // blocks also requires an extract of its vector compare i1 element.
6993     bool ScalarPredicatedBB = false;
6994     BranchInst *BI = cast<BranchInst>(I);
6995     if (VF.isVector() && BI->isConditional() &&
6996         (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
6997          PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
6998       ScalarPredicatedBB = true;
6999 
7000     if (ScalarPredicatedBB) {
7001       // Return cost for branches around scalarized and predicated blocks.
7002       assert(!VF.isScalable() && "scalable vectors not yet supported.");
7003       auto *Vec_i1Ty =
7004           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
7005       return (TTI.getScalarizationOverhead(
7006                   Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()),
7007                   false, true) +
7008               (TTI.getCFInstrCost(Instruction::Br, CostKind) *
7009                VF.getKnownMinValue()));
7010     } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
7011       // The back-edge branch will remain, as will all scalar branches.
7012       return TTI.getCFInstrCost(Instruction::Br, CostKind);
7013     else
7014       // This branch will be eliminated by if-conversion.
7015       return 0;
7016     // Note: We currently assume zero cost for an unconditional branch inside
7017     // a predicated block since it will become a fall-through, although we
7018     // may decide in the future to call TTI for all branches.
7019   }
7020   case Instruction::PHI: {
7021     auto *Phi = cast<PHINode>(I);
7022 
7023     // First-order recurrences are replaced by vector shuffles inside the loop.
7024     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
7025     if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi))
7026       return TTI.getShuffleCost(
7027           TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy),
7028           VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1));
7029 
7030     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
7031     // converted into select instructions. We require N - 1 selects per phi
7032     // node, where N is the number of incoming values.
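    // For illustration only: a phi with three incoming values is lowered to
    // two vector selects.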
7033     if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
7034       return (Phi->getNumIncomingValues() - 1) *
7035              TTI.getCmpSelInstrCost(
7036                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
7037                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
7038                  CmpInst::BAD_ICMP_PREDICATE, CostKind);
7039 
7040     return TTI.getCFInstrCost(Instruction::PHI, CostKind);
7041   }
7042   case Instruction::UDiv:
7043   case Instruction::SDiv:
7044   case Instruction::URem:
7045   case Instruction::SRem:
7046     // If we have a predicated instruction, it may not be executed for each
7047     // vector lane. Get the scalarization cost and scale this amount by the
7048     // probability of executing the predicated block. If the instruction is not
7049     // predicated, we fall through to the next case.
7050     if (VF.isVector() && isScalarWithPredication(I)) {
7051       unsigned Cost = 0;
7052 
7053       // These instructions have a non-void type, so account for the phi nodes
7054       // that we will create. This cost is likely to be zero. The phi node
7055       // cost, if any, should be scaled by the block probability because it
7056       // models a copy at the end of each predicated block.
7057       Cost += VF.getKnownMinValue() *
7058               TTI.getCFInstrCost(Instruction::PHI, CostKind);
7059 
7060       // The cost of the non-predicated instruction.
7061       Cost += VF.getKnownMinValue() *
7062               TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);
7063 
7064       // The cost of insertelement and extractelement instructions needed for
7065       // scalarization.
7066       Cost += getScalarizationOverhead(I, VF);
7067 
7068       // Scale the cost by the probability of executing the predicated blocks.
7069       // This assumes the predicated block for each vector lane is equally
7070       // likely.
7071       return Cost / getReciprocalPredBlockProb();
7072     }
7073     LLVM_FALLTHROUGH;
7074   case Instruction::Add:
7075   case Instruction::FAdd:
7076   case Instruction::Sub:
7077   case Instruction::FSub:
7078   case Instruction::Mul:
7079   case Instruction::FMul:
7080   case Instruction::FDiv:
7081   case Instruction::FRem:
7082   case Instruction::Shl:
7083   case Instruction::LShr:
7084   case Instruction::AShr:
7085   case Instruction::And:
7086   case Instruction::Or:
7087   case Instruction::Xor: {
    // Since we will replace the stride by 1, the multiplication should go away.
7089     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
7090       return 0;
7091     // Certain instructions can be cheaper to vectorize if they have a constant
7092     // second vector operand. One example of this are shifts on x86.
7093     Value *Op2 = I->getOperand(1);
7094     TargetTransformInfo::OperandValueProperties Op2VP;
7095     TargetTransformInfo::OperandValueKind Op2VK =
7096         TTI.getOperandInfo(Op2, Op2VP);
7097     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
7098       Op2VK = TargetTransformInfo::OK_UniformValue;
7099 
7100     SmallVector<const Value *, 4> Operands(I->operand_values());
7101     unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
7102     return N * TTI.getArithmeticInstrCost(
7103                    I->getOpcode(), VectorTy, CostKind,
7104                    TargetTransformInfo::OK_AnyValue,
7105                    Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
7106   }
7107   case Instruction::FNeg: {
7108     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
7109     unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
7110     return N * TTI.getArithmeticInstrCost(
7111                    I->getOpcode(), VectorTy, CostKind,
7112                    TargetTransformInfo::OK_AnyValue,
7113                    TargetTransformInfo::OK_AnyValue,
7114                    TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
7115                    I->getOperand(0), I);
7116   }
7117   case Instruction::Select: {
7118     SelectInst *SI = cast<SelectInst>(I);
7119     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
7120     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
7121     Type *CondTy = SI->getCondition()->getType();
7122     if (!ScalarCond) {
7123       assert(!VF.isScalable() && "VF is assumed to be non scalable.");
7124       CondTy = VectorType::get(CondTy, VF);
7125     }
7126     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy,
7127                                   CmpInst::BAD_ICMP_PREDICATE, CostKind, I);
7128   }
7129   case Instruction::ICmp:
7130   case Instruction::FCmp: {
7131     Type *ValTy = I->getOperand(0)->getType();
7132     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
7133     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
7134       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
7135     VectorTy = ToVectorTy(ValTy, VF);
7136     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
7137                                   CmpInst::BAD_ICMP_PREDICATE, CostKind, I);
7138   }
7139   case Instruction::Store:
7140   case Instruction::Load: {
7141     ElementCount Width = VF;
7142     if (Width.isVector()) {
7143       InstWidening Decision = getWideningDecision(I, Width);
7144       assert(Decision != CM_Unknown &&
7145              "CM decision should be taken at this point");
7146       if (Decision == CM_Scalarize)
7147         Width = ElementCount::getFixed(1);
7148     }
7149     VectorTy = ToVectorTy(getMemInstValueType(I), Width);
7150     return getMemoryInstructionCost(I, VF);
7151   }
7152   case Instruction::ZExt:
7153   case Instruction::SExt:
7154   case Instruction::FPToUI:
7155   case Instruction::FPToSI:
7156   case Instruction::FPExt:
7157   case Instruction::PtrToInt:
7158   case Instruction::IntToPtr:
7159   case Instruction::SIToFP:
7160   case Instruction::UIToFP:
7161   case Instruction::Trunc:
7162   case Instruction::FPTrunc:
7163   case Instruction::BitCast: {
7164     // Computes the CastContextHint from a Load/Store instruction.
7165     auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
7166       assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7167              "Expected a load or a store!");
7168 
7169       if (VF.isScalar() || !TheLoop->contains(I))
7170         return TTI::CastContextHint::Normal;
7171 
7172       switch (getWideningDecision(I, VF)) {
7173       case LoopVectorizationCostModel::CM_GatherScatter:
7174         return TTI::CastContextHint::GatherScatter;
7175       case LoopVectorizationCostModel::CM_Interleave:
7176         return TTI::CastContextHint::Interleave;
7177       case LoopVectorizationCostModel::CM_Scalarize:
7178       case LoopVectorizationCostModel::CM_Widen:
7179         return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
7180                                         : TTI::CastContextHint::Normal;
7181       case LoopVectorizationCostModel::CM_Widen_Reverse:
7182         return TTI::CastContextHint::Reversed;
7183       case LoopVectorizationCostModel::CM_Unknown:
7184         llvm_unreachable("Instr did not go through cost modelling?");
7185       }
7186 
7187       llvm_unreachable("Unhandled case!");
7188     };
7189 
7190     unsigned Opcode = I->getOpcode();
7191     TTI::CastContextHint CCH = TTI::CastContextHint::None;
7192     // For Trunc, the context is the only user, which must be a StoreInst.
7193     if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
7194       if (I->hasOneUse())
7195         if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
7196           CCH = ComputeCCH(Store);
7197     }
7198     // For ZExt/SExt/FPExt, the context is the operand, if it is a LoadInst.
7199     else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
7200              Opcode == Instruction::FPExt) {
7201       if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
7202         CCH = ComputeCCH(Load);
7203     }
7204 
7205     // We optimize the truncation of induction variables having constant
7206     // integer steps. The cost of these truncations is the same as the scalar
7207     // operation.
7208     if (isOptimizableIVTruncate(I, VF)) {
7209       auto *Trunc = cast<TruncInst>(I);
7210       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
7211                                   Trunc->getSrcTy(), CCH, CostKind, Trunc);
7212     }
7213 
7214     Type *SrcScalarTy = I->getOperand(0)->getType();
7215     Type *SrcVecTy =
7216         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
7217     if (canTruncateToMinimalBitwidth(I, VF)) {
7218       // This cast is going to be shrunk. This may remove the cast or it might
7219       // turn it into a slightly different cast. For example, if MinBW == 16,
7220       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
7221       //
7222       // Calculate the modified src and dest types.
7223       Type *MinVecTy = VectorTy;
7224       if (Opcode == Instruction::Trunc) {
7225         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
7226         VectorTy =
7227             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7228       } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
7229         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
7230         VectorTy =
7231             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7232       }
7233     }
7234 
7235     assert(!VF.isScalable() && "VF is assumed to be non scalable");
7236     unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
7237     return N *
7238            TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
7239   }
7240   case Instruction::Call: {
7241     bool NeedToScalarize;
7242     CallInst *CI = cast<CallInst>(I);
7243     unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
7244     if (getVectorIntrinsicIDForCall(CI, TLI))
7245       return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
7246     return CallCost;
7247   }
7248   case Instruction::ExtractValue: {
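         // Cost ExtractValue via the generic TTI query; the cost must be valid
         // so that it can be unwrapped to a plain value below.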
7249     InstructionCost ExtractCost =
7250         TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
7251     assert(ExtractCost.isValid() && "Invalid cost for ExtractValue");
7252     return *(ExtractCost.getValue());
7253   }
7254   default:
7255     // The cost of executing VF copies of the scalar instruction. This opcode
7256     // is unknown. Assume that it is the same as 'mul'.
7257     return VF.getKnownMinValue() * TTI.getArithmeticInstrCost(
7258                                        Instruction::Mul, VectorTy, CostKind) +
7259            getScalarizationOverhead(I, VF);
7260   } // end of switch.
7261 }
7262 
7263 char LoopVectorize::ID = 0;
7264 
7265 static const char lv_name[] = "Loop Vectorization";
7266 
7267 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
7268 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
7269 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
7270 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
7271 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
7272 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
7273 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
7274 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
7275 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
7276 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
7277 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
7278 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
7279 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
7280 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
7281 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
7282 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
7283 
7284 namespace llvm {
7285 
7286 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
7287 
7288 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
7289                               bool VectorizeOnlyWhenForced) {
7290   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
7291 }
7292 
7293 } // end namespace llvm
7294 
7295 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
7296   // Check if the pointer operand of a load or store instruction is
7297   // consecutive.
7298   if (auto *Ptr = getLoadStorePointerOperand(Inst))
7299     return Legal->isConsecutivePtr(Ptr);
7300   return false;
7301 }
7302 
7303 void LoopVectorizationCostModel::collectValuesToIgnore() {
7304   // Ignore ephemeral values.
7305   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
7306 
7307   // Ignore type-promoting instructions we identified during reduction
7308   // detection.
7309   for (auto &Reduction : Legal->getReductionVars()) {
7310     RecurrenceDescriptor &RedDes = Reduction.second;
7311     const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
7312     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7313   }
7314   // Ignore type-casting instructions we identified during induction
7315   // detection.
7316   for (auto &Induction : Legal->getInductionVars()) {
7317     InductionDescriptor &IndDes = Induction.second;
7318     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7319     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7320   }
7321 }
7322 
7323 void LoopVectorizationCostModel::collectInLoopReductions() {
7324   for (auto &Reduction : Legal->getReductionVars()) {
7325     PHINode *Phi = Reduction.first;
7326     RecurrenceDescriptor &RdxDesc = Reduction.second;
7327 
7328     // We don't collect reductions that are type promoted (yet).
7329     if (RdxDesc.getRecurrenceType() != Phi->getType())
7330       continue;
7331 
7332     // If the target would prefer this reduction to happen "in-loop", then we
7333     // want to record it as such.
7334     unsigned Opcode = RdxDesc.getRecurrenceBinOp();
7335     if (!PreferInLoopReductions &&
7336         !TTI.preferInLoopReduction(Opcode, Phi->getType(),
7337                                    TargetTransformInfo::ReductionFlags()))
7338       continue;
7339 
7340     // Check that we can correctly put the reductions into the loop, by
7341     // finding the chain of operations that leads from the phi to the loop
7342     // exit value.
7343     SmallVector<Instruction *, 4> ReductionOperations =
7344         RdxDesc.getReductionOpChain(Phi, TheLoop);
7345     bool InLoop = !ReductionOperations.empty();
7346     if (InLoop)
7347       InLoopReductionChains[Phi] = ReductionOperations;
7348     LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7349                       << " reduction for phi: " << *Phi << "\n");
7350   }
7351 }
7352 
7353 // TODO: we could return a pair of values that specify the max VF and
7354 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
7355 // `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment
7356 // doesn't have a cost model that can choose which plan to execute if
7357 // more than one is generated.
7358 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
7359                                  LoopVectorizationCostModel &CM) {
7360   unsigned WidestType;
7361   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
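       // E.g. (illustrative) a 256-bit vector register with a widest in-loop
       // type of 32 bits gives a VF of 8.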
7362   return WidestVectorRegBits / WidestType;
7363 }
7364 
7365 VectorizationFactor
7366 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
7367   assert(!UserVF.isScalable() && "scalable vectors not yet supported");
7368   ElementCount VF = UserVF;
7369   // Outer loop handling: outer loops may require CFG and instruction-level
7370   // transformations before even evaluating whether vectorization is profitable.
7371   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7372   // the vectorization pipeline.
7373   if (!OrigLoop->isInnermost()) {
7374     // If the user doesn't provide a vectorization factor, determine a
7375     // reasonable one.
7376     if (UserVF.isZero()) {
7377       VF = ElementCount::getFixed(
7378           determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM));
7379       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7380 
7381       // Make sure we have a VF > 1 for stress testing.
7382       if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7383         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7384                           << "overriding computed VF.\n");
7385         VF = ElementCount::getFixed(4);
7386       }
7387     }
7388     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7389     assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7390            "VF needs to be a power of two");
7391     LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7392                       << "VF " << VF << " to build VPlans.\n");
7393     buildVPlans(VF, VF);
7394 
7395     // For VPlan build stress testing, we bail out after VPlan construction.
7396     if (VPlanBuildStressTest)
7397       return VectorizationFactor::Disabled();
7398 
7399     return {VF, 0 /*Cost*/};
7400   }
7401 
7402   LLVM_DEBUG(
7403       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7404                 "VPlan-native path.\n");
7405   return VectorizationFactor::Disabled();
7406 }
7407 
7408 Optional<VectorizationFactor>
7409 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7410   assert(OrigLoop->isInnermost() && "Inner loop expected.");
7411   Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC);
7412   if (!MaybeMaxVF) // Cases that should not be vectorized or interleaved.
7413     return None;
7414 
7415   // Invalidate interleave groups if all blocks of loop will be predicated.
7416   if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
7417       !useMaskedInterleavedAccesses(*TTI)) {
7418     LLVM_DEBUG(
7419         dbgs()
7420         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7421            "which requires masked-interleaved support.\n");
7422     if (CM.InterleaveInfo.invalidateGroups())
7423       // Invalidating interleave groups also requires invalidating all decisions
7424       // based on them, which includes widening decisions and uniform and scalar
7425       // values.
7426       CM.invalidateCostModelingDecisions();
7427   }
7428 
7429   ElementCount MaxVF = MaybeMaxVF.getValue();
7430   assert(MaxVF.isNonZero() && "MaxVF is zero.");
7431 
7432   if (!UserVF.isZero() && ElementCount::isKnownLE(UserVF, MaxVF)) {
7433     LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7434     assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
7435            "VF needs to be a power of two");
7436     // Collect the instructions (and their associated costs) that will be more
7437     // profitable to scalarize.
7438     CM.selectUserVectorizationFactor(UserVF);
7439     CM.collectInLoopReductions();
7440     buildVPlansWithVPRecipes(UserVF, UserVF);
7441     LLVM_DEBUG(printPlans(dbgs()));
7442     return {{UserVF, 0}};
7443   }
7444 
7445   assert(!MaxVF.isScalable() &&
7446          "Scalable vectors not yet supported beyond this point");
7447 
7448   for (ElementCount VF = ElementCount::getFixed(1);
7449        ElementCount::isKnownLE(VF, MaxVF); VF *= 2) {
7450     // Collect Uniform and Scalar instructions after vectorization with VF.
7451     CM.collectUniformsAndScalars(VF);
7452 
7453     // Collect the instructions (and their associated costs) that will be more
7454     // profitable to scalarize.
7455     if (VF.isVector())
7456       CM.collectInstsToScalarize(VF);
7457   }
7458 
7459   CM.collectInLoopReductions();
7460 
7461   buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF);
7462   LLVM_DEBUG(printPlans(dbgs()));
7463   if (MaxVF.isScalar())
7464     return VectorizationFactor::Disabled();
7465 
7466   // Select the optimal vectorization factor.
7467   return CM.selectVectorizationFactor(MaxVF);
7468 }
7469 
7470 void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) {
7471   LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
7472                     << '\n');
7473   BestVF = VF;
7474   BestUF = UF;
7475 
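       // Keep only the VPlan(s) that can be used for the chosen VF; after
       // erasing, exactly one plan is expected to remain.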
7476   erase_if(VPlans, [VF](const VPlanPtr &Plan) {
7477     return !Plan->hasVF(VF);
7478   });
7479   assert(VPlans.size() == 1 && "Best VF has not a single VPlan.");
7480 }
7481 
7482 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
7483                                            DominatorTree *DT) {
7484   // Perform the actual loop transformation.
7485 
7486   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
7487   VPCallbackILV CallbackILV(ILV);
7488 
7489   assert(BestVF.hasValue() && "Vectorization Factor is missing");
7490 
7491   VPTransformState State{*BestVF, BestUF,      LI,
7492                          DT,      ILV.Builder, ILV.VectorLoopValueMap,
7493                          &ILV,    CallbackILV};
7494   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
7495   State.TripCount = ILV.getOrCreateTripCount(nullptr);
7496   State.CanonicalIV = ILV.Induction;
7497 
7498   ILV.printDebugTracesAtStart();
7499 
7500   //===------------------------------------------------===//
7501   //
7502   // Notice: any optimization or new instruction that goes
7503   // into the code below should also be implemented in
7504   // the cost model.
7505   //
7506   //===------------------------------------------------===//
7507 
7508   // 2. Copy and widen instructions from the old loop into the new loop.
7509   assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
7510   VPlans.front()->execute(&State);
7511 
7512   // 3. Fix the vectorized code: take care of header phi's, live-outs,
7513   //    predication, updating analyses.
7514   ILV.fixVectorizedLoop();
7515 
7516   ILV.printDebugTracesAtEnd();
7517 }
7518 
7519 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
7520     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
7521 
7522   // We create new control-flow for the vectorized loop, so the original exit
7523   // conditions will be dead after vectorization if they are only used by the
7524   // terminator.
7525   SmallVector<BasicBlock*> ExitingBlocks;
7526   OrigLoop->getExitingBlocks(ExitingBlocks);
7527   for (auto *BB : ExitingBlocks) {
7528     auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0));
7529     if (!Cmp || !Cmp->hasOneUse())
7530       continue;
7531 
7532     // TODO: we should introduce a getUniqueExitingBlocks on Loop
7533     if (!DeadInstructions.insert(Cmp).second)
7534       continue;
7535 
7536     // An operand of the icmp is often a dead trunc, used by IndUpdate.
7537     // TODO: can recurse through operands in general
7538     for (Value *Op : Cmp->operands()) {
7539       if (isa<TruncInst>(Op) && Op->hasOneUse())
7540           DeadInstructions.insert(cast<Instruction>(Op));
7541     }
7542   }
7543 
7544   // We create new "steps" for induction variable updates to which the original
7545   // induction variables map. An original update instruction will be dead if
7546   // all its users except the induction variable are dead.
7547   auto *Latch = OrigLoop->getLoopLatch();
7548   for (auto &Induction : Legal->getInductionVars()) {
7549     PHINode *Ind = Induction.first;
7550     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
7551 
7552     // If the tail is to be folded by masking, the primary induction variable,
7553     // if it exists, isn't dead: it will be used for masking. Don't kill it.
7554     if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction())
7555       continue;
7556 
7557     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
7558           return U == Ind || DeadInstructions.count(cast<Instruction>(U));
7559         }))
7560       DeadInstructions.insert(IndUpdate);
7561 
7562     // We record as "Dead" also the type-casting instructions we had identified
7563     // during induction analysis. We don't need any handling for them in the
7564     // vectorized loop because we have proven that, under a proper runtime
7565     // test guarding the vectorized loop, the value of the phi, and the casted
7566     // value of the phi, are the same. The last instruction in this casting chain
7567     // will get its scalar/vector/widened def from the scalar/vector/widened def
7568     // of the respective phi node. Any other casts in the induction def-use chain
7569     // have no other uses outside the phi update chain, and will be ignored.
7570     InductionDescriptor &IndDes = Induction.second;
7571     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7572     DeadInstructions.insert(Casts.begin(), Casts.end());
7573   }
7574 }
7575 
7576 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
7577 
7578 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
7579 
7580 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
7581                                         Instruction::BinaryOps BinOp) {
7582   // When unrolling and the VF is 1, we only need to add a simple scalar.
7583   Type *Ty = Val->getType();
7584   assert(!Ty->isVectorTy() && "Val must be a scalar");
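       // With VF == 1, StartIdx is typically the unroll part number, so
       // (illustrative) with Step == 1 the unrolled parts receive Val + 0,
       // Val + 1, Val + 2, ...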
7585 
7586   if (Ty->isFloatingPointTy()) {
7587     Constant *C = ConstantFP::get(Ty, (double)StartIdx);
7588 
7589     // Floating point operations had to be 'fast' to enable the unrolling.
7590     Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
7591     return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
7592   }
7593   Constant *C = ConstantInt::get(Ty, StartIdx);
7594   return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
7595 }
7596 
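     // Add "llvm.loop.unroll.runtime.disable" to the loop metadata of L,
     // preserving any existing operands. The resulting self-referential loop-ID
     // node looks roughly like (illustrative):
     //   !0 = distinct !{!0, ..., !1}
     //   !1 = !{!"llvm.loop.unroll.runtime.disable"}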
7597 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
7598   SmallVector<Metadata *, 4> MDs;
7599   // Reserve first location for self reference to the LoopID metadata node.
7600   MDs.push_back(nullptr);
7601   bool IsUnrollMetadata = false;
7602   MDNode *LoopID = L->getLoopID();
7603   if (LoopID) {
7604     // First find existing loop unrolling disable metadata.
7605     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
7606       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
7607       if (MD) {
7608         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7609         IsUnrollMetadata =
7610             S && S->getString().startswith("llvm.loop.unroll.disable");
7611       }
7612       MDs.push_back(LoopID->getOperand(i));
7613     }
7614   }
7615 
7616   if (!IsUnrollMetadata) {
7617     // Add runtime unroll disable metadata.
7618     LLVMContext &Context = L->getHeader()->getContext();
7619     SmallVector<Metadata *, 1> DisableOperands;
7620     DisableOperands.push_back(
7621         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7622     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7623     MDs.push_back(DisableNode);
7624     MDNode *NewLoopID = MDNode::get(Context, MDs);
7625     // Set operand 0 to refer to the loop id itself.
7626     NewLoopID->replaceOperandWith(0, NewLoopID);
7627     L->setLoopID(NewLoopID);
7628   }
7629 }
7630 
7631 //===--------------------------------------------------------------------===//
7632 // EpilogueVectorizerMainLoop
7633 //===--------------------------------------------------------------------===//
7634 
7635 /// This function is partially responsible for generating the control flow
7636 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7637 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
7638   MDNode *OrigLoopID = OrigLoop->getLoopID();
7639   Loop *Lp = createVectorLoopSkeleton("");
7640 
7641   // Generate the code to check the minimum iteration count of the vector
7642   // epilogue (see below).
7643   EPI.EpilogueIterationCountCheck =
7644       emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true);
7645   EPI.EpilogueIterationCountCheck->setName("iter.check");
7646 
7647   // Generate the code to check any assumptions that we've made for SCEV
7648   // expressions.
7649   BasicBlock *SavedPreHeader = LoopVectorPreHeader;
7650   emitSCEVChecks(Lp, LoopScalarPreHeader);
7651 
7652   // If a safety check was generated, save it.
7653   if (SavedPreHeader != LoopVectorPreHeader)
7654     EPI.SCEVSafetyCheck = SavedPreHeader;
7655 
7656   // Generate the code that checks at runtime if arrays overlap. We put the
7657   // checks into a separate block to make the more common case of few elements
7658   // faster.
7659   SavedPreHeader = LoopVectorPreHeader;
7660   emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
7661 
7662   // If a safety check was generated, save/overwrite it.
7663   if (SavedPreHeader != LoopVectorPreHeader)
7664     EPI.MemSafetyCheck = SavedPreHeader;
7665 
7666   // Generate the iteration count check for the main loop, *after* the check
7667   // for the epilogue loop, so that the path-length is shorter for the case
7668   // that goes directly through the vector epilogue. The longer-path length for
7669   // the main loop is compensated for, by the gain from vectorizing the larger
7670   // trip count. Note: the branch will get updated later on when we vectorize
7671   // the epilogue.
7672   EPI.MainLoopIterationCountCheck =
7673       emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false);
7674 
7675   // Generate the induction variable.
7676   OldInduction = Legal->getPrimaryInduction();
7677   Type *IdxTy = Legal->getWidestInductionType();
7678   Value *StartIdx = ConstantInt::get(IdxTy, 0);
7679   Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
7680   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
7681   EPI.VectorTripCount = CountRoundDown;
7682   Induction =
7683       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
7684                               getDebugLocFromInstOrOperands(OldInduction));
7685 
7686   // Skip creating induction resume values here; they will be created in the
7687   // second pass. If we created them here, they wouldn't be used anyway,
7688   // because the VPlan in the second pass still contains the inductions from
7689   // the original loop.
7690 
7691   return completeLoopSkeleton(Lp, OrigLoopID);
7692 }
7693 
7694 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
7695   LLVM_DEBUG({
7696     dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7697            << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue()
7698            << ", Main Loop UF:" << EPI.MainLoopUF
7699            << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue()
7700            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7701   });
7702 }
7703 
7704 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
7705   DEBUG_WITH_TYPE(VerboseDebug, {
7706     dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n";
7707   });
7708 }
7709 
7710 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck(
7711     Loop *L, BasicBlock *Bypass, bool ForEpilogue) {
7712   assert(L && "Expected valid Loop.");
7713   assert(Bypass && "Expected valid bypass basic block.");
7714   unsigned VFactor =
7715       ForEpilogue ? EPI.EpilogueVF.getKnownMinValue() : VF.getKnownMinValue();
7716   unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
7717   Value *Count = getOrCreateTripCount(L);
7718   // Reuse existing vector loop preheader for TC checks.
7719   // Note that new preheader block is generated for vector loop.
7720   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
7721   IRBuilder<> Builder(TCCheckBlock->getTerminator());
7722 
7723   // Generate code to check if the loop's trip count is less than VF * UF of the
7724   // main vector loop.
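       // When a scalar epilogue is required, at least one iteration must be
       // left for it, hence the ULE comparison below; otherwise ULT suffices.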
7725   auto P =
7726       Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
7727 
7728   Value *CheckMinIters = Builder.CreateICmp(
7729       P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor),
7730       "min.iters.check");
7731 
7732   if (!ForEpilogue)
7733     TCCheckBlock->setName("vector.main.loop.iter.check");
7734 
7735   // Create new preheader for vector loop.
7736   LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
7737                                    DT, LI, nullptr, "vector.ph");
7738 
7739   if (ForEpilogue) {
7740     assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
7741                                  DT->getNode(Bypass)->getIDom()) &&
7742            "TC check is expected to dominate Bypass");
7743 
7744     // Update dominator for Bypass & LoopExit.
7745     DT->changeImmediateDominator(Bypass, TCCheckBlock);
7746     DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
7747 
7748     LoopBypassBlocks.push_back(TCCheckBlock);
7749 
7750     // Save the trip count so we don't have to regenerate it in the
7751     // vec.epilog.iter.check. This is safe to do because the trip count
7752     // generated here dominates the vector epilog iter check.
7753     EPI.TripCount = Count;
7754   }
7755 
7756   ReplaceInstWithInst(
7757       TCCheckBlock->getTerminator(),
7758       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
7759 
7760   return TCCheckBlock;
7761 }
7762 
7763 //===--------------------------------------------------------------------===//
7764 // EpilogueVectorizerEpilogueLoop
7765 //===--------------------------------------------------------------------===//
7766 
7767 /// This function is partially responsible for generating the control flow
7768 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7769 BasicBlock *
7770 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
7771   MDNode *OrigLoopID = OrigLoop->getLoopID();
7772   Loop *Lp = createVectorLoopSkeleton("vec.epilog.");
7773 
7774   // Now, compare the remaining count and if there aren't enough iterations to
7775   // execute the vectorized epilogue, skip to the scalar part.
7776   BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
7777   VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check");
7778   LoopVectorPreHeader =
7779       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
7780                  LI, nullptr, "vec.epilog.ph");
7781   emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader,
7782                                           VecEpilogueIterationCountCheck);
7783 
7784   // Adjust the control flow taking the state info from the main loop
7785   // vectorization into account.
7786   assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
7787          "expected this to be saved from the previous pass.");
7788   EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
7789       VecEpilogueIterationCountCheck, LoopVectorPreHeader);
7790 
7791   DT->changeImmediateDominator(LoopVectorPreHeader,
7792                                EPI.MainLoopIterationCountCheck);
7793 
7794   EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
7795       VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7796 
7797   if (EPI.SCEVSafetyCheck)
7798     EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
7799         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7800   if (EPI.MemSafetyCheck)
7801     EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
7802         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7803 
7804   DT->changeImmediateDominator(
7805       VecEpilogueIterationCountCheck,
7806       VecEpilogueIterationCountCheck->getSinglePredecessor());
7807 
7808   DT->changeImmediateDominator(LoopScalarPreHeader,
7809                                EPI.EpilogueIterationCountCheck);
7810   DT->changeImmediateDominator(LoopExitBlock, EPI.EpilogueIterationCountCheck);
7811 
7812   // Keep track of bypass blocks, as they feed start values to the induction
7813   // phis in the scalar loop preheader.
7814   if (EPI.SCEVSafetyCheck)
7815     LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
7816   if (EPI.MemSafetyCheck)
7817     LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
7818   LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
7819 
7820   // Generate a resume induction for the vector epilogue and put it in the
7821   // vector epilogue preheader.
7822   Type *IdxTy = Legal->getWidestInductionType();
7823   PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
7824                                          LoopVectorPreHeader->getFirstNonPHI());
7825   EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
7826   EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
7827                            EPI.MainLoopIterationCountCheck);
7828 
7829   // Generate the induction variable.
7830   OldInduction = Legal->getPrimaryInduction();
7831   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
7832   Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
7833   Value *StartIdx = EPResumeVal;
7834   Induction =
7835       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
7836                               getDebugLocFromInstOrOperands(OldInduction));
7837 
7838   // Generate induction resume values. These variables save the new starting
7839   // indexes for the scalar loop. They are used to test if there are any tail
7840   // iterations left once the vector loop has completed.
7841   // Note that when the vectorized epilogue is skipped due to iteration count
7842   // check, then the resume value for the induction variable comes from
7843   // the trip count of the main vector loop, hence passing the AdditionalBypass
7844   // argument.
7845   createInductionResumeValues(Lp, CountRoundDown,
7846                               {VecEpilogueIterationCountCheck,
7847                                EPI.VectorTripCount} /* AdditionalBypass */);
7848 
7849   AddRuntimeUnrollDisableMetaData(Lp);
7850   return completeLoopSkeleton(Lp, OrigLoopID);
7851 }
7852 
7853 BasicBlock *
7854 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
7855     Loop *L, BasicBlock *Bypass, BasicBlock *Insert) {
7856 
7857   assert(EPI.TripCount &&
7858          "Expected trip count to have been saved in the first pass.");
7859   assert(
7860       (!isa<Instruction>(EPI.TripCount) ||
7861        DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
7862       "saved trip count does not dominate insertion point.");
7863   Value *TC = EPI.TripCount;
7864   IRBuilder<> Builder(Insert->getTerminator());
7865   Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
7866 
7867   // Generate code to check if the loop's trip count is less than VF * UF of the
7868   // vector epilogue loop.
7869   auto P =
7870       Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
7871 
7872   Value *CheckMinIters = Builder.CreateICmp(
7873       P, Count,
7874       ConstantInt::get(Count->getType(),
7875                        EPI.EpilogueVF.getKnownMinValue() * EPI.EpilogueUF),
7876       "min.epilog.iters.check");
7877 
7878   ReplaceInstWithInst(
7879       Insert->getTerminator(),
7880       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
7881 
7882   LoopBypassBlocks.push_back(Insert);
7883   return Insert;
7884 }
7885 
7886 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
7887   LLVM_DEBUG({
7888     dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
7889            << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue()
7890            << ", Main Loop UF:" << EPI.MainLoopUF
7891            << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue()
7892            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7893   });
7894 }
7895 
7896 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
7897   DEBUG_WITH_TYPE(VerboseDebug, {
7898     dbgs() << "final fn:\n" << *Induction->getFunction() << "\n";
7899   });
7900 }
7901 
7902 bool LoopVectorizationPlanner::getDecisionAndClampRange(
7903     const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
7904   assert(!Range.isEmpty() && "Trying to test an empty VF range.");
7905   bool PredicateAtRangeStart = Predicate(Range.Start);
7906 
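       // Clamp Range.End at the first power-of-two VF whose decision differs
       // from the decision at Range.Start. E.g. (illustrative) for
       // Range = [2, 16) and a predicate that holds for VF 2 and 4 but not for
       // 8, Range.End becomes 8 and true is returned.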
7907   for (ElementCount TmpVF = Range.Start * 2;
7908        ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2)
7909     if (Predicate(TmpVF) != PredicateAtRangeStart) {
7910       Range.End = TmpVF;
7911       break;
7912     }
7913 
7914   return PredicateAtRangeStart;
7915 }
7916 
7917 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
7918 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
7919 /// of VF's starting at a given VF and extending it as much as possible. Each
7920 /// vectorization decision can potentially shorten this sub-range during
7921 /// buildVPlan().
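     /// E.g. (illustrative) for MinVF = 1 and MaxVF = 8, successive calls to
     /// buildVPlan may yield plans covering the sub-ranges [1, 2), [2, 8) and
     /// [8, 9), depending on how far each call clamps SubRange.End.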
7922 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
7923                                            ElementCount MaxVF) {
7924   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
7925   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
7926     VFRange SubRange = {VF, MaxVFPlusOne};
7927     VPlans.push_back(buildVPlan(SubRange));
7928     VF = SubRange.End;
7929   }
7930 }
7931 
7932 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
7933                                          VPlanPtr &Plan) {
7934   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
7935 
7936   // Look for cached value.
7937   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
7938   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
7939   if (ECEntryIt != EdgeMaskCache.end())
7940     return ECEntryIt->second;
7941 
7942   VPValue *SrcMask = createBlockInMask(Src, Plan);
7943 
7944   // The terminator has to be a branch inst!
7945   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
7946   assert(BI && "Unexpected terminator found");
7947 
7948   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
7949     return EdgeMaskCache[Edge] = SrcMask;
7950 
7951   // If source is an exiting block, we know the exit edge is dynamically dead
7952   // in the vector loop, and thus we don't need to restrict the mask.  Avoid
7953   // adding uses of an otherwise potentially dead instruction.
7954   if (OrigLoop->isLoopExiting(Src))
7955     return EdgeMaskCache[Edge] = SrcMask;
7956 
7957   VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition());
7958   assert(EdgeMask && "No Edge Mask found for condition");
7959 
7960   if (BI->getSuccessor(0) != Dst)
7961     EdgeMask = Builder.createNot(EdgeMask);
7962 
7963   if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
7964     EdgeMask = Builder.createAnd(EdgeMask, SrcMask);
7965 
7966   return EdgeMaskCache[Edge] = EdgeMask;
7967 }
7968 
7969 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
7970   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
7971 
7972   // Look for cached value.
7973   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
7974   if (BCEntryIt != BlockMaskCache.end())
7975     return BCEntryIt->second;
7976 
7977   // All-one mask is modelled as no-mask following the convention for masked
7978   // load/store/gather/scatter. Initialize BlockMask to no-mask.
7979   VPValue *BlockMask = nullptr;
7980 
7981   if (OrigLoop->getHeader() == BB) {
7982     if (!CM.blockNeedsPredication(BB))
7983       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
7984 
7985     // Create the block in mask as the first non-phi instruction in the block.
7986     VPBuilder::InsertPointGuard Guard(Builder);
7987     auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi();
7988     Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint);
7989 
7990     // Introduce the early-exit compare IV <= BTC to form header block mask.
7991     // This is used instead of IV < TC because TC may wrap, unlike BTC.
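         // E.g. (illustrative) with a trip count of 10 and VF = 4, BTC is 9 and
         // the first header mask compares lanes {0, 1, 2, 3} <= 9.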
7992     // Start by constructing the desired canonical IV.
7993     VPValue *IV = nullptr;
7994     if (Legal->getPrimaryInduction())
7995       IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction());
7996     else {
7997       auto IVRecipe = new VPWidenCanonicalIVRecipe();
7998       Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint);
7999       IV = IVRecipe->getVPValue();
8000     }
8001     VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
8002     bool TailFolded = !CM.isScalarEpilogueAllowed();
8003 
8004     if (TailFolded && CM.TTI.emitGetActiveLaneMask()) {
8005       // While ActiveLaneMask is a binary op that consumes the loop tripcount
8006       // as a second argument, we only pass the IV here and extract the
8007       // tripcount from the transform state, where codegen of the VP
8008       // instructions happens.
8009       BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV});
8010     } else {
8011       BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
8012     }
8013     return BlockMaskCache[BB] = BlockMask;
8014   }
8015 
8016   // This is the block mask. We OR all incoming edges.
8017   for (auto *Predecessor : predecessors(BB)) {
8018     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
8019     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
8020       return BlockMaskCache[BB] = EdgeMask;
8021 
8022     if (!BlockMask) { // BlockMask has its initialized nullptr value.
8023       BlockMask = EdgeMask;
8024       continue;
8025     }
8026 
8027     BlockMask = Builder.createOr(BlockMask, EdgeMask);
8028   }
8029 
8030   return BlockMaskCache[BB] = BlockMask;
8031 }
8032 
8033 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
8034                                                 VPlanPtr &Plan) {
8035   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8036          "Must be called with either a load or store");
8037 
8038   auto willWiden = [&](ElementCount VF) -> bool {
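         // A memory access is widened unless the VF is scalar, the access is
         // scalar after vectorization, scalarization is more profitable, or the
         // cost model chose to scalarize it; members of an interleave group are
         // always widened via an interleave recipe.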
8039     if (VF.isScalar())
8040       return false;
8041     LoopVectorizationCostModel::InstWidening Decision =
8042         CM.getWideningDecision(I, VF);
8043     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
8044            "CM decision should be taken at this point.");
8045     if (Decision == LoopVectorizationCostModel::CM_Interleave)
8046       return true;
8047     if (CM.isScalarAfterVectorization(I, VF) ||
8048         CM.isProfitableToScalarize(I, VF))
8049       return false;
8050     return Decision != LoopVectorizationCostModel::CM_Scalarize;
8051   };
8052 
8053   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8054     return nullptr;
8055 
8056   VPValue *Mask = nullptr;
8057   if (Legal->isMaskRequired(I))
8058     Mask = createBlockInMask(I->getParent(), Plan);
8059 
8060   VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I));
8061   if (LoadInst *Load = dyn_cast<LoadInst>(I))
8062     return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask);
8063 
8064   StoreInst *Store = cast<StoreInst>(I);
8065   VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand());
8066   return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask);
8067 }
8068 
8069 VPWidenIntOrFpInductionRecipe *
8070 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi) const {
8071   // Check if this is an integer or fp induction. If so, build the recipe that
8072   // produces its scalar and vector values.
8073   InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
8074   if (II.getKind() == InductionDescriptor::IK_IntInduction ||
8075       II.getKind() == InductionDescriptor::IK_FpInduction)
8076     return new VPWidenIntOrFpInductionRecipe(Phi);
8077 
8078   return nullptr;
8079 }
8080 
8081 VPWidenIntOrFpInductionRecipe *
8082 VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I,
8083                                                 VFRange &Range) const {
8084   // Optimize the special case where the source is a constant integer
8085   // induction variable. Notice that we can only optimize the 'trunc' case
8086   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8087   // (c) other casts depend on pointer size.
8088 
8089   // Determine whether \p K is a truncation based on an induction variable that
8090   // can be optimized.
8091   auto isOptimizableIVTruncate =
8092       [&](Instruction *K) -> std::function<bool(ElementCount)> {
8093     return [=](ElementCount VF) -> bool {
8094       return CM.isOptimizableIVTruncate(K, VF);
8095     };
8096   };
8097 
8098   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8099           isOptimizableIVTruncate(I), Range))
8100     return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
8101                                              I);
8102   return nullptr;
8103 }
8104 
8105 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) {
8106   // We know that all PHIs in non-header blocks are converted into selects, so
8107   // we don't have to worry about the insertion order and we can just use the
8108   // builder. At this point we generate the predication tree. There may be
8109   // duplications since this is a simple recursive scan, but future
8110   // optimizations will clean it up.
8111 
8112   SmallVector<VPValue *, 2> Operands;
8113   unsigned NumIncoming = Phi->getNumIncomingValues();
8114   for (unsigned In = 0; In < NumIncoming; In++) {
8115     VPValue *EdgeMask =
8116       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
8117     assert((EdgeMask || NumIncoming == 1) &&
8118            "Multiple predecessors with one having a full mask");
8119     Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In)));
8120     if (EdgeMask)
8121       Operands.push_back(EdgeMask);
8122   }
8123   return new VPBlendRecipe(Phi, Operands);
8124 }
8125 
8126 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range,
8127                                                    VPlan &Plan) const {
8128 
8129   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8130       [this, CI](ElementCount VF) {
8131         return CM.isScalarWithPredication(CI, VF);
8132       },
8133       Range);
8134 
8135   if (IsPredicated)
8136     return nullptr;
8137 
8138   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8139   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8140              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8141              ID == Intrinsic::pseudoprobe))
8142     return nullptr;
8143 
8144   auto willWiden = [&](ElementCount VF) -> bool {
8145     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8146     // The following case may be scalarized depending on the VF.
8147     // The flag shows whether we use an intrinsic or a usual call for the
8148     // vectorized version of the instruction.
8149     // Is it beneficial to perform the intrinsic call compared to the lib call?
8150     bool NeedToScalarize = false;
8151     unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
8152     bool UseVectorIntrinsic =
8153         ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
8154     return UseVectorIntrinsic || !NeedToScalarize;
8155   };
8156 
8157   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8158     return nullptr;
8159 
8160   return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands()));
8161 }
8162 
8163 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8164   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8165          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8166   // Instruction should be widened, unless it is scalar after vectorization,
8167   // scalarization is profitable or it is predicated.
8168   auto WillScalarize = [this, I](ElementCount VF) -> bool {
8169     return CM.isScalarAfterVectorization(I, VF) ||
8170            CM.isProfitableToScalarize(I, VF) ||
8171            CM.isScalarWithPredication(I, VF);
8172   };
8173   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
8174                                                              Range);
8175 }
8176 
8177 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const {
8178   auto IsVectorizableOpcode = [](unsigned Opcode) {
8179     switch (Opcode) {
8180     case Instruction::Add:
8181     case Instruction::And:
8182     case Instruction::AShr:
8183     case Instruction::BitCast:
8184     case Instruction::FAdd:
8185     case Instruction::FCmp:
8186     case Instruction::FDiv:
8187     case Instruction::FMul:
8188     case Instruction::FNeg:
8189     case Instruction::FPExt:
8190     case Instruction::FPToSI:
8191     case Instruction::FPToUI:
8192     case Instruction::FPTrunc:
8193     case Instruction::FRem:
8194     case Instruction::FSub:
8195     case Instruction::ICmp:
8196     case Instruction::IntToPtr:
8197     case Instruction::LShr:
8198     case Instruction::Mul:
8199     case Instruction::Or:
8200     case Instruction::PtrToInt:
8201     case Instruction::SDiv:
8202     case Instruction::Select:
8203     case Instruction::SExt:
8204     case Instruction::Shl:
8205     case Instruction::SIToFP:
8206     case Instruction::SRem:
8207     case Instruction::Sub:
8208     case Instruction::Trunc:
8209     case Instruction::UDiv:
8210     case Instruction::UIToFP:
8211     case Instruction::URem:
8212     case Instruction::Xor:
8213     case Instruction::ZExt:
8214       return true;
8215     }
8216     return false;
8217   };
8218 
8219   if (!IsVectorizableOpcode(I->getOpcode()))
8220     return nullptr;
8221 
8222   // Success: widen this instruction.
8223   return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands()));
8224 }
8225 
8226 VPBasicBlock *VPRecipeBuilder::handleReplication(
8227     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
8228     DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
8229     VPlanPtr &Plan) {
8230   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
8231       [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8232       Range);
8233 
8234   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8235       [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); },
8236       Range);
8237 
8238   auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
8239                                        IsUniform, IsPredicated);
8240   setRecipe(I, Recipe);
8241   Plan->addVPValue(I, Recipe);
8242 
8243   // Find if I uses a predicated instruction. If so, it will use its scalar
8244   // value. Avoid hoisting the insert-element which packs the scalar value into
8245   // a vector value, as that happens iff all users use the vector value.
8246   for (auto &Op : I->operands())
8247     if (auto *PredInst = dyn_cast<Instruction>(Op))
8248       if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
8249         PredInst2Recipe[PredInst]->setAlsoPack(false);
8250 
8251   // Finalize the recipe for Instr, first if it is not predicated.
8252   if (!IsPredicated) {
8253     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8254     VPBB->appendRecipe(Recipe);
8255     return VPBB;
8256   }
8257   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8258   assert(VPBB->getSuccessors().empty() &&
8259          "VPBB has successors when handling predicated replication.");
8260   // Record predicated instructions for above packing optimizations.
8261   PredInst2Recipe[I] = Recipe;
8262   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
8263   VPBlockUtils::insertBlockAfter(Region, VPBB);
8264   auto *RegSucc = new VPBasicBlock();
8265   VPBlockUtils::insertBlockAfter(RegSucc, Region);
8266   return RegSucc;
8267 }
8268 
8269 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
8270                                                       VPRecipeBase *PredRecipe,
8271                                                       VPlanPtr &Plan) {
8272   // Instructions marked for predication are replicated and placed under an
8273   // if-then construct to prevent side-effects.
8274 
8275   // Generate recipes to compute the block mask for this region.
8276   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
8277 
8278   // Build the triangular if-then region.
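       // (Illustrative) the region has the shape:
       //
       //     pred.<opcode>.entry       (branch on BlockInMask)
       //       |          \
       //       |      pred.<opcode>.if (the replicated instruction)
       //       |          /
       //     pred.<opcode>.continue    (phi of the predicated value, if any)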
8279   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
8280   assert(Instr->getParent() && "Predicated instruction not in any basic block");
8281   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
8282   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
8283   auto *PHIRecipe = Instr->getType()->isVoidTy()
8284                         ? nullptr
8285                         : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr));
8286   auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
8287   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
8288   VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
8289 
8290   // Note: first set Entry as region entry and then connect successors starting
8291   // from it in order, to propagate the "parent" of each VPBasicBlock.
8292   VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
8293   VPBlockUtils::connectBlocks(Pred, Exit);
8294 
8295   return Region;
8296 }
8297 
8298 VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8299                                                       VFRange &Range,
8300                                                       VPlanPtr &Plan) {
8301   // First, check for specific widening recipes that deal with calls, memory
8302   // operations, inductions and Phi nodes.
8303   if (auto *CI = dyn_cast<CallInst>(Instr))
8304     return tryToWidenCall(CI, Range, *Plan);
8305 
8306   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8307     return tryToWidenMemory(Instr, Range, Plan);
8308 
8309   VPRecipeBase *Recipe;
8310   if (auto Phi = dyn_cast<PHINode>(Instr)) {
8311     if (Phi->getParent() != OrigLoop->getHeader())
8312       return tryToBlend(Phi, Plan);
8313     if ((Recipe = tryToOptimizeInductionPHI(Phi)))
8314       return Recipe;
8315     return new VPWidenPHIRecipe(Phi);
8316   }
8317 
8318   if (isa<TruncInst>(Instr) &&
8319       (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Range)))
8320     return Recipe;
8321 
8322   if (!shouldWiden(Instr, Range))
8323     return nullptr;
8324 
8325   if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8326     return new VPWidenGEPRecipe(GEP, Plan->mapToVPValues(GEP->operands()),
8327                                 OrigLoop);
8328 
8329   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8330     bool InvariantCond =
8331         PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
8332     return new VPWidenSelectRecipe(*SI, Plan->mapToVPValues(SI->operands()),
8333                                    InvariantCond);
8334   }
8335 
8336   return tryToWiden(Instr, *Plan);
8337 }
8338 
8339 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8340                                                         ElementCount MaxVF) {
8341   assert(OrigLoop->isInnermost() && "Inner loop expected.");
8342 
8343   // Collect instructions from the original loop that will become trivially dead
8344   // in the vectorized loop. We don't need to vectorize these instructions. For
8345   // example, original induction update instructions can become dead because we
8346   // separately emit induction "steps" when generating code for the new loop.
8347   // Similarly, we create a new latch condition when setting up the structure
8348   // of the new loop, so the old one can become dead.
8349   SmallPtrSet<Instruction *, 4> DeadInstructions;
8350   collectTriviallyDeadInstructions(DeadInstructions);
8351 
8352   // Add assume instructions we need to drop to DeadInstructions, to prevent
8353   // them from being added to the VPlan.
8354   // TODO: We only need to drop assumes in blocks that get flattened. If the
8355   // control flow is preserved, we should keep them.
8356   auto &ConditionalAssumes = Legal->getConditionalAssumes();
8357   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
8358 
8359   DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
8360   // Dead instructions do not need sinking. Remove them from SinkAfter.
8361   for (Instruction *I : DeadInstructions)
8362     SinkAfter.erase(I);
8363 
8364   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8365   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8366     VFRange SubRange = {VF, MaxVFPlusOne};
8367     VPlans.push_back(
8368         buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
8369     VF = SubRange.End;
8370   }
8371 }
8372 
8373 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
8374     VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
8375     const DenseMap<Instruction *, Instruction *> &SinkAfter) {
8376 
8377   // Hold a mapping from predicated instructions to their recipes, in order to
8378   // fix their AlsoPack behavior if a user is determined to replicate and use a
8379   // scalar instead of vector value.
8380   DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
8381 
8382   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8383 
8384   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
8385 
8386   // ---------------------------------------------------------------------------
8387   // Pre-construction: record ingredients whose recipes we'll need to further
8388   // process after constructing the initial VPlan.
8389   // ---------------------------------------------------------------------------
8390 
8391   // Mark instructions we'll need to sink later and their targets as
8392   // ingredients whose recipe we'll need to record.
8393   for (auto &Entry : SinkAfter) {
8394     RecipeBuilder.recordRecipeOf(Entry.first);
8395     RecipeBuilder.recordRecipeOf(Entry.second);
8396   }
8397   for (auto &Reduction : CM.getInLoopReductionChains()) {
8398     PHINode *Phi = Reduction.first;
8399     RecurrenceDescriptor::RecurrenceKind Kind =
8400         Legal->getReductionVars()[Phi].getRecurrenceKind();
8401     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
8402 
8403     RecipeBuilder.recordRecipeOf(Phi);
8404     for (auto &R : ReductionOperations) {
8405       RecipeBuilder.recordRecipeOf(R);
8406       // For min/max reductions, where we have a pair of icmp/select, we also
8407       // need to record the ICmp recipe, so it can be removed later.
8408       if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
8409           Kind == RecurrenceDescriptor::RK_FloatMinMax) {
8410         RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
8411       }
8412     }
8413   }
8414 
8415   // For each interleave group which is relevant for this (possibly trimmed)
8416   // Range, add it to the set of groups to be later applied to the VPlan and add
8417   // placeholders for its members' Recipes which we'll be replacing with a
8418   // single VPInterleaveRecipe.
8419   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
8420     auto applyIG = [IG, this](ElementCount VF) -> bool {
8421       return (VF.isVector() && // Query is illegal for VF == 1
8422               CM.getWideningDecision(IG->getInsertPos(), VF) ==
8423                   LoopVectorizationCostModel::CM_Interleave);
8424     };
8425     if (!getDecisionAndClampRange(applyIG, Range))
8426       continue;
8427     InterleaveGroups.insert(IG);
8428     for (unsigned i = 0; i < IG->getFactor(); i++)
8429       if (Instruction *Member = IG->getMember(i))
8430         RecipeBuilder.recordRecipeOf(Member);
8431   }
8432 
8433   // ---------------------------------------------------------------------------
8434   // Build initial VPlan: Scan the body of the loop in a topological order to
8435   // visit each basic block after having visited its predecessor basic blocks.
8436   // ---------------------------------------------------------------------------
8437 
8438   // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
8439   auto Plan = std::make_unique<VPlan>();
8440   VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
8441   Plan->setEntry(VPBB);
8442 
8443   // Scan the body of the loop in a topological order to visit each basic block
8444   // after having visited its predecessor basic blocks.
8445   LoopBlocksDFS DFS(OrigLoop);
8446   DFS.perform(LI);
8447 
8448   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
8449     // Relevant instructions from basic block BB will be grouped into VPRecipe
8450     // ingredients and will fill a new VPBasicBlock.
8451     unsigned VPBBsForBB = 0;
8452     auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
8453     VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
8454     VPBB = FirstVPBBForBB;
8455     Builder.setInsertPoint(VPBB);
8456 
8457     // Introduce each ingredient into VPlan.
8458     // TODO: Model and preserve debug intrinsics in VPlan.
8459     for (Instruction &I : BB->instructionsWithoutDebug()) {
8460       Instruction *Instr = &I;
8461 
8462       // First filter out irrelevant instructions, to ensure no recipes are
8463       // built for them.
8464       if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
8465         continue;
8466 
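           // Try to widen the instruction into a recipe. If that succeeds, map each
           // value the recipe defines back to its underlying IR value and append the
           // recipe to the current VPBasicBlock.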
8467       if (auto Recipe =
8468               RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) {
8469         for (auto *Def : Recipe->definedValues()) {
8470           auto *UV = Def->getUnderlyingValue();
8471           Plan->addVPValue(UV, Def);
8472         }
8473 
8474         RecipeBuilder.setRecipe(Instr, Recipe);
8475         VPBB->appendRecipe(Recipe);
8476         continue;
8477       }
8478 
8479       // Otherwise, if all widening options failed, the instruction is to be
8480       // replicated. This may create a successor for VPBB.
8481       VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
8482           Instr, Range, VPBB, PredInst2Recipe, Plan);
8483       if (NextVPBB != VPBB) {
8484         VPBB = NextVPBB;
8485         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
8486                                     : "");
8487       }
8488     }
8489   }
8490 
8491   // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
8492   // may also be empty, such as the last one (VPBB), reflecting original
8493   // basic blocks with no recipes.
8494   VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
8495   assert(PreEntry->empty() && "Expecting empty pre-entry block.");
8496   VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
8497   VPBlockUtils::disconnectBlocks(PreEntry, Entry);
8498   delete PreEntry;
8499 
8500   // ---------------------------------------------------------------------------
8501   // Transform initial VPlan: Apply previously taken decisions, in order, to
8502   // bring the VPlan to its final state.
8503   // ---------------------------------------------------------------------------
8504 
8505   // Apply Sink-After legal constraints.
8506   for (auto &Entry : SinkAfter) {
8507     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
8508     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
8509     Sink->moveAfter(Target);
8510   }
8511 
8512   // Interleave memory: for each Interleave Group we marked earlier as relevant
8513   // for this VPlan, replace the Recipes widening its memory instructions with a
8514   // single VPInterleaveRecipe at its insertion point.
8515   for (auto IG : InterleaveGroups) {
8516     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
8517         RecipeBuilder.getRecipe(IG->getInsertPos()));
8518     SmallVector<VPValue *, 4> StoredValues;
8519     for (unsigned i = 0; i < IG->getFactor(); ++i)
8520       if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i)))
8521         StoredValues.push_back(Plan->getOrAddVPValue(SI->getOperand(0)));
8522 
8523     auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
8524                                         Recipe->getMask());
8525     VPIG->insertBefore(Recipe);
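         // For each non-void member, redirect users of its old widened value to the
         // corresponding value defined by the interleave recipe, then erase the
         // member's original recipe.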
8526     unsigned J = 0;
8527     for (unsigned i = 0; i < IG->getFactor(); ++i)
8528       if (Instruction *Member = IG->getMember(i)) {
8529         if (!Member->getType()->isVoidTy()) {
8530           VPValue *OriginalV = Plan->getVPValue(Member);
8531           Plan->removeVPValueFor(Member);
8532           OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
8533           J++;
8534         }
8535         RecipeBuilder.getRecipe(Member)->eraseFromParent();
8536       }
8537   }
8538 
8539   // Adjust the recipes for any inloop reductions.
8540   if (Range.Start.isVector())
8541     adjustRecipesForInLoopReductions(Plan, RecipeBuilder);
8542 
8543   // Finally, if tail is folded by masking, introduce selects between the phi
8544   // and the live-out instruction of each reduction, at the end of the latch.
8545   if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) {
8546     Builder.setInsertPoint(VPBB);
8547     auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
8548     for (auto &Reduction : Legal->getReductionVars()) {
8549       if (CM.isInLoopReduction(Reduction.first))
8550         continue;
8551       VPValue *Phi = Plan->getOrAddVPValue(Reduction.first);
8552       VPValue *Red = Plan->getOrAddVPValue(Reduction.second.getLoopExitInstr());
8553       Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
8554     }
8555   }
8556 
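       // Register all VFs covered by this VPlan and record them in the plan's name.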
8557   std::string PlanName;
8558   raw_string_ostream RSO(PlanName);
8559   ElementCount VF = Range.Start;
8560   Plan->addVF(VF);
8561   RSO << "Initial VPlan for VF={" << VF;
8562   for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) {
8563     Plan->addVF(VF);
8564     RSO << "," << VF;
8565   }
8566   RSO << "},UF>=1";
8567   RSO.flush();
8568   Plan->setName(PlanName);
8569 
8570   return Plan;
8571 }
8572 
8573 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
8574   // Outer loop handling: outer loops may require CFG and instruction-level
8575   // transformations before even evaluating whether vectorization is profitable.
8576   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
8577   // the vectorization pipeline.
8578   assert(!OrigLoop->isInnermost());
8579   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
8580 
8581   // Create new empty VPlan
8582   auto Plan = std::make_unique<VPlan>();
8583 
8584   // Build hierarchical CFG
8585   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
8586   HCFGBuilder.buildHierarchicalCFG();
8587 
8588   for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
8589        VF *= 2)
8590     Plan->addVF(VF);
8591 
8592   if (EnableVPlanPredication) {
8593     VPlanPredicator VPP(*Plan);
8594     VPP.predicate();
8595 
8596     // Avoid running the transformation to recipes until masked code generation
8597     // in the VPlan-native path is in place.
8598     return Plan;
8599   }
8600 
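       // Lower the VPInstructions of the plain CFG to corresponding VPlan recipes.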
8601   SmallPtrSet<Instruction *, 1> DeadInstructions;
8602   VPlanTransforms::VPInstructionsToVPRecipes(
8603       OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
8604   return Plan;
8605 }
8606 
8607 // Adjust the recipes for any inloop reductions. The chain of instructions
8608 // leading from the loop exit instr to the phi needs to be converted to
8609 // reductions, with one operand being vector and the other being the scalar
8610 // reduction chain.
8611 void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
8612     VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) {
8613   for (auto &Reduction : CM.getInLoopReductionChains()) {
8614     PHINode *Phi = Reduction.first;
8615     RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
8616     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
8617 
8618     // ReductionOperations are ordered top-down from the phi's use to the
8619     // LoopExitValue. We keep track of the previous item (the Chain) to tell
8620     // which of the two operands will remain scalar and which will be reduced.
8621     // For minmax the chain will be the select instructions.
8622     Instruction *Chain = Phi;
8623     for (Instruction *R : ReductionOperations) {
8624       VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
8625       RecurrenceDescriptor::RecurrenceKind Kind = RdxDesc.getRecurrenceKind();
8626 
8627       VPValue *ChainOp = Plan->getVPValue(Chain);
8628       unsigned FirstOpId;
8629       if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
8630           Kind == RecurrenceDescriptor::RK_FloatMinMax) {
8631         assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
8632                "Expected to replace a VPWidenSelectSC");
8633         FirstOpId = 1;
8634       } else {
8635         assert(isa<VPWidenRecipe>(WidenRecipe) &&
8636                "Expected to replace a VPWidenSC");
8637         FirstOpId = 0;
8638       }
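           // The operand of R that equals the current chain stays scalar; the other
           // operand becomes the vector operand of the reduction.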
8639       unsigned VecOpId =
8640           R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
8641       VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
8642 
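           // If the tail is folded by masking, guard the reduction with the block-in
           // mask of R's parent block.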
8643       auto *CondOp = CM.foldTailByMasking()
8644                          ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
8645                          : nullptr;
8646       VPReductionRecipe *RedRecipe = new VPReductionRecipe(
8647           &RdxDesc, R, ChainOp, VecOp, CondOp, Legal->hasFunNoNaNAttr(), TTI);
8648       WidenRecipe->getVPValue()->replaceAllUsesWith(RedRecipe);
8649       Plan->removeVPValueFor(R);
8650       Plan->addVPValue(R, RedRecipe);
8651       WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator());
8653       WidenRecipe->eraseFromParent();
8654 
8655       if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
8656           Kind == RecurrenceDescriptor::RK_FloatMinMax) {
8657         VPRecipeBase *CompareRecipe =
8658             RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
8659         assert(isa<VPWidenRecipe>(CompareRecipe) &&
8660                "Expected to replace a VPWidenSC");
8661         assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 &&
8662                "Expected no remaining users");
8663         CompareRecipe->eraseFromParent();
8664       }
8665       Chain = R;
8666     }
8667   }
8668 }
8669 
8670 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateVectorValues(
8671     Value *V, unsigned Part) {
8672   return ILV.getOrCreateVectorValue(V, Part);
8673 }
8674 
8675 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue(
8676     Value *V, const VPIteration &Instance) {
8677   return ILV.getOrCreateScalarValue(V, Instance);
8678 }
8679 
8680 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
8681                                VPSlotTracker &SlotTracker) const {
8682   O << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
8683   IG->getInsertPos()->printAsOperand(O, false);
8684   O << ", ";
8685   getAddr()->printAsOperand(O, SlotTracker);
8686   VPValue *Mask = getMask();
8687   if (Mask) {
8688     O << ", ";
8689     Mask->printAsOperand(O, SlotTracker);
8690   }
8691   for (unsigned i = 0; i < IG->getFactor(); ++i)
8692     if (Instruction *I = IG->getMember(i))
8693       O << "\\l\" +\n" << Indent << "\"  " << VPlanIngredient(I) << " " << i;
8694 }
8695 
8696 void VPWidenCallRecipe::execute(VPTransformState &State) {
8697   State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this,
8698                                   *this, State);
8699 }
8700 
8701 void VPWidenSelectRecipe::execute(VPTransformState &State) {
8702   State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()),
8703                                     this, *this, InvariantCond, State);
8704 }
8705 
8706 void VPWidenRecipe::execute(VPTransformState &State) {
8707   State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State);
8708 }
8709 
8710 void VPWidenGEPRecipe::execute(VPTransformState &State) {
8711   State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this,
8712                       *this, State.UF, State.VF, IsPtrLoopInvariant,
8713                       IsIndexLoopInvariant, State);
8714 }
8715 
8716 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
8717   assert(!State.Instance && "Int or FP induction being replicated.");
8718   State.ILV->widenIntOrFpInduction(IV, Trunc);
8719 }
8720 
8721 void VPWidenPHIRecipe::execute(VPTransformState &State) {
8722   State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
8723 }
8724 
8725 void VPBlendRecipe::execute(VPTransformState &State) {
8726   State.ILV->setDebugLocFromInst(State.Builder, Phi);
8727   // We know that all PHIs in non-header blocks are converted into
8728   // selects, so we don't have to worry about the insertion order and we
8729   // can just use the builder.
8730   // At this point we generate the predication tree. There may be
8731   // duplications since this is a simple recursive scan, but future
8732   // optimizations will clean it up.
8733 
8734   unsigned NumIncoming = getNumIncomingValues();
8735 
8736   // Generate a sequence of selects of the form:
8737   // SELECT(Mask3, In3,
8738   //        SELECT(Mask2, In2,
8739   //               SELECT(Mask1, In1,
8740   //                      In0)))
8741   // Note that Mask0 is never used: lanes for which no path reaches this phi,
8742   // and which are essentially undef, are taken from In0.
8743   InnerLoopVectorizer::VectorParts Entry(State.UF);
8744   for (unsigned In = 0; In < NumIncoming; ++In) {
8745     for (unsigned Part = 0; Part < State.UF; ++Part) {
8746       // We might have single edge PHIs (blocks) - use an identity
8747       // 'select' for the first PHI operand.
8748       Value *In0 = State.get(getIncomingValue(In), Part);
8749       if (In == 0)
8750         Entry[Part] = In0; // Initialize with the first incoming value.
8751       else {
8752         // Select between the current value and the previous incoming edge
8753         // based on the incoming mask.
8754         Value *Cond = State.get(getMask(In), Part);
8755         Entry[Part] =
8756             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
8757       }
8758     }
8759   }
8760   for (unsigned Part = 0; Part < State.UF; ++Part)
8761     State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
8762 }
8763 
8764 void VPInterleaveRecipe::execute(VPTransformState &State) {
8765   assert(!State.Instance && "Interleave group being replicated.");
8766   State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
8767                                       getStoredValues(), getMask());
8768 }
8769 
8770 void VPReductionRecipe::execute(VPTransformState &State) {
8771   assert(!State.Instance && "Reduction being replicated.");
8772   for (unsigned Part = 0; Part < State.UF; ++Part) {
8773     RecurrenceDescriptor::RecurrenceKind Kind = RdxDesc->getRecurrenceKind();
8774     Value *NewVecOp = State.get(getVecOp(), Part);
8775     if (VPValue *Cond = getCondOp()) {
8776       Value *NewCond = State.get(Cond, Part);
8777       VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
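           // Blend masked-off lanes with the reduction identity so they do not
           // contribute to the reduced value.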
8778       Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
8779           Kind, RdxDesc->getMinMaxRecurrenceKind(), VecTy->getElementType());
8780       Constant *IdenVec =
8781           ConstantVector::getSplat(VecTy->getElementCount(), Iden);
8782       Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
8783       NewVecOp = Select;
8784     }
8785     Value *NewRed =
8786         createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp, NoNaN);
8787     Value *PrevInChain = State.get(getChainOp(), Part);
8788     Value *NextInChain;
8789     if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
8790         Kind == RecurrenceDescriptor::RK_FloatMinMax) {
8791       NextInChain =
8792           createMinMaxOp(State.Builder, RdxDesc->getMinMaxRecurrenceKind(),
8793                          NewRed, PrevInChain);
8794     } else {
8795       NextInChain = State.Builder.CreateBinOp(
8796           (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed,
8797           PrevInChain);
8798     }
8799     State.set(this, getUnderlyingInstr(), NextInChain, Part);
8800   }
8801 }
8802 
8803 void VPReplicateRecipe::execute(VPTransformState &State) {
8804   if (State.Instance) { // Generate a single instance.
8805     assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
8806     State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this,
8807                                     *State.Instance, IsPredicated, State);
8808     // Insert scalar instance packing it into a vector.
8809     if (AlsoPack && State.VF.isVector()) {
8810       // If we're constructing lane 0, initialize to start from undef.
8811       if (State.Instance->Lane == 0) {
8812         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
8813         Value *Undef = UndefValue::get(
8814             VectorType::get(getUnderlyingValue()->getType(), State.VF));
8815         State.ValueMap.setVectorValue(getUnderlyingInstr(),
8816                                       State.Instance->Part, Undef);
8817       }
8818       State.ILV->packScalarIntoVectorValue(getUnderlyingInstr(),
8819                                            *State.Instance);
8820     }
8821     return;
8822   }
8823 
8824   // Generate scalar instances for all VF lanes of all UF parts, unless the
8825   // instruction is uniform, in which case generate only the first lane for
8826   // each of the UF parts.
8827   unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
8828   assert((!State.VF.isScalable() || IsUniform) &&
8829          "Can't scalarize a scalable vector");
8830   for (unsigned Part = 0; Part < State.UF; ++Part)
8831     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
8832       State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this, {Part, Lane},
8833                                       IsPredicated, State);
8834 }
8835 
8836 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
8837   assert(State.Instance && "Branch on Mask works only on single instance.");
8838 
8839   unsigned Part = State.Instance->Part;
8840   unsigned Lane = State.Instance->Lane;
8841 
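       // Compute the condition for this lane: extract it from the block-in mask if
       // one exists; otherwise the block is executed unconditionally.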
8842   Value *ConditionBit = nullptr;
8843   VPValue *BlockInMask = getMask();
8844   if (BlockInMask) {
8845     ConditionBit = State.get(BlockInMask, Part);
8846     if (ConditionBit->getType()->isVectorTy())
8847       ConditionBit = State.Builder.CreateExtractElement(
8848           ConditionBit, State.Builder.getInt32(Lane));
8849   } else // Block in mask is all-one.
8850     ConditionBit = State.Builder.getTrue();
8851 
8852   // Replace the temporary unreachable terminator with a new conditional branch,
8853   // whose two destinations will be set later when they are created.
8854   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
8855   assert(isa<UnreachableInst>(CurrentTerminator) &&
8856          "Expected to replace unreachable terminator with conditional branch.");
8857   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
8858   CondBr->setSuccessor(0, nullptr);
8859   ReplaceInstWithInst(CurrentTerminator, CondBr);
8860 }
8861 
8862 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
8863   assert(State.Instance && "Predicated instruction PHI works per instance.");
8864   Instruction *ScalarPredInst =
8865       cast<Instruction>(State.get(getOperand(0), *State.Instance));
8866   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
8867   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
8868   assert(PredicatingBB && "Predicated block has no single predecessor.");
8869 
8870   // By current pack/unpack logic we need to generate only a single phi node: if
8871   // a vector value for the predicated instruction exists at this point it means
8872   // the instruction has vector users only, and a phi for the vector value is
8873   // needed. In this case the recipe of the predicated instruction is marked to
8874   // also do that packing, thereby "hoisting" the insert-element sequence.
8875   // Otherwise, a phi node for the scalar value is needed.
8876   unsigned Part = State.Instance->Part;
8877   Instruction *PredInst =
8878       cast<Instruction>(getOperand(0)->getUnderlyingValue());
8879   if (State.ValueMap.hasVectorValue(PredInst, Part)) {
8880     Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
8881     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
8882     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
8883     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
8884     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
8885     State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
8886   } else {
8887     Type *PredInstType = PredInst->getType();
8888     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
8889     Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
8890     Phi->addIncoming(ScalarPredInst, PredicatedBB);
8891     State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
8892   }
8893 }
8894 
8895 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
8896   VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
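       // Only loads define a VPValue; for stores, pass a null defined value.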
8897   State.ILV->vectorizeMemoryInstruction(&Ingredient, State,
8898                                         StoredValue ? nullptr : getVPValue(),
8899                                         getAddr(), StoredValue, getMask());
8900 }
8901 
8902 // Determine how to lower the scalar epilogue, which depends on 1) optimizing
8903 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
8904 // predication, and 4) a TTI hook that analyzes whether the loop is suitable
8905 // for predication.
8906 static ScalarEpilogueLowering getScalarEpilogueLowering(
8907     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
8908     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
8909     AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
8910     LoopVectorizationLegality &LVL) {
8911   // 1) OptSize takes precedence over all other options, i.e. if this is set,
8912   // don't look at hints or options, and don't request a scalar epilogue.
8913   // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
8914   // LoopAccessInfo (due to code dependency and not being able to reliably get
8915   // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
8916   // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
8917   // versioning when the vectorization is forced, unlike hasOptSize. So revert
8918   // back to the old way and vectorize with versioning when forced. See D81345.)
8919   if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
8920                                                       PGSOQueryType::IRPass) &&
8921                           Hints.getForce() != LoopVectorizeHints::FK_Enabled))
8922     return CM_ScalarEpilogueNotAllowedOptSize;
8923 
8924   // 2) If set, obey the directives
8925   if (PreferPredicateOverEpilogue.getNumOccurrences()) {
8926     switch (PreferPredicateOverEpilogue) {
8927     case PreferPredicateTy::ScalarEpilogue:
8928       return CM_ScalarEpilogueAllowed;
8929     case PreferPredicateTy::PredicateElseScalarEpilogue:
8930       return CM_ScalarEpilogueNotNeededUsePredicate;
8931     case PreferPredicateTy::PredicateOrDontVectorize:
8932       return CM_ScalarEpilogueNotAllowedUsePredicate;
8933     }
8934   }
8935 
8936   // 3) If set, obey the hints
8937   switch (Hints.getPredicate()) {
8938   case LoopVectorizeHints::FK_Enabled:
8939     return CM_ScalarEpilogueNotNeededUsePredicate;
8940   case LoopVectorizeHints::FK_Disabled:
8941     return CM_ScalarEpilogueAllowed;
8942   }
8943 
8944   // 4) If the TTI hook indicates this is profitable, request predication.
8945   if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
8946                                        LVL.getLAI()))
8947     return CM_ScalarEpilogueNotNeededUsePredicate;
8948 
8949   return CM_ScalarEpilogueAllowed;
8950 }
8951 
8952 void VPTransformState::set(VPValue *Def, Value *IRDef, Value *V,
8953                            unsigned Part) {
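       // Record V both in the VPlan state and in ILV's value map for the underlying
       // IR value IRDef and the given unroll part.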
8954   set(Def, V, Part);
8955   ILV->setVectorValue(IRDef, Part, V);
8956 }
8957 
8958 // Process the loop in the VPlan-native vectorization path. This path builds
8959 // VPlan upfront in the vectorization pipeline, which allows applying
8960 // VPlan-to-VPlan transformations from the very beginning without modifying the
8961 // input LLVM IR.
8962 static bool processLoopInVPlanNativePath(
8963     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
8964     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
8965     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
8966     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
8967     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
8968 
8969   if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
8970     LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
8971     return false;
8972   }
8973   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
8974   Function *F = L->getHeader()->getParent();
8975   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
8976 
8977   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
8978       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
8979 
8980   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
8981                                 &Hints, IAI);
8982   // Use the planner for outer loop vectorization.
8983   // TODO: CM is not used at this point inside the planner. Turn CM into an
8984   // optional argument if we don't need it in the future.
8985   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE);
8986 
8987   // Get user vectorization factor.
8988   ElementCount UserVF = Hints.getWidth();
8989 
8990   // Plan how to best vectorize, return the best VF and its cost.
8991   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
8992 
8993   // If we are stress testing VPlan builds, do not attempt to generate vector
8994   // code. Masked vector code generation support will follow soon.
8995   // Also, do not attempt to vectorize if no vector code will be produced.
8996   if (VPlanBuildStressTest || EnableVPlanPredication ||
8997       VectorizationFactor::Disabled() == VF)
8998     return false;
8999 
9000   LVP.setBestPlan(VF.Width, 1);
9001 
9002   InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
9003                          &CM, BFI, PSI);
9004   LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
9005                     << L->getHeader()->getParent()->getName() << "\"\n");
9006   LVP.executePlan(LB, DT);
9007 
9008   // Mark the loop as already vectorized to avoid vectorizing again.
9009   Hints.setAlreadyVectorized();
9010 
9011   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
9012   return true;
9013 }
9014 
9015 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
9016     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
9017                                !EnableLoopInterleaving),
9018       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
9019                               !EnableLoopVectorization) {}
9020 
9021 bool LoopVectorizePass::processLoop(Loop *L) {
9022   assert((EnableVPlanNativePath || L->isInnermost()) &&
9023          "VPlan-native path is not enabled. Only process inner loops.");
9024 
9025 #ifndef NDEBUG
9026   const std::string DebugLocStr = getDebugLocString(L);
9027 #endif /* NDEBUG */
9028 
9029   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
9030                     << L->getHeader()->getParent()->getName() << "\" from "
9031                     << DebugLocStr << "\n");
9032 
9033   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
9034 
9035   LLVM_DEBUG(
9036       dbgs() << "LV: Loop hints:"
9037              << " force="
9038              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
9039                      ? "disabled"
9040                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
9041                             ? "enabled"
9042                             : "?"))
9043              << " width=" << Hints.getWidth()
9044              << " unroll=" << Hints.getInterleave() << "\n");
9045 
9046   // Function containing loop
9047   Function *F = L->getHeader()->getParent();
9048 
9049   // Looking at the diagnostic output is the only way to determine if a loop
9050   // was vectorized (other than looking at the IR or machine code), so it
9051   // is important to generate an optimization remark for each loop. Most of
9052   // these messages are generated as OptimizationRemarkAnalysis. Remarks
9053   // generated as OptimizationRemark and OptimizationRemarkMissed are
9054   // less verbose, reporting vectorized loops and unvectorized loops that may
9055   // benefit from vectorization, respectively.
9056 
9057   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
9058     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
9059     return false;
9060   }
9061 
9062   PredicatedScalarEvolution PSE(*SE, *L);
9063 
9064   // Check if it is legal to vectorize the loop.
9065   LoopVectorizationRequirements Requirements(*ORE);
9066   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
9067                                 &Requirements, &Hints, DB, AC, BFI, PSI);
9068   if (!LVL.canVectorize(EnableVPlanNativePath)) {
9069     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
9070     Hints.emitRemarkWithHints();
9071     return false;
9072   }
9073 
9074   // Check the function attributes and profiles to find out if this function
9075   // should be optimized for size.
9076   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
9077       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
9078 
9079   // Entrance to the VPlan-native vectorization path. Outer loops are processed
9080   // here. They may require CFG and instruction level transformations before
9081   // even evaluating whether vectorization is profitable. Since we cannot modify
9082   // the incoming IR, we need to build VPlan upfront in the vectorization
9083   // pipeline.
9084   if (!L->isInnermost())
9085     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
9086                                         ORE, BFI, PSI, Hints);
9087 
9088   assert(L->isInnermost() && "Inner loop expected.");
9089 
9090   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
9091   // count by optimizing for size, to minimize overheads.
9092   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
9093   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
9094     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
9095                       << "This loop is worth vectorizing only if no scalar "
9096                       << "iteration overheads are incurred.");
9097     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
9098       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
9099     else {
9100       LLVM_DEBUG(dbgs() << "\n");
9101       SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
9102     }
9103   }
9104 
9105   // Check the function attributes to see if implicit floats are allowed.
9106   // FIXME: This check doesn't seem possibly correct -- what if the loop is
9107   // an integer loop and the vector instructions selected are purely integer
9108   // vector instructions?
9109   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
9110     reportVectorizationFailure(
9111         "Can't vectorize when the NoImplicitFloat attribute is used",
9112         "loop not vectorized due to NoImplicitFloat attribute",
9113         "NoImplicitFloat", ORE, L);
9114     Hints.emitRemarkWithHints();
9115     return false;
9116   }
9117 
9118   // Check if the target supports potentially unsafe FP vectorization.
9119   // FIXME: Add a check for the type of safety issue (denormal, signaling)
9120   // for the target we're vectorizing for, to make sure none of the
9121   // additional fp-math flags can help.
9122   if (Hints.isPotentiallyUnsafe() &&
9123       TTI->isFPVectorizationPotentiallyUnsafe()) {
9124     reportVectorizationFailure(
9125         "Potentially unsafe FP op prevents vectorization",
9126         "loop not vectorized due to unsafe FP support.",
9127         "UnsafeFP", ORE, L);
9128     Hints.emitRemarkWithHints();
9129     return false;
9130   }
9131 
9132   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
9133   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
9134 
9135   // If an override option has been passed in for interleaved accesses, use it.
9136   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
9137     UseInterleaved = EnableInterleavedMemAccesses;
9138 
9139   // Analyze interleaved memory accesses.
9140   if (UseInterleaved) {
9141     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
9142   }
9143 
9144   // Use the cost model.
9145   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
9146                                 F, &Hints, IAI);
9147   CM.collectValuesToIgnore();
9148 
9149   // Use the planner for vectorization.
9150   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE);
9151 
9152   // Get user vectorization factor and interleave count.
9153   ElementCount UserVF = Hints.getWidth();
9154   unsigned UserIC = Hints.getInterleave();
9155 
9156   // Plan how to best vectorize, return the best VF and its cost.
9157   Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
9158 
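       // Default to a disabled vectorization factor and no interleaving; both are
       // updated below if the planner produced a viable plan.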
9159   VectorizationFactor VF = VectorizationFactor::Disabled();
9160   unsigned IC = 1;
9161 
9162   if (MaybeVF) {
9163     VF = *MaybeVF;
9164     // Select the interleave count.
9165     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
9166   }
9167 
9168   // Identify the diagnostic messages that should be produced.
9169   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
9170   bool VectorizeLoop = true, InterleaveLoop = true;
9171   if (Requirements.doesNotMeet(F, L, Hints)) {
9172     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
9173                          "requirements.\n");
9174     Hints.emitRemarkWithHints();
9175     return false;
9176   }
9177 
9178   if (VF.Width.isScalar()) {
9179     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
9180     VecDiagMsg = std::make_pair(
9181         "VectorizationNotBeneficial",
9182         "the cost-model indicates that vectorization is not beneficial");
9183     VectorizeLoop = false;
9184   }
9185 
9186   if (!MaybeVF && UserIC > 1) {
9187     // Tell the user interleaving was avoided up-front, despite being explicitly
9188     // requested.
9189     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
9190                          "interleaving should be avoided up front\n");
9191     IntDiagMsg = std::make_pair(
9192         "InterleavingAvoided",
9193         "Ignoring UserIC, because interleaving was avoided up front");
9194     InterleaveLoop = false;
9195   } else if (IC == 1 && UserIC <= 1) {
9196     // Tell the user interleaving is not beneficial.
9197     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
9198     IntDiagMsg = std::make_pair(
9199         "InterleavingNotBeneficial",
9200         "the cost-model indicates that interleaving is not beneficial");
9201     InterleaveLoop = false;
9202     if (UserIC == 1) {
9203       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
9204       IntDiagMsg.second +=
9205           " and is explicitly disabled or interleave count is set to 1";
9206     }
9207   } else if (IC > 1 && UserIC == 1) {
9208     // Tell the user interleaving is beneficial but is explicitly disabled.
9209     LLVM_DEBUG(
9210         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
9211     IntDiagMsg = std::make_pair(
9212         "InterleavingBeneficialButDisabled",
9213         "the cost-model indicates that interleaving is beneficial "
9214         "but is explicitly disabled or interleave count is set to 1");
9215     InterleaveLoop = false;
9216   }
9217 
9218   // Override IC if user provided an interleave count.
9219   IC = UserIC > 0 ? UserIC : IC;
9220 
9221   // Emit diagnostic messages, if any.
9222   const char *VAPassName = Hints.vectorizeAnalysisPassName();
9223   if (!VectorizeLoop && !InterleaveLoop) {
9224     // Do not vectorize or interleave the loop.
9225     ORE->emit([&]() {
9226       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
9227                                       L->getStartLoc(), L->getHeader())
9228              << VecDiagMsg.second;
9229     });
9230     ORE->emit([&]() {
9231       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
9232                                       L->getStartLoc(), L->getHeader())
9233              << IntDiagMsg.second;
9234     });
9235     return false;
9236   } else if (!VectorizeLoop && InterleaveLoop) {
9237     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
9238     ORE->emit([&]() {
9239       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
9240                                         L->getStartLoc(), L->getHeader())
9241              << VecDiagMsg.second;
9242     });
9243   } else if (VectorizeLoop && !InterleaveLoop) {
9244     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
9245                       << ") in " << DebugLocStr << '\n');
9246     ORE->emit([&]() {
9247       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
9248                                         L->getStartLoc(), L->getHeader())
9249              << IntDiagMsg.second;
9250     });
9251   } else if (VectorizeLoop && InterleaveLoop) {
9252     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
9253                       << ") in " << DebugLocStr << '\n');
9254     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
9255   }
9256 
9257   LVP.setBestPlan(VF.Width, IC);
9258 
9259   using namespace ore;
9260   bool DisableRuntimeUnroll = false;
9261   MDNode *OrigLoopID = L->getLoopID();
9262 
9263   if (!VectorizeLoop) {
9264     assert(IC > 1 && "interleave count should not be 1 or 0");
9265     // If we decided that it is not legal to vectorize the loop, then
9266     // interleave it.
9267     InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM,
9268                                BFI, PSI);
9269     LVP.executePlan(Unroller, DT);
9270 
9271     ORE->emit([&]() {
9272       return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
9273                                 L->getHeader())
9274              << "interleaved loop (interleaved count: "
9275              << NV("InterleaveCount", IC) << ")";
9276     });
9277   } else {
9278     // If we decided that it is *legal* to vectorize the loop, then do it.
9279 
9280     // Consider vectorizing the epilogue too if it's profitable.
9281     VectorizationFactor EpilogueVF =
9282       CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
9283     if (EpilogueVF.Width.isVector()) {
9284 
9285       // The first pass vectorizes the main loop and creates a scalar epilogue
9286       // to be vectorized by executing the plan (potentially with a different
9287       // factor) again shortly afterwards.
9288       EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC,
9289                                         EpilogueVF.Width.getKnownMinValue(), 1);
9290       EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, EPI,
9291                                          &LVL, &CM, BFI, PSI);
9292 
9293       LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF);
9294       LVP.executePlan(MainILV, DT);
9295       ++LoopsVectorized;
9296 
9297       simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
9298       formLCSSARecursively(*L, *DT, LI, SE);
9299 
9300       // Second pass vectorizes the epilogue and adjusts the control flow
9301       // edges from the first pass.
9302       LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF);
9303       EPI.MainLoopVF = EPI.EpilogueVF;
9304       EPI.MainLoopUF = EPI.EpilogueUF;
9305       EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
9306                                                ORE, EPI, &LVL, &CM, BFI, PSI);
9307       LVP.executePlan(EpilogILV, DT);
9308       ++LoopsEpilogueVectorized;
9309 
9310       if (!MainILV.areSafetyChecksAdded())
9311         DisableRuntimeUnroll = true;
9312     } else {
9313       InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
9314                              &LVL, &CM, BFI, PSI);
9315       LVP.executePlan(LB, DT);
9316       ++LoopsVectorized;
9317 
9318       // Add metadata to disable runtime unrolling a scalar loop when there are
9319       // no runtime checks about strides and memory. A scalar loop that is
9320       // rarely used is not worth unrolling.
9321       if (!LB.areSafetyChecksAdded())
9322         DisableRuntimeUnroll = true;
9323     }
9324 
9325     // Report the vectorization decision.
9326     ORE->emit([&]() {
9327       return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
9328                                 L->getHeader())
9329              << "vectorized loop (vectorization width: "
9330              << NV("VectorizationFactor", VF.Width)
9331              << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
9332     });
9333   }
9334 
9335   Optional<MDNode *> RemainderLoopID =
9336       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
9337                                       LLVMLoopVectorizeFollowupEpilogue});
9338   if (RemainderLoopID.hasValue()) {
9339     L->setLoopID(RemainderLoopID.getValue());
9340   } else {
9341     if (DisableRuntimeUnroll)
9342       AddRuntimeUnrollDisableMetaData(L);
9343 
9344     // Mark the loop as already vectorized to avoid vectorizing again.
9345     Hints.setAlreadyVectorized();
9346   }
9347 
9348   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
9349   return true;
9350 }
9351 
9352 LoopVectorizeResult LoopVectorizePass::runImpl(
9353     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
9354     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
9355     DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
9356     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
9357     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
9358   SE = &SE_;
9359   LI = &LI_;
9360   TTI = &TTI_;
9361   DT = &DT_;
9362   BFI = &BFI_;
9363   TLI = TLI_;
9364   AA = &AA_;
9365   AC = &AC_;
9366   GetLAA = &GetLAA_;
9367   DB = &DB_;
9368   ORE = &ORE_;
9369   PSI = PSI_;
9370 
9371   // Don't attempt if
9372   // 1. the target claims to have no vector registers, and
9373   // 2. interleaving won't help ILP.
9374   //
9375   // The second condition is necessary because, even if the target has no
9376   // vector registers, loop vectorization may still enable scalar
9377   // interleaving.
9378   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
9379       TTI->getMaxInterleaveFactor(1) < 2)
9380     return LoopVectorizeResult(false, false);
9381 
9382   bool Changed = false, CFGChanged = false;
9383 
9384   // The vectorizer requires loops to be in simplified form.
9385   // Since simplification may add new inner loops, it has to run before the
9386   // legality and profitability checks. This means running the loop vectorizer
9387   // will simplify all loops, regardless of whether anything ends up being
9388   // vectorized.
9389   for (auto &L : *LI)
9390     Changed |= CFGChanged |=
9391         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
9392 
9393   // Build up a worklist of inner-loops to vectorize. This is necessary as
9394   // the act of vectorizing or partially unrolling a loop creates new loops
9395   // and can invalidate iterators across the loops.
9396   SmallVector<Loop *, 8> Worklist;
9397 
9398   for (Loop *L : *LI)
9399     collectSupportedLoops(*L, LI, ORE, Worklist);
9400 
9401   LoopsAnalyzed += Worklist.size();
9402 
9403   // Now walk the identified inner loops.
9404   while (!Worklist.empty()) {
9405     Loop *L = Worklist.pop_back_val();
9406 
9407     // For the inner loops we actually process, form LCSSA to simplify the
9408     // transform.
9409     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
9410 
9411     Changed |= CFGChanged |= processLoop(L);
9412   }
9413 
9414   // Process each loop nest in the function.
9415   return LoopVectorizeResult(Changed, CFGChanged);
9416 }
9417 
9418 PreservedAnalyses LoopVectorizePass::run(Function &F,
9419                                          FunctionAnalysisManager &AM) {
9420     auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
9421     auto &LI = AM.getResult<LoopAnalysis>(F);
9422     auto &TTI = AM.getResult<TargetIRAnalysis>(F);
9423     auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
9424     auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
9425     auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
9426     auto &AA = AM.getResult<AAManager>(F);
9427     auto &AC = AM.getResult<AssumptionAnalysis>(F);
9428     auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
9429     auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
9430     MemorySSA *MSSA = EnableMSSALoopDependency
9431                           ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
9432                           : nullptr;
9433 
9434     auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
9435     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
9436         [&](Loop &L) -> const LoopAccessInfo & {
9437       LoopStandardAnalysisResults AR = {AA,  AC,  DT,      LI,  SE,
9438                                         TLI, TTI, nullptr, MSSA};
9439       return LAM.getResult<LoopAccessAnalysis>(L, AR);
9440     };
9441     auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
9442     ProfileSummaryInfo *PSI =
9443         MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
9444     LoopVectorizeResult Result =
9445         runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
9446     if (!Result.MadeAnyChange)
9447       return PreservedAnalyses::all();
9448     PreservedAnalyses PA;
9449 
9450     // We currently do not preserve loopinfo/dominator analyses with outer loop
9451     // vectorization. Until this is addressed, mark these analyses as preserved
9452     // only for non-VPlan-native path.
9453     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
9454     if (!EnableVPlanNativePath) {
9455       PA.preserve<LoopAnalysis>();
9456       PA.preserve<DominatorTreeAnalysis>();
9457     }
9458     PA.preserve<BasicAA>();
9459     PA.preserve<GlobalsAA>();
9460     if (!Result.MadeCFGChange)
9461       PA.preserveSet<CFGAnalyses>();
9462     return PA;
9463 }
9464