//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
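// For example (illustrative only), with VF = 4 a scalar loop such as
//   for (i = 0; i < n; ++i) a[i] = b[i] + 42;
// is rewritten so that each vector iteration loads <4 x i32> from 'b', adds a
// splat of 42, stores <4 x i32> to 'a', and advances 'i' by 4.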
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
//  Vectorizing Compilers.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

/// @{
/// Metadata attribute names
static const char *const LLVMLoopVectorizeFollowupAll =
    "llvm.loop.vectorize.followup_all";
static const char *const LLVMLoopVectorizeFollowupVectorized =
    "llvm.loop.vectorize.followup_vectorized";
static const char *const LLVMLoopVectorizeFollowupEpilogue =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");

/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired
// and that predication is preferred; it lists all options. I.e., the
// vectorizer will try to fold the tail-loop (epilogue) into the vector body
// and predicate the instructions accordingly. If tail-folding fails, there are
// different fallback strategies depending on these values:
namespace PreferPredicateTy {
  enum Option {
    ScalarEpilogue = 0,
    PredicateElseScalarEpilogue,
    PredicateOrDontVectorize
  };
}

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                         "scalar-epilogue",
                         "Don't tail-predicate loops, create scalar epilogue"),
              clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                         "predicate-else-scalar-epilogue",
                         "prefer tail-folding, create scalar epilogue if tail "
                         "folding fails."),
              clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                         "predicate-dont-vectorize",
                         "prefers tail-folding, don't attempt vectorization if "
                         "tail-folding fails.")));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if-predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after-loop select."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));
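
// Illustrative (unverified) invocations for exercising these VPlan flags with
// 'opt'; the exact pass-manager syntax may differ between LLVM releases:
//   opt -S -loop-vectorize -enable-vplan-native-path input.ll
//   opt -S -loop-vectorize -enable-vplan-native-path -vplan-build-stress-test input.ll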

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

/// A helper function that returns the type of loaded or stored value.
static Type *getMemInstValueType(Value *I) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Expected Load or Store instruction");
  if (auto *LI = dyn_cast<LoadInst>(I))
    return LI->getType();
  return cast<StoreInst>(I)->getValueOperand()->getType();
}

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type at the given vectorization factor.
static bool hasIrregularType(Type *Ty, const DataLayout &DL, ElementCount VF) {
  assert(!VF.isScalable() && "scalable vectors not yet supported.");
  // Determine if an array of VF elements of type Ty is "bitcast compatible"
  // with a <VF x Ty> vector.
  if (VF.isVector()) {
    auto *VectorTy = VectorType::get(Ty, VF);
    return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
  }

  // If the vectorization factor is one, we just check if an array of type Ty
  // requires padding between elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}
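// Note (illustrative, under typical data layouts): Ty = i1 has an allocation
// size of 1 byte, while a <4 x i1> vector has a store size of 1 byte, so for
// VF = 4 the comparison 4 * 1 != 1 marks i1 as irregular and such accesses are
// not treated as bitcast-compatible with a plain wide vector.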

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
///       we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }
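// Sketch of how this is used by the cost model (an assumption, not a strict
// contract): if the instructions of a predicated block would cost C when
// executed on every iteration, the model charges roughly
// C / getReciprocalPredBlockProb(), i.e. C / 2, reflecting the assumed 50%
// execution probability.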

/// A helper function that adds a 'fast' flag to floating-point operations.
static Value *addFastMathFlag(Value *V) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
  return V;
}

static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FMF);
  return V;
}
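// Example (illustrative): when a floating-point reduction such as
// 'sum += x[i]' is vectorized, the newly created vector FP operations are run
// through addFastMathFlag() so that reassociation is permitted when the
// partial sums are later combined.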

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}
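// Example (illustrative): for a loop whose bound is not a compile-time
// constant but whose latch branch carries !prof branch_weights implying about
// 100 iterations, step (2) returns that estimate via
// getLoopEstimatedTripCount(); if no profile is present either, a finite SCEV
// maximum trip count from step (3) is used instead.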

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found for a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()),
        VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM),
        BFI(BFI), PSI(PSI) {
    // Query this against the original loop and save it here because the profile
    // of the original loop header may change as the transformation happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop.
  BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I, VPUser &Operands,
                        VPTransformState &State);

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Widen a single select instruction within the innermost loop.
  void widenSelectInstruction(SelectInst &I, VPUser &Operands,
                              bool InvariantCond, VPTransformState &State);

  /// Fix the vectorized code, taking care of header PHIs, live-outs, and more.
  void fixVectorizedLoop();

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single GetElementPtrInst based on information gathered and
  /// decisions taken during planning.
  void widenGEP(GetElementPtrInst *GEP, VPUser &Indices, unsigned UF,
                ElementCount VF, bool IsPtrLoopInvariant,
                SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);

  /// Vectorize a single PHINode in a block. This method handles the induction
  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
  /// arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, unsigned UF, ElementCount VF);

  /// A helper function to scalarize a single Instruction in the innermost loop.
  /// Generates a sequence of scalar instances for each lane between \p MinLane
  /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
  /// inclusive. Uses the VPValue operands from \p Operands instead of \p
  /// Instr's operands.
  void scalarizeInstruction(Instruction *Instr, VPUser &Operands,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);

  /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
  /// vector or scalar value on-demand if one is not yet available. When
  /// vectorizing a loop, we visit the definition of an instruction before its
  /// uses. When visiting the definition, we either vectorize or scalarize the
  /// instruction, creating an entry for it in the corresponding map. (In some
  /// cases, such as induction variables, we will create both vector and scalar
  /// entries.) Then, as we encounter uses of the definition, we derive values
  /// for each scalar or vector use unless such a value is already available.
  /// For example, if we scalarize a definition and one of its uses is vector,
  /// we build the required vector on-demand with an insertelement sequence
  /// when visiting the use. Otherwise, if the use is scalar, we can use the
  /// existing scalar definition.
  ///
  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll index \p Part. If the value has already been vectorized,
  /// the corresponding vector entry in VectorLoopValueMap is returned. If,
  /// however, the value has a scalar entry in VectorLoopValueMap, we construct
  /// a new vector value on-demand by inserting the scalar values into a vector
  /// with an insertelement sequence. If the value has been neither vectorized
  /// nor scalarized, it must be loop invariant, so we simply broadcast the
  /// value into a vector.
  Value *getOrCreateVectorValue(Value *V, unsigned Part);

  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll and vector indices \p Instance. If the value has been
  /// vectorized but not scalarized, the necessary extractelement instruction
  /// will be generated.
  Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                VPTransformState &State, VPValue *Addr,
                                VPValue *BlockInMask = nullptr);
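  // Example (illustrative): two interleaved loads A[2*i] and A[2*i + 1] with
  // factor 2 and VF = 4 are emitted as one wide load of 8 consecutive elements
  // followed by two shufflevectors that de-interleave the even and odd lanes.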

  /// Vectorize Load and Store instructions with the base address given in \p
  /// Addr, optionally masking the vector operations if \p BlockInMask is
  /// non-null. Use \p State to translate given VPValues to IR values in the
  /// vectorized loop.
  void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
                                  VPValue *Addr, VPValue *StoredValue,
                                  VPValue *BlockInMask);

  /// Set the debug location in the builder using the debug location in
  /// the instruction.
  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(void);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs();

  /// Fix a first-order recurrence. This is the second phase of vectorizing
  /// this phi node.
  void fixFirstOrderRecurrence(PHINode *Phi);

  /// Fix a reduction cross-iteration phi. This is the second phase of
  /// vectorizing this phi node.
  void fixReduction(PHINode *Phi);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);

  /// The Loop exit block may have single value PHI nodes with some
  /// incoming value. While vectorizing we only handled real values
  /// that were defined inside the loop and we should have one value for
  /// each predecessor of its parent basic block. See PR14725.
  void fixLCSSAPHIs();

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths();

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
  /// to each vector element of Val. The sequence starts at StartIdx.
  /// \p Opcode is relevant for FP induction variables.
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                               Instruction::BinaryOps Opcode =
                               Instruction::BinaryOpsEnd);
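  // Example (illustrative): for VF = 4 with Val = splat(%start), StartIdx = 0
  // and Step = %step, the result is %start + <0, 1, 2, 3> * %step, i.e. lane i
  // holds %start + i * %step.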

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Instruction *EntryVal);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in the
  /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned above
  /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In the
  /// latter case \p EntryVal is a TruncInst and we must not record anything for
  /// that IV, but it's error-prone to expect callers of this routine to care
  /// about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
                                             const Instruction *EntryVal,
                                             Value *VectorLoopValue,
                                             unsigned Part,
                                             unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct.
  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader. Also
  /// allocate a loop object for the new vector loop and return it.
  Loop *createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration count
  /// in the scalar epilogue, from where the vectorized loop left off (given by
  /// \p VectorTripCount).
  void createInductionResumeValues(Loop *L, Value *VectorTripCount);

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Take in the vector loop \p L as argument, and return
  /// the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks.  Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones (\see
  /// addNewMetadata).  Use this for *newly created* instructions in the vector
  /// loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning.  It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks.  The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The ExitBlock of the scalar loop.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Maps values from the original loop to their corresponding values in the
  /// vectorized loop. A key value can map to either vector values, scalar
  /// values or both kinds of values, depending on whether the key was
  /// vectorized and scalarized.
  VectorizerValueMap VectorLoopValueMap;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;
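  // For example, with TripCount = 13, VF = 4 and UF = 2 this is
  // 13 - (13 % 8) = 8, so the vector loop covers 8 of the original iterations
  // and the remaining 5 run in the scalar epilogue.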

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided size
  // optimizations.
  bool OptForSizeBasedOnProfile;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                       Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
    if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
    const DILocation *DIL = Inst->getDebugLoc();
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst)) {
      assert(!VF.isScalable() && "scalable vectors not yet supported.");
      auto NewDIL =
          DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
      if (NewDIL)
        B.SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    }
    else
      B.SetCurrentDebugLocation(DIL);
  } else
    B.SetCurrentDebugLocation(DebugLoc());
}

/// Write a record \p DebugMsg about vectorization failure to the debug
/// output stream. If \p I is passed, it is an instruction that prevents
/// vectorization.
#ifndef NDEBUG
static void debugVectorizationFailure(const StringRef DebugMsg,
    Instruction *I) {
  dbgs() << "LV: Not vectorizing: " << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
/// RemarkName is the identifier for the remark.  If \p I is passed it is an
/// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
/// the location of the remark.  \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
    StringRef RemarkName, Loop *TheLoop, Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
  R << "loop not vectorized: ";
  return R;
}

namespace llvm {

void reportVectorizationFailure(const StringRef DebugMsg,
    const StringRef OREMsg, const StringRef ORETag,
    OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
  LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
                ORETag, TheLoop, I) << OREMsg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

namespace llvm {

// Hints for the loop vectorization cost model describing how the scalar
// epilogue loop should be lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate
};

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factor, or None if
  /// vectorization and interleaving should be avoided up front.
  Optional<unsigned> computeMaxVF(unsigned UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor selectVectorizationFactor(unsigned MaxVF);

  /// Setup cost-based decisions for user vectorization factor.
  void selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);

  /// A memory access instruction may be vectorized in more than one way; the
  /// form of the instruction after vectorization depends on its cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decisions map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8>
  calculateRegisterUsage(ArrayRef<ElementCount> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// Split reductions into those that happen in the loop, and those that happen
  /// outside. In-loop reductions are collected into InLoopReductionChains.
  void collectInLoopReductions();

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() &&
           "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.find(I) != Scalars->second.end();
  }

  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.count(I);
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto ScalarsPerVF = Scalars.find(VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.count(I);
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
    return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }

  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize
  };
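  // Illustrative mapping from these decisions to emitted code (a sketch, not
  // exhaustive): CM_Widen becomes a single wide load/store, CM_Widen_Reverse
  // additionally reverses the vector, CM_Interleave uses a wide access plus
  // shuffles shared by the group, CM_GatherScatter uses masked gather/scatter
  // intrinsics, and CM_Scalarize emits VF scalar accesses.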

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
                           unsigned Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// interleaving group \p Grp and vector width \p VF.
  void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
                           ElementCount VF, InstWidening W, unsigned Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    /// Broadcast this decision to all instructions inside the group.
1205     /// But the cost will be assigned to one instruction only.
1206     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1207       if (auto *I = Grp->getMember(i)) {
1208         if (Grp->getInsertPos() == I)
1209           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1210         else
1211           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1212       }
1213     }
1214   }
1215 
1216   /// Return the cost model decision for the given instruction \p I and vector
1217   /// width \p VF. Return CM_Unknown if this instruction did not pass
1218   /// through the cost modeling.
1219   InstWidening getWideningDecision(Instruction *I, ElementCount VF) {
1220     assert(!VF.isScalable() && "scalable vectors not yet supported.");
1221     assert(VF.isVector() && "Expected VF >=2");
1222 
1223     // Cost model is not run in the VPlan-native path - return conservative
1224     // result until this changes.
1225     if (EnableVPlanNativePath)
1226       return CM_GatherScatter;
1227 
1228     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1229     auto Itr = WideningDecisions.find(InstOnVF);
1230     if (Itr == WideningDecisions.end())
1231       return CM_Unknown;
1232     return Itr->second.first;
1233   }
1234 
1235   /// Return the vectorization cost for the given instruction \p I and vector
1236   /// width \p VF.
1237   unsigned getWideningCost(Instruction *I, ElementCount VF) {
1238     assert(VF.isVector() && "Expected VF >=2");
1239     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1240     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1241            "The cost is not calculated");
1242     return WideningDecisions[InstOnVF].second;
1243   }
1244 
1245   /// Return True if instruction \p I is an optimizable truncate whose operand
1246   /// is an induction variable. Such a truncate will be removed by adding a new
1247   /// induction variable with the destination type.
1248   bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1249     // If the instruction is not a truncate, return false.
1250     auto *Trunc = dyn_cast<TruncInst>(I);
1251     if (!Trunc)
1252       return false;
1253 
1254     // Get the source and destination types of the truncate.
1255     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1256     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1257 
1258     // If the truncate is free for the given types, return false. Replacing a
1259     // free truncate with an induction variable would add an induction variable
1260     // update instruction to each iteration of the loop. We exclude from this
1261     // check the primary induction variable since it will need an update
1262     // instruction regardless.
1263     Value *Op = Trunc->getOperand(0);
1264     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1265       return false;
1266 
1267     // If the truncated value is not an induction variable, return false.
1268     return Legal->isInductionPhi(Op);
1269   }
1270 
1271   /// Collects the instructions to scalarize for each predicated instruction in
1272   /// the loop.
1273   void collectInstsToScalarize(ElementCount VF);
1274 
1275   /// Collect Uniform and Scalar values for the given \p VF.
1276   /// The sets depend on CM decision for Load/Store instructions
1277   /// that may be vectorized as interleave, gather-scatter or scalarized.
1278   void collectUniformsAndScalars(ElementCount VF) {
1279     // Do the analysis once.
1280     if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1281       return;
1282     setCostBasedWideningDecision(VF);
1283     collectLoopUniforms(VF);
1284     collectLoopScalars(VF);
1285   }
1286 
1287   /// Returns true if the target machine supports masked store operation
1288   /// for the given \p DataType and kind of access to \p Ptr.
1289   bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) {
1290     return Legal->isConsecutivePtr(Ptr) &&
1291            TTI.isLegalMaskedStore(DataType, Alignment);
1292   }
1293 
1294   /// Returns true if the target machine supports masked load operation
1295   /// for the given \p DataType and kind of access to \p Ptr.
1296   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) {
1297     return Legal->isConsecutivePtr(Ptr) &&
1298            TTI.isLegalMaskedLoad(DataType, Alignment);
1299   }
1300 
1301   /// Returns true if the target machine supports masked scatter operation
1302   /// for the given \p DataType.
1303   bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
1304     return TTI.isLegalMaskedScatter(DataType, Alignment);
1305   }
1306 
1307   /// Returns true if the target machine supports masked gather operation
1308   /// for the given \p DataType.
1309   bool isLegalMaskedGather(Type *DataType, Align Alignment) {
1310     return TTI.isLegalMaskedGather(DataType, Alignment);
1311   }
1312 
1313   /// Returns true if the target machine can represent \p V as a masked gather
1314   /// or scatter operation.
1315   bool isLegalGatherOrScatter(Value *V) {
1316     bool LI = isa<LoadInst>(V);
1317     bool SI = isa<StoreInst>(V);
1318     if (!LI && !SI)
1319       return false;
1320     auto *Ty = getMemInstValueType(V);
1321     Align Align = getLoadStoreAlignment(V);
1322     return (LI && isLegalMaskedGather(Ty, Align)) ||
1323            (SI && isLegalMaskedScatter(Ty, Align));
1324   }
1325 
1326   /// Returns true if \p I is an instruction that will be scalarized with
1327   /// predication. Such instructions include conditional stores and
1328   /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
1331   bool isScalarWithPredication(Instruction *I,
1332                                ElementCount VF = ElementCount::getFixed(1));
1333 
1334   // Returns true if \p I is an instruction that will be predicated either
1335   // through scalar predication or masked load/store or masked gather/scatter.
1336   // Superset of instructions that return true for isScalarWithPredication.
1337   bool isPredicatedInst(Instruction *I) {
1338     if (!blockNeedsPredication(I->getParent()))
1339       return false;
1340     // Loads and stores that need some form of masked operation are predicated
1341     // instructions.
1342     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1343       return Legal->isMaskRequired(I);
1344     return isScalarWithPredication(I);
1345   }
1346 
1347   /// Returns true if \p I is a memory instruction with consecutive memory
1348   /// access that can be widened.
1349   bool
1350   memoryInstructionCanBeWidened(Instruction *I,
1351                                 ElementCount VF = ElementCount::getFixed(1));
1352 
1353   /// Returns true if \p I is a memory instruction in an interleaved-group
1354   /// of memory accesses that can be vectorized with wide vector loads/stores
1355   /// and shuffles.
1356   bool
1357   interleavedAccessCanBeWidened(Instruction *I,
1358                                 ElementCount VF = ElementCount::getFixed(1));
1359 
1360   /// Check if \p Instr belongs to any interleaved access group.
1361   bool isAccessInterleaved(Instruction *Instr) {
1362     return InterleaveInfo.isInterleaved(Instr);
1363   }
1364 
1365   /// Get the interleaved access group that \p Instr belongs to.
1366   const InterleaveGroup<Instruction> *
1367   getInterleavedAccessGroup(Instruction *Instr) {
1368     return InterleaveInfo.getInterleaveGroup(Instr);
1369   }
1370 
1371   /// Returns true if an interleaved group requires a scalar iteration
1372   /// to handle accesses with gaps, and there is nothing preventing us from
1373   /// creating a scalar epilogue.
1374   bool requiresScalarEpilogue() const {
1375     return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
1376   }
1377 
1378   /// Returns true if a scalar epilogue is not allowed due to optsize or a
1379   /// loop hint annotation.
1380   bool isScalarEpilogueAllowed() const {
1381     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1382   }
1383 
1384   /// Returns true if all loop blocks should be masked to fold tail loop.
1385   bool foldTailByMasking() const { return FoldTailByMasking; }
1386 
1387   bool blockNeedsPredication(BasicBlock *BB) {
1388     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1389   }
1390 
1391   /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1392   /// nodes to the chain of instructions representing the reductions. Uses a
1393   /// MapVector to ensure deterministic iteration order.
1394   using ReductionChainMap =
1395       SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1396 
1397   /// Return the chain of instructions representing an inloop reduction.
1398   const ReductionChainMap &getInLoopReductionChains() const {
1399     return InLoopReductionChains;
1400   }
1401 
1402   /// Returns true if the Phi is part of an inloop reduction.
1403   bool isInLoopReduction(PHINode *Phi) const {
1404     return InLoopReductionChains.count(Phi);
1405   }
1406 
1407   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1408   /// with factor VF.  Return the cost of the instruction, including
1409   /// scalarization overhead if it's needed.
1410   unsigned getVectorIntrinsicCost(CallInst *CI, ElementCount VF);
1411 
1412   /// Estimate cost of a call instruction CI if it were vectorized with factor
1413   /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e. either a vector version isn't available or it is too
  /// expensive.
1417   unsigned getVectorCallCost(CallInst *CI, ElementCount VF,
1418                              bool &NeedToScalarize);
1419 
1420   /// Invalidates decisions already taken by the cost model.
1421   void invalidateCostModelingDecisions() {
1422     WideningDecisions.clear();
1423     Uniforms.clear();
1424     Scalars.clear();
1425   }
1426 
1427 private:
1428   unsigned NumPredStores = 0;
1429 
1430   /// \return An upper bound for the vectorization factor, a power-of-2 larger
1431   /// than zero. One is returned if vectorization should best be avoided due
1432   /// to cost.
1433   unsigned computeFeasibleMaxVF(unsigned ConstTripCount);
1434 
  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
1442   using VectorizationCostTy = std::pair<unsigned, bool>;
1443 
1444   /// Returns the expected execution cost. The unit of the cost does
1445   /// not matter because we use the 'cost' units to compare different
1446   /// vector widths. The cost that is returned is *not* normalized by
1447   /// the factor width.
1448   VectorizationCostTy expectedCost(ElementCount VF);
1449 
1450   /// Returns the execution time cost of an instruction for a given vector
1451   /// width. Vector width of one means scalar.
1452   VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1453 
1454   /// The cost-computation logic from getInstructionCost which provides
1455   /// the vector type as an output parameter.
1456   unsigned getInstructionCost(Instruction *I, ElementCount VF, Type *&VectorTy);
1457 
1458   /// Calculate vectorization cost of memory instruction \p I.
1459   unsigned getMemoryInstructionCost(Instruction *I, ElementCount VF);
1460 
1461   /// The cost computation for scalarized memory instruction.
1462   unsigned getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1463 
1464   /// The cost computation for interleaving group of memory instructions.
1465   unsigned getInterleaveGroupCost(Instruction *I, ElementCount VF);
1466 
1467   /// The cost computation for Gather/Scatter instruction.
1468   unsigned getGatherScatterCost(Instruction *I, ElementCount VF);
1469 
1470   /// The cost computation for widening instruction \p I with consecutive
1471   /// memory access.
1472   unsigned getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1473 
  /// The cost calculation for Load/Store instruction \p I with uniform
  /// pointer:
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored ? 0 : extract of last
  /// element).
1478   unsigned getUniformMemOpCost(Instruction *I, ElementCount VF);
1479 
1480   /// Estimate the overhead of scalarizing an instruction. This is a
1481   /// convenience wrapper for the type-based getScalarizationOverhead API.
1482   unsigned getScalarizationOverhead(Instruction *I, ElementCount VF);
1483 
  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
1486   bool isConsecutiveLoadOrStore(Instruction *I);
1487 
1488   /// Returns true if an artificially high cost for emulated masked memrefs
1489   /// should be used.
1490   bool useEmulatedMaskMemRefHack(Instruction *I);
1491 
1492   /// Map of scalar integer values to the smallest bitwidth they can be legally
1493   /// represented as. The vector equivalents of these values should be truncated
1494   /// to this type.
1495   MapVector<Instruction *, uint64_t> MinBWs;
1496 
1497   /// A type representing the costs for instructions if they were to be
1498   /// scalarized rather than vectorized. The entries are Instruction-Cost
1499   /// pairs.
1500   using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1501 
  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
1504   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1505 
1506   /// Records whether it is allowed to have the original scalar loop execute at
1507   /// least once. This may be needed as a fallback loop in case runtime
1508   /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or not a multiple of the VF,
1510   /// or as a peel-loop to handle gaps in interleave-groups.
1511   /// Under optsize and when the trip count is very small we don't allow any
1512   /// iterations to execute in the scalar loop.
1513   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1514 
1515   /// All blocks of loop are to be masked to fold tail of scalar iterations.
1516   bool FoldTailByMasking = false;
1517 
1518   /// A map holding scalar costs for different vectorization factors. The
1519   /// presence of a cost for an instruction in the mapping indicates that the
1520   /// instruction will be scalarized when vectorizing with the associated
1521   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1522   DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1523 
1524   /// Holds the instructions known to be uniform after vectorization.
1525   /// The data is collected per VF.
1526   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1527 
1528   /// Holds the instructions known to be scalar after vectorization.
1529   /// The data is collected per VF.
1530   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1531 
1532   /// Holds the instructions (address computations) that are forced to be
1533   /// scalarized.
1534   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1535 
1536   /// PHINodes of the reductions that should be expanded in-loop along with
1537   /// their associated chains of reduction operations, in program order from top
  /// (PHI) to bottom.
1539   ReductionChainMap InLoopReductionChains;
1540 
1541   /// Returns the expected difference in cost from scalarizing the expression
1542   /// feeding a predicated instruction \p PredInst. The instructions to
1543   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1544   /// non-negative return value implies the expression will be scalarized.
1545   /// Currently, only single-use chains are considered for scalarization.
1546   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1547                               ElementCount VF);
1548 
1549   /// Collect the instructions that are uniform after vectorization. An
1550   /// instruction is uniform if we represent it with a single scalar value in
1551   /// the vectorized loop corresponding to each vector iteration. Examples of
1552   /// uniform instructions include pointer operands of consecutive or
1553   /// interleaved memory accesses. Note that although uniformity implies an
1554   /// instruction will be scalar, the reverse is not true. In general, a
1555   /// scalarized instruction will be represented by VF scalar values in the
1556   /// vectorized loop, each corresponding to an iteration of the original
1557   /// scalar loop.
1558   void collectLoopUniforms(ElementCount VF);
1559 
1560   /// Collect the instructions that are scalar after vectorization. An
1561   /// instruction is scalar if it is known to be uniform or will be scalarized
1562   /// during vectorization. Non-uniform scalarized instructions will be
1563   /// represented by VF values in the vectorized loop, each corresponding to an
1564   /// iteration of the original scalar loop.
1565   void collectLoopScalars(ElementCount VF);
1566 
1567   /// Keeps cost model vectorization decision and cost for instructions.
1568   /// Right now it is used for memory instructions only.
1569   using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1570                                 std::pair<InstWidening, unsigned>>;
1571 
1572   DecisionList WideningDecisions;
1573 
1574   /// Returns true if \p V is expected to be vectorized and it needs to be
1575   /// extracted.
1576   bool needsExtract(Value *V, ElementCount VF) const {
1577     Instruction *I = dyn_cast<Instruction>(V);
1578     if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1579         TheLoop->isLoopInvariant(I))
1580       return false;
1581 
1582     // Assume we can vectorize V (and hence we need extraction) if the
    // scalars are not computed yet. This can happen because it is called
1584     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1585     // the scalars are collected. That should be a safe assumption in most
1586     // cases, because we check if the operands have vectorizable types
1587     // beforehand in LoopVectorizationLegality.
1588     return Scalars.find(VF) == Scalars.end() ||
1589            !isScalarAfterVectorization(I, VF);
1590   };
1591 
1592   /// Returns a range containing only operands needing to be extracted.
1593   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1594                                                    ElementCount VF) {
1595     return SmallVector<Value *, 4>(make_filter_range(
1596         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1597   }
1598 
1599 public:
1600   /// The loop that we evaluate.
1601   Loop *TheLoop;
1602 
1603   /// Predicated scalar evolution analysis.
1604   PredicatedScalarEvolution &PSE;
1605 
1606   /// Loop Info analysis.
1607   LoopInfo *LI;
1608 
1609   /// Vectorization legality.
1610   LoopVectorizationLegality *Legal;
1611 
1612   /// Vector target information.
1613   const TargetTransformInfo &TTI;
1614 
1615   /// Target Library Info.
1616   const TargetLibraryInfo *TLI;
1617 
1618   /// Demanded bits analysis.
1619   DemandedBits *DB;
1620 
1621   /// Assumption cache.
1622   AssumptionCache *AC;
1623 
1624   /// Interface to emit optimization remarks.
1625   OptimizationRemarkEmitter *ORE;
1626 
1627   const Function *TheFunction;
1628 
1629   /// Loop Vectorize Hint.
1630   const LoopVectorizeHints *Hints;
1631 
  /// The interleave access information contains groups of interleaved accesses
  /// with the same stride that are close to each other.
1634   InterleavedAccessInfo &InterleaveInfo;
1635 
1636   /// Values to ignore in the cost model.
1637   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1638 
1639   /// Values to ignore in the cost model when VF > 1.
1640   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1641 };
1642 
1643 } // end namespace llvm
1644 
1645 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
1646 // vectorization. The loop needs to be annotated with #pragma omp simd
1647 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
1648 // vector length information is not provided, vectorization is not considered
1649 // explicit. Interleave hints are not allowed either. These limitations will be
1650 // relaxed in the future.
// Please note that we are currently forced to abuse the pragma 'clang
1652 // vectorize' semantics. This pragma provides *auto-vectorization hints*
1653 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1654 // provides *explicit vectorization hints* (LV can bypass legal checks and
1655 // assume that vectorization is legal). However, both hints are implemented
1656 // using the same metadata (llvm.loop.vectorize, processed by
1657 // LoopVectorizeHints). This will be fixed in the future when the native IR
1658 // representation for pragma 'omp simd' is introduced.
1659 static bool isExplicitVecOuterLoop(Loop *OuterLp,
1660                                    OptimizationRemarkEmitter *ORE) {
1661   assert(!OuterLp->empty() && "This is not an outer loop");
1662   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1663 
1664   // Only outer loops with an explicit vectorization hint are supported.
1665   // Unannotated outer loops are ignored.
1666   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
1667     return false;
1668 
1669   Function *Fn = OuterLp->getHeader()->getParent();
1670   if (!Hints.allowVectorization(Fn, OuterLp,
1671                                 true /*VectorizeOnlyWhenForced*/)) {
1672     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1673     return false;
1674   }
1675 
1676   if (Hints.getInterleave() > 1) {
1677     // TODO: Interleave support is future work.
1678     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1679                          "outer loops.\n");
1680     Hints.emitRemarkWithHints();
1681     return false;
1682   }
1683 
1684   return true;
1685 }
1686 
1687 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
1688                                   OptimizationRemarkEmitter *ORE,
1689                                   SmallVectorImpl<Loop *> &V) {
1690   // Collect inner loops and outer loops without irreducible control flow. For
1691   // now, only collect outer loops that have explicit vectorization hints. If we
1692   // are stress testing the VPlan H-CFG construction, we collect the outermost
1693   // loop of every loop nest.
1694   if (L.empty() || VPlanBuildStressTest ||
1695       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
1696     LoopBlocksRPO RPOT(&L);
1697     RPOT.perform(LI);
1698     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1699       V.push_back(&L);
1700       // TODO: Collect inner loops inside marked outer loops in case
1701       // vectorization fails for the outer loop. Do not invoke
1702       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1703       // already known to be reducible. We can use an inherited attribute for
1704       // that.
1705       return;
1706     }
1707   }
1708   for (Loop *InnerL : L)
1709     collectSupportedLoops(*InnerL, LI, ORE, V);
1710 }
1711 
1712 namespace {
1713 
1714 /// The LoopVectorize Pass.
1715 struct LoopVectorize : public FunctionPass {
1716   /// Pass identification, replacement for typeid
1717   static char ID;
1718 
1719   LoopVectorizePass Impl;
1720 
1721   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
1722                          bool VectorizeOnlyWhenForced = false)
1723       : FunctionPass(ID),
1724         Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
1725     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
1726   }
1727 
1728   bool runOnFunction(Function &F) override {
1729     if (skipFunction(F))
1730       return false;
1731 
1732     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1733     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1734     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1735     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1736     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
1737     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
1738     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
1739     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1740     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1741     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
1742     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
1743     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1744     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
1745 
1746     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
1747         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
1748 
1749     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
1750                         GetLAA, *ORE, PSI).MadeAnyChange;
1751   }
1752 
1753   void getAnalysisUsage(AnalysisUsage &AU) const override {
1754     AU.addRequired<AssumptionCacheTracker>();
1755     AU.addRequired<BlockFrequencyInfoWrapperPass>();
1756     AU.addRequired<DominatorTreeWrapperPass>();
1757     AU.addRequired<LoopInfoWrapperPass>();
1758     AU.addRequired<ScalarEvolutionWrapperPass>();
1759     AU.addRequired<TargetTransformInfoWrapperPass>();
1760     AU.addRequired<AAResultsWrapperPass>();
1761     AU.addRequired<LoopAccessLegacyAnalysis>();
1762     AU.addRequired<DemandedBitsWrapperPass>();
1763     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
1764     AU.addRequired<InjectTLIMappingsLegacy>();
1765 
    // We currently do not preserve LoopInfo/dominator analyses with outer loop
    // vectorization. Until this is addressed, mark these analyses as preserved
    // only for the non-VPlan-native path.
1769     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
1770     if (!EnableVPlanNativePath) {
1771       AU.addPreserved<LoopInfoWrapperPass>();
1772       AU.addPreserved<DominatorTreeWrapperPass>();
1773     }
1774 
1775     AU.addPreserved<BasicAAWrapperPass>();
1776     AU.addPreserved<GlobalsAAWrapperPass>();
1777     AU.addRequired<ProfileSummaryInfoWrapperPass>();
1778   }
1779 };
1780 
1781 } // end anonymous namespace
1782 
1783 //===----------------------------------------------------------------------===//
1784 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1785 // LoopVectorizationCostModel and LoopVectorizationPlanner.
1786 //===----------------------------------------------------------------------===//
1787 
1788 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
  // We need to place the broadcast of invariant variables outside the loop,
  // but only if it's proven safe to do so. Otherwise, the broadcast will be
  // placed inside the vector loop body.
1792   Instruction *Instr = dyn_cast<Instruction>(V);
1793   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
1794                      (!Instr ||
1795                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
1796   // Place the code for broadcasting invariant variables in the new preheader.
1797   IRBuilder<>::InsertPointGuard Guard(Builder);
1798   if (SafeToHoist)
1799     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1800 
1801   // Broadcast the scalar into all locations in the vector.
1802   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
1803 
1804   return Shuf;
1805 }
1806 
1807 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
1808     const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
1809   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1810          "Expected either an induction phi-node or a truncate of it!");
1811   Value *Start = II.getStartValue();
1812 
1813   // Construct the initial value of the vector IV in the vector loop preheader
1814   auto CurrIP = Builder.saveIP();
1815   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1816   if (isa<TruncInst>(EntryVal)) {
1817     assert(Start->getType()->isIntegerTy() &&
1818            "Truncation requires an integer type");
1819     auto *TruncType = cast<IntegerType>(EntryVal->getType());
1820     Step = Builder.CreateTrunc(Step, TruncType);
1821     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
1822   }
1823   Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
1824   Value *SteppedStart =
1825       getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
1826 
1827   // We create vector phi nodes for both integer and floating-point induction
1828   // variables. Here, we determine the kind of arithmetic we will perform.
1829   Instruction::BinaryOps AddOp;
1830   Instruction::BinaryOps MulOp;
1831   if (Step->getType()->isIntegerTy()) {
1832     AddOp = Instruction::Add;
1833     MulOp = Instruction::Mul;
1834   } else {
1835     AddOp = II.getInductionOpcode();
1836     MulOp = Instruction::FMul;
1837   }
1838 
1839   // Multiply the vectorization factor by the step using integer or
1840   // floating-point arithmetic as appropriate.
1841   Value *ConstVF =
1842       getSignedIntOrFpConstant(Step->getType(), VF.getKnownMinValue());
1843   Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
1844 
1845   // Create a vector splat to use in the induction update.
1846   //
1847   // FIXME: If the step is non-constant, we create the vector splat with
1848   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
1849   //        handle a constant vector splat.
1850   assert(!VF.isScalable() && "scalable vectors not yet supported.");
1851   Value *SplatVF = isa<Constant>(Mul)
1852                        ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
1853                        : Builder.CreateVectorSplat(VF, Mul);
1854   Builder.restoreIP(CurrIP);
1855 
1856   // We may need to add the step a number of times, depending on the unroll
1857   // factor. The last of those goes into the PHI.
1858   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
1859                                     &*LoopVectorBody->getFirstInsertionPt());
1860   VecInd->setDebugLoc(EntryVal->getDebugLoc());
1861   Instruction *LastInduction = VecInd;
1862   for (unsigned Part = 0; Part < UF; ++Part) {
1863     VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
1864 
1865     if (isa<TruncInst>(EntryVal))
1866       addMetadata(LastInduction, EntryVal);
1867     recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
1868 
1869     LastInduction = cast<Instruction>(addFastMathFlag(
1870         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
1871     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
1872   }
1873 
1874   // Move the last step to the end of the latch block. This ensures consistent
1875   // placement of all induction updates.
1876   auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
1877   auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
1878   auto *ICmp = cast<Instruction>(Br->getCondition());
1879   LastInduction->moveBefore(ICmp);
1880   LastInduction->setName("vec.ind.next");
1881 
1882   VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
1883   VecInd->addIncoming(LastInduction, LoopVectorLatch);
1884 }
1885 
1886 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
1887   return Cost->isScalarAfterVectorization(I, VF) ||
1888          Cost->isProfitableToScalarize(I, VF);
1889 }
1890 
1891 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
1892   if (shouldScalarizeInstruction(IV))
1893     return true;
1894   auto isScalarInst = [&](User *U) -> bool {
1895     auto *I = cast<Instruction>(U);
1896     return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
1897   };
1898   return llvm::any_of(IV->users(), isScalarInst);
1899 }
1900 
1901 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
1902     const InductionDescriptor &ID, const Instruction *EntryVal,
1903     Value *VectorLoopVal, unsigned Part, unsigned Lane) {
1904   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1905          "Expected either an induction phi-node or a truncate of it!");
1906 
  // This induction variable is not the phi from the original loop but the
  // newly-created IV based on the proof that the casted Phi is equal to the
  // uncasted Phi in the vectorized loop (possibly under a runtime guard). It
  // re-uses the same InductionDescriptor that the original IV uses, but we
  // don't have to do any recording in this case - that is done when the
  // original IV is processed.
1913   if (isa<TruncInst>(EntryVal))
1914     return;
1915 
1916   const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
1917   if (Casts.empty())
1918     return;
1919   // Only the first Cast instruction in the Casts vector is of interest.
  // The rest of the Casts (if they exist) have no uses outside the
1921   // induction update chain itself.
1922   Instruction *CastInst = *Casts.begin();
1923   if (Lane < UINT_MAX)
1924     VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
1925   else
1926     VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
1927 }
1928 
1929 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
1930   assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
1931          "Primary induction variable must have an integer type");
1932 
1933   auto II = Legal->getInductionVars().find(IV);
1934   assert(II != Legal->getInductionVars().end() && "IV is not an induction");
1935 
1936   auto ID = II->second;
1937   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
1938 
1939   // The value from the original loop to which we are mapping the new induction
1940   // variable.
1941   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
1942 
1943   auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
1944 
1945   // Generate code for the induction step. Note that induction steps are
  // required to be loop-invariant.
1947   auto CreateStepValue = [&](const SCEV *Step) -> Value * {
1948     assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
1949            "Induction step should be loop invariant");
1950     if (PSE.getSE()->isSCEVable(IV->getType())) {
1951       SCEVExpander Exp(*PSE.getSE(), DL, "induction");
1952       return Exp.expandCodeFor(Step, Step->getType(),
1953                                LoopVectorPreHeader->getTerminator());
1954     }
1955     return cast<SCEVUnknown>(Step)->getValue();
1956   };
1957 
1958   // The scalar value to broadcast. This is derived from the canonical
1959   // induction variable. If a truncation type is given, truncate the canonical
1960   // induction variable and step. Otherwise, derive these values from the
1961   // induction descriptor.
1962   auto CreateScalarIV = [&](Value *&Step) -> Value * {
1963     Value *ScalarIV = Induction;
1964     if (IV != OldInduction) {
1965       ScalarIV = IV->getType()->isIntegerTy()
1966                      ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
1967                      : Builder.CreateCast(Instruction::SIToFP, Induction,
1968                                           IV->getType());
1969       ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
1970       ScalarIV->setName("offset.idx");
1971     }
1972     if (Trunc) {
1973       auto *TruncType = cast<IntegerType>(Trunc->getType());
1974       assert(Step->getType()->isIntegerTy() &&
1975              "Truncation requires an integer step");
1976       ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
1977       Step = Builder.CreateTrunc(Step, TruncType);
1978     }
1979     return ScalarIV;
1980   };
1981 
1982   // Create the vector values from the scalar IV, in the absence of creating a
1983   // vector IV.
1984   auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
1985     Value *Broadcasted = getBroadcastInstrs(ScalarIV);
1986     for (unsigned Part = 0; Part < UF; ++Part) {
1987       assert(!VF.isScalable() && "scalable vectors not yet supported.");
1988       Value *EntryPart =
1989           getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step,
1990                         ID.getInductionOpcode());
1991       VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
1992       if (Trunc)
1993         addMetadata(EntryPart, Trunc);
1994       recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
1995     }
1996   };
1997 
1998   // Now do the actual transformations, and start with creating the step value.
1999   Value *Step = CreateStepValue(ID.getStep());
2000   if (VF.isZero() || VF.isScalar()) {
2001     Value *ScalarIV = CreateScalarIV(Step);
2002     CreateSplatIV(ScalarIV, Step);
2003     return;
2004   }
2005 
2006   // Determine if we want a scalar version of the induction variable. This is
2007   // true if the induction variable itself is not widened, or if it has at
2008   // least one user in the loop that is not widened.
2009   auto NeedsScalarIV = needsScalarInduction(EntryVal);
2010   if (!NeedsScalarIV) {
2011     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
2012     return;
2013   }
2014 
2015   // Try to create a new independent vector induction variable. If we can't
2016   // create the phi node, we will splat the scalar induction variable in each
2017   // loop iteration.
2018   if (!shouldScalarizeInstruction(EntryVal)) {
2019     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
2020     Value *ScalarIV = CreateScalarIV(Step);
2021     // Create scalar steps that can be used by instructions we will later
2022     // scalarize. Note that the addition of the scalar steps will not increase
2023     // the number of instructions in the loop in the common case prior to
2024     // InstCombine. We will be trading one vector extract for each scalar step.
2025     buildScalarSteps(ScalarIV, Step, EntryVal, ID);
2026     return;
2027   }
2028 
2029   // All IV users are scalar instructions, so only emit a scalar IV, not a
  // vectorized IV. Except when we tail-fold, then the splat IV feeds the
2031   // predicate used by the masked loads/stores.
2032   Value *ScalarIV = CreateScalarIV(Step);
2033   if (!Cost->isScalarEpilogueAllowed())
2034     CreateSplatIV(ScalarIV, Step);
2035   buildScalarSteps(ScalarIV, Step, EntryVal, ID);
2036 }
2037 
2038 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
2039                                           Instruction::BinaryOps BinOp) {
2040   // Create and check the types.
2041   auto *ValVTy = cast<FixedVectorType>(Val->getType());
2042   int VLen = ValVTy->getNumElements();
2043 
2044   Type *STy = Val->getType()->getScalarType();
2045   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2046          "Induction Step must be an integer or FP");
2047   assert(Step->getType() == STy && "Step has wrong type");
2048 
2049   SmallVector<Constant *, 8> Indices;
2050 
2051   if (STy->isIntegerTy()) {
    // Create a vector of consecutive indices starting at StartIdx.
2053     for (int i = 0; i < VLen; ++i)
2054       Indices.push_back(ConstantInt::get(STy, StartIdx + i));
2055 
2056     // Add the consecutive indices to the vector value.
2057     Constant *Cv = ConstantVector::get(Indices);
2058     assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
2059     Step = Builder.CreateVectorSplat(VLen, Step);
2060     assert(Step->getType() == Val->getType() && "Invalid step vec");
    // FIXME: The newly created binary instructions should contain nsw/nuw
    // flags, which can be found from the original scalar operations.
2063     Step = Builder.CreateMul(Cv, Step);
2064     return Builder.CreateAdd(Val, Step, "induction");
2065   }
2066 
2067   // Floating point induction.
2068   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2069          "Binary Opcode should be specified for FP induction");
  // Create a vector of consecutive indices starting at StartIdx.
2071   for (int i = 0; i < VLen; ++i)
2072     Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
2073 
2074   // Add the consecutive indices to the vector value.
2075   Constant *Cv = ConstantVector::get(Indices);
2076 
2077   Step = Builder.CreateVectorSplat(VLen, Step);
2078 
2079   // Floating point operations had to be 'fast' to enable the induction.
2080   FastMathFlags Flags;
2081   Flags.setFast();
2082 
2083   Value *MulOp = Builder.CreateFMul(Cv, Step);
2084   if (isa<Instruction>(MulOp))
    // Have to check because MulOp may be a constant.
2086     cast<Instruction>(MulOp)->setFastMathFlags(Flags);
2087 
2088   Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2089   if (isa<Instruction>(BOp))
2090     cast<Instruction>(BOp)->setFastMathFlags(Flags);
2091   return BOp;
2092 }
2093 
2094 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
2095                                            Instruction *EntryVal,
2096                                            const InductionDescriptor &ID) {
2097   // We shouldn't have to build scalar steps if we aren't vectorizing.
2098   assert(VF.isVector() && "VF should be greater than one");
2099   assert(!VF.isScalable() &&
2100          "the code below assumes a fixed number of elements at compile time");
  // Get the value type and ensure that it and the step have the same type.
2102   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2103   assert(ScalarIVTy == Step->getType() &&
2104          "Val and Step should have the same type");
2105 
2106   // We build scalar steps for both integer and floating-point induction
2107   // variables. Here, we determine the kind of arithmetic we will perform.
2108   Instruction::BinaryOps AddOp;
2109   Instruction::BinaryOps MulOp;
2110   if (ScalarIVTy->isIntegerTy()) {
2111     AddOp = Instruction::Add;
2112     MulOp = Instruction::Mul;
2113   } else {
2114     AddOp = ID.getInductionOpcode();
2115     MulOp = Instruction::FMul;
2116   }
2117 
2118   // Determine the number of scalars we need to generate for each unroll
2119   // iteration. If EntryVal is uniform, we only need to generate the first
2120   // lane. Otherwise, we generate all VF values.
2121   unsigned Lanes =
2122       Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF)
2123           ? 1
2124           : VF.getKnownMinValue();
2125   // Compute the scalar steps and save the results in VectorLoopValueMap.
2126   for (unsigned Part = 0; Part < UF; ++Part) {
2127     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2128       auto *StartIdx = getSignedIntOrFpConstant(
2129           ScalarIVTy, VF.getKnownMinValue() * Part + Lane);
2130       auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
2131       auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
2132       VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
2133       recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
2134     }
2135   }
2136 }
2137 
2138 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
2139   assert(V != Induction && "The new induction variable should not be used.");
2140   assert(!V->getType()->isVectorTy() && "Can't widen a vector");
2141   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2142 
2143   // If we have a stride that is replaced by one, do it here. Defer this for
2144   // the VPlan-native path until we start running Legal checks in that path.
2145   if (!EnableVPlanNativePath && Legal->hasStride(V))
2146     V = ConstantInt::get(V->getType(), 1);
2147 
2148   // If we have a vector mapped to this value, return it.
2149   if (VectorLoopValueMap.hasVectorValue(V, Part))
2150     return VectorLoopValueMap.getVectorValue(V, Part);
2151 
2152   // If the value has not been vectorized, check if it has been scalarized
2153   // instead. If it has been scalarized, and we actually need the value in
2154   // vector form, we will construct the vector values on demand.
2155   if (VectorLoopValueMap.hasAnyScalarValue(V)) {
2156     Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
2157 
2158     // If we've scalarized a value, that value should be an instruction.
2159     auto *I = cast<Instruction>(V);
2160 
2161     // If we aren't vectorizing, we can just copy the scalar map values over to
2162     // the vector map.
2163     if (VF == 1) {
2164       VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
2165       return ScalarValue;
2166     }
2167 
2168     // Get the last scalar instruction we generated for V and Part. If the value
2169     // is known to be uniform after vectorization, this corresponds to lane zero
2170     // of the Part unroll iteration. Otherwise, the last instruction is the one
2171     // we created for the last vector lane of the Part unroll iteration.
2172     assert(!VF.isScalable() && "scalable vectors not yet supported.");
2173     unsigned LastLane = Cost->isUniformAfterVectorization(I, VF)
2174                             ? 0
2175                             : VF.getKnownMinValue() - 1;
2176     auto *LastInst = cast<Instruction>(
2177         VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
2178 
2179     // Set the insert point after the last scalarized instruction. This ensures
2180     // the insertelement sequence will directly follow the scalar definitions.
2181     auto OldIP = Builder.saveIP();
2182     auto NewIP = std::next(BasicBlock::iterator(LastInst));
2183     Builder.SetInsertPoint(&*NewIP);
2184 
2185     // However, if we are vectorizing, we need to construct the vector values.
2186     // If the value is known to be uniform after vectorization, we can just
2187     // broadcast the scalar value corresponding to lane zero for each unroll
2188     // iteration. Otherwise, we construct the vector values using insertelement
2189     // instructions. Since the resulting vectors are stored in
2190     // VectorLoopValueMap, we will only generate the insertelements once.
2191     Value *VectorValue = nullptr;
2192     if (Cost->isUniformAfterVectorization(I, VF)) {
2193       VectorValue = getBroadcastInstrs(ScalarValue);
2194       VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
2195     } else {
2196       // Initialize packing with insertelements to start from undef.
2197       assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2198       Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
2199       VectorLoopValueMap.setVectorValue(V, Part, Undef);
2200       for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
2201         packScalarIntoVectorValue(V, {Part, Lane});
2202       VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
2203     }
2204     Builder.restoreIP(OldIP);
2205     return VectorValue;
2206   }
2207 
2208   // If this scalar is unknown, assume that it is a constant or that it is
2209   // loop invariant. Broadcast V and save the value for future uses.
2210   Value *B = getBroadcastInstrs(V);
2211   VectorLoopValueMap.setVectorValue(V, Part, B);
2212   return B;
2213 }
2214 
2215 Value *
2216 InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
2217                                             const VPIteration &Instance) {
2218   // If the value is not an instruction contained in the loop, it should
2219   // already be scalar.
2220   if (OrigLoop->isLoopInvariant(V))
2221     return V;
2222 
2223   assert(Instance.Lane > 0
2224              ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
2225              : true && "Uniform values only have lane zero");
2226 
2227   // If the value from the original loop has not been vectorized, it is
2228   // represented by UF x VF scalar values in the new loop. Return the requested
2229   // scalar value.
2230   if (VectorLoopValueMap.hasScalarValue(V, Instance))
2231     return VectorLoopValueMap.getScalarValue(V, Instance);
2232 
2233   // If the value has not been scalarized, get its entry in VectorLoopValueMap
2234   // for the given unroll part. If this entry is not a vector type (i.e., the
2235   // vectorization factor is one), there is no need to generate an
2236   // extractelement instruction.
2237   auto *U = getOrCreateVectorValue(V, Instance.Part);
2238   if (!U->getType()->isVectorTy()) {
2239     assert(VF == 1 && "Value not scalarized has non-vector type");
2240     return U;
2241   }
2242 
2243   // Otherwise, the value from the original loop has been vectorized and is
2244   // represented by UF vector values. Extract and return the requested scalar
2245   // value from the appropriate vector lane.
2246   return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
2247 }
2248 
2249 void InnerLoopVectorizer::packScalarIntoVectorValue(
2250     Value *V, const VPIteration &Instance) {
2251   assert(V != Induction && "The new induction variable should not be used.");
2252   assert(!V->getType()->isVectorTy() && "Can't pack a vector");
2253   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2254 
2255   Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
2256   Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
2257   VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
2258                                             Builder.getInt32(Instance.Lane));
2259   VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
2260 }
2261 
2262 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2263   assert(Vec->getType()->isVectorTy() && "Invalid type");
2264   assert(!VF.isScalable() && "Cannot reverse scalable vectors");
2265   SmallVector<int, 8> ShuffleMask;
2266   for (unsigned i = 0; i < VF.getKnownMinValue(); ++i)
2267     ShuffleMask.push_back(VF.getKnownMinValue() - i - 1);
2268 
2269   return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
2270                                      ShuffleMask, "reverse");
2271 }
2272 
2273 // Return whether we allow using masked interleave-groups (for dealing with
2274 // strided loads/stores that reside in predicated blocks, or for dealing
2275 // with gaps).
2276 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2277   // If an override option has been passed in for interleaved accesses, use it.
2278   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2279     return EnableMaskedInterleavedMemAccesses;
2280 
2281   return TTI.enableMaskedInterleavedAccessVectorization();
2282 }
2283 
2284 // Try to vectorize the interleave group that \p Instr belongs to.
2285 //
// E.g. Translate the following interleaved load group (factor = 3):
2287 //   for (i = 0; i < N; i+=3) {
2288 //     R = Pic[i];             // Member of index 0
2289 //     G = Pic[i+1];           // Member of index 1
2290 //     B = Pic[i+2];           // Member of index 2
2291 //     ... // do something to R, G, B
2292 //   }
2293 // To:
2294 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2295 //   %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9>   ; R elements
2296 //   %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10>  ; G elements
2297 //   %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11>  ; B elements
2298 //
// Or translate the following interleaved store group (factor = 3):
2300 //   for (i = 0; i < N; i+=3) {
2301 //     ... do something to R, G, B
2302 //     Pic[i]   = R;           // Member of index 0
2303 //     Pic[i+1] = G;           // Member of index 1
2304 //     Pic[i+2] = B;           // Member of index 2
2305 //   }
2306 // To:
2307 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2308 //   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2309 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2310 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2311 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2312 void InnerLoopVectorizer::vectorizeInterleaveGroup(
2313     const InterleaveGroup<Instruction> *Group, VPTransformState &State,
2314     VPValue *Addr, VPValue *BlockInMask) {
2315   Instruction *Instr = Group->getInsertPos();
2316   const DataLayout &DL = Instr->getModule()->getDataLayout();
2317 
  // Prepare the vector type of the interleaved load/store.
2319   Type *ScalarTy = getMemInstValueType(Instr);
2320   unsigned InterleaveFactor = Group->getFactor();
2321   assert(!VF.isScalable() && "scalable vectors not yet supported.");
2322   auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2323 
  // Prepare the new pointers.
2325   SmallVector<Value *, 2> AddrParts;
2326   unsigned Index = Group->getIndex(Instr);
2327 
2328   // TODO: extend the masked interleaved-group support to reversed access.
2329   assert((!BlockInMask || !Group->isReverse()) &&
2330          "Reversed masked interleave-group not supported.");
2331 
2332   // If the group is reverse, adjust the index to refer to the last vector lane
2333   // instead of the first. We adjust the index from the first vector lane,
2334   // rather than directly getting the pointer for lane VF - 1, because the
2335   // pointer operand of the interleaved access is supposed to be uniform. For
2336   // uniform instructions, we're only required to generate a value for the
2337   // first vector lane in each unroll iteration.
2338   assert(!VF.isScalable() &&
2339          "scalable vector reverse operation is not implemented");
2340   if (Group->isReverse())
2341     Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
2342 
2343   for (unsigned Part = 0; Part < UF; Part++) {
2344     Value *AddrPart = State.get(Addr, {Part, 0});
2345     setDebugLocFromInst(Builder, AddrPart);
2346 
    // Note that the current instruction could be at any index. We need to
    // adjust the address to the member of index 0.
    //
    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
    //       b = A[i];       // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
    //
    // E.g.  A[i+1] = a;     // Member of index 1
    //       A[i]   = b;     // Member of index 0
    //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].
2358 
2359     bool InBounds = false;
2360     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2361       InBounds = gep->isInBounds();
2362     AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2363     cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2364 
2365     // Cast to the vector pointer type.
2366     unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2367     Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2368     AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2369   }
2370 
2371   setDebugLocFromInst(Builder, Instr);
2372   Value *UndefVec = UndefValue::get(VecTy);
2373 
2374   Value *MaskForGaps = nullptr;
2375   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2376     assert(!VF.isScalable() && "scalable vectors not yet supported.");
2377     MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2378     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2379   }
2380 
2381   // Vectorize the interleaved load group.
2382   if (isa<LoadInst>(Instr)) {
2383     // For each unroll part, create a wide load for the group.
2384     SmallVector<Value *, 2> NewLoads;
2385     for (unsigned Part = 0; Part < UF; Part++) {
2386       Instruction *NewLoad;
2387       if (BlockInMask || MaskForGaps) {
2388         assert(useMaskedInterleavedAccesses(*TTI) &&
2389                "masked interleaved groups are not allowed.");
2390         Value *GroupMask = MaskForGaps;
2391         if (BlockInMask) {
2392           Value *BlockInMaskPart = State.get(BlockInMask, Part);
2393           auto *Undefs = UndefValue::get(BlockInMaskPart->getType());
2394           assert(!VF.isScalable() && "scalable vectors not yet supported.");
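          // The block mask has one bit per original scalar iteration, so it
          // is replicated InterleaveFactor times to guard every member of
          // that iteration's group. E.g., for an interleave factor of 2 and
          // VF = 4 the replicated mask is expected to be <0,0,1,1,2,2,3,3>.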
2395           Value *ShuffledMask = Builder.CreateShuffleVector(
2396               BlockInMaskPart, Undefs,
2397               createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2398               "interleaved.mask");
2399           GroupMask = MaskForGaps
2400                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2401                                                 MaskForGaps)
2402                           : ShuffledMask;
2403         }
2404         NewLoad =
2405             Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(),
2406                                      GroupMask, UndefVec, "wide.masked.vec");
      } else
2409         NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2410                                             Group->getAlign(), "wide.vec");
2411       Group->addMetadata(NewLoad);
2412       NewLoads.push_back(NewLoad);
2413     }
2414 
2415     // For each member in the group, shuffle out the appropriate data from the
2416     // wide loads.
2417     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2418       Instruction *Member = Group->getMember(I);
2419 
2420       // Skip the gaps in the group.
2421       if (!Member)
2422         continue;
2423 
2424       assert(!VF.isScalable() && "scalable vectors not yet supported.");
2425       auto StrideMask =
2426           createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
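      // E.g., for a member at index 1 in a group with interleave factor 2 and
      // VF = 4, the strided mask is expected to be <1,3,5,7>, picking that
      // member's element from each of the 4 scalar iterations in the wide
      // load.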
2427       for (unsigned Part = 0; Part < UF; Part++) {
2428         Value *StridedVec = Builder.CreateShuffleVector(
2429             NewLoads[Part], UndefVec, StrideMask, "strided.vec");
2430 
        // If this member has a different type, cast the result to that type.
2432         if (Member->getType() != ScalarTy) {
2433           assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2434           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2435           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2436         }
2437 
2438         if (Group->isReverse())
2439           StridedVec = reverseVector(StridedVec);
2440 
2441         VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
2442       }
2443     }
2444     return;
2445   }
2446 
  // The sub-vector type for the current instruction.
2448   assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2449   auto *SubVT = VectorType::get(ScalarTy, VF);
2450 
2451   // Vectorize the interleaved store group.
2452   for (unsigned Part = 0; Part < UF; Part++) {
2453     // Collect the stored vector from each member.
2454     SmallVector<Value *, 4> StoredVecs;
2455     for (unsigned i = 0; i < InterleaveFactor; i++) {
      // An interleaved store group doesn't allow gaps, so each index has a
      // member.
2457       Instruction *Member = Group->getMember(i);
      assert(Member &&
             "Failed to get a member from an interleaved store group");
2459 
2460       Value *StoredVec = getOrCreateVectorValue(
2461           cast<StoreInst>(Member)->getValueOperand(), Part);
2462       if (Group->isReverse())
2463         StoredVec = reverseVector(StoredVec);
2464 
      // If this member has a different type, cast it to the unified type.
2467       if (StoredVec->getType() != SubVT)
2468         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2469 
2470       StoredVecs.push_back(StoredVec);
2471     }
2472 
2473     // Concatenate all vectors into a wide vector.
2474     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2475 
2476     // Interleave the elements in the wide vector.
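    // E.g., for VF = 4 and an interleave factor of 2, the concatenated vector
    // <a0,a1,a2,a3,b0,b1,b2,b3> is shuffled with mask <0,4,1,5,2,6,3,7> into
    // <a0,b0,a1,b1,a2,b2,a3,b3>.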
2477     assert(!VF.isScalable() && "scalable vectors not yet supported.");
2478     Value *IVec = Builder.CreateShuffleVector(
2479         WideVec, UndefVec,
2480         createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
2481         "interleaved.vec");
2482 
2483     Instruction *NewStoreInstr;
2484     if (BlockInMask) {
2485       Value *BlockInMaskPart = State.get(BlockInMask, Part);
2486       auto *Undefs = UndefValue::get(BlockInMaskPart->getType());
2487       Value *ShuffledMask = Builder.CreateShuffleVector(
2488           BlockInMaskPart, Undefs,
2489           createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2490           "interleaved.mask");
2491       NewStoreInstr = Builder.CreateMaskedStore(
2492           IVec, AddrParts[Part], Group->getAlign(), ShuffledMask);
    } else
2495       NewStoreInstr =
2496           Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2497 
2498     Group->addMetadata(NewStoreInstr);
2499   }
2500 }
2501 
2502 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
2503                                                      VPTransformState &State,
2504                                                      VPValue *Addr,
2505                                                      VPValue *StoredValue,
2506                                                      VPValue *BlockInMask) {
2507   // Attempt to issue a wide load.
2508   LoadInst *LI = dyn_cast<LoadInst>(Instr);
2509   StoreInst *SI = dyn_cast<StoreInst>(Instr);
2510 
2511   assert((LI || SI) && "Invalid Load/Store instruction");
2512   assert((!SI || StoredValue) && "No stored value provided for widened store");
2513   assert((!LI || !StoredValue) && "Stored value provided for widened load");
2514 
2515   LoopVectorizationCostModel::InstWidening Decision =
2516       Cost->getWideningDecision(Instr, VF);
2517   assert((Decision == LoopVectorizationCostModel::CM_Widen ||
2518           Decision == LoopVectorizationCostModel::CM_Widen_Reverse ||
2519           Decision == LoopVectorizationCostModel::CM_GatherScatter) &&
2520          "CM decision is not to widen the memory instruction");
2521 
2522   Type *ScalarDataTy = getMemInstValueType(Instr);
2523 
2524   assert(!VF.isScalable() && "scalable vectors not yet supported.");
2525   auto *DataTy = VectorType::get(ScalarDataTy, VF);
2526   const Align Alignment = getLoadStoreAlignment(Instr);
2527 
2528   // Determine if the pointer operand of the access is either consecutive or
2529   // reverse consecutive.
2530   bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2531   bool ConsecutiveStride =
2532       Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2533   bool CreateGatherScatter =
2534       (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2535 
2536   // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2537   // gather/scatter. Otherwise Decision should have been to Scalarize.
2538   assert((ConsecutiveStride || CreateGatherScatter) &&
2539          "The instruction should be scalarized");
2540   (void)ConsecutiveStride;
2541 
2542   VectorParts BlockInMaskParts(UF);
2543   bool isMaskRequired = BlockInMask;
2544   if (isMaskRequired)
2545     for (unsigned Part = 0; Part < UF; ++Part)
2546       BlockInMaskParts[Part] = State.get(BlockInMask, Part);
2547 
2548   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2549     // Calculate the pointer for the specific unroll-part.
2550     GetElementPtrInst *PartPtr = nullptr;
2551 
2552     bool InBounds = false;
2553     if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2554       InBounds = gep->isInBounds();
2555 
2556     if (Reverse) {
2557       // If the address is consecutive but reversed, then the
2558       // wide store needs to start at the last vector element.
2559       PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
2560           ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.getKnownMinValue())));
2561       PartPtr->setIsInBounds(InBounds);
2562       PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
2563           ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.getKnownMinValue())));
2564       PartPtr->setIsInBounds(InBounds);
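      // E.g., with VF = 4, part 1 is offset by -1 * 4 and then by 1 - 4 = -3
      // elements, so its wide access covers elements [-7, -4] relative to Ptr
      // (the lanes of the part in reverse); the loaded or stored values are
      // reversed elsewhere so the lanes match the original iteration order.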
2565       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2566         BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
2567     } else {
2568       PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
2569           ScalarDataTy, Ptr, Builder.getInt32(Part * VF.getKnownMinValue())));
2570       PartPtr->setIsInBounds(InBounds);
2571     }
2572 
2573     unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
2574     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2575   };
2576 
2577   // Handle Stores:
2578   if (SI) {
2579     setDebugLocFromInst(Builder, SI);
2580 
2581     for (unsigned Part = 0; Part < UF; ++Part) {
2582       Instruction *NewSI = nullptr;
2583       Value *StoredVal = State.get(StoredValue, Part);
2584       if (CreateGatherScatter) {
2585         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2586         Value *VectorGep = State.get(Addr, Part);
2587         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
2588                                             MaskPart);
2589       } else {
2590         if (Reverse) {
2591           // If we store to reverse consecutive memory locations, then we need
2592           // to reverse the order of elements in the stored value.
2593           StoredVal = reverseVector(StoredVal);
2594           // We don't want to update the value in the map as it might be used in
2595           // another expression. So don't call resetVectorValue(StoredVal).
2596         }
2597         auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2598         if (isMaskRequired)
2599           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
2600                                             BlockInMaskParts[Part]);
2601         else
2602           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
2603       }
2604       addMetadata(NewSI, SI);
2605     }
2606     return;
2607   }
2608 
2609   // Handle loads.
2610   assert(LI && "Must have a load instruction");
2611   setDebugLocFromInst(Builder, LI);
2612   for (unsigned Part = 0; Part < UF; ++Part) {
2613     Value *NewLI;
2614     if (CreateGatherScatter) {
2615       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2616       Value *VectorGep = State.get(Addr, Part);
2617       NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
2618                                          nullptr, "wide.masked.gather");
2619       addMetadata(NewLI, LI);
2620     } else {
2621       auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2622       if (isMaskRequired)
2623         NewLI = Builder.CreateMaskedLoad(
2624             VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy),
2625             "wide.masked.load");
2626       else
2627         NewLI =
2628             Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
2629 
2630       // Add metadata to the load, but setVectorValue to the reverse shuffle.
2631       addMetadata(NewLI, LI);
2632       if (Reverse)
2633         NewLI = reverseVector(NewLI);
2634     }
2635     VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
2636   }
2637 }
2638 
2639 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User,
2640                                                const VPIteration &Instance,
2641                                                bool IfPredicateInstr,
2642                                                VPTransformState &State) {
2643   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2644 
2645   setDebugLocFromInst(Builder, Instr);
2646 
  // Does this instruction return a value?
2648   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2649 
2650   Instruction *Cloned = Instr->clone();
2651   if (!IsVoidRetTy)
2652     Cloned->setName(Instr->getName() + ".cloned");
2653 
2654   // Replace the operands of the cloned instructions with their scalar
2655   // equivalents in the new loop.
2656   for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) {
2657     auto *NewOp = State.get(User.getOperand(op), Instance);
2658     Cloned->setOperand(op, NewOp);
2659   }
2660   addNewMetadata(Cloned, Instr);
2661 
2662   // Place the cloned scalar in the new loop.
2663   Builder.Insert(Cloned);
2664 
2665   // Add the cloned scalar to the scalar map entry.
2666   VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2667 
2668   // If we just cloned a new assumption, add it the assumption cache.
2669   if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2670     if (II->getIntrinsicID() == Intrinsic::assume)
2671       AC->registerAssumption(II);
2672 
2673   // End if-block.
2674   if (IfPredicateInstr)
2675     PredicatedInstructions.push_back(Cloned);
2676 }
2677 
2678 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2679                                                       Value *End, Value *Step,
2680                                                       Instruction *DL) {
2681   BasicBlock *Header = L->getHeader();
2682   BasicBlock *Latch = L->getLoopLatch();
2683   // As we're just creating this loop, it's possible no latch exists
  // yet. If so, use the header as this will be a single-block loop.
2685   if (!Latch)
2686     Latch = Header;
2687 
2688   IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2689   Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2690   setDebugLocFromInst(Builder, OldInst);
2691   auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2692 
2693   Builder.SetInsertPoint(Latch->getTerminator());
2694   setDebugLocFromInst(Builder, OldInst);
2695 
2696   // Create i+1 and fill the PHINode.
2697   Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2698   Induction->addIncoming(Start, L->getLoopPreheader());
2699   Induction->addIncoming(Next, Latch);
2700   // Create the compare.
2701   Value *ICmp = Builder.CreateICmpEQ(Next, End);
2702   Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
2703 
2704   // Now we have two terminators. Remove the old one from the block.
2705   Latch->getTerminator()->eraseFromParent();
2706 
2707   return Induction;
2708 }
2709 
2710 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2711   if (TripCount)
2712     return TripCount;
2713 
2714   assert(L && "Create Trip Count for null loop.");
2715   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2716   // Find the loop boundaries.
2717   ScalarEvolution *SE = PSE.getSE();
2718   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2719   assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
2720          "Invalid loop count");
2721 
2722   Type *IdxTy = Legal->getWidestInductionType();
2723   assert(IdxTy && "No type for induction");
2724 
  // The exit count might have type i64 while the phi is i32. This can happen
  // if we have an induction variable that is sign-extended before the compare.
  // The only way we can get a backedge-taken count in that case is if the
  // induction variable was signed and therefore will not overflow, so the
  // truncation is legal.
2730   if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
2731       IdxTy->getPrimitiveSizeInBits())
2732     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2733   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2734 
2735   // Get the total trip count from the count by adding 1.
2736   const SCEV *ExitCount = SE->getAddExpr(
2737       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2738 
2739   const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
2740 
2741   // Expand the trip count and place the new instructions in the preheader.
2742   // Notice that the pre-header does not change, only the loop body.
2743   SCEVExpander Exp(*SE, DL, "induction");
2744 
2745   // Count holds the overall loop count (N).
2746   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2747                                 L->getLoopPreheader()->getTerminator());
2748 
2749   if (TripCount->getType()->isPointerTy())
2750     TripCount =
2751         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2752                                     L->getLoopPreheader()->getTerminator());
2753 
2754   return TripCount;
2755 }
2756 
2757 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
2758   if (VectorTripCount)
2759     return VectorTripCount;
2760 
2761   Value *TC = getOrCreateTripCount(L);
2762   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2763 
2764   Type *Ty = TC->getType();
2765   // This is where we can make the step a runtime constant.
2766   assert(!VF.isScalable() && "scalable vectorization is not supported yet");
2767   Constant *Step = ConstantInt::get(Ty, VF.getKnownMinValue() * UF);
2768 
2769   // If the tail is to be folded by masking, round the number of iterations N
2770   // up to a multiple of Step instead of rounding down. This is done by first
2771   // adding Step-1 and then rounding down. Note that it's ok if this addition
2772   // overflows: the vector induction variable will eventually wrap to zero given
2773   // that it starts at zero and its Step is a power of two; the loop will then
2774   // exit, with the last early-exit vector comparison also producing all-true.
2775   if (Cost->foldTailByMasking()) {
2776     assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
2777            "VF*UF must be a power of 2 when folding tail by masking");
2778     TC = Builder.CreateAdd(
2779         TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up");
2780   }
2781 
2782   // Now we need to generate the expression for the part of the loop that the
2783   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2784   // iterations are not required for correctness, or N - Step, otherwise. Step
2785   // is equal to the vectorization factor (number of SIMD elements) times the
2786   // unroll factor (number of SIMD instructions).
2787   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
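  // For example, with VF * UF = 8 and an original trip count N = 10: without
  // tail folding, n.mod.vf = 2 and n.vec = 8, so one vector iteration runs
  // and two scalar iterations remain; with tail folding, N was rounded up to
  // 17 above, so n.mod.vf = 1 and n.vec = 16, i.e. two masked vector
  // iterations cover all ten original iterations.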
2788 
2789   // If there is a non-reversed interleaved group that may speculatively access
2790   // memory out-of-bounds, we need to ensure that there will be at least one
2791   // iteration of the scalar epilogue loop. Thus, if the step evenly divides
2792   // the trip count, we set the remainder to be equal to the step. If the step
2793   // does not evenly divide the trip count, no adjustment is necessary since
2794   // there will already be scalar iterations. Note that the minimum iterations
2795   // check ensures that N >= Step.
2796   if (VF.isVector() && Cost->requiresScalarEpilogue()) {
2797     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2798     R = Builder.CreateSelect(IsZero, Step, R);
2799   }
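  // E.g., with Step = 8 and N = 16 the remainder would be 0, so it is bumped
  // to 8 and the vector trip count below becomes 8, guaranteeing that the
  // scalar epilogue executes the final 8 iterations.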
2800 
2801   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2802 
2803   return VectorTripCount;
2804 }
2805 
2806 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2807                                                    const DataLayout &DL) {
2808   // Verify that V is a vector type with same number of elements as DstVTy.
2809   auto *DstFVTy = cast<FixedVectorType>(DstVTy);
2810   unsigned VF = DstFVTy->getNumElements();
2811   auto *SrcVecTy = cast<FixedVectorType>(V->getType());
  assert(VF == SrcVecTy->getNumElements() && "Vector dimensions do not match");
2813   Type *SrcElemTy = SrcVecTy->getElementType();
2814   Type *DstElemTy = DstFVTy->getElementType();
2815   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2816          "Vector elements must have same size");
2817 
2818   // Do a direct cast if element types are castable.
2819   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2820     return Builder.CreateBitOrPointerCast(V, DstFVTy);
2821   }
  // V cannot be directly cast to the desired vector type.
2823   // May happen when V is a floating point vector but DstVTy is a vector of
2824   // pointers or vice-versa. Handle this using a two-step bitcast using an
2825   // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
2826   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2827          "Only one type should be a pointer type");
2828   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2829          "Only one type should be a floating point type");
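  // For example, casting <4 x double> to <4 x i8*> on a target with 64-bit
  // pointers is performed as <4 x double> -> <4 x i64> -> <4 x i8*>.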
2830   Type *IntTy =
2831       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2832   auto *VecIntTy = FixedVectorType::get(IntTy, VF);
2833   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2834   return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
2835 }
2836 
2837 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
2838                                                          BasicBlock *Bypass) {
2839   Value *Count = getOrCreateTripCount(L);
2840   // Reuse existing vector loop preheader for TC checks.
  // Note that a new preheader block is generated for the vector loop.
2842   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2843   IRBuilder<> Builder(TCCheckBlock->getTerminator());
2844 
2845   // Generate code to check if the loop's trip count is less than VF * UF, or
2846   // equal to it in case a scalar epilogue is required; this implies that the
2847   // vector trip count is zero. This check also covers the case where adding one
  // to the backedge-taken count overflowed, leading to an incorrect trip
  // count of zero. In this case we will also jump to the scalar loop.
2850   auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
2851                                           : ICmpInst::ICMP_ULT;
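  // E.g., with VF = 4 and UF = 2 (and no tail folding), the bypass to the
  // scalar loop is taken when the trip count is less than 8, or less than or
  // equal to 8 when a scalar epilogue is required.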
2852 
2853   // If tail is to be folded, vector loop takes care of all iterations.
2854   Value *CheckMinIters = Builder.getFalse();
2855   if (!Cost->foldTailByMasking()) {
2856     assert(!VF.isScalable() && "scalable vectors not yet supported.");
2857     CheckMinIters = Builder.CreateICmp(
2858         P, Count,
2859         ConstantInt::get(Count->getType(), VF.getKnownMinValue() * UF),
2860         "min.iters.check");
2861   }
2862   // Create new preheader for vector loop.
2863   LoopVectorPreHeader =
2864       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2865                  "vector.ph");
2866 
2867   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2868                                DT->getNode(Bypass)->getIDom()) &&
2869          "TC check is expected to dominate Bypass");
2870 
2871   // Update dominator for Bypass & LoopExit.
2872   DT->changeImmediateDominator(Bypass, TCCheckBlock);
2873   DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
2874 
2875   ReplaceInstWithInst(
2876       TCCheckBlock->getTerminator(),
2877       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
2878   LoopBypassBlocks.push_back(TCCheckBlock);
2879 }
2880 
2881 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
2882   // Reuse existing vector loop preheader for SCEV checks.
  // Note that a new preheader block is generated for the vector loop.
2884   BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader;
2885 
  // Generate the code to check the SCEV assumptions that we made.
2887   // We want the new basic block to start at the first instruction in a
2888   // sequence of instructions that form a check.
2889   SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
2890                    "scev.check");
2891   Value *SCEVCheck = Exp.expandCodeForPredicate(
2892       &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator());
2893 
2894   if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
2895     if (C->isZero())
2896       return;
2897 
2898   assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
2899            (OptForSizeBasedOnProfile &&
2900             Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
2901          "Cannot SCEV check stride or overflow when optimizing for size");
2902 
2903   SCEVCheckBlock->setName("vector.scevcheck");
2904   // Create new preheader for vector loop.
2905   LoopVectorPreHeader =
2906       SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI,
2907                  nullptr, "vector.ph");
2908 
  // Update dominator only if this is the first RT check.
2910   if (LoopBypassBlocks.empty()) {
2911     DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
2912     DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
2913   }
2914 
2915   ReplaceInstWithInst(
2916       SCEVCheckBlock->getTerminator(),
2917       BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck));
2918   LoopBypassBlocks.push_back(SCEVCheckBlock);
2919   AddedSafetyChecks = true;
2920 }
2921 
2922 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
2923   // VPlan-native path does not do any analysis for runtime checks currently.
2924   if (EnableVPlanNativePath)
2925     return;
2926 
2927   // Reuse existing vector loop preheader for runtime memory checks.
  // Note that a new preheader block is generated for the vector loop.
2929   BasicBlock *const MemCheckBlock = L->getLoopPreheader();
2930 
  // Generate the code that checks at runtime if arrays overlap. We put the
2932   // checks into a separate block to make the more common case of few elements
2933   // faster.
2934   auto *LAI = Legal->getLAI();
2935   const auto &RtPtrChecking = *LAI->getRuntimePointerChecking();
2936   if (!RtPtrChecking.Need)
2937     return;
2938 
2939   if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
2940     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2941            "Cannot emit memory checks when optimizing for size, unless forced "
2942            "to vectorize.");
2943     ORE->emit([&]() {
2944       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2945                                         L->getStartLoc(), L->getHeader())
2946              << "Code-size may be reduced by not forcing "
2947                 "vectorization, or by source-code modifications "
2948                 "eliminating the need for runtime checks "
2949                 "(e.g., adding 'restrict').";
2950     });
2951   }
2952 
2953   MemCheckBlock->setName("vector.memcheck");
2954   // Create new preheader for vector loop.
2955   LoopVectorPreHeader =
2956       SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr,
2957                  "vector.ph");
2958 
  // Update dominator only if this is the first RT check.
  if (LoopBypassBlocks.empty()) {
    DT->changeImmediateDominator(Bypass, MemCheckBlock);
    DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock);
  }

  auto *CondBranch = cast<BranchInst>(
      Builder.CreateCondBr(Builder.getTrue(), Bypass, LoopVectorPreHeader));
  ReplaceInstWithInst(MemCheckBlock->getTerminator(), CondBranch);
  LoopBypassBlocks.push_back(MemCheckBlock);
  AddedSafetyChecks = true;
2970 
2971   Instruction *FirstCheckInst;
2972   Instruction *MemRuntimeCheck;
2973   std::tie(FirstCheckInst, MemRuntimeCheck) =
2974       addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop,
2975                        RtPtrChecking.getChecks(), RtPtrChecking.getSE());
2976   assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking "
2977                             "claimed checks are required");
2978   CondBranch->setCondition(MemRuntimeCheck);
2979 
2980   // We currently don't use LoopVersioning for the actual loop cloning but we
2981   // still use it to add the noalias metadata.
2982   LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
2983                                           PSE.getSE());
2984   LVer->prepareNoAliasMetadata();
2985 }
2986 
2987 Value *InnerLoopVectorizer::emitTransformedIndex(
2988     IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
2989     const InductionDescriptor &ID) const {
2990 
2991   SCEVExpander Exp(*SE, DL, "induction");
2992   auto Step = ID.getStep();
2993   auto StartValue = ID.getStartValue();
2994   assert(Index->getType() == Step->getType() &&
2995          "Index type does not match StepValue type");
2996 
  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // more optimal code. Unfortunately, attempting to do so on invalid IR may
  // lead to various SCEV crashes. So all we can do is use the builder and
  // rely on InstCombine for future simplifications. Here we handle some
  // trivial cases only.
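  // In the common integer case, the transformed index is simply
  // StartValue + Index * Step; e.g., an induction starting at 5 with step 3
  // maps Index = i to 5 + 3 * i. The lambdas below fold the trivial
  // zero-addend and one-multiplier cases.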
3003   auto CreateAdd = [&B](Value *X, Value *Y) {
3004     assert(X->getType() == Y->getType() && "Types don't match!");
3005     if (auto *CX = dyn_cast<ConstantInt>(X))
3006       if (CX->isZero())
3007         return Y;
3008     if (auto *CY = dyn_cast<ConstantInt>(Y))
3009       if (CY->isZero())
3010         return X;
3011     return B.CreateAdd(X, Y);
3012   };
3013 
3014   auto CreateMul = [&B](Value *X, Value *Y) {
3015     assert(X->getType() == Y->getType() && "Types don't match!");
3016     if (auto *CX = dyn_cast<ConstantInt>(X))
3017       if (CX->isOne())
3018         return Y;
3019     if (auto *CY = dyn_cast<ConstantInt>(Y))
3020       if (CY->isOne())
3021         return X;
3022     return B.CreateMul(X, Y);
3023   };
3024 
3025   // Get a suitable insert point for SCEV expansion. For blocks in the vector
3026   // loop, choose the end of the vector loop header (=LoopVectorBody), because
3027   // the DomTree is not kept up-to-date for additional blocks generated in the
3028   // vector loop. By using the header as insertion point, we guarantee that the
3029   // expanded instructions dominate all their uses.
3030   auto GetInsertPoint = [this, &B]() {
3031     BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
3032     if (InsertBB != LoopVectorBody &&
3033         LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB))
3034       return LoopVectorBody->getTerminator();
3035     return &*B.GetInsertPoint();
3036   };
3037   switch (ID.getKind()) {
3038   case InductionDescriptor::IK_IntInduction: {
3039     assert(Index->getType() == StartValue->getType() &&
3040            "Index type does not match StartValue type");
3041     if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
3042       return B.CreateSub(StartValue, Index);
3043     auto *Offset = CreateMul(
3044         Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
3045     return CreateAdd(StartValue, Offset);
3046   }
3047   case InductionDescriptor::IK_PtrInduction: {
3048     assert(isa<SCEVConstant>(Step) &&
3049            "Expected constant step for pointer induction");
3050     return B.CreateGEP(
3051         StartValue->getType()->getPointerElementType(), StartValue,
3052         CreateMul(Index,
3053                   Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())));
3054   }
3055   case InductionDescriptor::IK_FpInduction: {
3056     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
3057     auto InductionBinOp = ID.getInductionBinOp();
3058     assert(InductionBinOp &&
3059            (InductionBinOp->getOpcode() == Instruction::FAdd ||
3060             InductionBinOp->getOpcode() == Instruction::FSub) &&
3061            "Original bin op should be defined for FP induction");
3062 
3063     Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
3064 
3065     // Floating point operations had to be 'fast' to enable the induction.
3066     FastMathFlags Flags;
3067     Flags.setFast();
3068 
3069     Value *MulExp = B.CreateFMul(StepValue, Index);
3070     if (isa<Instruction>(MulExp))
      // We have to check because MulExp may be folded to a constant.
3072       cast<Instruction>(MulExp)->setFastMathFlags(Flags);
3073 
3074     Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
3075                                "induction");
3076     if (isa<Instruction>(BOp))
3077       cast<Instruction>(BOp)->setFastMathFlags(Flags);
3078 
3079     return BOp;
3080   }
3081   case InductionDescriptor::IK_NoInduction:
3082     return nullptr;
3083   }
3084   llvm_unreachable("invalid enum");
3085 }
3086 
3087 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3088   LoopScalarBody = OrigLoop->getHeader();
3089   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3090   LoopExitBlock = OrigLoop->getExitBlock();
3091   assert(LoopExitBlock && "Must have an exit block");
3092   assert(LoopVectorPreHeader && "Invalid loop structure");
3093 
3094   LoopMiddleBlock =
3095       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3096                  LI, nullptr, Twine(Prefix) + "middle.block");
3097   LoopScalarPreHeader =
3098       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3099                  nullptr, Twine(Prefix) + "scalar.ph");
  // We intentionally don't let SplitBlock update LoopInfo since LoopVectorBody
  // should belong to a different loop than LoopVectorPreHeader. LoopVectorBody
  // is explicitly added to the correct place a few lines later.
3103   LoopVectorBody =
3104       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3105                  nullptr, nullptr, Twine(Prefix) + "vector.body");
3106 
3107   // Update dominator for loop exit.
3108   DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3109 
3110   // Create and register the new vector loop.
3111   Loop *Lp = LI->AllocateLoop();
3112   Loop *ParentLoop = OrigLoop->getParentLoop();
3113 
3114   // Insert the new loop into the loop nest and register the new basic blocks
3115   // before calling any utilities such as SCEV that require valid LoopInfo.
3116   if (ParentLoop) {
3117     ParentLoop->addChildLoop(Lp);
3118   } else {
3119     LI->addTopLevelLoop(Lp);
3120   }
3121   Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
3122   return Lp;
3123 }
3124 
3125 void InnerLoopVectorizer::createInductionResumeValues(Loop *L,
3126                                                       Value *VectorTripCount) {
3127   assert(VectorTripCount && L && "Expected valid arguments");
3128   // We are going to resume the execution of the scalar loop.
3129   // Go over all of the induction variables that we found and fix the
3130   // PHIs that are left in the scalar version of the loop.
3131   // The starting values of PHI nodes depend on the counter of the last
3132   // iteration in the vectorized loop.
3133   // If we come from a bypass edge then we need to start from the original
3134   // start value.
3135   for (auto &InductionEntry : Legal->getInductionVars()) {
3136     PHINode *OrigPhi = InductionEntry.first;
3137     InductionDescriptor II = InductionEntry.second;
3138 
    // Create phi nodes to merge from the backedge-taken check block.
3140     PHINode *BCResumeVal =
3141         PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3142                         LoopScalarPreHeader->getTerminator());
3143     // Copy original phi DL over to the new one.
3144     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3145     Value *&EndValue = IVEndValues[OrigPhi];
3146     if (OrigPhi == OldInduction) {
3147       // We know what the end value is.
3148       EndValue = VectorTripCount;
3149     } else {
3150       IRBuilder<> B(L->getLoopPreheader()->getTerminator());
3151       Type *StepType = II.getStep()->getType();
3152       Instruction::CastOps CastOp =
3153           CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
3154       Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd");
3155       const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
3156       EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3157       EndValue->setName("ind.end");
3158     }
3159 
3160     // The new PHI merges the original incoming value, in case of a bypass,
3161     // or the value at the end of the vectorized loop.
3162     BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3163 
3164     // Fix the scalar body counter (PHI node).
3165     // The old induction's phi node in the scalar body needs the truncated
3166     // value.
3167     for (BasicBlock *BB : LoopBypassBlocks)
3168       BCResumeVal->addIncoming(II.getStartValue(), BB);
3169     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3170   }
3171 }
3172 
3173 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
3174                                                       MDNode *OrigLoopID) {
3175   assert(L && "Expected valid loop.");
3176 
3177   // The trip counts should be cached by now.
3178   Value *Count = getOrCreateTripCount(L);
3179   Value *VectorTripCount = getOrCreateVectorTripCount(L);
3180 
3181   // We need the OrigLoop (scalar loop part) latch terminator to help
3182   // produce correct debug info for the middle block BB instructions.
3183   // The legality check stage guarantees that the loop will have a single
3184   // latch.
3185   assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) &&
3186          "Scalar loop latch terminator isn't a branch");
3187   BranchInst *ScalarLatchBr =
3188       cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());
3189 
3190   // Add a check in the middle block to see if we have completed
3191   // all of the iterations in the first vector loop.
3192   // If (N - N%VF) == N, then we *don't* need to run the remainder.
3193   // If tail is to be folded, we know we don't need to run the remainder.
3194   Value *CmpN = Builder.getTrue();
3195   if (!Cost->foldTailByMasking()) {
3196     CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
3197                            VectorTripCount, "cmp.n",
3198                            LoopMiddleBlock->getTerminator());
3199 
3200     // Here we use the same DebugLoc as the scalar loop latch branch instead
3201     // of the corresponding compare because they may have ended up with
3202     // different line numbers and we want to avoid awkward line stepping while
    // debugging. E.g., if the compare has a line number inside the loop.
3204     cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
3205   }
3206 
3207   BranchInst *BrInst =
3208       BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN);
3209   BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
3210   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3211 
3212   // Get ready to start creating new instructions into the vectorized body.
3213   assert(LoopVectorPreHeader == L->getLoopPreheader() &&
3214          "Inconsistent vector loop preheader");
3215   Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3216 
3217   Optional<MDNode *> VectorizedLoopID =
3218       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3219                                       LLVMLoopVectorizeFollowupVectorized});
3220   if (VectorizedLoopID.hasValue()) {
3221     L->setLoopID(VectorizedLoopID.getValue());
3222 
3223     // Do not setAlreadyVectorized if loop attributes have been defined
3224     // explicitly.
3225     return LoopVectorPreHeader;
3226   }
3227 
3228   // Keep all loop hints from the original loop on the vector loop (we'll
3229   // replace the vectorizer-specific hints below).
3230   if (MDNode *LID = OrigLoop->getLoopID())
3231     L->setLoopID(LID);
3232 
3233   LoopVectorizeHints Hints(L, true, *ORE);
3234   Hints.setAlreadyVectorized();
3235 
3236 #ifdef EXPENSIVE_CHECKS
3237   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3238   LI->verify(*DT);
3239 #endif
3240 
3241   return LoopVectorPreHeader;
3242 }
3243 
3244 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3245   /*
3246    In this function we generate a new loop. The new loop will contain
3247    the vectorized instructions while the old loop will continue to run the
3248    scalar remainder.
3249 
3250        [ ] <-- loop iteration number check.
3251     /   |
3252    /    v
3253   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
3254   |  /  |
3255   | /   v
3256   ||   [ ]     <-- vector pre header.
3257   |/    |
3258   |     v
3259   |    [  ] \
3260   |    [  ]_|   <-- vector loop.
3261   |     |
3262   |     v
3263   |   -[ ]   <--- middle-block.
3264   |  /  |
3265   | /   v
3266   -|- >[ ]     <--- new preheader.
3267    |    |
3268    |    v
3269    |   [ ] \
3270    |   [ ]_|   <-- old scalar loop to handle remainder.
3271     \   |
3272      \  v
3273       >[ ]     <-- exit block.
3274    ...
3275    */
3276 
3277   // Get the metadata of the original loop before it gets modified.
3278   MDNode *OrigLoopID = OrigLoop->getLoopID();
3279 
3280   // Create an empty vector loop, and prepare basic blocks for the runtime
3281   // checks.
3282   Loop *Lp = createVectorLoopSkeleton("");
3283 
3284   // Now, compare the new count to zero. If it is zero skip the vector loop and
3285   // jump to the scalar loop. This check also covers the case where the
3286   // backedge-taken count is uint##_max: adding one to it will overflow leading
3287   // to an incorrect trip count of zero. In this (rare) case we will also jump
3288   // to the scalar loop.
3289   emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
3290 
3291   // Generate the code to check any assumptions that we've made for SCEV
3292   // expressions.
3293   emitSCEVChecks(Lp, LoopScalarPreHeader);
3294 
  // Generate the code that checks at runtime if arrays overlap. We put the
3296   // checks into a separate block to make the more common case of few elements
3297   // faster.
3298   emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
3299 
  // Some loops have a single integer induction variable, while others don't.
  // One example is C++ iterators, which often have multiple pointer induction
  // variables. The code below also supports the case where there is no single
  // induction variable.
3304   //
  // We try as hard as possible to obtain an induction variable from the
  // original loop. However, if we don't find one that:
3307   //   - is an integer
3308   //   - counts from zero, stepping by one
3309   //   - is the size of the widest induction variable type
3310   // then we create a new one.
3311   OldInduction = Legal->getPrimaryInduction();
3312   Type *IdxTy = Legal->getWidestInductionType();
3313   Value *StartIdx = ConstantInt::get(IdxTy, 0);
3314   // The loop step is equal to the vectorization factor (num of SIMD elements)
3315   // times the unroll factor (num of SIMD instructions).
3316   assert(!VF.isScalable() && "scalable vectors not yet supported.");
3317   Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
3318   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
3319   Induction =
3320       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
3321                               getDebugLocFromInstOrOperands(OldInduction));
3322 
3323   // Emit phis for the new starting index of the scalar loop.
3324   createInductionResumeValues(Lp, CountRoundDown);
3325 
3326   return completeLoopSkeleton(Lp, OrigLoopID);
3327 }
3328 
3329 // Fix up external users of the induction variable. At this point, we are
3330 // in LCSSA form, with all external PHIs that use the IV having one input value,
3331 // coming from the remainder loop. We need those PHIs to also have a correct
3332 // value for the IV when arriving directly from the middle block.
3333 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3334                                        const InductionDescriptor &II,
3335                                        Value *CountRoundDown, Value *EndValue,
3336                                        BasicBlock *MiddleBlock) {
3337   // There are two kinds of external IV usages - those that use the value
3338   // computed in the last iteration (the PHI) and those that use the penultimate
3339   // value (the value that feeds into the phi from the loop latch).
3340   // We allow both, but they, obviously, have different values.
3341 
3342   assert(OrigLoop->getExitBlock() && "Expected a single exit block");
3343 
3344   DenseMap<Value *, Value *> MissingVals;
3345 
3346   // An external user of the last iteration's value should see the value that
3347   // the remainder loop uses to initialize its own IV.
3348   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3349   for (User *U : PostInc->users()) {
3350     Instruction *UI = cast<Instruction>(U);
3351     if (!OrigLoop->contains(UI)) {
3352       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3353       MissingVals[UI] = EndValue;
3354     }
3355   }
3356 
  // An external user of the penultimate value needs to see EndValue - Step.
3358   // The simplest way to get this is to recompute it from the constituent SCEVs,
3359   // that is Start + (Step * (CRD - 1)).
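  // E.g., for an IV starting at 0 with step 1 and CountRoundDown = 8, such a
  // user sees 0 + 1 * (8 - 1) = 7, the value the IV held in the last iteration
  // executed by the vector loop.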
3360   for (User *U : OrigPhi->users()) {
3361     auto *UI = cast<Instruction>(U);
3362     if (!OrigLoop->contains(UI)) {
3363       const DataLayout &DL =
3364           OrigLoop->getHeader()->getModule()->getDataLayout();
3365       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3366 
3367       IRBuilder<> B(MiddleBlock->getTerminator());
3368       Value *CountMinusOne = B.CreateSub(
3369           CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3370       Value *CMO =
3371           !II.getStep()->getType()->isIntegerTy()
3372               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3373                              II.getStep()->getType())
3374               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3375       CMO->setName("cast.cmo");
3376       Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3377       Escape->setName("ind.escape");
3378       MissingVals[UI] = Escape;
3379     }
3380   }
3381 
3382   for (auto &I : MissingVals) {
3383     PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each other,
3385     // that is %IV2 = phi [...], [ %IV1, %latch ]
3386     // In this case, if IV1 has an external use, we need to avoid adding both
3387     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3388     // don't already have an incoming value for the middle block.
3389     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3390       PHI->addIncoming(I.second, MiddleBlock);
3391   }
3392 }
3393 
3394 namespace {
3395 
3396 struct CSEDenseMapInfo {
3397   static bool canHandle(const Instruction *I) {
3398     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3399            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3400   }
3401 
3402   static inline Instruction *getEmptyKey() {
3403     return DenseMapInfo<Instruction *>::getEmptyKey();
3404   }
3405 
3406   static inline Instruction *getTombstoneKey() {
3407     return DenseMapInfo<Instruction *>::getTombstoneKey();
3408   }
3409 
3410   static unsigned getHashValue(const Instruction *I) {
3411     assert(canHandle(I) && "Unknown instruction!");
3412     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3413                                                            I->value_op_end()));
3414   }
3415 
3416   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3417     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3418         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3419       return LHS == RHS;
3420     return LHS->isIdenticalTo(RHS);
3421   }
3422 };
3423 
3424 } // end anonymous namespace
3425 
/// Perform CSE of induction variable instructions.
3427 static void cse(BasicBlock *BB) {
3428   // Perform simple cse.
3429   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3430   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3431     Instruction *In = &*I++;
3432 
3433     if (!CSEDenseMapInfo::canHandle(In))
3434       continue;
3435 
3436     // Check if we can replace this instruction with any of the
3437     // visited instructions.
3438     if (Instruction *V = CSEMap.lookup(In)) {
3439       In->replaceAllUsesWith(V);
3440       In->eraseFromParent();
3441       continue;
3442     }
3443 
3444     CSEMap[In] = In;
3445   }
3446 }
3447 
3448 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
3449                                                        ElementCount VF,
3450                                                        bool &NeedToScalarize) {
3451   assert(!VF.isScalable() && "scalable vectors not yet supported.");
3452   Function *F = CI->getCalledFunction();
3453   Type *ScalarRetTy = CI->getType();
3454   SmallVector<Type *, 4> Tys, ScalarTys;
3455   for (auto &ArgOp : CI->arg_operands())
3456     ScalarTys.push_back(ArgOp->getType());
3457 
3458   // Estimate cost of scalarized vector call. The source operands are assumed
3459   // to be vectors, so we need to extract individual elements from there,
3460   // execute VF scalar calls, and then gather the result into the vector return
3461   // value.
3462   unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys,
3463                                                  TTI::TCK_RecipThroughput);
3464   if (VF.isScalar())
3465     return ScalarCallCost;
3466 
3467   // Compute corresponding vector type for return value and arguments.
3468   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3469   for (Type *ScalarTy : ScalarTys)
3470     Tys.push_back(ToVectorTy(ScalarTy, VF));
3471 
3472   // Compute costs of unpacking argument values for the scalar calls and
3473   // packing the return values to a vector.
3474   unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
3475 
3476   unsigned Cost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
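  // E.g., with VF = 4, a scalar call cost of 10 and a scalarization overhead
  // of 12, the scalarized cost is 4 * 10 + 12 = 52; this is returned unless a
  // cheaper vector variant of the call is found below.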
3477 
3478   // If we can't emit a vector call for this function, then the currently found
3479   // cost is the cost we need to return.
3480   NeedToScalarize = true;
3481   VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
3482   Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3483 
3484   if (!TLI || CI->isNoBuiltin() || !VecFunc)
3485     return Cost;
3486 
3487   // If the corresponding vector cost is cheaper, return its cost.
3488   unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys,
3489                                                  TTI::TCK_RecipThroughput);
3490   if (VectorCallCost < Cost) {
3491     NeedToScalarize = false;
3492     return VectorCallCost;
3493   }
3494   return Cost;
3495 }
3496 
3497 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3498                                                             ElementCount VF) {
3499   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3500   assert(ID && "Expected intrinsic call!");
3501 
3502   IntrinsicCostAttributes CostAttrs(ID, *CI, VF);
3503   return TTI.getIntrinsicInstrCost(CostAttrs,
3504                                    TargetTransformInfo::TCK_RecipThroughput);
3505 }
3506 
3507 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3508   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3509   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3510   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3511 }
3512 
3513 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3514   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3515   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3516   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3517 }
3518 
3519 void InnerLoopVectorizer::truncateToMinimalBitwidths() {
3520   // For every instruction `I` in MinBWs, truncate the operands, create a
  // truncated version of `I` and re-extend its result. InstCombine runs
3522   // later and will remove any ext/trunc pairs.
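  // E.g., if an i32 add was proven to need only 8 bits, its vector operands
  // are truncated to <VF x i8>, the add is recreated on the narrow type, and
  // the result is zero-extended back to <VF x i32> for existing users.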
3523   SmallPtrSet<Value *, 4> Erased;
3524   for (const auto &KV : Cost->getMinimalBitwidths()) {
3525     // If the value wasn't vectorized, we must maintain the original scalar
3526     // type. The absence of the value from VectorLoopValueMap indicates that it
3527     // wasn't vectorized.
3528     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3529       continue;
3530     for (unsigned Part = 0; Part < UF; ++Part) {
3531       Value *I = getOrCreateVectorValue(KV.first, Part);
3532       if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3533         continue;
3534       Type *OriginalTy = I->getType();
3535       Type *ScalarTruncatedTy =
3536           IntegerType::get(OriginalTy->getContext(), KV.second);
3537       auto *TruncatedTy = FixedVectorType::get(
3538           ScalarTruncatedTy,
3539           cast<FixedVectorType>(OriginalTy)->getNumElements());
3540       if (TruncatedTy == OriginalTy)
3541         continue;
3542 
3543       IRBuilder<> B(cast<Instruction>(I));
3544       auto ShrinkOperand = [&](Value *V) -> Value * {
3545         if (auto *ZI = dyn_cast<ZExtInst>(V))
3546           if (ZI->getSrcTy() == TruncatedTy)
3547             return ZI->getOperand(0);
3548         return B.CreateZExtOrTrunc(V, TruncatedTy);
3549       };
3550 
3551       // The actual instruction modification depends on the instruction type,
3552       // unfortunately.
3553       Value *NewI = nullptr;
3554       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3555         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3556                              ShrinkOperand(BO->getOperand(1)));
3557 
3558         // Any wrapping introduced by shrinking this operation shouldn't be
3559         // considered undefined behavior. So, we can't unconditionally copy
3560         // arithmetic wrapping flags to NewI.
3561         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3562       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3563         NewI =
3564             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3565                          ShrinkOperand(CI->getOperand(1)));
3566       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3567         NewI = B.CreateSelect(SI->getCondition(),
3568                               ShrinkOperand(SI->getTrueValue()),
3569                               ShrinkOperand(SI->getFalseValue()));
3570       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3571         switch (CI->getOpcode()) {
3572         default:
3573           llvm_unreachable("Unhandled cast!");
3574         case Instruction::Trunc:
3575           NewI = ShrinkOperand(CI->getOperand(0));
3576           break;
3577         case Instruction::SExt:
3578           NewI = B.CreateSExtOrTrunc(
3579               CI->getOperand(0),
3580               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3581           break;
3582         case Instruction::ZExt:
3583           NewI = B.CreateZExtOrTrunc(
3584               CI->getOperand(0),
3585               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3586           break;
3587         }
3588       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3589         auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType())
3590                              ->getNumElements();
3591         auto *O0 = B.CreateZExtOrTrunc(
3592             SI->getOperand(0),
3593             FixedVectorType::get(ScalarTruncatedTy, Elements0));
3594         auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType())
3595                              ->getNumElements();
3596         auto *O1 = B.CreateZExtOrTrunc(
3597             SI->getOperand(1),
3598             FixedVectorType::get(ScalarTruncatedTy, Elements1));
3599 
3600         NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
3601       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3602         // Don't do anything with the operands, just extend the result.
3603         continue;
3604       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3605         auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType())
3606                             ->getNumElements();
3607         auto *O0 = B.CreateZExtOrTrunc(
3608             IE->getOperand(0),
3609             FixedVectorType::get(ScalarTruncatedTy, Elements));
3610         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3611         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3612       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3613         auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType())
3614                             ->getNumElements();
3615         auto *O0 = B.CreateZExtOrTrunc(
3616             EE->getOperand(0),
3617             FixedVectorType::get(ScalarTruncatedTy, Elements));
3618         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3619       } else {
3620         // If we don't know what to do, be conservative and don't do anything.
3621         continue;
3622       }
3623 
3624       // Lastly, extend the result.
3625       NewI->takeName(cast<Instruction>(I));
3626       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3627       I->replaceAllUsesWith(Res);
3628       cast<Instruction>(I)->eraseFromParent();
3629       Erased.insert(I);
3630       VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3631     }
3632   }
3633 
  // We'll have created a bunch of ZExts that are now dead. Clean them up.
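  // For example, a zext created above to widen a shrunk result may end up
  // with no users once all of its users were themselves shrunk and now read
  // the narrow value directly; in that case it is erased and the narrow value
  // is mapped instead. (Illustrative description of the cleanup below.)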
3635   for (const auto &KV : Cost->getMinimalBitwidths()) {
3636     // If the value wasn't vectorized, we must maintain the original scalar
3637     // type. The absence of the value from VectorLoopValueMap indicates that it
3638     // wasn't vectorized.
3639     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3640       continue;
3641     for (unsigned Part = 0; Part < UF; ++Part) {
3642       Value *I = getOrCreateVectorValue(KV.first, Part);
3643       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3644       if (Inst && Inst->use_empty()) {
3645         Value *NewI = Inst->getOperand(0);
3646         Inst->eraseFromParent();
3647         VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3648       }
3649     }
3650   }
3651 }
3652 
3653 void InnerLoopVectorizer::fixVectorizedLoop() {
3654   // Insert truncates and extends for any truncated instructions as hints to
3655   // InstCombine.
3656   if (VF.isVector())
3657     truncateToMinimalBitwidths();
3658 
3659   // Fix widened non-induction PHIs by setting up the PHI operands.
3660   if (OrigPHIsToFix.size()) {
3661     assert(EnableVPlanNativePath &&
3662            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3663     fixNonInductionPHIs();
3664   }
3665 
3666   // At this point every instruction in the original loop is widened to a
3667   // vector form. Now we need to fix the recurrences in the loop. These PHI
3668   // nodes are currently empty because we did not want to introduce cycles.
3669   // This is the second stage of vectorizing recurrences.
3670   fixCrossIterationPHIs();
3671 
3672   // Forget the original basic block.
3673   PSE.getSE()->forgetLoop(OrigLoop);
3674 
3675   // Fix-up external users of the induction variables.
3676   for (auto &Entry : Legal->getInductionVars())
3677     fixupIVUsers(Entry.first, Entry.second,
3678                  getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3679                  IVEndValues[Entry.first], LoopMiddleBlock);
3680 
3681   fixLCSSAPHIs();
3682   for (Instruction *PI : PredicatedInstructions)
3683     sinkScalarOperands(&*PI);
3684 
3685   // Remove redundant induction instructions.
3686   cse(LoopVectorBody);
3687 
3688   // Set/update profile weights for the vector and remainder loops as original
3689   // loop iterations are now distributed among them. Note that original loop
3690   // represented by LoopScalarBody becomes remainder loop after vectorization.
3691   //
  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
  // end up with a slightly less accurate result, but that should be OK since
  // the profile is not inherently precise anyway. Note also that a possible
  // bypass of vector code caused by legality checks is ignored, optimistically
  // assigning all the weight to the vector loop.
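  // As a rough, illustrative example (assuming exact division): if the
  // original loop ran 1000 iterations per entry and VF * UF == 8, the vector
  // loop is assigned roughly 1000 / 8 == 125 iterations and the remainder
  // loop the leftover iterations.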
3697   assert(!VF.isScalable() &&
3698          "cannot use scalable ElementCount to determine unroll factor");
3699   setProfileInfoAfterUnrolling(
3700       LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
3701       LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
3702 }
3703 
3704 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3705   // In order to support recurrences we need to be able to vectorize Phi nodes.
3706   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3707   // stage #2: We now need to fix the recurrences by adding incoming edges to
3708   // the currently empty PHI nodes. At this point every instruction in the
3709   // original loop is widened to a vector form so we can use them to construct
3710   // the incoming edges.
3711   for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3712     // Handle first-order recurrences and reductions that need to be fixed.
3713     if (Legal->isFirstOrderRecurrence(&Phi))
3714       fixFirstOrderRecurrence(&Phi);
3715     else if (Legal->isReductionVariable(&Phi))
3716       fixReduction(&Phi);
3717   }
3718 }
3719 
3720 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3721   // This is the second phase of vectorizing first-order recurrences. An
3722   // overview of the transformation is described below. Suppose we have the
3723   // following loop.
3724   //
3725   //   for (int i = 0; i < n; ++i)
3726   //     b[i] = a[i] - a[i - 1];
3727   //
3728   // There is a first-order recurrence on "a". For this loop, the shorthand
3729   // scalar IR looks like:
3730   //
3731   //   scalar.ph:
3732   //     s_init = a[-1]
3733   //     br scalar.body
3734   //
3735   //   scalar.body:
3736   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3737   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3738   //     s2 = a[i]
3739   //     b[i] = s2 - s1
3740   //     br cond, scalar.body, ...
3741   //
  // In this example, s1 is a recurrence because its value depends on the
3743   // previous iteration. In the first phase of vectorization, we created a
3744   // temporary value for s1. We now complete the vectorization and produce the
3745   // shorthand vector IR shown below (for VF = 4, UF = 1).
3746   //
3747   //   vector.ph:
3748   //     v_init = vector(..., ..., ..., a[-1])
3749   //     br vector.body
3750   //
3751   //   vector.body
3752   //     i = phi [0, vector.ph], [i+4, vector.body]
3753   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3754   //     v2 = a[i, i+1, i+2, i+3];
3755   //     v3 = vector(v1(3), v2(0, 1, 2))
3756   //     b[i, i+1, i+2, i+3] = v2 - v3
3757   //     br cond, vector.body, middle.block
3758   //
3759   //   middle.block:
3760   //     x = v2(3)
3761   //     br scalar.ph
3762   //
3763   //   scalar.ph:
3764   //     s_init = phi [x, middle.block], [a[-1], otherwise]
3765   //     br scalar.body
3766   //
  // After the vector loop finishes executing, we extract the next value of
  // the recurrence (x) to use as the initial value in the scalar loop.
3769 
3770   // Get the original loop preheader and single loop latch.
3771   auto *Preheader = OrigLoop->getLoopPreheader();
3772   auto *Latch = OrigLoop->getLoopLatch();
3773 
3774   // Get the initial and previous values of the scalar recurrence.
3775   auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3776   auto *Previous = Phi->getIncomingValueForBlock(Latch);
3777 
3778   // Create a vector from the initial value.
3779   auto *VectorInit = ScalarInit;
3780   if (VF.isVector()) {
3781     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3782     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
3783     VectorInit = Builder.CreateInsertElement(
3784         UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
3785         Builder.getInt32(VF.getKnownMinValue() - 1), "vector.recur.init");
3786   }
3787 
3788   // We constructed a temporary phi node in the first phase of vectorization.
3789   // This phi node will eventually be deleted.
3790   Builder.SetInsertPoint(
3791       cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
3792 
3793   // Create a phi node for the new recurrence. The current value will either be
3794   // the initial value inserted into a vector or loop-varying vector value.
3795   auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
3796   VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
3797 
3798   // Get the vectorized previous value of the last part UF - 1. It appears last
3799   // among all unrolled iterations, due to the order of their construction.
3800   Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
3801 
3802   // Find and set the insertion point after the previous value if it is an
3803   // instruction.
3804   BasicBlock::iterator InsertPt;
3805   // Note that the previous value may have been constant-folded so it is not
3806   // guaranteed to be an instruction in the vector loop.
3807   // FIXME: Loop invariant values do not form recurrences. We should deal with
3808   //        them earlier.
3809   if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart))
3810     InsertPt = LoopVectorBody->getFirstInsertionPt();
3811   else {
3812     Instruction *PreviousInst = cast<Instruction>(PreviousLastPart);
3813     if (isa<PHINode>(PreviousLastPart))
3814       // If the previous value is a phi node, we should insert after all the phi
3815       // nodes in the block containing the PHI to avoid breaking basic block
3816       // verification. Note that the basic block may be different to
3817       // LoopVectorBody, in case we predicate the loop.
3818       InsertPt = PreviousInst->getParent()->getFirstInsertionPt();
3819     else
3820       InsertPt = ++PreviousInst->getIterator();
3821   }
3822   Builder.SetInsertPoint(&*InsertPt);
3823 
3824   // We will construct a vector for the recurrence by combining the values for
3825   // the current and previous iterations. This is the required shuffle mask.
3826   assert(!VF.isScalable());
3827   SmallVector<int, 8> ShuffleMask(VF.getKnownMinValue());
3828   ShuffleMask[0] = VF.getKnownMinValue() - 1;
3829   for (unsigned I = 1; I < VF.getKnownMinValue(); ++I)
3830     ShuffleMask[I] = I + VF.getKnownMinValue() - 1;
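  // For example, with VF = 4 the mask is <3, 4, 5, 6>, which produces
  // <v1[3], v2[0], v2[1], v2[2]> in the notation of the example above, where
  // v1 is the vector phi (or previous unrolled part) and v2 is the current
  // part's vectorized previous value.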
3831 
3832   // The vector from which to take the initial value for the current iteration
3833   // (actual or unrolled). Initially, this is the vector phi node.
3834   Value *Incoming = VecPhi;
3835 
3836   // Shuffle the current and previous vector and update the vector parts.
3837   for (unsigned Part = 0; Part < UF; ++Part) {
3838     Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
3839     Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
3840     auto *Shuffle =
3841         VF.isVector()
3842             ? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask)
3843             : Incoming;
3844     PhiPart->replaceAllUsesWith(Shuffle);
3845     cast<Instruction>(PhiPart)->eraseFromParent();
3846     VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
3847     Incoming = PreviousPart;
3848   }
3849 
3850   // Fix the latch value of the new recurrence in the vector loop.
3851   VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3852 
3853   // Extract the last vector element in the middle block. This will be the
3854   // initial value for the recurrence when jumping to the scalar loop.
3855   auto *ExtractForScalar = Incoming;
3856   if (VF.isVector()) {
3857     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3858     ExtractForScalar = Builder.CreateExtractElement(
3859         ExtractForScalar, Builder.getInt32(VF.getKnownMinValue() - 1),
3860         "vector.recur.extract");
3861   }
  // Extract the second-to-last element in the middle block if the
3863   // Phi is used outside the loop. We need to extract the phi itself
3864   // and not the last element (the phi update in the current iteration). This
3865   // will be the value when jumping to the exit block from the LoopMiddleBlock,
3866   // when the scalar loop is not run at all.
3867   Value *ExtractForPhiUsedOutsideLoop = nullptr;
3868   if (VF.isVector())
3869     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3870         Incoming, Builder.getInt32(VF.getKnownMinValue() - 2),
3871         "vector.recur.extract.for.phi");
  // When the loop is unrolled without being vectorized, initialize
  // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled
  // value of `Incoming`. This is analogous to the vectorized case above:
  // extracting the second-to-last element when VF > 1.
3876   else if (UF > 1)
3877     ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
3878 
3879   // Fix the initial value of the original recurrence in the scalar loop.
3880   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3881   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3882   for (auto *BB : predecessors(LoopScalarPreHeader)) {
3883     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3884     Start->addIncoming(Incoming, BB);
3885   }
3886 
3887   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3888   Phi->setName("scalar.recur");
3889 
3890   // Finally, fix users of the recurrence outside the loop. The users will need
3891   // either the last value of the scalar recurrence or the last value of the
3892   // vector recurrence we extracted in the middle block. Since the loop is in
3893   // LCSSA form, we just need to find all the phi nodes for the original scalar
3894   // recurrence in the exit block, and then add an edge for the middle block.
3895   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3896     if (LCSSAPhi.getIncomingValue(0) == Phi) {
3897       LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3898     }
3899   }
3900 }
3901 
3902 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
3903   Constant *Zero = Builder.getInt32(0);
3904 
  // Get its reduction variable descriptor.
3906   assert(Legal->isReductionVariable(Phi) &&
3907          "Unable to find the reduction variable");
3908   RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
3909 
3910   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3911   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3912   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3913   RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3914     RdxDesc.getMinMaxRecurrenceKind();
3915   setDebugLocFromInst(Builder, ReductionStartValue);
3916   bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi);
3917 
3918   // We need to generate a reduction vector from the incoming scalar.
3919   // To do so, we need to generate the 'identity' vector and override
3920   // one of the elements with the incoming scalar reduction. We need
3921   // to do it in the vector-loop preheader.
3922   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3923 
3924   // This is the vector-clone of the value that leaves the loop.
3925   Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3926 
  // Find the reduction identity value: zero for addition, or, and xor; one
  // for multiplication; -1 (all ones) for and.
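  // For example (illustrative), for an integer add reduction with start value
  // %s and VF = 4: Identity is <0, 0, 0, 0> and VectorStart is <%s, 0, 0, 0>,
  // so the start value contributes exactly once.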
3929   Value *Identity;
3930   Value *VectorStart;
3931   if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3932       RK == RecurrenceDescriptor::RK_FloatMinMax) {
    // MinMax reductions have the start value as their identity.
3934     if (VF == 1 || IsInLoopReductionPhi) {
3935       VectorStart = Identity = ReductionStartValue;
3936     } else {
3937       VectorStart = Identity =
3938         Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3939     }
3940   } else {
3941     // Handle other reduction kinds:
3942     Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
3943         RK, VecTy->getScalarType());
3944     if (VF == 1 || IsInLoopReductionPhi) {
3945       Identity = Iden;
3946       // This vector is the Identity vector where the first element is the
3947       // incoming scalar reduction.
3948       VectorStart = ReductionStartValue;
3949     } else {
3950       Identity = ConstantVector::getSplat(VF, Iden);
3951 
3952       // This vector is the Identity vector where the first element is the
3953       // incoming scalar reduction.
3954       VectorStart =
3955         Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3956     }
3957   }
3958 
3959   // Wrap flags are in general invalid after vectorization, clear them.
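  // (Reassociating the reduction into per-part and per-lane partial results
  // can overflow intermediate values even when the original scalar evaluation
  // order does not, so nsw/nuw are no longer justified.)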
3960   clearReductionWrapFlags(RdxDesc);
3961 
3962   // Fix the vector-loop phi.
3963 
3964   // Reductions do not have to start at zero. They can start with
3965   // any loop invariant values.
3966   BasicBlock *Latch = OrigLoop->getLoopLatch();
3967   Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
3968 
3969   for (unsigned Part = 0; Part < UF; ++Part) {
3970     Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
3971     Value *Val = getOrCreateVectorValue(LoopVal, Part);
3972     // Make sure to add the reduction start value only to the
3973     // first unroll part.
3974     Value *StartVal = (Part == 0) ? VectorStart : Identity;
3975     cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
3976     cast<PHINode>(VecRdxPhi)
3977       ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3978   }
3979 
3980   // Before each round, move the insertion point right between
3981   // the PHIs and the values we are going to write.
3982   // This allows us to write both PHINodes and the extractelement
3983   // instructions.
3984   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3985 
3986   setDebugLocFromInst(Builder, LoopExitInst);
3987 
3988   // If tail is folded by masking, the vector value to leave the loop should be
3989   // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
3990   // instead of the former.
3991   if (Cost->foldTailByMasking()) {
3992     for (unsigned Part = 0; Part < UF; ++Part) {
3993       Value *VecLoopExitInst =
3994           VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3995       Value *Sel = nullptr;
3996       for (User *U : VecLoopExitInst->users()) {
3997         if (isa<SelectInst>(U)) {
3998           assert(!Sel && "Reduction exit feeding two selects");
3999           Sel = U;
4000         } else
4001           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
4002       }
4003       assert(Sel && "Reduction exit feeds no select");
4004       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
4005 
4006       // If the target can create a predicated operator for the reduction at no
4007       // extra cost in the loop (for example a predicated vadd), it can be
4008       // cheaper for the select to remain in the loop than be sunk out of it,
4009       // and so use the select value for the phi instead of the old
4010       // LoopExitValue.
4011       RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
4012       if (PreferPredicatedReductionSelect ||
4013           TTI->preferPredicatedReductionSelect(
4014               RdxDesc.getRecurrenceBinOp(RdxDesc.getRecurrenceKind()),
4015               Phi->getType(), TargetTransformInfo::ReductionFlags())) {
4016         auto *VecRdxPhi = cast<PHINode>(getOrCreateVectorValue(Phi, Part));
4017         VecRdxPhi->setIncomingValueForBlock(
4018             LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel);
4019       }
4020     }
4021   }
4022 
4023   // If the vector reduction can be performed in a smaller type, we truncate
4024   // then extend the loop exit value to enable InstCombine to evaluate the
4025   // entire expression in the smaller type.
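  // For illustration: if an i32 add reduction was proven to need only 8 bits,
  // each part of the loop exit value is truncated to <VF x i8> and then
  // sign/zero-extended back to <VF x i32>, giving InstCombine the hint it
  // needs to narrow the whole reduction expression.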
4026   if (VF.isVector() && Phi->getType() != RdxDesc.getRecurrenceType()) {
4027     assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!");
4028     assert(!VF.isScalable() && "scalable vectors not yet supported.");
4029     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
4030     Builder.SetInsertPoint(
4031         LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
4032     VectorParts RdxParts(UF);
4033     for (unsigned Part = 0; Part < UF; ++Part) {
4034       RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
4035       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4036       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
4037                                         : Builder.CreateZExt(Trunc, VecTy);
4038       for (Value::user_iterator UI = RdxParts[Part]->user_begin();
4039            UI != RdxParts[Part]->user_end();)
4040         if (*UI != Trunc) {
4041           (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
4042           RdxParts[Part] = Extnd;
4043         } else {
4044           ++UI;
4045         }
4046     }
4047     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4048     for (unsigned Part = 0; Part < UF; ++Part) {
4049       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4050       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
4051     }
4052   }
4053 
4054   // Reduce all of the unrolled parts into a single vector.
4055   Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
4056   unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
4057 
4058   // The middle block terminator has already been assigned a DebugLoc here (the
4059   // OrigLoop's single latch terminator). We want the whole middle block to
4060   // appear to execute on this line because: (a) it is all compiler generated,
4061   // (b) these instructions are always executed after evaluating the latch
4062   // conditional branch, and (c) other passes may add new predecessors which
4063   // terminate on this line. This is the easiest way to ensure we don't
4064   // accidentally cause an extra step back into the loop while debugging.
4065   setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
4066   for (unsigned Part = 1; Part < UF; ++Part) {
4067     Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
4068     if (Op != Instruction::ICmp && Op != Instruction::FCmp)
4069       // Floating point operations had to be 'fast' to enable the reduction.
4070       ReducedPartRdx = addFastMathFlag(
4071           Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
4072                               ReducedPartRdx, "bin.rdx"),
4073           RdxDesc.getFastMathFlags());
4074     else
4075       ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
4076                                       RdxPart);
4077   }
4078 
4079   // Create the reduction after the loop. Note that inloop reductions create the
4080   // target reduction in the loop using a Reduction recipe.
4081   if (VF.isVector() && !IsInLoopReductionPhi) {
4082     bool NoNaN = Legal->hasFunNoNaNAttr();
4083     ReducedPartRdx =
4084         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
4085     // If the reduction can be performed in a smaller type, we need to extend
4086     // the reduction to the wider type before we branch to the original loop.
4087     if (Phi->getType() != RdxDesc.getRecurrenceType())
4088       ReducedPartRdx =
4089         RdxDesc.isSigned()
4090         ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
4091         : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
4092   }
4093 
4094   // Create a phi node that merges control-flow from the backedge-taken check
4095   // block and the middle block.
4096   PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
4097                                         LoopScalarPreHeader->getTerminator());
4098   for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
4099     BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
4100   BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
4101 
4102   // Now, we need to fix the users of the reduction variable
4103   // inside and outside of the scalar remainder loop.
4104   // We know that the loop is in LCSSA form. We need to update the
4105   // PHI nodes in the exit blocks.
4106   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4107     // All PHINodes need to have a single entry edge, or two if
4108     // we already fixed them.
4109     assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
4110 
4111     // We found a reduction value exit-PHI. Update it with the
4112     // incoming bypass edge.
4113     if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
4114       LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4115   } // end of the LCSSA phi scan.
4116 
  // Fix the scalar loop reduction variable with the incoming reduction sum
  // from the vector body and from the backedge value.
  int IncomingEdgeBlockIdx =
      Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4121   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4122   // Pick the other block.
4123   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4124   Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4125   Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4126 }
4127 
4128 void InnerLoopVectorizer::clearReductionWrapFlags(
4129     RecurrenceDescriptor &RdxDesc) {
4130   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
4131   if (RK != RecurrenceDescriptor::RK_IntegerAdd &&
4132       RK != RecurrenceDescriptor::RK_IntegerMult)
4133     return;
4134 
4135   Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
4136   assert(LoopExitInstr && "null loop exit instruction");
4137   SmallVector<Instruction *, 8> Worklist;
4138   SmallPtrSet<Instruction *, 8> Visited;
4139   Worklist.push_back(LoopExitInstr);
4140   Visited.insert(LoopExitInstr);
4141 
4142   while (!Worklist.empty()) {
4143     Instruction *Cur = Worklist.pop_back_val();
4144     if (isa<OverflowingBinaryOperator>(Cur))
4145       for (unsigned Part = 0; Part < UF; ++Part) {
4146         Value *V = getOrCreateVectorValue(Cur, Part);
4147         cast<Instruction>(V)->dropPoisonGeneratingFlags();
4148       }
4149 
4150     for (User *U : Cur->users()) {
4151       Instruction *UI = cast<Instruction>(U);
4152       if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
4153           Visited.insert(UI).second)
4154         Worklist.push_back(UI);
4155     }
4156   }
4157 }
4158 
4159 void InnerLoopVectorizer::fixLCSSAPHIs() {
4160   assert(!VF.isScalable() && "the code below assumes fixed width vectors");
4161   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4162     if (LCSSAPhi.getNumIncomingValues() == 1) {
4163       auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
4164       // Non-instruction incoming values will have only one value.
4165       unsigned LastLane = 0;
4166       if (isa<Instruction>(IncomingValue))
4167         LastLane = Cost->isUniformAfterVectorization(
4168                        cast<Instruction>(IncomingValue), VF)
4169                        ? 0
4170                        : VF.getKnownMinValue() - 1;
4171       // Can be a loop invariant incoming value or the last scalar value to be
4172       // extracted from the vectorized loop.
4173       Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4174       Value *lastIncomingValue =
4175           getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
4176       LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
4177     }
4178   }
4179 }
4180 
4181 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4182   // The basic block and loop containing the predicated instruction.
4183   auto *PredBB = PredInst->getParent();
4184   auto *VectorLoop = LI->getLoopFor(PredBB);
4185 
4186   // Initialize a worklist with the operands of the predicated instruction.
4187   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4188 
4189   // Holds instructions that we need to analyze again. An instruction may be
4190   // reanalyzed if we don't yet know if we can sink it or not.
4191   SmallVector<Instruction *, 8> InstsToReanalyze;
4192 
4193   // Returns true if a given use occurs in the predicated block. Phi nodes use
4194   // their operands in their corresponding predecessor blocks.
4195   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4196     auto *I = cast<Instruction>(U.getUser());
4197     BasicBlock *BB = I->getParent();
4198     if (auto *Phi = dyn_cast<PHINode>(I))
4199       BB = Phi->getIncomingBlock(
4200           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4201     return BB == PredBB;
4202   };
4203 
  // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, its
  // operands are then added to the worklist. The algorithm terminates when a
  // pass over the worklist does not sink any instruction.
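  // For example (illustrative), if an address computation feeding a
  // scalarized, predicated load is used only by that load, it is moved into
  // the predicated block so it only executes when the corresponding mask lane
  // is active.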
4208   bool Changed;
4209   do {
4210     // Add the instructions that need to be reanalyzed to the worklist, and
4211     // reset the changed indicator.
4212     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4213     InstsToReanalyze.clear();
4214     Changed = false;
4215 
4216     while (!Worklist.empty()) {
4217       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4218 
4219       // We can't sink an instruction if it is a phi node, is already in the
4220       // predicated block, is not in the loop, or may have side effects.
4221       if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
4222           !VectorLoop->contains(I) || I->mayHaveSideEffects())
4223         continue;
4224 
4225       // It's legal to sink the instruction if all its uses occur in the
4226       // predicated block. Otherwise, there's nothing to do yet, and we may
4227       // need to reanalyze the instruction.
4228       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4229         InstsToReanalyze.push_back(I);
4230         continue;
4231       }
4232 
      // Move the instruction to the beginning of the predicated block, and add
      // its operands to the worklist.
4235       I->moveBefore(&*PredBB->getFirstInsertionPt());
4236       Worklist.insert(I->op_begin(), I->op_end());
4237 
4238       // The sinking may have enabled other instructions to be sunk, so we will
4239       // need to iterate.
4240       Changed = true;
4241     }
4242   } while (Changed);
4243 }
4244 
4245 void InnerLoopVectorizer::fixNonInductionPHIs() {
4246   for (PHINode *OrigPhi : OrigPHIsToFix) {
4247     PHINode *NewPhi =
4248         cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
4249     unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
4250 
4251     SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
4252         predecessors(OrigPhi->getParent()));
4253     SmallVector<BasicBlock *, 2> VectorBBPredecessors(
4254         predecessors(NewPhi->getParent()));
4255     assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
4256            "Scalar and Vector BB should have the same number of predecessors");
4257 
4258     // The insertion point in Builder may be invalidated by the time we get
4259     // here. Force the Builder insertion point to something valid so that we do
4260     // not run into issues during insertion point restore in
4261     // getOrCreateVectorValue calls below.
4262     Builder.SetInsertPoint(NewPhi);
4263 
4264     // The predecessor order is preserved and we can rely on mapping between
4265     // scalar and vector block predecessors.
4266     for (unsigned i = 0; i < NumIncomingValues; ++i) {
4267       BasicBlock *NewPredBB = VectorBBPredecessors[i];
4268 
4269       // When looking up the new scalar/vector values to fix up, use incoming
4270       // values from original phi.
4271       Value *ScIncV =
4272           OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
4273 
4274       // Scalar incoming value may need a broadcast
4275       Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
4276       NewPhi->addIncoming(NewIncV, NewPredBB);
4277     }
4278   }
4279 }
4280 
4281 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPUser &Operands,
4282                                    unsigned UF, ElementCount VF,
4283                                    bool IsPtrLoopInvariant,
4284                                    SmallBitVector &IsIndexLoopInvariant,
4285                                    VPTransformState &State) {
4286   // Construct a vector GEP by widening the operands of the scalar GEP as
4287   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4288   // results in a vector of pointers when at least one operand of the GEP
4289   // is vector-typed. Thus, to keep the representation compact, we only use
4290   // vector-typed operands for loop-varying values.
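  // For example (illustrative), with VF = 4 a GEP with a loop-varying index
  //   %p = getelementptr i32, i32* %base, i64 %i
  // becomes a single vector GEP producing <4 x i32*>:
  //   %p.vec = getelementptr i32, i32* %base, <4 x i64> %i.vec
  // while the loop-invariant %base stays scalar.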
4291 
4292   if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
4293     // If we are vectorizing, but the GEP has only loop-invariant operands,
4294     // the GEP we build (by only using vector-typed operands for
4295     // loop-varying values) would be a scalar pointer. Thus, to ensure we
4296     // produce a vector of pointers, we need to either arbitrarily pick an
4297     // operand to broadcast, or broadcast a clone of the original GEP.
4298     // Here, we broadcast a clone of the original.
4299     //
4300     // TODO: If at some point we decide to scalarize instructions having
4301     //       loop-invariant operands, this special case will no longer be
4302     //       required. We would add the scalarization decision to
4303     //       collectLoopScalars() and teach getVectorValue() to broadcast
4304     //       the lane-zero scalar value.
4305     auto *Clone = Builder.Insert(GEP->clone());
4306     for (unsigned Part = 0; Part < UF; ++Part) {
4307       Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4308       VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart);
4309       addMetadata(EntryPart, GEP);
4310     }
4311   } else {
4312     // If the GEP has at least one loop-varying operand, we are sure to
4313     // produce a vector of pointers. But if we are only unrolling, we want
4314     // to produce a scalar GEP for each unroll part. Thus, the GEP we
4315     // produce with the code below will be scalar (if VF == 1) or vector
4316     // (otherwise). Note that for the unroll-only case, we still maintain
4317     // values in the vector mapping with initVector, as we do for other
4318     // instructions.
4319     for (unsigned Part = 0; Part < UF; ++Part) {
4320       // The pointer operand of the new GEP. If it's loop-invariant, we
4321       // won't broadcast it.
4322       auto *Ptr = IsPtrLoopInvariant ? State.get(Operands.getOperand(0), {0, 0})
4323                                      : State.get(Operands.getOperand(0), Part);
4324 
4325       // Collect all the indices for the new GEP. If any index is
4326       // loop-invariant, we won't broadcast it.
4327       SmallVector<Value *, 4> Indices;
4328       for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) {
4329         VPValue *Operand = Operands.getOperand(I);
4330         if (IsIndexLoopInvariant[I - 1])
4331           Indices.push_back(State.get(Operand, {0, 0}));
4332         else
4333           Indices.push_back(State.get(Operand, Part));
4334       }
4335 
4336       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4337       // but it should be a vector, otherwise.
4338       auto *NewGEP =
4339           GEP->isInBounds()
4340               ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4341                                           Indices)
4342               : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4343       assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
4344              "NewGEP is not a pointer vector");
4345       VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP);
4346       addMetadata(NewGEP, GEP);
4347     }
4348   }
4349 }
4350 
4351 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
4352                                               ElementCount VF) {
4353   assert(!VF.isScalable() && "scalable vectors not yet supported.");
4354   PHINode *P = cast<PHINode>(PN);
4355   if (EnableVPlanNativePath) {
4356     // Currently we enter here in the VPlan-native path for non-induction
4357     // PHIs where all control flow is uniform. We simply widen these PHIs.
4358     // Create a vector phi with no operands - the vector phi operands will be
4359     // set at the end of vector code generation.
4360     Type *VecTy =
4361         (VF.isScalar()) ? PN->getType() : VectorType::get(PN->getType(), VF);
4362     Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4363     VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
4364     OrigPHIsToFix.push_back(P);
4365 
4366     return;
4367   }
4368 
4369   assert(PN->getParent() == OrigLoop->getHeader() &&
4370          "Non-header phis should have been handled elsewhere");
4371 
4372   // In order to support recurrences we need to be able to vectorize Phi nodes.
4373   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4374   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4375   // this value when we vectorize all of the instructions that use the PHI.
4376   if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
4377     for (unsigned Part = 0; Part < UF; ++Part) {
4378       // This is phase one of vectorizing PHIs.
4379       bool ScalarPHI =
4380           (VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN));
4381       Type *VecTy =
4382           ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), VF);
4383       Value *EntryPart = PHINode::Create(
4384           VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4385       VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
4386     }
4387     return;
4388   }
4389 
4390   setDebugLocFromInst(Builder, P);
4391 
4392   // This PHINode must be an induction variable.
4393   // Make sure that we know about it.
4394   assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4395 
4396   InductionDescriptor II = Legal->getInductionVars().lookup(P);
4397   const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4398 
4399   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4400   // which can be found from the original scalar operations.
4401   switch (II.getKind()) {
4402   case InductionDescriptor::IK_NoInduction:
4403     llvm_unreachable("Unknown induction");
4404   case InductionDescriptor::IK_IntInduction:
4405   case InductionDescriptor::IK_FpInduction:
4406     llvm_unreachable("Integer/fp induction is handled elsewhere.");
4407   case InductionDescriptor::IK_PtrInduction: {
4408     // Handle the pointer induction variable case.
4409     assert(P->getType()->isPointerTy() && "Unexpected type.");
4410 
4411     if (Cost->isScalarAfterVectorization(P, VF)) {
      // This is the normalized induction variable that starts counting at zero.
4413       Value *PtrInd =
4414           Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType());
4415       // Determine the number of scalars we need to generate for each unroll
4416       // iteration. If the instruction is uniform, we only need to generate the
4417       // first lane. Otherwise, we generate all VF values.
4418       unsigned Lanes =
4419           Cost->isUniformAfterVectorization(P, VF) ? 1 : VF.getKnownMinValue();
4420       for (unsigned Part = 0; Part < UF; ++Part) {
4421         for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4422           Constant *Idx = ConstantInt::get(PtrInd->getType(),
4423                                            Lane + Part * VF.getKnownMinValue());
4424           Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4425           Value *SclrGep =
4426               emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
4427           SclrGep->setName("next.gep");
4428           VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
4429         }
4430       }
4431       return;
4432     }
4433     assert(isa<SCEVConstant>(II.getStep()) &&
4434            "Induction step not a SCEV constant!");
4435     Type *PhiType = II.getStep()->getType();
4436 
4437     // Build a pointer phi
4438     Value *ScalarStartValue = II.getStartValue();
4439     Type *ScStValueType = ScalarStartValue->getType();
4440     PHINode *NewPointerPhi =
4441         PHINode::Create(ScStValueType, 2, "pointer.phi", Induction);
4442     NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader);
4443 
4444     // A pointer induction, performed by using a gep
4445     BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
4446     Instruction *InductionLoc = LoopLatch->getTerminator();
4447     const SCEV *ScalarStep = II.getStep();
4448     SCEVExpander Exp(*PSE.getSE(), DL, "induction");
4449     Value *ScalarStepValue =
4450         Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
4451     Value *InductionGEP = GetElementPtrInst::Create(
4452         ScStValueType->getPointerElementType(), NewPointerPhi,
4453         Builder.CreateMul(
4454             ScalarStepValue,
4455             ConstantInt::get(PhiType, VF.getKnownMinValue() * UF)),
4456         "ptr.ind", InductionLoc);
4457     NewPointerPhi->addIncoming(InductionGEP, LoopLatch);
4458 
4459     // Create UF many actual address geps that use the pointer
4460     // phi as base and a vectorized version of the step value
4461     // (<step*0, ..., step*N>) as offset.
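    // For example (illustrative), with VF = 4, UF = 2 and step %s, Part 0
    // uses offsets <0, %s, 2*%s, 3*%s> and Part 1 uses
    // <4*%s, 5*%s, 6*%s, 7*%s> relative to the pointer phi.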
4462     for (unsigned Part = 0; Part < UF; ++Part) {
4463       SmallVector<Constant *, 8> Indices;
      // Create a vector of consecutive lane numbers:
      // Part * VF + 0, ..., Part * VF + VF - 1.
4465       for (unsigned i = 0; i < VF.getKnownMinValue(); ++i)
4466         Indices.push_back(
4467             ConstantInt::get(PhiType, i + Part * VF.getKnownMinValue()));
4468       Constant *StartOffset = ConstantVector::get(Indices);
4469 
4470       Value *GEP = Builder.CreateGEP(
4471           ScStValueType->getPointerElementType(), NewPointerPhi,
4472           Builder.CreateMul(
4473               StartOffset,
4474               Builder.CreateVectorSplat(VF.getKnownMinValue(), ScalarStepValue),
4475               "vector.gep"));
4476       VectorLoopValueMap.setVectorValue(P, Part, GEP);
4477     }
4478   }
4479   }
4480 }
4481 
4482 /// A helper function for checking whether an integer division-related
4483 /// instruction may divide by zero (in which case it must be predicated if
4484 /// executed conditionally in the scalar code).
4485 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
4486 /// Non-zero divisors that are non compile-time constants will not be
4487 /// converted into multiplication, so we will still end up scalarizing
4488 /// the division, but can do so w/o predication.
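/// For example (illustrative): 'udiv i32 %x, 3' has a non-zero constant
/// divisor and needs no predication, while 'udiv i32 %x, %y' may divide by
/// zero and must be predicated if it executes conditionally in the scalar
/// code.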
4489 static bool mayDivideByZero(Instruction &I) {
4490   assert((I.getOpcode() == Instruction::UDiv ||
4491           I.getOpcode() == Instruction::SDiv ||
4492           I.getOpcode() == Instruction::URem ||
4493           I.getOpcode() == Instruction::SRem) &&
4494          "Unexpected instruction");
4495   Value *Divisor = I.getOperand(1);
4496   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4497   return !CInt || CInt->isZero();
4498 }
4499 
4500 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPUser &User,
4501                                            VPTransformState &State) {
4502   assert(!VF.isScalable() && "scalable vectors not yet supported.");
4503   switch (I.getOpcode()) {
4504   case Instruction::Call:
4505   case Instruction::Br:
4506   case Instruction::PHI:
4507   case Instruction::GetElementPtr:
4508   case Instruction::Select:
4509     llvm_unreachable("This instruction is handled by a different recipe.");
4510   case Instruction::UDiv:
4511   case Instruction::SDiv:
4512   case Instruction::SRem:
4513   case Instruction::URem:
4514   case Instruction::Add:
4515   case Instruction::FAdd:
4516   case Instruction::Sub:
4517   case Instruction::FSub:
4518   case Instruction::FNeg:
4519   case Instruction::Mul:
4520   case Instruction::FMul:
4521   case Instruction::FDiv:
4522   case Instruction::FRem:
4523   case Instruction::Shl:
4524   case Instruction::LShr:
4525   case Instruction::AShr:
4526   case Instruction::And:
4527   case Instruction::Or:
4528   case Instruction::Xor: {
4529     // Just widen unops and binops.
4530     setDebugLocFromInst(Builder, &I);
4531 
4532     for (unsigned Part = 0; Part < UF; ++Part) {
4533       SmallVector<Value *, 2> Ops;
4534       for (VPValue *VPOp : User.operands())
4535         Ops.push_back(State.get(VPOp, Part));
4536 
4537       Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4538 
4539       if (auto *VecOp = dyn_cast<Instruction>(V))
4540         VecOp->copyIRFlags(&I);
4541 
4542       // Use this vector value for all users of the original instruction.
4543       VectorLoopValueMap.setVectorValue(&I, Part, V);
4544       addMetadata(V, &I);
4545     }
4546 
4547     break;
4548   }
4549   case Instruction::ICmp:
4550   case Instruction::FCmp: {
4551     // Widen compares. Generate vector compares.
4552     bool FCmp = (I.getOpcode() == Instruction::FCmp);
4553     auto *Cmp = cast<CmpInst>(&I);
4554     setDebugLocFromInst(Builder, Cmp);
4555     for (unsigned Part = 0; Part < UF; ++Part) {
4556       Value *A = State.get(User.getOperand(0), Part);
4557       Value *B = State.get(User.getOperand(1), Part);
4558       Value *C = nullptr;
4559       if (FCmp) {
4560         // Propagate fast math flags.
4561         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4562         Builder.setFastMathFlags(Cmp->getFastMathFlags());
4563         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4564       } else {
4565         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4566       }
4567       VectorLoopValueMap.setVectorValue(&I, Part, C);
4568       addMetadata(C, &I);
4569     }
4570 
4571     break;
4572   }
4573 
4574   case Instruction::ZExt:
4575   case Instruction::SExt:
4576   case Instruction::FPToUI:
4577   case Instruction::FPToSI:
4578   case Instruction::FPExt:
4579   case Instruction::PtrToInt:
4580   case Instruction::IntToPtr:
4581   case Instruction::SIToFP:
4582   case Instruction::UIToFP:
4583   case Instruction::Trunc:
4584   case Instruction::FPTrunc:
4585   case Instruction::BitCast: {
4586     auto *CI = cast<CastInst>(&I);
4587     setDebugLocFromInst(Builder, CI);
4588 
    // Vectorize casts.
4590     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
4591     Type *DestTy =
4592         (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF);
4593 
4594     for (unsigned Part = 0; Part < UF; ++Part) {
4595       Value *A = State.get(User.getOperand(0), Part);
4596       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4597       VectorLoopValueMap.setVectorValue(&I, Part, Cast);
4598       addMetadata(Cast, &I);
4599     }
4600     break;
4601   }
4602   default:
4603     // This instruction is not vectorized by simple widening.
4604     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4605     llvm_unreachable("Unhandled instruction!");
4606   } // end of switch.
4607 }
4608 
4609 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands,
4610                                                VPTransformState &State) {
4611   assert(!isa<DbgInfoIntrinsic>(I) &&
4612          "DbgInfoIntrinsic should have been dropped during VPlan construction");
4613   setDebugLocFromInst(Builder, &I);
4614 
4615   Module *M = I.getParent()->getParent()->getParent();
4616   auto *CI = cast<CallInst>(&I);
4617 
4618   SmallVector<Type *, 4> Tys;
4619   for (Value *ArgOperand : CI->arg_operands())
4620     Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
4621 
4622   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4623 
  // This flag indicates whether we use an intrinsic or a regular call for the
  // vectorized version of the instruction, i.e., whether calling the vector
  // intrinsic is cheaper than calling a vector library function.
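  // For example (illustrative): a call to @llvm.sqrt.f32 may be widened to
  // @llvm.sqrt.v4f32 when the intrinsic cost wins, or to a vector library
  // routine found via VFDatabase otherwise.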
4627   bool NeedToScalarize = false;
4628   unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4629   bool UseVectorIntrinsic =
4630       ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
4631   assert((UseVectorIntrinsic || !NeedToScalarize) &&
4632          "Instruction should be scalarized elsewhere.");
4633 
4634   for (unsigned Part = 0; Part < UF; ++Part) {
4635     SmallVector<Value *, 4> Args;
4636     for (auto &I : enumerate(ArgOperands.operands())) {
4637       // Some intrinsics have a scalar argument - don't replace it with a
4638       // vector.
4639       Value *Arg;
4640       if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
4641         Arg = State.get(I.value(), Part);
4642       else
4643         Arg = State.get(I.value(), {0, 0});
4644       Args.push_back(Arg);
4645     }
4646 
4647     Function *VectorF;
4648     if (UseVectorIntrinsic) {
4649       // Use vector version of the intrinsic.
4650       Type *TysForDecl[] = {CI->getType()};
4651       if (VF.isVector()) {
4652         assert(!VF.isScalable() && "VF is assumed to be non scalable.");
4653         TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4654       }
4655       VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4656       assert(VectorF && "Can't retrieve vector intrinsic.");
4657     } else {
4658       // Use vector version of the function call.
4659       const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
4660 #ifndef NDEBUG
4661       assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
4662              "Can't create vector function.");
4663 #endif
      VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
    }
    SmallVector<OperandBundleDef, 1> OpBundles;
    CI->getOperandBundlesAsDefs(OpBundles);
    CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);

    if (isa<FPMathOperator>(V))
      V->copyFastMathFlags(CI);

    VectorLoopValueMap.setVectorValue(&I, Part, V);
    addMetadata(V, &I);
4675   }
4676 }
4677 
4678 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I,
4679                                                  VPUser &Operands,
4680                                                  bool InvariantCond,
4681                                                  VPTransformState &State) {
4682   setDebugLocFromInst(Builder, &I);
4683 
  // The condition can be loop invariant but still defined inside the
4685   // loop. This means that we can't just use the original 'cond' value.
4686   // We have to take the 'vectorized' value and pick the first lane.
4687   // Instcombine will make this a no-op.
4688   auto *InvarCond =
4689       InvariantCond ? State.get(Operands.getOperand(0), {0, 0}) : nullptr;
4690 
4691   for (unsigned Part = 0; Part < UF; ++Part) {
4692     Value *Cond =
4693         InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part);
4694     Value *Op0 = State.get(Operands.getOperand(1), Part);
4695     Value *Op1 = State.get(Operands.getOperand(2), Part);
4696     Value *Sel = Builder.CreateSelect(Cond, Op0, Op1);
4697     VectorLoopValueMap.setVectorValue(&I, Part, Sel);
4698     addMetadata(Sel, &I);
4699   }
4700 }
4701 
4702 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
4703   // We should not collect Scalars more than once per VF. Right now, this
4704   // function is called from collectUniformsAndScalars(), which already does
4705   // this check. Collecting Scalars for VF=1 does not make any sense.
4706   assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
4707          "This function should not be visited twice for the same VF");
4708 
4709   SmallSetVector<Instruction *, 8> Worklist;
4710 
4711   // These sets are used to seed the analysis with pointers used by memory
4712   // accesses that will remain scalar.
4713   SmallSetVector<Instruction *, 8> ScalarPtrs;
4714   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4715   auto *Latch = TheLoop->getLoopLatch();
4716 
4717   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4718   // The pointer operands of loads and stores will be scalar as long as the
4719   // memory access is not a gather or scatter operation. The value operand of a
4720   // store will remain scalar if the store is scalarized.
4721   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4722     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4723     assert(WideningDecision != CM_Unknown &&
4724            "Widening decision should be ready at this moment");
4725     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4726       if (Ptr == Store->getValueOperand())
4727         return WideningDecision == CM_Scalarize;
4728     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4729            "Ptr is neither a value or pointer operand");
4730     return WideningDecision != CM_GatherScatter;
4731   };
4732 
4733   // A helper that returns true if the given value is a bitcast or
4734   // getelementptr instruction contained in the loop.
4735   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4736     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4737             isa<GetElementPtrInst>(V)) &&
4738            !TheLoop->isLoopInvariant(V);
4739   };
4740 
4741   auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) {
4742     if (!isa<PHINode>(Ptr) ||
4743         !Legal->getInductionVars().count(cast<PHINode>(Ptr)))
4744       return false;
4745     auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)];
4746     if (Induction.getKind() != InductionDescriptor::IK_PtrInduction)
4747       return false;
4748     return isScalarUse(MemAccess, Ptr);
4749   };
4750 
  // A helper that evaluates a memory access's use of a pointer. If the
  // pointer is the pointer induction of a loop, it is inserted into the
  // Worklist. If the use will be a scalar use, and the pointer is only used
  // by memory accesses, we place the pointer in ScalarPtrs. Otherwise, the
  // pointer is placed in PossibleNonScalarPtrs.
4756   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4757     if (isScalarPtrInduction(MemAccess, Ptr)) {
4758       Worklist.insert(cast<Instruction>(Ptr));
4759       Instruction *Update = cast<Instruction>(
4760           cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
4761       Worklist.insert(Update);
4762       LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
4763                         << "\n");
4764       LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update
4765                         << "\n");
4766       return;
4767     }
4768     // We only care about bitcast and getelementptr instructions contained in
4769     // the loop.
4770     if (!isLoopVaryingBitCastOrGEP(Ptr))
4771       return;
4772 
4773     // If the pointer has already been identified as scalar (e.g., if it was
4774     // also identified as uniform), there's nothing to do.
4775     auto *I = cast<Instruction>(Ptr);
4776     if (Worklist.count(I))
4777       return;
4778 
4779     // If the use of the pointer will be a scalar use, and all users of the
4780     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4781     // place the pointer in PossibleNonScalarPtrs.
4782     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4783           return isa<LoadInst>(U) || isa<StoreInst>(U);
4784         }))
4785       ScalarPtrs.insert(I);
4786     else
4787       PossibleNonScalarPtrs.insert(I);
4788   };
4789 
  // We seed the scalars analysis with two classes of instructions: (1)
  // instructions marked uniform-after-vectorization and (2) bitcast,
  // getelementptr and (pointer) phi instructions used by memory accesses
  // requiring a scalar use.
4794   //
4795   // (1) Add to the worklist all instructions that have been identified as
4796   // uniform-after-vectorization.
4797   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4798 
4799   // (2) Add to the worklist all bitcast and getelementptr instructions used by
4800   // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory access is not a gather or
4802   // scatter operation. The value operand of a store will remain scalar if the
4803   // store is scalarized.
4804   for (auto *BB : TheLoop->blocks())
4805     for (auto &I : *BB) {
4806       if (auto *Load = dyn_cast<LoadInst>(&I)) {
4807         evaluatePtrUse(Load, Load->getPointerOperand());
4808       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4809         evaluatePtrUse(Store, Store->getPointerOperand());
4810         evaluatePtrUse(Store, Store->getValueOperand());
4811       }
4812     }
4813   for (auto *I : ScalarPtrs)
4814     if (!PossibleNonScalarPtrs.count(I)) {
4815       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4816       Worklist.insert(I);
4817     }
4818 
4819   // Insert the forced scalars.
4820   // FIXME: Currently widenPHIInstruction() often creates a dead vector
4821   // induction variable when the PHI user is scalarized.
4822   auto ForcedScalar = ForcedScalars.find(VF);
4823   if (ForcedScalar != ForcedScalars.end())
4824     for (auto *I : ForcedScalar->second)
4825       Worklist.insert(I);
4826 
4827   // Expand the worklist by looking through any bitcasts and getelementptr
4828   // instructions we've already identified as scalar. This is similar to the
4829   // expansion step in collectLoopUniforms(); however, here we're only
4830   // expanding to include additional bitcasts and getelementptr instructions.
4831   unsigned Idx = 0;
4832   while (Idx != Worklist.size()) {
4833     Instruction *Dst = Worklist[Idx++];
4834     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4835       continue;
4836     auto *Src = cast<Instruction>(Dst->getOperand(0));
4837     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4838           auto *J = cast<Instruction>(U);
4839           return !TheLoop->contains(J) || Worklist.count(J) ||
4840                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4841                   isScalarUse(J, Src));
4842         })) {
4843       Worklist.insert(Src);
4844       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4845     }
4846   }
4847 
4848   // An induction variable will remain scalar if all users of the induction
4849   // variable and induction variable update remain scalar.
4850   for (auto &Induction : Legal->getInductionVars()) {
4851     auto *Ind = Induction.first;
4852     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4853 
4854     // If tail-folding is applied, the primary induction variable will be used
4855     // to feed a vector compare.
4856     if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
4857       continue;
4858 
4859     // Determine if all users of the induction variable are scalar after
4860     // vectorization.
4861     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4862       auto *I = cast<Instruction>(U);
4863       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
4864     });
4865     if (!ScalarInd)
4866       continue;
4867 
4868     // Determine if all users of the induction variable update instruction are
4869     // scalar after vectorization.
4870     auto ScalarIndUpdate =
4871         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4872           auto *I = cast<Instruction>(U);
4873           return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
4874         });
4875     if (!ScalarIndUpdate)
4876       continue;
4877 
4878     // The induction variable and its update instruction will remain scalar.
4879     Worklist.insert(Ind);
4880     Worklist.insert(IndUpdate);
4881     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4882     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4883                       << "\n");
4884   }
4885 
4886   Scalars[VF].insert(Worklist.begin(), Worklist.end());
4887 }
4888 
4889 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I,
4890                                                          ElementCount VF) {
4891   assert(!VF.isScalable() && "scalable vectors not yet supported.");
4892   if (!blockNeedsPredication(I->getParent()))
4893     return false;
  switch (I->getOpcode()) {
4895   default:
4896     break;
4897   case Instruction::Load:
4898   case Instruction::Store: {
4899     if (!Legal->isMaskRequired(I))
4900       return false;
4901     auto *Ptr = getLoadStorePointerOperand(I);
4902     auto *Ty = getMemInstValueType(I);
4903     // We have already decided how to vectorize this instruction, get that
4904     // result.
4905     if (VF.isVector()) {
4906       InstWidening WideningDecision = getWideningDecision(I, VF);
4907       assert(WideningDecision != CM_Unknown &&
4908              "Widening decision should be ready at this moment");
4909       return WideningDecision == CM_Scalarize;
4910     }
4911     const Align Alignment = getLoadStoreAlignment(I);
4912     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
4913                                 isLegalMaskedGather(Ty, Alignment))
4914                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
4915                                 isLegalMaskedScatter(Ty, Alignment));
4916   }
4917   case Instruction::UDiv:
4918   case Instruction::SDiv:
4919   case Instruction::SRem:
4920   case Instruction::URem:
4921     return mayDivideByZero(*I);
4922   }
4923   return false;
4924 }
4925 
4926 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
4927     Instruction *I, ElementCount VF) {
4928   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4929   assert(getWideningDecision(I, VF) == CM_Unknown &&
4930          "Decision should not be set yet.");
4931   auto *Group = getInterleavedAccessGroup(I);
4932   assert(Group && "Must have a group.");
4933 
  // If the instruction's allocated size doesn't equal its type size, it
4935   // requires padding and will be scalarized.
4936   auto &DL = I->getModule()->getDataLayout();
4937   auto *ScalarTy = getMemInstValueType(I);
4938   if (hasIrregularType(ScalarTy, DL, VF))
4939     return false;
4940 
4941   // Check if masking is required.
4942   // A Group may need masking for one of two reasons: it resides in a block that
4943   // needs predication, or it was decided to use masking to deal with gaps.
4944   bool PredicatedAccessRequiresMasking =
4945       Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
4946   bool AccessWithGapsRequiresMasking =
4947       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
4948   if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
4949     return true;
4950 
4951   // If masked interleaving is required, we expect that the user/target had
4952   // enabled it, because otherwise it either wouldn't have been created or
4953   // it should have been invalidated by the CostModel.
4954   assert(useMaskedInterleavedAccesses(TTI) &&
4955          "Masked interleave-groups for predicated accesses are not enabled.");
4956 
4957   auto *Ty = getMemInstValueType(I);
4958   const Align Alignment = getLoadStoreAlignment(I);
4959   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4960                           : TTI.isLegalMaskedStore(Ty, Alignment);
4961 }
4962 
4963 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
4964     Instruction *I, ElementCount VF) {
4965   // Get and ensure we have a valid memory instruction.
4966   LoadInst *LI = dyn_cast<LoadInst>(I);
4967   StoreInst *SI = dyn_cast<StoreInst>(I);
4968   assert((LI || SI) && "Invalid memory instruction");
4969 
4970   auto *Ptr = getLoadStorePointerOperand(I);
4971 
  // First of all, to be widened the pointer must be consecutive.
4973   if (!Legal->isConsecutivePtr(Ptr))
4974     return false;
4975 
4976   // If the instruction is a store located in a predicated block, it will be
4977   // scalarized.
4978   if (isScalarWithPredication(I))
4979     return false;
4980 
  // If the instruction's allocated size doesn't equal its type size, it
4982   // requires padding and will be scalarized.
4983   auto &DL = I->getModule()->getDataLayout();
4984   auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
4985   if (hasIrregularType(ScalarTy, DL, VF))
4986     return false;
4987 
4988   return true;
4989 }
4990 
4991 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
4992   // We should not collect Uniforms more than once per VF. Right now,
4993   // this function is called from collectUniformsAndScalars(), which
4994   // already does this check. Collecting Uniforms for VF=1 does not make any
4995   // sense.
4996 
4997   assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
4998          "This function should not be visited twice for the same VF");
4999 
  // Visit the list of Uniforms. If we do not find any uniform value, we will
  // not analyze it again: Uniforms.count(VF) will still return 1.
5002   Uniforms[VF].clear();
5003 
5004   // We now know that the loop is vectorizable!
5005   // Collect instructions inside the loop that will remain uniform after
5006   // vectorization.
5007 
  // Global values, params and instructions outside of the current loop are
  // out of scope.
5010   auto isOutOfScope = [&](Value *V) -> bool {
5011     Instruction *I = dyn_cast<Instruction>(V);
5012     return (!I || !TheLoop->contains(I));
5013   };
5014 
5015   SetVector<Instruction *> Worklist;
5016   BasicBlock *Latch = TheLoop->getLoopLatch();
5017 
5018   // Instructions that are scalar with predication must not be considered
5019   // uniform after vectorization, because that would create an erroneous
5020   // replicating region where only a single instance out of VF should be formed.
5021   // TODO: optimize such seldom cases if found important, see PR40816.
5022   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
5023     if (isScalarWithPredication(I, VF)) {
5024       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
5025                         << *I << "\n");
5026       return;
5027     }
5028     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
5029     Worklist.insert(I);
5030   };
5031 
5032   // Start with the conditional branch. If the branch condition is an
5033   // instruction contained in the loop that is only used by the branch, it is
5034   // uniform.
5035   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
5036   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
5037     addToWorklistIfAllowed(Cmp);
5038 
5039   // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
5040   // are pointers that are treated like consecutive pointers during
5041   // vectorization. The pointer operands of interleaved accesses are an
5042   // example.
5043   SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;
5044 
5045   // Holds pointer operands of instructions that are possibly non-uniform.
5046   SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;
5047 
5048   auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
5049     InstWidening WideningDecision = getWideningDecision(I, VF);
5050     assert(WideningDecision != CM_Unknown &&
5051            "Widening decision should be ready at this moment");
5052 
5053     return (WideningDecision == CM_Widen ||
5054             WideningDecision == CM_Widen_Reverse ||
5055             WideningDecision == CM_Interleave);
5056   };
5057   // Iterate over the instructions in the loop, and collect all
5058   // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
5059   // that a consecutive-like pointer operand will be scalarized, we collect it
5060   // in PossibleNonUniformPtrs instead. We use two sets here because a single
5061   // getelementptr instruction can be used by both vectorized and scalarized
5062   // memory instructions. For example, if a loop loads and stores from the same
5063   // location, but the store is conditional, the store will be scalarized, and
5064   // the getelementptr won't remain uniform.
5065   for (auto *BB : TheLoop->blocks())
5066     for (auto &I : *BB) {
5067       // If there's no pointer operand, there's nothing to do.
5068       auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
5069       if (!Ptr)
5070         continue;
5071 
5072       // True if all users of Ptr are memory accesses that have Ptr as their
5073       // pointer operand.
5074       auto UsersAreMemAccesses =
5075           llvm::all_of(Ptr->users(), [&](User *U) -> bool {
5076             return getLoadStorePointerOperand(U) == Ptr;
5077           });
5078 
5079       // Ensure the memory instruction will not be scalarized or used by
5080       // gather/scatter, making its pointer operand non-uniform. If the pointer
5081       // operand is used by any instruction other than a memory access, we
5082       // conservatively assume the pointer operand may be non-uniform.
5083       if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
5084         PossibleNonUniformPtrs.insert(Ptr);
5085 
5086       // If the memory instruction will be vectorized and its pointer operand
5087       // is consecutive-like, or interleaving - the pointer operand should
5088       // remain uniform.
5089       else
5090         ConsecutiveLikePtrs.insert(Ptr);
5091     }
5092 
5093   // Add to the Worklist all consecutive and consecutive-like pointers that
5094   // aren't also identified as possibly non-uniform.
5095   for (auto *V : ConsecutiveLikePtrs)
5096     if (!PossibleNonUniformPtrs.count(V))
5097       addToWorklistIfAllowed(V);
5098 
  // Expand Worklist in topological order: whenever a new instruction
  // is added, its users should already be inside Worklist. This ensures that
  // a uniform instruction will only be used by uniform instructions.
5102   unsigned idx = 0;
5103   while (idx != Worklist.size()) {
5104     Instruction *I = Worklist[idx++];
5105 
5106     for (auto OV : I->operand_values()) {
5107       // isOutOfScope operands cannot be uniform instructions.
5108       if (isOutOfScope(OV))
5109         continue;
      // First-order recurrence phis should typically be considered
      // non-uniform.
5112       auto *OP = dyn_cast<PHINode>(OV);
5113       if (OP && Legal->isFirstOrderRecurrence(OP))
5114         continue;
5115       // If all the users of the operand are uniform, then add the
5116       // operand into the uniform worklist.
5117       auto *OI = cast<Instruction>(OV);
5118       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
5119             auto *J = cast<Instruction>(U);
5120             return Worklist.count(J) ||
5121                    (OI == getLoadStorePointerOperand(J) &&
5122                     isUniformDecision(J, VF));
5123           }))
5124         addToWorklistIfAllowed(OI);
5125     }
5126   }
5127 
5128   // Returns true if Ptr is the pointer operand of a memory access instruction
5129   // I, and I is known to not require scalarization.
5130   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
5131     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
5132   };
5133 
5134   // For an instruction to be added into Worklist above, all its users inside
5135   // the loop should also be in Worklist. However, this condition cannot be
5136   // true for phi nodes that form a cyclic dependence. We must process phi
5137   // nodes separately. An induction variable will remain uniform if all users
5138   // of the induction variable and induction variable update remain uniform.
5139   // The code below handles both pointer and non-pointer induction variables.
5140   for (auto &Induction : Legal->getInductionVars()) {
5141     auto *Ind = Induction.first;
5142     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5143 
5144     // Determine if all users of the induction variable are uniform after
5145     // vectorization.
5146     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
5147       auto *I = cast<Instruction>(U);
5148       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
5149              isVectorizedMemAccessUse(I, Ind);
5150     });
5151     if (!UniformInd)
5152       continue;
5153 
5154     // Determine if all users of the induction variable update instruction are
5155     // uniform after vectorization.
5156     auto UniformIndUpdate =
5157         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5158           auto *I = cast<Instruction>(U);
5159           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
5160                  isVectorizedMemAccessUse(I, IndUpdate);
5161         });
5162     if (!UniformIndUpdate)
5163       continue;
5164 
5165     // The induction variable and its update instruction will remain uniform.
5166     addToWorklistIfAllowed(Ind);
5167     addToWorklistIfAllowed(IndUpdate);
5168   }
5169 
5170   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
5171 }
5172 
5173 bool LoopVectorizationCostModel::runtimeChecksRequired() {
5174   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
5175 
5176   if (Legal->getRuntimePointerChecking()->Need) {
5177     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
5178         "runtime pointer checks needed. Enable vectorization of this "
5179         "loop with '#pragma clang loop vectorize(enable)' when "
5180         "compiling with -Os/-Oz",
5181         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5182     return true;
5183   }
5184 
5185   if (!PSE.getUnionPredicate().getPredicates().empty()) {
5186     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
5187         "runtime SCEV checks needed. Enable vectorization of this "
5188         "loop with '#pragma clang loop vectorize(enable)' when "
5189         "compiling with -Os/-Oz",
5190         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5191     return true;
5192   }
5193 
5194   // FIXME: Avoid specializing for stride==1 instead of bailing out.
5195   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
5196     reportVectorizationFailure("Runtime stride check for small trip count",
5197         "runtime stride == 1 checks needed. Enable vectorization of "
5198         "this loop without such check by compiling with -Os/-Oz",
5199         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5200     return true;
5201   }
5202 
5203   return false;
5204 }
5205 
5206 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(unsigned UserVF,
5207                                                             unsigned UserIC) {
5208   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do this, since the check is still likely to
    // be dynamically uniform if the target can skip it.
5211     reportVectorizationFailure(
5212         "Not inserting runtime ptr check for divergent target",
5213         "runtime pointer checks needed. Not enabled for divergent target",
5214         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5215     return None;
5216   }
5217 
5218   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5219   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5220   if (TC == 1) {
5221     reportVectorizationFailure("Single iteration (non) loop",
5222         "loop trip count is one, irrelevant for vectorization",
5223         "SingleIterationLoop", ORE, TheLoop);
5224     return None;
5225   }
5226 
5227   switch (ScalarEpilogueStatus) {
5228   case CM_ScalarEpilogueAllowed:
5229     return UserVF ? UserVF : computeFeasibleMaxVF(TC);
5230   case CM_ScalarEpilogueNotNeededUsePredicate:
5231     LLVM_DEBUG(
5232         dbgs() << "LV: vector predicate hint/switch found.\n"
5233                << "LV: Not allowing scalar epilogue, creating predicated "
5234                << "vector loop.\n");
5235     break;
5236   case CM_ScalarEpilogueNotAllowedLowTripLoop:
5237     // fallthrough as a special case of OptForSize
5238   case CM_ScalarEpilogueNotAllowedOptSize:
5239     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5240       LLVM_DEBUG(
5241           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5242     else
5243       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5244                         << "count.\n");
5245 
    // Bail if runtime checks are required, which are not good when optimizing
    // for size.
5248     if (runtimeChecksRequired())
5249       return None;
5250     break;
5251   }
5252 
  // Now try tail folding.
5254 
5255   // Invalidate interleave groups that require an epilogue if we can't mask
5256   // the interleave-group.
5257   if (!useMaskedInterleavedAccesses(TTI)) {
5258     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5259            "No decisions should have been taken at this point");
    // Note: There is no need to invalidate any cost modeling decisions here, as
    // none were taken so far.
5262     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5263   }
5264 
5265   unsigned MaxVF = UserVF ? UserVF : computeFeasibleMaxVF(TC);
5266   assert((UserVF || isPowerOf2_32(MaxVF)) && "MaxVF must be a power of 2");
5267   unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF;
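  // For example (illustrative values), with MaxVF = 8 and UserIC = 2,
  // MaxVFtimesIC is 16, so a known trip count of 32 leaves no tail and MaxVF
  // is accepted below.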
5268   if (TC > 0 && TC % MaxVFtimesIC == 0) {
5269     // Accept MaxVF if we do not have a tail.
5270     LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5271     return MaxVF;
5272   }
5273 
5274   // If we don't know the precise trip count, or if the trip count that we
5275   // found modulo the vectorization factor is not zero, try to fold the tail
5276   // by masking.
5277   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5278   if (Legal->prepareToFoldTailByMasking()) {
5279     FoldTailByMasking = true;
5280     return MaxVF;
5281   }
5282 
  // If there was a tail-folding hint/switch, but we can't fold the tail by
  // masking, fall back to vectorization with a scalar epilogue.
5285   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
    if (PreferPredicateOverEpilogue ==
        PreferPredicateTy::PredicateOrDontVectorize) {
5287       LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
5288       return None;
5289     }
5290     LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5291                          "scalar epilogue instead.\n");
5292     ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5293     return MaxVF;
5294   }
5295 
5296   if (TC == 0) {
5297     reportVectorizationFailure(
5298         "Unable to calculate the loop count due to complex control flow",
5299         "unable to calculate the loop count due to complex control flow",
5300         "UnknownLoopCountComplexCFG", ORE, TheLoop);
5301     return None;
5302   }
5303 
5304   reportVectorizationFailure(
5305       "Cannot optimize for size and vectorize at the same time.",
5306       "cannot optimize for size and vectorize at the same time. "
5307       "Enable vectorization of this loop with '#pragma clang loop "
5308       "vectorize(enable)' when compiling with -Os/-Oz",
5309       "NoTailLoopWithOptForSize", ORE, TheLoop);
5310   return None;
5311 }
5312 
5313 unsigned
5314 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
5315   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
5316   unsigned SmallestType, WidestType;
5317   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
5318   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
5319 
5320   // Get the maximum safe dependence distance in bits computed by LAA.
5321   // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory access that is most restrictive (involved in the smallest
5323   // dependence distance).
5324   unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();
5325 
5326   WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
5327 
5328   // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
  // Note that both WidestRegister and WidestType may not be powers of 2.
5330   unsigned MaxVectorSize = PowerOf2Floor(WidestRegister / WidestType);
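  // For example (illustrative values), 256-bit wide registers and a widest
  // scalar type of 32 bits yield MaxVectorSize = PowerOf2Floor(256 / 32) = 8.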
5331 
5332   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5333                     << " / " << WidestType << " bits.\n");
5334   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5335                     << WidestRegister << " bits.\n");
5336 
5337   assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"
5338                                  " into one vector!");
5339   if (MaxVectorSize == 0) {
5340     LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
5341     MaxVectorSize = 1;
5342     return MaxVectorSize;
5343   } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
5344              isPowerOf2_32(ConstTripCount)) {
5345     // We need to clamp the VF to be the ConstTripCount. There is no point in
5346     // choosing a higher viable VF as done in the loop below.
5347     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
5348                       << ConstTripCount << "\n");
5349     MaxVectorSize = ConstTripCount;
5350     return MaxVectorSize;
5351   }
5352 
5353   unsigned MaxVF = MaxVectorSize;
5354   if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
5355       (MaximizeBandwidth && isScalarEpilogueAllowed())) {
5356     // Collect all viable vectorization factors larger than the default MaxVF
5357     // (i.e. MaxVectorSize).
5358     SmallVector<ElementCount, 8> VFs;
5359     unsigned NewMaxVectorSize = WidestRegister / SmallestType;
5360     for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
5361       VFs.push_back(ElementCount::getFixed(VS));
5362 
5363     // For each VF calculate its register usage.
5364     auto RUs = calculateRegisterUsage(VFs);
5365 
5366     // Select the largest VF which doesn't require more registers than existing
5367     // ones.
5368     for (int i = RUs.size() - 1; i >= 0; --i) {
5369       bool Selected = true;
5370       for (auto& pair : RUs[i].MaxLocalUsers) {
5371         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5372         if (pair.second > TargetNumRegisters)
5373           Selected = false;
5374       }
5375       if (Selected) {
5376         MaxVF = VFs[i].getKnownMinValue();
5377         break;
5378       }
5379     }
5380     if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
5381       if (MaxVF < MinVF) {
5382         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5383                           << ") with target's minimum: " << MinVF << '\n');
5384         MaxVF = MinVF;
5385       }
5386     }
5387   }
5388   return MaxVF;
5389 }
5390 
5391 VectorizationFactor
5392 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
5393   float Cost = expectedCost(ElementCount::getFixed(1)).first;
5394   const float ScalarCost = Cost;
5395   unsigned Width = 1;
5396   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
5397 
5398   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5399   if (ForceVectorization && MaxVF > 1) {
5400     // Ignore scalar width, because the user explicitly wants vectorization.
5401     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5402     // evaluation.
5403     Cost = std::numeric_limits<float>::max();
5404   }
5405 
5406   for (unsigned i = 2; i <= MaxVF; i *= 2) {
    // Notice that the vector loop needs to be executed fewer times, so
    // we need to divide the cost of the vector loop by the width of
    // the vector elements.
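    // For example (illustrative costs), if the scalar iteration costs 8 and
    // the VF = 4 loop body costs 20, the per-iteration vector cost is
    // 20 / 4 = 5, which is cheaper than the scalar cost of 8.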
5410     VectorizationCostTy C = expectedCost(ElementCount::getFixed(i));
5411     float VectorCost = C.first / (float)i;
5412     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5413                       << " costs: " << (int)VectorCost << ".\n");
5414     if (!C.second && !ForceVectorization) {
5415       LLVM_DEBUG(
5416           dbgs() << "LV: Not considering vector loop of width " << i
5417                  << " because it will not generate any vector instructions.\n");
5418       continue;
5419     }
5420     if (VectorCost < Cost) {
5421       Cost = VectorCost;
5422       Width = i;
5423     }
5424   }
5425 
5426   if (!EnableCondStoresVectorization && NumPredStores) {
5427     reportVectorizationFailure("There are conditional stores.",
5428         "store that is conditionally executed prevents vectorization",
5429         "ConditionalStore", ORE, TheLoop);
5430     Width = 1;
5431     Cost = ScalarCost;
5432   }
5433 
5434   LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
5435              << "LV: Vectorization seems to be not beneficial, "
5436              << "but was forced by a user.\n");
5437   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
5438   VectorizationFactor Factor = {ElementCount::getFixed(Width),
5439                                 (unsigned)(Width * Cost)};
5440   return Factor;
5441 }
5442 
5443 std::pair<unsigned, unsigned>
5444 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5445   unsigned MinWidth = -1U;
5446   unsigned MaxWidth = 8;
5447   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5448 
5449   // For each block.
5450   for (BasicBlock *BB : TheLoop->blocks()) {
5451     // For each instruction in the loop.
5452     for (Instruction &I : BB->instructionsWithoutDebug()) {
5453       Type *T = I.getType();
5454 
5455       // Skip ignored values.
5456       if (ValuesToIgnore.count(&I))
5457         continue;
5458 
5459       // Only examine Loads, Stores and PHINodes.
5460       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5461         continue;
5462 
5463       // Examine PHI nodes that are reduction variables. Update the type to
5464       // account for the recurrence type.
5465       if (auto *PN = dyn_cast<PHINode>(&I)) {
5466         if (!Legal->isReductionVariable(PN))
5467           continue;
5468         RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN];
5469         T = RdxDesc.getRecurrenceType();
5470       }
5471 
5472       // Examine the stored values.
5473       if (auto *ST = dyn_cast<StoreInst>(&I))
5474         T = ST->getValueOperand()->getType();
5475 
5476       // Ignore loaded pointer types and stored pointer types that are not
5477       // vectorizable.
5478       //
5479       // FIXME: The check here attempts to predict whether a load or store will
5480       //        be vectorized. We only know this for certain after a VF has
5481       //        been selected. Here, we assume that if an access can be
5482       //        vectorized, it will be. We should also look at extending this
5483       //        optimization to non-pointer types.
5484       //
5485       if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
5486           !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
5487         continue;
5488 
5489       MinWidth = std::min(MinWidth,
5490                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5491       MaxWidth = std::max(MaxWidth,
5492                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5493     }
5494   }
5495 
5496   return {MinWidth, MaxWidth};
5497 }
5498 
5499 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
5500                                                            unsigned LoopCost) {
5501   // -- The interleave heuristics --
5502   // We interleave the loop in order to expose ILP and reduce the loop overhead.
5503   // There are many micro-architectural considerations that we can't predict
5504   // at this level. For example, frontend pressure (on decode or fetch) due to
5505   // code size, or the number and capabilities of the execution ports.
5506   //
5507   // We use the following heuristics to select the interleave count:
5508   // 1. If the code has reductions, then we interleave to break the cross
5509   // iteration dependency.
5510   // 2. If the loop is really small, then we interleave to reduce the loop
5511   // overhead.
5512   // 3. We don't interleave if we think that we will spill registers to memory
5513   // due to the increased register pressure.
5514 
5515   if (!isScalarEpilogueAllowed())
5516     return 1;
5517 
  // The maximum safe dependence distance already constrains us; do not
  // interleave in that case.
5519   if (Legal->getMaxSafeDepDistBytes() != -1U)
5520     return 1;
5521 
5522   // Do not interleave loops with a relatively small known or estimated trip
5523   // count.
5524   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5525   if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold)
5526     return 1;
5527 
5528   RegisterUsage R = calculateRegisterUsage({VF})[0];
  // These values are used as divisors below, so assume that we have at least
  // one instruction that uses at least one register in each class.
5531   for (auto& pair : R.MaxLocalUsers) {
5532     pair.second = std::max(pair.second, 1U);
5533   }
5534 
5535   // We calculate the interleave count using the following formula.
5536   // Subtract the number of loop invariants from the number of available
5537   // registers. These registers are used by all of the interleaved instances.
5538   // Next, divide the remaining registers by the number of registers that is
5539   // required by the loop, in order to estimate how many parallel instances
5540   // fit without causing spills. All of this is rounded down if necessary to be
  // a power of two. We want a power-of-two interleave count to simplify any
5542   // addressing operations or alignment considerations.
5543   // We also want power of two interleave counts to ensure that the induction
5544   // variable of the vector loop wraps to zero, when tail is folded by masking;
5545   // this currently happens when OptForSize, in which case IC is set to 1 above.
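  // For example (illustrative values, ignoring the induction-variable
  // adjustment below), 32 registers in a class with 2 of them holding loop
  // invariants and a maximum local usage of 6 gives
  // PowerOf2Floor((32 - 2) / 6) = 4 interleaved instances.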
5546   unsigned IC = UINT_MAX;
5547 
5548   for (auto& pair : R.MaxLocalUsers) {
5549     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5550     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5551                       << " registers of "
5552                       << TTI.getRegisterClassName(pair.first) << " register class\n");
5553     if (VF == 1) {
5554       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5555         TargetNumRegisters = ForceTargetNumScalarRegs;
5556     } else {
5557       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5558         TargetNumRegisters = ForceTargetNumVectorRegs;
5559     }
5560     unsigned MaxLocalUsers = pair.second;
5561     unsigned LoopInvariantRegs = 0;
5562     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5563       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5564 
    unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) /
                                   MaxLocalUsers);
5566     // Don't count the induction variable as interleaved.
5567     if (EnableIndVarRegisterHeur) {
5568       TmpIC =
5569           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5570                         std::max(1U, (MaxLocalUsers - 1)));
5571     }
5572 
5573     IC = std::min(IC, TmpIC);
5574   }
5575 
5576   // Clamp the interleave ranges to reasonable counts.
5577   assert(!VF.isScalable() && "scalable vectors not yet supported.");
5578   unsigned MaxInterleaveCount =
5579       TTI.getMaxInterleaveFactor(VF.getKnownMinValue());
5580 
5581   // Check if the user has overridden the max.
5582   if (VF == 1) {
5583     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5584       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5585   } else {
5586     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5587       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5588   }
5589 
  // If the trip count is a known or estimated compile-time constant, limit the
  // interleave count to at most the trip count divided by VF.
5592   if (BestKnownTC) {
5593     MaxInterleaveCount =
5594         std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
5595   }
5596 
5597   // If we did not calculate the cost for VF (because the user selected the VF)
5598   // then we calculate the cost of VF here.
5599   if (LoopCost == 0)
5600     LoopCost = expectedCost(VF).first;
5601 
5602   assert(LoopCost && "Non-zero loop cost expected");
5603 
  // Clamp the calculated IC to be between 1 and the max interleave count
  // that the target and trip count allow.
5606   if (IC > MaxInterleaveCount)
5607     IC = MaxInterleaveCount;
5608   else if (IC < 1)
5609     IC = 1;
5610 
5611   // Interleave if we vectorized this loop and there is a reduction that could
5612   // benefit from interleaving.
5613   if (VF.isVector() && !Legal->getReductionVars().empty()) {
5614     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5615     return IC;
5616   }
5617 
5618   // Note that if we've already vectorized the loop we will have done the
5619   // runtime check and so interleaving won't require further checks.
5620   bool InterleavingRequiresRuntimePointerCheck =
5621       (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
5622 
5623   // We want to interleave small loops in order to reduce the loop overhead and
5624   // potentially expose ILP opportunities.
5625   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
5626   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
5627     // We assume that the cost overhead is 1 and we use the cost model
5628     // to estimate the cost of the loop and interleave until the cost of the
5629     // loop overhead is about 5% of the cost of the loop.
5630     unsigned SmallIC =
5631         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
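    // For example (illustrative values), SmallLoopCost = 20 and LoopCost = 3
    // cap SmallIC at PowerOf2Floor(20 / 3) = 4.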
5632 
5633     // Interleave until store/load ports (estimated by max interleave count) are
5634     // saturated.
5635     unsigned NumStores = Legal->getNumStores();
5636     unsigned NumLoads = Legal->getNumLoads();
5637     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5638     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5639 
5640     // If we have a scalar reduction (vector reductions are already dealt with
5641     // by this point), we can increase the critical path length if the loop
    // we're interleaving is inside another loop. Limit, by default, to 2, so
    // the critical path only gets increased by one reduction operation.
5644     if (!Legal->getReductionVars().empty() && TheLoop->getLoopDepth() > 1) {
5645       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5646       SmallIC = std::min(SmallIC, F);
5647       StoresIC = std::min(StoresIC, F);
5648       LoadsIC = std::min(LoadsIC, F);
5649     }
5650 
5651     if (EnableLoadStoreRuntimeInterleave &&
5652         std::max(StoresIC, LoadsIC) > SmallIC) {
5653       LLVM_DEBUG(
5654           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5655       return std::max(StoresIC, LoadsIC);
5656     }
5657 
5658     LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5659     return SmallIC;
5660   }
5661 
5662   // Interleave if this is a large loop (small loops are already dealt with by
5663   // this point) that could benefit from interleaving.
5664   bool HasReductions = !Legal->getReductionVars().empty();
5665   if (TTI.enableAggressiveInterleaving(HasReductions)) {
5666     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5667     return IC;
5668   }
5669 
5670   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5671   return 1;
5672 }
5673 
5674 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5675 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
5676   // This function calculates the register usage by measuring the highest number
5677   // of values that are alive at a single location. Obviously, this is a very
  // rough estimation. We scan the loop in topological order and
5679   // assign a number to each instruction. We use RPO to ensure that defs are
5680   // met before their users. We assume that each instruction that has in-loop
5681   // users starts an interval. We record every time that an in-loop value is
5682   // used, so we have a list of the first and last occurrences of each
5683   // instruction. Next, we transpose this data structure into a multi map that
5684   // holds the list of intervals that *end* at a specific location. This multi
5685   // map allows us to perform a linear search. We scan the instructions linearly
5686   // and record each time that a new interval starts, by placing it in a set.
5687   // If we find this value in the multi-map then we remove it from the set.
5688   // The max register usage is the maximum size of the set.
5689   // We also search for instructions that are defined outside the loop, but are
5690   // used inside the loop. We need this number separately from the max-interval
5691   // usage number because when we unroll, loop-invariant values do not take
  // more registers.
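  // For example (conceptually), if a value is defined at the top of the loop
  // body and last used near the bottom, its interval spans everything in
  // between, and it contributes one live value at every point of that span.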
5693   LoopBlocksDFS DFS(TheLoop);
5694   DFS.perform(LI);
5695 
5696   RegisterUsage RU;
5697 
5698   // Each 'key' in the map opens a new interval. The values
5699   // of the map are the index of the 'last seen' usage of the
5700   // instruction that is the key.
5701   using IntervalMap = DenseMap<Instruction *, unsigned>;
5702 
5703   // Maps instruction to its index.
5704   SmallVector<Instruction *, 64> IdxToInstr;
5705   // Marks the end of each interval.
5706   IntervalMap EndPoint;
5707   // Saves the list of instruction indices that are used in the loop.
5708   SmallPtrSet<Instruction *, 8> Ends;
5709   // Saves the list of values that are used in the loop but are
5710   // defined outside the loop, such as arguments and constants.
5711   SmallPtrSet<Value *, 8> LoopInvariants;
5712 
5713   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5714     for (Instruction &I : BB->instructionsWithoutDebug()) {
5715       IdxToInstr.push_back(&I);
5716 
5717       // Save the end location of each USE.
5718       for (Value *U : I.operands()) {
5719         auto *Instr = dyn_cast<Instruction>(U);
5720 
5721         // Ignore non-instruction values such as arguments, constants, etc.
5722         if (!Instr)
5723           continue;
5724 
5725         // If this instruction is outside the loop then record it and continue.
5726         if (!TheLoop->contains(Instr)) {
5727           LoopInvariants.insert(Instr);
5728           continue;
5729         }
5730 
5731         // Overwrite previous end points.
5732         EndPoint[Instr] = IdxToInstr.size();
5733         Ends.insert(Instr);
5734       }
5735     }
5736   }
5737 
5738   // Saves the list of intervals that end with the index in 'key'.
5739   using InstrList = SmallVector<Instruction *, 2>;
5740   DenseMap<unsigned, InstrList> TransposeEnds;
5741 
5742   // Transpose the EndPoints to a list of values that end at each index.
5743   for (auto &Interval : EndPoint)
5744     TransposeEnds[Interval.second].push_back(Interval.first);
5745 
5746   SmallPtrSet<Instruction *, 8> OpenIntervals;
5747 
5748   // Get the size of the widest register.
5749   unsigned MaxSafeDepDist = -1U;
5750   if (Legal->getMaxSafeDepDistBytes() != -1U)
5751     MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
5752   unsigned WidestRegister =
5753       std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
5754   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5755 
5756   SmallVector<RegisterUsage, 8> RUs(VFs.size());
5757   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
5758 
5759   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5760 
5761   // A lambda that gets the register usage for the given type and VF.
5762   auto GetRegUsage = [&DL, WidestRegister](Type *Ty, ElementCount VF) {
5763     if (Ty->isTokenTy())
5764       return 0U;
5765     unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
5766     assert(!VF.isScalable() && "scalable vectors not yet supported.");
5767     return std::max<unsigned>(1, VF.getKnownMinValue() * TypeSize /
5768                                      WidestRegister);
5769   };
5770 
5771   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5772     Instruction *I = IdxToInstr[i];
5773 
5774     // Remove all of the instructions that end at this location.
5775     InstrList &List = TransposeEnds[i];
5776     for (Instruction *ToRemove : List)
5777       OpenIntervals.erase(ToRemove);
5778 
5779     // Ignore instructions that are never used within the loop.
5780     if (!Ends.count(I))
5781       continue;
5782 
5783     // Skip ignored values.
5784     if (ValuesToIgnore.count(I))
5785       continue;
5786 
5787     // For each VF find the maximum usage of registers.
5788     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5789       // Count the number of live intervals.
5790       SmallMapVector<unsigned, unsigned, 4> RegUsage;
5791 
5792       if (VFs[j].isScalar()) {
5793         for (auto Inst : OpenIntervals) {
5794           unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
5795           if (RegUsage.find(ClassID) == RegUsage.end())
5796             RegUsage[ClassID] = 1;
5797           else
5798             RegUsage[ClassID] += 1;
5799         }
5800       } else {
5801         collectUniformsAndScalars(VFs[j]);
5802         for (auto Inst : OpenIntervals) {
5803           // Skip ignored values for VF > 1.
5804           if (VecValuesToIgnore.count(Inst))
5805             continue;
5806           if (isScalarAfterVectorization(Inst, VFs[j])) {
5807             unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
5808             if (RegUsage.find(ClassID) == RegUsage.end())
5809               RegUsage[ClassID] = 1;
5810             else
5811               RegUsage[ClassID] += 1;
5812           } else {
5813             unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType());
5814             if (RegUsage.find(ClassID) == RegUsage.end())
5815               RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
5816             else
5817               RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
5818           }
5819         }
5820       }
5821 
5822       for (auto& pair : RegUsage) {
5823         if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
5824           MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second);
5825         else
5826           MaxUsages[j][pair.first] = pair.second;
5827       }
5828     }
5829 
5830     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5831                       << OpenIntervals.size() << '\n');
5832 
5833     // Add the current instruction to the list of open intervals.
5834     OpenIntervals.insert(I);
5835   }
5836 
5837   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5838     SmallMapVector<unsigned, unsigned, 4> Invariant;
5839 
5840     for (auto Inst : LoopInvariants) {
5841       unsigned Usage =
5842           VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
5843       unsigned ClassID =
5844           TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
5845       if (Invariant.find(ClassID) == Invariant.end())
5846         Invariant[ClassID] = Usage;
5847       else
5848         Invariant[ClassID] += Usage;
5849     }
5850 
5851     LLVM_DEBUG({
5852       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
5853       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
5854              << " item\n";
5855       for (const auto &pair : MaxUsages[i]) {
5856         dbgs() << "LV(REG): RegisterClass: "
5857                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5858                << " registers\n";
5859       }
5860       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5861              << " item\n";
5862       for (const auto &pair : Invariant) {
5863         dbgs() << "LV(REG): RegisterClass: "
5864                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5865                << " registers\n";
5866       }
5867     });
5868 
5869     RU.LoopInvariantRegs = Invariant;
5870     RU.MaxLocalUsers = MaxUsages[i];
5871     RUs[i] = RU;
5872   }
5873 
5874   return RUs;
5875 }
5876 
bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I) {
5878   // TODO: Cost model for emulated masked load/store is completely
5879   // broken. This hack guides the cost model to use an artificially
5880   // high enough value to practically disable vectorization with such
5881   // operations, except where previously deployed legality hack allowed
5882   // using very low cost values. This is to avoid regressions coming simply
5883   // from moving "masked load/store" check from legality to cost model.
  // Emulation of masked loads/gathers was previously never allowed. Emulation
  // of a limited number of masked stores/scatters was allowed.
5886   assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
5887   return isa<LoadInst>(I) ||
5888          (isa<StoreInst>(I) &&
5889           NumPredStores > NumberOfStoresToPredicate);
5890 }
5891 
5892 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
5893   // If we aren't vectorizing the loop, or if we've already collected the
5894   // instructions to scalarize, there's nothing to do. Collection may already
5895   // have occurred if we have a user-selected VF and are now computing the
5896   // expected cost for interleaving.
5897   if (VF.isScalar() || VF.isZero() ||
5898       InstsToScalarize.find(VF) != InstsToScalarize.end())
5899     return;
5900 
  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5902   // not profitable to scalarize any instructions, the presence of VF in the
5903   // map will indicate that we've analyzed it already.
5904   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5905 
5906   // Find all the instructions that are scalar with predication in the loop and
5907   // determine if it would be better to not if-convert the blocks they are in.
5908   // If so, we also record the instructions to scalarize.
5909   for (BasicBlock *BB : TheLoop->blocks()) {
5910     if (!blockNeedsPredication(BB))
5911       continue;
5912     for (Instruction &I : *BB)
5913       if (isScalarWithPredication(&I)) {
5914         ScalarCostsTy ScalarCosts;
5915         // Do not apply discount logic if hacked cost is needed
5916         // for emulated masked memrefs.
5917         if (!useEmulatedMaskMemRefHack(&I) &&
5918             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5919           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5920         // Remember that BB will remain after vectorization.
5921         PredicatedBBsAfterVectorization.insert(BB);
5922       }
5923   }
5924 }
5925 
5926 int LoopVectorizationCostModel::computePredInstDiscount(
5927     Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
5928     ElementCount VF) {
5929   assert(!isUniformAfterVectorization(PredInst, VF) &&
5930          "Instruction marked uniform-after-vectorization will be predicated");
5931 
5932   // Initialize the discount to zero, meaning that the scalar version and the
5933   // vector version cost the same.
5934   int Discount = 0;
5935 
5936   // Holds instructions to analyze. The instructions we visit are mapped in
5937   // ScalarCosts. Those instructions are the ones that would be scalarized if
5938   // we find that the scalar version costs less.
5939   SmallVector<Instruction *, 8> Worklist;
5940 
5941   // Returns true if the given instruction can be scalarized.
5942   auto canBeScalarized = [&](Instruction *I) -> bool {
5943     // We only attempt to scalarize instructions forming a single-use chain
5944     // from the original predicated block that would otherwise be vectorized.
5945     // Although not strictly necessary, we give up on instructions we know will
5946     // already be scalar to avoid traversing chains that are unlikely to be
5947     // beneficial.
5948     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5949         isScalarAfterVectorization(I, VF))
5950       return false;
5951 
5952     // If the instruction is scalar with predication, it will be analyzed
5953     // separately. We ignore it within the context of PredInst.
5954     if (isScalarWithPredication(I))
5955       return false;
5956 
5957     // If any of the instruction's operands are uniform after vectorization,
5958     // the instruction cannot be scalarized. This prevents, for example, a
5959     // masked load from being scalarized.
5960     //
5961     // We assume we will only emit a value for lane zero of an instruction
5962     // marked uniform after vectorization, rather than VF identical values.
5963     // Thus, if we scalarize an instruction that uses a uniform, we would
5964     // create uses of values corresponding to the lanes we aren't emitting code
5965     // for. This behavior can be changed by allowing getScalarValue to clone
5966     // the lane zero values for uniforms rather than asserting.
5967     for (Use &U : I->operands())
5968       if (auto *J = dyn_cast<Instruction>(U.get()))
5969         if (isUniformAfterVectorization(J, VF))
5970           return false;
5971 
5972     // Otherwise, we can scalarize the instruction.
5973     return true;
5974   };
5975 
5976   // Compute the expected cost discount from scalarizing the entire expression
5977   // feeding the predicated instruction. We currently only consider expressions
5978   // that are single-use instruction chains.
5979   Worklist.push_back(PredInst);
5980   while (!Worklist.empty()) {
5981     Instruction *I = Worklist.pop_back_val();
5982 
5983     // If we've already analyzed the instruction, there's nothing to do.
5984     if (ScalarCosts.find(I) != ScalarCosts.end())
5985       continue;
5986 
5987     // Compute the cost of the vector instruction. Note that this cost already
5988     // includes the scalarization overhead of the predicated instruction.
5989     unsigned VectorCost = getInstructionCost(I, VF).first;
5990 
5991     // Compute the cost of the scalarized instruction. This cost is the cost of
5992     // the instruction as if it wasn't if-converted and instead remained in the
5993     // predicated block. We will scale this cost by block probability after
5994     // computing the scalarization overhead.
5995     assert(!VF.isScalable() && "scalable vectors not yet supported.");
5996     unsigned ScalarCost =
5997         VF.getKnownMinValue() *
5998         getInstructionCost(I, ElementCount::getFixed(1)).first;
5999 
6000     // Compute the scalarization overhead of needed insertelement instructions
6001     // and phi nodes.
6002     if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
6003       ScalarCost += TTI.getScalarizationOverhead(
6004           cast<VectorType>(ToVectorTy(I->getType(), VF)),
6005           APInt::getAllOnesValue(VF.getKnownMinValue()), true, false);
6006       assert(!VF.isScalable() && "scalable vectors not yet supported.");
6007       ScalarCost +=
6008           VF.getKnownMinValue() *
6009           TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
6010     }
6011 
6012     // Compute the scalarization overhead of needed extractelement
6013     // instructions. For each of the instruction's operands, if the operand can
6014     // be scalarized, add it to the worklist; otherwise, account for the
6015     // overhead.
6016     for (Use &U : I->operands())
6017       if (auto *J = dyn_cast<Instruction>(U.get())) {
6018         assert(VectorType::isValidElementType(J->getType()) &&
6019                "Instruction has non-scalar type");
6020         if (canBeScalarized(J))
6021           Worklist.push_back(J);
6022         else if (needsExtract(J, VF)) {
6023           assert(!VF.isScalable() && "scalable vectors not yet supported.");
6024           ScalarCost += TTI.getScalarizationOverhead(
6025               cast<VectorType>(ToVectorTy(J->getType(), VF)),
6026               APInt::getAllOnesValue(VF.getKnownMinValue()), false, true);
6027         }
6028       }
6029 
6030     // Scale the total scalar cost by block probability.
6031     ScalarCost /= getReciprocalPredBlockProb();
6032 
    // Compute the discount. A positive discount means the vector version of
    // the instruction costs more than the scalar version, so scalarizing is
    // beneficial.
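    // For example, if VectorCost is 8 and the probability-scaled ScalarCost
    // is 5, the running discount increases by 3 in favor of scalarization.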
6035     Discount += VectorCost - ScalarCost;
6036     ScalarCosts[I] = ScalarCost;
6037   }
6038 
6039   return Discount;
6040 }
6041 
6042 LoopVectorizationCostModel::VectorizationCostTy
6043 LoopVectorizationCostModel::expectedCost(ElementCount VF) {
6044   assert(!VF.isScalable() && "scalable vectors not yet supported.");
6045   VectorizationCostTy Cost;
6046 
6047   // For each block.
6048   for (BasicBlock *BB : TheLoop->blocks()) {
6049     VectorizationCostTy BlockCost;
6050 
6051     // For each instruction in the old loop.
6052     for (Instruction &I : BB->instructionsWithoutDebug()) {
6053       // Skip ignored values.
6054       if (ValuesToIgnore.count(&I) ||
6055           (VF.isVector() && VecValuesToIgnore.count(&I)))
6056         continue;
6057 
6058       VectorizationCostTy C = getInstructionCost(&I, VF);
6059 
6060       // Check if we should override the cost.
6061       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
6062         C.first = ForceTargetInstructionCost;
6063 
6064       BlockCost.first += C.first;
6065       BlockCost.second |= C.second;
6066       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
6067                         << " for VF " << VF << " For instruction: " << I
6068                         << '\n');
6069     }
6070 
6071     // If we are vectorizing a predicated block, it will have been
6072     // if-converted. This means that the block's instructions (aside from
6073     // stores and instructions that may divide by zero) will now be
6074     // unconditionally executed. For the scalar case, we may not always execute
6075     // the predicated block. Thus, scale the block's cost by the probability of
6076     // executing it.
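    // With the current reciprocal block probability of 2, this halves the
    // block's scalar cost, modeling a block that executes about half the time.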
6077     if (VF.isScalar() && blockNeedsPredication(BB))
6078       BlockCost.first /= getReciprocalPredBlockProb();
6079 
6080     Cost.first += BlockCost.first;
6081     Cost.second |= BlockCost.second;
6082   }
6083 
6084   return Cost;
6085 }
6086 
/// Gets the address access SCEV after verifying that the access pattern is
/// loop invariant except for the induction variable dependence.
6089 ///
6090 /// This SCEV can be sent to the Target in order to estimate the address
6091 /// calculation cost.
6092 static const SCEV *getAddressAccessSCEV(
6093               Value *Ptr,
6094               LoopVectorizationLegality *Legal,
6095               PredicatedScalarEvolution &PSE,
6096               const Loop *TheLoop) {
6097 
6098   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
6099   if (!Gep)
6100     return nullptr;
6101 
6102   // We are looking for a gep with all loop invariant indices except for one
6103   // which should be an induction variable.
6104   auto SE = PSE.getSE();
6105   unsigned NumOperands = Gep->getNumOperands();
6106   for (unsigned i = 1; i < NumOperands; ++i) {
6107     Value *Opd = Gep->getOperand(i);
6108     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
6109         !Legal->isInductionVariable(Opd))
6110       return nullptr;
6111   }
6112 
  // Now we know we have a GEP of the form (ptr, %inv, %ind, %inv). Return the
  // pointer SCEV.
6114   return PSE.getSCEV(Ptr);
6115 }
6116 
6117 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
6118   return Legal->hasStride(I->getOperand(0)) ||
6119          Legal->hasStride(I->getOperand(1));
6120 }
6121 
6122 unsigned
6123 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
6124                                                         ElementCount VF) {
6125   assert(VF.isVector() &&
6126          "Scalarization cost of instruction implies vectorization.");
6127   assert(!VF.isScalable() && "scalable vectors not yet supported.");
6128   Type *ValTy = getMemInstValueType(I);
6129   auto SE = PSE.getSE();
6130 
6131   unsigned AS = getLoadStoreAddressSpace(I);
6132   Value *Ptr = getLoadStorePointerOperand(I);
6133   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
6134 
  // Figure out whether the access is strided and get the stride value, if it
  // is known at compile time.
6137   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
6138 
6139   // Get the cost of the scalar memory instruction and address computation.
6140   unsigned Cost =
6141       VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
6142 
6143   // Don't pass *I here, since it is scalar but will actually be part of a
6144   // vectorized loop where the user of it is a vectorized instruction.
6145   const Align Alignment = getLoadStoreAlignment(I);
6146   Cost += VF.getKnownMinValue() *
6147           TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
6148                               AS, TTI::TCK_RecipThroughput);
6149 
6150   // Get the overhead of the extractelement and insertelement instructions
6151   // we might create due to scalarization.
6152   Cost += getScalarizationOverhead(I, VF);
6153 
6154   // If we have a predicated store, it may not be executed for each vector
6155   // lane. Scale the cost by the probability of executing the predicated
6156   // block.
6157   if (isPredicatedInst(I)) {
6158     Cost /= getReciprocalPredBlockProb();
6159 
6160     if (useEmulatedMaskMemRefHack(I))
      // Artificially set the cost to a value high enough to practically
      // disable vectorization with such operations.
6163       Cost = 3000000;
6164   }
6165 
6166   return Cost;
6167 }
6168 
6169 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
6170                                                              ElementCount VF) {
6171   Type *ValTy = getMemInstValueType(I);
6172   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6173   Value *Ptr = getLoadStorePointerOperand(I);
6174   unsigned AS = getLoadStoreAddressSpace(I);
6175   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
6176   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6177 
6178   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6179          "Stride should be 1 or -1 for consecutive memory access");
6180   const Align Alignment = getLoadStoreAlignment(I);
6181   unsigned Cost = 0;
6182   if (Legal->isMaskRequired(I))
6183     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6184                                       CostKind);
6185   else
6186     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6187                                 CostKind, I);
6188 
6189   bool Reverse = ConsecutiveStride < 0;
6190   if (Reverse)
6191     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
6192   return Cost;
6193 }
6194 
6195 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
6196                                                          ElementCount VF) {
6197   Type *ValTy = getMemInstValueType(I);
6198   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6199   const Align Alignment = getLoadStoreAlignment(I);
6200   unsigned AS = getLoadStoreAddressSpace(I);
6201   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
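  // A uniform load is modeled as a single scalar load plus a broadcast of the
  // loaded value; a uniform store as a single scalar store plus, unless the
  // stored value is loop invariant, an extract of the last vector lane.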
6202   if (isa<LoadInst>(I)) {
6203     return TTI.getAddressComputationCost(ValTy) +
6204            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
6205                                CostKind) +
6206            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
6207   }
6208   StoreInst *SI = cast<StoreInst>(I);
6209 
6210   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
6211   return TTI.getAddressComputationCost(ValTy) +
6212          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
6213                              CostKind) +
6214          (isLoopInvariantStoreValue
6215               ? 0
6216               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
6217                                        VF.getKnownMinValue() - 1));
6218 }
6219 
6220 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6221                                                           ElementCount VF) {
6222   Type *ValTy = getMemInstValueType(I);
6223   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6224   const Align Alignment = getLoadStoreAlignment(I);
6225   const Value *Ptr = getLoadStorePointerOperand(I);
6226 
6227   return TTI.getAddressComputationCost(VectorTy) +
6228          TTI.getGatherScatterOpCost(
6229              I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6230              TargetTransformInfo::TCK_RecipThroughput, I);
6231 }
6232 
6233 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6234                                                             ElementCount VF) {
6235   Type *ValTy = getMemInstValueType(I);
6236   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6237   unsigned AS = getLoadStoreAddressSpace(I);
6238 
6239   auto Group = getInterleavedAccessGroup(I);
6240   assert(Group && "Fail to get an interleaved access group.");
6241 
6242   unsigned InterleaveFactor = Group->getFactor();
6243   assert(!VF.isScalable() && "scalable vectors not yet supported.");
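  // The whole group is costed as a single wide access; e.g., a factor-4 group
  // of i32 members at VF = 4 is costed on a <16 x i32> vector type.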
6244   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
6245 
6246   // Holds the indices of existing members in an interleaved load group.
6247   // An interleaved store group doesn't need this as it doesn't allow gaps.
6248   SmallVector<unsigned, 4> Indices;
6249   if (isa<LoadInst>(I)) {
6250     for (unsigned i = 0; i < InterleaveFactor; i++)
6251       if (Group->getMember(i))
6252         Indices.push_back(i);
6253   }
6254 
6255   // Calculate the cost of the whole interleaved group.
6256   bool UseMaskForGaps =
6257       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
6258   unsigned Cost = TTI.getInterleavedMemoryOpCost(
6259       I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6260       AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
6261 
6262   if (Group->isReverse()) {
6263     // TODO: Add support for reversed masked interleaved access.
6264     assert(!Legal->isMaskRequired(I) &&
6265            "Reverse masked interleaved access not supported.");
6266     Cost += Group->getNumMembers() *
6267             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
6268   }
6269   return Cost;
6270 }
6271 
6272 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
6273                                                               ElementCount VF) {
6274   // Calculate scalar cost only. Vectorization cost should be ready at this
6275   // moment.
6276   if (VF.isScalar()) {
6277     Type *ValTy = getMemInstValueType(I);
6278     const Align Alignment = getLoadStoreAlignment(I);
6279     unsigned AS = getLoadStoreAddressSpace(I);
6280 
6281     return TTI.getAddressComputationCost(ValTy) +
6282            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
6283                                TTI::TCK_RecipThroughput, I);
6284   }
6285   return getWideningCost(I, VF);
6286 }
6287 
6288 LoopVectorizationCostModel::VectorizationCostTy
6289 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6290                                                ElementCount VF) {
6291   assert(!VF.isScalable() &&
6292          "the cost model is not yet implemented for scalable vectorization");
6293   // If we know that this instruction will remain uniform, check the cost of
6294   // the scalar version.
6295   if (isUniformAfterVectorization(I, VF))
6296     VF = ElementCount::getFixed(1);
6297 
6298   if (VF.isVector() && isProfitableToScalarize(I, VF))
6299     return VectorizationCostTy(InstsToScalarize[VF][I], false);
6300 
6301   // Forced scalars do not have any scalarization overhead.
6302   auto ForcedScalar = ForcedScalars.find(VF);
6303   if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6304     auto InstSet = ForcedScalar->second;
6305     if (InstSet.count(I))
6306       return VectorizationCostTy(
6307           (getInstructionCost(I, ElementCount::getFixed(1)).first *
6308            VF.getKnownMinValue()),
6309           false);
6310   }
6311 
6312   Type *VectorTy;
6313   unsigned C = getInstructionCost(I, VF, VectorTy);
6314 
6315   bool TypeNotScalarized =
6316       VF.isVector() && VectorTy->isVectorTy() &&
6317       TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue();
6318   return VectorizationCostTy(C, TypeNotScalarized);
6319 }
6320 
6321 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
6322                                                               ElementCount VF) {
6323 
6324   assert(!VF.isScalable() &&
6325          "cannot compute scalarization overhead for scalable vectorization");
6326   if (VF.isScalar())
6327     return 0;
6328 
6329   unsigned Cost = 0;
6330   Type *RetTy = ToVectorTy(I->getType(), VF);
6331   if (!RetTy->isVoidTy() &&
6332       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
6333     Cost += TTI.getScalarizationOverhead(
6334         cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()),
6335         true, false);
6336 
6337   // Some targets keep addresses scalar.
6338   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
6339     return Cost;
6340 
6341   // Some targets support efficient element stores.
6342   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
6343     return Cost;
6344 
6345   // Collect operands to consider.
6346   CallInst *CI = dyn_cast<CallInst>(I);
6347   Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
6348 
6349   // Skip operands that do not require extraction/scalarization and do not incur
6350   // any overhead.
6351   return Cost + TTI.getOperandsScalarizationOverhead(
6352                     filterExtractingOperands(Ops, VF), VF.getKnownMinValue());
6353 }
6354 
6355 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
6356   assert(!VF.isScalable() && "scalable vectors not yet supported.");
6357   if (VF.isScalar())
6358     return;
6359   NumPredStores = 0;
6360   for (BasicBlock *BB : TheLoop->blocks()) {
6361     // For each instruction in the old loop.
6362     for (Instruction &I : *BB) {
      Value *Ptr = getLoadStorePointerOperand(&I);
6364       if (!Ptr)
6365         continue;
6366 
6367       // TODO: We should generate better code and update the cost model for
6368       // predicated uniform stores. Today they are treated as any other
6369       // predicated store (see added test cases in
6370       // invariant-store-vectorization.ll).
6371       if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
6372         NumPredStores++;
6373 
6374       if (Legal->isUniform(Ptr) &&
6375           // Conditional loads and stores should be scalarized and predicated.
6376           // isScalarWithPredication cannot be used here since masked
6377           // gather/scatters are not considered scalar with predication.
6378           !Legal->blockNeedsPredication(I.getParent())) {
6379         // TODO: Avoid replicating loads and stores instead of
6380         // relying on instcombine to remove them.
6381         // Load: Scalar load + broadcast
6382         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6383         unsigned Cost = getUniformMemOpCost(&I, VF);
6384         setWideningDecision(&I, VF, CM_Scalarize, Cost);
6385         continue;
6386       }
6387 
6388       // We assume that widening is the best solution when possible.
6389       if (memoryInstructionCanBeWidened(&I, VF)) {
6390         unsigned Cost = getConsecutiveMemOpCost(&I, VF);
6391         int ConsecutiveStride =
6392                Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
6393         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6394                "Expected consecutive stride.");
6395         InstWidening Decision =
6396             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6397         setWideningDecision(&I, VF, Decision, Cost);
6398         continue;
6399       }
6400 
6401       // Choose between Interleaving, Gather/Scatter or Scalarization.
6402       unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
6403       unsigned NumAccesses = 1;
6404       if (isAccessInterleaved(&I)) {
6405         auto Group = getInterleavedAccessGroup(&I);
6406         assert(Group && "Fail to get an interleaved access group.");
6407 
6408         // Make one decision for the whole group.
6409         if (getWideningDecision(&I, VF) != CM_Unknown)
6410           continue;
6411 
6412         NumAccesses = Group->getNumMembers();
6413         if (interleavedAccessCanBeWidened(&I, VF))
6414           InterleaveCost = getInterleaveGroupCost(&I, VF);
6415       }
6416 
6417       unsigned GatherScatterCost =
6418           isLegalGatherOrScatter(&I)
6419               ? getGatherScatterCost(&I, VF) * NumAccesses
6420               : std::numeric_limits<unsigned>::max();
6421 
6422       unsigned ScalarizationCost =
6423           getMemInstScalarizationCost(&I, VF) * NumAccesses;
6424 
      // Choose the best solution for the current VF, record this decision,
      // and use it during vectorization.
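      // Ties between interleaving and gather/scatter are resolved in favor of
      // interleaving; ties between gather/scatter and scalarization in favor
      // of scalarization.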
6427       unsigned Cost;
6428       InstWidening Decision;
6429       if (InterleaveCost <= GatherScatterCost &&
6430           InterleaveCost < ScalarizationCost) {
6431         Decision = CM_Interleave;
6432         Cost = InterleaveCost;
6433       } else if (GatherScatterCost < ScalarizationCost) {
6434         Decision = CM_GatherScatter;
6435         Cost = GatherScatterCost;
6436       } else {
6437         Decision = CM_Scalarize;
6438         Cost = ScalarizationCost;
6439       }
      // If the instruction belongs to an interleave group, the whole group
6441       // receives the same decision. The whole group receives the cost, but
6442       // the cost will actually be assigned to one instruction.
6443       if (auto Group = getInterleavedAccessGroup(&I))
6444         setWideningDecision(Group, VF, Decision, Cost);
6445       else
6446         setWideningDecision(&I, VF, Decision, Cost);
6447     }
6448   }
6449 
  // Make sure that any load of an address and any other address computation
6451   // remains scalar unless there is gather/scatter support. This avoids
6452   // inevitable extracts into address registers, and also has the benefit of
6453   // activating LSR more, since that pass can't optimize vectorized
6454   // addresses.
6455   if (TTI.prefersVectorizedAddressing())
6456     return;
6457 
6458   // Start with all scalar pointer uses.
6459   SmallPtrSet<Instruction *, 8> AddrDefs;
6460   for (BasicBlock *BB : TheLoop->blocks())
6461     for (Instruction &I : *BB) {
6462       Instruction *PtrDef =
6463         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6464       if (PtrDef && TheLoop->contains(PtrDef) &&
6465           getWideningDecision(&I, VF) != CM_GatherScatter)
6466         AddrDefs.insert(PtrDef);
6467     }
6468 
6469   // Add all instructions used to generate the addresses.
6470   SmallVector<Instruction *, 4> Worklist;
6471   for (auto *I : AddrDefs)
6472     Worklist.push_back(I);
6473   while (!Worklist.empty()) {
6474     Instruction *I = Worklist.pop_back_val();
6475     for (auto &Op : I->operands())
6476       if (auto *InstOp = dyn_cast<Instruction>(Op))
6477         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6478             AddrDefs.insert(InstOp).second)
6479           Worklist.push_back(InstOp);
6480   }
6481 
6482   for (auto *I : AddrDefs) {
6483     if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled by
      // the cost functions, but since this involves finding out whether the
      // loaded register is involved in an address computation, it is instead
      // changed here once we know this is the case.
6488       InstWidening Decision = getWideningDecision(I, VF);
6489       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6490         // Scalarize a widened load of address.
6491         setWideningDecision(
6492             I, VF, CM_Scalarize,
6493             (VF.getKnownMinValue() *
6494              getMemoryInstructionCost(I, ElementCount::getFixed(1))));
6495       else if (auto Group = getInterleavedAccessGroup(I)) {
6496         // Scalarize an interleave group of address loads.
6497         for (unsigned I = 0; I < Group->getFactor(); ++I) {
6498           if (Instruction *Member = Group->getMember(I))
6499             setWideningDecision(
6500                 Member, VF, CM_Scalarize,
6501                 (VF.getKnownMinValue() *
6502                  getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
6503         }
6504       }
6505     } else
      // Make sure I gets scalarized and is given a cost estimate without
      // scalarization overhead.
6508       ForcedScalars[VF].insert(I);
6509   }
6510 }
6511 
6512 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6513                                                         ElementCount VF,
6514                                                         Type *&VectorTy) {
6515   Type *RetTy = I->getType();
6516   if (canTruncateToMinimalBitwidth(I, VF))
6517     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6518   VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
6519   auto SE = PSE.getSE();
6520   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6521 
6522   // TODO: We need to estimate the cost of intrinsic calls.
6523   switch (I->getOpcode()) {
6524   case Instruction::GetElementPtr:
6525     // We mark this instruction as zero-cost because the cost of GEPs in
6526     // vectorized code depends on whether the corresponding memory instruction
6527     // is scalarized or not. Therefore, we handle GEPs with the memory
6528     // instruction cost.
6529     return 0;
6530   case Instruction::Br: {
    // In cases of scalarized and predicated instructions, there will be VF
    // predicated blocks in the vectorized loop. Each branch around these
    // blocks also requires an extract of its vector compare i1 element.
6534     bool ScalarPredicatedBB = false;
6535     BranchInst *BI = cast<BranchInst>(I);
6536     if (VF.isVector() && BI->isConditional() &&
6537         (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
6538          PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
6539       ScalarPredicatedBB = true;
6540 
6541     if (ScalarPredicatedBB) {
6542       // Return cost for branches around scalarized and predicated blocks.
6543       assert(!VF.isScalable() && "scalable vectors not yet supported.");
6544       auto *Vec_i1Ty =
6545           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6546       return (TTI.getScalarizationOverhead(
6547                   Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()),
6548                   false, true) +
6549               (TTI.getCFInstrCost(Instruction::Br, CostKind) *
6550                VF.getKnownMinValue()));
6551     } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
6552       // The back-edge branch will remain, as will all scalar branches.
6553       return TTI.getCFInstrCost(Instruction::Br, CostKind);
6554     else
6555       // This branch will be eliminated by if-conversion.
6556       return 0;
6557     // Note: We currently assume zero cost for an unconditional branch inside
6558     // a predicated block since it will become a fall-through, although we
6559     // may decide in the future to call TTI for all branches.
6560   }
6561   case Instruction::PHI: {
6562     auto *Phi = cast<PHINode>(I);
6563 
6564     // First-order recurrences are replaced by vector shuffles inside the loop.
6565     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
6566     if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi))
6567       return TTI.getShuffleCost(
6568           TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy),
6569           VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1));
6570 
6571     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6572     // converted into select instructions. We require N - 1 selects per phi
6573     // node, where N is the number of incoming values.
6574     if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
6575       return (Phi->getNumIncomingValues() - 1) *
6576              TTI.getCmpSelInstrCost(
6577                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
6578                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
6579                  CostKind);
6580 
6581     return TTI.getCFInstrCost(Instruction::PHI, CostKind);
6582   }
6583   case Instruction::UDiv:
6584   case Instruction::SDiv:
6585   case Instruction::URem:
6586   case Instruction::SRem:
6587     // If we have a predicated instruction, it may not be executed for each
6588     // vector lane. Get the scalarization cost and scale this amount by the
6589     // probability of executing the predicated block. If the instruction is not
6590     // predicated, we fall through to the next case.
6591     if (VF.isVector() && isScalarWithPredication(I)) {
6592       unsigned Cost = 0;
6593 
6594       // These instructions have a non-void type, so account for the phi nodes
6595       // that we will create. This cost is likely to be zero. The phi node
6596       // cost, if any, should be scaled by the block probability because it
6597       // models a copy at the end of each predicated block.
6598       Cost += VF.getKnownMinValue() *
6599               TTI.getCFInstrCost(Instruction::PHI, CostKind);
6600 
6601       // The cost of the non-predicated instruction.
6602       Cost += VF.getKnownMinValue() *
6603               TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);
6604 
6605       // The cost of insertelement and extractelement instructions needed for
6606       // scalarization.
6607       Cost += getScalarizationOverhead(I, VF);
6608 
6609       // Scale the cost by the probability of executing the predicated blocks.
6610       // This assumes the predicated block for each vector lane is equally
6611       // likely.
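      // For example, with VF = 4 and an assumed block probability of 1/2, the
      // four scalar divides, their phis, and the insert/extract overhead are
      // all charged at half their full cost.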
6612       return Cost / getReciprocalPredBlockProb();
6613     }
6614     LLVM_FALLTHROUGH;
6615   case Instruction::Add:
6616   case Instruction::FAdd:
6617   case Instruction::Sub:
6618   case Instruction::FSub:
6619   case Instruction::Mul:
6620   case Instruction::FMul:
6621   case Instruction::FDiv:
6622   case Instruction::FRem:
6623   case Instruction::Shl:
6624   case Instruction::LShr:
6625   case Instruction::AShr:
6626   case Instruction::And:
6627   case Instruction::Or:
6628   case Instruction::Xor: {
    // Since we will replace the stride by 1, the multiplication should go
    // away.
6630     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
6631       return 0;
    // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this is shifts on x86.
6634     Value *Op2 = I->getOperand(1);
6635     TargetTransformInfo::OperandValueProperties Op2VP;
6636     TargetTransformInfo::OperandValueKind Op2VK =
6637         TTI.getOperandInfo(Op2, Op2VP);
6638     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
6639       Op2VK = TargetTransformInfo::OK_UniformValue;
6640 
6641     SmallVector<const Value *, 4> Operands(I->operand_values());
6642     unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
6643     return N * TTI.getArithmeticInstrCost(
6644                    I->getOpcode(), VectorTy, CostKind,
6645                    TargetTransformInfo::OK_AnyValue,
6646                    Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
6647   }
6648   case Instruction::FNeg: {
6649     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
6650     unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
6651     return N * TTI.getArithmeticInstrCost(
6652                    I->getOpcode(), VectorTy, CostKind,
6653                    TargetTransformInfo::OK_AnyValue,
6654                    TargetTransformInfo::OK_AnyValue,
6655                    TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
6656                    I->getOperand(0), I);
6657   }
6658   case Instruction::Select: {
6659     SelectInst *SI = cast<SelectInst>(I);
6660     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6661     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6662     Type *CondTy = SI->getCondition()->getType();
6663     if (!ScalarCond) {
6664       assert(!VF.isScalable() && "VF is assumed to be non scalable.");
6665       CondTy = VectorType::get(CondTy, VF);
6666     }
6667     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy,
6668                                   CostKind, I);
6669   }
6670   case Instruction::ICmp:
6671   case Instruction::FCmp: {
6672     Type *ValTy = I->getOperand(0)->getType();
6673     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6674     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6675       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
6676     VectorTy = ToVectorTy(ValTy, VF);
6677     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, CostKind,
6678                                   I);
6679   }
6680   case Instruction::Store:
6681   case Instruction::Load: {
6682     ElementCount Width = VF;
6683     if (Width.isVector()) {
6684       InstWidening Decision = getWideningDecision(I, Width);
6685       assert(Decision != CM_Unknown &&
6686              "CM decision should be taken at this point");
6687       if (Decision == CM_Scalarize)
6688         Width = ElementCount::getFixed(1);
6689     }
6690     VectorTy = ToVectorTy(getMemInstValueType(I), Width);
6691     return getMemoryInstructionCost(I, VF);
6692   }
6693   case Instruction::ZExt:
6694   case Instruction::SExt:
6695   case Instruction::FPToUI:
6696   case Instruction::FPToSI:
6697   case Instruction::FPExt:
6698   case Instruction::PtrToInt:
6699   case Instruction::IntToPtr:
6700   case Instruction::SIToFP:
6701   case Instruction::UIToFP:
6702   case Instruction::Trunc:
6703   case Instruction::FPTrunc:
6704   case Instruction::BitCast: {
6705     // Computes the CastContextHint from a Load/Store instruction.
6706     auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
6707       assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
6708              "Expected a load or a store!");
6709 
6710       if (VF.isScalar() || !TheLoop->contains(I))
6711         return TTI::CastContextHint::Normal;
6712 
6713       switch (getWideningDecision(I, VF)) {
6714       case LoopVectorizationCostModel::CM_GatherScatter:
6715         return TTI::CastContextHint::GatherScatter;
6716       case LoopVectorizationCostModel::CM_Interleave:
6717         return TTI::CastContextHint::Interleave;
6718       case LoopVectorizationCostModel::CM_Scalarize:
6719       case LoopVectorizationCostModel::CM_Widen:
6720         return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
6721                                         : TTI::CastContextHint::Normal;
6722       case LoopVectorizationCostModel::CM_Widen_Reverse:
6723         return TTI::CastContextHint::Reversed;
6724       case LoopVectorizationCostModel::CM_Unknown:
6725         llvm_unreachable("Instr did not go through cost modelling?");
6726       }
6727 
6728       llvm_unreachable("Unhandled case!");
6729     };
6730 
6731     unsigned Opcode = I->getOpcode();
6732     TTI::CastContextHint CCH = TTI::CastContextHint::None;
6733     // For Trunc, the context is the only user, which must be a StoreInst.
6734     if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
6735       if (I->hasOneUse())
6736         if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
6737           CCH = ComputeCCH(Store);
6738     }
6739     // For Z/Sext, the context is the operand, which must be a LoadInst.
6740     else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
6741              Opcode == Instruction::FPExt) {
6742       if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
6743         CCH = ComputeCCH(Load);
6744     }
6745 
6746     // We optimize the truncation of induction variables having constant
6747     // integer steps. The cost of these truncations is the same as the scalar
6748     // operation.
6749     if (isOptimizableIVTruncate(I, VF)) {
6750       auto *Trunc = cast<TruncInst>(I);
6751       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6752                                   Trunc->getSrcTy(), CCH, CostKind, Trunc);
6753     }
6754 
6755     Type *SrcScalarTy = I->getOperand(0)->getType();
6756     Type *SrcVecTy =
6757         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6758     if (canTruncateToMinimalBitwidth(I, VF)) {
      // This cast is going to be shrunk. This may remove the cast or turn it
      // into a slightly different cast. For example, if MinBW == 16,
      // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
6762       //
6763       // Calculate the modified src and dest types.
6764       Type *MinVecTy = VectorTy;
6765       if (Opcode == Instruction::Trunc) {
6766         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
6767         VectorTy =
6768             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6769       } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
6770         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
6771         VectorTy =
6772             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6773       }
6774     }
6775 
6776     assert(!VF.isScalable() && "VF is assumed to be non scalable");
6777     unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
6778     return N *
6779            TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
6780   }
6781   case Instruction::Call: {
6782     bool NeedToScalarize;
6783     CallInst *CI = cast<CallInst>(I);
6784     unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
6785     if (getVectorIntrinsicIDForCall(CI, TLI))
6786       return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
6787     return CallCost;
6788   }
6789   default:
6790     // The cost of executing VF copies of the scalar instruction. This opcode
6791     // is unknown. Assume that it is the same as 'mul'.
6792     return VF.getKnownMinValue() * TTI.getArithmeticInstrCost(
6793                                        Instruction::Mul, VectorTy, CostKind) +
6794            getScalarizationOverhead(I, VF);
6795   } // end of switch.
6796 }
6797 
6798 char LoopVectorize::ID = 0;
6799 
6800 static const char lv_name[] = "Loop Vectorization";
6801 
6802 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
6803 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
6804 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
6805 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
6806 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
6807 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
6808 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
6809 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
6810 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
6811 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
6812 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
6813 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
6814 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
6815 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
6816 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
6817 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
6818 
6819 namespace llvm {
6820 
6821 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
6822 
6823 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
6824                               bool VectorizeOnlyWhenForced) {
6825   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
6826 }
6827 
6828 } // end namespace llvm
6829 
6830 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
6831   // Check if the pointer operand of a load or store instruction is
6832   // consecutive.
6833   if (auto *Ptr = getLoadStorePointerOperand(Inst))
6834     return Legal->isConsecutivePtr(Ptr);
6835   return false;
6836 }
6837 
6838 void LoopVectorizationCostModel::collectValuesToIgnore() {
6839   // Ignore ephemeral values.
6840   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
6841 
6842   // Ignore type-promoting instructions we identified during reduction
6843   // detection.
6844   for (auto &Reduction : Legal->getReductionVars()) {
6845     RecurrenceDescriptor &RedDes = Reduction.second;
6846     const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6847     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6848   }
6849   // Ignore type-casting instructions we identified during induction
6850   // detection.
6851   for (auto &Induction : Legal->getInductionVars()) {
6852     InductionDescriptor &IndDes = Induction.second;
6853     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6854     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6855   }
6856 }
6857 
6858 void LoopVectorizationCostModel::collectInLoopReductions() {
6859   // For the moment, without predicated reduction instructions, we do not
6860   // support inloop reductions whilst folding the tail, and hence in those cases
6861   // all reductions are currently out of the loop.
6862   if (!PreferInLoopReductions || foldTailByMasking())
6863     return;
6864 
6865   for (auto &Reduction : Legal->getReductionVars()) {
6866     PHINode *Phi = Reduction.first;
6867     RecurrenceDescriptor &RdxDesc = Reduction.second;
6868 
6869     // We don't collect reductions that are type promoted (yet).
6870     if (RdxDesc.getRecurrenceType() != Phi->getType())
6871       continue;
6872 
6873     // Check that we can correctly put the reductions into the loop, by
6874     // finding the chain of operations that leads from the phi to the loop
6875     // exit value.
6876     SmallVector<Instruction *, 4> ReductionOperations =
6877         RdxDesc.getReductionOpChain(Phi, TheLoop);
6878     bool InLoop = !ReductionOperations.empty();
6879     if (InLoop)
6880       InLoopReductionChains[Phi] = ReductionOperations;
6881     LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
6882                       << " reduction for phi: " << *Phi << "\n");
6883   }
6884 }
6885 
// TODO: we could return a pair of values that specify the max VF and
// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
// `buildVPlans(VF, VF)`. We cannot do that yet because VPlan does not have a
// cost model that can choose which plan to execute if more than one is
// generated.
6891 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
6892                                  LoopVectorizationCostModel &CM) {
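  // For example, 256-bit wide vector registers and a widest scalar type of
  // 32 bits yield a VPlan VF of 8.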
6893   unsigned WidestType;
6894   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
6895   return WidestVectorRegBits / WidestType;
6896 }
6897 
6898 VectorizationFactor
6899 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
6900   assert(!UserVF.isScalable() && "scalable vectors not yet supported");
6901   ElementCount VF = UserVF;
  // Outer loop handling: outer loops may require CFG and instruction-level
  // transformations before we can even evaluate whether vectorization is
  // profitable.
6904   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
6905   // the vectorization pipeline.
6906   if (!OrigLoop->empty()) {
6907     // If the user doesn't provide a vectorization factor, determine a
6908     // reasonable one.
6909     if (UserVF.isZero()) {
6910       VF = ElementCount::getFixed(
6911           determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM));
6912       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
6913 
6914       // Make sure we have a VF > 1 for stress testing.
6915       if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
6916         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
6917                           << "overriding computed VF.\n");
6918         VF = ElementCount::getFixed(4);
6919       }
6920     }
6921     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
6922     assert(isPowerOf2_32(VF.getKnownMinValue()) &&
6923            "VF needs to be a power of two");
6924     LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
6925                       << "VF " << VF << " to build VPlans.\n");
6926     buildVPlans(VF.getKnownMinValue(), VF.getKnownMinValue());
6927 
6928     // For VPlan build stress testing, we bail out after VPlan construction.
6929     if (VPlanBuildStressTest)
6930       return VectorizationFactor::Disabled();
6931 
6932     return {VF, 0 /*Cost*/};
6933   }
6934 
6935   LLVM_DEBUG(
6936       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
6937                 "VPlan-native path.\n");
6938   return VectorizationFactor::Disabled();
6939 }
6940 
6941 Optional<VectorizationFactor>
6942 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
6943   assert(!UserVF.isScalable() && "scalable vectorization not yet handled");
6944   assert(OrigLoop->empty() && "Inner loop expected.");
6945   Optional<unsigned> MaybeMaxVF =
6946       CM.computeMaxVF(UserVF.getKnownMinValue(), UserIC);
  if (!MaybeMaxVF) // Cases that should not be vectorized or interleaved.
6948     return None;
6949 
6950   // Invalidate interleave groups if all blocks of loop will be predicated.
6951   if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
6952       !useMaskedInterleavedAccesses(*TTI)) {
6953     LLVM_DEBUG(
6954         dbgs()
6955         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
6956            "which requires masked-interleaved support.\n");
6957     if (CM.InterleaveInfo.invalidateGroups())
6958       // Invalidating interleave groups also requires invalidating all decisions
6959       // based on them, which includes widening decisions and uniform and scalar
6960       // values.
6961       CM.invalidateCostModelingDecisions();
6962   }
6963 
6964   if (!UserVF.isZero()) {
6965     LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
6966     assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
6967            "VF needs to be a power of two");
6968     // Collect the instructions (and their associated costs) that will be more
6969     // profitable to scalarize.
6970     CM.selectUserVectorizationFactor(UserVF);
6971     CM.collectInLoopReductions();
6972     buildVPlansWithVPRecipes(UserVF.getKnownMinValue(),
6973                              UserVF.getKnownMinValue());
6974     LLVM_DEBUG(printPlans(dbgs()));
6975     return {{UserVF, 0}};
6976   }
6977 
6978   unsigned MaxVF = MaybeMaxVF.getValue();
6979   assert(MaxVF != 0 && "MaxVF is zero.");
6980 
6981   for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
6982     // Collect Uniform and Scalar instructions after vectorization with VF.
6983     CM.collectUniformsAndScalars(ElementCount::getFixed(VF));
6984 
6985     // Collect the instructions (and their associated costs) that will be more
6986     // profitable to scalarize.
6987     if (VF > 1)
6988       CM.collectInstsToScalarize(ElementCount::getFixed(VF));
6989   }
6990 
6991   CM.collectInLoopReductions();
6992 
6993   buildVPlansWithVPRecipes(1, MaxVF);
6994   LLVM_DEBUG(printPlans(dbgs()));
6995   if (MaxVF == 1)
6996     return VectorizationFactor::Disabled();
6997 
6998   // Select the optimal vectorization factor.
6999   return CM.selectVectorizationFactor(MaxVF);
7000 }
7001 
7002 void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) {
7003   LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
7004                     << '\n');
7005   BestVF = VF;
7006   BestUF = UF;
7007 
7008   erase_if(VPlans, [VF](const VPlanPtr &Plan) {
7009     return !Plan->hasVF(VF);
7010   });
  assert(VPlans.size() == 1 && "Best VF does not have a single VPlan.");
7012 }
7013 
7014 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
7015                                            DominatorTree *DT) {
7016   // Perform the actual loop transformation.
7017 
7018   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
7019   VPCallbackILV CallbackILV(ILV);
7020 
7021   assert(BestVF.hasValue() && "Vectorization Factor is missing");
7022 
7023   VPTransformState State{*BestVF, BestUF,      LI,
7024                          DT,      ILV.Builder, ILV.VectorLoopValueMap,
7025                          &ILV,    CallbackILV};
7026   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
7027   State.TripCount = ILV.getOrCreateTripCount(nullptr);
7028   State.CanonicalIV = ILV.Induction;
7029 
7030   //===------------------------------------------------===//
7031   //
  // Notice: any optimization or new instruction that goes
  // into the code below should also be implemented in
  // the cost-model.
7035   //
7036   //===------------------------------------------------===//
7037 
7038   // 2. Copy and widen instructions from the old loop into the new loop.
7039   assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
7040   VPlans.front()->execute(&State);
7041 
7042   // 3. Fix the vectorized code: take care of header phi's, live-outs,
7043   //    predication, updating analyses.
7044   ILV.fixVectorizedLoop();
7045 }
7046 
7047 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
7048     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
7049   BasicBlock *Latch = OrigLoop->getLoopLatch();
7050 
7051   // We create new control-flow for the vectorized loop, so the original
7052   // condition will be dead after vectorization if it's only used by the
7053   // branch.
7054   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
7055   if (Cmp && Cmp->hasOneUse())
7056     DeadInstructions.insert(Cmp);
7057 
7058   // We create new "steps" for induction variable updates to which the original
7059   // induction variables map. An original update instruction will be dead if
7060   // all its users except the induction variable are dead.
7061   for (auto &Induction : Legal->getInductionVars()) {
7062     PHINode *Ind = Induction.first;
7063     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
7064     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
7065           return U == Ind || DeadInstructions.count(cast<Instruction>(U));
7066         }))
7067       DeadInstructions.insert(IndUpdate);
7068 
    // We also record as "Dead" the type-casting instructions we had identified
    // during induction analysis. We don't need any handling for them in the
    // vectorized loop because we have proven that, under a proper runtime test
    // guarding the vectorized loop, the value of the phi and the casted value
    // of the phi are the same. The last instruction in this casting chain will
    // get its scalar/vector/widened def from the scalar/vector/widened def of
    // the respective phi node. Any other casts in the induction def-use chain
    // have no other uses outside the phi update chain, and will be ignored.
7077     InductionDescriptor &IndDes = Induction.second;
7078     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7079     DeadInstructions.insert(Casts.begin(), Casts.end());
7080   }
7081 }
7082 
7083 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
7084 
7085 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
7086 
7087 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
7088                                         Instruction::BinaryOps BinOp) {
7089   // When unrolling and the VF is 1, we only need to add a simple scalar.
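  // For example, for an integer type with StartIdx == 2 and step %s, this
  // computes Val + 2 * %s.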
7090   Type *Ty = Val->getType();
7091   assert(!Ty->isVectorTy() && "Val must be a scalar");
7092 
7093   if (Ty->isFloatingPointTy()) {
7094     Constant *C = ConstantFP::get(Ty, (double)StartIdx);
7095 
7096     // Floating point operations had to be 'fast' to enable the unrolling.
7097     Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
7098     return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
7099   }
7100   Constant *C = ConstantInt::get(Ty, StartIdx);
7101   return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
7102 }
7103 
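// Add "llvm.loop.unroll.runtime.disable" to the loop's metadata unless unroll
// disable metadata is already present. The resulting !llvm.loop node keeps a
// self-reference as operand 0, e.g.:
//   !0 = distinct !{!0, ..., !1}
//   !1 = !{!"llvm.loop.unroll.runtime.disable"}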
7104 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
7105   SmallVector<Metadata *, 4> MDs;
7106   // Reserve first location for self reference to the LoopID metadata node.
7107   MDs.push_back(nullptr);
7108   bool IsUnrollMetadata = false;
7109   MDNode *LoopID = L->getLoopID();
7110   if (LoopID) {
7111     // First find existing loop unrolling disable metadata.
7112     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
7113       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
7114       if (MD) {
7115         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7116         IsUnrollMetadata =
7117             S && S->getString().startswith("llvm.loop.unroll.disable");
7118       }
7119       MDs.push_back(LoopID->getOperand(i));
7120     }
7121   }
7122 
7123   if (!IsUnrollMetadata) {
7124     // Add runtime unroll disable metadata.
7125     LLVMContext &Context = L->getHeader()->getContext();
7126     SmallVector<Metadata *, 1> DisableOperands;
7127     DisableOperands.push_back(
7128         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7129     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7130     MDs.push_back(DisableNode);
7131     MDNode *NewLoopID = MDNode::get(Context, MDs);
7132     // Set operand 0 to refer to the loop id itself.
7133     NewLoopID->replaceOperandWith(0, NewLoopID);
7134     L->setLoopID(NewLoopID);
7135   }
7136 }
7137 
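// Evaluate the predicate on the VFs in Range and clamp Range.End to the first
// VF whose result differs from the result at Range.Start, so that every VF
// remaining in the range shares the same decision. For example, if Range is
// {1, 9} and the predicate holds for VF = 1 and 2 but not for VF = 4, the
// range is clamped to {1, 4} and the result at Range.Start (true) is
// returned.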
7138 bool LoopVectorizationPlanner::getDecisionAndClampRange(
7139     const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
7140   assert(Range.End > Range.Start && "Trying to test an empty VF range.");
7141   bool PredicateAtRangeStart = Predicate(ElementCount::getFixed(Range.Start));
7142 
7143   for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2)
7144     if (Predicate(ElementCount::getFixed(TmpVF)) != PredicateAtRangeStart) {
7145       Range.End = TmpVF;
7146       break;
7147     }
7148 
7149   return PredicateAtRangeStart;
7150 }
7151 
/// Build VPlans for the full range of feasible VFs = {\p MinVF, 2 * \p MinVF,
/// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
/// of VFs starting at a given VF and extending it as much as possible. Each
/// vectorization decision can potentially shorten this sub-range during
/// buildVPlan().
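/// For example, with MinVF = 1 and MaxVF = 8 this may produce two VPlans, one
/// covering VFs {1, 2} and another covering {4, 8}, if some decision taken
/// during buildVPlan() changes at VF = 4.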
7157 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {
7158   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
7159     VFRange SubRange = {VF, MaxVF + 1};
7160     VPlans.push_back(buildVPlan(SubRange));
7161     VF = SubRange.End;
7162   }
7163 }
7164 
7165 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
7166                                          VPlanPtr &Plan) {
7167   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
7168 
7169   // Look for cached value.
7170   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
7171   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
7172   if (ECEntryIt != EdgeMaskCache.end())
7173     return ECEntryIt->second;
7174 
7175   VPValue *SrcMask = createBlockInMask(Src, Plan);
7176 
7177   // The terminator has to be a branch inst!
7178   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
7179   assert(BI && "Unexpected terminator found");
7180 
7181   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
7182     return EdgeMaskCache[Edge] = SrcMask;
7183 
7184   VPValue *EdgeMask = Plan->getVPValue(BI->getCondition());
7185   assert(EdgeMask && "No Edge Mask found for condition");
7186 
7187   if (BI->getSuccessor(0) != Dst)
7188     EdgeMask = Builder.createNot(EdgeMask);
7189 
7190   if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
7191     EdgeMask = Builder.createAnd(EdgeMask, SrcMask);
7192 
7193   return EdgeMaskCache[Edge] = EdgeMask;
7194 }
7195 
7196 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
7197   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
7198 
7199   // Look for cached value.
7200   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
7201   if (BCEntryIt != BlockMaskCache.end())
7202     return BCEntryIt->second;
7203 
7204   // All-one mask is modelled as no-mask following the convention for masked
7205   // load/store/gather/scatter. Initialize BlockMask to no-mask.
7206   VPValue *BlockMask = nullptr;
7207 
7208   if (OrigLoop->getHeader() == BB) {
7209     if (!CM.blockNeedsPredication(BB))
7210       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
7211 
7212     // Introduce the early-exit compare IV <= BTC to form header block mask.
7213     // This is used instead of IV < TC because TC may wrap, unlike BTC.
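    // For example, with a trip count of 10 (BTC = 9), lanes whose IV value
    // exceeds 9 in the final vector iteration are masked out.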
7214     // Start by constructing the desired canonical IV.
7215     VPValue *IV = nullptr;
7216     if (Legal->getPrimaryInduction())
7217       IV = Plan->getVPValue(Legal->getPrimaryInduction());
7218     else {
7219       auto IVRecipe = new VPWidenCanonicalIVRecipe();
7220       Builder.getInsertBlock()->appendRecipe(IVRecipe);
7221       IV = IVRecipe->getVPValue();
7222     }
7223     VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
7224     bool TailFolded = !CM.isScalarEpilogueAllowed();
7225 
7226     if (TailFolded && CM.TTI.emitGetActiveLaneMask()) {
      // While ActiveLaneMask is a binary op that consumes the loop tripcount
      // as a second argument, we only pass the IV here and extract the
      // tripcount from the transform state where codegen of the VP
      // instructions happens.
7231       BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV});
7232     } else {
7233       BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
7234     }
7235     return BlockMaskCache[BB] = BlockMask;
7236   }
7237 
  // This is the block mask. We OR the masks of all incoming edges.
7239   for (auto *Predecessor : predecessors(BB)) {
7240     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
7241     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
7242       return BlockMaskCache[BB] = EdgeMask;
7243 
7244     if (!BlockMask) { // BlockMask has its initialized nullptr value.
7245       BlockMask = EdgeMask;
7246       continue;
7247     }
7248 
7249     BlockMask = Builder.createOr(BlockMask, EdgeMask);
7250   }
7251 
7252   return BlockMaskCache[BB] = BlockMask;
7253 }
7254 
7255 VPWidenMemoryInstructionRecipe *
7256 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
7257                                   VPlanPtr &Plan) {
7258   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7259          "Must be called with either a load or store");
7260 
7261   auto willWiden = [&](ElementCount VF) -> bool {
7262     assert(!VF.isScalable() && "unexpected scalable ElementCount");
7263     if (VF.isScalar())
7264       return false;
7265     LoopVectorizationCostModel::InstWidening Decision =
7266         CM.getWideningDecision(I, VF);
7267     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
7268            "CM decision should be taken at this point.");
7269     if (Decision == LoopVectorizationCostModel::CM_Interleave)
7270       return true;
7271     if (CM.isScalarAfterVectorization(I, VF) ||
7272         CM.isProfitableToScalarize(I, VF))
7273       return false;
7274     return Decision != LoopVectorizationCostModel::CM_Scalarize;
7275   };
7276 
7277   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
7278     return nullptr;
7279 
7280   VPValue *Mask = nullptr;
7281   if (Legal->isMaskRequired(I))
7282     Mask = createBlockInMask(I->getParent(), Plan);
7283 
7284   VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I));
7285   if (LoadInst *Load = dyn_cast<LoadInst>(I))
7286     return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask);
7287 
7288   StoreInst *Store = cast<StoreInst>(I);
7289   VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand());
7290   return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask);
7291 }
7292 
7293 VPWidenIntOrFpInductionRecipe *
7294 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi) const {
7295   // Check if this is an integer or fp induction. If so, build the recipe that
7296   // produces its scalar and vector values.
7297   InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
7298   if (II.getKind() == InductionDescriptor::IK_IntInduction ||
7299       II.getKind() == InductionDescriptor::IK_FpInduction)
7300     return new VPWidenIntOrFpInductionRecipe(Phi);
7301 
7302   return nullptr;
7303 }
7304 
7305 VPWidenIntOrFpInductionRecipe *
7306 VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I,
7307                                                 VFRange &Range) const {
7308   // Optimize the special case where the source is a constant integer
7309   // induction variable. Notice that we can only optimize the 'trunc' case
7310   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
7311   // (c) other casts depend on pointer size.
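  // For instance, a 'trunc i64 %iv to i32' of the canonical induction can be
  // widened directly as an i32 vector induction (an illustrative sketch; the
  // cost model decides per VF via isOptimizableIVTruncate below), instead of
  // widening the i64 induction and truncating every vector.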
7312 
7313   // Determine whether \p K is a truncation based on an induction variable that
7314   // can be optimized.
7315   auto isOptimizableIVTruncate =
7316       [&](Instruction *K) -> std::function<bool(ElementCount)> {
7317     return [=](ElementCount VF) -> bool {
7318       return CM.isOptimizableIVTruncate(K, VF);
7319     };
7320   };
7321 
7322   if (LoopVectorizationPlanner::getDecisionAndClampRange(
7323           isOptimizableIVTruncate(I), Range))
7324     return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
7325                                              I);
7326   return nullptr;
7327 }
7328 
7329 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) {
7330   // We know that all PHIs in non-header blocks are converted into selects, so
7331   // we don't have to worry about the insertion order and we can just use the
7332   // builder. At this point we generate the predication tree. There may be
7333   // duplications since this is a simple recursive scan, but future
7334   // optimizations will clean it up.
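  // The resulting recipe's operands are thus laid out as
  // (In0[, M0], In1[, M1], ...), omitting an edge mask only when it is
  // all-one; e.g. a phi with two predicated incoming values yields the four
  // operands (In0, M0, In1, M1).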
7335 
7336   SmallVector<VPValue *, 2> Operands;
7337   unsigned NumIncoming = Phi->getNumIncomingValues();
7338   for (unsigned In = 0; In < NumIncoming; In++) {
7339     VPValue *EdgeMask =
7340       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
7341     assert((EdgeMask || NumIncoming == 1) &&
7342            "Multiple predecessors with one having a full mask");
7343     Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In)));
7344     if (EdgeMask)
7345       Operands.push_back(EdgeMask);
7346   }
7347   return new VPBlendRecipe(Phi, Operands);
7348 }
7349 
7350 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range,
7351                                                    VPlan &Plan) const {
7352 
7353   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
7354       [this, CI](ElementCount VF) {
7355         return CM.isScalarWithPredication(CI, VF);
7356       },
7357       Range);
7358 
7359   if (IsPredicated)
7360     return nullptr;
7361 
7362   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
7363   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
7364              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
7365     return nullptr;
7366 
7367   auto willWiden = [&](ElementCount VF) -> bool {
7368     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
7369     // The following case may be scalarized depending on the VF.
    // The flag shows whether we use an intrinsic or a usual call for the
    // vectorized version of the instruction.
    // Is it beneficial to perform the intrinsic call compared to the lib call?
7373     bool NeedToScalarize = false;
7374     unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
7375     bool UseVectorIntrinsic =
7376         ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
7377     return UseVectorIntrinsic || !NeedToScalarize;
7378   };
7379 
7380   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
7381     return nullptr;
7382 
7383   return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands()));
7384 }
7385 
7386 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
7387   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
7388          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
  // The instruction should be widened, unless it is scalar after
  // vectorization, scalarization is profitable, or it is predicated.
7391   auto WillScalarize = [this, I](ElementCount VF) -> bool {
7392     return CM.isScalarAfterVectorization(I, VF) ||
7393            CM.isProfitableToScalarize(I, VF) ||
7394            CM.isScalarWithPredication(I, VF);
7395   };
7396   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
7397                                                              Range);
7398 }
7399 
7400 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const {
7401   auto IsVectorizableOpcode = [](unsigned Opcode) {
7402     switch (Opcode) {
7403     case Instruction::Add:
7404     case Instruction::And:
7405     case Instruction::AShr:
7406     case Instruction::BitCast:
7407     case Instruction::FAdd:
7408     case Instruction::FCmp:
7409     case Instruction::FDiv:
7410     case Instruction::FMul:
7411     case Instruction::FNeg:
7412     case Instruction::FPExt:
7413     case Instruction::FPToSI:
7414     case Instruction::FPToUI:
7415     case Instruction::FPTrunc:
7416     case Instruction::FRem:
7417     case Instruction::FSub:
7418     case Instruction::ICmp:
7419     case Instruction::IntToPtr:
7420     case Instruction::LShr:
7421     case Instruction::Mul:
7422     case Instruction::Or:
7423     case Instruction::PtrToInt:
7424     case Instruction::SDiv:
7425     case Instruction::Select:
7426     case Instruction::SExt:
7427     case Instruction::Shl:
7428     case Instruction::SIToFP:
7429     case Instruction::SRem:
7430     case Instruction::Sub:
7431     case Instruction::Trunc:
7432     case Instruction::UDiv:
7433     case Instruction::UIToFP:
7434     case Instruction::URem:
7435     case Instruction::Xor:
7436     case Instruction::ZExt:
7437       return true;
7438     }
7439     return false;
7440   };
7441 
7442   if (!IsVectorizableOpcode(I->getOpcode()))
7443     return nullptr;
7444 
7445   // Success: widen this instruction.
7446   return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands()));
7447 }
7448 
7449 VPBasicBlock *VPRecipeBuilder::handleReplication(
7450     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
7451     DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
7452     VPlanPtr &Plan) {
7453   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
7454       [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
7455       Range);
7456 
7457   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
7458       [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); },
7459       Range);
7460 
7461   auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
7462                                        IsUniform, IsPredicated);
7463   setRecipe(I, Recipe);
7464 
7465   // Find if I uses a predicated instruction. If so, it will use its scalar
7466   // value. Avoid hoisting the insert-element which packs the scalar value into
7467   // a vector value, as that happens iff all users use the vector value.
7468   for (auto &Op : I->operands())
7469     if (auto *PredInst = dyn_cast<Instruction>(Op))
7470       if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
7471         PredInst2Recipe[PredInst]->setAlsoPack(false);
7472 
7473   // Finalize the recipe for Instr, first if it is not predicated.
7474   if (!IsPredicated) {
7475     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
7476     VPBB->appendRecipe(Recipe);
7477     return VPBB;
7478   }
7479   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
7480   assert(VPBB->getSuccessors().empty() &&
7481          "VPBB has successors when handling predicated replication.");
7482   // Record predicated instructions for above packing optimizations.
7483   PredInst2Recipe[I] = Recipe;
7484   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
7485   VPBlockUtils::insertBlockAfter(Region, VPBB);
7486   auto *RegSucc = new VPBasicBlock();
7487   VPBlockUtils::insertBlockAfter(RegSucc, Region);
7488   return RegSucc;
7489 }
7490 
7491 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
7492                                                       VPRecipeBase *PredRecipe,
7493                                                       VPlanPtr &Plan) {
7494   // Instructions marked for predication are replicated and placed under an
7495   // if-then construct to prevent side-effects.
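  // For a predicated load, for example, the region built below has the shape
  //   pred.load.entry -> pred.load.if -> pred.load.continue
  // plus an edge from the entry block directly to the continue block. The
  // entry branches on the block mask, the "if" block holds the replicated
  // instruction, and the "continue" block holds the optional merge phi.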
7496 
7497   // Generate recipes to compute the block mask for this region.
7498   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
7499 
7500   // Build the triangular if-then region.
7501   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
7502   assert(Instr->getParent() && "Predicated instruction not in any basic block");
7503   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
7504   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
7505   auto *PHIRecipe =
7506       Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr);
7507   auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
7508   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
7509   VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
7510 
7511   // Note: first set Entry as region entry and then connect successors starting
7512   // from it in order, to propagate the "parent" of each VPBasicBlock.
7513   VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
7514   VPBlockUtils::connectBlocks(Pred, Exit);
7515 
7516   return Region;
7517 }
7518 
7519 VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
7520                                                       VFRange &Range,
7521                                                       VPlanPtr &Plan) {
7522   // First, check for specific widening recipes that deal with calls, memory
7523   // operations, inductions and Phi nodes.
7524   if (auto *CI = dyn_cast<CallInst>(Instr))
7525     return tryToWidenCall(CI, Range, *Plan);
7526 
7527   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
7528     return tryToWidenMemory(Instr, Range, Plan);
7529 
7530   VPRecipeBase *Recipe;
7531   if (auto Phi = dyn_cast<PHINode>(Instr)) {
7532     if (Phi->getParent() != OrigLoop->getHeader())
7533       return tryToBlend(Phi, Plan);
7534     if ((Recipe = tryToOptimizeInductionPHI(Phi)))
7535       return Recipe;
7536     return new VPWidenPHIRecipe(Phi);
7537   }
7538 
7539   if (isa<TruncInst>(Instr) &&
7540       (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Range)))
7541     return Recipe;
7542 
7543   if (!shouldWiden(Instr, Range))
7544     return nullptr;
7545 
7546   if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
7547     return new VPWidenGEPRecipe(GEP, Plan->mapToVPValues(GEP->operands()),
7548                                 OrigLoop);
7549 
7550   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
7551     bool InvariantCond =
7552         PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
7553     return new VPWidenSelectRecipe(*SI, Plan->mapToVPValues(SI->operands()),
7554                                    InvariantCond);
7555   }
7556 
7557   return tryToWiden(Instr, *Plan);
7558 }
7559 
7560 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
7561                                                         unsigned MaxVF) {
7562   assert(OrigLoop->empty() && "Inner loop expected.");
7563 
7564   // Collect conditions feeding internal conditional branches; they need to be
7565   // represented in VPlan for it to model masking.
7566   SmallPtrSet<Value *, 1> NeedDef;
7567 
7568   auto *Latch = OrigLoop->getLoopLatch();
7569   for (BasicBlock *BB : OrigLoop->blocks()) {
7570     if (BB == Latch)
7571       continue;
7572     BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
7573     if (Branch && Branch->isConditional())
7574       NeedDef.insert(Branch->getCondition());
7575   }
7576 
  // If the tail is to be folded by masking, the primary induction variable, if
  // it exists, needs to be represented in VPlan for it to model early-exit
  // masking. Also, both the Phi and the live-out instruction of each reduction
  // are required in order to introduce a select between them in VPlan.
7581   if (CM.foldTailByMasking()) {
7582     if (Legal->getPrimaryInduction())
7583       NeedDef.insert(Legal->getPrimaryInduction());
7584     for (auto &Reduction : Legal->getReductionVars()) {
7585       NeedDef.insert(Reduction.first);
7586       NeedDef.insert(Reduction.second.getLoopExitInstr());
7587     }
7588   }
7589 
7590   // Collect instructions from the original loop that will become trivially dead
7591   // in the vectorized loop. We don't need to vectorize these instructions. For
7592   // example, original induction update instructions can become dead because we
7593   // separately emit induction "steps" when generating code for the new loop.
7594   // Similarly, we create a new latch condition when setting up the structure
7595   // of the new loop, so the old one can become dead.
7596   SmallPtrSet<Instruction *, 4> DeadInstructions;
7597   collectTriviallyDeadInstructions(DeadInstructions);
7598 
7599   // Add assume instructions we need to drop to DeadInstructions, to prevent
7600   // them from being added to the VPlan.
  // TODO: We only need to drop assumes in blocks that get flattened. If the
7602   // control flow is preserved, we should keep them.
7603   auto &ConditionalAssumes = Legal->getConditionalAssumes();
7604   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
7605 
7606   DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
7607   // Dead instructions do not need sinking. Remove them from SinkAfter.
7608   for (Instruction *I : DeadInstructions)
7609     SinkAfter.erase(I);
7610 
7611   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
7612     VFRange SubRange = {VF, MaxVF + 1};
7613     VPlans.push_back(buildVPlanWithVPRecipes(SubRange, NeedDef,
7614                                              DeadInstructions, SinkAfter));
7615     VF = SubRange.End;
7616   }
7617 }
7618 
7619 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
7620     VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
7621     SmallPtrSetImpl<Instruction *> &DeadInstructions,
7622     const DenseMap<Instruction *, Instruction *> &SinkAfter) {
7623 
7624   // Hold a mapping from predicated instructions to their recipes, in order to
7625   // fix their AlsoPack behavior if a user is determined to replicate and use a
7626   // scalar instead of vector value.
7627   DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
7628 
7629   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
7630 
7631   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
7632 
7633   // ---------------------------------------------------------------------------
7634   // Pre-construction: record ingredients whose recipes we'll need to further
7635   // process after constructing the initial VPlan.
7636   // ---------------------------------------------------------------------------
7637 
7638   // Mark instructions we'll need to sink later and their targets as
7639   // ingredients whose recipe we'll need to record.
7640   for (auto &Entry : SinkAfter) {
7641     RecipeBuilder.recordRecipeOf(Entry.first);
7642     RecipeBuilder.recordRecipeOf(Entry.second);
7643   }
7644   for (auto &Reduction : CM.getInLoopReductionChains()) {
7645     PHINode *Phi = Reduction.first;
7646     RecurrenceDescriptor::RecurrenceKind Kind =
7647         Legal->getReductionVars()[Phi].getRecurrenceKind();
7648     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
7649 
7650     RecipeBuilder.recordRecipeOf(Phi);
7651     for (auto &R : ReductionOperations) {
7652       RecipeBuilder.recordRecipeOf(R);
      // For min/max reductions, where we have a pair of icmp/select, we also
7654       // need to record the ICmp recipe, so it can be removed later.
7655       if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
7656           Kind == RecurrenceDescriptor::RK_FloatMinMax) {
7657         RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
7658       }
7659     }
7660   }
7661 
7662   // For each interleave group which is relevant for this (possibly trimmed)
7663   // Range, add it to the set of groups to be later applied to the VPlan and add
7664   // placeholders for its members' Recipes which we'll be replacing with a
7665   // single VPInterleaveRecipe.
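  // For example (sketching a factor-2 group): loads of A[2*i] and A[2*i+1]
  // are both recorded here so that their individual widening recipes can
  // later be replaced by one VPInterleaveRecipe at the group's insert
  // position.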
7666   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
7667     auto applyIG = [IG, this](ElementCount VF) -> bool {
7668       return (VF.isVector() && // Query is illegal for VF == 1
7669               CM.getWideningDecision(IG->getInsertPos(), VF) ==
7670                   LoopVectorizationCostModel::CM_Interleave);
7671     };
7672     if (!getDecisionAndClampRange(applyIG, Range))
7673       continue;
7674     InterleaveGroups.insert(IG);
7675     for (unsigned i = 0; i < IG->getFactor(); i++)
7676       if (Instruction *Member = IG->getMember(i))
7677         RecipeBuilder.recordRecipeOf(Member);
7678   };
7679 
7680   // ---------------------------------------------------------------------------
7681   // Build initial VPlan: Scan the body of the loop in a topological order to
7682   // visit each basic block after having visited its predecessor basic blocks.
7683   // ---------------------------------------------------------------------------
7684 
7685   // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
7686   auto Plan = std::make_unique<VPlan>();
7687   VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
7688   Plan->setEntry(VPBB);
7689 
7690   // Represent values that will have defs inside VPlan.
7691   for (Value *V : NeedDef)
7692     Plan->addVPValue(V);
7693 
7694   // Scan the body of the loop in a topological order to visit each basic block
7695   // after having visited its predecessor basic blocks.
7696   LoopBlocksDFS DFS(OrigLoop);
7697   DFS.perform(LI);
7698 
7699   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
7700     // Relevant instructions from basic block BB will be grouped into VPRecipe
7701     // ingredients and fill a new VPBasicBlock.
7702     unsigned VPBBsForBB = 0;
7703     auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
7704     VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
7705     VPBB = FirstVPBBForBB;
7706     Builder.setInsertPoint(VPBB);
7707 
7708     // Introduce each ingredient into VPlan.
    // TODO: Model and preserve debug intrinsics in VPlan.
7710     for (Instruction &I : BB->instructionsWithoutDebug()) {
7711       Instruction *Instr = &I;
7712 
7713       // First filter out irrelevant instructions, to ensure no recipes are
7714       // built for them.
7715       if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
7716         continue;
7717 
7718       if (auto Recipe =
7719               RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) {
7720         RecipeBuilder.setRecipe(Instr, Recipe);
7721         VPBB->appendRecipe(Recipe);
7722         continue;
7723       }
7724 
      // Otherwise, if all widening options failed, the instruction is to be
      // replicated. This may create a successor for VPBB.
7727       VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
7728           Instr, Range, VPBB, PredInst2Recipe, Plan);
7729       if (NextVPBB != VPBB) {
7730         VPBB = NextVPBB;
7731         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
7732                                     : "");
7733       }
7734     }
7735   }
7736 
7737   // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
  // may also be empty, such as the last one (VPBB), reflecting original
7739   // basic-blocks with no recipes.
7740   VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
7741   assert(PreEntry->empty() && "Expecting empty pre-entry block.");
7742   VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
7743   VPBlockUtils::disconnectBlocks(PreEntry, Entry);
7744   delete PreEntry;
7745 
7746   // ---------------------------------------------------------------------------
7747   // Transform initial VPlan: Apply previously taken decisions, in order, to
7748   // bring the VPlan to its final state.
7749   // ---------------------------------------------------------------------------
7750 
7751   // Apply Sink-After legal constraints.
7752   for (auto &Entry : SinkAfter) {
7753     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
7754     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
7755     Sink->moveAfter(Target);
7756   }
7757 
7758   // Interleave memory: for each Interleave Group we marked earlier as relevant
7759   // for this VPlan, replace the Recipes widening its memory instructions with a
7760   // single VPInterleaveRecipe at its insertion point.
7761   for (auto IG : InterleaveGroups) {
7762     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
7763         RecipeBuilder.getRecipe(IG->getInsertPos()));
7764     (new VPInterleaveRecipe(IG, Recipe->getAddr(), Recipe->getMask()))
7765         ->insertBefore(Recipe);
7766 
7767     for (unsigned i = 0; i < IG->getFactor(); ++i)
7768       if (Instruction *Member = IG->getMember(i)) {
7769         RecipeBuilder.getRecipe(Member)->eraseFromParent();
7770       }
7771   }
7772 
7773   // Adjust the recipes for any inloop reductions.
7774   if (Range.Start > 1)
7775     adjustRecipesForInLoopReductions(Plan, RecipeBuilder);
7776 
7777   // Finally, if tail is folded by masking, introduce selects between the phi
7778   // and the live-out instruction of each reduction, at the end of the latch.
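  // That is, each reduction's value at the latch becomes, conceptually,
  //   select(header-mask, live-out value, phi)
  // so lanes masked off by tail folding keep the value carried by the phi
  // rather than a partially updated result.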
7779   if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) {
7780     Builder.setInsertPoint(VPBB);
7781     auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
7782     for (auto &Reduction : Legal->getReductionVars()) {
7783       assert(!CM.isInLoopReduction(Reduction.first) &&
7784              "Didn't expect inloop tail folded reduction yet!");
7785       VPValue *Phi = Plan->getVPValue(Reduction.first);
7786       VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr());
7787       Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
7788     }
7789   }
7790 
7791   std::string PlanName;
7792   raw_string_ostream RSO(PlanName);
7793   ElementCount VF = ElementCount::getFixed(Range.Start);
7794   Plan->addVF(VF);
7795   RSO << "Initial VPlan for VF={" << VF;
7796   for (VF *= 2; VF.getKnownMinValue() < Range.End; VF *= 2) {
7797     Plan->addVF(VF);
7798     RSO << "," << VF;
7799   }
7800   RSO << "},UF>=1";
7801   RSO.flush();
7802   Plan->setName(PlanName);
7803 
7804   return Plan;
7805 }
7806 
7807 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
7808   // Outer loop handling: They may require CFG and instruction level
7809   // transformations before even evaluating whether vectorization is profitable.
7810   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7811   // the vectorization pipeline.
7812   assert(!OrigLoop->empty());
7813   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7814 
7815   // Create new empty VPlan
7816   auto Plan = std::make_unique<VPlan>();
7817 
7818   // Build hierarchical CFG
7819   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
7820   HCFGBuilder.buildHierarchicalCFG();
7821 
7822   for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
7823     Plan->addVF(ElementCount::getFixed(VF));
7824 
7825   if (EnableVPlanPredication) {
7826     VPlanPredicator VPP(*Plan);
7827     VPP.predicate();
7828 
7829     // Avoid running transformation to recipes until masked code generation in
7830     // VPlan-native path is in place.
7831     return Plan;
7832   }
7833 
7834   SmallPtrSet<Instruction *, 1> DeadInstructions;
7835   VPlanTransforms::VPInstructionsToVPRecipes(
7836       OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
7837   return Plan;
7838 }
7839 
7840 // Adjust the recipes for any inloop reductions. The chain of instructions
7841 // leading from the loop exit instr to the phi need to be converted to
7842 // reductions, with one operand being vector and the other being the scalar
7843 // reduction chain.
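// For example, for an in-loop integer add reduction 'red += a[i]', the widened
// add recipe is replaced by a VPReductionRecipe that reduces the vector
// operand a[i..i+VF-1] to a scalar and adds it to the incoming scalar chain
// value (see VPReductionRecipe::execute below).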
7844 void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
7845     VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) {
7846   for (auto &Reduction : CM.getInLoopReductionChains()) {
7847     PHINode *Phi = Reduction.first;
7848     RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
7849     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
7850 
    // ReductionOperations are ordered top-down from the phi's use to the
    // LoopExitValue. We keep track of the previous item (the Chain) to tell
    // which of the two operands will remain scalar and which will be reduced.
7854     // For minmax the chain will be the select instructions.
7855     Instruction *Chain = Phi;
7856     for (Instruction *R : ReductionOperations) {
7857       VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
7858       RecurrenceDescriptor::RecurrenceKind Kind = RdxDesc.getRecurrenceKind();
7859 
7860       VPValue *ChainOp = Plan->getVPValue(Chain);
7861       unsigned FirstOpId;
7862       if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
7863           Kind == RecurrenceDescriptor::RK_FloatMinMax) {
7864         assert(WidenRecipe->getVPRecipeID() == VPRecipeBase::VPWidenSelectSC &&
7865                "Expected to replace a VPWidenSelectSC");
7866         FirstOpId = 1;
7867       } else {
7868         assert(WidenRecipe->getVPRecipeID() == VPRecipeBase::VPWidenSC &&
7869                "Expected to replace a VPWidenSC");
7870         FirstOpId = 0;
7871       }
7872       unsigned VecOpId =
7873           R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
7874       VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
7875 
7876       VPReductionRecipe *RedRecipe = new VPReductionRecipe(
7877           &RdxDesc, R, ChainOp, VecOp, Legal->hasFunNoNaNAttr(), TTI);
7878       WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator());
7879       WidenRecipe->eraseFromParent();
7880 
7881       if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
7882           Kind == RecurrenceDescriptor::RK_FloatMinMax) {
7883         VPRecipeBase *CompareRecipe =
7884             RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
7885         assert(CompareRecipe->getVPRecipeID() == VPRecipeBase::VPWidenSC &&
7886                "Expected to replace a VPWidenSC");
7887         CompareRecipe->eraseFromParent();
7888       }
7889       Chain = R;
7890     }
7891   }
7892 }
7893 
Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateVectorValues(
    Value *V, unsigned Part) {
  return ILV.getOrCreateVectorValue(V, Part);
}
7898 
7899 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue(
7900     Value *V, const VPIteration &Instance) {
7901   return ILV.getOrCreateScalarValue(V, Instance);
7902 }
7903 
7904 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
7905                                VPSlotTracker &SlotTracker) const {
7906   O << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
7907   IG->getInsertPos()->printAsOperand(O, false);
7908   O << ", ";
7909   getAddr()->printAsOperand(O, SlotTracker);
7910   VPValue *Mask = getMask();
7911   if (Mask) {
7912     O << ", ";
7913     Mask->printAsOperand(O, SlotTracker);
7914   }
7915   for (unsigned i = 0; i < IG->getFactor(); ++i)
7916     if (Instruction *I = IG->getMember(i))
7917       O << "\\l\" +\n" << Indent << "\"  " << VPlanIngredient(I) << " " << i;
7918 }
7919 
7920 void VPWidenCallRecipe::execute(VPTransformState &State) {
7921   State.ILV->widenCallInstruction(Ingredient, User, State);
7922 }
7923 
7924 void VPWidenSelectRecipe::execute(VPTransformState &State) {
7925   State.ILV->widenSelectInstruction(Ingredient, User, InvariantCond, State);
7926 }
7927 
7928 void VPWidenRecipe::execute(VPTransformState &State) {
7929   State.ILV->widenInstruction(Ingredient, User, State);
7930 }
7931 
7932 void VPWidenGEPRecipe::execute(VPTransformState &State) {
7933   State.ILV->widenGEP(GEP, User, State.UF, State.VF, IsPtrLoopInvariant,
7934                       IsIndexLoopInvariant, State);
7935 }
7936 
7937 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
7938   assert(!State.Instance && "Int or FP induction being replicated.");
7939   State.ILV->widenIntOrFpInduction(IV, Trunc);
7940 }
7941 
7942 void VPWidenPHIRecipe::execute(VPTransformState &State) {
7943   State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
7944 }
7945 
7946 void VPBlendRecipe::execute(VPTransformState &State) {
7947   State.ILV->setDebugLocFromInst(State.Builder, Phi);
7948   // We know that all PHIs in non-header blocks are converted into
7949   // selects, so we don't have to worry about the insertion order and we
7950   // can just use the builder.
7951   // At this point we generate the predication tree. There may be
7952   // duplications since this is a simple recursive scan, but future
7953   // optimizations will clean it up.
7954 
7955   unsigned NumIncoming = getNumIncomingValues();
7956 
7957   // Generate a sequence of selects of the form:
7958   // SELECT(Mask3, In3,
7959   //        SELECT(Mask2, In2,
7960   //               SELECT(Mask1, In1,
7961   //                      In0)))
  // Note that Mask0 is never used: lanes for which no path reaches this phi
  // and which are essentially undef are taken from In0.
7964   InnerLoopVectorizer::VectorParts Entry(State.UF);
7965   for (unsigned In = 0; In < NumIncoming; ++In) {
7966     for (unsigned Part = 0; Part < State.UF; ++Part) {
7967       // We might have single edge PHIs (blocks) - use an identity
7968       // 'select' for the first PHI operand.
7969       Value *In0 = State.get(getIncomingValue(In), Part);
7970       if (In == 0)
7971         Entry[Part] = In0; // Initialize with the first incoming value.
7972       else {
7973         // Select between the current value and the previous incoming edge
7974         // based on the incoming mask.
7975         Value *Cond = State.get(getMask(In), Part);
7976         Entry[Part] =
7977             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
7978       }
7979     }
7980   }
7981   for (unsigned Part = 0; Part < State.UF; ++Part)
7982     State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
7983 }
7984 
7985 void VPInterleaveRecipe::execute(VPTransformState &State) {
7986   assert(!State.Instance && "Interleave group being replicated.");
7987   State.ILV->vectorizeInterleaveGroup(IG, State, getAddr(), getMask());
7988 }
7989 
7990 void VPReductionRecipe::execute(VPTransformState &State) {
7991   assert(!State.Instance && "Reduction being replicated.");
7992   for (unsigned Part = 0; Part < State.UF; ++Part) {
7993     unsigned Kind = RdxDesc->getRecurrenceKind();
7994     Value *NewVecOp = State.get(VecOp, Part);
7995     Value *NewRed =
7996         createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp, NoNaN);
7997     Value *PrevInChain = State.get(ChainOp, Part);
7998     Value *NextInChain;
7999     if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
8000         Kind == RecurrenceDescriptor::RK_FloatMinMax) {
8001       NextInChain =
8002           createMinMaxOp(State.Builder, RdxDesc->getMinMaxRecurrenceKind(),
8003                          NewRed, PrevInChain);
8004     } else {
8005       NextInChain = State.Builder.CreateBinOp(
8006           (Instruction::BinaryOps)I->getOpcode(), NewRed, PrevInChain);
8007     }
8008     State.ValueMap.setVectorValue(I, Part, NextInChain);
8009   }
8010 }
8011 
8012 void VPReplicateRecipe::execute(VPTransformState &State) {
8013   if (State.Instance) { // Generate a single instance.
8014     State.ILV->scalarizeInstruction(Ingredient, User, *State.Instance,
8015                                     IsPredicated, State);
8016     // Insert scalar instance packing it into a vector.
8017     if (AlsoPack && State.VF.isVector()) {
8018       // If we're constructing lane 0, initialize to start from undef.
8019       if (State.Instance->Lane == 0) {
8020         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
8021         Value *Undef =
8022             UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
8023         State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
8024       }
8025       State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
8026     }
8027     return;
8028   }
8029 
  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for
  // each of the UF parts.
8033   unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
8034   for (unsigned Part = 0; Part < State.UF; ++Part)
8035     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
8036       State.ILV->scalarizeInstruction(Ingredient, User, {Part, Lane},
8037                                       IsPredicated, State);
8038 }
8039 
8040 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
8041   assert(State.Instance && "Branch on Mask works only on single instance.");
8042 
8043   unsigned Part = State.Instance->Part;
8044   unsigned Lane = State.Instance->Lane;
8045 
8046   Value *ConditionBit = nullptr;
8047   VPValue *BlockInMask = getMask();
8048   if (BlockInMask) {
8049     ConditionBit = State.get(BlockInMask, Part);
8050     if (ConditionBit->getType()->isVectorTy())
8051       ConditionBit = State.Builder.CreateExtractElement(
8052           ConditionBit, State.Builder.getInt32(Lane));
8053   } else // Block in mask is all-one.
8054     ConditionBit = State.Builder.getTrue();
8055 
8056   // Replace the temporary unreachable terminator with a new conditional branch,
8057   // whose two destinations will be set later when they are created.
8058   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
8059   assert(isa<UnreachableInst>(CurrentTerminator) &&
8060          "Expected to replace unreachable terminator with conditional branch.");
8061   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
8062   CondBr->setSuccessor(0, nullptr);
8063   ReplaceInstWithInst(CurrentTerminator, CondBr);
8064 }
8065 
8066 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
8067   assert(State.Instance && "Predicated instruction PHI works per instance.");
8068   Instruction *ScalarPredInst = cast<Instruction>(
8069       State.ValueMap.getScalarValue(PredInst, *State.Instance));
8070   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
8071   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
8072   assert(PredicatingBB && "Predicated block has no single predecessor.");
8073 
8074   // By current pack/unpack logic we need to generate only a single phi node: if
8075   // a vector value for the predicated instruction exists at this point it means
8076   // the instruction has vector users only, and a phi for the vector value is
8077   // needed. In this case the recipe of the predicated instruction is marked to
8078   // also do that packing, thereby "hoisting" the insert-element sequence.
8079   // Otherwise, a phi node for the scalar value is needed.
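  // For example, if a predicated load was packed into a vector with an
  // insertelement inside the predicated block, the phi created here merges the
  // unmodified incoming vector (from the predicating block) with the vector
  // containing the newly inserted element (from the predicated block).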
8080   unsigned Part = State.Instance->Part;
8081   if (State.ValueMap.hasVectorValue(PredInst, Part)) {
8082     Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
8083     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
8084     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
8085     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
8086     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
8087     State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
8088   } else {
8089     Type *PredInstType = PredInst->getType();
8090     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
8091     Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
8092     Phi->addIncoming(ScalarPredInst, PredicatedBB);
8093     State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
8094   }
8095 }
8096 
8097 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
8098   VPValue *StoredValue = isa<StoreInst>(Instr) ? getStoredValue() : nullptr;
8099   State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), StoredValue,
8100                                         getMask());
8101 }
8102 
8103 // Determine how to lower the scalar epilogue, which depends on 1) optimising
8104 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
8105 // predication, and 4) a TTI hook that analyses whether the loop is suitable
8106 // for predication.
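// For example, compiling the function with -Os selects
// CM_ScalarEpilogueNotAllowedOptSize before any of the predication-related
// options below are consulted; for profile-guided size optimization this also
// requires that vectorization was not explicitly forced.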
8107 static ScalarEpilogueLowering getScalarEpilogueLowering(
8108     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
8109     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
8110     AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
8111     LoopVectorizationLegality &LVL) {
8112   // 1) OptSize takes precedence over all other options, i.e. if this is set,
8113   // don't look at hints or options, and don't request a scalar epilogue.
8114   // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
8115   // LoopAccessInfo (due to code dependency and not being able to reliably get
8116   // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
8117   // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
8118   // versioning when the vectorization is forced, unlike hasOptSize. So revert
8119   // back to the old way and vectorize with versioning when forced. See D81345.)
8120   if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
8121                                                       PGSOQueryType::IRPass) &&
8122                           Hints.getForce() != LoopVectorizeHints::FK_Enabled))
8123     return CM_ScalarEpilogueNotAllowedOptSize;
8124 
8125   bool PredicateOptDisabled = PreferPredicateOverEpilogue.getNumOccurrences() &&
8126                               !PreferPredicateOverEpilogue;
8127 
8128   // 2) Next, if disabling predication is requested on the command line, honour
8129   // this and request a scalar epilogue.
8130   if (PredicateOptDisabled)
8131     return CM_ScalarEpilogueAllowed;
8132 
8133   // 3) and 4) look if enabling predication is requested on the command line,
8134   // with a loop hint, or if the TTI hook indicates this is profitable, request
8135   // predication.
8136   if (PreferPredicateOverEpilogue ||
8137       Hints.getPredicate() == LoopVectorizeHints::FK_Enabled ||
8138       (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
8139                                         LVL.getLAI()) &&
8140        Hints.getPredicate() != LoopVectorizeHints::FK_Disabled))
8141     return CM_ScalarEpilogueNotNeededUsePredicate;
8142 
8143   return CM_ScalarEpilogueAllowed;
8144 }
8145 
8146 // Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows applying
8148 // VPlan-to-VPlan transformations from the very beginning without modifying the
8149 // input LLVM IR.
8150 static bool processLoopInVPlanNativePath(
8151     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
8152     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
8153     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
8154     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
8155     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
8156 
8157   if (PSE.getBackedgeTakenCount() == PSE.getSE()->getCouldNotCompute()) {
8158     LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
8159     return false;
8160   }
8161   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
8162   Function *F = L->getHeader()->getParent();
8163   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
8164 
8165   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
8166       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
8167 
8168   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
8169                                 &Hints, IAI);
8170   // Use the planner for outer loop vectorization.
8171   // TODO: CM is not used at this point inside the planner. Turn CM into an
8172   // optional argument if we don't need it in the future.
8173   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE);
8174 
8175   // Get user vectorization factor.
8176   const unsigned UserVF = Hints.getWidth();
8177 
8178   // Plan how to best vectorize, return the best VF and its cost.
8179   const VectorizationFactor VF =
8180       LVP.planInVPlanNativePath(ElementCount::getFixed(UserVF));
8181 
8182   // If we are stress testing VPlan builds, do not attempt to generate vector
8183   // code. Masked vector code generation support will follow soon.
8184   // Also, do not attempt to vectorize if no vector code will be produced.
8185   if (VPlanBuildStressTest || EnableVPlanPredication ||
8186       VectorizationFactor::Disabled() == VF)
8187     return false;
8188 
8189   LVP.setBestPlan(VF.Width, 1);
8190 
8191   InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
8192                          &CM, BFI, PSI);
8193   LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
8194                     << L->getHeader()->getParent()->getName() << "\"\n");
8195   LVP.executePlan(LB, DT);
8196 
8197   // Mark the loop as already vectorized to avoid vectorizing again.
8198   Hints.setAlreadyVectorized();
8199 
8200   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
8201   return true;
8202 }
8203 
8204 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
8205     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
8206                                !EnableLoopInterleaving),
8207       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
8208                               !EnableLoopVectorization) {}
8209 
8210 bool LoopVectorizePass::processLoop(Loop *L) {
8211   assert((EnableVPlanNativePath || L->empty()) &&
8212          "VPlan-native path is not enabled. Only process inner loops.");
8213 
8214 #ifndef NDEBUG
8215   const std::string DebugLocStr = getDebugLocString(L);
8216 #endif /* NDEBUG */
8217 
8218   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
8219                     << L->getHeader()->getParent()->getName() << "\" from "
8220                     << DebugLocStr << "\n");
8221 
8222   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
8223 
8224   LLVM_DEBUG(
8225       dbgs() << "LV: Loop hints:"
8226              << " force="
8227              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
8228                      ? "disabled"
8229                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
8230                             ? "enabled"
8231                             : "?"))
8232              << " width=" << Hints.getWidth()
8233              << " unroll=" << Hints.getInterleave() << "\n");
8234 
8235   // Function containing loop
8236   Function *F = L->getHeader()->getParent();
8237 
8238   // Looking at the diagnostic output is the only way to determine if a loop
8239   // was vectorized (other than looking at the IR or machine code), so it
8240   // is important to generate an optimization remark for each loop. Most of
8241   // these messages are generated as OptimizationRemarkAnalysis. Remarks
  // generated as OptimizationRemark and OptimizationRemarkMissed are less
  // verbose; they report vectorized loops and unvectorized loops that may
  // benefit from vectorization, respectively.
8245 
8246   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
8247     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
8248     return false;
8249   }
8250 
8251   PredicatedScalarEvolution PSE(*SE, *L);
8252 
8253   // Check if it is legal to vectorize the loop.
8254   LoopVectorizationRequirements Requirements(*ORE);
8255   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
8256                                 &Requirements, &Hints, DB, AC, BFI, PSI);
8257   if (!LVL.canVectorize(EnableVPlanNativePath)) {
8258     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
8259     Hints.emitRemarkWithHints();
8260     return false;
8261   }
8262 
8263   // Check the function attributes and profiles to find out if this function
8264   // should be optimized for size.
8265   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
8266       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
8267 
8268   // Entrance to the VPlan-native vectorization path. Outer loops are processed
8269   // here. They may require CFG and instruction level transformations before
8270   // even evaluating whether vectorization is profitable. Since we cannot modify
8271   // the incoming IR, we need to build VPlan upfront in the vectorization
8272   // pipeline.
8273   if (!L->empty())
8274     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
8275                                         ORE, BFI, PSI, Hints);
8276 
8277   assert(L->empty() && "Inner loop expected.");
8278 
8279   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
8280   // count by optimizing for size, to minimize overheads.
8281   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
8282   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
8283     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
8284                       << "This loop is worth vectorizing only if no scalar "
8285                       << "iteration overheads are incurred.");
8286     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
8287       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
8288     else {
8289       LLVM_DEBUG(dbgs() << "\n");
8290       SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
8291     }
8292   }
8293 
8294   // Check the function attributes to see if implicit floats are allowed.
8295   // FIXME: This check doesn't seem possibly correct -- what if the loop is
8296   // an integer loop and the vector instructions selected are purely integer
8297   // vector instructions?
8298   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
8299     reportVectorizationFailure(
8300         "Can't vectorize when the NoImplicitFloat attribute is used",
8301         "loop not vectorized due to NoImplicitFloat attribute",
8302         "NoImplicitFloat", ORE, L);
8303     Hints.emitRemarkWithHints();
8304     return false;
8305   }
8306 
8307   // Check if the target supports potentially unsafe FP vectorization.
8308   // FIXME: Add a check for the type of safety issue (denormal, signaling)
8309   // for the target we're vectorizing for, to make sure none of the
8310   // additional fp-math flags can help.
8311   if (Hints.isPotentiallyUnsafe() &&
8312       TTI->isFPVectorizationPotentiallyUnsafe()) {
8313     reportVectorizationFailure(
8314         "Potentially unsafe FP op prevents vectorization",
8315         "loop not vectorized due to unsafe FP support.",
8316         "UnsafeFP", ORE, L);
8317     Hints.emitRemarkWithHints();
8318     return false;
8319   }
8320 
8321   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
8322   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
8323 
8324   // If an override option has been passed in for interleaved accesses, use it.
8325   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
8326     UseInterleaved = EnableInterleavedMemAccesses;
8327 
8328   // Analyze interleaved memory accesses.
8329   if (UseInterleaved) {
8330     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
8331   }
8332 
8333   // Use the cost model.
8334   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
8335                                 F, &Hints, IAI);
8336   CM.collectValuesToIgnore();
8337 
8338   // Use the planner for vectorization.
8339   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE);
8340 
8341   // Get user vectorization factor and interleave count.
8342   unsigned UserVF = Hints.getWidth();
8343   unsigned UserIC = Hints.getInterleave();
8344 
8345   // Plan how to best vectorize, return the best VF and its cost.
8346   Optional<VectorizationFactor> MaybeVF =
8347       LVP.plan(ElementCount::getFixed(UserVF), UserIC);
8348 
8349   VectorizationFactor VF = VectorizationFactor::Disabled();
8350   unsigned IC = 1;
8351 
8352   if (MaybeVF) {
8353     VF = *MaybeVF;
8354     // Select the interleave count.
8355     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
8356   }
8357 
8358   // Identify the diagnostic messages that should be produced.
8359   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
8360   bool VectorizeLoop = true, InterleaveLoop = true;
8361   if (Requirements.doesNotMeet(F, L, Hints)) {
8362     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
8363                          "requirements.\n");
8364     Hints.emitRemarkWithHints();
8365     return false;
8366   }
8367 
8368   if (VF.Width == 1) {
8369     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
8370     VecDiagMsg = std::make_pair(
8371         "VectorizationNotBeneficial",
8372         "the cost-model indicates that vectorization is not beneficial");
8373     VectorizeLoop = false;
8374   }
8375 
8376   if (!MaybeVF && UserIC > 1) {
8377     // Tell the user interleaving was avoided up-front, despite being explicitly
8378     // requested.
8379     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
8380                          "interleaving should be avoided up front\n");
8381     IntDiagMsg = std::make_pair(
8382         "InterleavingAvoided",
8383         "Ignoring UserIC, because interleaving was avoided up front");
8384     InterleaveLoop = false;
8385   } else if (IC == 1 && UserIC <= 1) {
8386     // Tell the user interleaving is not beneficial.
8387     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
8388     IntDiagMsg = std::make_pair(
8389         "InterleavingNotBeneficial",
8390         "the cost-model indicates that interleaving is not beneficial");
8391     InterleaveLoop = false;
8392     if (UserIC == 1) {
8393       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
8394       IntDiagMsg.second +=
8395           " and is explicitly disabled or interleave count is set to 1";
8396     }
8397   } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial but is explicitly disabled.
8399     LLVM_DEBUG(
8400         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
8401     IntDiagMsg = std::make_pair(
8402         "InterleavingBeneficialButDisabled",
8403         "the cost-model indicates that interleaving is beneficial "
8404         "but is explicitly disabled or interleave count is set to 1");
8405     InterleaveLoop = false;
8406   }
8407 
8408   // Override IC if user provided an interleave count.
8409   IC = UserIC > 0 ? UserIC : IC;
8410 
8411   // Emit diagnostic messages, if any.
8412   const char *VAPassName = Hints.vectorizeAnalysisPassName();
8413   if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
8415     ORE->emit([&]() {
8416       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
8417                                       L->getStartLoc(), L->getHeader())
8418              << VecDiagMsg.second;
8419     });
8420     ORE->emit([&]() {
8421       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
8422                                       L->getStartLoc(), L->getHeader())
8423              << IntDiagMsg.second;
8424     });
8425     return false;
8426   } else if (!VectorizeLoop && InterleaveLoop) {
8427     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
8428     ORE->emit([&]() {
8429       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
8430                                         L->getStartLoc(), L->getHeader())
8431              << VecDiagMsg.second;
8432     });
8433   } else if (VectorizeLoop && !InterleaveLoop) {
8434     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
8435                       << ") in " << DebugLocStr << '\n');
8436     ORE->emit([&]() {
8437       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
8438                                         L->getStartLoc(), L->getHeader())
8439              << IntDiagMsg.second;
8440     });
8441   } else if (VectorizeLoop && InterleaveLoop) {
8442     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
8443                       << ") in " << DebugLocStr << '\n');
8444     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
8445   }
8446 
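  // Commit to the selected vectorization factor and interleave count; the
  // planner uses them below to pick the VPlan that executePlan will codegen.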
8447   LVP.setBestPlan(VF.Width, IC);
8448 
8449   using namespace ore;
8450   bool DisableRuntimeUnroll = false;
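  // Remember the original loop metadata; the followup metadata attached to the
  // remainder loop below is derived from it.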
8451   MDNode *OrigLoopID = L->getLoopID();
8452 
8453   if (!VectorizeLoop) {
8454     assert(IC > 1 && "interleave count should not be 1 or 0");
    // We decided not to vectorize this loop (the cost model found no
    // profitable vector width), so interleave it instead.
8457     InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM,
8458                                BFI, PSI);
8459     LVP.executePlan(Unroller, DT);
8460 
8461     ORE->emit([&]() {
8462       return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
8463                                 L->getHeader())
8464              << "interleaved loop (interleaved count: "
8465              << NV("InterleaveCount", IC) << ")";
8466     });
8467   } else {
    // We decided that vectorizing this loop is both legal and profitable, so
    // do it.
8469     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
8470                            &LVL, &CM, BFI, PSI);
8471     LVP.executePlan(LB, DT);
8472     ++LoopsVectorized;
8473 
    // Add metadata to disable runtime unrolling of the scalar remainder loop
    // when there are no runtime checks about strides and memory. A scalar loop
    // that is rarely used is not worth unrolling.
8477     if (!LB.areSafetyChecksAdded())
8478       DisableRuntimeUnroll = true;
8479 
8480     // Report the vectorization decision.
8481     ORE->emit([&]() {
8482       return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
8483                                 L->getHeader())
8484              << "vectorized loop (vectorization width: "
8485              << NV("VectorizationFactor", VF.Width)
8486              << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
8487     });
8488   }
8489 
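  // If the original loop carried followup metadata (LLVMLoopVectorizeFollowupAll
  // / LLVMLoopVectorizeFollowupEpilogue, roughly
  // !{!"llvm.loop.vectorize.followup_epilogue", ...} on the !llvm.loop node),
  // attach it to the scalar remainder loop; otherwise fall back to the default
  // metadata updates below.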
8490   Optional<MDNode *> RemainderLoopID =
8491       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
8492                                       LLVMLoopVectorizeFollowupEpilogue});
8493   if (RemainderLoopID.hasValue()) {
8494     L->setLoopID(RemainderLoopID.getValue());
8495   } else {
8496     if (DisableRuntimeUnroll)
8497       AddRuntimeUnrollDisableMetaData(L);
8498 
8499     // Mark the loop as already vectorized to avoid vectorizing again.
8500     Hints.setAlreadyVectorized();
8501   }
8502 
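  // In asserts builds, check that the rewritten function still passes the IR
  // verifier; the whole statement compiles away in release builds.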
8503   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
8504   return true;
8505 }
8506 
8507 LoopVectorizeResult LoopVectorizePass::runImpl(
8508     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
8509     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
8510     DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
8511     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
8512     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
8513   SE = &SE_;
8514   LI = &LI_;
8515   TTI = &TTI_;
8516   DT = &DT_;
8517   BFI = &BFI_;
8518   TLI = TLI_;
8519   AA = &AA_;
8520   AC = &AC_;
8521   GetLAA = &GetLAA_;
8522   DB = &DB_;
8523   ORE = &ORE_;
8524   PSI = PSI_;
8525 
8526   // Don't attempt if
8527   // 1. the target claims to have no vector registers, and
8528   // 2. interleaving won't help ILP.
8529   //
8530   // The second condition is necessary because, even if the target has no
8531   // vector registers, loop vectorization may still enable scalar
8532   // interleaving.
8533   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
8534       TTI->getMaxInterleaveFactor(1) < 2)
8535     return LoopVectorizeResult(false, false);
8536 
8537   bool Changed = false, CFGChanged = false;
8538 
8539   // The vectorizer requires loops to be in simplified form.
8540   // Since simplification may add new inner loops, it has to run before the
8541   // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
8543   // vectorized.
8544   for (auto &L : *LI)
8545     Changed |= CFGChanged |=
8546         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
8547 
8548   // Build up a worklist of inner-loops to vectorize. This is necessary as
8549   // the act of vectorizing or partially unrolling a loop creates new loops
8550   // and can invalidate iterators across the loops.
8551   SmallVector<Loop *, 8> Worklist;
8552 
8553   for (Loop *L : *LI)
8554     collectSupportedLoops(*L, LI, ORE, Worklist);
8555 
8556   LoopsAnalyzed += Worklist.size();
8557 
8558   // Now walk the identified inner loops.
8559   while (!Worklist.empty()) {
8560     Loop *L = Worklist.pop_back_val();
8561 
8562     // For the inner loops we actually process, form LCSSA to simplify the
8563     // transform.
8564     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
8565 
8566     Changed |= CFGChanged |= processLoop(L);
8567   }
8568 
  // Report whether any IR changes were made and whether any affected the CFG.
8570   return LoopVectorizeResult(Changed, CFGChanged);
8571 }
8572 
8573 PreservedAnalyses LoopVectorizePass::run(Function &F,
8574                                          FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
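  // MemorySSA is only fetched when the MSSA-based loop dependency analysis is
  // enabled; it is threaded into the LoopStandardAnalysisResults consumed by
  // LoopAccessAnalysis below.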
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
                        : nullptr;

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
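  // Lazily compute LoopAccessInfo for a given loop through the inner loop
  // analysis manager.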
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
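  // ProfileSummaryInfo may be null: getCachedResult only returns a result the
  // module pass manager has already computed.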
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve the LoopInfo/DominatorTree analyses with
  // outer loop vectorization. Until this is addressed, mark these analyses as
  // preserved only for the non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  PA.preserve<BasicAA>();
  PA.preserve<GlobalsAA>();
  if (!Result.MadeCFGChange)
    PA.preserveSet<CFGAnalyses>();
  return PA;
8617 }
8618