1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
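//
// For illustration only (simplified; the actual transformation is performed
// on LLVM IR and also handles remainders), a scalar loop such as
//
//   for (i = 0; i < n; i += 1)
//     a[i] = b[i] + c[i];
//
// is conceptually rewritten, for a vector width of 4, into
//
//   for (i = 0; i + 3 < n; i += 4)
//     a[i:i+3] = b[i:i+3] + c[i:i+3];   // one 'wide' SIMD iteration
//
// with any remaining iterations handled by a scalar epilogue loop.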
17 //
// This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
// There is a development effort going on to migrate the loop vectorizer to
// the VPlan infrastructure and to introduce outer loop vectorization support
// (see docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
//  Data for SIMD.
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanPredicator.h"
62 #include "VPlanTransforms.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SmallPtrSet.h"
73 #include "llvm/ADT/SmallVector.h"
74 #include "llvm/ADT/Statistic.h"
75 #include "llvm/ADT/StringRef.h"
76 #include "llvm/ADT/Twine.h"
77 #include "llvm/ADT/iterator_range.h"
78 #include "llvm/Analysis/AssumptionCache.h"
79 #include "llvm/Analysis/BasicAliasAnalysis.h"
80 #include "llvm/Analysis/BlockFrequencyInfo.h"
81 #include "llvm/Analysis/CFG.h"
82 #include "llvm/Analysis/CodeMetrics.h"
83 #include "llvm/Analysis/DemandedBits.h"
84 #include "llvm/Analysis/GlobalsModRef.h"
85 #include "llvm/Analysis/LoopAccessAnalysis.h"
86 #include "llvm/Analysis/LoopAnalysisManager.h"
87 #include "llvm/Analysis/LoopInfo.h"
88 #include "llvm/Analysis/LoopIterator.h"
89 #include "llvm/Analysis/MemorySSA.h"
90 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
91 #include "llvm/Analysis/ProfileSummaryInfo.h"
92 #include "llvm/Analysis/ScalarEvolution.h"
93 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
94 #include "llvm/Analysis/TargetLibraryInfo.h"
95 #include "llvm/Analysis/TargetTransformInfo.h"
96 #include "llvm/Analysis/VectorUtils.h"
97 #include "llvm/IR/Attributes.h"
98 #include "llvm/IR/BasicBlock.h"
99 #include "llvm/IR/CFG.h"
100 #include "llvm/IR/Constant.h"
101 #include "llvm/IR/Constants.h"
102 #include "llvm/IR/DataLayout.h"
103 #include "llvm/IR/DebugInfoMetadata.h"
104 #include "llvm/IR/DebugLoc.h"
105 #include "llvm/IR/DerivedTypes.h"
106 #include "llvm/IR/DiagnosticInfo.h"
107 #include "llvm/IR/Dominators.h"
108 #include "llvm/IR/Function.h"
109 #include "llvm/IR/IRBuilder.h"
110 #include "llvm/IR/InstrTypes.h"
111 #include "llvm/IR/Instruction.h"
112 #include "llvm/IR/Instructions.h"
113 #include "llvm/IR/IntrinsicInst.h"
114 #include "llvm/IR/Intrinsics.h"
115 #include "llvm/IR/LLVMContext.h"
116 #include "llvm/IR/Metadata.h"
117 #include "llvm/IR/Module.h"
118 #include "llvm/IR/Operator.h"
119 #include "llvm/IR/Type.h"
120 #include "llvm/IR/Use.h"
121 #include "llvm/IR/User.h"
122 #include "llvm/IR/Value.h"
123 #include "llvm/IR/ValueHandle.h"
124 #include "llvm/IR/Verifier.h"
125 #include "llvm/InitializePasses.h"
126 #include "llvm/Pass.h"
127 #include "llvm/Support/Casting.h"
128 #include "llvm/Support/CommandLine.h"
129 #include "llvm/Support/Compiler.h"
130 #include "llvm/Support/Debug.h"
131 #include "llvm/Support/ErrorHandling.h"
132 #include "llvm/Support/InstructionCost.h"
133 #include "llvm/Support/MathExtras.h"
134 #include "llvm/Support/raw_ostream.h"
135 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
136 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
137 #include "llvm/Transforms/Utils/LoopSimplify.h"
138 #include "llvm/Transforms/Utils/LoopUtils.h"
139 #include "llvm/Transforms/Utils/LoopVersioning.h"
140 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
141 #include "llvm/Transforms/Utils/SizeOpts.h"
142 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
143 #include <algorithm>
144 #include <cassert>
145 #include <cstdint>
146 #include <cstdlib>
147 #include <functional>
148 #include <iterator>
149 #include <limits>
150 #include <memory>
151 #include <string>
152 #include <tuple>
153 #include <utility>
154 
155 using namespace llvm;
156 
157 #define LV_NAME "loop-vectorize"
158 #define DEBUG_TYPE LV_NAME
159 
160 #ifndef NDEBUG
161 const char VerboseDebug[] = DEBUG_TYPE "-verbose";
162 #endif
163 
164 /// @{
165 /// Metadata attribute names
166 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
167 const char LLVMLoopVectorizeFollowupVectorized[] =
168     "llvm.loop.vectorize.followup_vectorized";
169 const char LLVMLoopVectorizeFollowupEpilogue[] =
170     "llvm.loop.vectorize.followup_epilogue";
171 /// @}
172 
173 STATISTIC(LoopsVectorized, "Number of loops vectorized");
174 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
175 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
176 
177 static cl::opt<bool> EnableEpilogueVectorization(
178     "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
179     cl::desc("Enable vectorization of epilogue loops."));
180 
181 static cl::opt<unsigned> EpilogueVectorizationForceVF(
182     "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
183     cl::desc("When epilogue vectorization is enabled, and a value greater than "
184              "1 is specified, forces the given VF for all applicable epilogue "
185              "loops."));
186 
187 static cl::opt<unsigned> EpilogueVectorizationMinVF(
188     "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
189     cl::desc("Only loops with vectorization factor equal to or larger than "
190              "the specified value are considered for epilogue vectorization."));
191 
192 /// Loops with a known constant trip count below this number are vectorized only
193 /// if no scalar iteration overheads are incurred.
194 static cl::opt<unsigned> TinyTripCountVectorThreshold(
195     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
196     cl::desc("Loops with a constant trip count that is smaller than this "
197              "value are vectorized only if no scalar iteration overheads "
198              "are incurred."));
199 
// Option prefer-predicate-over-epilogue indicates that an epilogue is
// undesired and that predication is preferred; the enum below lists the
// available choices. That is, the vectorizer will try to fold the tail loop
// (epilogue) into the vector body and predicate the instructions accordingly.
// If tail-folding fails, the fallback strategy depends on these values:
205 namespace PreferPredicateTy {
206   enum Option {
207     ScalarEpilogue = 0,
208     PredicateElseScalarEpilogue,
209     PredicateOrDontVectorize
210   };
211 } // namespace PreferPredicateTy
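//
// For illustration (simplified): with a scalar epilogue, the vector body
// executes floor(n / VF) wide iterations and a scalar loop handles the
// remaining n % VF iterations. With tail-folding, the vector body executes
// ceil(n / VF) wide iterations and each memory operation is predicated with a
// lane mask conceptually equal to
//
//   mask = <i, i+1, ..., i+VF-1> ult <n, n, ..., n>
//
// so that lanes past the trip count are disabled instead of being peeled off
// into a scalar epilogue.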
212 
213 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
214     "prefer-predicate-over-epilogue",
215     cl::init(PreferPredicateTy::ScalarEpilogue),
216     cl::Hidden,
217     cl::desc("Tail-folding and predication preferences over creating a scalar "
218              "epilogue loop."),
219     cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
220                          "scalar-epilogue",
221                          "Don't tail-predicate loops, create scalar epilogue"),
222               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
223                          "predicate-else-scalar-epilogue",
                         "Prefer tail-folding; create a scalar epilogue if "
                         "tail-folding fails."),
226               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
227                          "predicate-dont-vectorize",
                         "Prefer tail-folding; don't attempt vectorization if "
                         "tail-folding fails.")));
230 
231 static cl::opt<bool> MaximizeBandwidth(
232     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
233     cl::desc("Maximize bandwidth when selecting vectorization factor which "
234              "will be determined by the smallest type in loop."));
235 
236 static cl::opt<bool> EnableInterleavedMemAccesses(
237     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
238     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
239 
240 /// An interleave-group may need masking if it resides in a block that needs
241 /// predication, or in order to mask away gaps.
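///
/// For example (illustrative), a loop reading both fields of a pair,
///
///   for (i = 0; i < n; ++i) {
///     x = p[2*i];     // even elements
///     y = p[2*i + 1]; // odd elements
///   }
///
/// forms an interleave group with factor 2 that can be vectorized as one wide
/// load followed by shuffles that de-interleave the even and odd lanes. If
/// only p[2*i] were accessed, the group would contain a gap and a mask would
/// be needed to avoid touching the unused odd lanes.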
242 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
243     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
244     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
245 
246 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
247     "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
249              "below this number"));
250 
251 static cl::opt<unsigned> ForceTargetNumScalarRegs(
252     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
253     cl::desc("A flag that overrides the target's number of scalar registers."));
254 
255 static cl::opt<unsigned> ForceTargetNumVectorRegs(
256     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
257     cl::desc("A flag that overrides the target's number of vector registers."));
258 
259 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
260     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
261     cl::desc("A flag that overrides the target's max interleave factor for "
262              "scalar loops."));
263 
264 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
265     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
266     cl::desc("A flag that overrides the target's max interleave factor for "
267              "vectorized loops."));
268 
269 static cl::opt<unsigned> ForceTargetInstructionCost(
270     "force-target-instruction-cost", cl::init(0), cl::Hidden,
271     cl::desc("A flag that overrides the target's expected cost for "
272              "an instruction to a single constant value. Mostly "
273              "useful for getting consistent testing."));
274 
275 static cl::opt<bool> ForceTargetSupportsScalableVectors(
276     "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
277     cl::desc(
278         "Pretend that scalable vectors are supported, even if the target does "
279         "not support them. This flag should only be used for testing."));
280 
281 static cl::opt<unsigned> SmallLoopCost(
282     "small-loop-cost", cl::init(20), cl::Hidden,
283     cl::desc(
284         "The cost of a loop that is considered 'small' by the interleaver."));
285 
286 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
287     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
288     cl::desc("Enable the use of the block frequency analysis to access PGO "
289              "heuristics minimizing code growth in cold regions and being more "
290              "aggressive in hot regions."));
291 
292 // Runtime interleave loops for load/store throughput.
293 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
294     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
295     cl::desc(
296         "Enable runtime interleaving until load/store ports are saturated"));
297 
298 /// Interleave small loops with scalar reductions.
299 static cl::opt<bool> InterleaveSmallLoopScalarReduction(
300     "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
301     cl::desc("Enable interleaving for loops with small iteration counts that "
302              "contain scalar reductions to expose ILP."));
303 
304 /// The number of stores in a loop that are allowed to need predication.
305 static cl::opt<unsigned> NumberOfStoresToPredicate(
306     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
307     cl::desc("Max number of stores to be predicated behind an if."));
308 
309 static cl::opt<bool> EnableIndVarRegisterHeur(
310     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
311     cl::desc("Count the induction variable only once when interleaving"));
312 
313 static cl::opt<bool> EnableCondStoresVectorization(
314     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
315     cl::desc("Enable if predication of stores during vectorization."));
316 
317 static cl::opt<unsigned> MaxNestedScalarReductionIC(
318     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
319     cl::desc("The maximum interleave count to use when interleaving a scalar "
320              "reduction in a nested loop."));
321 
322 static cl::opt<bool>
323     PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
324                            cl::Hidden,
325                            cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));
327 
328 static cl::opt<bool> PreferPredicatedReductionSelect(
329     "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
330     cl::desc(
331         "Prefer predicating a reduction operation over an after loop select."));
332 
333 cl::opt<bool> EnableVPlanNativePath(
334     "enable-vplan-native-path", cl::init(false), cl::Hidden,
335     cl::desc("Enable VPlan-native vectorization path with "
336              "support for outer loop vectorization."));
337 
338 // FIXME: Remove this switch once we have divergence analysis. Currently we
339 // assume divergent non-backedge branches when this switch is true.
340 cl::opt<bool> EnableVPlanPredication(
341     "enable-vplan-predication", cl::init(false), cl::Hidden,
342     cl::desc("Enable VPlan-native vectorization path predicator with "
343              "support for outer loop vectorization."));
344 
345 // This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
347 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
348 // verification of the H-CFGs built.
349 static cl::opt<bool> VPlanBuildStressTest(
350     "vplan-build-stress-test", cl::init(false), cl::Hidden,
351     cl::desc(
352         "Build VPlan for every supported loop nest in the function and bail "
353         "out right after the build (stress test the VPlan H-CFG construction "
354         "in the VPlan-native vectorization path)."));
355 
356 cl::opt<bool> llvm::EnableLoopInterleaving(
357     "interleave-loops", cl::init(true), cl::Hidden,
358     cl::desc("Enable loop interleaving in Loop vectorization passes"));
359 cl::opt<bool> llvm::EnableLoopVectorization(
360     "vectorize-loops", cl::init(true), cl::Hidden,
361     cl::desc("Run the Loop vectorization passes"));
362 
/// A helper function that returns the type of the loaded or stored value.
364 static Type *getMemInstValueType(Value *I) {
365   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
366          "Expected Load or Store instruction");
367   if (auto *LI = dyn_cast<LoadInst>(I))
368     return LI->getType();
369   return cast<StoreInst>(I)->getValueOperand()->getType();
370 }
371 
372 /// A helper function that returns true if the given type is irregular. The
373 /// type is irregular if its allocated size doesn't equal the store size of an
374 /// element of the corresponding vector type at the given vectorization factor.
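/// For example (illustrative, assuming a typical data layout): with VF = 4,
/// i1 is irregular because four i1 elements occupy 4 bytes as an array (one
/// byte of allocated storage each) but only 1 byte as the packed vector
/// <4 x i1>, whereas i32 is regular since [4 x i32] and <4 x i32> both occupy
/// 16 bytes.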
375 static bool hasIrregularType(Type *Ty, const DataLayout &DL, ElementCount VF) {
376   // Determine if an array of VF elements of type Ty is "bitcast compatible"
377   // with a <VF x Ty> vector.
378   if (VF.isVector()) {
379     auto *VectorTy = VectorType::get(Ty, VF);
380     return TypeSize::get(VF.getKnownMinValue() *
381                              DL.getTypeAllocSize(Ty).getFixedValue(),
382                          VF.isScalable()) != DL.getTypeStoreSize(VectorTy);
383   }
384 
385   // If the vectorization factor is one, we just check if an array of type Ty
386   // requires padding between elements.
387   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
388 }
389 
390 /// A helper function that returns the reciprocal of the block probability of
391 /// predicated blocks. If we return X, we are assuming the predicated block
392 /// will execute once for every X iterations of the loop header.
393 ///
394 /// TODO: We should use actual block probability here, if available. Currently,
395 ///       we always assume predicated blocks have a 50% chance of executing.
396 static unsigned getReciprocalPredBlockProb() { return 2; }
397 
398 /// A helper function that adds a 'fast' flag to floating-point operations.
399 static Value *addFastMathFlag(Value *V) {
400   if (isa<FPMathOperator>(V))
401     cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
402   return V;
403 }
404 
405 /// A helper function that returns an integer or floating-point constant with
406 /// value C.
407 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
408   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
409                            : ConstantFP::get(Ty, C);
410 }
411 
412 /// Returns "best known" trip count for the specified loop \p L as defined by
413 /// the following procedure:
414 ///   1) Returns exact trip count if it is known.
415 ///   2) Returns expected trip count according to profile data if any.
416 ///   3) Returns upper bound estimate if it is known.
417 ///   4) Returns None if all of the above failed.
418 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
419   // Check if exact trip count is known.
420   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
421     return ExpectedTC;
422 
423   // Check if there is an expected trip count available from profile data.
424   if (LoopVectorizeWithBlockFrequency)
425     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
426       return EstimatedTC;
427 
428   // Check if upper bound estimate is known.
429   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
430     return ExpectedTC;
431 
432   return None;
433 }
434 
435 // Forward declare GeneratedRTChecks.
436 class GeneratedRTChecks;
437 
438 namespace llvm {
439 
440 /// InnerLoopVectorizer vectorizes loops which contain only one basic
441 /// block to a specified vectorization factor (VF).
442 /// This class performs the widening of scalars into vectors, or multiple
443 /// scalars. This class also implements the following features:
444 /// * It inserts an epilogue loop for handling loops that don't have iteration
445 ///   counts that are known to be a multiple of the vectorization factor.
446 /// * It handles the code generation for reduction variables.
447 /// * Scalarization (implementation using scalars) of un-vectorizable
448 ///   instructions.
449 /// InnerLoopVectorizer does not perform any vectorization-legality
450 /// checks, and relies on the caller to check for the different legality
451 /// aspects. The InnerLoopVectorizer relies on the
452 /// LoopVectorizationLegality class to provide information about the induction
453 /// and reduction variables that were found to a given vectorization factor.
454 class InnerLoopVectorizer {
455 public:
456   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
457                       LoopInfo *LI, DominatorTree *DT,
458                       const TargetLibraryInfo *TLI,
459                       const TargetTransformInfo *TTI, AssumptionCache *AC,
460                       OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
461                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
462                       LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
463                       ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
464       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
465         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
466         Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
467         PSI(PSI), RTChecks(RTChecks) {
468     // Query this against the original loop and save it here because the profile
469     // of the original loop header may change as the transformation happens.
470     OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
471         OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
472   }
473 
474   virtual ~InnerLoopVectorizer() = default;
475 
476   /// Create a new empty loop that will contain vectorized instructions later
477   /// on, while the old loop will be used as the scalar remainder. Control flow
478   /// is generated around the vectorized (and scalar epilogue) loops consisting
479   /// of various checks and bypasses. Return the pre-header block of the new
480   /// loop.
  /// In the case of epilogue vectorization, this function is overridden to
482   /// handle the more complex control flow around the loops.
483   virtual BasicBlock *createVectorizedLoopSkeleton();
484 
485   /// Widen a single instruction within the innermost loop.
486   void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
487                         VPTransformState &State);
488 
489   /// Widen a single call instruction within the innermost loop.
490   void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
491                             VPTransformState &State);
492 
493   /// Widen a single select instruction within the innermost loop.
494   void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands,
495                               bool InvariantCond, VPTransformState &State);
496 
  /// Fix the vectorized code: handle header phis, live-outs, and more.
498   void fixVectorizedLoop(VPTransformState &State);
499 
500   // Return true if any runtime check is added.
501   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
502 
503   /// A type for vectorized values in the new loop. Each value from the
504   /// original loop, when vectorized, is represented by UF vector values in the
505   /// new unrolled loop, where UF is the unroll factor.
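  /// For example (illustrative): with UF = 2 and VF = 4, an i32 value from the
  /// original loop is represented by two <4 x i32> values, one per unrolled
  /// part.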
506   using VectorParts = SmallVector<Value *, 2>;
507 
508   /// Vectorize a single GetElementPtrInst based on information gathered and
509   /// decisions taken during planning.
510   void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices,
511                 unsigned UF, ElementCount VF, bool IsPtrLoopInvariant,
512                 SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);
513 
514   /// Vectorize a single PHINode in a block. This method handles the induction
515   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
516   /// arbitrary length vectors.
517   void widenPHIInstruction(Instruction *PN, RecurrenceDescriptor *RdxDesc,
518                            VPValue *StartV, VPValue *Def,
519                            VPTransformState &State);
520 
  /// A helper function to scalarize a single Instruction in the innermost
  /// loop. Generates a scalar instance of \p Instr for the part and lane
  /// given by \p Instance. Uses the VPValue operands from \p Operands instead
  /// of \p Instr's operands.
526   void scalarizeInstruction(Instruction *Instr, VPValue *Def, VPUser &Operands,
527                             const VPIteration &Instance, bool IfPredicateInstr,
528                             VPTransformState &State);
529 
530   /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
531   /// is provided, the integer induction variable will first be truncated to
532   /// the corresponding type.
533   void widenIntOrFpInduction(PHINode *IV, Value *Start, TruncInst *Trunc,
534                              VPValue *Def, VPValue *CastDef,
535                              VPTransformState &State);
536 
537   /// Construct the vector value of a scalarized value \p V one lane at a time.
538   void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
539                                  VPTransformState &State);
540 
541   /// Try to vectorize interleaved access group \p Group with the base address
542   /// given in \p Addr, optionally masking the vector operations if \p
543   /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
544   /// values in the vectorized loop.
545   void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
546                                 ArrayRef<VPValue *> VPDefs,
547                                 VPTransformState &State, VPValue *Addr,
548                                 ArrayRef<VPValue *> StoredValues,
549                                 VPValue *BlockInMask = nullptr);
550 
551   /// Vectorize Load and Store instructions with the base address given in \p
552   /// Addr, optionally masking the vector operations if \p BlockInMask is
553   /// non-null. Use \p State to translate given VPValues to IR values in the
554   /// vectorized loop.
555   void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
556                                   VPValue *Def, VPValue *Addr,
557                                   VPValue *StoredValue, VPValue *BlockInMask);
558 
559   /// Set the debug location in the builder using the debug location in
560   /// the instruction.
561   void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
562 
563   /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
564   void fixNonInductionPHIs(VPTransformState &State);
565 
566   /// Create a broadcast instruction. This method generates a broadcast
567   /// instruction (shuffle) for loop invariant values and for the induction
568   /// value. If this is the induction variable then we extend it to N, N+1, ...
569   /// this is needed because each iteration in the loop corresponds to a SIMD
570   /// element.
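  /// For illustration (values assume a fixed VF of 4): a loop-invariant value
  /// x becomes the splat <x, x, x, x>, while the induction value i is extended
  /// to the sequence <i, i+1, i+2, i+3>, one element per SIMD lane.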
571   virtual Value *getBroadcastInstrs(Value *V);
572 
573 protected:
574   friend class LoopVectorizationPlanner;
575 
576   /// A small list of PHINodes.
577   using PhiVector = SmallVector<PHINode *, 4>;
578 
579   /// A type for scalarized values in the new loop. Each value from the
580   /// original loop, when scalarized, is represented by UF x VF scalar values
581   /// in the new unrolled loop, where UF is the unroll factor and VF is the
582   /// vectorization factor.
583   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
584 
585   /// Set up the values of the IVs correctly when exiting the vector loop.
586   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
587                     Value *CountRoundDown, Value *EndValue,
588                     BasicBlock *MiddleBlock);
589 
590   /// Create a new induction variable inside L.
591   PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
592                                    Value *Step, Instruction *DL);
593 
594   /// Handle all cross-iteration phis in the header.
595   void fixCrossIterationPHIs(VPTransformState &State);
596 
597   /// Fix a first-order recurrence. This is the second phase of vectorizing
598   /// this phi node.
599   void fixFirstOrderRecurrence(PHINode *Phi, VPTransformState &State);
600 
601   /// Fix a reduction cross-iteration phi. This is the second phase of
602   /// vectorizing this phi node.
603   void fixReduction(PHINode *Phi, VPTransformState &State);
604 
605   /// Clear NSW/NUW flags from reduction instructions if necessary.
606   void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc,
607                                VPTransformState &State);
608 
609   /// Fixup the LCSSA phi nodes in the unique exit block.  This simply
610   /// means we need to add the appropriate incoming value from the middle
611   /// block as exiting edges from the scalar epilogue loop (if present) are
612   /// already in place, and we exit the vector loop exclusively to the middle
613   /// block.
614   void fixLCSSAPHIs(VPTransformState &State);
615 
616   /// Iteratively sink the scalarized operands of a predicated instruction into
617   /// the block that was created for it.
618   void sinkScalarOperands(Instruction *PredInst);
619 
620   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
621   /// represented as.
622   void truncateToMinimalBitwidths(VPTransformState &State);
623 
624   /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
625   /// to each vector element of Val. The sequence starts at StartIndex.
626   /// \p Opcode is relevant for FP induction variable.
627   virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
628                                Instruction::BinaryOps Opcode =
629                                Instruction::BinaryOpsEnd);
630 
631   /// Compute scalar induction steps. \p ScalarIV is the scalar induction
632   /// variable on which to base the steps, \p Step is the size of the step, and
633   /// \p EntryVal is the value from the original loop that maps to the steps.
634   /// Note that \p EntryVal doesn't have to be an induction variable - it
635   /// can also be a truncate instruction.
636   void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
637                         const InductionDescriptor &ID, VPValue *Def,
638                         VPValue *CastDef, VPTransformState &State);
639 
640   /// Create a vector induction phi node based on an existing scalar one. \p
641   /// EntryVal is the value from the original loop that maps to the vector phi
642   /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
643   /// truncate instruction, instead of widening the original IV, we widen a
644   /// version of the IV truncated to \p EntryVal's type.
645   void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
646                                        Value *Step, Value *Start,
647                                        Instruction *EntryVal, VPValue *Def,
648                                        VPValue *CastDef,
649                                        VPTransformState &State);
650 
651   /// Returns true if an instruction \p I should be scalarized instead of
652   /// vectorized for the chosen vectorization factor.
653   bool shouldScalarizeInstruction(Instruction *I) const;
654 
655   /// Returns true if we should generate a scalar version of \p IV.
656   bool needsScalarInduction(Instruction *IV) const;
657 
658   /// If there is a cast involved in the induction variable \p ID, which should
659   /// be ignored in the vectorized loop body, this function records the
660   /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
661   /// cast. We had already proved that the casted Phi is equal to the uncasted
662   /// Phi in the vectorized loop (under a runtime guard), and therefore
663   /// there is no need to vectorize the cast - the same value can be used in the
664   /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
667   ///
668   /// \p EntryVal is the value from the original loop that maps to the vector
669   /// phi node and is used to distinguish what is the IV currently being
670   /// processed - original one (if \p EntryVal is a phi corresponding to the
671   /// original IV) or the "newly-created" one based on the proof mentioned above
672   /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
673   /// latter case \p EntryVal is a TruncInst and we must not record anything for
674   /// that IV, but it's error-prone to expect callers of this routine to care
675   /// about that, hence this explicit parameter.
676   void recordVectorLoopValueForInductionCast(
677       const InductionDescriptor &ID, const Instruction *EntryVal,
678       Value *VectorLoopValue, VPValue *CastDef, VPTransformState &State,
679       unsigned Part, unsigned Lane = UINT_MAX);
680 
681   /// Generate a shuffle sequence that will reverse the vector Vec.
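  /// For example (illustrative): a <4 x i32> value <a, b, c, d> becomes
  /// <d, c, b, a>, typically via a shufflevector with mask <3, 2, 1, 0>.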
682   virtual Value *reverseVector(Value *Vec);
683 
684   /// Returns (and creates if needed) the original loop trip count.
685   Value *getOrCreateTripCount(Loop *NewLoop);
686 
687   /// Returns (and creates if needed) the trip count of the widened loop.
688   Value *getOrCreateVectorTripCount(Loop *NewLoop);
689 
690   /// Returns a bitcasted value to the requested vector type.
691   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
692   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
693                                 const DataLayout &DL);
694 
695   /// Emit a bypass check to see if the vector trip count is zero, including if
696   /// it overflows.
697   void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
698 
699   /// Emit a bypass check to see if all of the SCEV assumptions we've
700   /// had to make are correct. Returns the block containing the checks or
701   /// nullptr if no checks have been added.
702   BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass);
703 
704   /// Emit bypass checks to check any memory assumptions we may have made.
705   /// Returns the block containing the checks or nullptr if no checks have been
706   /// added.
707   BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
708 
709   /// Compute the transformed value of Index at offset StartValue using step
710   /// StepValue.
711   /// For integer induction, returns StartValue + Index * StepValue.
712   /// For pointer induction, returns StartValue[Index * StepValue].
713   /// FIXME: The newly created binary instructions should contain nsw/nuw
714   /// flags, which can be found from the original scalar operations.
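  /// For example (illustrative): for an integer induction with
  /// StartValue = 8 and StepValue = 2, Index = 3 yields 8 + 3 * 2 = 14; for a
  /// pointer induction with the same values, the result is the address
  /// &StartValue[3 * 2].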
715   Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
716                               const DataLayout &DL,
717                               const InductionDescriptor &ID) const;
718 
719   /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
720   /// vector loop preheader, middle block and scalar preheader. Also
721   /// allocate a loop object for the new vector loop and return it.
722   Loop *createVectorLoopSkeleton(StringRef Prefix);
723 
724   /// Create new phi nodes for the induction variables to resume iteration count
725   /// in the scalar epilogue, from where the vectorized loop left off (given by
726   /// \p VectorTripCount).
727   /// In cases where the loop skeleton is more complicated (eg. epilogue
728   /// vectorization) and the resume values can come from an additional bypass
729   /// block, the \p AdditionalBypass pair provides information about the bypass
730   /// block and the end value on the edge from bypass to this loop.
731   void createInductionResumeValues(
732       Loop *L, Value *VectorTripCount,
733       std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
734 
735   /// Complete the loop skeleton by adding debug MDs, creating appropriate
736   /// conditional branches in the middle block, preparing the builder and
737   /// running the verifier. Take in the vector loop \p L as argument, and return
738   /// the preheader of the completed vector loop.
739   BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);
740 
741   /// Add additional metadata to \p To that was not present on \p Orig.
742   ///
743   /// Currently this is used to add the noalias annotations based on the
744   /// inserted memchecks.  Use this for instructions that are *cloned* into the
745   /// vector loop.
746   void addNewMetadata(Instruction *To, const Instruction *Orig);
747 
748   /// Add metadata from one instruction to another.
749   ///
750   /// This includes both the original MDs from \p From and additional ones (\see
751   /// addNewMetadata).  Use this for *newly created* instructions in the vector
752   /// loop.
753   void addMetadata(Instruction *To, Instruction *From);
754 
755   /// Similar to the previous function but it adds the metadata to a
756   /// vector of instructions.
757   void addMetadata(ArrayRef<Value *> To, Instruction *From);
758 
759   /// Allow subclasses to override and print debug traces before/after vplan
760   /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart() {}
  virtual void printDebugTracesAtEnd() {}
763 
764   /// The original loop.
765   Loop *OrigLoop;
766 
767   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
768   /// dynamic knowledge to simplify SCEV expressions and converts them to a
769   /// more usable form.
770   PredicatedScalarEvolution &PSE;
771 
772   /// Loop Info.
773   LoopInfo *LI;
774 
775   /// Dominator Tree.
776   DominatorTree *DT;
777 
778   /// Alias Analysis.
779   AAResults *AA;
780 
781   /// Target Library Info.
782   const TargetLibraryInfo *TLI;
783 
784   /// Target Transform Info.
785   const TargetTransformInfo *TTI;
786 
787   /// Assumption Cache.
788   AssumptionCache *AC;
789 
790   /// Interface to emit optimization remarks.
791   OptimizationRemarkEmitter *ORE;
792 
793   /// LoopVersioning.  It's only set up (non-null) if memchecks were
794   /// used.
795   ///
796   /// This is currently only used to add no-alias metadata based on the
  /// memchecks.  The actual versioning is performed manually.
798   std::unique_ptr<LoopVersioning> LVer;
799 
800   /// The vectorization SIMD factor to use. Each vector will have this many
801   /// vector elements.
802   ElementCount VF;
803 
804   /// The vectorization unroll factor to use. Each scalar is vectorized to this
805   /// many different vector instructions.
806   unsigned UF;
807 
808   /// The builder that we use
809   IRBuilder<> Builder;
810 
811   // --- Vectorization state ---
812 
813   /// The vector-loop preheader.
814   BasicBlock *LoopVectorPreHeader;
815 
816   /// The scalar-loop preheader.
817   BasicBlock *LoopScalarPreHeader;
818 
819   /// Middle Block between the vector and the scalar.
820   BasicBlock *LoopMiddleBlock;
821 
822   /// The (unique) ExitBlock of the scalar loop.  Note that
823   /// there can be multiple exiting edges reaching this block.
824   BasicBlock *LoopExitBlock;
825 
826   /// The vector loop body.
827   BasicBlock *LoopVectorBody;
828 
829   /// The scalar loop body.
830   BasicBlock *LoopScalarBody;
831 
832   /// A list of all bypass blocks. The first block is the entry of the loop.
833   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
834 
835   /// The new Induction variable which was added to the new block.
836   PHINode *Induction = nullptr;
837 
838   /// The induction variable of the old basic block.
839   PHINode *OldInduction = nullptr;
840 
841   /// Store instructions that were predicated.
842   SmallVector<Instruction *, 4> PredicatedInstructions;
843 
844   /// Trip count of the original loop.
845   Value *TripCount = nullptr;
846 
847   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
848   Value *VectorTripCount = nullptr;
849 
850   /// The legality analysis.
851   LoopVectorizationLegality *Legal;
852 
  /// The profitability analysis.
854   LoopVectorizationCostModel *Cost;
855 
856   // Record whether runtime checks are added.
857   bool AddedSafetyChecks = false;
858 
859   // Holds the end values for each induction variable. We save the end values
860   // so we can later fix-up the external users of the induction variables.
861   DenseMap<PHINode *, Value *> IVEndValues;
862 
863   // Vector of original scalar PHIs whose corresponding widened PHIs need to be
864   // fixed up at the end of vector code generation.
865   SmallVector<PHINode *, 8> OrigPHIsToFix;
866 
867   /// BFI and PSI are used to check for profile guided size optimizations.
868   BlockFrequencyInfo *BFI;
869   ProfileSummaryInfo *PSI;
870 
  // Whether this loop should be optimized for size based on profile-guided
  // size optimizations.
873   bool OptForSizeBasedOnProfile;
874 
  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning up the checks if vectorization turns out to be unprofitable.
877   GeneratedRTChecks &RTChecks;
878 };
879 
880 class InnerLoopUnroller : public InnerLoopVectorizer {
881 public:
882   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
883                     LoopInfo *LI, DominatorTree *DT,
884                     const TargetLibraryInfo *TLI,
885                     const TargetTransformInfo *TTI, AssumptionCache *AC,
886                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
887                     LoopVectorizationLegality *LVL,
888                     LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
889                     ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
890       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
891                             ElementCount::getFixed(1), UnrollFactor, LVL, CM,
892                             BFI, PSI, Check) {}
893 
894 private:
895   Value *getBroadcastInstrs(Value *V) override;
896   Value *getStepVector(Value *Val, int StartIdx, Value *Step,
897                        Instruction::BinaryOps Opcode =
898                        Instruction::BinaryOpsEnd) override;
899   Value *reverseVector(Value *Vec) override;
900 };
901 
902 /// Encapsulate information regarding vectorization of a loop and its epilogue.
903 /// This information is meant to be updated and used across two stages of
904 /// epilogue vectorization.
905 struct EpilogueLoopVectorizationInfo {
906   ElementCount MainLoopVF = ElementCount::getFixed(0);
907   unsigned MainLoopUF = 0;
908   ElementCount EpilogueVF = ElementCount::getFixed(0);
909   unsigned EpilogueUF = 0;
910   BasicBlock *MainLoopIterationCountCheck = nullptr;
911   BasicBlock *EpilogueIterationCountCheck = nullptr;
912   BasicBlock *SCEVSafetyCheck = nullptr;
913   BasicBlock *MemSafetyCheck = nullptr;
914   Value *TripCount = nullptr;
915   Value *VectorTripCount = nullptr;
916 
917   EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF,
918                                 unsigned EUF)
919       : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF),
920         EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) {
921     assert(EUF == 1 &&
922            "A high UF for the epilogue loop is likely not beneficial.");
923   }
924 };
925 
926 /// An extension of the inner loop vectorizer that creates a skeleton for a
927 /// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice: first to set up the
/// skeleton and vectorize the main loop, and second to complete the skeleton
/// from the first step and vectorize the epilogue.  This is achieved by
931 /// deriving two concrete strategy classes from this base class and invoking
932 /// them in succession from the loop vectorizer planner.
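/// A rough sketch (simplified) of the control flow produced by the two steps:
///
///   iteration count checks
///           |
///   main vector loop          (first vplan execution, MainLoopVF/UF)
///           |
///   epilogue vector loop      (second vplan execution, EpilogueVF/UF)
///           |
///   scalar remainder loop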
933 class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
934 public:
935   InnerLoopAndEpilogueVectorizer(
936       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
937       DominatorTree *DT, const TargetLibraryInfo *TLI,
938       const TargetTransformInfo *TTI, AssumptionCache *AC,
939       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
940       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
941       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
942       GeneratedRTChecks &Checks)
943       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
944                             EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
945                             Checks),
946         EPI(EPI) {}
947 
948   // Override this function to handle the more complex control flow around the
949   // three loops.
950   BasicBlock *createVectorizedLoopSkeleton() final override {
951     return createEpilogueVectorizedLoopSkeleton();
952   }
953 
954   /// The interface for creating a vectorized skeleton using one of two
955   /// different strategies, each corresponding to one execution of the vplan
956   /// as described above.
957   virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;
958 
959   /// Holds and updates state information required to vectorize the main loop
960   /// and its epilogue in two separate passes. This setup helps us avoid
961   /// regenerating and recomputing runtime safety checks. It also helps us to
962   /// shorten the iteration-count-check path length for the cases where the
963   /// iteration count of the loop is so small that the main vector loop is
964   /// completely skipped.
965   EpilogueLoopVectorizationInfo &EPI;
966 };
967 
968 /// A specialized derived class of inner loop vectorizer that performs
969 /// vectorization of *main* loops in the process of vectorizing loops and their
970 /// epilogues.
971 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
972 public:
973   EpilogueVectorizerMainLoop(
974       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
975       DominatorTree *DT, const TargetLibraryInfo *TLI,
976       const TargetTransformInfo *TTI, AssumptionCache *AC,
977       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
978       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
979       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
980       GeneratedRTChecks &Check)
981       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
982                                        EPI, LVL, CM, BFI, PSI, Check) {}
983   /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e., the first pass of vplan execution).
985   BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;
986 
987 protected:
988   /// Emits an iteration count bypass check once for the main loop (when \p
989   /// ForEpilogue is false) and once for the epilogue loop (when \p
990   /// ForEpilogue is true).
991   BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
992                                              bool ForEpilogue);
993   void printDebugTracesAtStart() override;
994   void printDebugTracesAtEnd() override;
995 };
996 
997 // A specialized derived class of inner loop vectorizer that performs
998 // vectorization of *epilogue* loops in the process of vectorizing loops and
999 // their epilogues.
1000 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
1001 public:
1002   EpilogueVectorizerEpilogueLoop(
1003       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
1004       DominatorTree *DT, const TargetLibraryInfo *TLI,
1005       const TargetTransformInfo *TTI, AssumptionCache *AC,
1006       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
1007       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
1008       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
1009       GeneratedRTChecks &Checks)
1010       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
1011                                        EPI, LVL, CM, BFI, PSI, Checks) {}
1012   /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e., the second pass of vplan execution).
1014   BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;
1015 
1016 protected:
1017   /// Emits an iteration count bypass check after the main vector loop has
1018   /// finished to see if there are any iterations left to execute by either
1019   /// the vector epilogue or the scalar epilogue.
1020   BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
1021                                                       BasicBlock *Bypass,
1022                                                       BasicBlock *Insert);
1023   void printDebugTracesAtStart() override;
1024   void printDebugTracesAtEnd() override;
1025 };
1026 } // end namespace llvm
1027 
/// Look for a meaningful debug location on the instruction or its
1029 /// operands.
1030 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
1031   if (!I)
1032     return I;
1033 
1034   DebugLoc Empty;
1035   if (I->getDebugLoc() != Empty)
1036     return I;
1037 
1038   for (Use &Op : I->operands()) {
1039     if (Instruction *OpInst = dyn_cast<Instruction>(Op))
1040       if (OpInst->getDebugLoc() != Empty)
1041         return OpInst;
1042   }
1043 
1044   return I;
1045 }
1046 
1047 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
1048   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
1049     const DILocation *DIL = Inst->getDebugLoc();
1050     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
1051         !isa<DbgInfoIntrinsic>(Inst)) {
1052       assert(!VF.isScalable() && "scalable vectors not yet supported.");
1053       auto NewDIL =
1054           DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
1055       if (NewDIL)
1056         B.SetCurrentDebugLocation(NewDIL.getValue());
1057       else
1058         LLVM_DEBUG(dbgs()
1059                    << "Failed to create new discriminator: "
1060                    << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B.SetCurrentDebugLocation(DIL);
1064   } else
1065     B.SetCurrentDebugLocation(DebugLoc());
1066 }
1067 
1068 /// Write a record \p DebugMsg about vectorization failure to the debug
1069 /// output stream. If \p I is passed, it is an instruction that prevents
1070 /// vectorization.
1071 #ifndef NDEBUG
1072 static void debugVectorizationFailure(const StringRef DebugMsg,
1073     Instruction *I) {
1074   dbgs() << "LV: Not vectorizing: " << DebugMsg;
1075   if (I != nullptr)
1076     dbgs() << " " << *I;
1077   else
1078     dbgs() << '.';
1079   dbgs() << '\n';
1080 }
1081 #endif
1082 
1083 /// Create an analysis remark that explains why vectorization failed
1084 ///
1085 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
1086 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
1087 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
1088 /// the location of the remark.  \return the remark object that can be
1089 /// streamed to.
1090 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
1091     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
1092   Value *CodeRegion = TheLoop->getHeader();
1093   DebugLoc DL = TheLoop->getStartLoc();
1094 
1095   if (I) {
1096     CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
1099     if (I->getDebugLoc())
1100       DL = I->getDebugLoc();
1101   }
1102 
1103   OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
1104   R << "loop not vectorized: ";
1105   return R;
1106 }
1107 
1108 /// Return a value for Step multiplied by VF.
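/// For example (illustrative): with Step = 2 and a fixed VF of 4 this returns
/// the constant 8; with a scalable VF whose known minimum is 4 it returns
/// 8 * vscale, materialized via a vscale intrinsic.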
1109 static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) {
1110   assert(isa<ConstantInt>(Step) && "Expected an integer step");
1111   Constant *StepVal = ConstantInt::get(
1112       Step->getType(),
1113       cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue());
1114   return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
1115 }
1116 
1117 namespace llvm {
1118 
1119 void reportVectorizationFailure(const StringRef DebugMsg,
1120     const StringRef OREMsg, const StringRef ORETag,
1121     OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
1122   LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
1123   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1124   ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
1125                 ORETag, TheLoop, I) << OREMsg);
1126 }
1127 
1128 } // end namespace llvm
1129 
1130 #ifndef NDEBUG
1131 /// \return string containing a file name and a line # for the given loop.
1132 static std::string getDebugLocString(const Loop *L) {
1133   std::string Result;
1134   if (L) {
1135     raw_string_ostream OS(Result);
1136     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
1137       LoopDbgLoc.print(OS);
1138     else
1139       // Just print the module name.
1140       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
1141     OS.flush();
1142   }
1143   return Result;
1144 }
1145 #endif
1146 
1147 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
1148                                          const Instruction *Orig) {
1149   // If the loop was versioned with memchecks, add the corresponding no-alias
1150   // metadata.
1151   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
1152     LVer->annotateInstWithNoAlias(To, Orig);
1153 }
1154 
1155 void InnerLoopVectorizer::addMetadata(Instruction *To,
1156                                       Instruction *From) {
1157   propagateMetadata(To, From);
1158   addNewMetadata(To, From);
1159 }
1160 
1161 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
1162                                       Instruction *From) {
1163   for (Value *V : To) {
1164     if (Instruction *I = dyn_cast<Instruction>(V))
1165       addMetadata(I, From);
1166   }
1167 }
1168 
1169 namespace llvm {
1170 
1171 // Loop vectorization cost-model hints how the scalar epilogue loop should be
1172 // lowered.
1173 enum ScalarEpilogueLowering {
1174 
1175   // The default: allowing scalar epilogues.
1176   CM_ScalarEpilogueAllowed,
1177 
1178   // Vectorization with OptForSize: don't allow epilogues.
1179   CM_ScalarEpilogueNotAllowedOptSize,
1180 
  // A special case of vectorization with OptForSize: loops with a very small
1182   // trip count are considered for vectorization under OptForSize, thereby
1183   // making sure the cost of their loop body is dominant, free of runtime
1184   // guards and scalar iteration overheads.
1185   CM_ScalarEpilogueNotAllowedLowTripLoop,
1186 
1187   // Loop hint predicate indicating an epilogue is undesired.
1188   CM_ScalarEpilogueNotNeededUsePredicate,
1189 
  // Directive indicating we must either tail-fold or not vectorize at all.
1191   CM_ScalarEpilogueNotAllowedUsePredicate
1192 };
1193 
1194 /// LoopVectorizationCostModel - estimates the expected speedups due to
1195 /// vectorization.
1196 /// In many cases vectorization is not profitable. This can happen because of
1197 /// a number of reasons. In this class we mainly attempt to predict the
1198 /// expected speedup/slowdowns due to the supported instruction set. We use the
1199 /// TargetTransformInfo to query the different backends for the cost of
1200 /// different operations.
1201 class LoopVectorizationCostModel {
1202 public:
1203   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1204                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
1205                              LoopVectorizationLegality *Legal,
1206                              const TargetTransformInfo &TTI,
1207                              const TargetLibraryInfo *TLI, DemandedBits *DB,
1208                              AssumptionCache *AC,
1209                              OptimizationRemarkEmitter *ORE, const Function *F,
1210                              const LoopVectorizeHints *Hints,
1211                              InterleavedAccessInfo &IAI)
1212       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1213         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1214         Hints(Hints), InterleaveInfo(IAI) {}
1215 
1216   /// \return An upper bound for the vectorization factor, or None if
1217   /// vectorization and interleaving should be avoided up front.
1218   Optional<ElementCount> computeMaxVF(ElementCount UserVF, unsigned UserIC);
1219 
1220   /// \return True if runtime checks are required for vectorization, and false
1221   /// otherwise.
1222   bool runtimeChecksRequired();
1223 
1224   /// \return The most profitable vectorization factor and the cost of that VF.
1225   /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
1226   /// then this vectorization factor will be selected if vectorization is
1227   /// possible.
1228   VectorizationFactor selectVectorizationFactor(ElementCount MaxVF);
1229   VectorizationFactor
1230   selectEpilogueVectorizationFactor(const ElementCount MaxVF,
1231                                     const LoopVectorizationPlanner &LVP);
1232 
1233   /// Setup cost-based decisions for user vectorization factor.
1234   void selectUserVectorizationFactor(ElementCount UserVF) {
1235     collectUniformsAndScalars(UserVF);
1236     collectInstsToScalarize(UserVF);
1237   }
1238 
  /// \return The sizes (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar, such as
  /// 64-bit loop indices.
1242   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1243 
1244   /// \return The desired interleave count.
1245   /// If interleave count has been specified by metadata it will be returned.
1246   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1247   /// are the selected vectorization factor and the cost of the selected VF.
1248   unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);
1249 
  /// A memory access instruction may be vectorized in more than one way; its
  /// form after vectorization depends on the cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
1257   void setCostBasedWideningDecision(ElementCount VF);
1258 
1259   /// A struct that represents some properties of the register usage
1260   /// of a loop.
1261   struct RegisterUsage {
1262     /// Holds the number of loop invariant values that are used in the loop.
1263     /// The key is ClassID of target-provided register class.
1264     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1265     /// Holds the maximum number of concurrent live intervals in the loop.
1266     /// The key is ClassID of target-provided register class.
1267     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1268   };
1269 
  /// \return Information about the register usage of the loop for the given
  /// vectorization factors.
1272   SmallVector<RegisterUsage, 8>
1273   calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1274 
1275   /// Collect values we want to ignore in the cost model.
1276   void collectValuesToIgnore();
1277 
  /// Split reductions into those that happen in the loop, and those that
  /// happen outside it. In-loop reductions are collected into
  /// InLoopReductionChains.
1280   void collectInLoopReductions();
1281 
1282   /// \returns The smallest bitwidth each instruction can be represented with.
1283   /// The vector equivalents of these instructions should be truncated to this
1284   /// type.
1285   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1286     return MinBWs;
1287   }
1288 
1289   /// \returns True if it is more profitable to scalarize instruction \p I for
1290   /// vectorization factor \p VF.
1291   bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1292     assert(VF.isVector() &&
1293            "Profitable to scalarize relevant only for VF > 1.");
1294 
1295     // Cost model is not run in the VPlan-native path - return conservative
1296     // result until this changes.
1297     if (EnableVPlanNativePath)
1298       return false;
1299 
1300     auto Scalars = InstsToScalarize.find(VF);
1301     assert(Scalars != InstsToScalarize.end() &&
1302            "VF not yet analyzed for scalarization profitability");
1303     return Scalars->second.find(I) != Scalars->second.end();
1304   }
1305 
1306   /// Returns true if \p I is known to be uniform after vectorization.
1307   bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1308     if (VF.isScalar())
1309       return true;
1310 
1311     // Cost model is not run in the VPlan-native path - return conservative
1312     // result until this changes.
1313     if (EnableVPlanNativePath)
1314       return false;
1315 
1316     auto UniformsPerVF = Uniforms.find(VF);
1317     assert(UniformsPerVF != Uniforms.end() &&
1318            "VF not yet analyzed for uniformity");
1319     return UniformsPerVF->second.count(I);
1320   }
1321 
1322   /// Returns true if \p I is known to be scalar after vectorization.
1323   bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1324     if (VF.isScalar())
1325       return true;
1326 
1327     // Cost model is not run in the VPlan-native path - return conservative
1328     // result until this changes.
1329     if (EnableVPlanNativePath)
1330       return false;
1331 
1332     auto ScalarsPerVF = Scalars.find(VF);
1333     assert(ScalarsPerVF != Scalars.end() &&
1334            "Scalar values are not calculated for VF");
1335     return ScalarsPerVF->second.count(I);
1336   }
1337 
1338   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1339   /// for vectorization factor \p VF.
1340   bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1341     return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1342            !isProfitableToScalarize(I, VF) &&
1343            !isScalarAfterVectorization(I, VF);
1344   }
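
  // Illustrative sketch (the instruction 'Add' is hypothetical, not taken from
  // this pass): if MinBWs records that a 32-bit add only ever needs 8
  // meaningful bits, the predicate above allows its vector form to be
  // narrowed, conceptually:
  //
  //   uint64_t Bits = getMinimalBitwidths().lookup(Add); // e.g. 8
  //   if (canTruncateToMinimalBitwidth(Add, VF))
  //     ; // emit <VF x i8> arithmetic and extend the result where required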
1345 
1346   /// Decision that was taken during cost calculation for memory instruction.
1347   enum InstWidening {
1348     CM_Unknown,
1349     CM_Widen,         // For consecutive accesses with stride +1.
1350     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1351     CM_Interleave,
1352     CM_GatherScatter,
1353     CM_Scalarize
1354   };
1355 
1356   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1357   /// instruction \p I and vector width \p VF.
1358   void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1359                            InstructionCost Cost) {
1360     assert(VF.isVector() && "Expected VF >=2");
1361     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1362   }
1363 
1364   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1365   /// interleaving group \p Grp and vector width \p VF.
1366   void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1367                            ElementCount VF, InstWidening W,
1368                            InstructionCost Cost) {
1369     assert(VF.isVector() && "Expected VF >=2");
    // Broadcast this decision to all instructions inside the group, but
    // assign the cost to the insert position only.
1372     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1373       if (auto *I = Grp->getMember(i)) {
1374         if (Grp->getInsertPos() == I)
1375           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1376         else
1377           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1378       }
1379     }
1380   }
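
  // For example (hypothetical interleave group): for a group {A, B, C} whose
  // insert position is B, the loop above records
  //   (A, VF) -> {W, 0}, (B, VF) -> {W, Cost}, (C, VF) -> {W, 0}
  // so that summing per-instruction costs later accounts for the whole group
  // exactly once.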
1381 
1382   /// Return the cost model decision for the given instruction \p I and vector
1383   /// width \p VF. Return CM_Unknown if this instruction did not pass
1384   /// through the cost modeling.
1385   InstWidening getWideningDecision(Instruction *I, ElementCount VF) {
1386     assert(VF.isVector() && "Expected VF to be a vector VF");
1387     // Cost model is not run in the VPlan-native path - return conservative
1388     // result until this changes.
1389     if (EnableVPlanNativePath)
1390       return CM_GatherScatter;
1391 
1392     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1393     auto Itr = WideningDecisions.find(InstOnVF);
1394     if (Itr == WideningDecisions.end())
1395       return CM_Unknown;
1396     return Itr->second.first;
1397   }
1398 
1399   /// Return the vectorization cost for the given instruction \p I and vector
1400   /// width \p VF.
1401   InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1402     assert(VF.isVector() && "Expected VF >=2");
1403     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1404     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1405            "The cost is not calculated");
1406     return WideningDecisions[InstOnVF].second;
1407   }
1408 
1409   /// Return True if instruction \p I is an optimizable truncate whose operand
1410   /// is an induction variable. Such a truncate will be removed by adding a new
1411   /// induction variable with the destination type.
1412   bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1413     // If the instruction is not a truncate, return false.
1414     auto *Trunc = dyn_cast<TruncInst>(I);
1415     if (!Trunc)
1416       return false;
1417 
1418     // Get the source and destination types of the truncate.
1419     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1420     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1421 
1422     // If the truncate is free for the given types, return false. Replacing a
1423     // free truncate with an induction variable would add an induction variable
1424     // update instruction to each iteration of the loop. We exclude from this
1425     // check the primary induction variable since it will need an update
1426     // instruction regardless.
1427     Value *Op = Trunc->getOperand(0);
1428     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1429       return false;
1430 
1431     // If the truncated value is not an induction variable, return false.
1432     return Legal->isInductionPhi(Op);
1433   }
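
  // Illustrative example (hypothetical source loop, not from this file):
  //
  //   for (int64_t i = 0; i != n; ++i)
  //     use((int32_t)i); // truncate of the primary induction variable
  //
  // When the truncate is not free for the target, it can be removed by
  // introducing a separate 32-bit induction variable with the same start and
  // step, which is the transformation this predicate enables.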
1434 
1435   /// Collects the instructions to scalarize for each predicated instruction in
1436   /// the loop.
1437   void collectInstsToScalarize(ElementCount VF);
1438 
1439   /// Collect Uniform and Scalar values for the given \p VF.
1440   /// The sets depend on CM decision for Load/Store instructions
1441   /// that may be vectorized as interleave, gather-scatter or scalarized.
1442   void collectUniformsAndScalars(ElementCount VF) {
1443     // Do the analysis once.
1444     if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1445       return;
1446     setCostBasedWideningDecision(VF);
1447     collectLoopUniforms(VF);
1448     collectLoopScalars(VF);
1449   }
1450 
1451   /// Returns true if the target machine supports masked store operation
1452   /// for the given \p DataType and kind of access to \p Ptr.
1453   bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) {
1454     return Legal->isConsecutivePtr(Ptr) &&
1455            TTI.isLegalMaskedStore(DataType, Alignment);
1456   }
1457 
1458   /// Returns true if the target machine supports masked load operation
1459   /// for the given \p DataType and kind of access to \p Ptr.
1460   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) {
1461     return Legal->isConsecutivePtr(Ptr) &&
1462            TTI.isLegalMaskedLoad(DataType, Alignment);
1463   }
1464 
1465   /// Returns true if the target machine supports masked scatter operation
1466   /// for the given \p DataType.
1467   bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
1468     return TTI.isLegalMaskedScatter(DataType, Alignment);
1469   }
1470 
1471   /// Returns true if the target machine supports masked gather operation
1472   /// for the given \p DataType.
1473   bool isLegalMaskedGather(Type *DataType, Align Alignment) {
1474     return TTI.isLegalMaskedGather(DataType, Alignment);
1475   }
1476 
1477   /// Returns true if the target machine can represent \p V as a masked gather
1478   /// or scatter operation.
1479   bool isLegalGatherOrScatter(Value *V) {
1480     bool LI = isa<LoadInst>(V);
1481     bool SI = isa<StoreInst>(V);
1482     if (!LI && !SI)
1483       return false;
1484     auto *Ty = getMemInstValueType(V);
1485     Align Align = getLoadStoreAlignment(V);
1486     return (LI && isLegalMaskedGather(Ty, Align)) ||
1487            (SI && isLegalMaskedScatter(Ty, Align));
1488   }
1489 
1490   /// Returns true if the target machine supports all of the reduction
1491   /// variables found for the given VF.
1492   bool canVectorizeReductions(ElementCount VF) {
1493     return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1494       RecurrenceDescriptor RdxDesc = Reduction.second;
1495       return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1496     }));
1497   }
1498 
1499   /// Returns true if \p I is an instruction that will be scalarized with
1500   /// predication. Such instructions include conditional stores and
1501   /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check whether \p I will be
  /// scalarized with predication for that VF.
1504   bool isScalarWithPredication(Instruction *I,
1505                                ElementCount VF = ElementCount::getFixed(1));
1506 
1507   // Returns true if \p I is an instruction that will be predicated either
1508   // through scalar predication or masked load/store or masked gather/scatter.
1509   // Superset of instructions that return true for isScalarWithPredication.
1510   bool isPredicatedInst(Instruction *I) {
1511     if (!blockNeedsPredication(I->getParent()))
1512       return false;
1513     // Loads and stores that need some form of masked operation are predicated
1514     // instructions.
1515     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1516       return Legal->isMaskRequired(I);
1517     return isScalarWithPredication(I);
1518   }
1519 
1520   /// Returns true if \p I is a memory instruction with consecutive memory
1521   /// access that can be widened.
1522   bool
1523   memoryInstructionCanBeWidened(Instruction *I,
1524                                 ElementCount VF = ElementCount::getFixed(1));
1525 
1526   /// Returns true if \p I is a memory instruction in an interleaved-group
1527   /// of memory accesses that can be vectorized with wide vector loads/stores
1528   /// and shuffles.
1529   bool
1530   interleavedAccessCanBeWidened(Instruction *I,
1531                                 ElementCount VF = ElementCount::getFixed(1));
1532 
1533   /// Check if \p Instr belongs to any interleaved access group.
1534   bool isAccessInterleaved(Instruction *Instr) {
1535     return InterleaveInfo.isInterleaved(Instr);
1536   }
1537 
1538   /// Get the interleaved access group that \p Instr belongs to.
1539   const InterleaveGroup<Instruction> *
1540   getInterleavedAccessGroup(Instruction *Instr) {
1541     return InterleaveInfo.getInterleaveGroup(Instr);
1542   }
1543 
1544   /// Returns true if we're required to use a scalar epilogue for at least
1545   /// the final iteration of the original loop.
1546   bool requiresScalarEpilogue() const {
1547     if (!isScalarEpilogueAllowed())
1548       return false;
    // If we might exit from anywhere but the latch, we must run the exiting
    // iteration in scalar form.
1551     if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
1552       return true;
1553     return InterleaveInfo.requiresScalarEpilogue();
1554   }
1555 
  /// Returns true if a scalar epilogue is allowed, i.e. it is not prohibited
  /// by optsize or a loop hint annotation.
1558   bool isScalarEpilogueAllowed() const {
1559     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1560   }
1561 
  /// Returns true if all loop blocks should be masked in order to fold the
  /// tail of the loop.
1563   bool foldTailByMasking() const { return FoldTailByMasking; }
1564 
1565   bool blockNeedsPredication(BasicBlock *BB) {
1566     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1567   }
1568 
1569   /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1570   /// nodes to the chain of instructions representing the reductions. Uses a
1571   /// MapVector to ensure deterministic iteration order.
1572   using ReductionChainMap =
1573       SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1574 
1575   /// Return the chain of instructions representing an inloop reduction.
1576   const ReductionChainMap &getInLoopReductionChains() const {
1577     return InLoopReductionChains;
1578   }
1579 
1580   /// Returns true if the Phi is part of an inloop reduction.
1581   bool isInLoopReduction(PHINode *Phi) const {
1582     return InLoopReductionChains.count(Phi);
1583   }
1584 
1585   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1586   /// with factor VF.  Return the cost of the instruction, including
1587   /// scalarization overhead if it's needed.
1588   InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF);
1589 
1590   /// Estimate cost of a call instruction CI if it were vectorized with factor
1591   /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e. either a vector version isn't available or it is too
  /// expensive.
1595   InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
1596                                     bool &NeedToScalarize);
1597 
1598   /// Invalidates decisions already taken by the cost model.
1599   void invalidateCostModelingDecisions() {
1600     WideningDecisions.clear();
1601     Uniforms.clear();
1602     Scalars.clear();
1603   }
1604 
1605 private:
1606   unsigned NumPredStores = 0;
1607 
1608   /// \return An upper bound for the vectorization factor, a power-of-2 larger
1609   /// than zero. One is returned if vectorization should best be avoided due
1610   /// to cost.
1611   ElementCount computeFeasibleMaxVF(unsigned ConstTripCount,
1612                                     ElementCount UserVF);
1613 
  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
1621   using VectorizationCostTy = std::pair<InstructionCost, bool>;
1622 
1623   /// Returns the expected execution cost. The unit of the cost does
1624   /// not matter because we use the 'cost' units to compare different
1625   /// vector widths. The cost that is returned is *not* normalized by
1626   /// the factor width.
1627   VectorizationCostTy expectedCost(ElementCount VF);
1628 
1629   /// Returns the execution time cost of an instruction for a given vector
1630   /// width. Vector width of one means scalar.
1631   VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1632 
1633   /// The cost-computation logic from getInstructionCost which provides
1634   /// the vector type as an output parameter.
1635   InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1636                                      Type *&VectorTy);
1637 
1638   /// Return the cost of instructions in an inloop reduction pattern, if I is
1639   /// part of that pattern.
1640   InstructionCost getReductionPatternCost(Instruction *I, ElementCount VF,
1641                                           Type *VectorTy,
1642                                           TTI::TargetCostKind CostKind);
1643 
1644   /// Calculate vectorization cost of memory instruction \p I.
1645   InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1646 
1647   /// The cost computation for scalarized memory instruction.
1648   InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1649 
1650   /// The cost computation for interleaving group of memory instructions.
1651   InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1652 
1653   /// The cost computation for Gather/Scatter instruction.
1654   InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1655 
1656   /// The cost computation for widening instruction \p I with consecutive
1657   /// memory access.
1658   InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1659 
  /// The cost computation for a Load/Store instruction \p I with a uniform
  /// pointer:
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop-invariant value stored ? 0 : extract of last
  /// element).
1664   InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1665 
1666   /// Estimate the overhead of scalarizing an instruction. This is a
1667   /// convenience wrapper for the type-based getScalarizationOverhead API.
1668   InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF);
1669 
  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
1672   bool isConsecutiveLoadOrStore(Instruction *I);
1673 
1674   /// Returns true if an artificially high cost for emulated masked memrefs
1675   /// should be used.
1676   bool useEmulatedMaskMemRefHack(Instruction *I);
1677 
1678   /// Map of scalar integer values to the smallest bitwidth they can be legally
1679   /// represented as. The vector equivalents of these values should be truncated
1680   /// to this type.
1681   MapVector<Instruction *, uint64_t> MinBWs;
1682 
1683   /// A type representing the costs for instructions if they were to be
1684   /// scalarized rather than vectorized. The entries are Instruction-Cost
1685   /// pairs.
1686   using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1687 
  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
1690   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1691 
1692   /// Records whether it is allowed to have the original scalar loop execute at
1693   /// least once. This may be needed as a fallback loop in case runtime
1694   /// aliasing/dependence checks fail, or to handle the tail/remainder
1695   /// iterations when the trip count is unknown or doesn't divide by the VF,
1696   /// or as a peel-loop to handle gaps in interleave-groups.
1697   /// Under optsize and when the trip count is very small we don't allow any
1698   /// iterations to execute in the scalar loop.
1699   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1700 
  /// All blocks of the loop are to be masked to fold the tail of the scalar
  /// iterations.
1702   bool FoldTailByMasking = false;
1703 
1704   /// A map holding scalar costs for different vectorization factors. The
1705   /// presence of a cost for an instruction in the mapping indicates that the
1706   /// instruction will be scalarized when vectorizing with the associated
1707   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1708   DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1709 
1710   /// Holds the instructions known to be uniform after vectorization.
1711   /// The data is collected per VF.
1712   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1713 
1714   /// Holds the instructions known to be scalar after vectorization.
1715   /// The data is collected per VF.
1716   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1717 
1718   /// Holds the instructions (address computations) that are forced to be
1719   /// scalarized.
1720   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1721 
  /// PHINodes of the reductions that should be expanded in-loop, along with
  /// their associated chains of reduction operations, in program order from
  /// top (PHI) to bottom.
1725   ReductionChainMap InLoopReductionChains;
1726 
1727   /// A Map of inloop reduction operations and their immediate chain operand.
1728   /// FIXME: This can be removed once reductions can be costed correctly in
1729   /// vplan. This was added to allow quick lookup to the inloop operations,
1730   /// without having to loop through InLoopReductionChains.
1731   DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1732 
1733   /// Returns the expected difference in cost from scalarizing the expression
1734   /// feeding a predicated instruction \p PredInst. The instructions to
1735   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1736   /// non-negative return value implies the expression will be scalarized.
1737   /// Currently, only single-use chains are considered for scalarization.
1738   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1739                               ElementCount VF);
1740 
1741   /// Collect the instructions that are uniform after vectorization. An
1742   /// instruction is uniform if we represent it with a single scalar value in
1743   /// the vectorized loop corresponding to each vector iteration. Examples of
1744   /// uniform instructions include pointer operands of consecutive or
1745   /// interleaved memory accesses. Note that although uniformity implies an
1746   /// instruction will be scalar, the reverse is not true. In general, a
1747   /// scalarized instruction will be represented by VF scalar values in the
1748   /// vectorized loop, each corresponding to an iteration of the original
1749   /// scalar loop.
1750   void collectLoopUniforms(ElementCount VF);
1751 
1752   /// Collect the instructions that are scalar after vectorization. An
1753   /// instruction is scalar if it is known to be uniform or will be scalarized
1754   /// during vectorization. Non-uniform scalarized instructions will be
1755   /// represented by VF values in the vectorized loop, each corresponding to an
1756   /// iteration of the original scalar loop.
1757   void collectLoopScalars(ElementCount VF);
1758 
1759   /// Keeps cost model vectorization decision and cost for instructions.
1760   /// Right now it is used for memory instructions only.
1761   using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1762                                 std::pair<InstWidening, InstructionCost>>;
1763 
1764   DecisionList WideningDecisions;
1765 
1766   /// Returns true if \p V is expected to be vectorized and it needs to be
1767   /// extracted.
1768   bool needsExtract(Value *V, ElementCount VF) const {
1769     Instruction *I = dyn_cast<Instruction>(V);
1770     if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1771         TheLoop->isLoopInvariant(I))
1772       return false;
1773 
1774     // Assume we can vectorize V (and hence we need extraction) if the
1775     // scalars are not computed yet. This can happen, because it is called
1776     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1777     // the scalars are collected. That should be a safe assumption in most
1778     // cases, because we check if the operands have vectorizable types
1779     // beforehand in LoopVectorizationLegality.
1780     return Scalars.find(VF) == Scalars.end() ||
1781            !isScalarAfterVectorization(I, VF);
1782   };
1783 
1784   /// Returns a range containing only operands needing to be extracted.
1785   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1786                                                    ElementCount VF) {
1787     return SmallVector<Value *, 4>(make_filter_range(
1788         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1789   }
1790 
1791   /// Determines if we have the infrastructure to vectorize loop \p L and its
1792   /// epilogue, assuming the main loop is vectorized by \p VF.
1793   bool isCandidateForEpilogueVectorization(const Loop &L,
1794                                            const ElementCount VF) const;
1795 
1796   /// Returns true if epilogue vectorization is considered profitable, and
1797   /// false otherwise.
1798   /// \p VF is the vectorization factor chosen for the original loop.
1799   bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
1800 
1801 public:
1802   /// The loop that we evaluate.
1803   Loop *TheLoop;
1804 
1805   /// Predicated scalar evolution analysis.
1806   PredicatedScalarEvolution &PSE;
1807 
1808   /// Loop Info analysis.
1809   LoopInfo *LI;
1810 
1811   /// Vectorization legality.
1812   LoopVectorizationLegality *Legal;
1813 
1814   /// Vector target information.
1815   const TargetTransformInfo &TTI;
1816 
1817   /// Target Library Info.
1818   const TargetLibraryInfo *TLI;
1819 
1820   /// Demanded bits analysis.
1821   DemandedBits *DB;
1822 
1823   /// Assumption cache.
1824   AssumptionCache *AC;
1825 
1826   /// Interface to emit optimization remarks.
1827   OptimizationRemarkEmitter *ORE;
1828 
1829   const Function *TheFunction;
1830 
1831   /// Loop Vectorize Hint.
1832   const LoopVectorizeHints *Hints;
1833 
1834   /// The interleave access information contains groups of interleaved accesses
1835   /// with the same stride and close to each other.
1836   InterleavedAccessInfo &InterleaveInfo;
1837 
1838   /// Values to ignore in the cost model.
1839   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1840 
1841   /// Values to ignore in the cost model when VF > 1.
1842   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1843 
1844   /// Profitable vector factors.
1845   SmallVector<VectorizationFactor, 8> ProfitableVFs;
1846 };
1847 } // end namespace llvm
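
// A minimal usage sketch of the cost model above (the analyses and local
// variable names are placeholders; the real driver appears later in this
// file):
//
//   LoopVectorizationCostModel CM(CM_ScalarEpilogueAllowed, L, PSE, LI, &LVL,
//                                 TTI, TLI, DB, AC, ORE, F, &Hints, IAI);
//   if (Optional<ElementCount> MaxVF = CM.computeMaxVF(UserVF, UserIC)) {
//     VectorizationFactor VF = CM.selectVectorizationFactor(*MaxVF);
//     unsigned IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
//     // ... hand VF/IC to the planner and InnerLoopVectorizer ...
//   }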
1848 
1849 /// Helper struct to manage generating runtime checks for vectorization.
1850 ///
/// The runtime checks are created up-front in temporary blocks, un-linked from
/// the existing IR, to allow a more accurate cost estimate. After deciding to
/// vectorize, the checks are moved back. If deciding not to vectorize, the
/// temporary blocks are completely removed.
1855 class GeneratedRTChecks {
1856   /// Basic block which contains the generated SCEV checks, if any.
1857   BasicBlock *SCEVCheckBlock = nullptr;
1858 
1859   /// The value representing the result of the generated SCEV checks. If it is
1860   /// nullptr, either no SCEV checks have been generated or they have been used.
1861   Value *SCEVCheckCond = nullptr;
1862 
1863   /// Basic block which contains the generated memory runtime checks, if any.
1864   BasicBlock *MemCheckBlock = nullptr;
1865 
1866   /// The value representing the result of the generated memory runtime checks.
1867   /// If it is nullptr, either no memory runtime checks have been generated or
1868   /// they have been used.
1869   Instruction *MemRuntimeCheckCond = nullptr;
1870 
1871   DominatorTree *DT;
1872   LoopInfo *LI;
1873 
1874   SCEVExpander SCEVExp;
1875   SCEVExpander MemCheckExp;
1876 
1877 public:
1878   GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1879                     const DataLayout &DL)
1880       : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"),
1881         MemCheckExp(SE, DL, "scev.check") {}
1882 
1883   /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1884   /// accurately estimate the cost of the runtime checks. The blocks are
  /// un-linked from the IR and added back during vector code generation. If
1886   /// there is no vector code generation, the check blocks are removed
1887   /// completely.
1888   void Create(Loop *L, const LoopAccessInfo &LAI,
1889               const SCEVUnionPredicate &UnionPred) {
1890 
1891     BasicBlock *LoopHeader = L->getHeader();
1892     BasicBlock *Preheader = L->getLoopPreheader();
1893 
1894     // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1895     // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1896     // may be used by SCEVExpander. The blocks will be un-linked from their
1897     // predecessors and removed from LI & DT at the end of the function.
1898     if (!UnionPred.isAlwaysTrue()) {
1899       SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1900                                   nullptr, "vector.scevcheck");
1901 
1902       SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1903           &UnionPred, SCEVCheckBlock->getTerminator());
1904     }
1905 
1906     const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1907     if (RtPtrChecking.Need) {
1908       auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1909       MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1910                                  "vector.memcheck");
1911 
1912       std::tie(std::ignore, MemRuntimeCheckCond) =
1913           addRuntimeChecks(MemCheckBlock->getTerminator(), L,
1914                            RtPtrChecking.getChecks(), MemCheckExp);
1915       assert(MemRuntimeCheckCond &&
1916              "no RT checks generated although RtPtrChecking "
1917              "claimed checks are required");
1918     }
1919 
1920     if (!MemCheckBlock && !SCEVCheckBlock)
1921       return;
1922 
    // Unhook the temporary blocks with the checks and update various places
    // accordingly.
1925     if (SCEVCheckBlock)
1926       SCEVCheckBlock->replaceAllUsesWith(Preheader);
1927     if (MemCheckBlock)
1928       MemCheckBlock->replaceAllUsesWith(Preheader);
1929 
1930     if (SCEVCheckBlock) {
1931       SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1932       new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
1933       Preheader->getTerminator()->eraseFromParent();
1934     }
1935     if (MemCheckBlock) {
1936       MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1937       new UnreachableInst(Preheader->getContext(), MemCheckBlock);
1938       Preheader->getTerminator()->eraseFromParent();
1939     }
1940 
1941     DT->changeImmediateDominator(LoopHeader, Preheader);
1942     if (MemCheckBlock) {
1943       DT->eraseNode(MemCheckBlock);
1944       LI->removeBlock(MemCheckBlock);
1945     }
1946     if (SCEVCheckBlock) {
1947       DT->eraseNode(SCEVCheckBlock);
1948       LI->removeBlock(SCEVCheckBlock);
1949     }
1950   }
1951 
1952   /// Remove the created SCEV & memory runtime check blocks & instructions, if
1953   /// unused.
1954   ~GeneratedRTChecks() {
1955     SCEVExpanderCleaner SCEVCleaner(SCEVExp, *DT);
1956     SCEVExpanderCleaner MemCheckCleaner(MemCheckExp, *DT);
1957     if (!SCEVCheckCond)
1958       SCEVCleaner.markResultUsed();
1959 
1960     if (!MemRuntimeCheckCond)
1961       MemCheckCleaner.markResultUsed();
1962 
1963     if (MemRuntimeCheckCond) {
1964       auto &SE = *MemCheckExp.getSE();
1965       // Memory runtime check generation creates compares that use expanded
1966       // values. Remove them before running the SCEVExpanderCleaners.
1967       for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
1968         if (MemCheckExp.isInsertedInstruction(&I))
1969           continue;
1970         SE.forgetValue(&I);
1971         SE.eraseValueFromMap(&I);
1972         I.eraseFromParent();
1973       }
1974     }
1975     MemCheckCleaner.cleanup();
1976     SCEVCleaner.cleanup();
1977 
1978     if (SCEVCheckCond)
1979       SCEVCheckBlock->eraseFromParent();
1980     if (MemRuntimeCheckCond)
1981       MemCheckBlock->eraseFromParent();
1982   }
1983 
1984   /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
1985   /// adjusts the branches to branch to the vector preheader or \p Bypass,
1986   /// depending on the generated condition.
1987   BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass,
1988                              BasicBlock *LoopVectorPreHeader,
1989                              BasicBlock *LoopExitBlock) {
1990     if (!SCEVCheckCond)
1991       return nullptr;
1992     if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond))
1993       if (C->isZero())
1994         return nullptr;
1995 
1996     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
1997 
1998     BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
    // If the vector preheader is part of a loop, the SCEV check block becomes
    // part of that loop as well.
2000     if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2001       PL->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2002 
2003     SCEVCheckBlock->getTerminator()->eraseFromParent();
2004     SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2005     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2006                                                 SCEVCheckBlock);
2007 
2008     DT->addNewBlock(SCEVCheckBlock, Pred);
2009     DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2010 
2011     ReplaceInstWithInst(
2012         SCEVCheckBlock->getTerminator(),
2013         BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond));
2014     // Mark the check as used, to prevent it from being removed during cleanup.
2015     SCEVCheckCond = nullptr;
2016     return SCEVCheckBlock;
2017   }
2018 
2019   /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2020   /// the branches to branch to the vector preheader or \p Bypass, depending on
2021   /// the generated condition.
2022   BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass,
2023                                    BasicBlock *LoopVectorPreHeader) {
    // Check if we generated code that checks at runtime whether arrays overlap.
2025     if (!MemRuntimeCheckCond)
2026       return nullptr;
2027 
2028     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2029     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2030                                                 MemCheckBlock);
2031 
2032     DT->addNewBlock(MemCheckBlock, Pred);
2033     DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2034     MemCheckBlock->moveBefore(LoopVectorPreHeader);
2035 
2036     if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2037       PL->addBasicBlockToLoop(MemCheckBlock, *LI);
2038 
2039     ReplaceInstWithInst(
2040         MemCheckBlock->getTerminator(),
2041         BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond));
2042     MemCheckBlock->getTerminator()->setDebugLoc(
2043         Pred->getTerminator()->getDebugLoc());
2044 
2045     // Mark the check as used, to prevent it from being removed during cleanup.
2046     MemRuntimeCheckCond = nullptr;
2047     return MemCheckBlock;
2048   }
2049 };
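
// A minimal usage sketch for GeneratedRTChecks (block and variable names are
// placeholders; the actual driver appears later in this file):
//
//   GeneratedRTChecks Checks(*SE, DT, LI, F->getParent()->getDataLayout());
//   Checks.Create(L, *LAI, PSE.getUnionPredicate());
//   // ... use the generated blocks to estimate the runtime-check cost ...
//   if (DecidedToVectorize) {
//     Checks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock);
//     Checks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader);
//   }
//   // Otherwise the destructor removes the unused check blocks and their
//   // expanded instructions.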
2050 
2051 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
// vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If
// the vector length information is not provided, vectorization is not
// considered explicit. Interleave hints are not allowed either. These
// limitations will be relaxed in the future.
// Please note that we are currently forced to abuse the pragma 'clang
// vectorize' semantics. This pragma provides *auto-vectorization hints*
2059 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2060 // provides *explicit vectorization hints* (LV can bypass legal checks and
2061 // assume that vectorization is legal). However, both hints are implemented
2062 // using the same metadata (llvm.loop.vectorize, processed by
2063 // LoopVectorizeHints). This will be fixed in the future when the native IR
2064 // representation for pragma 'omp simd' is introduced.
2065 static bool isExplicitVecOuterLoop(Loop *OuterLp,
2066                                    OptimizationRemarkEmitter *ORE) {
2067   assert(!OuterLp->isInnermost() && "This is not an outer loop");
2068   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2069 
2070   // Only outer loops with an explicit vectorization hint are supported.
2071   // Unannotated outer loops are ignored.
2072   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2073     return false;
2074 
2075   Function *Fn = OuterLp->getHeader()->getParent();
2076   if (!Hints.allowVectorization(Fn, OuterLp,
2077                                 true /*VectorizeOnlyWhenForced*/)) {
2078     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2079     return false;
2080   }
2081 
2082   if (Hints.getInterleave() > 1) {
2083     // TODO: Interleave support is future work.
2084     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2085                          "outer loops.\n");
2086     Hints.emitRemarkWithHints();
2087     return false;
2088   }
2089 
2090   return true;
2091 }
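
// For example, an outer loop accepted by the predicate above could be written
// in source as follows (illustrative C/C++, not taken from this file):
//
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (int i = 0; i < N; ++i)     // explicitly annotated outer loop
//     for (int j = 0; j < M; ++j)
//       A[i][j] += B[i][j];
//
// or equivalently with '#pragma omp simd simdlen(4)' when OpenMP (simd) is
// enabled.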
2092 
2093 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2094                                   OptimizationRemarkEmitter *ORE,
2095                                   SmallVectorImpl<Loop *> &V) {
2096   // Collect inner loops and outer loops without irreducible control flow. For
2097   // now, only collect outer loops that have explicit vectorization hints. If we
2098   // are stress testing the VPlan H-CFG construction, we collect the outermost
2099   // loop of every loop nest.
2100   if (L.isInnermost() || VPlanBuildStressTest ||
2101       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
2102     LoopBlocksRPO RPOT(&L);
2103     RPOT.perform(LI);
2104     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2105       V.push_back(&L);
2106       // TODO: Collect inner loops inside marked outer loops in case
2107       // vectorization fails for the outer loop. Do not invoke
2108       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2109       // already known to be reducible. We can use an inherited attribute for
2110       // that.
2111       return;
2112     }
2113   }
2114   for (Loop *InnerL : L)
2115     collectSupportedLoops(*InnerL, LI, ORE, V);
2116 }
2117 
2118 namespace {
2119 
2120 /// The LoopVectorize Pass.
2121 struct LoopVectorize : public FunctionPass {
2122   /// Pass identification, replacement for typeid
2123   static char ID;
2124 
2125   LoopVectorizePass Impl;
2126 
2127   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
2128                          bool VectorizeOnlyWhenForced = false)
2129       : FunctionPass(ID),
2130         Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
2131     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
2132   }
2133 
2134   bool runOnFunction(Function &F) override {
2135     if (skipFunction(F))
2136       return false;
2137 
2138     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
2139     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
2140     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
2141     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
2142     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
2143     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
2144     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
2145     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2146     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2147     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
2148     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
2149     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
2150     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
2151 
2152     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
2153         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
2154 
2155     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
2156                         GetLAA, *ORE, PSI).MadeAnyChange;
2157   }
2158 
2159   void getAnalysisUsage(AnalysisUsage &AU) const override {
2160     AU.addRequired<AssumptionCacheTracker>();
2161     AU.addRequired<BlockFrequencyInfoWrapperPass>();
2162     AU.addRequired<DominatorTreeWrapperPass>();
2163     AU.addRequired<LoopInfoWrapperPass>();
2164     AU.addRequired<ScalarEvolutionWrapperPass>();
2165     AU.addRequired<TargetTransformInfoWrapperPass>();
2166     AU.addRequired<AAResultsWrapperPass>();
2167     AU.addRequired<LoopAccessLegacyAnalysis>();
2168     AU.addRequired<DemandedBitsWrapperPass>();
2169     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
2170     AU.addRequired<InjectTLIMappingsLegacy>();
2171 
2172     // We currently do not preserve loopinfo/dominator analyses with outer loop
2173     // vectorization. Until this is addressed, mark these analyses as preserved
2174     // only for non-VPlan-native path.
2175     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
2176     if (!EnableVPlanNativePath) {
2177       AU.addPreserved<LoopInfoWrapperPass>();
2178       AU.addPreserved<DominatorTreeWrapperPass>();
2179     }
2180 
2181     AU.addPreserved<BasicAAWrapperPass>();
2182     AU.addPreserved<GlobalsAAWrapperPass>();
2183     AU.addRequired<ProfileSummaryInfoWrapperPass>();
2184   }
2185 };
2186 
2187 } // end anonymous namespace
2188 
2189 //===----------------------------------------------------------------------===//
2190 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2191 // LoopVectorizationCostModel and LoopVectorizationPlanner.
2192 //===----------------------------------------------------------------------===//
2193 
2194 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
  // We need to place the broadcast of invariant variables outside the loop,
  // but only if it's proven safe to do so. Otherwise, the broadcast will stay
  // inside the vector loop body.
2198   Instruction *Instr = dyn_cast<Instruction>(V);
2199   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
2200                      (!Instr ||
2201                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
2202   // Place the code for broadcasting invariant variables in the new preheader.
2203   IRBuilder<>::InsertPointGuard Guard(Builder);
2204   if (SafeToHoist)
2205     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2206 
2207   // Broadcast the scalar into all locations in the vector.
2208   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
2209 
2210   return Shuf;
2211 }
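
// Conceptually, for a loop-invariant i32 value X and a fixed VF of 4, the
// splat created above is equivalent to the following IRBuilder call (sketch
// with hypothetical values):
//
//   Value *Splat =
//       Builder.CreateVectorSplat(ElementCount::getFixed(4), X, "broadcast");
//   // i.e. an insertelement followed by a zero-mask shufflevector, producing
//   // <4 x i32> <X, X, X, X>.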
2212 
2213 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
2214     const InductionDescriptor &II, Value *Step, Value *Start,
2215     Instruction *EntryVal, VPValue *Def, VPValue *CastDef,
2216     VPTransformState &State) {
2217   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
2218          "Expected either an induction phi-node or a truncate of it!");
2219 
2220   // Construct the initial value of the vector IV in the vector loop preheader
2221   auto CurrIP = Builder.saveIP();
2222   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2223   if (isa<TruncInst>(EntryVal)) {
2224     assert(Start->getType()->isIntegerTy() &&
2225            "Truncation requires an integer type");
2226     auto *TruncType = cast<IntegerType>(EntryVal->getType());
2227     Step = Builder.CreateTrunc(Step, TruncType);
2228     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
2229   }
2230   Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
2231   Value *SteppedStart =
2232       getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
2233 
2234   // We create vector phi nodes for both integer and floating-point induction
2235   // variables. Here, we determine the kind of arithmetic we will perform.
2236   Instruction::BinaryOps AddOp;
2237   Instruction::BinaryOps MulOp;
2238   if (Step->getType()->isIntegerTy()) {
2239     AddOp = Instruction::Add;
2240     MulOp = Instruction::Mul;
2241   } else {
2242     AddOp = II.getInductionOpcode();
2243     MulOp = Instruction::FMul;
2244   }
2245 
2246   // Multiply the vectorization factor by the step using integer or
2247   // floating-point arithmetic as appropriate.
2248   Value *ConstVF =
2249       getSignedIntOrFpConstant(Step->getType(), VF.getKnownMinValue());
2250   Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
2251 
2252   // Create a vector splat to use in the induction update.
2253   //
2254   // FIXME: If the step is non-constant, we create the vector splat with
2255   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
2256   //        handle a constant vector splat.
2257   assert(!VF.isScalable() && "scalable vectors not yet supported.");
2258   Value *SplatVF = isa<Constant>(Mul)
2259                        ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
2260                        : Builder.CreateVectorSplat(VF, Mul);
2261   Builder.restoreIP(CurrIP);
2262 
2263   // We may need to add the step a number of times, depending on the unroll
2264   // factor. The last of those goes into the PHI.
2265   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
2266                                     &*LoopVectorBody->getFirstInsertionPt());
2267   VecInd->setDebugLoc(EntryVal->getDebugLoc());
2268   Instruction *LastInduction = VecInd;
2269   for (unsigned Part = 0; Part < UF; ++Part) {
2270     State.set(Def, LastInduction, Part);
2271 
2272     if (isa<TruncInst>(EntryVal))
2273       addMetadata(LastInduction, EntryVal);
2274     recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, CastDef,
2275                                           State, Part);
2276 
2277     LastInduction = cast<Instruction>(addFastMathFlag(
2278         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
2279     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
2280   }
2281 
2282   // Move the last step to the end of the latch block. This ensures consistent
2283   // placement of all induction updates.
2284   auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
2285   auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
2286   auto *ICmp = cast<Instruction>(Br->getCondition());
2287   LastInduction->moveBefore(ICmp);
2288   LastInduction->setName("vec.ind.next");
2289 
2290   VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
2291   VecInd->addIncoming(LastInduction, LoopVectorLatch);
2292 }
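
// The structure built above, for an integer IV starting at 0 with step 1,
// VF = 4 and UF = 2, looks roughly like this (illustrative sketch; the value
// names follow the ones set in the code):
//
//   %vec.ind      = phi <4 x i32> [ <0,1,2,3>, %preheader ],
//                                 [ %vec.ind.next, %latch ]
//   %step.add     = add <4 x i32> %vec.ind, <4,4,4,4>   ; value for part 1
//   %vec.ind.next = add <4 x i32> %step.add, <4,4,4,4>  ; moved before the
//                                                       ; latch compare
//
// Part 0 uses %vec.ind directly, part 1 uses %step.add, and only the final
// %vec.ind.next feeds the back edge of the PHI.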
2293 
2294 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
2295   return Cost->isScalarAfterVectorization(I, VF) ||
2296          Cost->isProfitableToScalarize(I, VF);
2297 }
2298 
2299 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
2300   if (shouldScalarizeInstruction(IV))
2301     return true;
2302   auto isScalarInst = [&](User *U) -> bool {
2303     auto *I = cast<Instruction>(U);
2304     return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
2305   };
2306   return llvm::any_of(IV->users(), isScalarInst);
2307 }
2308 
2309 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
2310     const InductionDescriptor &ID, const Instruction *EntryVal,
2311     Value *VectorLoopVal, VPValue *CastDef, VPTransformState &State,
2312     unsigned Part, unsigned Lane) {
2313   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
2314          "Expected either an induction phi-node or a truncate of it!");
2315 
  // This induction variable is not the phi from the original loop but the
  // newly-created IV based on the proof that the casted Phi is equal to the
  // uncasted Phi in the vectorized loop (possibly under a runtime guard). It
  // re-uses the same InductionDescriptor as the original IV, but we don't
  // have to do any recording in this case - that is done when the original IV
  // is processed.
2322   if (isa<TruncInst>(EntryVal))
2323     return;
2324 
2325   const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
2326   if (Casts.empty())
2327     return;
  // Only the first Cast instruction in the Casts vector is of interest.
  // The remaining Casts (if any) have no uses outside the
  // induction update chain itself.
2331   if (Lane < UINT_MAX)
2332     State.set(CastDef, VectorLoopVal, VPIteration(Part, Lane));
2333   else
2334     State.set(CastDef, VectorLoopVal, Part);
2335 }
2336 
2337 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start,
2338                                                 TruncInst *Trunc, VPValue *Def,
2339                                                 VPValue *CastDef,
2340                                                 VPTransformState &State) {
2341   assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
2342          "Primary induction variable must have an integer type");
2343 
2344   auto II = Legal->getInductionVars().find(IV);
2345   assert(II != Legal->getInductionVars().end() && "IV is not an induction");
2346 
2347   auto ID = II->second;
2348   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
2349 
2350   // The value from the original loop to which we are mapping the new induction
2351   // variable.
2352   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
2353 
2354   auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
2355 
  // Generate code for the induction step. Note that induction steps are
  // required to be loop-invariant.
2358   auto CreateStepValue = [&](const SCEV *Step) -> Value * {
2359     assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
2360            "Induction step should be loop invariant");
2361     if (PSE.getSE()->isSCEVable(IV->getType())) {
2362       SCEVExpander Exp(*PSE.getSE(), DL, "induction");
2363       return Exp.expandCodeFor(Step, Step->getType(),
2364                                LoopVectorPreHeader->getTerminator());
2365     }
2366     return cast<SCEVUnknown>(Step)->getValue();
2367   };
2368 
2369   // The scalar value to broadcast. This is derived from the canonical
2370   // induction variable. If a truncation type is given, truncate the canonical
2371   // induction variable and step. Otherwise, derive these values from the
2372   // induction descriptor.
2373   auto CreateScalarIV = [&](Value *&Step) -> Value * {
2374     Value *ScalarIV = Induction;
2375     if (IV != OldInduction) {
2376       ScalarIV = IV->getType()->isIntegerTy()
2377                      ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
2378                      : Builder.CreateCast(Instruction::SIToFP, Induction,
2379                                           IV->getType());
2380       ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
2381       ScalarIV->setName("offset.idx");
2382     }
2383     if (Trunc) {
2384       auto *TruncType = cast<IntegerType>(Trunc->getType());
2385       assert(Step->getType()->isIntegerTy() &&
2386              "Truncation requires an integer step");
2387       ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
2388       Step = Builder.CreateTrunc(Step, TruncType);
2389     }
2390     return ScalarIV;
2391   };
2392 
2393   // Create the vector values from the scalar IV, in the absence of creating a
2394   // vector IV.
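  // E.g., with VF = 4, UF = 2 and an integer IV with step S, part 0 becomes
  // <ScalarIV, ScalarIV + S, ScalarIV + 2*S, ScalarIV + 3*S> and part 1
  // becomes <ScalarIV + 4*S, ..., ScalarIV + 7*S>.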
2395   auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
2396     Value *Broadcasted = getBroadcastInstrs(ScalarIV);
2397     for (unsigned Part = 0; Part < UF; ++Part) {
2398       assert(!VF.isScalable() && "scalable vectors not yet supported.");
2399       Value *EntryPart =
2400           getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step,
2401                         ID.getInductionOpcode());
2402       State.set(Def, EntryPart, Part);
2403       if (Trunc)
2404         addMetadata(EntryPart, Trunc);
2405       recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, CastDef,
2406                                             State, Part);
2407     }
2408   };
2409 
2410   // Now do the actual transformations, and start with creating the step value.
2411   Value *Step = CreateStepValue(ID.getStep());
2412   if (VF.isZero() || VF.isScalar()) {
2413     Value *ScalarIV = CreateScalarIV(Step);
2414     CreateSplatIV(ScalarIV, Step);
2415     return;
2416   }
2417 
2418   // Determine if we want a scalar version of the induction variable. This is
2419   // true if the induction variable itself is not widened, or if it has at
2420   // least one user in the loop that is not widened.
2421   auto NeedsScalarIV = needsScalarInduction(EntryVal);
2422   if (!NeedsScalarIV) {
2423     createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef,
2424                                     State);
2425     return;
2426   }
2427 
2428   // Try to create a new independent vector induction variable. If we can't
2429   // create the phi node, we will splat the scalar induction variable in each
2430   // loop iteration.
2431   if (!shouldScalarizeInstruction(EntryVal)) {
2432     createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef,
2433                                     State);
2434     Value *ScalarIV = CreateScalarIV(Step);
2435     // Create scalar steps that can be used by instructions we will later
2436     // scalarize. Note that the addition of the scalar steps will not increase
2437     // the number of instructions in the loop in the common case prior to
2438     // InstCombine. We will be trading one vector extract for each scalar step.
2439     buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State);
2440     return;
2441   }
2442 
  // All IV users are scalar instructions, so only emit a scalar IV, not a
  // vectorised IV, except when we tail-fold, in which case the splat IV
  // feeds the predicate used by the masked loads/stores.
2446   Value *ScalarIV = CreateScalarIV(Step);
2447   if (!Cost->isScalarEpilogueAllowed())
2448     CreateSplatIV(ScalarIV, Step);
2449   buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State);
2450 }
2451 
2452 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
2453                                           Instruction::BinaryOps BinOp) {
2454   // Create and check the types.
2455   auto *ValVTy = cast<FixedVectorType>(Val->getType());
2456   int VLen = ValVTy->getNumElements();
2457 
2458   Type *STy = Val->getType()->getScalarType();
2459   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2460          "Induction Step must be an integer or FP");
2461   assert(Step->getType() == STy && "Step has wrong type");
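  // Lane i of the result is Val[i] op (StartIdx + i) * Step, where 'op' is an
  // integer add or the given FAdd/FSub. E.g., with VLen = 4, StartIdx = 8 and
  // an integer Step of 2, the per-lane offsets added to Val are
  // <16, 18, 20, 22>.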
2462 
2463   SmallVector<Constant *, 8> Indices;
2464 
2465   if (STy->isIntegerTy()) {
    // Create a vector of consecutive indices starting at StartIdx.
2467     for (int i = 0; i < VLen; ++i)
2468       Indices.push_back(ConstantInt::get(STy, StartIdx + i));
2469 
2470     // Add the consecutive indices to the vector value.
2471     Constant *Cv = ConstantVector::get(Indices);
2472     assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
2473     Step = Builder.CreateVectorSplat(VLen, Step);
2474     assert(Step->getType() == Val->getType() && "Invalid step vec");
    // FIXME: The newly created binary instructions should contain nsw/nuw
    // flags, which can be found from the original scalar operations.
2477     Step = Builder.CreateMul(Cv, Step);
2478     return Builder.CreateAdd(Val, Step, "induction");
2479   }
2480 
2481   // Floating point induction.
2482   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2483          "Binary Opcode should be specified for FP induction");
  // Create a vector of consecutive indices starting at StartIdx.
2485   for (int i = 0; i < VLen; ++i)
2486     Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
2487 
2488   // Add the consecutive indices to the vector value.
2489   Constant *Cv = ConstantVector::get(Indices);
2490 
2491   Step = Builder.CreateVectorSplat(VLen, Step);
2492 
2493   // Floating point operations had to be 'fast' to enable the induction.
2494   FastMathFlags Flags;
2495   Flags.setFast();
2496 
2497   Value *MulOp = Builder.CreateFMul(Cv, Step);
2498   if (isa<Instruction>(MulOp))
    // We have to check because MulOp may be a constant.
2500     cast<Instruction>(MulOp)->setFastMathFlags(Flags);
2501 
2502   Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2503   if (isa<Instruction>(BOp))
2504     cast<Instruction>(BOp)->setFastMathFlags(Flags);
2505   return BOp;
2506 }
2507 
2508 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
2509                                            Instruction *EntryVal,
2510                                            const InductionDescriptor &ID,
2511                                            VPValue *Def, VPValue *CastDef,
2512                                            VPTransformState &State) {
2513   // We shouldn't have to build scalar steps if we aren't vectorizing.
2514   assert(VF.isVector() && "VF should be greater than one");
  // Get the value type and ensure it and the step have the same type.
2516   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2517   assert(ScalarIVTy == Step->getType() &&
2518          "Val and Step should have the same type");
2519 
2520   // We build scalar steps for both integer and floating-point induction
2521   // variables. Here, we determine the kind of arithmetic we will perform.
2522   Instruction::BinaryOps AddOp;
2523   Instruction::BinaryOps MulOp;
2524   if (ScalarIVTy->isIntegerTy()) {
2525     AddOp = Instruction::Add;
2526     MulOp = Instruction::Mul;
2527   } else {
2528     AddOp = ID.getInductionOpcode();
2529     MulOp = Instruction::FMul;
2530   }
2531 
2532   // Determine the number of scalars we need to generate for each unroll
2533   // iteration. If EntryVal is uniform, we only need to generate the first
2534   // lane. Otherwise, we generate all VF values.
2535   unsigned Lanes =
2536       Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF)
2537           ? 1
2538           : VF.getKnownMinValue();
2539   assert((!VF.isScalable() || Lanes == 1) &&
2540          "Should never scalarize a scalable vector");
2541   // Compute the scalar steps and save the results in State.
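  // E.g., with VF = 4 and an integer IV, the value for part P, lane L is
  // ScalarIV + (4 * P + L) * Step.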
2542   for (unsigned Part = 0; Part < UF; ++Part) {
2543     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2544       auto *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
2545                                          ScalarIVTy->getScalarSizeInBits());
2546       Value *StartIdx =
2547           createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF);
2548       if (ScalarIVTy->isFloatingPointTy())
2549         StartIdx = Builder.CreateSIToFP(StartIdx, ScalarIVTy);
2550       StartIdx = addFastMathFlag(Builder.CreateBinOp(
2551           AddOp, StartIdx, getSignedIntOrFpConstant(ScalarIVTy, Lane)));
2552       // The step returned by `createStepForVF` is a runtime-evaluated value
2553       // when VF is scalable. Otherwise, it should be folded into a Constant.
2554       assert((VF.isScalable() || isa<Constant>(StartIdx)) &&
2555              "Expected StartIdx to be folded to a constant when VF is not "
2556              "scalable");
2557       auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
2558       auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
2559       State.set(Def, Add, VPIteration(Part, Lane));
2560       recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State,
2561                                             Part, Lane);
2562     }
2563   }
2564 }
2565 
2566 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def,
2567                                                     const VPIteration &Instance,
2568                                                     VPTransformState &State) {
2569   Value *ScalarInst = State.get(Def, Instance);
2570   Value *VectorValue = State.get(Def, Instance.Part);
2571   VectorValue = Builder.CreateInsertElement(
2572       VectorValue, ScalarInst, State.Builder.getInt32(Instance.Lane));
2573   State.set(Def, VectorValue, Instance.Part);
2574 }
2575 
2576 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2577   assert(Vec->getType()->isVectorTy() && "Invalid type");
2578   assert(!VF.isScalable() && "Cannot reverse scalable vectors");
2579   SmallVector<int, 8> ShuffleMask;
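  // Build a mask that reverses the lane order, e.g. <3, 2, 1, 0> for VF = 4.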
2580   for (unsigned i = 0; i < VF.getKnownMinValue(); ++i)
2581     ShuffleMask.push_back(VF.getKnownMinValue() - i - 1);
2582 
2583   return Builder.CreateShuffleVector(Vec, ShuffleMask, "reverse");
2584 }
2585 
2586 // Return whether we allow using masked interleave-groups (for dealing with
2587 // strided loads/stores that reside in predicated blocks, or for dealing
2588 // with gaps).
2589 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2590   // If an override option has been passed in for interleaved accesses, use it.
2591   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2592     return EnableMaskedInterleavedMemAccesses;
2593 
2594   return TTI.enableMaskedInterleavedAccessVectorization();
2595 }
2596 
2597 // Try to vectorize the interleave group that \p Instr belongs to.
2598 //
2599 // E.g. Translate following interleaved load group (factor = 3):
2600 //   for (i = 0; i < N; i+=3) {
2601 //     R = Pic[i];             // Member of index 0
2602 //     G = Pic[i+1];           // Member of index 1
2603 //     B = Pic[i+2];           // Member of index 2
2604 //     ... // do something to R, G, B
2605 //   }
2606 // To:
2607 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2608 //   %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9>   ; R elements
2609 //   %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10>  ; G elements
2610 //   %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11>  ; B elements
2611 //
2612 // Or translate following interleaved store group (factor = 3):
2613 //   for (i = 0; i < N; i+=3) {
2614 //     ... do something to R, G, B
2615 //     Pic[i]   = R;           // Member of index 0
2616 //     Pic[i+1] = G;           // Member of index 1
2617 //     Pic[i+2] = B;           // Member of index 2
2618 //   }
2619 // To:
2620 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2621 //   %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2622 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2623 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2624 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2625 void InnerLoopVectorizer::vectorizeInterleaveGroup(
2626     const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
2627     VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
2628     VPValue *BlockInMask) {
2629   Instruction *Instr = Group->getInsertPos();
2630   const DataLayout &DL = Instr->getModule()->getDataLayout();
2631 
2632   // Prepare for the vector type of the interleaved load/store.
2633   Type *ScalarTy = getMemInstValueType(Instr);
2634   unsigned InterleaveFactor = Group->getFactor();
2635   assert(!VF.isScalable() && "scalable vectors not yet supported.");
2636   auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2637 
2638   // Prepare for the new pointers.
2639   SmallVector<Value *, 2> AddrParts;
2640   unsigned Index = Group->getIndex(Instr);
2641 
2642   // TODO: extend the masked interleaved-group support to reversed access.
2643   assert((!BlockInMask || !Group->isReverse()) &&
2644          "Reversed masked interleave-group not supported.");
2645 
2646   // If the group is reverse, adjust the index to refer to the last vector lane
2647   // instead of the first. We adjust the index from the first vector lane,
2648   // rather than directly getting the pointer for lane VF - 1, because the
2649   // pointer operand of the interleaved access is supposed to be uniform. For
2650   // uniform instructions, we're only required to generate a value for the
2651   // first vector lane in each unroll iteration.
2652   assert(!VF.isScalable() &&
2653          "scalable vector reverse operation is not implemented");
2654   if (Group->isReverse())
2655     Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
2656 
2657   for (unsigned Part = 0; Part < UF; Part++) {
2658     Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
2659     setDebugLocFromInst(Builder, AddrPart);
2660 
    // Notice that the current instruction could be at any member index, so
    // the address needs to be adjusted back to the member of index 0.
    //
    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
    //       b = A[i];       // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
    //
    // E.g.  A[i+1] = a;     // Member of index 1
    //       A[i]   = b;     // Member of index 0
    //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].
2672 
2673     bool InBounds = false;
2674     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2675       InBounds = gep->isInBounds();
2676     AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2677     cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2678 
2679     // Cast to the vector pointer type.
2680     unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2681     Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2682     AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2683   }
2684 
2685   setDebugLocFromInst(Builder, Instr);
2686   Value *PoisonVec = PoisonValue::get(VecTy);
2687 
2688   Value *MaskForGaps = nullptr;
2689   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2690     assert(!VF.isScalable() && "scalable vectors not yet supported.");
2691     MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2692     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2693   }
2694 
2695   // Vectorize the interleaved load group.
2696   if (isa<LoadInst>(Instr)) {
2697     // For each unroll part, create a wide load for the group.
2698     SmallVector<Value *, 2> NewLoads;
2699     for (unsigned Part = 0; Part < UF; Part++) {
2700       Instruction *NewLoad;
2701       if (BlockInMask || MaskForGaps) {
2702         assert(useMaskedInterleavedAccesses(*TTI) &&
2703                "masked interleaved groups are not allowed.");
2704         Value *GroupMask = MaskForGaps;
2705         if (BlockInMask) {
2706           Value *BlockInMaskPart = State.get(BlockInMask, Part);
2707           assert(!VF.isScalable() && "scalable vectors not yet supported.");
2708           Value *ShuffledMask = Builder.CreateShuffleVector(
2709               BlockInMaskPart,
2710               createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2711               "interleaved.mask");
2712           GroupMask = MaskForGaps
2713                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2714                                                 MaskForGaps)
2715                           : ShuffledMask;
2716         }
2717         NewLoad =
2718             Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(),
2719                                      GroupMask, PoisonVec, "wide.masked.vec");
      } else
2722         NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2723                                             Group->getAlign(), "wide.vec");
2724       Group->addMetadata(NewLoad);
2725       NewLoads.push_back(NewLoad);
2726     }
2727 
2728     // For each member in the group, shuffle out the appropriate data from the
2729     // wide loads.
2730     unsigned J = 0;
2731     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2732       Instruction *Member = Group->getMember(I);
2733 
2734       // Skip the gaps in the group.
2735       if (!Member)
2736         continue;
2737 
2738       assert(!VF.isScalable() && "scalable vectors not yet supported.");
2739       auto StrideMask =
2740           createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2741       for (unsigned Part = 0; Part < UF; Part++) {
2742         Value *StridedVec = Builder.CreateShuffleVector(
2743             NewLoads[Part], StrideMask, "strided.vec");
2744 
        // If this member has a different type, cast the result to that type.
2746         if (Member->getType() != ScalarTy) {
2747           assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2748           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2749           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2750         }
2751 
2752         if (Group->isReverse())
2753           StridedVec = reverseVector(StridedVec);
2754 
2755         State.set(VPDefs[J], StridedVec, Part);
2756       }
2757       ++J;
2758     }
2759     return;
2760   }
2761 
  // The sub vector type for the current instruction.
2763   assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2764   auto *SubVT = VectorType::get(ScalarTy, VF);
2765 
2766   // Vectorize the interleaved store group.
2767   for (unsigned Part = 0; Part < UF; Part++) {
2768     // Collect the stored vector from each member.
2769     SmallVector<Value *, 4> StoredVecs;
2770     for (unsigned i = 0; i < InterleaveFactor; i++) {
      // An interleaved store group doesn't allow a gap, so each index has a
      // member.
      assert(Group->getMember(i) &&
             "Fail to get a member from an interleaved store group");
2773 
2774       Value *StoredVec = State.get(StoredValues[i], Part);
2775 
2776       if (Group->isReverse())
2777         StoredVec = reverseVector(StoredVec);
2778 
      // If this member has a different type, cast it to the unified type.
      if (StoredVec->getType() != SubVT)
2782         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2783 
2784       StoredVecs.push_back(StoredVec);
2785     }
2786 
2787     // Concatenate all vectors into a wide vector.
2788     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2789 
2790     // Interleave the elements in the wide vector.
2791     assert(!VF.isScalable() && "scalable vectors not yet supported.");
2792     Value *IVec = Builder.CreateShuffleVector(
2793         WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
2794         "interleaved.vec");
2795 
2796     Instruction *NewStoreInstr;
2797     if (BlockInMask) {
2798       Value *BlockInMaskPart = State.get(BlockInMask, Part);
2799       Value *ShuffledMask = Builder.CreateShuffleVector(
2800           BlockInMaskPart,
2801           createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2802           "interleaved.mask");
2803       NewStoreInstr = Builder.CreateMaskedStore(
2804           IVec, AddrParts[Part], Group->getAlign(), ShuffledMask);
    } else
2807       NewStoreInstr =
2808           Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2809 
2810     Group->addMetadata(NewStoreInstr);
2811   }
2812 }
2813 
2814 void InnerLoopVectorizer::vectorizeMemoryInstruction(
2815     Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr,
2816     VPValue *StoredValue, VPValue *BlockInMask) {
2817   // Attempt to issue a wide load.
2818   LoadInst *LI = dyn_cast<LoadInst>(Instr);
2819   StoreInst *SI = dyn_cast<StoreInst>(Instr);
2820 
2821   assert((LI || SI) && "Invalid Load/Store instruction");
2822   assert((!SI || StoredValue) && "No stored value provided for widened store");
2823   assert((!LI || !StoredValue) && "Stored value provided for widened load");
2824 
2825   LoopVectorizationCostModel::InstWidening Decision =
2826       Cost->getWideningDecision(Instr, VF);
2827   assert((Decision == LoopVectorizationCostModel::CM_Widen ||
2828           Decision == LoopVectorizationCostModel::CM_Widen_Reverse ||
2829           Decision == LoopVectorizationCostModel::CM_GatherScatter) &&
2830          "CM decision is not to widen the memory instruction");
2831 
2832   Type *ScalarDataTy = getMemInstValueType(Instr);
2833 
2834   auto *DataTy = VectorType::get(ScalarDataTy, VF);
2835   const Align Alignment = getLoadStoreAlignment(Instr);
2836 
2837   // Determine if the pointer operand of the access is either consecutive or
2838   // reverse consecutive.
2839   bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2840   bool ConsecutiveStride =
2841       Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2842   bool CreateGatherScatter =
2843       (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2844 
2845   // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2846   // gather/scatter. Otherwise Decision should have been to Scalarize.
2847   assert((ConsecutiveStride || CreateGatherScatter) &&
2848          "The instruction should be scalarized");
2849   (void)ConsecutiveStride;
2850 
2851   VectorParts BlockInMaskParts(UF);
2852   bool isMaskRequired = BlockInMask;
2853   if (isMaskRequired)
2854     for (unsigned Part = 0; Part < UF; ++Part)
2855       BlockInMaskParts[Part] = State.get(BlockInMask, Part);
2856 
2857   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2858     // Calculate the pointer for the specific unroll-part.
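    // E.g., with VF = 4: the forward case for part P produces a GEP of
    // +4 * P elements, while the reverse case produces GEPs of -4 * P and
    // then 1 - 4 = -3 elements, so part P covers elements [-4P - 3, -4P];
    // the loaded or stored values are reversed separately to match the
    // scalar iteration order.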
2859     GetElementPtrInst *PartPtr = nullptr;
2860 
2861     bool InBounds = false;
2862     if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2863       InBounds = gep->isInBounds();
2864 
2865     if (Reverse) {
2866       assert(!VF.isScalable() &&
2867              "Reversing vectors is not yet supported for scalable vectors.");
2868 
2869       // If the address is consecutive but reversed, then the
2870       // wide store needs to start at the last vector element.
2871       PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
2872           ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.getKnownMinValue())));
2873       PartPtr->setIsInBounds(InBounds);
2874       PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
2875           ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.getKnownMinValue())));
2876       PartPtr->setIsInBounds(InBounds);
2877       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2878         BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
2879     } else {
2880       Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF);
2881       PartPtr = cast<GetElementPtrInst>(
2882           Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
2883       PartPtr->setIsInBounds(InBounds);
2884     }
2885 
2886     unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
2887     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2888   };
2889 
2890   // Handle Stores:
2891   if (SI) {
2892     setDebugLocFromInst(Builder, SI);
2893 
2894     for (unsigned Part = 0; Part < UF; ++Part) {
2895       Instruction *NewSI = nullptr;
2896       Value *StoredVal = State.get(StoredValue, Part);
2897       if (CreateGatherScatter) {
2898         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2899         Value *VectorGep = State.get(Addr, Part);
2900         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
2901                                             MaskPart);
2902       } else {
2903         if (Reverse) {
2904           // If we store to reverse consecutive memory locations, then we need
2905           // to reverse the order of elements in the stored value.
2906           StoredVal = reverseVector(StoredVal);
2907           // We don't want to update the value in the map as it might be used in
2908           // another expression. So don't call resetVectorValue(StoredVal).
2909         }
2910         auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0)));
2911         if (isMaskRequired)
2912           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
2913                                             BlockInMaskParts[Part]);
2914         else
2915           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
2916       }
2917       addMetadata(NewSI, SI);
2918     }
2919     return;
2920   }
2921 
2922   // Handle loads.
2923   assert(LI && "Must have a load instruction");
2924   setDebugLocFromInst(Builder, LI);
2925   for (unsigned Part = 0; Part < UF; ++Part) {
2926     Value *NewLI;
2927     if (CreateGatherScatter) {
2928       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2929       Value *VectorGep = State.get(Addr, Part);
2930       NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
2931                                          nullptr, "wide.masked.gather");
2932       addMetadata(NewLI, LI);
2933     } else {
2934       auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0)));
2935       if (isMaskRequired)
2936         NewLI = Builder.CreateMaskedLoad(
2937             VecPtr, Alignment, BlockInMaskParts[Part], PoisonValue::get(DataTy),
2938             "wide.masked.load");
2939       else
2940         NewLI =
2941             Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
2942 
      // Add metadata to the load, but record the reversed shuffle as the
      // vector value when reversing.
2944       addMetadata(NewLI, LI);
2945       if (Reverse)
2946         NewLI = reverseVector(NewLI);
2947     }
2948 
2949     State.set(Def, NewLI, Part);
2950   }
2951 }
2952 
2953 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def,
2954                                                VPUser &User,
2955                                                const VPIteration &Instance,
2956                                                bool IfPredicateInstr,
2957                                                VPTransformState &State) {
2958   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2959 
2960   // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
2961   // the first lane and part.
2962   if (isa<NoAliasScopeDeclInst>(Instr))
2963     if (!Instance.isFirstIteration())
2964       return;
2965 
2966   setDebugLocFromInst(Builder, Instr);
2967 
  // Does this instruction return a value?
2969   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2970 
2971   Instruction *Cloned = Instr->clone();
2972   if (!IsVoidRetTy)
2973     Cloned->setName(Instr->getName() + ".cloned");
2974 
2975   State.Builder.SetInsertPoint(Builder.GetInsertBlock(),
2976                                Builder.GetInsertPoint());
2977   // Replace the operands of the cloned instructions with their scalar
2978   // equivalents in the new loop.
2979   for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) {
2980     auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op));
2981     auto InputInstance = Instance;
2982     if (!Operand || !OrigLoop->contains(Operand) ||
2983         (Cost->isUniformAfterVectorization(Operand, State.VF)))
2984       InputInstance.Lane = 0;
2985     auto *NewOp = State.get(User.getOperand(op), InputInstance);
2986     Cloned->setOperand(op, NewOp);
2987   }
2988   addNewMetadata(Cloned, Instr);
2989 
2990   // Place the cloned scalar in the new loop.
2991   Builder.Insert(Cloned);
2992 
2993   State.set(Def, Cloned, Instance);
2994 
  // If we just cloned a new assumption, add it to the assumption cache.
2996   if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2997     if (II->getIntrinsicID() == Intrinsic::assume)
2998       AC->registerAssumption(II);
2999 
3000   // End if-block.
3001   if (IfPredicateInstr)
3002     PredicatedInstructions.push_back(Cloned);
3003 }
3004 
3005 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
3006                                                       Value *End, Value *Step,
3007                                                       Instruction *DL) {
3008   BasicBlock *Header = L->getHeader();
3009   BasicBlock *Latch = L->getLoopLatch();
3010   // As we're just creating this loop, it's possible no latch exists
3011   // yet. If so, use the header as this will be a single block loop.
3012   if (!Latch)
3013     Latch = Header;
3014 
3015   IRBuilder<> Builder(&*Header->getFirstInsertionPt());
3016   Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
3017   setDebugLocFromInst(Builder, OldInst);
3018   auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
3019 
3020   Builder.SetInsertPoint(Latch->getTerminator());
3021   setDebugLocFromInst(Builder, OldInst);
3022 
3023   // Create i+1 and fill the PHINode.
3024   Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
3025   Induction->addIncoming(Start, L->getLoopPreheader());
3026   Induction->addIncoming(Next, Latch);
3027   // Create the compare.
3028   Value *ICmp = Builder.CreateICmpEQ(Next, End);
3029   Builder.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header);
3030 
3031   // Now we have two terminators. Remove the old one from the block.
3032   Latch->getTerminator()->eraseFromParent();
3033 
3034   return Induction;
3035 }
3036 
3037 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
3038   if (TripCount)
3039     return TripCount;
3040 
3041   assert(L && "Create Trip Count for null loop.");
3042   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
3043   // Find the loop boundaries.
3044   ScalarEvolution *SE = PSE.getSE();
3045   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
3046   assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
3047          "Invalid loop count");
3048 
3049   Type *IdxTy = Legal->getWidestInductionType();
3050   assert(IdxTy && "No type for induction");
3051 
3052   // The exit count might have the type of i64 while the phi is i32. This can
3053   // happen if we have an induction variable that is sign extended before the
3054   // compare. The only way that we get a backedge taken count is that the
3055   // induction variable was signed and as such will not overflow. In such a case
3056   // truncation is legal.
3057   if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
3058       IdxTy->getPrimitiveSizeInBits())
3059     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
3060   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
3061 
3062   // Get the total trip count from the count by adding 1.
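  // E.g., for 'for (i = 0; i < n; ++i)' with n > 0, the backedge-taken count
  // is n - 1 and the trip count expanded below is n.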
3063   const SCEV *ExitCount = SE->getAddExpr(
3064       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
3065 
3066   const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
3067 
3068   // Expand the trip count and place the new instructions in the preheader.
3069   // Notice that the pre-header does not change, only the loop body.
3070   SCEVExpander Exp(*SE, DL, "induction");
3071 
3072   // Count holds the overall loop count (N).
3073   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
3074                                 L->getLoopPreheader()->getTerminator());
3075 
3076   if (TripCount->getType()->isPointerTy())
3077     TripCount =
3078         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
3079                                     L->getLoopPreheader()->getTerminator());
3080 
3081   return TripCount;
3082 }
3083 
3084 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
3085   if (VectorTripCount)
3086     return VectorTripCount;
3087 
3088   Value *TC = getOrCreateTripCount(L);
3089   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
3090 
3091   Type *Ty = TC->getType();
3092   // This is where we can make the step a runtime constant.
3093   Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF);
3094 
3095   // If the tail is to be folded by masking, round the number of iterations N
3096   // up to a multiple of Step instead of rounding down. This is done by first
3097   // adding Step-1 and then rounding down. Note that it's ok if this addition
3098   // overflows: the vector induction variable will eventually wrap to zero given
3099   // that it starts at zero and its Step is a power of two; the loop will then
3100   // exit, with the last early-exit vector comparison also producing all-true.
3101   if (Cost->foldTailByMasking()) {
3102     assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
3103            "VF*UF must be a power of 2 when folding tail by masking");
3104     assert(!VF.isScalable() &&
3105            "Tail folding not yet supported for scalable vectors");
3106     TC = Builder.CreateAdd(
3107         TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up");
3108   }
3109 
3110   // Now we need to generate the expression for the part of the loop that the
3111   // vectorized body will execute. This is equal to N - (N % Step) if scalar
3112   // iterations are not required for correctness, or N - Step, otherwise. Step
3113   // is equal to the vectorization factor (number of SIMD elements) times the
3114   // unroll factor (number of SIMD instructions).
3115   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
3116 
3117   // There are two cases where we need to ensure (at least) the last iteration
3118   // runs in the scalar remainder loop. Thus, if the step evenly divides
3119   // the trip count, we set the remainder to be equal to the step. If the step
3120   // does not evenly divide the trip count, no adjustment is necessary since
3121   // there will already be scalar iterations. Note that the minimum iterations
3122   // check ensures that N >= Step. The cases are:
3123   // 1) If there is a non-reversed interleaved group that may speculatively
3124   //    access memory out-of-bounds.
3125   // 2) If any instruction may follow a conditionally taken exit. That is, if
3126   //    the loop contains multiple exiting blocks, or a single exiting block
3127   //    which is not the latch.
3128   if (VF.isVector() && Cost->requiresScalarEpilogue()) {
3129     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
3130     R = Builder.CreateSelect(IsZero, Step, R);
3131   }
3132 
3133   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
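  // For example, with an original trip count of 10 and VF * UF = 4:
  //  - without tail folding, R = 10 % 4 = 2 and n.vec = 8;
  //  - with tail folding, TC is first rounded up to 13, so R = 1 and
  //    n.vec = 12, i.e. the trip count rounded up to a multiple of 4;
  //  - if a scalar epilogue is required and the trip count is already a
  //    multiple of 4 (say 12), R is bumped from 0 to 4 so the last 4
  //    iterations run in the scalar remainder loop.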
3134 
3135   return VectorTripCount;
3136 }
3137 
3138 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
3139                                                    const DataLayout &DL) {
3140   // Verify that V is a vector type with same number of elements as DstVTy.
3141   auto *DstFVTy = cast<FixedVectorType>(DstVTy);
3142   unsigned VF = DstFVTy->getNumElements();
3143   auto *SrcVecTy = cast<FixedVectorType>(V->getType());
3144   assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
3145   Type *SrcElemTy = SrcVecTy->getElementType();
3146   Type *DstElemTy = DstFVTy->getElementType();
3147   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
3148          "Vector elements must have same size");
3149 
3150   // Do a direct cast if element types are castable.
3151   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
3152     return Builder.CreateBitOrPointerCast(V, DstFVTy);
3153   }
3154   // V cannot be directly casted to desired vector type.
3155   // May happen when V is a floating point vector but DstVTy is a vector of
3156   // pointers or vice-versa. Handle this using a two-step bitcast using an
3157   // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
3158   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
3159          "Only one type should be a pointer type");
3160   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
3161          "Only one type should be a floating point type");
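  // E.g., on a target with 64-bit pointers, casting <4 x i8*> to <4 x double>
  // goes through <4 x i64>: a ptrtoint followed by a bitcast.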
3162   Type *IntTy =
3163       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
3164   auto *VecIntTy = FixedVectorType::get(IntTy, VF);
3165   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
3166   return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
3167 }
3168 
3169 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
3170                                                          BasicBlock *Bypass) {
3171   Value *Count = getOrCreateTripCount(L);
3172   // Reuse existing vector loop preheader for TC checks.
  // Note that a new preheader block is generated for the vector loop.
3174   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
3175   IRBuilder<> Builder(TCCheckBlock->getTerminator());
3176 
3177   // Generate code to check if the loop's trip count is less than VF * UF, or
3178   // equal to it in case a scalar epilogue is required; this implies that the
3179   // vector trip count is zero. This check also covers the case where adding one
3180   // to the backedge-taken count overflowed leading to an incorrect trip count
3181   // of zero. In this case we will also jump to the scalar loop.
3182   auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
3183                                           : ICmpInst::ICMP_ULT;
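  // E.g., with VF * UF = 8 and no tail folding, the check below branches to
  // the scalar loop when the trip count is less than 8, or also when it is
  // exactly 8 if a scalar epilogue is required.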
3184 
3185   // If tail is to be folded, vector loop takes care of all iterations.
3186   Value *CheckMinIters = Builder.getFalse();
3187   if (!Cost->foldTailByMasking()) {
3188     Value *Step =
3189         createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF);
3190     CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
3191   }
3192   // Create new preheader for vector loop.
3193   LoopVectorPreHeader =
3194       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
3195                  "vector.ph");
3196 
3197   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
3198                                DT->getNode(Bypass)->getIDom()) &&
3199          "TC check is expected to dominate Bypass");
3200 
3201   // Update dominator for Bypass & LoopExit.
3202   DT->changeImmediateDominator(Bypass, TCCheckBlock);
3203   DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
3204 
3205   ReplaceInstWithInst(
3206       TCCheckBlock->getTerminator(),
3207       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
3208   LoopBypassBlocks.push_back(TCCheckBlock);
3209 }
3210 
3211 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
3212 
3213   BasicBlock *const SCEVCheckBlock =
3214       RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock);
3215   if (!SCEVCheckBlock)
3216     return nullptr;
3217 
3218   assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
3219            (OptForSizeBasedOnProfile &&
3220             Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
3221          "Cannot SCEV check stride or overflow when optimizing for size");
3222 
  // Update dominator only if this is the first RT check.
3225   if (LoopBypassBlocks.empty()) {
3226     DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
3227     DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
3228   }
3229 
3230   LoopBypassBlocks.push_back(SCEVCheckBlock);
3231   AddedSafetyChecks = true;
3232   return SCEVCheckBlock;
3233 }
3234 
3235 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L,
3236                                                       BasicBlock *Bypass) {
3237   // VPlan-native path does not do any analysis for runtime checks currently.
3238   if (EnableVPlanNativePath)
3239     return nullptr;
3240 
3241   BasicBlock *const MemCheckBlock =
3242       RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader);
3243 
  // Check if we generated code that checks at runtime if arrays overlap. We
  // put the checks into a separate block to make the more common case of few
  // elements faster.
3247   if (!MemCheckBlock)
3248     return nullptr;
3249 
3250   if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
3251     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
3252            "Cannot emit memory checks when optimizing for size, unless forced "
3253            "to vectorize.");
3254     ORE->emit([&]() {
3255       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
3256                                         L->getStartLoc(), L->getHeader())
3257              << "Code-size may be reduced by not forcing "
3258                 "vectorization, or by source-code modifications "
3259                 "eliminating the need for runtime checks "
3260                 "(e.g., adding 'restrict').";
3261     });
3262   }
3263 
3264   LoopBypassBlocks.push_back(MemCheckBlock);
3265 
3266   AddedSafetyChecks = true;
3267 
3268   // We currently don't use LoopVersioning for the actual loop cloning but we
3269   // still use it to add the noalias metadata.
3270   LVer = std::make_unique<LoopVersioning>(
3271       *Legal->getLAI(),
3272       Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI,
3273       DT, PSE.getSE());
3274   LVer->prepareNoAliasMetadata();
3275   return MemCheckBlock;
3276 }
3277 
3278 Value *InnerLoopVectorizer::emitTransformedIndex(
3279     IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
3280     const InductionDescriptor &ID) const {
3281 
3282   SCEVExpander Exp(*SE, DL, "induction");
3283   auto Step = ID.getStep();
3284   auto StartValue = ID.getStartValue();
3285   assert(Index->getType() == Step->getType() &&
3286          "Index type does not match StepValue type");
3287 
  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // more optimal code. Unfortunately, attempting to do so on invalid IR may
  // lead to various SCEV crashes. So all we can do is use the builder and
  // rely on InstCombine for future simplifications. Here we handle some
  // trivial cases only.
3294   auto CreateAdd = [&B](Value *X, Value *Y) {
3295     assert(X->getType() == Y->getType() && "Types don't match!");
3296     if (auto *CX = dyn_cast<ConstantInt>(X))
3297       if (CX->isZero())
3298         return Y;
3299     if (auto *CY = dyn_cast<ConstantInt>(Y))
3300       if (CY->isZero())
3301         return X;
3302     return B.CreateAdd(X, Y);
3303   };
3304 
3305   auto CreateMul = [&B](Value *X, Value *Y) {
3306     assert(X->getType() == Y->getType() && "Types don't match!");
3307     if (auto *CX = dyn_cast<ConstantInt>(X))
3308       if (CX->isOne())
3309         return Y;
3310     if (auto *CY = dyn_cast<ConstantInt>(Y))
3311       if (CY->isOne())
3312         return X;
3313     return B.CreateMul(X, Y);
3314   };
3315 
3316   // Get a suitable insert point for SCEV expansion. For blocks in the vector
3317   // loop, choose the end of the vector loop header (=LoopVectorBody), because
3318   // the DomTree is not kept up-to-date for additional blocks generated in the
3319   // vector loop. By using the header as insertion point, we guarantee that the
3320   // expanded instructions dominate all their uses.
3321   auto GetInsertPoint = [this, &B]() {
3322     BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
3323     if (InsertBB != LoopVectorBody &&
3324         LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB))
3325       return LoopVectorBody->getTerminator();
3326     return &*B.GetInsertPoint();
3327   };
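  // E.g., an integer induction with start value 7 and step 3 maps an index K
  // to 7 + 3 * K; a pointer induction instead emits a GEP of K * Step
  // elements off the start pointer.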
3328   switch (ID.getKind()) {
3329   case InductionDescriptor::IK_IntInduction: {
3330     assert(Index->getType() == StartValue->getType() &&
3331            "Index type does not match StartValue type");
3332     if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
3333       return B.CreateSub(StartValue, Index);
3334     auto *Offset = CreateMul(
3335         Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
3336     return CreateAdd(StartValue, Offset);
3337   }
3338   case InductionDescriptor::IK_PtrInduction: {
3339     assert(isa<SCEVConstant>(Step) &&
3340            "Expected constant step for pointer induction");
3341     return B.CreateGEP(
3342         StartValue->getType()->getPointerElementType(), StartValue,
3343         CreateMul(Index,
3344                   Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())));
3345   }
3346   case InductionDescriptor::IK_FpInduction: {
3347     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
3348     auto InductionBinOp = ID.getInductionBinOp();
3349     assert(InductionBinOp &&
3350            (InductionBinOp->getOpcode() == Instruction::FAdd ||
3351             InductionBinOp->getOpcode() == Instruction::FSub) &&
3352            "Original bin op should be defined for FP induction");
3353 
3354     Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
3355 
3356     // Floating point operations had to be 'fast' to enable the induction.
3357     FastMathFlags Flags;
3358     Flags.setFast();
3359 
3360     Value *MulExp = B.CreateFMul(StepValue, Index);
3361     if (isa<Instruction>(MulExp))
      // We have to check because MulExp may be a constant.
3363       cast<Instruction>(MulExp)->setFastMathFlags(Flags);
3364 
3365     Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
3366                                "induction");
3367     if (isa<Instruction>(BOp))
3368       cast<Instruction>(BOp)->setFastMathFlags(Flags);
3369 
3370     return BOp;
3371   }
3372   case InductionDescriptor::IK_NoInduction:
3373     return nullptr;
3374   }
3375   llvm_unreachable("invalid enum");
3376 }
3377 
3378 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3379   LoopScalarBody = OrigLoop->getHeader();
3380   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3381   LoopExitBlock = OrigLoop->getUniqueExitBlock();
3382   assert(LoopExitBlock && "Must have an exit block");
3383   assert(LoopVectorPreHeader && "Invalid loop structure");
3384 
3385   LoopMiddleBlock =
3386       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3387                  LI, nullptr, Twine(Prefix) + "middle.block");
3388   LoopScalarPreHeader =
3389       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3390                  nullptr, Twine(Prefix) + "scalar.ph");
3391 
3392   // Set up branch from middle block to the exit and scalar preheader blocks.
3393   // completeLoopSkeleton will update the condition to use an iteration check,
3394   // if required to decide whether to execute the remainder.
3395   BranchInst *BrInst =
3396       BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, Builder.getTrue());
3397   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3398   BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3399   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3400 
  // We intentionally don't let SplitBlock update LoopInfo since LoopVectorBody
  // should belong to a different loop than LoopVectorPreHeader. LoopVectorBody
  // is explicitly added to the correct place a few lines later.
3404   LoopVectorBody =
3405       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3406                  nullptr, nullptr, Twine(Prefix) + "vector.body");
3407 
3408   // Update dominator for loop exit.
3409   DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3410 
3411   // Create and register the new vector loop.
3412   Loop *Lp = LI->AllocateLoop();
3413   Loop *ParentLoop = OrigLoop->getParentLoop();
3414 
3415   // Insert the new loop into the loop nest and register the new basic blocks
3416   // before calling any utilities such as SCEV that require valid LoopInfo.
3417   if (ParentLoop) {
3418     ParentLoop->addChildLoop(Lp);
3419   } else {
3420     LI->addTopLevelLoop(Lp);
3421   }
3422   Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
3423   return Lp;
3424 }
3425 
3426 void InnerLoopVectorizer::createInductionResumeValues(
3427     Loop *L, Value *VectorTripCount,
3428     std::pair<BasicBlock *, Value *> AdditionalBypass) {
3429   assert(VectorTripCount && L && "Expected valid arguments");
3430   assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3431           (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3432          "Inconsistent information about additional bypass.");
3433   // We are going to resume the execution of the scalar loop.
3434   // Go over all of the induction variables that we found and fix the
3435   // PHIs that are left in the scalar version of the loop.
3436   // The starting values of PHI nodes depend on the counter of the last
3437   // iteration in the vectorized loop.
3438   // If we come from a bypass edge then we need to start from the original
3439   // start value.
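  // E.g., for an induction 'j = 7 + 3 * i', the resume value when entering
  // the scalar loop from the middle block is 7 + 3 * VectorTripCount, while
  // the bypass edges feed in the original start value 7.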
3440   for (auto &InductionEntry : Legal->getInductionVars()) {
3441     PHINode *OrigPhi = InductionEntry.first;
3442     InductionDescriptor II = InductionEntry.second;
3443 
    // Create phi nodes to merge from the backedge-taken check block.
3445     PHINode *BCResumeVal =
3446         PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3447                         LoopScalarPreHeader->getTerminator());
3448     // Copy original phi DL over to the new one.
3449     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3450     Value *&EndValue = IVEndValues[OrigPhi];
3451     Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3452     if (OrigPhi == OldInduction) {
3453       // We know what the end value is.
3454       EndValue = VectorTripCount;
3455     } else {
3456       IRBuilder<> B(L->getLoopPreheader()->getTerminator());
3457       Type *StepType = II.getStep()->getType();
3458       Instruction::CastOps CastOp =
3459           CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
3460       Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd");
3461       const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
3462       EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3463       EndValue->setName("ind.end");
3464 
3465       // Compute the end value for the additional bypass (if applicable).
3466       if (AdditionalBypass.first) {
3467         B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
3468         CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true,
3469                                          StepType, true);
3470         CRD =
3471             B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd");
3472         EndValueFromAdditionalBypass =
3473             emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3474         EndValueFromAdditionalBypass->setName("ind.end");
3475       }
3476     }
3477     // The new PHI merges the original incoming value, in case of a bypass,
3478     // or the value at the end of the vectorized loop.
3479     BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3480 
3481     // Fix the scalar body counter (PHI node).
3482     // The old induction's phi node in the scalar body needs the truncated
3483     // value.
3484     for (BasicBlock *BB : LoopBypassBlocks)
3485       BCResumeVal->addIncoming(II.getStartValue(), BB);
3486 
3487     if (AdditionalBypass.first)
3488       BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
3489                                             EndValueFromAdditionalBypass);
3490 
3491     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3492   }
3493 }
3494 
3495 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
3496                                                       MDNode *OrigLoopID) {
3497   assert(L && "Expected valid loop.");
3498 
3499   // The trip counts should be cached by now.
3500   Value *Count = getOrCreateTripCount(L);
3501   Value *VectorTripCount = getOrCreateVectorTripCount(L);
3502 
3503   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3504 
3505   // Add a check in the middle block to see if we have completed
3506   // all of the iterations in the first vector loop.
3507   // If (N - N%VF) == N, then we *don't* need to run the remainder.
3508   // If tail is to be folded, we know we don't need to run the remainder.
3509   if (!Cost->foldTailByMasking()) {
3510     Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
3511                                         Count, VectorTripCount, "cmp.n",
3512                                         LoopMiddleBlock->getTerminator());
3513 
3514     // Here we use the same DebugLoc as the scalar loop latch terminator instead
3515     // of the corresponding compare because they may have ended up with
3516     // different line numbers and we want to avoid awkward line stepping while
    // debugging. E.g., if the compare has a line number inside the loop.
3518     CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3519     cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN);
3520   }
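  // E.g., with a trip count of 10 and VF * UF = 4 the vector trip count is 8,
  // so cmp.n is false and the middle block branches to the scalar loop to
  // execute the remaining 2 iterations.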
3521 
3522   // Get ready to start creating new instructions into the vectorized body.
3523   assert(LoopVectorPreHeader == L->getLoopPreheader() &&
3524          "Inconsistent vector loop preheader");
3525   Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3526 
3527   Optional<MDNode *> VectorizedLoopID =
3528       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3529                                       LLVMLoopVectorizeFollowupVectorized});
3530   if (VectorizedLoopID.hasValue()) {
3531     L->setLoopID(VectorizedLoopID.getValue());
3532 
3533     // Do not setAlreadyVectorized if loop attributes have been defined
3534     // explicitly.
3535     return LoopVectorPreHeader;
3536   }
3537 
3538   // Keep all loop hints from the original loop on the vector loop (we'll
3539   // replace the vectorizer-specific hints below).
3540   if (MDNode *LID = OrigLoop->getLoopID())
3541     L->setLoopID(LID);
3542 
3543   LoopVectorizeHints Hints(L, true, *ORE);
3544   Hints.setAlreadyVectorized();
3545 
3546 #ifdef EXPENSIVE_CHECKS
3547   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3548   LI->verify(*DT);
3549 #endif
3550 
3551   return LoopVectorPreHeader;
3552 }
3553 
3554 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3555   /*
3556    In this function we generate a new loop. The new loop will contain
3557    the vectorized instructions while the old loop will continue to run the
3558    scalar remainder.
3559 
3560        [ ] <-- loop iteration number check.
3561     /   |
3562    /    v
3563   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
3564   |  /  |
3565   | /   v
3566   ||   [ ]     <-- vector pre header.
3567   |/    |
3568   |     v
3569   |    [  ] \
3570   |    [  ]_|   <-- vector loop.
3571   |     |
3572   |     v
3573   |   -[ ]   <--- middle-block.
3574   |  /  |
3575   | /   v
3576   -|- >[ ]     <--- new preheader.
3577    |    |
3578    |    v
3579    |   [ ] \
3580    |   [ ]_|   <-- old scalar loop to handle remainder.
3581     \   |
3582      \  v
3583       >[ ]     <-- exit block.
3584    ...
3585    */
3586 
3587   // Get the metadata of the original loop before it gets modified.
3588   MDNode *OrigLoopID = OrigLoop->getLoopID();
3589 
3590   // Create an empty vector loop, and prepare basic blocks for the runtime
3591   // checks.
3592   Loop *Lp = createVectorLoopSkeleton("");
3593 
  // Now, compare the new count to zero. If it is zero, skip the vector loop
  // and jump to the scalar loop. This check also covers the case where the
  // backedge-taken count is uint##_max: adding one to it will overflow,
  // leading to an incorrect trip count of zero. In this (rare) case we will
  // also jump to the scalar loop.
3599   emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
3600 
3601   // Generate the code to check any assumptions that we've made for SCEV
3602   // expressions.
3603   emitSCEVChecks(Lp, LoopScalarPreHeader);
3604 
  // Generate the code that checks at runtime whether arrays overlap. We put
  // the checks into a separate block to make the more common case of few
  // elements faster.
3608   emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
3609 
  // Some loops have a single integer induction variable, while others don't.
  // One example is C++ iterators, which often introduce multiple pointer
  // induction variables. The code below also supports the case where there is
  // no single induction variable.
  //
  // We try as hard as possible to obtain an induction variable from the
  // original loop. However, if we don't find one that:
  //   - is an integer,
  //   - counts from zero, stepping by one, and
  //   - is the size of the widest induction variable type,
  // then we create a new one.
3621   OldInduction = Legal->getPrimaryInduction();
3622   Type *IdxTy = Legal->getWidestInductionType();
3623   Value *StartIdx = ConstantInt::get(IdxTy, 0);
3624   // The loop step is equal to the vectorization factor (num of SIMD elements)
3625   // times the unroll factor (num of SIMD instructions).
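  // E.g. (illustrative only) with VF = 4 and UF = 2 the induction variable is
  // bumped by 8 on every iteration of the vector loop.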
3626   Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt());
3627   Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF);
3628   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
3629   Induction =
3630       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
3631                               getDebugLocFromInstOrOperands(OldInduction));
3632 
3633   // Emit phis for the new starting index of the scalar loop.
3634   createInductionResumeValues(Lp, CountRoundDown);
3635 
3636   return completeLoopSkeleton(Lp, OrigLoopID);
3637 }
3638 
3639 // Fix up external users of the induction variable. At this point, we are
3640 // in LCSSA form, with all external PHIs that use the IV having one input value,
3641 // coming from the remainder loop. We need those PHIs to also have a correct
3642 // value for the IV when arriving directly from the middle block.
3643 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3644                                        const InductionDescriptor &II,
3645                                        Value *CountRoundDown, Value *EndValue,
3646                                        BasicBlock *MiddleBlock) {
  // There are two kinds of external IV usages - those that use the value
  // computed in the last iteration (the post-increment value that feeds into
  // the phi from the loop latch) and those that use the penultimate value
  // (the phi itself, whose value on exit is the result of the second-to-last
  // increment). We allow both, but they obviously have different values.
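  //
  // A shorthand illustration (not taken from any particular input):
  //
  //   loop:
  //     %iv = phi i64 [ 0, %ph ], [ %iv.next, %loop ]
  //     ...
  //     %iv.next = add i64 %iv, 1
  //     %cond = icmp ult i64 %iv.next, %n
  //     br i1 %cond, label %loop, label %exit
  //   exit:
  //     %use.post = phi i64 [ %iv.next, %loop ]  ; sees %n (last value)
  //     %use.phi  = phi i64 [ %iv, %loop ]       ; sees %n - 1 (penultimate)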
3651 
3652   assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
3653 
3654   DenseMap<Value *, Value *> MissingVals;
3655 
3656   // An external user of the last iteration's value should see the value that
3657   // the remainder loop uses to initialize its own IV.
3658   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3659   for (User *U : PostInc->users()) {
3660     Instruction *UI = cast<Instruction>(U);
3661     if (!OrigLoop->contains(UI)) {
3662       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3663       MissingVals[UI] = EndValue;
3664     }
3665   }
3666 
  // An external user of the penultimate value needs to see EndValue - Step.
3668   // The simplest way to get this is to recompute it from the constituent SCEVs,
3669   // that is Start + (Step * (CRD - 1)).
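  // For example (hypothetical values), with Start = 0, Step = 1 and a vector
  // trip count CRD = 8, the escaping value is 0 + 1 * (8 - 1) = 7.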
3670   for (User *U : OrigPhi->users()) {
3671     auto *UI = cast<Instruction>(U);
3672     if (!OrigLoop->contains(UI)) {
3673       const DataLayout &DL =
3674           OrigLoop->getHeader()->getModule()->getDataLayout();
3675       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3676 
3677       IRBuilder<> B(MiddleBlock->getTerminator());
3678       Value *CountMinusOne = B.CreateSub(
3679           CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3680       Value *CMO =
3681           !II.getStep()->getType()->isIntegerTy()
3682               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3683                              II.getStep()->getType())
3684               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3685       CMO->setName("cast.cmo");
3686       Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3687       Escape->setName("ind.escape");
3688       MissingVals[UI] = Escape;
3689     }
3690   }
3691 
3692   for (auto &I : MissingVals) {
3693     PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each other,
3695     // that is %IV2 = phi [...], [ %IV1, %latch ]
3696     // In this case, if IV1 has an external use, we need to avoid adding both
3697     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3698     // don't already have an incoming value for the middle block.
3699     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3700       PHI->addIncoming(I.second, MiddleBlock);
3701   }
3702 }
3703 
3704 namespace {
3705 
3706 struct CSEDenseMapInfo {
3707   static bool canHandle(const Instruction *I) {
3708     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3709            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3710   }
3711 
3712   static inline Instruction *getEmptyKey() {
3713     return DenseMapInfo<Instruction *>::getEmptyKey();
3714   }
3715 
3716   static inline Instruction *getTombstoneKey() {
3717     return DenseMapInfo<Instruction *>::getTombstoneKey();
3718   }
3719 
3720   static unsigned getHashValue(const Instruction *I) {
3721     assert(canHandle(I) && "Unknown instruction!");
3722     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3723                                                            I->value_op_end()));
3724   }
3725 
3726   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3727     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3728         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3729       return LHS == RHS;
3730     return LHS->isIdenticalTo(RHS);
3731   }
3732 };
3733 
3734 } // end anonymous namespace
3735 
/// Perform CSE of induction variable instructions.
3737 static void cse(BasicBlock *BB) {
3738   // Perform simple cse.
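  // For example (illustrative IR), if BB contains two identical instructions
  //   %e1 = extractelement <4 x i32> %v, i32 0
  //   %e2 = extractelement <4 x i32> %v, i32 0
  // the second one is erased and its uses are rewired to the first.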
3739   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3740   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3741     Instruction *In = &*I++;
3742 
3743     if (!CSEDenseMapInfo::canHandle(In))
3744       continue;
3745 
3746     // Check if we can replace this instruction with any of the
3747     // visited instructions.
3748     if (Instruction *V = CSEMap.lookup(In)) {
3749       In->replaceAllUsesWith(V);
3750       In->eraseFromParent();
3751       continue;
3752     }
3753 
3754     CSEMap[In] = In;
3755   }
3756 }
3757 
3758 InstructionCost
3759 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
3760                                               bool &NeedToScalarize) {
3761   Function *F = CI->getCalledFunction();
3762   Type *ScalarRetTy = CI->getType();
3763   SmallVector<Type *, 4> Tys, ScalarTys;
3764   for (auto &ArgOp : CI->arg_operands())
3765     ScalarTys.push_back(ArgOp->getType());
3766 
  // Estimate cost of scalarized vector call. The source operands are assumed
  // to be vectors, so we need to extract individual elements from them,
  // execute VF scalar calls, and then gather the result into the vector return
  // value.
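  // As a purely illustrative example: if one scalar call costs 10, VF is 4 and
  // the extract/insert overhead is 6, the scalarized estimate below is
  // 10 * 4 + 6 = 46; a vector library call that costs less than that would be
  // preferred further down.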
3771   InstructionCost ScalarCallCost =
3772       TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput);
3773   if (VF.isScalar())
3774     return ScalarCallCost;
3775 
3776   // Compute corresponding vector type for return value and arguments.
3777   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3778   for (Type *ScalarTy : ScalarTys)
3779     Tys.push_back(ToVectorTy(ScalarTy, VF));
3780 
3781   // Compute costs of unpacking argument values for the scalar calls and
3782   // packing the return values to a vector.
3783   InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
3784 
3785   InstructionCost Cost =
3786       ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
3787 
3788   // If we can't emit a vector call for this function, then the currently found
3789   // cost is the cost we need to return.
3790   NeedToScalarize = true;
3791   VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
3792   Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3793 
3794   if (!TLI || CI->isNoBuiltin() || !VecFunc)
3795     return Cost;
3796 
3797   // If the corresponding vector cost is cheaper, return its cost.
3798   InstructionCost VectorCallCost =
3799       TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput);
3800   if (VectorCallCost < Cost) {
3801     NeedToScalarize = false;
3802     Cost = VectorCallCost;
3803   }
3804   return Cost;
3805 }
3806 
3807 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
3808   if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
3809     return Elt;
3810   return VectorType::get(Elt, VF);
3811 }
3812 
3813 InstructionCost
3814 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3815                                                    ElementCount VF) {
3816   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3817   assert(ID && "Expected intrinsic call!");
3818   Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
3819   FastMathFlags FMF;
3820   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3821     FMF = FPMO->getFastMathFlags();
3822 
3823   SmallVector<const Value *> Arguments(CI->arg_begin(), CI->arg_end());
3824   FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
3825   SmallVector<Type *> ParamTys;
3826   std::transform(FTy->param_begin(), FTy->param_end(),
3827                  std::back_inserter(ParamTys),
3828                  [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
3829 
3830   IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
3831                                     dyn_cast<IntrinsicInst>(CI));
3832   return TTI.getIntrinsicInstrCost(CostAttrs,
3833                                    TargetTransformInfo::TCK_RecipThroughput);
3834 }
3835 
3836 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3837   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3838   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3839   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3840 }
3841 
3842 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3843   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3844   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3845   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3846 }
3847 
3848 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) {
3849   // For every instruction `I` in MinBWs, truncate the operands, create a
3850   // truncated version of `I` and reextend its result. InstCombine runs
3851   // later and will remove any ext/trunc pairs.
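  //
  // A shorthand illustration (assuming a minimal bit width of 8 for %a):
  //   %a = add <4 x i32> %x, %y
  // becomes
  //   %x.tr = trunc <4 x i32> %x to <4 x i8>
  //   %y.tr = trunc <4 x i32> %y to <4 x i8>
  //   %a.tr = add <4 x i8> %x.tr, %y.tr
  //   %a    = zext <4 x i8> %a.tr to <4 x i32>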
3852   SmallPtrSet<Value *, 4> Erased;
3853   for (const auto &KV : Cost->getMinimalBitwidths()) {
3854     // If the value wasn't vectorized, we must maintain the original scalar
3855     // type. The absence of the value from State indicates that it
3856     // wasn't vectorized.
3857     VPValue *Def = State.Plan->getVPValue(KV.first);
3858     if (!State.hasAnyVectorValue(Def))
3859       continue;
3860     for (unsigned Part = 0; Part < UF; ++Part) {
3861       Value *I = State.get(Def, Part);
3862       if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3863         continue;
3864       Type *OriginalTy = I->getType();
3865       Type *ScalarTruncatedTy =
3866           IntegerType::get(OriginalTy->getContext(), KV.second);
3867       auto *TruncatedTy = FixedVectorType::get(
3868           ScalarTruncatedTy,
3869           cast<FixedVectorType>(OriginalTy)->getNumElements());
3870       if (TruncatedTy == OriginalTy)
3871         continue;
3872 
3873       IRBuilder<> B(cast<Instruction>(I));
3874       auto ShrinkOperand = [&](Value *V) -> Value * {
3875         if (auto *ZI = dyn_cast<ZExtInst>(V))
3876           if (ZI->getSrcTy() == TruncatedTy)
3877             return ZI->getOperand(0);
3878         return B.CreateZExtOrTrunc(V, TruncatedTy);
3879       };
3880 
3881       // The actual instruction modification depends on the instruction type,
3882       // unfortunately.
3883       Value *NewI = nullptr;
3884       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3885         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3886                              ShrinkOperand(BO->getOperand(1)));
3887 
3888         // Any wrapping introduced by shrinking this operation shouldn't be
3889         // considered undefined behavior. So, we can't unconditionally copy
3890         // arithmetic wrapping flags to NewI.
3891         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3892       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3893         NewI =
3894             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3895                          ShrinkOperand(CI->getOperand(1)));
3896       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3897         NewI = B.CreateSelect(SI->getCondition(),
3898                               ShrinkOperand(SI->getTrueValue()),
3899                               ShrinkOperand(SI->getFalseValue()));
3900       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3901         switch (CI->getOpcode()) {
3902         default:
3903           llvm_unreachable("Unhandled cast!");
3904         case Instruction::Trunc:
3905           NewI = ShrinkOperand(CI->getOperand(0));
3906           break;
3907         case Instruction::SExt:
3908           NewI = B.CreateSExtOrTrunc(
3909               CI->getOperand(0),
3910               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3911           break;
3912         case Instruction::ZExt:
3913           NewI = B.CreateZExtOrTrunc(
3914               CI->getOperand(0),
3915               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3916           break;
3917         }
3918       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3919         auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType())
3920                              ->getNumElements();
3921         auto *O0 = B.CreateZExtOrTrunc(
3922             SI->getOperand(0),
3923             FixedVectorType::get(ScalarTruncatedTy, Elements0));
3924         auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType())
3925                              ->getNumElements();
3926         auto *O1 = B.CreateZExtOrTrunc(
3927             SI->getOperand(1),
3928             FixedVectorType::get(ScalarTruncatedTy, Elements1));
3929 
3930         NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
3931       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3932         // Don't do anything with the operands, just extend the result.
3933         continue;
3934       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3935         auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType())
3936                             ->getNumElements();
3937         auto *O0 = B.CreateZExtOrTrunc(
3938             IE->getOperand(0),
3939             FixedVectorType::get(ScalarTruncatedTy, Elements));
3940         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3941         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3942       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3943         auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType())
3944                             ->getNumElements();
3945         auto *O0 = B.CreateZExtOrTrunc(
3946             EE->getOperand(0),
3947             FixedVectorType::get(ScalarTruncatedTy, Elements));
3948         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3949       } else {
3950         // If we don't know what to do, be conservative and don't do anything.
3951         continue;
3952       }
3953 
3954       // Lastly, extend the result.
3955       NewI->takeName(cast<Instruction>(I));
3956       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3957       I->replaceAllUsesWith(Res);
3958       cast<Instruction>(I)->eraseFromParent();
3959       Erased.insert(I);
3960       State.reset(Def, Res, Part);
3961     }
3962   }
3963 
  // We'll have created a bunch of ZExts that are now unused. Clean them up.
3965   for (const auto &KV : Cost->getMinimalBitwidths()) {
3966     // If the value wasn't vectorized, we must maintain the original scalar
3967     // type. The absence of the value from State indicates that it
3968     // wasn't vectorized.
3969     VPValue *Def = State.Plan->getVPValue(KV.first);
3970     if (!State.hasAnyVectorValue(Def))
3971       continue;
3972     for (unsigned Part = 0; Part < UF; ++Part) {
3973       Value *I = State.get(Def, Part);
3974       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3975       if (Inst && Inst->use_empty()) {
3976         Value *NewI = Inst->getOperand(0);
3977         Inst->eraseFromParent();
3978         State.reset(Def, NewI, Part);
3979       }
3980     }
3981   }
3982 }
3983 
3984 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
3985   // Insert truncates and extends for any truncated instructions as hints to
3986   // InstCombine.
3987   if (VF.isVector())
3988     truncateToMinimalBitwidths(State);
3989 
3990   // Fix widened non-induction PHIs by setting up the PHI operands.
3991   if (OrigPHIsToFix.size()) {
3992     assert(EnableVPlanNativePath &&
3993            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3994     fixNonInductionPHIs(State);
3995   }
3996 
3997   // At this point every instruction in the original loop is widened to a
3998   // vector form. Now we need to fix the recurrences in the loop. These PHI
3999   // nodes are currently empty because we did not want to introduce cycles.
4000   // This is the second stage of vectorizing recurrences.
4001   fixCrossIterationPHIs(State);
4002 
4003   // Forget the original basic block.
4004   PSE.getSE()->forgetLoop(OrigLoop);
4005 
4006   // Fix-up external users of the induction variables.
4007   for (auto &Entry : Legal->getInductionVars())
4008     fixupIVUsers(Entry.first, Entry.second,
4009                  getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
4010                  IVEndValues[Entry.first], LoopMiddleBlock);
4011 
4012   fixLCSSAPHIs(State);
4013   for (Instruction *PI : PredicatedInstructions)
4014     sinkScalarOperands(&*PI);
4015 
4016   // Remove redundant induction instructions.
4017   cse(LoopVectorBody);
4018 
  // Set/update profile weights for the vector and remainder loops as original
  // loop iterations are now distributed among them. Note that the original
  // loop, represented by LoopScalarBody, becomes the remainder loop after
  // vectorization.
4022   //
  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
  // end up getting a slightly roughened result but that should be OK since
4025   // profile is not inherently precise anyway. Note also possible bypass of
4026   // vector code caused by legality checks is ignored, assigning all the weight
4027   // to the vector loop, optimistically.
4028   //
  // For scalable vectorization we can't know at compile time how many
  // iterations of the loop are handled in one vector iteration, so instead
  // assume a pessimistic vscale of '1'.
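  //
  // As a rough illustration (made-up numbers): if the original loop was
  // estimated to run about 1000 iterations and VF * UF == 8, the vector loop
  // is credited with roughly 1000 / 8 = 125 iterations, with the remaining
  // weight going to the scalar remainder loop.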
4032   setProfileInfoAfterUnrolling(
4033       LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
4034       LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
4035 }
4036 
4037 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
4038   // In order to support recurrences we need to be able to vectorize Phi nodes.
4039   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4040   // stage #2: We now need to fix the recurrences by adding incoming edges to
4041   // the currently empty PHI nodes. At this point every instruction in the
4042   // original loop is widened to a vector form so we can use them to construct
4043   // the incoming edges.
4044   for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
4045     // Handle first-order recurrences and reductions that need to be fixed.
4046     if (Legal->isFirstOrderRecurrence(&Phi))
4047       fixFirstOrderRecurrence(&Phi, State);
4048     else if (Legal->isReductionVariable(&Phi))
4049       fixReduction(&Phi, State);
4050   }
4051 }
4052 
4053 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi,
4054                                                   VPTransformState &State) {
4055   // This is the second phase of vectorizing first-order recurrences. An
4056   // overview of the transformation is described below. Suppose we have the
4057   // following loop.
4058   //
4059   //   for (int i = 0; i < n; ++i)
4060   //     b[i] = a[i] - a[i - 1];
4061   //
4062   // There is a first-order recurrence on "a". For this loop, the shorthand
4063   // scalar IR looks like:
4064   //
4065   //   scalar.ph:
4066   //     s_init = a[-1]
4067   //     br scalar.body
4068   //
4069   //   scalar.body:
4070   //     i = phi [0, scalar.ph], [i+1, scalar.body]
4071   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
4072   //     s2 = a[i]
4073   //     b[i] = s2 - s1
4074   //     br cond, scalar.body, ...
4075   //
  // In this example, s1 is a recurrence because its value depends on the
4077   // previous iteration. In the first phase of vectorization, we created a
4078   // temporary value for s1. We now complete the vectorization and produce the
4079   // shorthand vector IR shown below (for VF = 4, UF = 1).
4080   //
4081   //   vector.ph:
4082   //     v_init = vector(..., ..., ..., a[-1])
4083   //     br vector.body
4084   //
4085   //   vector.body
4086   //     i = phi [0, vector.ph], [i+4, vector.body]
4087   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
4088   //     v2 = a[i, i+1, i+2, i+3];
4089   //     v3 = vector(v1(3), v2(0, 1, 2))
4090   //     b[i, i+1, i+2, i+3] = v2 - v3
4091   //     br cond, vector.body, middle.block
4092   //
4093   //   middle.block:
4094   //     x = v2(3)
4095   //     br scalar.ph
4096   //
4097   //   scalar.ph:
4098   //     s_init = phi [x, middle.block], [a[-1], otherwise]
4099   //     br scalar.body
4100   //
  // After the vector loop finishes executing, we extract the next value of
  // the recurrence (x) to use as the initial value in the scalar loop.
4103 
4104   // Get the original loop preheader and single loop latch.
4105   auto *Preheader = OrigLoop->getLoopPreheader();
4106   auto *Latch = OrigLoop->getLoopLatch();
4107 
4108   // Get the initial and previous values of the scalar recurrence.
4109   auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
4110   auto *Previous = Phi->getIncomingValueForBlock(Latch);
4111 
4112   // Create a vector from the initial value.
4113   auto *VectorInit = ScalarInit;
4114   if (VF.isVector()) {
4115     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4116     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
4117     VectorInit = Builder.CreateInsertElement(
4118         PoisonValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
4119         Builder.getInt32(VF.getKnownMinValue() - 1), "vector.recur.init");
4120   }
4121 
4122   VPValue *PhiDef = State.Plan->getVPValue(Phi);
4123   VPValue *PreviousDef = State.Plan->getVPValue(Previous);
4124   // We constructed a temporary phi node in the first phase of vectorization.
4125   // This phi node will eventually be deleted.
4126   Builder.SetInsertPoint(cast<Instruction>(State.get(PhiDef, 0)));
4127 
4128   // Create a phi node for the new recurrence. The current value will either be
4129   // the initial value inserted into a vector or loop-varying vector value.
4130   auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
4131   VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
4132 
4133   // Get the vectorized previous value of the last part UF - 1. It appears last
4134   // among all unrolled iterations, due to the order of their construction.
4135   Value *PreviousLastPart = State.get(PreviousDef, UF - 1);
4136 
4137   // Find and set the insertion point after the previous value if it is an
4138   // instruction.
4139   BasicBlock::iterator InsertPt;
4140   // Note that the previous value may have been constant-folded so it is not
4141   // guaranteed to be an instruction in the vector loop.
4142   // FIXME: Loop invariant values do not form recurrences. We should deal with
4143   //        them earlier.
4144   if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart))
4145     InsertPt = LoopVectorBody->getFirstInsertionPt();
4146   else {
4147     Instruction *PreviousInst = cast<Instruction>(PreviousLastPart);
4148     if (isa<PHINode>(PreviousLastPart))
4149       // If the previous value is a phi node, we should insert after all the phi
4150       // nodes in the block containing the PHI to avoid breaking basic block
4151       // verification. Note that the basic block may be different to
4152       // LoopVectorBody, in case we predicate the loop.
4153       InsertPt = PreviousInst->getParent()->getFirstInsertionPt();
4154     else
4155       InsertPt = ++PreviousInst->getIterator();
4156   }
4157   Builder.SetInsertPoint(&*InsertPt);
4158 
4159   // We will construct a vector for the recurrence by combining the values for
4160   // the current and previous iterations. This is the required shuffle mask.
4161   assert(!VF.isScalable());
4162   SmallVector<int, 8> ShuffleMask(VF.getKnownMinValue());
4163   ShuffleMask[0] = VF.getKnownMinValue() - 1;
4164   for (unsigned I = 1; I < VF.getKnownMinValue(); ++I)
4165     ShuffleMask[I] = I + VF.getKnownMinValue() - 1;
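  // E.g. for VF = 4 the mask is <3, 4, 5, 6>: the last lane of the first
  // shuffle operand followed by the first three lanes of the second operand.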
4166 
4167   // The vector from which to take the initial value for the current iteration
4168   // (actual or unrolled). Initially, this is the vector phi node.
4169   Value *Incoming = VecPhi;
4170 
4171   // Shuffle the current and previous vector and update the vector parts.
4172   for (unsigned Part = 0; Part < UF; ++Part) {
4173     Value *PreviousPart = State.get(PreviousDef, Part);
4174     Value *PhiPart = State.get(PhiDef, Part);
4175     auto *Shuffle =
4176         VF.isVector()
4177             ? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask)
4178             : Incoming;
4179     PhiPart->replaceAllUsesWith(Shuffle);
4180     cast<Instruction>(PhiPart)->eraseFromParent();
4181     State.reset(PhiDef, Shuffle, Part);
4182     Incoming = PreviousPart;
4183   }
4184 
4185   // Fix the latch value of the new recurrence in the vector loop.
4186   VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4187 
4188   // Extract the last vector element in the middle block. This will be the
4189   // initial value for the recurrence when jumping to the scalar loop.
4190   auto *ExtractForScalar = Incoming;
4191   if (VF.isVector()) {
4192     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4193     ExtractForScalar = Builder.CreateExtractElement(
4194         ExtractForScalar, Builder.getInt32(VF.getKnownMinValue() - 1),
4195         "vector.recur.extract");
4196   }
4197   // Extract the second last element in the middle block if the
4198   // Phi is used outside the loop. We need to extract the phi itself
4199   // and not the last element (the phi update in the current iteration). This
4200   // will be the value when jumping to the exit block from the LoopMiddleBlock,
4201   // when the scalar loop is not run at all.
4202   Value *ExtractForPhiUsedOutsideLoop = nullptr;
4203   if (VF.isVector())
4204     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
4205         Incoming, Builder.getInt32(VF.getKnownMinValue() - 2),
4206         "vector.recur.extract.for.phi");
  // When the loop is unrolled without vectorizing, initialize
4208   // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of
4209   // `Incoming`. This is analogous to the vectorized case above: extracting the
4210   // second last element when VF > 1.
4211   else if (UF > 1)
4212     ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
4213 
4214   // Fix the initial value of the original recurrence in the scalar loop.
4215   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
4216   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
4217   for (auto *BB : predecessors(LoopScalarPreHeader)) {
4218     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
4219     Start->addIncoming(Incoming, BB);
4220   }
4221 
4222   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
4223   Phi->setName("scalar.recur");
4224 
4225   // Finally, fix users of the recurrence outside the loop. The users will need
4226   // either the last value of the scalar recurrence or the last value of the
4227   // vector recurrence we extracted in the middle block. Since the loop is in
4228   // LCSSA form, we just need to find all the phi nodes for the original scalar
4229   // recurrence in the exit block, and then add an edge for the middle block.
4230   // Note that LCSSA does not imply single entry when the original scalar loop
4231   // had multiple exiting edges (as we always run the last iteration in the
4232   // scalar epilogue); in that case, the exiting path through middle will be
4233   // dynamically dead and the value picked for the phi doesn't matter.
4234   for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4235     if (any_of(LCSSAPhi.incoming_values(),
4236                [Phi](Value *V) { return V == Phi; }))
4237       LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
4238 }
4239 
4240 void InnerLoopVectorizer::fixReduction(PHINode *Phi, VPTransformState &State) {
  // Get its reduction variable descriptor.
4242   assert(Legal->isReductionVariable(Phi) &&
4243          "Unable to find the reduction variable");
4244   RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
4245 
4246   RecurKind RK = RdxDesc.getRecurrenceKind();
4247   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
4248   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
4249   setDebugLocFromInst(Builder, ReductionStartValue);
4250   bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi);
4251 
4252   VPValue *LoopExitInstDef = State.Plan->getVPValue(LoopExitInst);
4253   // This is the vector-clone of the value that leaves the loop.
4254   Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
4255 
4256   // Wrap flags are in general invalid after vectorization, clear them.
4257   clearReductionWrapFlags(RdxDesc, State);
4258 
4259   // Fix the vector-loop phi.
4260 
4261   // Reductions do not have to start at zero. They can start with
4262   // any loop invariant values.
4263   BasicBlock *Latch = OrigLoop->getLoopLatch();
4264   Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
4265 
4266   for (unsigned Part = 0; Part < UF; ++Part) {
4267     Value *VecRdxPhi = State.get(State.Plan->getVPValue(Phi), Part);
4268     Value *Val = State.get(State.Plan->getVPValue(LoopVal), Part);
4269     cast<PHINode>(VecRdxPhi)
4270       ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4271   }
4272 
4273   // Before each round, move the insertion point right between
4274   // the PHIs and the values we are going to write.
4275   // This allows us to write both PHINodes and the extractelement
4276   // instructions.
4277   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4278 
4279   setDebugLocFromInst(Builder, LoopExitInst);
4280 
4281   // If tail is folded by masking, the vector value to leave the loop should be
4282   // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
4283   // instead of the former. For an inloop reduction the reduction will already
4284   // be predicated, and does not need to be handled here.
4285   if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) {
4286     for (unsigned Part = 0; Part < UF; ++Part) {
4287       Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
4288       Value *Sel = nullptr;
4289       for (User *U : VecLoopExitInst->users()) {
4290         if (isa<SelectInst>(U)) {
4291           assert(!Sel && "Reduction exit feeding two selects");
4292           Sel = U;
4293         } else
4294           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
4295       }
4296       assert(Sel && "Reduction exit feeds no select");
4297       State.reset(LoopExitInstDef, Sel, Part);
4298 
4299       // If the target can create a predicated operator for the reduction at no
4300       // extra cost in the loop (for example a predicated vadd), it can be
4301       // cheaper for the select to remain in the loop than be sunk out of it,
4302       // and so use the select value for the phi instead of the old
4303       // LoopExitValue.
4304       RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
4305       if (PreferPredicatedReductionSelect ||
4306           TTI->preferPredicatedReductionSelect(
4307               RdxDesc.getOpcode(), Phi->getType(),
4308               TargetTransformInfo::ReductionFlags())) {
4309         auto *VecRdxPhi =
4310             cast<PHINode>(State.get(State.Plan->getVPValue(Phi), Part));
4311         VecRdxPhi->setIncomingValueForBlock(
4312             LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel);
4313       }
4314     }
4315   }
4316 
4317   // If the vector reduction can be performed in a smaller type, we truncate
4318   // then extend the loop exit value to enable InstCombine to evaluate the
4319   // entire expression in the smaller type.
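  // For instance (illustrative), an add reduction over i8 data that was
  // widened to <4 x i32> has each part truncated to <4 x i8> and then
  // sign/zero-extended back to <4 x i32>, leaving a trunc/ext pair for
  // InstCombine to fold.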
4320   if (VF.isVector() && Phi->getType() != RdxDesc.getRecurrenceType()) {
4321     assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!");
4322     assert(!VF.isScalable() && "scalable vectors not yet supported.");
4323     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
4324     Builder.SetInsertPoint(
4325         LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
4326     VectorParts RdxParts(UF);
4327     for (unsigned Part = 0; Part < UF; ++Part) {
4328       RdxParts[Part] = State.get(LoopExitInstDef, Part);
4329       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4330       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
4331                                         : Builder.CreateZExt(Trunc, VecTy);
4332       for (Value::user_iterator UI = RdxParts[Part]->user_begin();
4333            UI != RdxParts[Part]->user_end();)
4334         if (*UI != Trunc) {
4335           (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
4336           RdxParts[Part] = Extnd;
4337         } else {
4338           ++UI;
4339         }
4340     }
4341     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4342     for (unsigned Part = 0; Part < UF; ++Part) {
4343       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4344       State.reset(LoopExitInstDef, RdxParts[Part], Part);
4345     }
4346   }
4347 
4348   // Reduce all of the unrolled parts into a single vector.
4349   Value *ReducedPartRdx = State.get(LoopExitInstDef, 0);
4350   unsigned Op = RecurrenceDescriptor::getOpcode(RK);
4351 
4352   // The middle block terminator has already been assigned a DebugLoc here (the
4353   // OrigLoop's single latch terminator). We want the whole middle block to
4354   // appear to execute on this line because: (a) it is all compiler generated,
4355   // (b) these instructions are always executed after evaluating the latch
4356   // conditional branch, and (c) other passes may add new predecessors which
4357   // terminate on this line. This is the easiest way to ensure we don't
4358   // accidentally cause an extra step back into the loop while debugging.
4359   setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
4360   {
4361     // Floating-point operations should have some FMF to enable the reduction.
4362     IRBuilderBase::FastMathFlagGuard FMFG(Builder);
4363     Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
4364     for (unsigned Part = 1; Part < UF; ++Part) {
4365       Value *RdxPart = State.get(LoopExitInstDef, Part);
4366       if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
4367         ReducedPartRdx = Builder.CreateBinOp(
4368             (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
4369       } else {
4370         ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
4371       }
4372     }
4373   }
4374 
4375   // Create the reduction after the loop. Note that inloop reductions create the
4376   // target reduction in the loop using a Reduction recipe.
4377   if (VF.isVector() && !IsInLoopReductionPhi) {
4378     ReducedPartRdx =
4379         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx);
4380     // If the reduction can be performed in a smaller type, we need to extend
4381     // the reduction to the wider type before we branch to the original loop.
4382     if (Phi->getType() != RdxDesc.getRecurrenceType())
4383       ReducedPartRdx =
4384         RdxDesc.isSigned()
4385         ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
4386         : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
4387   }
4388 
4389   // Create a phi node that merges control-flow from the backedge-taken check
4390   // block and the middle block.
4391   PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
4392                                         LoopScalarPreHeader->getTerminator());
4393   for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
4394     BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
4395   BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
4396 
4397   // Now, we need to fix the users of the reduction variable
4398   // inside and outside of the scalar remainder loop.
4399 
4400   // We know that the loop is in LCSSA form. We need to update the PHI nodes
4401   // in the exit blocks.  See comment on analogous loop in
  // fixFirstOrderRecurrence for a more complete explanation of the logic.
4403   for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4404     if (any_of(LCSSAPhi.incoming_values(),
4405                [LoopExitInst](Value *V) { return V == LoopExitInst; }))
4406       LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4407 
4408   // Fix the scalar loop reduction variable with the incoming reduction sum
4409   // from the vector body and from the backedge value.
4410   int IncomingEdgeBlockIdx =
4411     Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4412   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4413   // Pick the other block.
4414   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4415   Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4416   Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4417 }
4418 
4419 void InnerLoopVectorizer::clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc,
4420                                                   VPTransformState &State) {
4421   RecurKind RK = RdxDesc.getRecurrenceKind();
4422   if (RK != RecurKind::Add && RK != RecurKind::Mul)
4423     return;
4424 
4425   Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
4426   assert(LoopExitInstr && "null loop exit instruction");
4427   SmallVector<Instruction *, 8> Worklist;
4428   SmallPtrSet<Instruction *, 8> Visited;
4429   Worklist.push_back(LoopExitInstr);
4430   Visited.insert(LoopExitInstr);
4431 
4432   while (!Worklist.empty()) {
4433     Instruction *Cur = Worklist.pop_back_val();
4434     if (isa<OverflowingBinaryOperator>(Cur))
4435       for (unsigned Part = 0; Part < UF; ++Part) {
4436         Value *V = State.get(State.Plan->getVPValue(Cur), Part);
4437         cast<Instruction>(V)->dropPoisonGeneratingFlags();
4438       }
4439 
4440     for (User *U : Cur->users()) {
4441       Instruction *UI = cast<Instruction>(U);
4442       if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
4443           Visited.insert(UI).second)
4444         Worklist.push_back(UI);
4445     }
4446   }
4447 }
4448 
4449 void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) {
4450   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4451     if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1)
4452       // Some phis were already hand updated by the reduction and recurrence
4453       // code above, leave them alone.
4454       continue;
4455 
4456     auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
4457     // Non-instruction incoming values will have only one value.
4458     unsigned LastLane = 0;
4459     if (isa<Instruction>(IncomingValue))
4460       LastLane = Cost->isUniformAfterVectorization(
4461                      cast<Instruction>(IncomingValue), VF)
4462                      ? 0
4463                      : VF.getKnownMinValue() - 1;
4464     assert((!VF.isScalable() || LastLane == 0) &&
4465            "scalable vectors dont support non-uniform scalars yet");
4466     // Can be a loop invariant incoming value or the last scalar value to be
4467     // extracted from the vectorized loop.
4468     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4469     Value *lastIncomingValue =
4470         OrigLoop->isLoopInvariant(IncomingValue)
4471             ? IncomingValue
4472             : State.get(State.Plan->getVPValue(IncomingValue),
4473                         VPIteration(UF - 1, LastLane));
4474     LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
4475   }
4476 }
4477 
4478 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4479   // The basic block and loop containing the predicated instruction.
4480   auto *PredBB = PredInst->getParent();
4481   auto *VectorLoop = LI->getLoopFor(PredBB);
4482 
4483   // Initialize a worklist with the operands of the predicated instruction.
4484   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4485 
4486   // Holds instructions that we need to analyze again. An instruction may be
4487   // reanalyzed if we don't yet know if we can sink it or not.
4488   SmallVector<Instruction *, 8> InstsToReanalyze;
4489 
4490   // Returns true if a given use occurs in the predicated block. Phi nodes use
4491   // their operands in their corresponding predecessor blocks.
4492   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4493     auto *I = cast<Instruction>(U.getUser());
4494     BasicBlock *BB = I->getParent();
4495     if (auto *Phi = dyn_cast<PHINode>(I))
4496       BB = Phi->getIncomingBlock(
4497           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4498     return BB == PredBB;
4499   };
4500 
4501   // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, its
  // operands are then added to the worklist. The algorithm ends after one pass
4504   // through the worklist doesn't sink a single instruction.
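  //
  // For example (hypothetical scenario), a scalarized address computation that
  // only feeds a predicated store is moved into the store's predicated block;
  // on the next pass its own operands may in turn become sinkable.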
4505   bool Changed;
4506   do {
4507     // Add the instructions that need to be reanalyzed to the worklist, and
4508     // reset the changed indicator.
4509     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4510     InstsToReanalyze.clear();
4511     Changed = false;
4512 
4513     while (!Worklist.empty()) {
4514       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4515 
4516       // We can't sink an instruction if it is a phi node, is already in the
4517       // predicated block, is not in the loop, or may have side effects.
4518       if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
4519           !VectorLoop->contains(I) || I->mayHaveSideEffects())
4520         continue;
4521 
4522       // It's legal to sink the instruction if all its uses occur in the
4523       // predicated block. Otherwise, there's nothing to do yet, and we may
4524       // need to reanalyze the instruction.
4525       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4526         InstsToReanalyze.push_back(I);
4527         continue;
4528       }
4529 
      // Move the instruction to the beginning of the predicated block, and add
      // its operands to the worklist.
4532       I->moveBefore(&*PredBB->getFirstInsertionPt());
4533       Worklist.insert(I->op_begin(), I->op_end());
4534 
4535       // The sinking may have enabled other instructions to be sunk, so we will
4536       // need to iterate.
4537       Changed = true;
4538     }
4539   } while (Changed);
4540 }
4541 
4542 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
4543   for (PHINode *OrigPhi : OrigPHIsToFix) {
4544     VPWidenPHIRecipe *VPPhi =
4545         cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi));
4546     PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
4547     // Make sure the builder has a valid insert point.
4548     Builder.SetInsertPoint(NewPhi);
4549     for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
4550       VPValue *Inc = VPPhi->getIncomingValue(i);
4551       VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
4552       NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
4553     }
4554   }
4555 }
4556 
4557 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef,
4558                                    VPUser &Operands, unsigned UF,
4559                                    ElementCount VF, bool IsPtrLoopInvariant,
4560                                    SmallBitVector &IsIndexLoopInvariant,
4561                                    VPTransformState &State) {
4562   // Construct a vector GEP by widening the operands of the scalar GEP as
4563   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4564   // results in a vector of pointers when at least one operand of the GEP
4565   // is vector-typed. Thus, to keep the representation compact, we only use
4566   // vector-typed operands for loop-varying values.
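  //
  // A shorthand illustration: a GEP such as
  //   getelementptr inbounds i32, i32* %base, <4 x i64> %vec.ind
  // produces a <4 x i32*> vector of pointers because one index is a vector,
  // even though the base pointer itself is scalar.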
4567 
4568   if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
4569     // If we are vectorizing, but the GEP has only loop-invariant operands,
4570     // the GEP we build (by only using vector-typed operands for
4571     // loop-varying values) would be a scalar pointer. Thus, to ensure we
4572     // produce a vector of pointers, we need to either arbitrarily pick an
4573     // operand to broadcast, or broadcast a clone of the original GEP.
4574     // Here, we broadcast a clone of the original.
4575     //
4576     // TODO: If at some point we decide to scalarize instructions having
4577     //       loop-invariant operands, this special case will no longer be
4578     //       required. We would add the scalarization decision to
4579     //       collectLoopScalars() and teach getVectorValue() to broadcast
4580     //       the lane-zero scalar value.
4581     auto *Clone = Builder.Insert(GEP->clone());
4582     for (unsigned Part = 0; Part < UF; ++Part) {
4583       Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4584       State.set(VPDef, EntryPart, Part);
4585       addMetadata(EntryPart, GEP);
4586     }
4587   } else {
4588     // If the GEP has at least one loop-varying operand, we are sure to
4589     // produce a vector of pointers. But if we are only unrolling, we want
4590     // to produce a scalar GEP for each unroll part. Thus, the GEP we
4591     // produce with the code below will be scalar (if VF == 1) or vector
4592     // (otherwise). Note that for the unroll-only case, we still maintain
4593     // values in the vector mapping with initVector, as we do for other
4594     // instructions.
4595     for (unsigned Part = 0; Part < UF; ++Part) {
4596       // The pointer operand of the new GEP. If it's loop-invariant, we
4597       // won't broadcast it.
4598       auto *Ptr = IsPtrLoopInvariant
4599                       ? State.get(Operands.getOperand(0), VPIteration(0, 0))
4600                       : State.get(Operands.getOperand(0), Part);
4601 
4602       // Collect all the indices for the new GEP. If any index is
4603       // loop-invariant, we won't broadcast it.
4604       SmallVector<Value *, 4> Indices;
4605       for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) {
4606         VPValue *Operand = Operands.getOperand(I);
4607         if (IsIndexLoopInvariant[I - 1])
4608           Indices.push_back(State.get(Operand, VPIteration(0, 0)));
4609         else
4610           Indices.push_back(State.get(Operand, Part));
4611       }
4612 
4613       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4614       // but it should be a vector, otherwise.
4615       auto *NewGEP =
4616           GEP->isInBounds()
4617               ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4618                                           Indices)
4619               : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4620       assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
4621              "NewGEP is not a pointer vector");
4622       State.set(VPDef, NewGEP, Part);
4623       addMetadata(NewGEP, GEP);
4624     }
4625   }
4626 }
4627 
4628 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
4629                                               RecurrenceDescriptor *RdxDesc,
4630                                               VPValue *StartVPV, VPValue *Def,
4631                                               VPTransformState &State) {
4632   PHINode *P = cast<PHINode>(PN);
4633   if (EnableVPlanNativePath) {
4634     // Currently we enter here in the VPlan-native path for non-induction
4635     // PHIs where all control flow is uniform. We simply widen these PHIs.
4636     // Create a vector phi with no operands - the vector phi operands will be
4637     // set at the end of vector code generation.
4638     Type *VecTy = (State.VF.isScalar())
4639                       ? PN->getType()
4640                       : VectorType::get(PN->getType(), State.VF);
4641     Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4642     State.set(Def, VecPhi, 0);
4643     OrigPHIsToFix.push_back(P);
4644 
4645     return;
4646   }
4647 
4648   assert(PN->getParent() == OrigLoop->getHeader() &&
4649          "Non-header phis should have been handled elsewhere");
4650 
4651   Value *StartV = StartVPV ? StartVPV->getLiveInIRValue() : nullptr;
4652   // In order to support recurrences we need to be able to vectorize Phi nodes.
4653   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4654   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4655   // this value when we vectorize all of the instructions that use the PHI.
4656   if (RdxDesc || Legal->isFirstOrderRecurrence(P)) {
4657     Value *Iden = nullptr;
4658     bool ScalarPHI =
4659         (State.VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN));
4660     Type *VecTy =
4661         ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), State.VF);
4662 
4663     if (RdxDesc) {
4664       assert(Legal->isReductionVariable(P) && StartV &&
4665              "RdxDesc should only be set for reduction variables; in that case "
4666              "a StartV is also required");
4667       RecurKind RK = RdxDesc->getRecurrenceKind();
4668       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) {
        // MinMax reductions have the start value as their identity.
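        // (There is no neutral constant for min/max, unlike e.g. 0 for an
        // integer add reduction or 1 for a mul reduction, which the branch
        // below uses.)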
4670         if (ScalarPHI) {
4671           Iden = StartV;
4672         } else {
4673           IRBuilderBase::InsertPointGuard IPBuilder(Builder);
4674           Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4675           StartV = Iden =
4676               Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident");
4677         }
4678       } else {
4679         Constant *IdenC = RecurrenceDescriptor::getRecurrenceIdentity(
4680             RK, VecTy->getScalarType());
4681         Iden = IdenC;
4682 
4683         if (!ScalarPHI) {
4684           Iden = ConstantVector::getSplat(State.VF, IdenC);
4685           IRBuilderBase::InsertPointGuard IPBuilder(Builder);
4686           Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4687           Constant *Zero = Builder.getInt32(0);
4688           StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
4689         }
4690       }
4691     }
4692 
4693     for (unsigned Part = 0; Part < State.UF; ++Part) {
4694       // This is phase one of vectorizing PHIs.
4695       Value *EntryPart = PHINode::Create(
4696           VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4697       State.set(Def, EntryPart, Part);
4698       if (StartV) {
4699         // Make sure to add the reduction start value only to the
4700         // first unroll part.
4701         Value *StartVal = (Part == 0) ? StartV : Iden;
4702         cast<PHINode>(EntryPart)->addIncoming(StartVal, LoopVectorPreHeader);
4703       }
4704     }
4705     return;
4706   }
4707 
4708   assert(!Legal->isReductionVariable(P) &&
4709          "reductions should be handled above");
4710 
4711   setDebugLocFromInst(Builder, P);
4712 
4713   // This PHINode must be an induction variable.
4714   // Make sure that we know about it.
4715   assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4716 
4717   InductionDescriptor II = Legal->getInductionVars().lookup(P);
4718   const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4719 
4720   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4721   // which can be found from the original scalar operations.
4722   switch (II.getKind()) {
4723   case InductionDescriptor::IK_NoInduction:
4724     llvm_unreachable("Unknown induction");
4725   case InductionDescriptor::IK_IntInduction:
4726   case InductionDescriptor::IK_FpInduction:
4727     llvm_unreachable("Integer/fp induction is handled elsewhere.");
4728   case InductionDescriptor::IK_PtrInduction: {
4729     // Handle the pointer induction variable case.
4730     assert(P->getType()->isPointerTy() && "Unexpected type.");
4731 
4732     if (Cost->isScalarAfterVectorization(P, State.VF)) {
4733       // This is the normalized GEP that starts counting at zero.
4734       Value *PtrInd =
4735           Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType());
4736       // Determine the number of scalars we need to generate for each unroll
4737       // iteration. If the instruction is uniform, we only need to generate the
4738       // first lane. Otherwise, we generate all VF values.
4739       unsigned Lanes = Cost->isUniformAfterVectorization(P, State.VF)
4740                            ? 1
4741                            : State.VF.getKnownMinValue();
4742       for (unsigned Part = 0; Part < UF; ++Part) {
4743         for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4744           Constant *Idx = ConstantInt::get(
4745               PtrInd->getType(), Lane + Part * State.VF.getKnownMinValue());
4746           Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4747           Value *SclrGep =
4748               emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
4749           SclrGep->setName("next.gep");
4750           State.set(Def, SclrGep, VPIteration(Part, Lane));
4751         }
4752       }
4753       return;
4754     }
4755     assert(isa<SCEVConstant>(II.getStep()) &&
4756            "Induction step not a SCEV constant!");
4757     Type *PhiType = II.getStep()->getType();
4758 
4759     // Build a pointer phi
4760     Value *ScalarStartValue = II.getStartValue();
4761     Type *ScStValueType = ScalarStartValue->getType();
4762     PHINode *NewPointerPhi =
4763         PHINode::Create(ScStValueType, 2, "pointer.phi", Induction);
4764     NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader);
4765 
    // The pointer induction is advanced in the loop latch by a GEP that adds
    // Step * VF * UF each vector iteration.
4767     BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
4768     Instruction *InductionLoc = LoopLatch->getTerminator();
4769     const SCEV *ScalarStep = II.getStep();
4770     SCEVExpander Exp(*PSE.getSE(), DL, "induction");
4771     Value *ScalarStepValue =
4772         Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
4773     Value *InductionGEP = GetElementPtrInst::Create(
4774         ScStValueType->getPointerElementType(), NewPointerPhi,
4775         Builder.CreateMul(
4776             ScalarStepValue,
4777             ConstantInt::get(PhiType, State.VF.getKnownMinValue() * State.UF)),
4778         "ptr.ind", InductionLoc);
4779     NewPointerPhi->addIncoming(InductionGEP, LoopLatch);
4780 
4781     // Create UF many actual address geps that use the pointer
4782     // phi as base and a vectorized version of the step value
4783     // (<step*0, ..., step*N>) as offset.
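    // For example, with VF = 4 and UF = 2, part 0 uses offsets
    // <0, 1, 2, 3> * Step and part 1 uses offsets <4, 5, 6, 7> * Step, while
    // the pointer phi itself advances by 8 * Step per vector iteration.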
4784     for (unsigned Part = 0; Part < State.UF; ++Part) {
4785       SmallVector<Constant *, 8> Indices;
      // Create a vector of consecutive offsets for this unroll part:
      // Part * VF, ..., Part * VF + VF - 1.
4787       for (unsigned i = 0; i < State.VF.getKnownMinValue(); ++i)
4788         Indices.push_back(
4789             ConstantInt::get(PhiType, i + Part * State.VF.getKnownMinValue()));
4790       Constant *StartOffset = ConstantVector::get(Indices);
4791 
4792       Value *GEP = Builder.CreateGEP(
4793           ScStValueType->getPointerElementType(), NewPointerPhi,
4794           Builder.CreateMul(StartOffset,
4795                             Builder.CreateVectorSplat(
4796                                 State.VF.getKnownMinValue(), ScalarStepValue),
4797                             "vector.gep"));
4798       State.set(Def, GEP, Part);
4799     }
4800   }
4801   }
4802 }
4803 
4804 /// A helper function for checking whether an integer division-related
4805 /// instruction may divide by zero (in which case it must be predicated if
4806 /// executed conditionally in the scalar code).
4807 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
/// Non-zero divisors that are not compile-time constants will not be
4809 /// converted into multiplication, so we will still end up scalarizing
4810 /// the division, but can do so w/o predication.
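/// For example, 'udiv i32 %x, %n' with a non-constant %n may divide by zero
/// at run time and so needs predication if executed conditionally, whereas
/// 'udiv i32 %x, 7' is known to be safe.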
4811 static bool mayDivideByZero(Instruction &I) {
4812   assert((I.getOpcode() == Instruction::UDiv ||
4813           I.getOpcode() == Instruction::SDiv ||
4814           I.getOpcode() == Instruction::URem ||
4815           I.getOpcode() == Instruction::SRem) &&
4816          "Unexpected instruction");
4817   Value *Divisor = I.getOperand(1);
4818   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4819   return !CInt || CInt->isZero();
4820 }
4821 
4822 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def,
4823                                            VPUser &User,
4824                                            VPTransformState &State) {
4825   switch (I.getOpcode()) {
4826   case Instruction::Call:
4827   case Instruction::Br:
4828   case Instruction::PHI:
4829   case Instruction::GetElementPtr:
4830   case Instruction::Select:
4831     llvm_unreachable("This instruction is handled by a different recipe.");
4832   case Instruction::UDiv:
4833   case Instruction::SDiv:
4834   case Instruction::SRem:
4835   case Instruction::URem:
4836   case Instruction::Add:
4837   case Instruction::FAdd:
4838   case Instruction::Sub:
4839   case Instruction::FSub:
4840   case Instruction::FNeg:
4841   case Instruction::Mul:
4842   case Instruction::FMul:
4843   case Instruction::FDiv:
4844   case Instruction::FRem:
4845   case Instruction::Shl:
4846   case Instruction::LShr:
4847   case Instruction::AShr:
4848   case Instruction::And:
4849   case Instruction::Or:
4850   case Instruction::Xor: {
4851     // Just widen unops and binops.
4852     setDebugLocFromInst(Builder, &I);
4853 
4854     for (unsigned Part = 0; Part < UF; ++Part) {
4855       SmallVector<Value *, 2> Ops;
4856       for (VPValue *VPOp : User.operands())
4857         Ops.push_back(State.get(VPOp, Part));
4858 
4859       Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4860 
4861       if (auto *VecOp = dyn_cast<Instruction>(V))
4862         VecOp->copyIRFlags(&I);
4863 
4864       // Use this vector value for all users of the original instruction.
4865       State.set(Def, V, Part);
4866       addMetadata(V, &I);
4867     }
4868 
4869     break;
4870   }
4871   case Instruction::ICmp:
4872   case Instruction::FCmp: {
4873     // Widen compares. Generate vector compares.
4874     bool FCmp = (I.getOpcode() == Instruction::FCmp);
4875     auto *Cmp = cast<CmpInst>(&I);
4876     setDebugLocFromInst(Builder, Cmp);
4877     for (unsigned Part = 0; Part < UF; ++Part) {
4878       Value *A = State.get(User.getOperand(0), Part);
4879       Value *B = State.get(User.getOperand(1), Part);
4880       Value *C = nullptr;
4881       if (FCmp) {
4882         // Propagate fast math flags.
4883         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4884         Builder.setFastMathFlags(Cmp->getFastMathFlags());
4885         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4886       } else {
4887         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4888       }
4889       State.set(Def, C, Part);
4890       addMetadata(C, &I);
4891     }
4892 
4893     break;
4894   }
4895 
4896   case Instruction::ZExt:
4897   case Instruction::SExt:
4898   case Instruction::FPToUI:
4899   case Instruction::FPToSI:
4900   case Instruction::FPExt:
4901   case Instruction::PtrToInt:
4902   case Instruction::IntToPtr:
4903   case Instruction::SIToFP:
4904   case Instruction::UIToFP:
4905   case Instruction::Trunc:
4906   case Instruction::FPTrunc:
4907   case Instruction::BitCast: {
4908     auto *CI = cast<CastInst>(&I);
4909     setDebugLocFromInst(Builder, CI);
4910 
    // Vectorize casts.
4912     Type *DestTy =
4913         (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF);
4914 
4915     for (unsigned Part = 0; Part < UF; ++Part) {
4916       Value *A = State.get(User.getOperand(0), Part);
4917       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4918       State.set(Def, Cast, Part);
4919       addMetadata(Cast, &I);
4920     }
4921     break;
4922   }
4923   default:
4924     // This instruction is not vectorized by simple widening.
4925     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4926     llvm_unreachable("Unhandled instruction!");
4927   } // end of switch.
4928 }
4929 
4930 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
4931                                                VPUser &ArgOperands,
4932                                                VPTransformState &State) {
4933   assert(!isa<DbgInfoIntrinsic>(I) &&
4934          "DbgInfoIntrinsic should have been dropped during VPlan construction");
4935   setDebugLocFromInst(Builder, &I);
4936 
4937   Module *M = I.getParent()->getParent()->getParent();
4938   auto *CI = cast<CallInst>(&I);
4939 
4940   SmallVector<Type *, 4> Tys;
4941   for (Value *ArgOperand : CI->arg_operands())
4942     Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
4943 
4944   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4945 
  // Decide whether to use a vector intrinsic or a call to a vectorized
  // library function for the widened instruction, based on which one the
  // cost model considers cheaper.
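  // For example, a call to sqrtf may be widened either to the
  // llvm.sqrt.v4f32 intrinsic or to a vector library routine found via
  // VFDatabase; the costs computed below decide between the two.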
4949   bool NeedToScalarize = false;
4950   InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4951   InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0;
4952   bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
4953   assert((UseVectorIntrinsic || !NeedToScalarize) &&
4954          "Instruction should be scalarized elsewhere.");
4955   assert(IntrinsicCost.isValid() && CallCost.isValid() &&
4956          "Cannot have invalid costs while widening");
4957 
4958   for (unsigned Part = 0; Part < UF; ++Part) {
4959     SmallVector<Value *, 4> Args;
4960     for (auto &I : enumerate(ArgOperands.operands())) {
4961       // Some intrinsics have a scalar argument - don't replace it with a
4962       // vector.
4963       Value *Arg;
4964       if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
4965         Arg = State.get(I.value(), Part);
4966       else
4967         Arg = State.get(I.value(), VPIteration(0, 0));
4968       Args.push_back(Arg);
4969     }
4970 
4971     Function *VectorF;
4972     if (UseVectorIntrinsic) {
4973       // Use vector version of the intrinsic.
4974       Type *TysForDecl[] = {CI->getType()};
4975       if (VF.isVector())
4976         TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4977       VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4978       assert(VectorF && "Can't retrieve vector intrinsic.");
4979     } else {
4980       // Use vector version of the function call.
4981       const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
4982 #ifndef NDEBUG
4983       assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
4984              "Can't create vector function.");
4985 #endif
      VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
    }
    SmallVector<OperandBundleDef, 1> OpBundles;
    CI->getOperandBundlesAsDefs(OpBundles);
    CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);

    if (isa<FPMathOperator>(V))
      V->copyFastMathFlags(CI);

    State.set(Def, V, Part);
    addMetadata(V, &I);
4997   }
4998 }
4999 
5000 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef,
5001                                                  VPUser &Operands,
5002                                                  bool InvariantCond,
5003                                                  VPTransformState &State) {
5004   setDebugLocFromInst(Builder, &I);
5005 
  // The condition can be loop invariant but still defined inside the
5007   // loop. This means that we can't just use the original 'cond' value.
5008   // We have to take the 'vectorized' value and pick the first lane.
5009   // Instcombine will make this a no-op.
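  // For example, a compare of two loop-invariant values may itself be emitted
  // inside the loop; its vectorized form has the same value in every lane, so
  // extracting lane 0 recovers the scalar condition.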
5010   auto *InvarCond = InvariantCond
5011                         ? State.get(Operands.getOperand(0), VPIteration(0, 0))
5012                         : nullptr;
5013 
5014   for (unsigned Part = 0; Part < UF; ++Part) {
5015     Value *Cond =
5016         InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part);
5017     Value *Op0 = State.get(Operands.getOperand(1), Part);
5018     Value *Op1 = State.get(Operands.getOperand(2), Part);
5019     Value *Sel = Builder.CreateSelect(Cond, Op0, Op1);
5020     State.set(VPDef, Sel, Part);
5021     addMetadata(Sel, &I);
5022   }
5023 }
5024 
5025 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
5026   // We should not collect Scalars more than once per VF. Right now, this
5027   // function is called from collectUniformsAndScalars(), which already does
5028   // this check. Collecting Scalars for VF=1 does not make any sense.
5029   assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
5030          "This function should not be visited twice for the same VF");
5031 
5032   SmallSetVector<Instruction *, 8> Worklist;
5033 
5034   // These sets are used to seed the analysis with pointers used by memory
5035   // accesses that will remain scalar.
5036   SmallSetVector<Instruction *, 8> ScalarPtrs;
5037   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
5038   auto *Latch = TheLoop->getLoopLatch();
5039 
5040   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
5041   // The pointer operands of loads and stores will be scalar as long as the
5042   // memory access is not a gather or scatter operation. The value operand of a
5043   // store will remain scalar if the store is scalarized.
5044   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
5045     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
5046     assert(WideningDecision != CM_Unknown &&
5047            "Widening decision should be ready at this moment");
5048     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
5049       if (Ptr == Store->getValueOperand())
5050         return WideningDecision == CM_Scalarize;
5051     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
5052            "Ptr is neither a value or pointer operand");
5053     return WideningDecision != CM_GatherScatter;
5054   };
5055 
5056   // A helper that returns true if the given value is a bitcast or
5057   // getelementptr instruction contained in the loop.
5058   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
5059     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
5060             isa<GetElementPtrInst>(V)) &&
5061            !TheLoop->isLoopInvariant(V);
5062   };
5063 
5064   auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) {
5065     if (!isa<PHINode>(Ptr) ||
5066         !Legal->getInductionVars().count(cast<PHINode>(Ptr)))
5067       return false;
5068     auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)];
5069     if (Induction.getKind() != InductionDescriptor::IK_PtrInduction)
5070       return false;
5071     return isScalarUse(MemAccess, Ptr);
5072   };
5073 
  // A helper that evaluates a memory access's use of a pointer. If the
  // pointer is actually the pointer induction of a loop, it is inserted into
  // the Worklist. If the use will be a scalar use, and the pointer is only
  // used by memory accesses, we place the pointer in ScalarPtrs. Otherwise,
  // the pointer is placed in PossibleNonScalarPtrs.
5079   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
5080     if (isScalarPtrInduction(MemAccess, Ptr)) {
5081       Worklist.insert(cast<Instruction>(Ptr));
5082       Instruction *Update = cast<Instruction>(
5083           cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
5084       Worklist.insert(Update);
5085       LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
5086                         << "\n");
5087       LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update
5088                         << "\n");
5089       return;
5090     }
5091     // We only care about bitcast and getelementptr instructions contained in
5092     // the loop.
5093     if (!isLoopVaryingBitCastOrGEP(Ptr))
5094       return;
5095 
5096     // If the pointer has already been identified as scalar (e.g., if it was
5097     // also identified as uniform), there's nothing to do.
5098     auto *I = cast<Instruction>(Ptr);
5099     if (Worklist.count(I))
5100       return;
5101 
5102     // If the use of the pointer will be a scalar use, and all users of the
5103     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
5104     // place the pointer in PossibleNonScalarPtrs.
5105     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
5106           return isa<LoadInst>(U) || isa<StoreInst>(U);
5107         }))
5108       ScalarPtrs.insert(I);
5109     else
5110       PossibleNonScalarPtrs.insert(I);
5111   };
5112 
  // We seed the scalars analysis with two classes of instructions: (1)
5114   // instructions marked uniform-after-vectorization and (2) bitcast,
5115   // getelementptr and (pointer) phi instructions used by memory accesses
5116   // requiring a scalar use.
5117   //
5118   // (1) Add to the worklist all instructions that have been identified as
5119   // uniform-after-vectorization.
5120   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
5121 
5122   // (2) Add to the worklist all bitcast and getelementptr instructions used by
5123   // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory access is not a gather or
5125   // scatter operation. The value operand of a store will remain scalar if the
5126   // store is scalarized.
5127   for (auto *BB : TheLoop->blocks())
5128     for (auto &I : *BB) {
5129       if (auto *Load = dyn_cast<LoadInst>(&I)) {
5130         evaluatePtrUse(Load, Load->getPointerOperand());
5131       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
5132         evaluatePtrUse(Store, Store->getPointerOperand());
5133         evaluatePtrUse(Store, Store->getValueOperand());
5134       }
5135     }
5136   for (auto *I : ScalarPtrs)
5137     if (!PossibleNonScalarPtrs.count(I)) {
5138       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
5139       Worklist.insert(I);
5140     }
5141 
5142   // Insert the forced scalars.
5143   // FIXME: Currently widenPHIInstruction() often creates a dead vector
5144   // induction variable when the PHI user is scalarized.
5145   auto ForcedScalar = ForcedScalars.find(VF);
5146   if (ForcedScalar != ForcedScalars.end())
5147     for (auto *I : ForcedScalar->second)
5148       Worklist.insert(I);
5149 
5150   // Expand the worklist by looking through any bitcasts and getelementptr
5151   // instructions we've already identified as scalar. This is similar to the
5152   // expansion step in collectLoopUniforms(); however, here we're only
5153   // expanding to include additional bitcasts and getelementptr instructions.
5154   unsigned Idx = 0;
5155   while (Idx != Worklist.size()) {
5156     Instruction *Dst = Worklist[Idx++];
5157     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
5158       continue;
5159     auto *Src = cast<Instruction>(Dst->getOperand(0));
5160     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
5161           auto *J = cast<Instruction>(U);
5162           return !TheLoop->contains(J) || Worklist.count(J) ||
5163                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
5164                   isScalarUse(J, Src));
5165         })) {
5166       Worklist.insert(Src);
5167       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
5168     }
5169   }
5170 
5171   // An induction variable will remain scalar if all users of the induction
5172   // variable and induction variable update remain scalar.
5173   for (auto &Induction : Legal->getInductionVars()) {
5174     auto *Ind = Induction.first;
5175     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5176 
5177     // If tail-folding is applied, the primary induction variable will be used
5178     // to feed a vector compare.
5179     if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
5180       continue;
5181 
5182     // Determine if all users of the induction variable are scalar after
5183     // vectorization.
5184     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
5185       auto *I = cast<Instruction>(U);
5186       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
5187     });
5188     if (!ScalarInd)
5189       continue;
5190 
5191     // Determine if all users of the induction variable update instruction are
5192     // scalar after vectorization.
5193     auto ScalarIndUpdate =
5194         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5195           auto *I = cast<Instruction>(U);
5196           return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
5197         });
5198     if (!ScalarIndUpdate)
5199       continue;
5200 
5201     // The induction variable and its update instruction will remain scalar.
5202     Worklist.insert(Ind);
5203     Worklist.insert(IndUpdate);
5204     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
5205     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
5206                       << "\n");
5207   }
5208 
5209   Scalars[VF].insert(Worklist.begin(), Worklist.end());
5210 }
5211 
5212 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I,
5213                                                          ElementCount VF) {
5214   if (!blockNeedsPredication(I->getParent()))
5215     return false;
5216   switch(I->getOpcode()) {
5217   default:
5218     break;
5219   case Instruction::Load:
5220   case Instruction::Store: {
5221     if (!Legal->isMaskRequired(I))
5222       return false;
5223     auto *Ptr = getLoadStorePointerOperand(I);
5224     auto *Ty = getMemInstValueType(I);
5225     // We have already decided how to vectorize this instruction, get that
5226     // result.
5227     if (VF.isVector()) {
5228       InstWidening WideningDecision = getWideningDecision(I, VF);
5229       assert(WideningDecision != CM_Unknown &&
5230              "Widening decision should be ready at this moment");
5231       return WideningDecision == CM_Scalarize;
5232     }
5233     const Align Alignment = getLoadStoreAlignment(I);
5234     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
5235                                 isLegalMaskedGather(Ty, Alignment))
5236                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
5237                                 isLegalMaskedScatter(Ty, Alignment));
5238   }
5239   case Instruction::UDiv:
5240   case Instruction::SDiv:
5241   case Instruction::SRem:
5242   case Instruction::URem:
5243     return mayDivideByZero(*I);
5244   }
5245   return false;
5246 }
5247 
5248 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
5249     Instruction *I, ElementCount VF) {
5250   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
5251   assert(getWideningDecision(I, VF) == CM_Unknown &&
5252          "Decision should not be set yet.");
5253   auto *Group = getInterleavedAccessGroup(I);
5254   assert(Group && "Must have a group.");
5255 
  // If the instruction's allocated size doesn't equal its type size, it
5257   // requires padding and will be scalarized.
5258   auto &DL = I->getModule()->getDataLayout();
5259   auto *ScalarTy = getMemInstValueType(I);
5260   if (hasIrregularType(ScalarTy, DL, VF))
5261     return false;
5262 
5263   // Check if masking is required.
5264   // A Group may need masking for one of two reasons: it resides in a block that
5265   // needs predication, or it was decided to use masking to deal with gaps.
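  // An example of the latter is a group with gaps, say loading only fields 0
  // and 2 of a 3-field struct: the last wide load may touch memory past the
  // final element, so the access must either be masked or rely on a scalar
  // epilogue.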
5266   bool PredicatedAccessRequiresMasking =
5267       Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
5268   bool AccessWithGapsRequiresMasking =
5269       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
5270   if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
5271     return true;
5272 
5273   // If masked interleaving is required, we expect that the user/target had
5274   // enabled it, because otherwise it either wouldn't have been created or
5275   // it should have been invalidated by the CostModel.
5276   assert(useMaskedInterleavedAccesses(TTI) &&
5277          "Masked interleave-groups for predicated accesses are not enabled.");
5278 
5279   auto *Ty = getMemInstValueType(I);
5280   const Align Alignment = getLoadStoreAlignment(I);
5281   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
5282                           : TTI.isLegalMaskedStore(Ty, Alignment);
5283 }
5284 
5285 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
5286     Instruction *I, ElementCount VF) {
5287   // Get and ensure we have a valid memory instruction.
5288   LoadInst *LI = dyn_cast<LoadInst>(I);
5289   StoreInst *SI = dyn_cast<StoreInst>(I);
5290   assert((LI || SI) && "Invalid memory instruction");
5291 
5292   auto *Ptr = getLoadStorePointerOperand(I);
5293 
  // To be widened, the pointer must first of all be consecutive.
5295   if (!Legal->isConsecutivePtr(Ptr))
5296     return false;
5297 
5298   // If the instruction is a store located in a predicated block, it will be
5299   // scalarized.
5300   if (isScalarWithPredication(I))
5301     return false;
5302 
  // If the instruction's allocated size doesn't equal its type size, it
5304   // requires padding and will be scalarized.
5305   auto &DL = I->getModule()->getDataLayout();
5306   auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
5307   if (hasIrregularType(ScalarTy, DL, VF))
5308     return false;
5309 
5310   return true;
5311 }
5312 
5313 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
5314   // We should not collect Uniforms more than once per VF. Right now,
5315   // this function is called from collectUniformsAndScalars(), which
5316   // already does this check. Collecting Uniforms for VF=1 does not make any
5317   // sense.
5318 
5319   assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
5320          "This function should not be visited twice for the same VF");
5321 
  // Visit the list of Uniforms. Even if we do not find any uniform value, we
  // will not analyze this VF again; Uniforms.count(VF) will return 1.
5324   Uniforms[VF].clear();
5325 
5326   // We now know that the loop is vectorizable!
5327   // Collect instructions inside the loop that will remain uniform after
5328   // vectorization.
5329 
  // Global values, params and instructions outside of the current loop are
  // out of scope.
5332   auto isOutOfScope = [&](Value *V) -> bool {
5333     Instruction *I = dyn_cast<Instruction>(V);
5334     return (!I || !TheLoop->contains(I));
5335   };
5336 
5337   SetVector<Instruction *> Worklist;
5338   BasicBlock *Latch = TheLoop->getLoopLatch();
5339 
5340   // Instructions that are scalar with predication must not be considered
5341   // uniform after vectorization, because that would create an erroneous
5342   // replicating region where only a single instance out of VF should be formed.
5343   // TODO: optimize such seldom cases if found important, see PR40816.
5344   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
5345     if (isOutOfScope(I)) {
5346       LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
5347                         << *I << "\n");
5348       return;
5349     }
5350     if (isScalarWithPredication(I, VF)) {
5351       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
5352                         << *I << "\n");
5353       return;
5354     }
5355     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
5356     Worklist.insert(I);
5357   };
5358 
5359   // Start with the conditional branch. If the branch condition is an
5360   // instruction contained in the loop that is only used by the branch, it is
5361   // uniform.
5362   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
5363   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
5364     addToWorklistIfAllowed(Cmp);
5365 
5366   auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
5367     InstWidening WideningDecision = getWideningDecision(I, VF);
5368     assert(WideningDecision != CM_Unknown &&
5369            "Widening decision should be ready at this moment");
5370 
5371     // A uniform memory op is itself uniform.  We exclude uniform stores
5372     // here as they demand the last lane, not the first one.
5373     if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) {
5374       assert(WideningDecision == CM_Scalarize);
5375       return true;
5376     }
5377 
5378     return (WideningDecision == CM_Widen ||
5379             WideningDecision == CM_Widen_Reverse ||
5380             WideningDecision == CM_Interleave);
5381   };
5382 
5383 
5384   // Returns true if Ptr is the pointer operand of a memory access instruction
5385   // I, and I is known to not require scalarization.
5386   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
5387     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
5388   };
5389 
5390   // Holds a list of values which are known to have at least one uniform use.
5391   // Note that there may be other uses which aren't uniform.  A "uniform use"
5392   // here is something which only demands lane 0 of the unrolled iterations;
5393   // it does not imply that all lanes produce the same value (e.g. this is not
5394   // the usual meaning of uniform)
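  // For example, the pointer operand of a consecutive widened load only
  // demands lane 0, since the vector address is formed from that single
  // pointer, even though the loaded values differ per lane.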
5395   SmallPtrSet<Value *, 8> HasUniformUse;
5396 
5397   // Scan the loop for instructions which are either a) known to have only
5398   // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
5399   for (auto *BB : TheLoop->blocks())
5400     for (auto &I : *BB) {
5401       // If there's no pointer operand, there's nothing to do.
5402       auto *Ptr = getLoadStorePointerOperand(&I);
5403       if (!Ptr)
5404         continue;
5405 
5406       // A uniform memory op is itself uniform.  We exclude uniform stores
5407       // here as they demand the last lane, not the first one.
5408       if (isa<LoadInst>(I) && Legal->isUniformMemOp(I))
5409         addToWorklistIfAllowed(&I);
5410 
5411       if (isUniformDecision(&I, VF)) {
5412         assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check");
5413         HasUniformUse.insert(Ptr);
5414       }
5415     }
5416 
5417   // Add to the worklist any operands which have *only* uniform (e.g. lane 0
5418   // demanding) users.  Since loops are assumed to be in LCSSA form, this
5419   // disallows uses outside the loop as well.
5420   for (auto *V : HasUniformUse) {
5421     if (isOutOfScope(V))
5422       continue;
5423     auto *I = cast<Instruction>(V);
5424     auto UsersAreMemAccesses =
5425       llvm::all_of(I->users(), [&](User *U) -> bool {
5426         return isVectorizedMemAccessUse(cast<Instruction>(U), V);
5427       });
5428     if (UsersAreMemAccesses)
5429       addToWorklistIfAllowed(I);
5430   }
5431 
  // Expand Worklist in topological order: whenever a new instruction is
  // added, its users should already be inside Worklist. This ensures that a
  // uniform instruction will only be used by uniform instructions.
5435   unsigned idx = 0;
5436   while (idx != Worklist.size()) {
5437     Instruction *I = Worklist[idx++];
5438 
5439     for (auto OV : I->operand_values()) {
5440       // isOutOfScope operands cannot be uniform instructions.
5441       if (isOutOfScope(OV))
5442         continue;
5443       // First order recurrence Phi's should typically be considered
5444       // non-uniform.
5445       auto *OP = dyn_cast<PHINode>(OV);
5446       if (OP && Legal->isFirstOrderRecurrence(OP))
5447         continue;
5448       // If all the users of the operand are uniform, then add the
5449       // operand into the uniform worklist.
5450       auto *OI = cast<Instruction>(OV);
5451       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
5452             auto *J = cast<Instruction>(U);
5453             return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
5454           }))
5455         addToWorklistIfAllowed(OI);
5456     }
5457   }
5458 
5459   // For an instruction to be added into Worklist above, all its users inside
5460   // the loop should also be in Worklist. However, this condition cannot be
5461   // true for phi nodes that form a cyclic dependence. We must process phi
5462   // nodes separately. An induction variable will remain uniform if all users
5463   // of the induction variable and induction variable update remain uniform.
5464   // The code below handles both pointer and non-pointer induction variables.
5465   for (auto &Induction : Legal->getInductionVars()) {
5466     auto *Ind = Induction.first;
5467     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5468 
5469     // Determine if all users of the induction variable are uniform after
5470     // vectorization.
5471     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
5472       auto *I = cast<Instruction>(U);
5473       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
5474              isVectorizedMemAccessUse(I, Ind);
5475     });
5476     if (!UniformInd)
5477       continue;
5478 
5479     // Determine if all users of the induction variable update instruction are
5480     // uniform after vectorization.
5481     auto UniformIndUpdate =
5482         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5483           auto *I = cast<Instruction>(U);
5484           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
5485                  isVectorizedMemAccessUse(I, IndUpdate);
5486         });
5487     if (!UniformIndUpdate)
5488       continue;
5489 
5490     // The induction variable and its update instruction will remain uniform.
5491     addToWorklistIfAllowed(Ind);
5492     addToWorklistIfAllowed(IndUpdate);
5493   }
5494 
5495   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
5496 }
5497 
5498 bool LoopVectorizationCostModel::runtimeChecksRequired() {
5499   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
5500 
5501   if (Legal->getRuntimePointerChecking()->Need) {
5502     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
5503         "runtime pointer checks needed. Enable vectorization of this "
5504         "loop with '#pragma clang loop vectorize(enable)' when "
5505         "compiling with -Os/-Oz",
5506         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5507     return true;
5508   }
5509 
5510   if (!PSE.getUnionPredicate().getPredicates().empty()) {
5511     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
5512         "runtime SCEV checks needed. Enable vectorization of this "
5513         "loop with '#pragma clang loop vectorize(enable)' when "
5514         "compiling with -Os/-Oz",
5515         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5516     return true;
5517   }
5518 
5519   // FIXME: Avoid specializing for stride==1 instead of bailing out.
5520   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
5521     reportVectorizationFailure("Runtime stride check for small trip count",
5522         "runtime stride == 1 checks needed. Enable vectorization of "
5523         "this loop without such check by compiling with -Os/-Oz",
5524         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5525     return true;
5526   }
5527 
5528   return false;
5529 }
5530 
5531 Optional<ElementCount>
5532 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
5533   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do this anyway, since the check's outcome is
    // still likely to be dynamically uniform if the target can skip it.
5536     reportVectorizationFailure(
5537         "Not inserting runtime ptr check for divergent target",
5538         "runtime pointer checks needed. Not enabled for divergent target",
5539         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5540     return None;
5541   }
5542 
5543   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5544   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5545   if (TC == 1) {
5546     reportVectorizationFailure("Single iteration (non) loop",
5547         "loop trip count is one, irrelevant for vectorization",
5548         "SingleIterationLoop", ORE, TheLoop);
5549     return None;
5550   }
5551 
5552   switch (ScalarEpilogueStatus) {
5553   case CM_ScalarEpilogueAllowed:
5554     return computeFeasibleMaxVF(TC, UserVF);
5555   case CM_ScalarEpilogueNotAllowedUsePredicate:
5556     LLVM_FALLTHROUGH;
5557   case CM_ScalarEpilogueNotNeededUsePredicate:
5558     LLVM_DEBUG(
5559         dbgs() << "LV: vector predicate hint/switch found.\n"
5560                << "LV: Not allowing scalar epilogue, creating predicated "
5561                << "vector loop.\n");
5562     break;
5563   case CM_ScalarEpilogueNotAllowedLowTripLoop:
5564     // fallthrough as a special case of OptForSize
5565   case CM_ScalarEpilogueNotAllowedOptSize:
5566     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5567       LLVM_DEBUG(
5568           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5569     else
5570       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5571                         << "count.\n");
5572 
5573     // Bail if runtime checks are required, which are not good when optimising
5574     // for size.
5575     if (runtimeChecksRequired())
5576       return None;
5577 
5578     break;
5579   }
5580 
  // The only loops we can vectorize without a scalar epilogue are loops with
5582   // a bottom-test and a single exiting block. We'd have to handle the fact
5583   // that not every instruction executes on the last iteration.  This will
5584   // require a lane mask which varies through the vector loop body.  (TODO)
5585   if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
5586     // If there was a tail-folding hint/switch, but we can't fold the tail by
5587     // masking, fallback to a vectorization with a scalar epilogue.
5588     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5589       LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5590                            "scalar epilogue instead.\n");
5591       ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5592       return computeFeasibleMaxVF(TC, UserVF);
5593     }
5594     return None;
5595   }
5596 
  // Now try tail folding.
5598 
5599   // Invalidate interleave groups that require an epilogue if we can't mask
5600   // the interleave-group.
5601   if (!useMaskedInterleavedAccesses(TTI)) {
5602     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5603            "No decisions should have been taken at this point");
    // Note: There is no need to invalidate any cost modeling decisions here,
    // as none were taken so far.
5606     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5607   }
5608 
5609   ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF);
5610   assert(!MaxVF.isScalable() &&
5611          "Scalable vectors do not yet support tail folding");
5612   assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) &&
5613          "MaxVF must be a power of 2");
5614   unsigned MaxVFtimesIC =
5615       UserIC ? MaxVF.getFixedValue() * UserIC : MaxVF.getFixedValue();
5616   // Avoid tail folding if the trip count is known to be a multiple of any VF we
5617   // chose.
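  // For example, a known trip count of 64 with MaxVF = 8 and UserIC = 2
  // gives MaxVFtimesIC = 16 and a remainder of 0, so no tail remains; a trip
  // count of 60 leaves a remainder, and the tail must then be folded or
  // handled by a scalar epilogue.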
5618   ScalarEvolution *SE = PSE.getSE();
5619   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
5620   const SCEV *ExitCount = SE->getAddExpr(
5621       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
5622   const SCEV *Rem = SE->getURemExpr(
5623       SE->applyLoopGuards(ExitCount, TheLoop),
5624       SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
5625   if (Rem->isZero()) {
5626     // Accept MaxVF if we do not have a tail.
5627     LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5628     return MaxVF;
5629   }
5630 
5631   // If we don't know the precise trip count, or if the trip count that we
5632   // found modulo the vectorization factor is not zero, try to fold the tail
5633   // by masking.
5634   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5635   if (Legal->prepareToFoldTailByMasking()) {
5636     FoldTailByMasking = true;
5637     return MaxVF;
5638   }
5639 
5640   // If there was a tail-folding hint/switch, but we can't fold the tail by
5641   // masking, fallback to a vectorization with a scalar epilogue.
5642   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5643     LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5644                          "scalar epilogue instead.\n");
5645     ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5646     return MaxVF;
5647   }
5648 
5649   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
5650     LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
5651     return None;
5652   }
5653 
5654   if (TC == 0) {
5655     reportVectorizationFailure(
5656         "Unable to calculate the loop count due to complex control flow",
5657         "unable to calculate the loop count due to complex control flow",
5658         "UnknownLoopCountComplexCFG", ORE, TheLoop);
5659     return None;
5660   }
5661 
5662   reportVectorizationFailure(
5663       "Cannot optimize for size and vectorize at the same time.",
5664       "cannot optimize for size and vectorize at the same time. "
5665       "Enable vectorization of this loop with '#pragma clang loop "
5666       "vectorize(enable)' when compiling with -Os/-Oz",
5667       "NoTailLoopWithOptForSize", ORE, TheLoop);
5668   return None;
5669 }
5670 
5671 ElementCount
5672 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
5673                                                  ElementCount UserVF) {
5674   bool IgnoreScalableUserVF = UserVF.isScalable() &&
5675                               !TTI.supportsScalableVectors() &&
5676                               !ForceTargetSupportsScalableVectors;
5677   if (IgnoreScalableUserVF) {
5678     LLVM_DEBUG(
5679         dbgs() << "LV: Ignoring VF=" << UserVF
5680                << " because target does not support scalable vectors.\n");
5681     ORE->emit([&]() {
5682       return OptimizationRemarkAnalysis(DEBUG_TYPE, "IgnoreScalableUserVF",
5683                                         TheLoop->getStartLoc(),
5684                                         TheLoop->getHeader())
5685              << "Ignoring VF=" << ore::NV("UserVF", UserVF)
5686              << " because target does not support scalable vectors.";
5687     });
5688   }
5689 
5690   // Beyond this point two scenarios are handled. If UserVF isn't specified
5691   // then a suitable VF is chosen. If UserVF is specified and there are
5692   // dependencies, check if it's legal. However, if a UserVF is specified and
5693   // there are no dependencies, then there's nothing to do.
5694   if (UserVF.isNonZero() && !IgnoreScalableUserVF) {
5695     if (!canVectorizeReductions(UserVF)) {
5696       reportVectorizationFailure(
5697           "LV: Scalable vectorization not supported for the reduction "
5698           "operations found in this loop. Using fixed-width "
5699           "vectorization instead.",
5700           "Scalable vectorization not supported for the reduction operations "
5701           "found in this loop. Using fixed-width vectorization instead.",
5702           "ScalableVFUnfeasible", ORE, TheLoop);
5703       return computeFeasibleMaxVF(
5704           ConstTripCount, ElementCount::getFixed(UserVF.getKnownMinValue()));
5705     }
5706 
5707     if (Legal->isSafeForAnyVectorWidth())
5708       return UserVF;
5709   }
5710 
5711   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
5712   unsigned SmallestType, WidestType;
5713   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
5714   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
5715 
5716   // Get the maximum safe dependence distance in bits computed by LAA.
5717   // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory access that is most restrictive (involved in the smallest
5719   // dependence distance).
5720   unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits();
5721 
5722   // If the user vectorization factor is legally unsafe, clamp it to a safe
5723   // value. Otherwise, return as is.
5724   if (UserVF.isNonZero() && !IgnoreScalableUserVF) {
5725     unsigned MaxSafeElements =
5726         PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType);
5727     ElementCount MaxSafeVF = ElementCount::getFixed(MaxSafeElements);
5728 
5729     if (UserVF.isScalable()) {
5730       Optional<unsigned> MaxVScale = TTI.getMaxVScale();
5731 
5732       // Scale VF by vscale before checking if it's safe.
5733       MaxSafeVF = ElementCount::getScalable(
5734           MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0);
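      // For example, with MaxSafeElements = 16 and MaxVScale = 4 the largest
      // safe scalable VF is vscale x 4; if the target reports no maximum
      // vscale, the scalable bound is treated as zero and we fall back to
      // fixed-width vectorization below.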
5735 
5736       if (MaxSafeVF.isZero()) {
5737         // The dependence distance is too small to use scalable vectors,
5738         // fallback on fixed.
5739         LLVM_DEBUG(
5740             dbgs()
5741             << "LV: Max legal vector width too small, scalable vectorization "
5742                "unfeasible. Using fixed-width vectorization instead.\n");
5743         ORE->emit([&]() {
5744           return OptimizationRemarkAnalysis(DEBUG_TYPE, "ScalableVFUnfeasible",
5745                                             TheLoop->getStartLoc(),
5746                                             TheLoop->getHeader())
5747                  << "Max legal vector width too small, scalable vectorization "
5748                  << "unfeasible. Using fixed-width vectorization instead.";
5749         });
5750         return computeFeasibleMaxVF(
5751             ConstTripCount, ElementCount::getFixed(UserVF.getKnownMinValue()));
5752       }
5753     }
5754 
5755     LLVM_DEBUG(dbgs() << "LV: The max safe VF is: " << MaxSafeVF << ".\n");
5756 
5757     if (ElementCount::isKnownLE(UserVF, MaxSafeVF))
5758       return UserVF;
5759 
5760     LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
5761                       << " is unsafe, clamping to max safe VF=" << MaxSafeVF
5762                       << ".\n");
5763     ORE->emit([&]() {
5764       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5765                                         TheLoop->getStartLoc(),
5766                                         TheLoop->getHeader())
5767              << "User-specified vectorization factor "
5768              << ore::NV("UserVectorizationFactor", UserVF)
5769              << " is unsafe, clamping to maximum safe vectorization factor "
5770              << ore::NV("VectorizationFactor", MaxSafeVF);
5771     });
5772     return MaxSafeVF;
5773   }
5774 
5775   WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits);
5776 
5777   // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
  // Note that both WidestRegister and WidestType may not be powers of 2.
5779   auto MaxVectorSize =
5780       ElementCount::getFixed(PowerOf2Floor(WidestRegister / WidestType));
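  // For example, a 256-bit widest register with a 32-bit widest type yields
  // MaxVectorSize = 8.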
5781 
5782   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5783                     << " / " << WidestType << " bits.\n");
5784   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5785                     << WidestRegister << " bits.\n");
5786 
5787   assert(MaxVectorSize.getFixedValue() <= WidestRegister &&
5788          "Did not expect to pack so many elements"
5789          " into one vector!");
5790   if (MaxVectorSize.getFixedValue() == 0) {
5791     LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
5792     return ElementCount::getFixed(1);
5793   } else if (ConstTripCount && ConstTripCount < MaxVectorSize.getFixedValue() &&
5794              isPowerOf2_32(ConstTripCount)) {
5795     // We need to clamp the VF to be the ConstTripCount. There is no point in
5796     // choosing a higher viable VF as done in the loop below.
5797     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
5798                       << ConstTripCount << "\n");
5799     return ElementCount::getFixed(ConstTripCount);
5800   }
5801 
5802   ElementCount MaxVF = MaxVectorSize;
5803   if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
5804       (MaximizeBandwidth && isScalarEpilogueAllowed())) {
5805     // Collect all viable vectorization factors larger than the default MaxVF
5806     // (i.e. MaxVectorSize).
5807     SmallVector<ElementCount, 8> VFs;
5808     auto MaxVectorSizeMaxBW =
5809         ElementCount::getFixed(WidestRegister / SmallestType);
5810     for (ElementCount VS = MaxVectorSize * 2;
5811          ElementCount::isKnownLE(VS, MaxVectorSizeMaxBW); VS *= 2)
5812       VFs.push_back(VS);
5813 
5814     // For each VF calculate its register usage.
5815     auto RUs = calculateRegisterUsage(VFs);
5816 
5817     // Select the largest VF which doesn't require more registers than existing
5818     // ones.
5819     for (int i = RUs.size() - 1; i >= 0; --i) {
5820       bool Selected = true;
5821       for (auto &pair : RUs[i].MaxLocalUsers) {
5822         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5823         if (pair.second > TargetNumRegisters)
5824           Selected = false;
5825       }
5826       if (Selected) {
5827         MaxVF = VFs[i];
5828         break;
5829       }
5830     }
5831     if (ElementCount MinVF =
5832             TTI.getMinimumVF(SmallestType, /*IsScalable=*/false)) {
5833       if (ElementCount::isKnownLT(MaxVF, MinVF)) {
5834         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5835                           << ") with target's minimum: " << MinVF << '\n');
5836         MaxVF = MinVF;
5837       }
5838     }
5839   }
5840   return MaxVF;
5841 }
5842 
5843 VectorizationFactor
5844 LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) {
5845   // FIXME: This can be fixed for scalable vectors later, because at this stage
5846   // the LoopVectorizer will only consider vectorizing a loop with scalable
5847   // vectors when the loop has a hint to enable vectorization for a given VF.
5848   assert(!MaxVF.isScalable() && "scalable vectors not yet supported");
5849 
5850   InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first;
5851   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
5852   assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
5853 
5854   auto Width = ElementCount::getFixed(1);
5855   const float ScalarCost = *ExpectedCost.getValue();
5856   float Cost = ScalarCost;
5857 
5858   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5859   if (ForceVectorization && MaxVF.isVector()) {
5860     // Ignore scalar width, because the user explicitly wants vectorization.
5861     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5862     // evaluation.
5863     Cost = std::numeric_limits<float>::max();
5864   }
5865 
5866   for (auto i = ElementCount::getFixed(2); ElementCount::isKnownLE(i, MaxVF);
5867        i *= 2) {
    // Notice that the vector loop needs to be executed fewer times, so
    // we need to divide the cost of the vector loop by the width of
    // the vector elements.
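    // For example, if the scalar loop costs 8 per iteration and the VF = 4
    // vector body costs 20, the normalized vector cost is 5 per scalar
    // iteration and VF = 4 is profitable relative to the scalar loop.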
5871     VectorizationCostTy C = expectedCost(i);
5872     assert(C.first.isValid() && "Unexpected invalid cost for vector loop");
5873     float VectorCost = *C.first.getValue() / (float)i.getFixedValue();
5874     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5875                       << " costs: " << (int)VectorCost << ".\n");
5876     if (!C.second && !ForceVectorization) {
5877       LLVM_DEBUG(
5878           dbgs() << "LV: Not considering vector loop of width " << i
5879                  << " because it will not generate any vector instructions.\n");
5880       continue;
5881     }
5882 
5883     // If profitable add it to ProfitableVF list.
5884     if (VectorCost < ScalarCost) {
5885       ProfitableVFs.push_back(VectorizationFactor(
5886           {i, (unsigned)VectorCost}));
5887     }
5888 
5889     if (VectorCost < Cost) {
5890       Cost = VectorCost;
5891       Width = i;
5892     }
5893   }
5894 
5895   if (!EnableCondStoresVectorization && NumPredStores) {
5896     reportVectorizationFailure("There are conditional stores.",
5897         "store that is conditionally executed prevents vectorization",
5898         "ConditionalStore", ORE, TheLoop);
5899     Width = ElementCount::getFixed(1);
5900     Cost = ScalarCost;
5901   }
5902 
5903   LLVM_DEBUG(if (ForceVectorization && !Width.isScalar() && Cost >= ScalarCost) dbgs()
5904              << "LV: Vectorization seems to be not beneficial, "
5905              << "but was forced by a user.\n");
5906   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
5907   VectorizationFactor Factor = {Width,
5908                                 (unsigned)(Width.getKnownMinValue() * Cost)};
5909   return Factor;
5910 }
5911 
5912 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
5913     const Loop &L, ElementCount VF) const {
5914   // Cross iteration phis such as reductions need special handling and are
5915   // currently unsupported.
5916   if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) {
5917         return Legal->isFirstOrderRecurrence(&Phi) ||
5918                Legal->isReductionVariable(&Phi);
5919       }))
5920     return false;
5921 
5922   // Phis with uses outside of the loop require special handling and are
5923   // currently unsupported.
5924   for (auto &Entry : Legal->getInductionVars()) {
5925     // Look for uses of the value of the induction at the last iteration.
5926     Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch());
5927     for (User *U : PostInc->users())
5928       if (!L.contains(cast<Instruction>(U)))
5929         return false;
    // Look for uses of the penultimate value of the induction.
5931     for (User *U : Entry.first->users())
5932       if (!L.contains(cast<Instruction>(U)))
5933         return false;
5934   }
5935 
5936   // Induction variables that are widened require special handling that is
5937   // currently not supported.
5938   if (any_of(Legal->getInductionVars(), [&](auto &Entry) {
5939         return !(this->isScalarAfterVectorization(Entry.first, VF) ||
5940                  this->isProfitableToScalarize(Entry.first, VF));
5941       }))
5942     return false;
5943 
5944   return true;
5945 }
5946 
5947 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
5948     const ElementCount VF) const {
5949   // FIXME: We need a much better cost-model to take different parameters such
5950   // as register pressure, code size increase and cost of extra branches into
5951   // account. For now we apply a very crude heuristic and only consider loops
5952   // with vectorization factors larger than a certain value.
5953   // We also consider epilogue vectorization unprofitable for targets that don't
  // consider interleaving beneficial (e.g. MVE).
5955   if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
5956     return false;
5957   if (VF.getFixedValue() >= EpilogueVectorizationMinVF)
5958     return true;
5959   return false;
5960 }
5961 
5962 VectorizationFactor
5963 LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
5964     const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
5965   VectorizationFactor Result = VectorizationFactor::Disabled();
5966   if (!EnableEpilogueVectorization) {
5967     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";);
5968     return Result;
5969   }
5970 
5971   if (!isScalarEpilogueAllowed()) {
5972     LLVM_DEBUG(
5973         dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
5974                   "allowed.\n";);
5975     return Result;
5976   }
5977 
5978   // FIXME: This can be fixed for scalable vectors later, because at this stage
5979   // the LoopVectorizer will only consider vectorizing a loop with scalable
5980   // vectors when the loop has a hint to enable vectorization for a given VF.
5981   if (MainLoopVF.isScalable()) {
5982     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization for scalable vectors not "
5983                          "yet supported.\n");
5984     return Result;
5985   }
5986 
5987   // Not really a cost consideration, but check for unsupported cases here to
5988   // simplify the logic.
5989   if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) {
5990     LLVM_DEBUG(
5991         dbgs() << "LEV: Unable to vectorize epilogue because the loop is "
5992                   "not a supported candidate.\n";);
5993     return Result;
5994   }
5995 
5996   if (EpilogueVectorizationForceVF > 1) {
5997     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
5998     if (LVP.hasPlanWithVFs(
5999             {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)}))
6000       return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0};
6001     else {
6002       LLVM_DEBUG(
6003           dbgs()
6004               << "LEV: Epilogue vectorization forced factor is not viable.\n";);
6005       return Result;
6006     }
6007   }
6008 
6009   if (TheLoop->getHeader()->getParent()->hasOptSize() ||
6010       TheLoop->getHeader()->getParent()->hasMinSize()) {
6011     LLVM_DEBUG(
6012         dbgs()
6013             << "LEV: Epilogue vectorization skipped due to opt for size.\n";);
6014     return Result;
6015   }
6016 
6017   if (!isEpilogueVectorizationProfitable(MainLoopVF))
6018     return Result;
6019 
6020   for (auto &NextVF : ProfitableVFs)
6021     if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) &&
6022         (Result.Width.getFixedValue() == 1 || NextVF.Cost < Result.Cost) &&
6023         LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width}))
6024       Result = NextVF;
6025 
6026   if (Result != VectorizationFactor::Disabled())
6027     LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
6028                       << Result.Width.getFixedValue() << "\n";);
6029   return Result;
6030 }
6031 
6032 std::pair<unsigned, unsigned>
6033 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
6034   unsigned MinWidth = -1U;
6035   unsigned MaxWidth = 8;
6036   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
6037 
6038   // For each block.
6039   for (BasicBlock *BB : TheLoop->blocks()) {
6040     // For each instruction in the loop.
6041     for (Instruction &I : BB->instructionsWithoutDebug()) {
6042       Type *T = I.getType();
6043 
6044       // Skip ignored values.
6045       if (ValuesToIgnore.count(&I))
6046         continue;
6047 
6048       // Only examine Loads, Stores and PHINodes.
6049       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
6050         continue;
6051 
6052       // Examine PHI nodes that are reduction variables. Update the type to
6053       // account for the recurrence type.
6054       if (auto *PN = dyn_cast<PHINode>(&I)) {
6055         if (!Legal->isReductionVariable(PN))
6056           continue;
6057         RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN];
6058         if (PreferInLoopReductions ||
6059             TTI.preferInLoopReduction(RdxDesc.getOpcode(),
6060                                       RdxDesc.getRecurrenceType(),
6061                                       TargetTransformInfo::ReductionFlags()))
6062           continue;
6063         T = RdxDesc.getRecurrenceType();
6064       }
6065 
6066       // Examine the stored values.
6067       if (auto *ST = dyn_cast<StoreInst>(&I))
6068         T = ST->getValueOperand()->getType();
6069 
6070       // Ignore loaded pointer types and stored pointer types that are not
6071       // vectorizable.
6072       //
6073       // FIXME: The check here attempts to predict whether a load or store will
6074       //        be vectorized. We only know this for certain after a VF has
6075       //        been selected. Here, we assume that if an access can be
6076       //        vectorized, it will be. We should also look at extending this
6077       //        optimization to non-pointer types.
6078       //
6079       if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
6080           !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
6081         continue;
6082 
6083       MinWidth = std::min(MinWidth,
6084                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
6085       MaxWidth = std::max(MaxWidth,
6086                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
6087     }
6088   }
6089 
6090   return {MinWidth, MaxWidth};
6091 }
6092 
6093 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
6094                                                            unsigned LoopCost) {
6095   // -- The interleave heuristics --
6096   // We interleave the loop in order to expose ILP and reduce the loop overhead.
6097   // There are many micro-architectural considerations that we can't predict
6098   // at this level. For example, frontend pressure (on decode or fetch) due to
6099   // code size, or the number and capabilities of the execution ports.
6100   //
6101   // We use the following heuristics to select the interleave count:
6102   // 1. If the code has reductions, then we interleave to break the cross
6103   // iteration dependency.
6104   // 2. If the loop is really small, then we interleave to reduce the loop
6105   // overhead.
6106   // 3. We don't interleave if we think that we will spill registers to memory
6107   // due to the increased register pressure.
6108 
6109   if (!isScalarEpilogueAllowed())
6110     return 1;
6111 
  // The maximum safe dependence distance has already been used to constrain
  // how far this loop can be widened, so do not interleave on top of that.
6113   if (Legal->getMaxSafeDepDistBytes() != -1U)
6114     return 1;
6115 
6116   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
6117   const bool HasReductions = !Legal->getReductionVars().empty();
  // Do not interleave loops with a relatively small known or estimated trip
  // count. But we will interleave when InterleaveSmallLoopScalarReduction is
  // enabled, and the code has scalar reductions (HasReductions && VF == 1),
  // because under those conditions interleaving can expose ILP and break
  // cross-iteration dependences for reductions.
6123   if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
6124       !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
6125     return 1;
6126 
6127   RegisterUsage R = calculateRegisterUsage({VF})[0];
  // We divide by these values below, so clamp them to at least one, i.e.
  // assume that at least one instruction uses at least one register.
6130   for (auto& pair : R.MaxLocalUsers) {
6131     pair.second = std::max(pair.second, 1U);
6132   }
6133 
6134   // We calculate the interleave count using the following formula.
6135   // Subtract the number of loop invariants from the number of available
6136   // registers. These registers are used by all of the interleaved instances.
6137   // Next, divide the remaining registers by the number of registers that is
6138   // required by the loop, in order to estimate how many parallel instances
6139   // fit without causing spills. All of this is rounded down if necessary to be
  // a power of two. We want a power-of-two interleave count to simplify any
  // addressing operations or alignment considerations.
  // We also want a power-of-two interleave count to ensure that the induction
  // variable of the vector loop wraps to zero when the tail is folded by
  // masking; this currently happens under OptForSize, in which case IC is set
  // to 1 above.
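  // For illustration only (hypothetical register counts, not from a real
  // target): with 32 registers in a class, 2 tied up by loop-invariant values
  // and a maximum local usage of 7 registers, the estimate below is
  // PowerOf2Floor((32 - 2) / 7) = PowerOf2Floor(4) = 4 interleaved instances.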
6145   unsigned IC = UINT_MAX;
6146 
6147   for (auto& pair : R.MaxLocalUsers) {
6148     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
6149     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
6150                       << " registers of "
6151                       << TTI.getRegisterClassName(pair.first) << " register class\n");
6152     if (VF.isScalar()) {
6153       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
6154         TargetNumRegisters = ForceTargetNumScalarRegs;
6155     } else {
6156       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
6157         TargetNumRegisters = ForceTargetNumVectorRegs;
6158     }
6159     unsigned MaxLocalUsers = pair.second;
6160     unsigned LoopInvariantRegs = 0;
6161     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
6162       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
6163 
    unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) /
                                   MaxLocalUsers);
6165     // Don't count the induction variable as interleaved.
6166     if (EnableIndVarRegisterHeur) {
6167       TmpIC =
6168           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
6169                         std::max(1U, (MaxLocalUsers - 1)));
6170     }
6171 
6172     IC = std::min(IC, TmpIC);
6173   }
6174 
6175   // Clamp the interleave ranges to reasonable counts.
6176   unsigned MaxInterleaveCount =
6177       TTI.getMaxInterleaveFactor(VF.getKnownMinValue());
6178 
6179   // Check if the user has overridden the max.
6180   if (VF.isScalar()) {
6181     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
6182       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
6183   } else {
6184     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
6185       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
6186   }
6187 
  // If the trip count is a known or estimated compile-time constant, limit the
  // interleave count to be at most the trip count divided by VF, provided it
  // is at least 1.
  //
  // For scalable vectors we can't know if interleaving is beneficial. It may
  // not be beneficial for small loops if none of the lanes in the second
  // vector iteration is enabled. However, for larger loops, there is likely to
  // be a similar benefit as for fixed-width vectors. For now, we choose to
  // leave the InterleaveCount as if vscale is '1', although if some
  // information about the vector is known (e.g. min vector size), we can make
  // a better decision.
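  // For example (hypothetical trip count): with an estimated trip count of 24
  // and VF = 8, MaxInterleaveCount is reduced to at most 24 / 8 = 3 before IC
  // itself is clamped below.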
6198   if (BestKnownTC) {
6199     MaxInterleaveCount =
6200         std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
6201     // Make sure MaxInterleaveCount is greater than 0.
6202     MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
6203   }
6204 
6205   assert(MaxInterleaveCount > 0 &&
6206          "Maximum interleave count must be greater than 0");
6207 
  // Clamp the calculated IC to be between 1 and the max interleave count
  // that the target and trip count allow.
6210   if (IC > MaxInterleaveCount)
6211     IC = MaxInterleaveCount;
6212   else
6213     // Make sure IC is greater than 0.
6214     IC = std::max(1u, IC);
6215 
6216   assert(IC > 0 && "Interleave count must be greater than 0.");
6217 
6218   // If we did not calculate the cost for VF (because the user selected the VF)
6219   // then we calculate the cost of VF here.
6220   if (LoopCost == 0) {
6221     assert(expectedCost(VF).first.isValid() && "Expected a valid cost");
6222     LoopCost = *expectedCost(VF).first.getValue();
6223   }
6224 
6225   assert(LoopCost && "Non-zero loop cost expected");
6226 
6227   // Interleave if we vectorized this loop and there is a reduction that could
6228   // benefit from interleaving.
6229   if (VF.isVector() && HasReductions) {
6230     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
6231     return IC;
6232   }
6233 
6234   // Note that if we've already vectorized the loop we will have done the
6235   // runtime check and so interleaving won't require further checks.
6236   bool InterleavingRequiresRuntimePointerCheck =
6237       (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
6238 
6239   // We want to interleave small loops in order to reduce the loop overhead and
6240   // potentially expose ILP opportunities.
6241   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
6242                     << "LV: IC is " << IC << '\n'
6243                     << "LV: VF is " << VF << '\n');
6244   const bool AggressivelyInterleaveReductions =
6245       TTI.enableAggressiveInterleaving(HasReductions);
6246   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
    // We assume that the per-iteration loop overhead costs about 1, and we use
    // the cost model to estimate the cost of the loop body. We then interleave
    // until the loop overhead is about 5% of the cost of the loop.
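    // For example (purely illustrative numbers): if SmallLoopCost is 20 and
    // the computed LoopCost is 4, SmallIC below becomes
    // min(IC, PowerOf2Floor(20 / 4)) = min(IC, 4).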
6250     unsigned SmallIC =
6251         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
6252 
6253     // Interleave until store/load ports (estimated by max interleave count) are
6254     // saturated.
6255     unsigned NumStores = Legal->getNumStores();
6256     unsigned NumLoads = Legal->getNumLoads();
6257     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
6258     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
6259 
    // If we have a scalar reduction (vector reductions are already dealt with
    // by this point), interleaving can increase the critical path length if
    // the loop we're interleaving is inside another loop. Limit the interleave
    // count, by default to 2, so the critical path only gets increased by one
    // reduction operation.
6264     if (HasReductions && TheLoop->getLoopDepth() > 1) {
6265       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
6266       SmallIC = std::min(SmallIC, F);
6267       StoresIC = std::min(StoresIC, F);
6268       LoadsIC = std::min(LoadsIC, F);
6269     }
6270 
6271     if (EnableLoadStoreRuntimeInterleave &&
6272         std::max(StoresIC, LoadsIC) > SmallIC) {
6273       LLVM_DEBUG(
6274           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
6275       return std::max(StoresIC, LoadsIC);
6276     }
6277 
6278     // If there are scalar reductions and TTI has enabled aggressive
6279     // interleaving for reductions, we will interleave to expose ILP.
6280     if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
6281         AggressivelyInterleaveReductions) {
6282       LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
6283       // Interleave no less than SmallIC but not as aggressive as the normal IC
6284       // to satisfy the rare situation when resources are too limited.
6285       return std::max(IC / 2, SmallIC);
6286     } else {
6287       LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
6288       return SmallIC;
6289     }
6290   }
6291 
6292   // Interleave if this is a large loop (small loops are already dealt with by
6293   // this point) that could benefit from interleaving.
6294   if (AggressivelyInterleaveReductions) {
6295     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
6296     return IC;
6297   }
6298 
6299   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
6300   return 1;
6301 }
6302 
6303 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
6304 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
6305   // This function calculates the register usage by measuring the highest number
6306   // of values that are alive at a single location. Obviously, this is a very
  // rough estimation. We scan the loop in topological order and assign a
  // number to each instruction. We use RPO to ensure that defs are
6309   // met before their users. We assume that each instruction that has in-loop
6310   // users starts an interval. We record every time that an in-loop value is
6311   // used, so we have a list of the first and last occurrences of each
6312   // instruction. Next, we transpose this data structure into a multi map that
6313   // holds the list of intervals that *end* at a specific location. This multi
6314   // map allows us to perform a linear search. We scan the instructions linearly
6315   // and record each time that a new interval starts, by placing it in a set.
6316   // If we find this value in the multi-map then we remove it from the set.
6317   // The max register usage is the maximum size of the set.
6318   // We also search for instructions that are defined outside the loop, but are
6319   // used inside the loop. We need this number separately from the max-interval
6320   // usage number because when we unroll, loop-invariant values do not take
  // more registers.
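  // A small, hypothetical example of the bookkeeping below: for the in-loop
  // sequence
  //   %a = load i32, i32* %p
  //   %b = load i32, i32* %q
  //   %c = add i32 %a, %b
  //   store i32 %c, i32* %r
  // the intervals of %a and %b are both open when %c is visited, so the
  // maximum local usage is estimated at two registers, while %p, %q and %r
  // (defined outside the loop) end up in the loop-invariant count instead.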
6322   LoopBlocksDFS DFS(TheLoop);
6323   DFS.perform(LI);
6324 
6325   RegisterUsage RU;
6326 
6327   // Each 'key' in the map opens a new interval. The values
6328   // of the map are the index of the 'last seen' usage of the
6329   // instruction that is the key.
6330   using IntervalMap = DenseMap<Instruction *, unsigned>;
6331 
6332   // Maps instruction to its index.
6333   SmallVector<Instruction *, 64> IdxToInstr;
6334   // Marks the end of each interval.
6335   IntervalMap EndPoint;
  // Saves the set of instructions that are used inside the loop.
6337   SmallPtrSet<Instruction *, 8> Ends;
6338   // Saves the list of values that are used in the loop but are
6339   // defined outside the loop, such as arguments and constants.
6340   SmallPtrSet<Value *, 8> LoopInvariants;
6341 
6342   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
6343     for (Instruction &I : BB->instructionsWithoutDebug()) {
6344       IdxToInstr.push_back(&I);
6345 
6346       // Save the end location of each USE.
6347       for (Value *U : I.operands()) {
6348         auto *Instr = dyn_cast<Instruction>(U);
6349 
6350         // Ignore non-instruction values such as arguments, constants, etc.
6351         if (!Instr)
6352           continue;
6353 
6354         // If this instruction is outside the loop then record it and continue.
6355         if (!TheLoop->contains(Instr)) {
6356           LoopInvariants.insert(Instr);
6357           continue;
6358         }
6359 
6360         // Overwrite previous end points.
6361         EndPoint[Instr] = IdxToInstr.size();
6362         Ends.insert(Instr);
6363       }
6364     }
6365   }
6366 
6367   // Saves the list of intervals that end with the index in 'key'.
6368   using InstrList = SmallVector<Instruction *, 2>;
6369   DenseMap<unsigned, InstrList> TransposeEnds;
6370 
6371   // Transpose the EndPoints to a list of values that end at each index.
6372   for (auto &Interval : EndPoint)
6373     TransposeEnds[Interval.second].push_back(Interval.first);
6374 
6375   SmallPtrSet<Instruction *, 8> OpenIntervals;
6376   SmallVector<RegisterUsage, 8> RUs(VFs.size());
6377   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
6378 
6379   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
6380 
6381   // A lambda that gets the register usage for the given type and VF.
6382   const auto &TTICapture = TTI;
6383   auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) {
6384     if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
6385       return 0U;
6386     return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
6387   };
6388 
6389   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
6390     Instruction *I = IdxToInstr[i];
6391 
6392     // Remove all of the instructions that end at this location.
6393     InstrList &List = TransposeEnds[i];
6394     for (Instruction *ToRemove : List)
6395       OpenIntervals.erase(ToRemove);
6396 
6397     // Ignore instructions that are never used within the loop.
6398     if (!Ends.count(I))
6399       continue;
6400 
6401     // Skip ignored values.
6402     if (ValuesToIgnore.count(I))
6403       continue;
6404 
6405     // For each VF find the maximum usage of registers.
6406     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
6407       // Count the number of live intervals.
6408       SmallMapVector<unsigned, unsigned, 4> RegUsage;
6409 
      if (VFs[j].isScalar()) {
        for (auto Inst : OpenIntervals) {
          unsigned ClassID =
              TTI.getRegisterClassForType(false, Inst->getType());
          RegUsage[ClassID] += 1;
        }
      } else {
        collectUniformsAndScalars(VFs[j]);
        for (auto Inst : OpenIntervals) {
          // Skip ignored values for VF > 1.
          if (VecValuesToIgnore.count(Inst))
            continue;
          if (isScalarAfterVectorization(Inst, VFs[j])) {
            unsigned ClassID =
                TTI.getRegisterClassForType(false, Inst->getType());
            RegUsage[ClassID] += 1;
          } else {
            unsigned ClassID =
                TTI.getRegisterClassForType(true, Inst->getType());
            RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
          }
        }
      }

      for (auto &pair : RegUsage) {
        auto &MaxUsage = MaxUsages[j][pair.first];
        MaxUsage = std::max(MaxUsage, pair.second);
      }
6446     }
6447 
6448     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
6449                       << OpenIntervals.size() << '\n');
6450 
6451     // Add the current instruction to the list of open intervals.
6452     OpenIntervals.insert(I);
6453   }
6454 
6455   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
6456     SmallMapVector<unsigned, unsigned, 4> Invariant;
6457 
6458     for (auto Inst : LoopInvariants) {
6459       unsigned Usage =
6460           VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
6461       unsigned ClassID =
6462           TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
      Invariant[ClassID] += Usage;
6467     }
6468 
6469     LLVM_DEBUG({
6470       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
6471       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
6472              << " item\n";
6473       for (const auto &pair : MaxUsages[i]) {
6474         dbgs() << "LV(REG): RegisterClass: "
6475                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6476                << " registers\n";
6477       }
6478       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
6479              << " item\n";
6480       for (const auto &pair : Invariant) {
6481         dbgs() << "LV(REG): RegisterClass: "
6482                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6483                << " registers\n";
6484       }
6485     });
6486 
6487     RU.LoopInvariantRegs = Invariant;
6488     RU.MaxLocalUsers = MaxUsages[i];
6489     RUs[i] = RU;
6490   }
6491 
6492   return RUs;
6493 }
6494 
bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I) {
6496   // TODO: Cost model for emulated masked load/store is completely
6497   // broken. This hack guides the cost model to use an artificially
6498   // high enough value to practically disable vectorization with such
6499   // operations, except where previously deployed legality hack allowed
6500   // using very low cost values. This is to avoid regressions coming simply
6501   // from moving "masked load/store" check from legality to cost model.
6502   // Masked Load/Gather emulation was previously never allowed.
6503   // Limited number of Masked Store/Scatter emulation was allowed.
6504   assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
6505   return isa<LoadInst>(I) ||
6506          (isa<StoreInst>(I) &&
6507           NumPredStores > NumberOfStoresToPredicate);
6508 }
6509 
6510 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
6511   // If we aren't vectorizing the loop, or if we've already collected the
6512   // instructions to scalarize, there's nothing to do. Collection may already
6513   // have occurred if we have a user-selected VF and are now computing the
6514   // expected cost for interleaving.
6515   if (VF.isScalar() || VF.isZero() ||
6516       InstsToScalarize.find(VF) != InstsToScalarize.end())
6517     return;
6518 
  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
6520   // not profitable to scalarize any instructions, the presence of VF in the
6521   // map will indicate that we've analyzed it already.
6522   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
6523 
6524   // Find all the instructions that are scalar with predication in the loop and
6525   // determine if it would be better to not if-convert the blocks they are in.
6526   // If so, we also record the instructions to scalarize.
6527   for (BasicBlock *BB : TheLoop->blocks()) {
6528     if (!blockNeedsPredication(BB))
6529       continue;
6530     for (Instruction &I : *BB)
6531       if (isScalarWithPredication(&I)) {
6532         ScalarCostsTy ScalarCosts;
6533         // Do not apply discount logic if hacked cost is needed
6534         // for emulated masked memrefs.
6535         if (!useEmulatedMaskMemRefHack(&I) &&
6536             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
6537           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
6538         // Remember that BB will remain after vectorization.
6539         PredicatedBBsAfterVectorization.insert(BB);
6540       }
6541   }
6542 }
6543 
6544 int LoopVectorizationCostModel::computePredInstDiscount(
6545     Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
6546   assert(!isUniformAfterVectorization(PredInst, VF) &&
6547          "Instruction marked uniform-after-vectorization will be predicated");
6548 
6549   // Initialize the discount to zero, meaning that the scalar version and the
6550   // vector version cost the same.
6551   InstructionCost Discount = 0;
6552 
6553   // Holds instructions to analyze. The instructions we visit are mapped in
6554   // ScalarCosts. Those instructions are the ones that would be scalarized if
6555   // we find that the scalar version costs less.
6556   SmallVector<Instruction *, 8> Worklist;
6557 
6558   // Returns true if the given instruction can be scalarized.
6559   auto canBeScalarized = [&](Instruction *I) -> bool {
6560     // We only attempt to scalarize instructions forming a single-use chain
6561     // from the original predicated block that would otherwise be vectorized.
6562     // Although not strictly necessary, we give up on instructions we know will
6563     // already be scalar to avoid traversing chains that are unlikely to be
6564     // beneficial.
6565     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
6566         isScalarAfterVectorization(I, VF))
6567       return false;
6568 
6569     // If the instruction is scalar with predication, it will be analyzed
6570     // separately. We ignore it within the context of PredInst.
6571     if (isScalarWithPredication(I))
6572       return false;
6573 
6574     // If any of the instruction's operands are uniform after vectorization,
6575     // the instruction cannot be scalarized. This prevents, for example, a
6576     // masked load from being scalarized.
6577     //
6578     // We assume we will only emit a value for lane zero of an instruction
6579     // marked uniform after vectorization, rather than VF identical values.
6580     // Thus, if we scalarize an instruction that uses a uniform, we would
6581     // create uses of values corresponding to the lanes we aren't emitting code
6582     // for. This behavior can be changed by allowing getScalarValue to clone
6583     // the lane zero values for uniforms rather than asserting.
6584     for (Use &U : I->operands())
6585       if (auto *J = dyn_cast<Instruction>(U.get()))
6586         if (isUniformAfterVectorization(J, VF))
6587           return false;
6588 
6589     // Otherwise, we can scalarize the instruction.
6590     return true;
6591   };
6592 
6593   // Compute the expected cost discount from scalarizing the entire expression
6594   // feeding the predicated instruction. We currently only consider expressions
6595   // that are single-use instruction chains.
6596   Worklist.push_back(PredInst);
6597   while (!Worklist.empty()) {
6598     Instruction *I = Worklist.pop_back_val();
6599 
6600     // If we've already analyzed the instruction, there's nothing to do.
6601     if (ScalarCosts.find(I) != ScalarCosts.end())
6602       continue;
6603 
6604     // Compute the cost of the vector instruction. Note that this cost already
6605     // includes the scalarization overhead of the predicated instruction.
6606     InstructionCost VectorCost = getInstructionCost(I, VF).first;
6607 
6608     // Compute the cost of the scalarized instruction. This cost is the cost of
6609     // the instruction as if it wasn't if-converted and instead remained in the
6610     // predicated block. We will scale this cost by block probability after
6611     // computing the scalarization overhead.
6612     assert(!VF.isScalable() && "scalable vectors not yet supported.");
6613     InstructionCost ScalarCost =
6614         VF.getKnownMinValue() *
6615         getInstructionCost(I, ElementCount::getFixed(1)).first;
6616 
6617     // Compute the scalarization overhead of needed insertelement instructions
6618     // and phi nodes.
6619     if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
6620       ScalarCost += TTI.getScalarizationOverhead(
6621           cast<VectorType>(ToVectorTy(I->getType(), VF)),
6622           APInt::getAllOnesValue(VF.getKnownMinValue()), true, false);
6623       assert(!VF.isScalable() && "scalable vectors not yet supported.");
6624       ScalarCost +=
6625           VF.getKnownMinValue() *
6626           TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
6627     }
6628 
6629     // Compute the scalarization overhead of needed extractelement
6630     // instructions. For each of the instruction's operands, if the operand can
6631     // be scalarized, add it to the worklist; otherwise, account for the
6632     // overhead.
6633     for (Use &U : I->operands())
6634       if (auto *J = dyn_cast<Instruction>(U.get())) {
6635         assert(VectorType::isValidElementType(J->getType()) &&
6636                "Instruction has non-scalar type");
6637         if (canBeScalarized(J))
6638           Worklist.push_back(J);
6639         else if (needsExtract(J, VF)) {
6640           assert(!VF.isScalable() && "scalable vectors not yet supported.");
6641           ScalarCost += TTI.getScalarizationOverhead(
6642               cast<VectorType>(ToVectorTy(J->getType(), VF)),
6643               APInt::getAllOnesValue(VF.getKnownMinValue()), false, true);
6644         }
6645       }
6646 
6647     // Scale the total scalar cost by block probability.
6648     ScalarCost /= getReciprocalPredBlockProb();
6649 
6650     // Compute the discount. A non-negative discount means the vector version
6651     // of the instruction costs more, and scalarizing would be beneficial.
6652     Discount += VectorCost - ScalarCost;
6653     ScalarCosts[I] = ScalarCost;
6654   }
6655 
6656   return *Discount.getValue();
6657 }
6658 
6659 LoopVectorizationCostModel::VectorizationCostTy
6660 LoopVectorizationCostModel::expectedCost(ElementCount VF) {
6661   VectorizationCostTy Cost;
6662 
6663   // For each block.
6664   for (BasicBlock *BB : TheLoop->blocks()) {
6665     VectorizationCostTy BlockCost;
6666 
6667     // For each instruction in the old loop.
6668     for (Instruction &I : BB->instructionsWithoutDebug()) {
6669       // Skip ignored values.
6670       if (ValuesToIgnore.count(&I) ||
6671           (VF.isVector() && VecValuesToIgnore.count(&I)))
6672         continue;
6673 
6674       VectorizationCostTy C = getInstructionCost(&I, VF);
6675 
6676       // Check if we should override the cost.
6677       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
6678         C.first = InstructionCost(ForceTargetInstructionCost);
6679 
6680       BlockCost.first += C.first;
6681       BlockCost.second |= C.second;
6682       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
6683                         << " for VF " << VF << " For instruction: " << I
6684                         << '\n');
6685     }
6686 
6687     // If we are vectorizing a predicated block, it will have been
6688     // if-converted. This means that the block's instructions (aside from
6689     // stores and instructions that may divide by zero) will now be
6690     // unconditionally executed. For the scalar case, we may not always execute
6691     // the predicated block, if it is an if-else block. Thus, scale the block's
6692     // cost by the probability of executing it. blockNeedsPredication from
6693     // Legal is used so as to not include all blocks in tail folded loops.
6694     if (VF.isScalar() && Legal->blockNeedsPredication(BB))
6695       BlockCost.first /= getReciprocalPredBlockProb();
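    // For instance, if getReciprocalPredBlockProb() is 2 (i.e. the block is
    // assumed to execute on every other iteration), a block cost of 10 is
    // scaled down to 5 in the scalar estimate.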
6696 
6697     Cost.first += BlockCost.first;
6698     Cost.second |= BlockCost.second;
6699   }
6700 
6701   return Cost;
6702 }
6703 
/// Gets the address access SCEV after verifying that the access pattern is
/// loop invariant except for the induction variable dependence.
6706 ///
6707 /// This SCEV can be sent to the Target in order to estimate the address
6708 /// calculation cost.
6709 static const SCEV *getAddressAccessSCEV(
6710               Value *Ptr,
6711               LoopVectorizationLegality *Legal,
6712               PredicatedScalarEvolution &PSE,
6713               const Loop *TheLoop) {
6714 
6715   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
6716   if (!Gep)
6717     return nullptr;
6718 
6719   // We are looking for a gep with all loop invariant indices except for one
6720   // which should be an induction variable.
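  // For example, a (hypothetical) access such as
  //   getelementptr inbounds i32, i32* %base, i64 %iv
  // qualifies when %base is loop invariant and %iv is an induction variable,
  // whereas a GEP with an index that is neither loop invariant nor an
  // induction variable is rejected below.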
6721   auto SE = PSE.getSE();
6722   unsigned NumOperands = Gep->getNumOperands();
6723   for (unsigned i = 1; i < NumOperands; ++i) {
6724     Value *Opd = Gep->getOperand(i);
6725     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
6726         !Legal->isInductionVariable(Opd))
6727       return nullptr;
6728   }
6729 
  // Now we know we have a GEP of the form (ptr, %inv, %ind, %inv). Return the
  // pointer SCEV.
6731   return PSE.getSCEV(Ptr);
6732 }
6733 
6734 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
6735   return Legal->hasStride(I->getOperand(0)) ||
6736          Legal->hasStride(I->getOperand(1));
6737 }
6738 
6739 InstructionCost
6740 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
6741                                                         ElementCount VF) {
6742   assert(VF.isVector() &&
6743          "Scalarization cost of instruction implies vectorization.");
6744   assert(!VF.isScalable() && "scalable vectors not yet supported.");
6745   Type *ValTy = getMemInstValueType(I);
6746   auto SE = PSE.getSE();
6747 
6748   unsigned AS = getLoadStoreAddressSpace(I);
6749   Value *Ptr = getLoadStorePointerOperand(I);
6750   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
6751 
  // Figure out whether the access is strided and get the stride value
  // if it's known at compile time.
6754   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
6755 
6756   // Get the cost of the scalar memory instruction and address computation.
6757   InstructionCost Cost =
6758       VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
6759 
6760   // Don't pass *I here, since it is scalar but will actually be part of a
6761   // vectorized loop where the user of it is a vectorized instruction.
6762   const Align Alignment = getLoadStoreAlignment(I);
6763   Cost += VF.getKnownMinValue() *
6764           TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
6765                               AS, TTI::TCK_RecipThroughput);
6766 
6767   // Get the overhead of the extractelement and insertelement instructions
6768   // we might create due to scalarization.
6769   Cost += getScalarizationOverhead(I, VF);
6770 
6771   // If we have a predicated store, it may not be executed for each vector
6772   // lane. Scale the cost by the probability of executing the predicated
6773   // block.
6774   if (isPredicatedInst(I)) {
6775     Cost /= getReciprocalPredBlockProb();
6776 
6777     if (useEmulatedMaskMemRefHack(I))
6778       // Artificially setting to a high enough value to practically disable
6779       // vectorization with such operations.
6780       Cost = 3000000;
6781   }
6782 
6783   return Cost;
6784 }
6785 
6786 InstructionCost
6787 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
6788                                                     ElementCount VF) {
6789   Type *ValTy = getMemInstValueType(I);
6790   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6791   Value *Ptr = getLoadStorePointerOperand(I);
6792   unsigned AS = getLoadStoreAddressSpace(I);
6793   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
6794   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6795 
6796   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6797          "Stride should be 1 or -1 for consecutive memory access");
6798   const Align Alignment = getLoadStoreAlignment(I);
6799   InstructionCost Cost = 0;
6800   if (Legal->isMaskRequired(I))
6801     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6802                                       CostKind);
6803   else
6804     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6805                                 CostKind, I);
6806 
6807   bool Reverse = ConsecutiveStride < 0;
6808   if (Reverse)
6809     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
6810   return Cost;
6811 }
6812 
6813 InstructionCost
6814 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
6815                                                 ElementCount VF) {
6816   assert(Legal->isUniformMemOp(*I));
6817 
6818   Type *ValTy = getMemInstValueType(I);
6819   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6820   const Align Alignment = getLoadStoreAlignment(I);
6821   unsigned AS = getLoadStoreAddressSpace(I);
6822   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6823   if (isa<LoadInst>(I)) {
6824     return TTI.getAddressComputationCost(ValTy) +
6825            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
6826                                CostKind) +
6827            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
6828   }
6829   StoreInst *SI = cast<StoreInst>(I);
6830 
6831   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
6832   return TTI.getAddressComputationCost(ValTy) +
6833          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
6834                              CostKind) +
6835          (isLoopInvariantStoreValue
6836               ? 0
6837               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
6838                                        VF.getKnownMinValue() - 1));
6839 }
6840 
6841 InstructionCost
6842 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6843                                                  ElementCount VF) {
6844   Type *ValTy = getMemInstValueType(I);
6845   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6846   const Align Alignment = getLoadStoreAlignment(I);
6847   const Value *Ptr = getLoadStorePointerOperand(I);
6848 
6849   return TTI.getAddressComputationCost(VectorTy) +
6850          TTI.getGatherScatterOpCost(
6851              I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6852              TargetTransformInfo::TCK_RecipThroughput, I);
6853 }
6854 
6855 InstructionCost
6856 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6857                                                    ElementCount VF) {
6858   // TODO: Once we have support for interleaving with scalable vectors
6859   // we can calculate the cost properly here.
6860   if (VF.isScalable())
6861     return InstructionCost::getInvalid();
6862 
6863   Type *ValTy = getMemInstValueType(I);
6864   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6865   unsigned AS = getLoadStoreAddressSpace(I);
6866 
6867   auto Group = getInterleavedAccessGroup(I);
6868   assert(Group && "Fail to get an interleaved access group.");
6869 
6870   unsigned InterleaveFactor = Group->getFactor();
6871   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
6872 
6873   // Holds the indices of existing members in an interleaved load group.
6874   // An interleaved store group doesn't need this as it doesn't allow gaps.
6875   SmallVector<unsigned, 4> Indices;
6876   if (isa<LoadInst>(I)) {
6877     for (unsigned i = 0; i < InterleaveFactor; i++)
6878       if (Group->getMember(i))
6879         Indices.push_back(i);
6880   }
6881 
6882   // Calculate the cost of the whole interleaved group.
6883   bool UseMaskForGaps =
6884       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
6885   InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
6886       I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6887       AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
6888 
6889   if (Group->isReverse()) {
6890     // TODO: Add support for reversed masked interleaved access.
6891     assert(!Legal->isMaskRequired(I) &&
6892            "Reverse masked interleaved access not supported.");
6893     Cost += Group->getNumMembers() *
6894             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
6895   }
6896   return Cost;
6897 }
6898 
6899 InstructionCost LoopVectorizationCostModel::getReductionPatternCost(
6900     Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
6901   // Early exit for no inloop reductions
6902   if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty))
6903     return InstructionCost::getInvalid();
6904   auto *VectorTy = cast<VectorType>(Ty);
6905 
  // We are looking for one of the following patterns, selecting the one with
  // the minimal acceptable cost:
6907   //  reduce(mul(ext(A), ext(B))) or
6908   //  reduce(mul(A, B)) or
6909   //  reduce(ext(A)) or
6910   //  reduce(A).
  // The basic idea is that we walk down the tree, finding the root reduction
  // instruction in InLoopReductionImmediateChains. From there we match the
  // mul/ext pattern and compare the cost of the entire pattern against the
  // cost of its components. If the reduction cost is lower, we return it for
  // the reduction instruction and 0 for the other instructions in the pattern.
  // If it is not, we return an invalid cost, specifying that the original
  // cost model should be used.
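  // As a purely illustrative source pattern for reduce(mul(ext(A), ext(B)))
  // (the types are hypothetical):
  //   %a.ext = sext i8 %a to i32
  //   %b.ext = sext i8 %b to i32
  //   %mul   = mul i32 %a.ext, %b.ext
  //   %acc   = add i32 %acc.phi, %mul      ; the in-loop reduction add
  // Here the cost of a single extended add-reduction is compared against the
  // summed costs of the two extends, the multiply and the base reduction.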
6918   Instruction *RetI = I;
6919   if ((RetI->getOpcode() == Instruction::SExt ||
6920        RetI->getOpcode() == Instruction::ZExt)) {
6921     if (!RetI->hasOneUser())
6922       return InstructionCost::getInvalid();
6923     RetI = RetI->user_back();
6924   }
6925   if (RetI->getOpcode() == Instruction::Mul &&
6926       RetI->user_back()->getOpcode() == Instruction::Add) {
6927     if (!RetI->hasOneUser())
6928       return InstructionCost::getInvalid();
6929     RetI = RetI->user_back();
6930   }
6931 
  // Test if the found instruction is a reduction, and if not return an invalid
  // cost telling the caller to use the original cost modelling.
6934   if (!InLoopReductionImmediateChains.count(RetI))
6935     return InstructionCost::getInvalid();
6936 
6937   // Find the reduction this chain is a part of and calculate the basic cost of
6938   // the reduction on its own.
6939   Instruction *LastChain = InLoopReductionImmediateChains[RetI];
6940   Instruction *ReductionPhi = LastChain;
6941   while (!isa<PHINode>(ReductionPhi))
6942     ReductionPhi = InLoopReductionImmediateChains[ReductionPhi];
6943 
6944   RecurrenceDescriptor RdxDesc =
6945       Legal->getReductionVars()[cast<PHINode>(ReductionPhi)];
6946   unsigned BaseCost = TTI.getArithmeticReductionCost(RdxDesc.getOpcode(),
6947                                                      VectorTy, false, CostKind);
6948 
6949   // Get the operand that was not the reduction chain and match it to one of the
6950   // patterns, returning the better cost if it is found.
6951   Instruction *RedOp = RetI->getOperand(1) == LastChain
6952                            ? dyn_cast<Instruction>(RetI->getOperand(0))
6953                            : dyn_cast<Instruction>(RetI->getOperand(1));
6954 
6955   VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
6956 
6957   if (RedOp && (isa<SExtInst>(RedOp) || isa<ZExtInst>(RedOp)) &&
6958       !TheLoop->isLoopInvariant(RedOp)) {
6959     bool IsUnsigned = isa<ZExtInst>(RedOp);
6960     auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
6961     InstructionCost RedCost = TTI.getExtendedAddReductionCost(
6962         /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6963         CostKind);
6964 
6965     unsigned ExtCost =
6966         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
6967                              TTI::CastContextHint::None, CostKind, RedOp);
6968     if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
6969       return I == RetI ? *RedCost.getValue() : 0;
6970   } else if (RedOp && RedOp->getOpcode() == Instruction::Mul) {
6971     Instruction *Mul = RedOp;
6972     Instruction *Op0 = dyn_cast<Instruction>(Mul->getOperand(0));
6973     Instruction *Op1 = dyn_cast<Instruction>(Mul->getOperand(1));
6974     if (Op0 && Op1 && (isa<SExtInst>(Op0) || isa<ZExtInst>(Op0)) &&
6975         Op0->getOpcode() == Op1->getOpcode() &&
6976         Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
6977         !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
6978       bool IsUnsigned = isa<ZExtInst>(Op0);
6979       auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
6980       // reduce(mul(ext, ext))
6981       unsigned ExtCost =
6982           TTI.getCastInstrCost(Op0->getOpcode(), VectorTy, ExtType,
6983                                TTI::CastContextHint::None, CostKind, Op0);
6984       InstructionCost MulCost =
6985           TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind);
6986 
6987       InstructionCost RedCost = TTI.getExtendedAddReductionCost(
6988           /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6989           CostKind);
6990 
6991       if (RedCost.isValid() && RedCost < ExtCost * 2 + MulCost + BaseCost)
6992         return I == RetI ? *RedCost.getValue() : 0;
6993     } else {
6994       InstructionCost MulCost =
6995           TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind);
6996 
6997       InstructionCost RedCost = TTI.getExtendedAddReductionCost(
6998           /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy,
6999           CostKind);
7000 
7001       if (RedCost.isValid() && RedCost < MulCost + BaseCost)
7002         return I == RetI ? *RedCost.getValue() : 0;
7003     }
7004   }
7005 
7006   return I == RetI ? BaseCost : InstructionCost::getInvalid();
7007 }
7008 
7009 InstructionCost
7010 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
7011                                                      ElementCount VF) {
7012   // Calculate scalar cost only. Vectorization cost should be ready at this
7013   // moment.
7014   if (VF.isScalar()) {
7015     Type *ValTy = getMemInstValueType(I);
7016     const Align Alignment = getLoadStoreAlignment(I);
7017     unsigned AS = getLoadStoreAddressSpace(I);
7018 
7019     return TTI.getAddressComputationCost(ValTy) +
7020            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
7021                                TTI::TCK_RecipThroughput, I);
7022   }
7023   return getWideningCost(I, VF);
7024 }
7025 
7026 LoopVectorizationCostModel::VectorizationCostTy
7027 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
7028                                                ElementCount VF) {
7029   // If we know that this instruction will remain uniform, check the cost of
7030   // the scalar version.
7031   if (isUniformAfterVectorization(I, VF))
7032     VF = ElementCount::getFixed(1);
7033 
7034   if (VF.isVector() && isProfitableToScalarize(I, VF))
7035     return VectorizationCostTy(InstsToScalarize[VF][I], false);
7036 
7037   // Forced scalars do not have any scalarization overhead.
7038   auto ForcedScalar = ForcedScalars.find(VF);
7039   if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
7040     auto InstSet = ForcedScalar->second;
7041     if (InstSet.count(I))
7042       return VectorizationCostTy(
7043           (getInstructionCost(I, ElementCount::getFixed(1)).first *
7044            VF.getKnownMinValue()),
7045           false);
7046   }
7047 
7048   Type *VectorTy;
7049   InstructionCost C = getInstructionCost(I, VF, VectorTy);
7050 
7051   bool TypeNotScalarized =
7052       VF.isVector() && VectorTy->isVectorTy() &&
7053       TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue();
7054   return VectorizationCostTy(C, TypeNotScalarized);
7055 }
7056 
7057 InstructionCost
7058 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
7059                                                      ElementCount VF) {
7060 
7061   if (VF.isScalable())
7062     return InstructionCost::getInvalid();
7063 
7064   if (VF.isScalar())
7065     return 0;
7066 
7067   InstructionCost Cost = 0;
7068   Type *RetTy = ToVectorTy(I->getType(), VF);
7069   if (!RetTy->isVoidTy() &&
7070       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
7071     Cost += TTI.getScalarizationOverhead(
7072         cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()),
7073         true, false);
7074 
7075   // Some targets keep addresses scalar.
7076   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
7077     return Cost;
7078 
7079   // Some targets support efficient element stores.
7080   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
7081     return Cost;
7082 
7083   // Collect operands to consider.
7084   CallInst *CI = dyn_cast<CallInst>(I);
7085   Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
7086 
7087   // Skip operands that do not require extraction/scalarization and do not incur
7088   // any overhead.
7089   SmallVector<Type *> Tys;
7090   for (auto *V : filterExtractingOperands(Ops, VF))
7091     Tys.push_back(MaybeVectorizeType(V->getType(), VF));
7092   return Cost + TTI.getOperandsScalarizationOverhead(
7093                     filterExtractingOperands(Ops, VF), Tys);
7094 }
7095 
7096 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
7097   if (VF.isScalar())
7098     return;
7099   NumPredStores = 0;
7100   for (BasicBlock *BB : TheLoop->blocks()) {
7101     // For each instruction in the old loop.
7102     for (Instruction &I : *BB) {
7103       Value *Ptr =  getLoadStorePointerOperand(&I);
7104       if (!Ptr)
7105         continue;
7106 
7107       // TODO: We should generate better code and update the cost model for
7108       // predicated uniform stores. Today they are treated as any other
7109       // predicated store (see added test cases in
7110       // invariant-store-vectorization.ll).
7111       if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
7112         NumPredStores++;
7113 
7114       if (Legal->isUniformMemOp(I)) {
7115         // TODO: Avoid replicating loads and stores instead of
7116         // relying on instcombine to remove them.
7117         // Load: Scalar load + broadcast
7118         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
7119         InstructionCost Cost = getUniformMemOpCost(&I, VF);
7120         setWideningDecision(&I, VF, CM_Scalarize, Cost);
7121         continue;
7122       }
7123 
7124       // We assume that widening is the best solution when possible.
7125       if (memoryInstructionCanBeWidened(&I, VF)) {
7126         InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
7127         int ConsecutiveStride =
7128                Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
7129         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
7130                "Expected consecutive stride.");
7131         InstWidening Decision =
7132             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
7133         setWideningDecision(&I, VF, Decision, Cost);
7134         continue;
7135       }
7136 
7137       // Choose between Interleaving, Gather/Scatter or Scalarization.
7138       InstructionCost InterleaveCost = InstructionCost::getInvalid();
7139       unsigned NumAccesses = 1;
7140       if (isAccessInterleaved(&I)) {
7141         auto Group = getInterleavedAccessGroup(&I);
7142         assert(Group && "Fail to get an interleaved access group.");
7143 
7144         // Make one decision for the whole group.
7145         if (getWideningDecision(&I, VF) != CM_Unknown)
7146           continue;
7147 
7148         NumAccesses = Group->getNumMembers();
7149         if (interleavedAccessCanBeWidened(&I, VF))
7150           InterleaveCost = getInterleaveGroupCost(&I, VF);
7151       }
7152 
7153       InstructionCost GatherScatterCost =
7154           isLegalGatherOrScatter(&I)
7155               ? getGatherScatterCost(&I, VF) * NumAccesses
7156               : InstructionCost::getInvalid();
7157 
7158       InstructionCost ScalarizationCost =
7159           !VF.isScalable() ? getMemInstScalarizationCost(&I, VF) * NumAccesses
7160                            : InstructionCost::getInvalid();
7161 
      // Choose the better solution for the current VF, record this decision
      // and use it during vectorization.
7164       InstructionCost Cost;
7165       InstWidening Decision;
7166       if (InterleaveCost <= GatherScatterCost &&
7167           InterleaveCost < ScalarizationCost) {
7168         Decision = CM_Interleave;
7169         Cost = InterleaveCost;
7170       } else if (GatherScatterCost < ScalarizationCost) {
7171         Decision = CM_GatherScatter;
7172         Cost = GatherScatterCost;
7173       } else {
7174         assert(!VF.isScalable() &&
7175                "We cannot yet scalarise for scalable vectors");
7176         Decision = CM_Scalarize;
7177         Cost = ScalarizationCost;
7178       }
      // If the instruction belongs to an interleave group, the whole group
      // receives the same decision. The whole group receives the cost, but
      // the cost will actually be assigned to one instruction.
7182       if (auto Group = getInterleavedAccessGroup(&I))
7183         setWideningDecision(Group, VF, Decision, Cost);
7184       else
7185         setWideningDecision(&I, VF, Decision, Cost);
7186     }
7187   }
7188 
7189   // Make sure that any load of address and any other address computation
7190   // remains scalar unless there is gather/scatter support. This avoids
7191   // inevitable extracts into address registers, and also has the benefit of
7192   // activating LSR more, since that pass can't optimize vectorized
7193   // addresses.
7194   if (TTI.prefersVectorizedAddressing())
7195     return;
7196 
7197   // Start with all scalar pointer uses.
7198   SmallPtrSet<Instruction *, 8> AddrDefs;
7199   for (BasicBlock *BB : TheLoop->blocks())
7200     for (Instruction &I : *BB) {
7201       Instruction *PtrDef =
7202         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
7203       if (PtrDef && TheLoop->contains(PtrDef) &&
7204           getWideningDecision(&I, VF) != CM_GatherScatter)
7205         AddrDefs.insert(PtrDef);
7206     }
7207 
7208   // Add all instructions used to generate the addresses.
7209   SmallVector<Instruction *, 4> Worklist;
7210   append_range(Worklist, AddrDefs);
7211   while (!Worklist.empty()) {
7212     Instruction *I = Worklist.pop_back_val();
7213     for (auto &Op : I->operands())
7214       if (auto *InstOp = dyn_cast<Instruction>(Op))
7215         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
7216             AddrDefs.insert(InstOp).second)
7217           Worklist.push_back(InstOp);
7218   }
7219 
7220   for (auto *I : AddrDefs) {
7221     if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled by
      // the cost functions, but since this involves the task of finding out
      // if the loaded register is involved in an address computation, it is
      // instead changed here when we know this is the case.
7226       InstWidening Decision = getWideningDecision(I, VF);
7227       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
7228         // Scalarize a widened load of address.
7229         setWideningDecision(
7230             I, VF, CM_Scalarize,
7231             (VF.getKnownMinValue() *
7232              getMemoryInstructionCost(I, ElementCount::getFixed(1))));
7233       else if (auto Group = getInterleavedAccessGroup(I)) {
7234         // Scalarize an interleave group of address loads.
7235         for (unsigned I = 0; I < Group->getFactor(); ++I) {
7236           if (Instruction *Member = Group->getMember(I))
7237             setWideningDecision(
7238                 Member, VF, CM_Scalarize,
7239                 (VF.getKnownMinValue() *
7240                  getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
7241         }
7242       }
7243     } else
7244       // Make sure I gets scalarized and a cost estimate without
7245       // scalarization overhead.
7246       ForcedScalars[VF].insert(I);
7247   }
7248 }
7249 
7250 InstructionCost
7251 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
7252                                                Type *&VectorTy) {
7253   Type *RetTy = I->getType();
7254   if (canTruncateToMinimalBitwidth(I, VF))
7255     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
7256   VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
7257   auto SE = PSE.getSE();
7258   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
7259 
7260   // TODO: We need to estimate the cost of intrinsic calls.
7261   switch (I->getOpcode()) {
7262   case Instruction::GetElementPtr:
7263     // We mark this instruction as zero-cost because the cost of GEPs in
7264     // vectorized code depends on whether the corresponding memory instruction
7265     // is scalarized or not. Therefore, we handle GEPs with the memory
7266     // instruction cost.
7267     return 0;
7268   case Instruction::Br: {
7269     // For scalarized and predicated instructions, there will be VF
7270     // predicated blocks in the vectorized loop. Each branch around these
7271     // blocks also requires an extract of its vector compare i1 element.
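         // Illustrative example: with VF = 4, four predicated scalar blocks are
         // emitted, and each of the four guarding branches needs an extractelement
         // of the corresponding i1 lane of the vector compare, which is what the
         // scalarization-overhead term below accounts for.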
7272     bool ScalarPredicatedBB = false;
7273     BranchInst *BI = cast<BranchInst>(I);
7274     if (VF.isVector() && BI->isConditional() &&
7275         (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
7276          PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
7277       ScalarPredicatedBB = true;
7278 
7279     if (ScalarPredicatedBB) {
7280       // Return cost for branches around scalarized and predicated blocks.
7281       assert(!VF.isScalable() && "scalable vectors not yet supported.");
7282       auto *Vec_i1Ty =
7283           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
7284       return (TTI.getScalarizationOverhead(
7285                   Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()),
7286                   false, true) +
7287               (TTI.getCFInstrCost(Instruction::Br, CostKind) *
7288                VF.getKnownMinValue()));
7289     } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
7290       // The back-edge branch will remain, as will all scalar branches.
7291       return TTI.getCFInstrCost(Instruction::Br, CostKind);
7292     else
7293       // This branch will be eliminated by if-conversion.
7294       return 0;
7295     // Note: We currently assume zero cost for an unconditional branch inside
7296     // a predicated block since it will become a fall-through, although we
7297     // may decide in the future to call TTI for all branches.
7298   }
7299   case Instruction::PHI: {
7300     auto *Phi = cast<PHINode>(I);
7301 
7302     // First-order recurrences are replaced by vector shuffles inside the loop.
7303     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
7304     if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi))
7305       return TTI.getShuffleCost(
7306           TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy),
7307           VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1));
7308 
7309     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
7310     // converted into select instructions. We require N - 1 selects per phi
7311     // node, where N is the number of incoming values.
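         // Illustrative example: a phi in a non-header block with three incoming
         // values {a, b, c} is lowered to N - 1 = 2 vector selects.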
7312     if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
7313       return (Phi->getNumIncomingValues() - 1) *
7314              TTI.getCmpSelInstrCost(
7315                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
7316                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
7317                  CmpInst::BAD_ICMP_PREDICATE, CostKind);
7318 
7319     return TTI.getCFInstrCost(Instruction::PHI, CostKind);
7320   }
7321   case Instruction::UDiv:
7322   case Instruction::SDiv:
7323   case Instruction::URem:
7324   case Instruction::SRem:
7325     // If we have a predicated instruction, it may not be executed for each
7326     // vector lane. Get the scalarization cost and scale this amount by the
7327     // probability of executing the predicated block. If the instruction is not
7328     // predicated, we fall through to the next case.
7329     if (VF.isVector() && isScalarWithPredication(I)) {
7330       InstructionCost Cost = 0;
7331 
7332       // These instructions have a non-void type, so account for the phi nodes
7333       // that we will create. This cost is likely to be zero. The phi node
7334       // cost, if any, should be scaled by the block probability because it
7335       // models a copy at the end of each predicated block.
7336       Cost += VF.getKnownMinValue() *
7337               TTI.getCFInstrCost(Instruction::PHI, CostKind);
7338 
7339       // The cost of the non-predicated instruction.
7340       Cost += VF.getKnownMinValue() *
7341               TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);
7342 
7343       // The cost of insertelement and extractelement instructions needed for
7344       // scalarization.
7345       Cost += getScalarizationOverhead(I, VF);
7346 
7347       // Scale the cost by the probability of executing the predicated blocks.
7348       // This assumes the predicated block for each vector lane is equally
7349       // likely.
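           // Illustrative example: with VF = 4, the per-lane phi, divide and
           // insert/extract costs are summed over the four lanes and then divided
           // by the reciprocal block probability (e.g. 2 if each predicated block
           // is assumed to execute half the time).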
7350       return Cost / getReciprocalPredBlockProb();
7351     }
7352     LLVM_FALLTHROUGH;
7353   case Instruction::Add:
7354   case Instruction::FAdd:
7355   case Instruction::Sub:
7356   case Instruction::FSub:
7357   case Instruction::Mul:
7358   case Instruction::FMul:
7359   case Instruction::FDiv:
7360   case Instruction::FRem:
7361   case Instruction::Shl:
7362   case Instruction::LShr:
7363   case Instruction::AShr:
7364   case Instruction::And:
7365   case Instruction::Or:
7366   case Instruction::Xor: {
7367     // Since we will replace the stride by 1 the multiplication should go away.
7368     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
7369       return 0;
7370 
7371     // Detect reduction patterns
7372     InstructionCost RedCost;
7373     if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7374             .isValid())
7375       return RedCost;
7376 
7377     // Certain instructions can be cheaper to vectorize if they have a constant
7378     // second vector operand. One example of this is shifts on x86.
7379     Value *Op2 = I->getOperand(1);
7380     TargetTransformInfo::OperandValueProperties Op2VP;
7381     TargetTransformInfo::OperandValueKind Op2VK =
7382         TTI.getOperandInfo(Op2, Op2VP);
7383     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
7384       Op2VK = TargetTransformInfo::OK_UniformValue;
7385 
7386     SmallVector<const Value *, 4> Operands(I->operand_values());
7387     unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
7388     return N * TTI.getArithmeticInstrCost(
7389                    I->getOpcode(), VectorTy, CostKind,
7390                    TargetTransformInfo::OK_AnyValue,
7391                    Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
7392   }
7393   case Instruction::FNeg: {
7394     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
7395     unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
7396     return N * TTI.getArithmeticInstrCost(
7397                    I->getOpcode(), VectorTy, CostKind,
7398                    TargetTransformInfo::OK_AnyValue,
7399                    TargetTransformInfo::OK_AnyValue,
7400                    TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
7401                    I->getOperand(0), I);
7402   }
7403   case Instruction::Select: {
7404     SelectInst *SI = cast<SelectInst>(I);
7405     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
7406     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
7407     Type *CondTy = SI->getCondition()->getType();
7408     if (!ScalarCond)
7409       CondTy = VectorType::get(CondTy, VF);
7410     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy,
7411                                   CmpInst::BAD_ICMP_PREDICATE, CostKind, I);
7412   }
7413   case Instruction::ICmp:
7414   case Instruction::FCmp: {
7415     Type *ValTy = I->getOperand(0)->getType();
7416     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
7417     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
7418       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
7419     VectorTy = ToVectorTy(ValTy, VF);
7420     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
7421                                   CmpInst::BAD_ICMP_PREDICATE, CostKind, I);
7422   }
7423   case Instruction::Store:
7424   case Instruction::Load: {
7425     ElementCount Width = VF;
7426     if (Width.isVector()) {
7427       InstWidening Decision = getWideningDecision(I, Width);
7428       assert(Decision != CM_Unknown &&
7429              "CM decision should be taken at this point");
7430       if (Decision == CM_Scalarize)
7431         Width = ElementCount::getFixed(1);
7432     }
7433     VectorTy = ToVectorTy(getMemInstValueType(I), Width);
7434     return getMemoryInstructionCost(I, VF);
7435   }
7436   case Instruction::ZExt:
7437   case Instruction::SExt:
7438   case Instruction::FPToUI:
7439   case Instruction::FPToSI:
7440   case Instruction::FPExt:
7441   case Instruction::PtrToInt:
7442   case Instruction::IntToPtr:
7443   case Instruction::SIToFP:
7444   case Instruction::UIToFP:
7445   case Instruction::Trunc:
7446   case Instruction::FPTrunc:
7447   case Instruction::BitCast: {
7448     // Computes the CastContextHint from a Load/Store instruction.
7449     auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
7450       assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7451              "Expected a load or a store!");
7452 
7453       if (VF.isScalar() || !TheLoop->contains(I))
7454         return TTI::CastContextHint::Normal;
7455 
7456       switch (getWideningDecision(I, VF)) {
7457       case LoopVectorizationCostModel::CM_GatherScatter:
7458         return TTI::CastContextHint::GatherScatter;
7459       case LoopVectorizationCostModel::CM_Interleave:
7460         return TTI::CastContextHint::Interleave;
7461       case LoopVectorizationCostModel::CM_Scalarize:
7462       case LoopVectorizationCostModel::CM_Widen:
7463         return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
7464                                         : TTI::CastContextHint::Normal;
7465       case LoopVectorizationCostModel::CM_Widen_Reverse:
7466         return TTI::CastContextHint::Reversed;
7467       case LoopVectorizationCostModel::CM_Unknown:
7468         llvm_unreachable("Instr did not go through cost modelling?");
7469       }
7470 
7471       llvm_unreachable("Unhandled case!");
7472     };
7473 
7474     unsigned Opcode = I->getOpcode();
7475     TTI::CastContextHint CCH = TTI::CastContextHint::None;
7476     // For Trunc, the context is the only user, which must be a StoreInst.
7477     if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
7478       if (I->hasOneUse())
7479         if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
7480           CCH = ComputeCCH(Store);
7481     }
7482     // For Z/Sext, the context is the operand, which must be a LoadInst.
7483     else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
7484              Opcode == Instruction::FPExt) {
7485       if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
7486         CCH = ComputeCCH(Load);
7487     }
7488 
7489     // We optimize the truncation of induction variables having constant
7490     // integer steps. The cost of these truncations is the same as the scalar
7491     // operation.
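         // Illustrative example: truncating an i64 induction variable with a
         // constant step down to i32 can be done by generating the induction
         // directly in i32, so only the cost of a scalar trunc is charged here.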
7492     if (isOptimizableIVTruncate(I, VF)) {
7493       auto *Trunc = cast<TruncInst>(I);
7494       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
7495                                   Trunc->getSrcTy(), CCH, CostKind, Trunc);
7496     }
7497 
7498     // Detect reduction patterns
7499     InstructionCost RedCost;
7500     if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7501             .isValid())
7502       return RedCost;
7503 
7504     Type *SrcScalarTy = I->getOperand(0)->getType();
7505     Type *SrcVecTy =
7506         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
7507     if (canTruncateToMinimalBitwidth(I, VF)) {
7508       // This cast is going to be shrunk. This may remove the cast or it might
7509       // turn it into a slightly different cast. For example, if MinBW == 16,
7510       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
7511       //
7512       // Calculate the modified src and dest types.
7513       Type *MinVecTy = VectorTy;
7514       if (Opcode == Instruction::Trunc) {
7515         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
7516         VectorTy =
7517             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7518       } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
7519         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
7520         VectorTy =
7521             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7522       }
7523     }
7524 
7525     unsigned N;
7526     if (isScalarAfterVectorization(I, VF)) {
7527       assert(!VF.isScalable() && "VF is assumed to be non scalable");
7528       N = VF.getKnownMinValue();
7529     } else
7530       N = 1;
7531     return N *
7532            TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
7533   }
7534   case Instruction::Call: {
7535     bool NeedToScalarize;
7536     CallInst *CI = cast<CallInst>(I);
7537     InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
7538     if (getVectorIntrinsicIDForCall(CI, TLI)) {
7539       InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
7540       return std::min(CallCost, IntrinsicCost);
7541     }
7542     return CallCost;
7543   }
7544   case Instruction::ExtractValue:
7545     return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
7546   default:
7547     // The cost of executing VF copies of the scalar instruction. This opcode
7548     // is unknown. Assume that it is the same as 'mul'.
7549     return VF.getKnownMinValue() * TTI.getArithmeticInstrCost(
7550                                        Instruction::Mul, VectorTy, CostKind) +
7551            getScalarizationOverhead(I, VF);
7552   } // end of switch.
7553 }
7554 
7555 char LoopVectorize::ID = 0;
7556 
7557 static const char lv_name[] = "Loop Vectorization";
7558 
7559 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
7560 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
7561 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
7562 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
7563 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
7564 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
7565 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
7566 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
7567 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
7568 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
7569 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
7570 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
7571 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
7572 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
7573 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
7574 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
7575 
7576 namespace llvm {
7577 
7578 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
7579 
7580 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
7581                               bool VectorizeOnlyWhenForced) {
7582   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
7583 }
7584 
7585 } // end namespace llvm
7586 
7587 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
7588   // Check if the pointer operand of a load or store instruction is
7589   // consecutive.
7590   if (auto *Ptr = getLoadStorePointerOperand(Inst))
7591     return Legal->isConsecutivePtr(Ptr);
7592   return false;
7593 }
7594 
7595 void LoopVectorizationCostModel::collectValuesToIgnore() {
7596   // Ignore ephemeral values.
7597   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
7598 
7599   // Ignore type-promoting instructions we identified during reduction
7600   // detection.
7601   for (auto &Reduction : Legal->getReductionVars()) {
7602     RecurrenceDescriptor &RedDes = Reduction.second;
7603     const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
7604     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7605   }
7606   // Ignore type-casting instructions we identified during induction
7607   // detection.
7608   for (auto &Induction : Legal->getInductionVars()) {
7609     InductionDescriptor &IndDes = Induction.second;
7610     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7611     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7612   }
7613 }
7614 
7615 void LoopVectorizationCostModel::collectInLoopReductions() {
7616   for (auto &Reduction : Legal->getReductionVars()) {
7617     PHINode *Phi = Reduction.first;
7618     RecurrenceDescriptor &RdxDesc = Reduction.second;
7619 
7620     // We don't collect reductions that are type promoted (yet).
7621     if (RdxDesc.getRecurrenceType() != Phi->getType())
7622       continue;
7623 
7624     // If the target would prefer this reduction to happen "in-loop", then we
7625     // want to record it as such.
7626     unsigned Opcode = RdxDesc.getOpcode();
7627     if (!PreferInLoopReductions &&
7628         !TTI.preferInLoopReduction(Opcode, Phi->getType(),
7629                                    TargetTransformInfo::ReductionFlags()))
7630       continue;
7631 
7632     // Check that we can correctly put the reductions into the loop, by
7633     // finding the chain of operations that leads from the phi to the loop
7634     // exit value.
7635     SmallVector<Instruction *, 4> ReductionOperations =
7636         RdxDesc.getReductionOpChain(Phi, TheLoop);
7637     bool InLoop = !ReductionOperations.empty();
7638     if (InLoop) {
7639       InLoopReductionChains[Phi] = ReductionOperations;
7640       // Add the elements to InLoopReductionImmediateChains for cost modelling.
7641       Instruction *LastChain = Phi;
7642       for (auto *I : ReductionOperations) {
7643         InLoopReductionImmediateChains[I] = LastChain;
7644         LastChain = I;
7645       }
7646     }
7647     LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7648                       << " reduction for phi: " << *Phi << "\n");
7649   }
7650 }
7651 
7652 // TODO: we could return a pair of values that specify the max VF and
7653 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
7654 // `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment
7655 // doesn't have a cost model that can choose which plan to execute if
7656 // more than one is generated.
7657 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
7658                                  LoopVectorizationCostModel &CM) {
7659   unsigned WidestType;
7660   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
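       // Purely illustrative: with 256-bit vector registers and a widest scalar
       // type of 32 bits, this returns a VPlan VF of 256 / 32 = 8.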
7661   return WidestVectorRegBits / WidestType;
7662 }
7663 
7664 VectorizationFactor
7665 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
7666   assert(!UserVF.isScalable() && "scalable vectors not yet supported");
7667   ElementCount VF = UserVF;
7668   // Outer loop handling: outer loops may require CFG and instruction level
7669   // transformations before even evaluating whether vectorization is profitable.
7670   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7671   // the vectorization pipeline.
7672   if (!OrigLoop->isInnermost()) {
7673     // If the user doesn't provide a vectorization factor, determine a
7674     // reasonable one.
7675     if (UserVF.isZero()) {
7676       VF = ElementCount::getFixed(
7677           determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM));
7678       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7679 
7680       // Make sure we have a VF > 1 for stress testing.
7681       if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7682         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7683                           << "overriding computed VF.\n");
7684         VF = ElementCount::getFixed(4);
7685       }
7686     }
7687     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7688     assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7689            "VF needs to be a power of two");
7690     LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7691                       << "VF " << VF << " to build VPlans.\n");
7692     buildVPlans(VF, VF);
7693 
7694     // For VPlan build stress testing, we bail out after VPlan construction.
7695     if (VPlanBuildStressTest)
7696       return VectorizationFactor::Disabled();
7697 
7698     return {VF, 0 /*Cost*/};
7699   }
7700 
7701   LLVM_DEBUG(
7702       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7703                 "VPlan-native path.\n");
7704   return VectorizationFactor::Disabled();
7705 }
7706 
7707 Optional<VectorizationFactor>
7708 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7709   assert(OrigLoop->isInnermost() && "Inner loop expected.");
7710   Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC);
7711   if (!MaybeMaxVF) // Cases that should not be vectorized or interleaved.
7712     return None;
7713 
7714   // Invalidate interleave groups if all blocks of the loop will be predicated.
7715   if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
7716       !useMaskedInterleavedAccesses(*TTI)) {
7717     LLVM_DEBUG(
7718         dbgs()
7719         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7720            "which requires masked-interleaved support.\n");
7721     if (CM.InterleaveInfo.invalidateGroups())
7722       // Invalidating interleave groups also requires invalidating all decisions
7723       // based on them, which includes widening decisions and uniform and scalar
7724       // values.
7725       CM.invalidateCostModelingDecisions();
7726   }
7727 
7728   ElementCount MaxVF = MaybeMaxVF.getValue();
7729   assert(MaxVF.isNonZero() && "MaxVF is zero.");
7730 
7731   bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxVF);
7732   if (!UserVF.isZero() &&
7733       (UserVFIsLegal || (UserVF.isScalable() && MaxVF.isScalable()))) {
7734     // FIXME: MaxVF is temporarily used in place of UserVF for illegal scalable
7735     // VFs here, this should be reverted to only use legal UserVFs once the
7736     // loop below supports scalable VFs.
7737     ElementCount VF = UserVFIsLegal ? UserVF : MaxVF;
7738     LLVM_DEBUG(dbgs() << "LV: Using " << (UserVFIsLegal ? "user" : "max")
7739                       << " VF " << VF << ".\n");
7740     assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7741            "VF needs to be a power of two");
7742     // Collect the instructions (and their associated costs) that will be more
7743     // profitable to scalarize.
7744     CM.selectUserVectorizationFactor(VF);
7745     CM.collectInLoopReductions();
7746     buildVPlansWithVPRecipes(VF, VF);
7747     LLVM_DEBUG(printPlans(dbgs()));
7748     return {{VF, 0}};
7749   }
7750 
7751   assert(!MaxVF.isScalable() &&
7752          "Scalable vectors not yet supported beyond this point");
7753 
7754   for (ElementCount VF = ElementCount::getFixed(1);
7755        ElementCount::isKnownLE(VF, MaxVF); VF *= 2) {
7756     // Collect Uniform and Scalar instructions after vectorization with VF.
7757     CM.collectUniformsAndScalars(VF);
7758 
7759     // Collect the instructions (and their associated costs) that will be more
7760     // profitable to scalarize.
7761     if (VF.isVector())
7762       CM.collectInstsToScalarize(VF);
7763   }
7764 
7765   CM.collectInLoopReductions();
7766 
7767   buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF);
7768   LLVM_DEBUG(printPlans(dbgs()));
7769   if (MaxVF.isScalar())
7770     return VectorizationFactor::Disabled();
7771 
7772   // Select the optimal vectorization factor.
7773   return CM.selectVectorizationFactor(MaxVF);
7774 }
7775 
7776 void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) {
7777   LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
7778                     << '\n');
7779   BestVF = VF;
7780   BestUF = UF;
7781 
7782   erase_if(VPlans, [VF](const VPlanPtr &Plan) {
7783     return !Plan->hasVF(VF);
7784   });
7785   assert(VPlans.size() == 1 && "Best VF does not have a single VPlan.");
7786 }
7787 
7788 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
7789                                            DominatorTree *DT) {
7790   // Perform the actual loop transformation.
7791 
7792   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
7793   assert(BestVF.hasValue() && "Vectorization Factor is missing");
7794   assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
7795 
7796   VPTransformState State{
7797       *BestVF, BestUF, LI, DT, ILV.Builder, &ILV, VPlans.front().get()};
7798   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
7799   State.TripCount = ILV.getOrCreateTripCount(nullptr);
7800   State.CanonicalIV = ILV.Induction;
7801 
7802   ILV.printDebugTracesAtStart();
7803 
7804   //===------------------------------------------------===//
7805   //
7806   // Notice: any optimization or new instruction that goes
7807   // into the code below should also be implemented in
7808   // the cost model.
7809   //
7810   //===------------------------------------------------===//
7811 
7812   // 2. Copy and widen instructions from the old loop into the new loop.
7813   VPlans.front()->execute(&State);
7814 
7815   // 3. Fix the vectorized code: take care of header phi's, live-outs,
7816   //    predication, updating analyses.
7817   ILV.fixVectorizedLoop(State);
7818 
7819   ILV.printDebugTracesAtEnd();
7820 }
7821 
7822 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
7823     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
7824 
7825   // We create new control flow for the vectorized loop, so the original exit
7826   // conditions will be dead after vectorization if they are only used by the
7827   // terminator.
7828   SmallVector<BasicBlock*> ExitingBlocks;
7829   OrigLoop->getExitingBlocks(ExitingBlocks);
7830   for (auto *BB : ExitingBlocks) {
7831     auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0));
7832     if (!Cmp || !Cmp->hasOneUse())
7833       continue;
7834 
7835     // TODO: we should introduce a getUniqueExitingBlocks on Loop
7836     if (!DeadInstructions.insert(Cmp).second)
7837       continue;
7838 
7839     // An operand of the icmp is often a dead trunc, used by IndUpdate.
7840     // TODO: can recurse through operands in general
7841     for (Value *Op : Cmp->operands()) {
7842       if (isa<TruncInst>(Op) && Op->hasOneUse())
7843           DeadInstructions.insert(cast<Instruction>(Op));
7844     }
7845   }
7846 
7847   // We create new "steps" for induction variable updates to which the original
7848   // induction variables map. An original update instruction will be dead if
7849   // all its users except the induction variable are dead.
7850   auto *Latch = OrigLoop->getLoopLatch();
7851   for (auto &Induction : Legal->getInductionVars()) {
7852     PHINode *Ind = Induction.first;
7853     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
7854 
7855     // If the tail is to be folded by masking, the primary induction variable,
7856     // if it exists, isn't dead: it will be used for masking. Don't kill it.
7857     if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction())
7858       continue;
7859 
7860     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
7861           return U == Ind || DeadInstructions.count(cast<Instruction>(U));
7862         }))
7863       DeadInstructions.insert(IndUpdate);
7864 
7865     // We also record as "Dead" the type-casting instructions we identified
7866     // during induction analysis. We don't need any handling for them in the
7867     // vectorized loop because we have proven that, under a proper runtime
7868     // test guarding the vectorized loop, the value of the phi, and the casted
7869     // value of the phi, are the same. The last instruction in this casting chain
7870     // will get its scalar/vector/widened def from the scalar/vector/widened def
7871     // of the respective phi node. Any other casts in the induction def-use chain
7872     // have no other uses outside the phi update chain, and will be ignored.
7873     InductionDescriptor &IndDes = Induction.second;
7874     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7875     DeadInstructions.insert(Casts.begin(), Casts.end());
7876   }
7877 }
7878 
7879 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
7880 
7881 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
7882 
7883 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
7884                                         Instruction::BinaryOps BinOp) {
7885   // When unrolling and the VF is 1, we only need to add a simple scalar.
7886   Type *Ty = Val->getType();
7887   assert(!Ty->isVectorTy() && "Val must be a scalar");
7888 
7889   if (Ty->isFloatingPointTy()) {
7890     Constant *C = ConstantFP::get(Ty, (double)StartIdx);
7891 
7892     // Floating point operations had to be 'fast' to enable the unrolling.
7893     Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
7894     return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
7895   }
7896   Constant *C = ConstantInt::get(Ty, StartIdx);
7897   return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
7898 }
7899 
7900 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
7901   SmallVector<Metadata *, 4> MDs;
7902   // Reserve first location for self reference to the LoopID metadata node.
7903   MDs.push_back(nullptr);
7904   bool IsUnrollMetadata = false;
7905   MDNode *LoopID = L->getLoopID();
7906   if (LoopID) {
7907     // First find existing loop unrolling disable metadata.
7908     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
7909       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
7910       if (MD) {
7911         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7912         IsUnrollMetadata =
7913             S && S->getString().startswith("llvm.loop.unroll.disable");
7914       }
7915       MDs.push_back(LoopID->getOperand(i));
7916     }
7917   }
7918 
7919   if (!IsUnrollMetadata) {
7920     // Add runtime unroll disable metadata.
7921     LLVMContext &Context = L->getHeader()->getContext();
7922     SmallVector<Metadata *, 1> DisableOperands;
7923     DisableOperands.push_back(
7924         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7925     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7926     MDs.push_back(DisableNode);
7927     MDNode *NewLoopID = MDNode::get(Context, MDs);
7928     // Set operand 0 to refer to the loop id itself.
7929     NewLoopID->replaceOperandWith(0, NewLoopID);
7930     L->setLoopID(NewLoopID);
7931   }
7932 }
7933 
7934 //===--------------------------------------------------------------------===//
7935 // EpilogueVectorizerMainLoop
7936 //===--------------------------------------------------------------------===//
7937 
7938 /// This function is partially responsible for generating the control flow
7939 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7940 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
7941   MDNode *OrigLoopID = OrigLoop->getLoopID();
7942   Loop *Lp = createVectorLoopSkeleton("");
7943 
7944   // Generate the code to check the minimum iteration count of the vector
7945   // epilogue (see below).
7946   EPI.EpilogueIterationCountCheck =
7947       emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true);
7948   EPI.EpilogueIterationCountCheck->setName("iter.check");
7949 
7950   // Generate the code to check any assumptions that we've made for SCEV
7951   // expressions.
7952   EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader);
7953 
7954   // Generate the code that checks at runtime if arrays overlap. We put the
7955   // checks into a separate block to make the more common case of few elements
7956   // faster.
7957   EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
7958 
7959   // Generate the iteration count check for the main loop, *after* the check
7960   // for the epilogue loop, so that the path-length is shorter for the case
7961   // that goes directly through the vector epilogue. The longer-path length for
7962   // the main loop is compensated for by the gain from vectorizing the larger
7963   // trip count. Note: the branch will get updated later on when we vectorize
7964   // the epilogue.
7965   EPI.MainLoopIterationCountCheck =
7966       emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false);
7967 
7968   // Generate the induction variable.
7969   OldInduction = Legal->getPrimaryInduction();
7970   Type *IdxTy = Legal->getWidestInductionType();
7971   Value *StartIdx = ConstantInt::get(IdxTy, 0);
7972   Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
7973   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
7974   EPI.VectorTripCount = CountRoundDown;
7975   Induction =
7976       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
7977                               getDebugLocFromInstOrOperands(OldInduction));
7978 
7979   // Skip creating induction resume values here because they will be created in
7980   // the second pass. If we created them here, they wouldn't be used anyway,
7981   // because the VPlan in the second pass still contains the inductions from the
7982   // original loop.
7983 
7984   return completeLoopSkeleton(Lp, OrigLoopID);
7985 }
7986 
7987 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
7988   LLVM_DEBUG({
7989     dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7990            << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue()
7991            << ", Main Loop UF:" << EPI.MainLoopUF
7992            << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue()
7993            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7994   });
7995 }
7996 
7997 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
7998   DEBUG_WITH_TYPE(VerboseDebug, {
7999     dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n";
8000   });
8001 }
8002 
8003 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck(
8004     Loop *L, BasicBlock *Bypass, bool ForEpilogue) {
8005   assert(L && "Expected valid Loop.");
8006   assert(Bypass && "Expected valid bypass basic block.");
8007   unsigned VFactor =
8008       ForEpilogue ? EPI.EpilogueVF.getKnownMinValue() : VF.getKnownMinValue();
8009   unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
8010   Value *Count = getOrCreateTripCount(L);
8011   // Reuse existing vector loop preheader for TC checks.
8012   // Note that new preheader block is generated for vector loop.
8013   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
8014   IRBuilder<> Builder(TCCheckBlock->getTerminator());
8015 
8016   // Generate code to check if the loop's trip count is less than VF * UF of the
8017   // main vector loop.
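       // Purely illustrative: for a main loop with VF = 8 and UF = 2, the compare
       // below checks the trip count against 16 and branches to the bypass block
       // when there are not enough iterations for the main vector loop.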
8018   auto P =
8019       Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
8020 
8021   Value *CheckMinIters = Builder.CreateICmp(
8022       P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor),
8023       "min.iters.check");
8024 
8025   if (!ForEpilogue)
8026     TCCheckBlock->setName("vector.main.loop.iter.check");
8027 
8028   // Create new preheader for vector loop.
8029   LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
8030                                    DT, LI, nullptr, "vector.ph");
8031 
8032   if (ForEpilogue) {
8033     assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
8034                                  DT->getNode(Bypass)->getIDom()) &&
8035            "TC check is expected to dominate Bypass");
8036 
8037     // Update dominator for Bypass & LoopExit.
8038     DT->changeImmediateDominator(Bypass, TCCheckBlock);
8039     DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
8040 
8041     LoopBypassBlocks.push_back(TCCheckBlock);
8042 
8043     // Save the trip count so we don't have to regenerate it in the
8044     // vec.epilog.iter.check. This is safe to do because the trip count
8045     // generated here dominates the vector epilog iter check.
8046     EPI.TripCount = Count;
8047   }
8048 
8049   ReplaceInstWithInst(
8050       TCCheckBlock->getTerminator(),
8051       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
8052 
8053   return TCCheckBlock;
8054 }
8055 
8056 //===--------------------------------------------------------------------===//
8057 // EpilogueVectorizerEpilogueLoop
8058 //===--------------------------------------------------------------------===//
8059 
8060 /// This function is partially responsible for generating the control flow
8061 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
8062 BasicBlock *
8063 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
8064   MDNode *OrigLoopID = OrigLoop->getLoopID();
8065   Loop *Lp = createVectorLoopSkeleton("vec.epilog.");
8066 
8067   // Now, compare the remaining count and, if there aren't enough iterations to
8068   // execute the vectorized epilogue, skip to the scalar part.
8069   BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
8070   VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check");
8071   LoopVectorPreHeader =
8072       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
8073                  LI, nullptr, "vec.epilog.ph");
8074   emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader,
8075                                           VecEpilogueIterationCountCheck);
8076 
8077   // Adjust the control flow taking the state info from the main loop
8078   // vectorization into account.
8079   assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
8080          "expected this to be saved from the previous pass.");
8081   EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
8082       VecEpilogueIterationCountCheck, LoopVectorPreHeader);
8083 
8084   DT->changeImmediateDominator(LoopVectorPreHeader,
8085                                EPI.MainLoopIterationCountCheck);
8086 
8087   EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
8088       VecEpilogueIterationCountCheck, LoopScalarPreHeader);
8089 
8090   if (EPI.SCEVSafetyCheck)
8091     EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
8092         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
8093   if (EPI.MemSafetyCheck)
8094     EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
8095         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
8096 
8097   DT->changeImmediateDominator(
8098       VecEpilogueIterationCountCheck,
8099       VecEpilogueIterationCountCheck->getSinglePredecessor());
8100 
8101   DT->changeImmediateDominator(LoopScalarPreHeader,
8102                                EPI.EpilogueIterationCountCheck);
8103   DT->changeImmediateDominator(LoopExitBlock, EPI.EpilogueIterationCountCheck);
8104 
8105   // Keep track of bypass blocks, as they feed start values to the induction
8106   // phis in the scalar loop preheader.
8107   if (EPI.SCEVSafetyCheck)
8108     LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
8109   if (EPI.MemSafetyCheck)
8110     LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
8111   LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
8112 
8113   // Generate a resume induction for the vector epilogue and put it in the
8114   // vector epilogue preheader
8115   Type *IdxTy = Legal->getWidestInductionType();
8116   PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
8117                                          LoopVectorPreHeader->getFirstNonPHI());
8118   EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
8119   EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
8120                            EPI.MainLoopIterationCountCheck);
8121 
8122   // Generate the induction variable.
8123   OldInduction = Legal->getPrimaryInduction();
8124   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
8125   Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
8126   Value *StartIdx = EPResumeVal;
8127   Induction =
8128       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
8129                               getDebugLocFromInstOrOperands(OldInduction));
8130 
8131   // Generate induction resume values. These variables save the new starting
8132   // indexes for the scalar loop. They are used to test if there are any tail
8133   // iterations left once the vector loop has completed.
8134   // Note that when the vectorized epilogue is skipped due to iteration count
8135   // check, then the resume value for the induction variable comes from
8136   // the trip count of the main vector loop, hence passing the AdditionalBypass
8137   // argument.
8138   createInductionResumeValues(Lp, CountRoundDown,
8139                               {VecEpilogueIterationCountCheck,
8140                                EPI.VectorTripCount} /* AdditionalBypass */);
8141 
8142   AddRuntimeUnrollDisableMetaData(Lp);
8143   return completeLoopSkeleton(Lp, OrigLoopID);
8144 }
8145 
8146 BasicBlock *
8147 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
8148     Loop *L, BasicBlock *Bypass, BasicBlock *Insert) {
8149 
8150   assert(EPI.TripCount &&
8151          "Expected trip count to have been saved in the first pass.");
8152   assert(
8153       (!isa<Instruction>(EPI.TripCount) ||
8154        DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
8155       "saved trip count does not dominate insertion point.");
8156   Value *TC = EPI.TripCount;
8157   IRBuilder<> Builder(Insert->getTerminator());
8158   Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
8159 
8160   // Generate code to check if the loop's trip count is less than VF * UF of the
8161   // vector epilogue loop.
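       // Purely illustrative (assuming no scalar epilogue is required): with a
       // trip count of 100 and a main vector trip count of 96, 4 iterations
       // remain; with an epilogue VF of 4 and UF of 1 the remaining count is not
       // below 4, so the branch below falls through into the vector epilogue.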
8162   auto P =
8163       Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
8164 
8165   Value *CheckMinIters = Builder.CreateICmp(
8166       P, Count,
8167       ConstantInt::get(Count->getType(),
8168                        EPI.EpilogueVF.getKnownMinValue() * EPI.EpilogueUF),
8169       "min.epilog.iters.check");
8170 
8171   ReplaceInstWithInst(
8172       Insert->getTerminator(),
8173       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
8174 
8175   LoopBypassBlocks.push_back(Insert);
8176   return Insert;
8177 }
8178 
8179 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
8180   LLVM_DEBUG({
8181     dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
8182            << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue()
8183            << ", Main Loop UF:" << EPI.MainLoopUF
8184            << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue()
8185            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
8186   });
8187 }
8188 
8189 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
8190   DEBUG_WITH_TYPE(VerboseDebug, {
8191     dbgs() << "final fn:\n" << *Induction->getFunction() << "\n";
8192   });
8193 }
8194 
8195 bool LoopVectorizationPlanner::getDecisionAndClampRange(
8196     const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
8197   assert(!Range.isEmpty() && "Trying to test an empty VF range.");
8198   bool PredicateAtRangeStart = Predicate(Range.Start);
8199 
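       // Purely illustrative: for Range = {4, 32}, if the predicate holds for VF 4
       // and VF 8 but not for VF 16, Range.End is clamped to 16 and the function
       // returns true, i.e. the decision applies to the sub-range {4, 16}.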
8200   for (ElementCount TmpVF = Range.Start * 2;
8201        ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2)
8202     if (Predicate(TmpVF) != PredicateAtRangeStart) {
8203       Range.End = TmpVF;
8204       break;
8205     }
8206 
8207   return PredicateAtRangeStart;
8208 }
8209 
8210 /// Build VPlans for the full range of feasible VFs = {\p MinVF, 2 * \p MinVF,
8211 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
8212 /// of VFs starting at a given VF and extending it as much as possible. Each
8213 /// vectorization decision can potentially shorten this sub-range during
8214 /// buildVPlan().
8215 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
8216                                            ElementCount MaxVF) {
8217   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8218   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8219     VFRange SubRange = {VF, MaxVFPlusOne};
8220     VPlans.push_back(buildVPlan(SubRange));
8221     VF = SubRange.End;
8222   }
8223 }
8224 
8225 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
8226                                          VPlanPtr &Plan) {
8227   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
8228 
8229   // Look for cached value.
8230   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
8231   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
8232   if (ECEntryIt != EdgeMaskCache.end())
8233     return ECEntryIt->second;
8234 
8235   VPValue *SrcMask = createBlockInMask(Src, Plan);
8236 
8237   // The terminator has to be a branch inst!
8238   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
8239   assert(BI && "Unexpected terminator found");
8240 
8241   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
8242     return EdgeMaskCache[Edge] = SrcMask;
8243 
8244   // If source is an exiting block, we know the exit edge is dynamically dead
8245   // in the vector loop, and thus we don't need to restrict the mask.  Avoid
8246   // adding uses of an otherwise potentially dead instruction.
8247   if (OrigLoop->isLoopExiting(Src))
8248     return EdgeMaskCache[Edge] = SrcMask;
8249 
8250   VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition());
8251   assert(EdgeMask && "No Edge Mask found for condition");
8252 
8253   if (BI->getSuccessor(0) != Dst)
8254     EdgeMask = Builder.createNot(EdgeMask);
8255 
8256   if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
8257     // The condition is 'SrcMask && EdgeMask', which is equivalent to
8258     // 'select i1 SrcMask, i1 EdgeMask, i1 false'.
8259     // The select version does not introduce new UB if SrcMask is false and
8260     // EdgeMask is poison. Using 'and' here introduces undefined behavior.
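         // Purely illustrative: instead of 'and i1 %src.mask, %edge.mask' we emit
         // 'select i1 %src.mask, i1 %edge.mask, i1 false', which is false whenever
         // %src.mask is false, even if %edge.mask happens to be poison.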
8261     VPValue *False = Plan->getOrAddVPValue(
8262         ConstantInt::getFalse(BI->getCondition()->getType()));
8263     EdgeMask = Builder.createSelect(SrcMask, EdgeMask, False);
8264   }
8265 
8266   return EdgeMaskCache[Edge] = EdgeMask;
8267 }
8268 
8269 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
8270   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
8271 
8272   // Look for cached value.
8273   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
8274   if (BCEntryIt != BlockMaskCache.end())
8275     return BCEntryIt->second;
8276 
8277   // All-one mask is modelled as no-mask following the convention for masked
8278   // load/store/gather/scatter. Initialize BlockMask to no-mask.
8279   VPValue *BlockMask = nullptr;
8280 
8281   if (OrigLoop->getHeader() == BB) {
8282     if (!CM.blockNeedsPredication(BB))
8283       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
8284 
8285     // Create the block in mask as the first non-phi instruction in the block.
8286     VPBuilder::InsertPointGuard Guard(Builder);
8287     auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi();
8288     Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint);
8289 
8290     // Introduce the early-exit compare IV <= BTC to form header block mask.
8291     // This is used instead of IV < TC because TC may wrap, unlike BTC.
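         // Purely illustrative: for a trip count of 5, BTC is 4; when folding the
         // tail with VF 8, lanes whose IV values are 0..4 remain active while
         // lanes with IV values 5..7 are masked out by the IV <= BTC compare.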
8292     // Start by constructing the desired canonical IV.
8293     VPValue *IV = nullptr;
8294     if (Legal->getPrimaryInduction())
8295       IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction());
8296     else {
8297       auto IVRecipe = new VPWidenCanonicalIVRecipe();
8298       Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint);
8299       IV = IVRecipe->getVPValue();
8300     }
8301     VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
8302     bool TailFolded = !CM.isScalarEpilogueAllowed();
8303 
8304     if (TailFolded && CM.TTI.emitGetActiveLaneMask()) {
8305       // While ActiveLaneMask is a binary op that consumes the loop tripcount
8306       // as a second argument, we only pass the IV here and extract the
8307       // tripcount from the transform state where codegen of the VP instructions
8308       // happens.
8309       BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV});
8310     } else {
8311       BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
8312     }
8313     return BlockMaskCache[BB] = BlockMask;
8314   }
8315 
8316   // This is the block mask. We OR all incoming edges.
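       // Purely illustrative: a block with predecessors P1 and P2 gets the mask
       // mask(P1->BB) | mask(P2->BB); if any incoming edge mask is null (all-one),
       // the block mask is all-one as well and is returned immediately.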
8317   for (auto *Predecessor : predecessors(BB)) {
8318     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
8319     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
8320       return BlockMaskCache[BB] = EdgeMask;
8321 
8322     if (!BlockMask) { // BlockMask has its initialized nullptr value.
8323       BlockMask = EdgeMask;
8324       continue;
8325     }
8326 
8327     BlockMask = Builder.createOr(BlockMask, EdgeMask);
8328   }
8329 
8330   return BlockMaskCache[BB] = BlockMask;
8331 }
8332 
8333 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
8334                                                 VPlanPtr &Plan) {
8335   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8336          "Must be called with either a load or store");
8337 
8338   auto willWiden = [&](ElementCount VF) -> bool {
8339     if (VF.isScalar())
8340       return false;
8341     LoopVectorizationCostModel::InstWidening Decision =
8342         CM.getWideningDecision(I, VF);
8343     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
8344            "CM decision should be taken at this point.");
8345     if (Decision == LoopVectorizationCostModel::CM_Interleave)
8346       return true;
8347     if (CM.isScalarAfterVectorization(I, VF) ||
8348         CM.isProfitableToScalarize(I, VF))
8349       return false;
8350     return Decision != LoopVectorizationCostModel::CM_Scalarize;
8351   };
8352 
8353   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8354     return nullptr;
8355 
8356   VPValue *Mask = nullptr;
8357   if (Legal->isMaskRequired(I))
8358     Mask = createBlockInMask(I->getParent(), Plan);
8359 
8360   VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I));
8361   if (LoadInst *Load = dyn_cast<LoadInst>(I))
8362     return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask);
8363 
8364   StoreInst *Store = cast<StoreInst>(I);
8365   VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand());
8366   return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask);
8367 }
8368 
8369 VPWidenIntOrFpInductionRecipe *
8370 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi, VPlan &Plan) const {
8371   // Check if this is an integer or fp induction. If so, build the recipe that
8372   // produces its scalar and vector values.
8373   InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
8374   if (II.getKind() == InductionDescriptor::IK_IntInduction ||
8375       II.getKind() == InductionDescriptor::IK_FpInduction) {
8376     VPValue *Start = Plan.getOrAddVPValue(II.getStartValue());
8377     const SmallVectorImpl<Instruction *> &Casts = II.getCastInsts();
8378     return new VPWidenIntOrFpInductionRecipe(
8379         Phi, Start, Casts.empty() ? nullptr : Casts.front());
8380   }
8381 
8382   return nullptr;
8383 }
8384 
8385 VPWidenIntOrFpInductionRecipe *
8386 VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I, VFRange &Range,
8387                                                 VPlan &Plan) const {
8388   // Optimize the special case where the source is a constant integer
8389   // induction variable. Notice that we can only optimize the 'trunc' case
8390   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8391   // (c) other casts depend on pointer size.
8392 
8393   // Determine whether \p K is a truncation based on an induction variable that
8394   // can be optimized.
8395   auto isOptimizableIVTruncate =
8396       [&](Instruction *K) -> std::function<bool(ElementCount)> {
8397     return [=](ElementCount VF) -> bool {
8398       return CM.isOptimizableIVTruncate(K, VF);
8399     };
8400   };
8401 
8402   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8403           isOptimizableIVTruncate(I), Range)) {
8404 
8405     InductionDescriptor II =
8406         Legal->getInductionVars().lookup(cast<PHINode>(I->getOperand(0)));
8407     VPValue *Start = Plan.getOrAddVPValue(II.getStartValue());
8408     return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
8409                                              Start, nullptr, I);
8410   }
8411   return nullptr;
8412 }
8413 
8414 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) {
8415   // If all incoming values are equal, the incoming VPValue can be used directly
8416   // instead of creating a new VPBlendRecipe.
8417   Value *FirstIncoming = Phi->getIncomingValue(0);
8418   if (all_of(Phi->incoming_values(), [FirstIncoming](const Value *Inc) {
8419         return FirstIncoming == Inc;
8420       })) {
8421     return Plan->getOrAddVPValue(Phi->getIncomingValue(0));
8422   }
8423 
8424   // We know that all PHIs in non-header blocks are converted into selects, so
8425   // we don't have to worry about the insertion order and we can just use the
8426   // builder. At this point we generate the predication tree. There may be
8427   // duplications since this is a simple recursive scan, but future
8428   // optimizations will clean it up.
8429   SmallVector<VPValue *, 2> Operands;
8430   unsigned NumIncoming = Phi->getNumIncomingValues();
8431 
8432   for (unsigned In = 0; In < NumIncoming; In++) {
8433     VPValue *EdgeMask =
8434       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
8435     assert((EdgeMask || NumIncoming == 1) &&
8436            "Multiple predecessors with one having a full mask");
8437     Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In)));
8438     if (EdgeMask)
8439       Operands.push_back(EdgeMask);
8440   }
8441   return toVPRecipeResult(new VPBlendRecipe(Phi, Operands));
8442 }
8443 
8444 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range,
8445                                                    VPlan &Plan) const {
8446 
8447   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8448       [this, CI](ElementCount VF) {
8449         return CM.isScalarWithPredication(CI, VF);
8450       },
8451       Range);
8452 
8453   if (IsPredicated)
8454     return nullptr;
8455 
8456   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8457   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8458              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8459              ID == Intrinsic::pseudoprobe ||
8460              ID == Intrinsic::experimental_noalias_scope_decl))
8461     return nullptr;
8462 
8463   auto willWiden = [&](ElementCount VF) -> bool {
8464     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
    // The following case may be scalarized depending on the VF.
    // NeedToScalarize indicates whether the vectorized call must be
    // scalarized; UseVectorIntrinsic indicates whether a vector intrinsic is
    // at least as cheap as a vectorized library call.
8469     bool NeedToScalarize = false;
8470     InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
8471     InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0;
8472     bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
8473     assert(IntrinsicCost.isValid() && CallCost.isValid() &&
8474            "Cannot have invalid costs while widening");
8475     return UseVectorIntrinsic || !NeedToScalarize;
8476   };
8477 
8478   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8479     return nullptr;
8480 
8481   return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands()));
8482 }
8483 
8484 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8485   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8486          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8487   // Instruction should be widened, unless it is scalar after vectorization,
8488   // scalarization is profitable or it is predicated.
8489   auto WillScalarize = [this, I](ElementCount VF) -> bool {
8490     return CM.isScalarAfterVectorization(I, VF) ||
8491            CM.isProfitableToScalarize(I, VF) ||
8492            CM.isScalarWithPredication(I, VF);
8493   };
8494   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
8495                                                              Range);
8496 }
8497 
8498 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const {
8499   auto IsVectorizableOpcode = [](unsigned Opcode) {
8500     switch (Opcode) {
8501     case Instruction::Add:
8502     case Instruction::And:
8503     case Instruction::AShr:
8504     case Instruction::BitCast:
8505     case Instruction::FAdd:
8506     case Instruction::FCmp:
8507     case Instruction::FDiv:
8508     case Instruction::FMul:
8509     case Instruction::FNeg:
8510     case Instruction::FPExt:
8511     case Instruction::FPToSI:
8512     case Instruction::FPToUI:
8513     case Instruction::FPTrunc:
8514     case Instruction::FRem:
8515     case Instruction::FSub:
8516     case Instruction::ICmp:
8517     case Instruction::IntToPtr:
8518     case Instruction::LShr:
8519     case Instruction::Mul:
8520     case Instruction::Or:
8521     case Instruction::PtrToInt:
8522     case Instruction::SDiv:
8523     case Instruction::Select:
8524     case Instruction::SExt:
8525     case Instruction::Shl:
8526     case Instruction::SIToFP:
8527     case Instruction::SRem:
8528     case Instruction::Sub:
8529     case Instruction::Trunc:
8530     case Instruction::UDiv:
8531     case Instruction::UIToFP:
8532     case Instruction::URem:
8533     case Instruction::Xor:
8534     case Instruction::ZExt:
8535       return true;
8536     }
8537     return false;
8538   };
8539 
8540   if (!IsVectorizableOpcode(I->getOpcode()))
8541     return nullptr;
8542 
8543   // Success: widen this instruction.
8544   return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands()));
8545 }
8546 
8547 VPBasicBlock *VPRecipeBuilder::handleReplication(
8548     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
8549     DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
8550     VPlanPtr &Plan) {
8551   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
8552       [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8553       Range);
8554 
8555   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8556       [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); },
8557       Range);
8558 
8559   auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
8560                                        IsUniform, IsPredicated);
8561   setRecipe(I, Recipe);
8562   Plan->addVPValue(I, Recipe);
8563 
  // Find out if I uses a predicated instruction. If so, it will use that
  // instruction's scalar value rather than the packed vector, so avoid
  // hoisting the insert-element which packs the scalar value into a vector
  // value, as packing should happen iff all users use the vector value.
8567   for (auto &Op : I->operands())
8568     if (auto *PredInst = dyn_cast<Instruction>(Op))
8569       if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
8570         PredInst2Recipe[PredInst]->setAlsoPack(false);
8571 
  // Finalize the recipe for Instr, handling the non-predicated case first.
8573   if (!IsPredicated) {
8574     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8575     VPBB->appendRecipe(Recipe);
8576     return VPBB;
8577   }
8578   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8579   assert(VPBB->getSuccessors().empty() &&
8580          "VPBB has successors when handling predicated replication.");
8581   // Record predicated instructions for above packing optimizations.
8582   PredInst2Recipe[I] = Recipe;
8583   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
8584   VPBlockUtils::insertBlockAfter(Region, VPBB);
8585   auto *RegSucc = new VPBasicBlock();
8586   VPBlockUtils::insertBlockAfter(RegSucc, Region);
8587   return RegSucc;
8588 }
8589 
8590 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
8591                                                       VPRecipeBase *PredRecipe,
8592                                                       VPlanPtr &Plan) {
8593   // Instructions marked for predication are replicated and placed under an
8594   // if-then construct to prevent side-effects.
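  //
  // For example, for a predicated sdiv the region built below has the shape:
  //   pred.sdiv.entry:    BRANCH-ON-MASK of the block mask
  //   pred.sdiv.if:       the replicated scalar instruction (PredRecipe)
  //   pred.sdiv.continue: PHI merging the predicated value back in (if the
  //                       instruction produces a result)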
8595 
8596   // Generate recipes to compute the block mask for this region.
8597   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
8598 
8599   // Build the triangular if-then region.
8600   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
8601   assert(Instr->getParent() && "Predicated instruction not in any basic block");
8602   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
8603   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
8604   auto *PHIRecipe = Instr->getType()->isVoidTy()
8605                         ? nullptr
8606                         : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr));
8607   if (PHIRecipe) {
8608     Plan->removeVPValueFor(Instr);
8609     Plan->addVPValue(Instr, PHIRecipe);
8610   }
8611   auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
8612   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
8613   VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
8614 
8615   // Note: first set Entry as region entry and then connect successors starting
8616   // from it in order, to propagate the "parent" of each VPBasicBlock.
8617   VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
8618   VPBlockUtils::connectBlocks(Pred, Exit);
8619 
8620   return Region;
8621 }
8622 
8623 VPRecipeOrVPValueTy VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8624                                                             VFRange &Range,
8625                                                             VPlanPtr &Plan) {
8626   // First, check for specific widening recipes that deal with calls, memory
8627   // operations, inductions and Phi nodes.
8628   if (auto *CI = dyn_cast<CallInst>(Instr))
8629     return toVPRecipeResult(tryToWidenCall(CI, Range, *Plan));
8630 
8631   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8632     return toVPRecipeResult(tryToWidenMemory(Instr, Range, Plan));
8633 
8634   VPRecipeBase *Recipe;
8635   if (auto Phi = dyn_cast<PHINode>(Instr)) {
8636     if (Phi->getParent() != OrigLoop->getHeader())
8637       return tryToBlend(Phi, Plan);
8638     if ((Recipe = tryToOptimizeInductionPHI(Phi, *Plan)))
8639       return toVPRecipeResult(Recipe);
8640 
8641     if (Legal->isReductionVariable(Phi)) {
8642       RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
8643       VPValue *StartV =
8644           Plan->getOrAddVPValue(RdxDesc.getRecurrenceStartValue());
8645       return toVPRecipeResult(new VPWidenPHIRecipe(Phi, RdxDesc, *StartV));
8646     }
8647 
8648     return toVPRecipeResult(new VPWidenPHIRecipe(Phi));
8649   }
8650 
8651   if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate(
8652                                     cast<TruncInst>(Instr), Range, *Plan)))
8653     return toVPRecipeResult(Recipe);
8654 
8655   if (!shouldWiden(Instr, Range))
8656     return nullptr;
8657 
8658   if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8659     return toVPRecipeResult(new VPWidenGEPRecipe(
8660         GEP, Plan->mapToVPValues(GEP->operands()), OrigLoop));
8661 
8662   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8663     bool InvariantCond =
8664         PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
8665     return toVPRecipeResult(new VPWidenSelectRecipe(
8666         *SI, Plan->mapToVPValues(SI->operands()), InvariantCond));
8667   }
8668 
8669   return toVPRecipeResult(tryToWiden(Instr, *Plan));
8670 }
8671 
8672 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8673                                                         ElementCount MaxVF) {
8674   assert(OrigLoop->isInnermost() && "Inner loop expected.");
8675 
8676   // Collect instructions from the original loop that will become trivially dead
8677   // in the vectorized loop. We don't need to vectorize these instructions. For
8678   // example, original induction update instructions can become dead because we
8679   // separately emit induction "steps" when generating code for the new loop.
8680   // Similarly, we create a new latch condition when setting up the structure
8681   // of the new loop, so the old one can become dead.
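  // For example, an induction update such as
  //   %iv.next = add i64 %iv, 1
  // is typically re-created as widened induction steps, so the original update
  // can become dead once its remaining users are themselves dead or replaced.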
8682   SmallPtrSet<Instruction *, 4> DeadInstructions;
8683   collectTriviallyDeadInstructions(DeadInstructions);
8684 
8685   // Add assume instructions we need to drop to DeadInstructions, to prevent
8686   // them from being added to the VPlan.
  // TODO: We only need to drop assumes in blocks that get flattened. If the
8688   // control flow is preserved, we should keep them.
8689   auto &ConditionalAssumes = Legal->getConditionalAssumes();
8690   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
8691 
8692   DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
8693   // Dead instructions do not need sinking. Remove them from SinkAfter.
8694   for (Instruction *I : DeadInstructions)
8695     SinkAfter.erase(I);
8696 
8697   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8698   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8699     VFRange SubRange = {VF, MaxVFPlusOne};
8700     VPlans.push_back(
8701         buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
8702     VF = SubRange.End;
8703   }
8704 }
8705 
8706 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
8707     VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
8708     const DenseMap<Instruction *, Instruction *> &SinkAfter) {
8709 
8710   // Hold a mapping from predicated instructions to their recipes, in order to
8711   // fix their AlsoPack behavior if a user is determined to replicate and use a
8712   // scalar instead of vector value.
8713   DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
8714 
8715   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8716 
8717   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
8718 
8719   // ---------------------------------------------------------------------------
8720   // Pre-construction: record ingredients whose recipes we'll need to further
8721   // process after constructing the initial VPlan.
8722   // ---------------------------------------------------------------------------
8723 
8724   // Mark instructions we'll need to sink later and their targets as
8725   // ingredients whose recipe we'll need to record.
8726   for (auto &Entry : SinkAfter) {
8727     RecipeBuilder.recordRecipeOf(Entry.first);
8728     RecipeBuilder.recordRecipeOf(Entry.second);
8729   }
8730   for (auto &Reduction : CM.getInLoopReductionChains()) {
8731     PHINode *Phi = Reduction.first;
8732     RecurKind Kind = Legal->getReductionVars()[Phi].getRecurrenceKind();
8733     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
8734 
8735     RecipeBuilder.recordRecipeOf(Phi);
8736     for (auto &R : ReductionOperations) {
8737       RecipeBuilder.recordRecipeOf(R);
      // For min/max reductions, where we have a pair of icmp/select, we also
8739       // need to record the ICmp recipe, so it can be removed later.
8740       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
8741         RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
8742     }
8743   }
8744 
8745   // For each interleave group which is relevant for this (possibly trimmed)
8746   // Range, add it to the set of groups to be later applied to the VPlan and add
8747   // placeholders for its members' Recipes which we'll be replacing with a
8748   // single VPInterleaveRecipe.
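  // For example, with an interleave factor of 2, member loads of A[2*i] and
  // A[2*i+1] each get a placeholder recipe here; later they are replaced by a
  // single VPInterleaveRecipe that emits one wide load plus shuffles.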
8749   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
8750     auto applyIG = [IG, this](ElementCount VF) -> bool {
8751       return (VF.isVector() && // Query is illegal for VF == 1
8752               CM.getWideningDecision(IG->getInsertPos(), VF) ==
8753                   LoopVectorizationCostModel::CM_Interleave);
8754     };
8755     if (!getDecisionAndClampRange(applyIG, Range))
8756       continue;
8757     InterleaveGroups.insert(IG);
8758     for (unsigned i = 0; i < IG->getFactor(); i++)
8759       if (Instruction *Member = IG->getMember(i))
8760         RecipeBuilder.recordRecipeOf(Member);
8761   };
8762 
8763   // ---------------------------------------------------------------------------
8764   // Build initial VPlan: Scan the body of the loop in a topological order to
8765   // visit each basic block after having visited its predecessor basic blocks.
8766   // ---------------------------------------------------------------------------
8767 
8768   // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
8769   auto Plan = std::make_unique<VPlan>();
8770   VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
8771   Plan->setEntry(VPBB);
8772 
8773   // Scan the body of the loop in a topological order to visit each basic block
8774   // after having visited its predecessor basic blocks.
8775   LoopBlocksDFS DFS(OrigLoop);
8776   DFS.perform(LI);
8777 
8778   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
8779     // Relevant instructions from basic block BB will be grouped into VPRecipe
8780     // ingredients and fill a new VPBasicBlock.
8781     unsigned VPBBsForBB = 0;
8782     auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
8783     VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
8784     VPBB = FirstVPBBForBB;
8785     Builder.setInsertPoint(VPBB);
8786 
8787     // Introduce each ingredient into VPlan.
    // TODO: Model and preserve debug intrinsics in VPlan.
8789     for (Instruction &I : BB->instructionsWithoutDebug()) {
8790       Instruction *Instr = &I;
8791 
8792       // First filter out irrelevant instructions, to ensure no recipes are
8793       // built for them.
8794       if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
8795         continue;
8796 
8797       if (auto RecipeOrValue =
8798               RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) {
8799         // If Instr can be simplified to an existing VPValue, use it.
8800         if (RecipeOrValue.is<VPValue *>()) {
8801           Plan->addVPValue(Instr, RecipeOrValue.get<VPValue *>());
8802           continue;
8803         }
8804         // Otherwise, add the new recipe.
8805         VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>();
8806         for (auto *Def : Recipe->definedValues()) {
8807           auto *UV = Def->getUnderlyingValue();
8808           Plan->addVPValue(UV, Def);
8809         }
8810 
8811         RecipeBuilder.setRecipe(Instr, Recipe);
8812         VPBB->appendRecipe(Recipe);
8813         continue;
8814       }
8815 
      // Otherwise, if all widening options failed, the instruction is to be
      // replicated. This may create a successor for VPBB.
8818       VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
8819           Instr, Range, VPBB, PredInst2Recipe, Plan);
8820       if (NextVPBB != VPBB) {
8821         VPBB = NextVPBB;
8822         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
8823                                     : "");
8824       }
8825     }
8826   }
8827 
  // Discard the empty dummy pre-entry VPBasicBlock. Note that other
  // VPBasicBlocks may also be empty, such as the last one, VPBB, reflecting
  // original basic blocks with no recipes.
8831   VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
8832   assert(PreEntry->empty() && "Expecting empty pre-entry block.");
8833   VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
8834   VPBlockUtils::disconnectBlocks(PreEntry, Entry);
8835   delete PreEntry;
8836 
8837   // ---------------------------------------------------------------------------
8838   // Transform initial VPlan: Apply previously taken decisions, in order, to
8839   // bring the VPlan to its final state.
8840   // ---------------------------------------------------------------------------
8841 
8842   // Apply Sink-After legal constraints.
8843   for (auto &Entry : SinkAfter) {
8844     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
8845     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
8846     // If the target is in a replication region, make sure to move Sink to the
8847     // block after it, not into the replication region itself.
8848     if (auto *Region =
8849             dyn_cast_or_null<VPRegionBlock>(Target->getParent()->getParent())) {
8850       if (Region->isReplicator()) {
8851         assert(Region->getNumSuccessors() == 1 && "Expected SESE region!");
8852         VPBasicBlock *NextBlock =
8853             cast<VPBasicBlock>(Region->getSuccessors().front());
8854         Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi());
8855         continue;
8856       }
8857     }
8858     Sink->moveAfter(Target);
8859   }
8860 
8861   // Interleave memory: for each Interleave Group we marked earlier as relevant
8862   // for this VPlan, replace the Recipes widening its memory instructions with a
8863   // single VPInterleaveRecipe at its insertion point.
8864   for (auto IG : InterleaveGroups) {
8865     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
8866         RecipeBuilder.getRecipe(IG->getInsertPos()));
8867     SmallVector<VPValue *, 4> StoredValues;
8868     for (unsigned i = 0; i < IG->getFactor(); ++i)
8869       if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i)))
8870         StoredValues.push_back(Plan->getOrAddVPValue(SI->getOperand(0)));
8871 
8872     auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
8873                                         Recipe->getMask());
8874     VPIG->insertBefore(Recipe);
8875     unsigned J = 0;
8876     for (unsigned i = 0; i < IG->getFactor(); ++i)
8877       if (Instruction *Member = IG->getMember(i)) {
8878         if (!Member->getType()->isVoidTy()) {
8879           VPValue *OriginalV = Plan->getVPValue(Member);
8880           Plan->removeVPValueFor(Member);
8881           Plan->addVPValue(Member, VPIG->getVPValue(J));
8882           OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
8883           J++;
8884         }
8885         RecipeBuilder.getRecipe(Member)->eraseFromParent();
8886       }
8887   }
8888 
  // Adjust the recipes for any in-loop reductions.
8890   if (Range.Start.isVector())
8891     adjustRecipesForInLoopReductions(Plan, RecipeBuilder);
8892 
8893   // Finally, if tail is folded by masking, introduce selects between the phi
8894   // and the live-out instruction of each reduction, at the end of the latch.
8895   if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) {
8896     Builder.setInsertPoint(VPBB);
8897     auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
8898     for (auto &Reduction : Legal->getReductionVars()) {
8899       if (CM.isInLoopReduction(Reduction.first))
8900         continue;
8901       VPValue *Phi = Plan->getOrAddVPValue(Reduction.first);
8902       VPValue *Red = Plan->getOrAddVPValue(Reduction.second.getLoopExitInstr());
8903       Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
8904     }
8905   }
8906 
8907   std::string PlanName;
8908   raw_string_ostream RSO(PlanName);
8909   ElementCount VF = Range.Start;
8910   Plan->addVF(VF);
8911   RSO << "Initial VPlan for VF={" << VF;
8912   for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) {
8913     Plan->addVF(VF);
8914     RSO << "," << VF;
8915   }
8916   RSO << "},UF>=1";
8917   RSO.flush();
8918   Plan->setName(PlanName);
8919 
8920   return Plan;
8921 }
8922 
8923 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
8924   // Outer loop handling: They may require CFG and instruction level
8925   // transformations before even evaluating whether vectorization is profitable.
8926   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
8927   // the vectorization pipeline.
8928   assert(!OrigLoop->isInnermost());
8929   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
8930 
8931   // Create new empty VPlan
8932   auto Plan = std::make_unique<VPlan>();
8933 
8934   // Build hierarchical CFG
8935   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
8936   HCFGBuilder.buildHierarchicalCFG();
8937 
8938   for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
8939        VF *= 2)
8940     Plan->addVF(VF);
8941 
8942   if (EnableVPlanPredication) {
8943     VPlanPredicator VPP(*Plan);
8944     VPP.predicate();
8945 
8946     // Avoid running transformation to recipes until masked code generation in
8947     // VPlan-native path is in place.
8948     return Plan;
8949   }
8950 
8951   SmallPtrSet<Instruction *, 1> DeadInstructions;
8952   VPlanTransforms::VPInstructionsToVPRecipes(
8953       OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
8954   return Plan;
8955 }
8956 
// Adjust the recipes for any in-loop reductions. The chain of instructions
// leading from the loop exit instruction to the phi needs to be converted to
// reductions, with one operand being the vector and the other being the
// scalar reduction chain.
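//
// For example, for an in-loop integer add reduction
//   %sum.next = add i32 %sum.phi, %a
// the widened add is replaced by a VPReductionRecipe that reduces the widened
// %a to a scalar and adds the result to the scalar chain value (%sum.phi).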
8961 void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
8962     VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) {
8963   for (auto &Reduction : CM.getInLoopReductionChains()) {
8964     PHINode *Phi = Reduction.first;
8965     RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
8966     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
8967 
    // ReductionOperations are ordered top-down from the phi's use to the
    // LoopExitValue. We keep track of the previous item (the Chain) to tell
8970     // which of the two operands will remain scalar and which will be reduced.
8971     // For minmax the chain will be the select instructions.
8972     Instruction *Chain = Phi;
8973     for (Instruction *R : ReductionOperations) {
8974       VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
8975       RecurKind Kind = RdxDesc.getRecurrenceKind();
8976 
8977       VPValue *ChainOp = Plan->getVPValue(Chain);
8978       unsigned FirstOpId;
8979       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
8980         assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
8981                "Expected to replace a VPWidenSelectSC");
8982         FirstOpId = 1;
8983       } else {
8984         assert(isa<VPWidenRecipe>(WidenRecipe) &&
8985                "Expected to replace a VPWidenSC");
8986         FirstOpId = 0;
8987       }
8988       unsigned VecOpId =
8989           R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
8990       VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
8991 
8992       auto *CondOp = CM.foldTailByMasking()
8993                          ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
8994                          : nullptr;
8995       VPReductionRecipe *RedRecipe = new VPReductionRecipe(
8996           &RdxDesc, R, ChainOp, VecOp, CondOp, TTI);
8997       WidenRecipe->getVPValue()->replaceAllUsesWith(RedRecipe);
8998       Plan->removeVPValueFor(R);
8999       Plan->addVPValue(R, RedRecipe);
      WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator());
      WidenRecipe->eraseFromParent();
9003 
9004       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9005         VPRecipeBase *CompareRecipe =
9006             RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
9007         assert(isa<VPWidenRecipe>(CompareRecipe) &&
9008                "Expected to replace a VPWidenSC");
9009         assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 &&
9010                "Expected no remaining users");
9011         CompareRecipe->eraseFromParent();
9012       }
9013       Chain = R;
9014     }
9015   }
9016 }
9017 
9018 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
9019                                VPSlotTracker &SlotTracker) const {
9020   O << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
9021   IG->getInsertPos()->printAsOperand(O, false);
9022   O << ", ";
9023   getAddr()->printAsOperand(O, SlotTracker);
9024   VPValue *Mask = getMask();
9025   if (Mask) {
9026     O << ", ";
9027     Mask->printAsOperand(O, SlotTracker);
9028   }
9029   for (unsigned i = 0; i < IG->getFactor(); ++i)
9030     if (Instruction *I = IG->getMember(i))
9031       O << "\\l\" +\n" << Indent << "\"  " << VPlanIngredient(I) << " " << i;
9032 }
9033 
9034 void VPWidenCallRecipe::execute(VPTransformState &State) {
9035   State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this,
9036                                   *this, State);
9037 }
9038 
9039 void VPWidenSelectRecipe::execute(VPTransformState &State) {
9040   State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()),
9041                                     this, *this, InvariantCond, State);
9042 }
9043 
9044 void VPWidenRecipe::execute(VPTransformState &State) {
9045   State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State);
9046 }
9047 
9048 void VPWidenGEPRecipe::execute(VPTransformState &State) {
9049   State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this,
9050                       *this, State.UF, State.VF, IsPtrLoopInvariant,
9051                       IsIndexLoopInvariant, State);
9052 }
9053 
9054 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
9055   assert(!State.Instance && "Int or FP induction being replicated.");
9056   State.ILV->widenIntOrFpInduction(IV, getStartValue()->getLiveInIRValue(),
9057                                    getTruncInst(), getVPValue(0),
9058                                    getCastValue(), State);
9059 }
9060 
9061 void VPWidenPHIRecipe::execute(VPTransformState &State) {
9062   State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), RdxDesc,
9063                                  getStartValue(), this, State);
9064 }
9065 
9066 void VPBlendRecipe::execute(VPTransformState &State) {
9067   State.ILV->setDebugLocFromInst(State.Builder, Phi);
9068   // We know that all PHIs in non-header blocks are converted into
9069   // selects, so we don't have to worry about the insertion order and we
9070   // can just use the builder.
9071   // At this point we generate the predication tree. There may be
9072   // duplications since this is a simple recursive scan, but future
9073   // optimizations will clean it up.
9074 
9075   unsigned NumIncoming = getNumIncomingValues();
9076 
9077   // Generate a sequence of selects of the form:
9078   // SELECT(Mask3, In3,
9079   //        SELECT(Mask2, In2,
9080   //               SELECT(Mask1, In1,
9081   //                      In0)))
  // Note that Mask0 is never used: lanes for which no path reaches this phi
  // are essentially undef and are taken from In0.
9084   InnerLoopVectorizer::VectorParts Entry(State.UF);
9085   for (unsigned In = 0; In < NumIncoming; ++In) {
9086     for (unsigned Part = 0; Part < State.UF; ++Part) {
9087       // We might have single edge PHIs (blocks) - use an identity
9088       // 'select' for the first PHI operand.
9089       Value *In0 = State.get(getIncomingValue(In), Part);
9090       if (In == 0)
9091         Entry[Part] = In0; // Initialize with the first incoming value.
9092       else {
9093         // Select between the current value and the previous incoming edge
9094         // based on the incoming mask.
9095         Value *Cond = State.get(getMask(In), Part);
9096         Entry[Part] =
9097             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
9098       }
9099     }
9100   }
9101   for (unsigned Part = 0; Part < State.UF; ++Part)
9102     State.set(this, Entry[Part], Part);
9103 }
9104 
9105 void VPInterleaveRecipe::execute(VPTransformState &State) {
9106   assert(!State.Instance && "Interleave group being replicated.");
9107   State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
9108                                       getStoredValues(), getMask());
9109 }
9110 
9111 void VPReductionRecipe::execute(VPTransformState &State) {
9112   assert(!State.Instance && "Reduction being replicated.");
9113   for (unsigned Part = 0; Part < State.UF; ++Part) {
9114     RecurKind Kind = RdxDesc->getRecurrenceKind();
9115     Value *NewVecOp = State.get(getVecOp(), Part);
9116     if (VPValue *Cond = getCondOp()) {
9117       Value *NewCond = State.get(Cond, Part);
9118       VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
9119       Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
9120           Kind, VecTy->getElementType());
9121       Constant *IdenVec =
9122           ConstantVector::getSplat(VecTy->getElementCount(), Iden);
9123       Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
9124       NewVecOp = Select;
9125     }
9126     Value *NewRed =
9127         createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp);
9128     Value *PrevInChain = State.get(getChainOp(), Part);
9129     Value *NextInChain;
9130     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9131       NextInChain =
9132           createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(),
9133                          NewRed, PrevInChain);
9134     } else {
9135       NextInChain = State.Builder.CreateBinOp(
9136           (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed,
9137           PrevInChain);
9138     }
9139     State.set(this, NextInChain, Part);
9140   }
9141 }
9142 
9143 void VPReplicateRecipe::execute(VPTransformState &State) {
9144   if (State.Instance) { // Generate a single instance.
9145     assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9146     State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this,
9147                                     *State.Instance, IsPredicated, State);
9148     // Insert scalar instance packing it into a vector.
9149     if (AlsoPack && State.VF.isVector()) {
9150       // If we're constructing lane 0, initialize to start from poison.
9151       if (State.Instance->Lane == 0) {
9152         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9153         Value *Poison = PoisonValue::get(
9154             VectorType::get(getUnderlyingValue()->getType(), State.VF));
9155         State.set(this, Poison, State.Instance->Part);
9156       }
9157       State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
9158     }
9159     return;
9160   }
9161 
  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for
  // each of the UF parts.
9165   unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
9166   assert((!State.VF.isScalable() || IsUniform) &&
9167          "Can't scalarize a scalable vector");
9168   for (unsigned Part = 0; Part < State.UF; ++Part)
9169     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9170       State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this,
9171                                       VPIteration(Part, Lane), IsPredicated,
9172                                       State);
9173 }
9174 
9175 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
9176   assert(State.Instance && "Branch on Mask works only on single instance.");
9177 
9178   unsigned Part = State.Instance->Part;
9179   unsigned Lane = State.Instance->Lane;
9180 
9181   Value *ConditionBit = nullptr;
9182   VPValue *BlockInMask = getMask();
9183   if (BlockInMask) {
9184     ConditionBit = State.get(BlockInMask, Part);
9185     if (ConditionBit->getType()->isVectorTy())
9186       ConditionBit = State.Builder.CreateExtractElement(
9187           ConditionBit, State.Builder.getInt32(Lane));
9188   } else // Block in mask is all-one.
9189     ConditionBit = State.Builder.getTrue();
9190 
9191   // Replace the temporary unreachable terminator with a new conditional branch,
9192   // whose two destinations will be set later when they are created.
9193   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
9194   assert(isa<UnreachableInst>(CurrentTerminator) &&
9195          "Expected to replace unreachable terminator with conditional branch.");
9196   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
9197   CondBr->setSuccessor(0, nullptr);
9198   ReplaceInstWithInst(CurrentTerminator, CondBr);
9199 }
9200 
9201 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
9202   assert(State.Instance && "Predicated instruction PHI works per instance.");
9203   Instruction *ScalarPredInst =
9204       cast<Instruction>(State.get(getOperand(0), *State.Instance));
9205   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
9206   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
9207   assert(PredicatingBB && "Predicated block has no single predecessor.");
9208   assert(isa<VPReplicateRecipe>(getOperand(0)) &&
9209          "operand must be VPReplicateRecipe");
9210 
9211   // By current pack/unpack logic we need to generate only a single phi node: if
9212   // a vector value for the predicated instruction exists at this point it means
9213   // the instruction has vector users only, and a phi for the vector value is
9214   // needed. In this case the recipe of the predicated instruction is marked to
9215   // also do that packing, thereby "hoisting" the insert-element sequence.
9216   // Otherwise, a phi node for the scalar value is needed.
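  // For example, if the predicated instruction has only vector users, the phi
  // built here merges the incoming vector from the predicating block
  // (unmodified) with the vector from the predicated block that has the new
  // element inserted.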
9217   unsigned Part = State.Instance->Part;
9218   if (State.hasVectorValue(getOperand(0), Part)) {
9219     Value *VectorValue = State.get(getOperand(0), Part);
9220     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
9221     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
9222     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
9223     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
9224     if (State.hasVectorValue(this, Part))
9225       State.reset(this, VPhi, Part);
9226     else
9227       State.set(this, VPhi, Part);
9228     // NOTE: Currently we need to update the value of the operand, so the next
9229     // predicated iteration inserts its generated value in the correct vector.
9230     State.reset(getOperand(0), VPhi, Part);
9231   } else {
9232     Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
9233     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
9234     Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
9235                      PredicatingBB);
9236     Phi->addIncoming(ScalarPredInst, PredicatedBB);
9237     if (State.hasScalarValue(this, *State.Instance))
9238       State.reset(this, Phi, *State.Instance);
9239     else
9240       State.set(this, Phi, *State.Instance);
9241     // NOTE: Currently we need to update the value of the operand, so the next
9242     // predicated iteration inserts its generated value in the correct vector.
9243     State.reset(getOperand(0), Phi, *State.Instance);
9244   }
9245 }
9246 
9247 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
9248   VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
9249   State.ILV->vectorizeMemoryInstruction(&Ingredient, State,
9250                                         StoredValue ? nullptr : getVPValue(),
9251                                         getAddr(), StoredValue, getMask());
9252 }
9253 
9254 // Determine how to lower the scalar epilogue, which depends on 1) optimising
9255 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
9256 // predication, and 4) a TTI hook that analyses whether the loop is suitable
9257 // for predication.
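// For example, PreferPredicateTy::PredicateOrDontVectorize (roughly) requests
// folding the tail by masking and forbids a scalar remainder loop; if the loop
// cannot be fully predicated, it is not vectorized at all.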
9258 static ScalarEpilogueLowering getScalarEpilogueLowering(
9259     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
9260     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
9261     AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
9262     LoopVectorizationLegality &LVL) {
9263   // 1) OptSize takes precedence over all other options, i.e. if this is set,
9264   // don't look at hints or options, and don't request a scalar epilogue.
9265   // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
9266   // LoopAccessInfo (due to code dependency and not being able to reliably get
9267   // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
9268   // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
9269   // versioning when the vectorization is forced, unlike hasOptSize. So revert
9270   // back to the old way and vectorize with versioning when forced. See D81345.)
9271   if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
9272                                                       PGSOQueryType::IRPass) &&
9273                           Hints.getForce() != LoopVectorizeHints::FK_Enabled))
9274     return CM_ScalarEpilogueNotAllowedOptSize;
9275 
9276   // 2) If set, obey the directives
9277   if (PreferPredicateOverEpilogue.getNumOccurrences()) {
9278     switch (PreferPredicateOverEpilogue) {
9279     case PreferPredicateTy::ScalarEpilogue:
9280       return CM_ScalarEpilogueAllowed;
9281     case PreferPredicateTy::PredicateElseScalarEpilogue:
9282       return CM_ScalarEpilogueNotNeededUsePredicate;
9283     case PreferPredicateTy::PredicateOrDontVectorize:
9284       return CM_ScalarEpilogueNotAllowedUsePredicate;
9285     };
9286   }
9287 
9288   // 3) If set, obey the hints
9289   switch (Hints.getPredicate()) {
9290   case LoopVectorizeHints::FK_Enabled:
9291     return CM_ScalarEpilogueNotNeededUsePredicate;
9292   case LoopVectorizeHints::FK_Disabled:
9293     return CM_ScalarEpilogueAllowed;
9294   };
9295 
9296   // 4) if the TTI hook indicates this is profitable, request predication.
9297   if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
9298                                        LVL.getLAI()))
9299     return CM_ScalarEpilogueNotNeededUsePredicate;
9300 
9301   return CM_ScalarEpilogueAllowed;
9302 }
9303 
9304 Value *VPTransformState::get(VPValue *Def, unsigned Part) {
9305   // If Values have been set for this Def return the one relevant for \p Part.
9306   if (hasVectorValue(Def, Part))
9307     return Data.PerPartOutput[Def][Part];
9308 
9309   if (!hasScalarValue(Def, {Part, 0})) {
9310     Value *IRV = Def->getLiveInIRValue();
9311     Value *B = ILV->getBroadcastInstrs(IRV);
9312     set(Def, B, Part);
9313     return B;
9314   }
9315 
9316   Value *ScalarValue = get(Def, {Part, 0});
9317   // If we aren't vectorizing, we can just copy the scalar map values over
9318   // to the vector map.
9319   if (VF.isScalar()) {
9320     set(Def, ScalarValue, Part);
9321     return ScalarValue;
9322   }
9323 
9324   auto *RepR = dyn_cast<VPReplicateRecipe>(Def);
9325   bool IsUniform = RepR && RepR->isUniform();
9326 
9327   unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1;
9328   auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane}));
9329 
9330   // Set the insert point after the last scalarized instruction. This
9331   // ensures the insertelement sequence will directly follow the scalar
9332   // definitions.
9333   auto OldIP = Builder.saveIP();
9334   auto NewIP = std::next(BasicBlock::iterator(LastInst));
9335   Builder.SetInsertPoint(&*NewIP);
9336 
9337   // However, if we are vectorizing, we need to construct the vector values.
9338   // If the value is known to be uniform after vectorization, we can just
9339   // broadcast the scalar value corresponding to lane zero for each unroll
9340   // iteration. Otherwise, we construct the vector values using
9341   // insertelement instructions. Since the resulting vectors are stored in
9342   // State, we will only generate the insertelements once.
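  // For a non-uniform Def at VF = 4 this builds, roughly:
  //   %v0 = insertelement <4 x T> poison, T %s0, i32 0
  //   %v1 = insertelement <4 x T> %v0, T %s1, i32 1
  //   ... (one insertelement per lane)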
9343   Value *VectorValue = nullptr;
9344   if (IsUniform) {
9345     VectorValue = ILV->getBroadcastInstrs(ScalarValue);
9346     set(Def, VectorValue, Part);
9347   } else {
    // Initialize packing with insertelements to start from poison.
    assert(!VF.isScalable() && "VF is assumed to be non scalable.");
    Value *Poison = PoisonValue::get(VectorType::get(LastInst->getType(), VF));
    set(Def, Poison, Part);
9352     for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
9353       ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this);
9354     VectorValue = get(Def, Part);
9355   }
9356   Builder.restoreIP(OldIP);
9357   return VectorValue;
9358 }
9359 
9360 // Process the loop in the VPlan-native vectorization path. This path builds
9361 // VPlan upfront in the vectorization pipeline, which allows to apply
9362 // VPlan-to-VPlan transformations from the very beginning without modifying the
9363 // input LLVM IR.
9364 static bool processLoopInVPlanNativePath(
9365     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
9366     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
9367     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
9368     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
9369     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
9370 
9371   if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
9372     LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
9373     return false;
9374   }
9375   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
9376   Function *F = L->getHeader()->getParent();
9377   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
9378 
9379   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
9380       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
9381 
9382   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
9383                                 &Hints, IAI);
9384   // Use the planner for outer loop vectorization.
9385   // TODO: CM is not used at this point inside the planner. Turn CM into an
9386   // optional argument if we don't need it in the future.
9387   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE);
9388 
9389   // Get user vectorization factor.
9390   ElementCount UserVF = Hints.getWidth();
9391 
9392   // Plan how to best vectorize, return the best VF and its cost.
9393   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
9394 
9395   // If we are stress testing VPlan builds, do not attempt to generate vector
9396   // code. Masked vector code generation support will follow soon.
9397   // Also, do not attempt to vectorize if no vector code will be produced.
9398   if (VPlanBuildStressTest || EnableVPlanPredication ||
9399       VectorizationFactor::Disabled() == VF)
9400     return false;
9401 
9402   LVP.setBestPlan(VF.Width, 1);
9403 
9404   {
9405     GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
9406                              F->getParent()->getDataLayout());
9407     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
9408                            &CM, BFI, PSI, Checks);
9409     LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
9410                       << L->getHeader()->getParent()->getName() << "\"\n");
9411     LVP.executePlan(LB, DT);
9412   }
9413 
9414   // Mark the loop as already vectorized to avoid vectorizing again.
9415   Hints.setAlreadyVectorized();
9416   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
9417   return true;
9418 }
9419 
9420 // Emit a remark if there are stores to floats that required a floating point
9421 // extension. If the vectorized loop was generated with floating point there
9422 // will be a performance penalty from the conversion overhead and the change in
9423 // the vector width.
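// For example, a store such as
//   %e = fpext half %h to float
//   store float %e, float* %p
// triggers the remark below, since mixing half and float lanes in the
// vectorized loop requires an extra up/down cast.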
9424 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
9425   SmallVector<Instruction *, 4> Worklist;
9426   for (BasicBlock *BB : L->getBlocks()) {
9427     for (Instruction &Inst : *BB) {
9428       if (auto *S = dyn_cast<StoreInst>(&Inst)) {
9429         if (S->getValueOperand()->getType()->isFloatTy())
9430           Worklist.push_back(S);
9431       }
9432     }
9433   }
9434 
  // Traverse upwards from the floating point stores, searching for floating
  // point conversions.
9437   SmallPtrSet<const Instruction *, 4> Visited;
9438   SmallPtrSet<const Instruction *, 4> EmittedRemark;
9439   while (!Worklist.empty()) {
9440     auto *I = Worklist.pop_back_val();
9441     if (!L->contains(I))
9442       continue;
9443     if (!Visited.insert(I).second)
9444       continue;
9445 
9446     // Emit a remark if the floating point store required a floating
9447     // point conversion.
9448     // TODO: More work could be done to identify the root cause such as a
9449     // constant or a function return type and point the user to it.
9450     if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
9451       ORE->emit([&]() {
9452         return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
9453                                           I->getDebugLoc(), L->getHeader())
9454                << "floating point conversion changes vector width. "
9455                << "Mixed floating point precision requires an up/down "
9456                << "cast that will negatively impact performance.";
9457       });
9458 
9459     for (Use &Op : I->operands())
9460       if (auto *OpI = dyn_cast<Instruction>(Op))
9461         Worklist.push_back(OpI);
9462   }
9463 }
9464 
9465 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
9466     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
9467                                !EnableLoopInterleaving),
9468       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
9469                               !EnableLoopVectorization) {}
9470 
9471 bool LoopVectorizePass::processLoop(Loop *L) {
9472   assert((EnableVPlanNativePath || L->isInnermost()) &&
9473          "VPlan-native path is not enabled. Only process inner loops.");
9474 
9475 #ifndef NDEBUG
9476   const std::string DebugLocStr = getDebugLocString(L);
9477 #endif /* NDEBUG */
9478 
9479   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
9480                     << L->getHeader()->getParent()->getName() << "\" from "
9481                     << DebugLocStr << "\n");
9482 
9483   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
9484 
9485   LLVM_DEBUG(
9486       dbgs() << "LV: Loop hints:"
9487              << " force="
9488              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
9489                      ? "disabled"
9490                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
9491                             ? "enabled"
9492                             : "?"))
9493              << " width=" << Hints.getWidth()
9494              << " unroll=" << Hints.getInterleave() << "\n");
9495 
9496   // Function containing loop
9497   Function *F = L->getHeader()->getParent();
9498 
9499   // Looking at the diagnostic output is the only way to determine if a loop
9500   // was vectorized (other than looking at the IR or machine code), so it
9501   // is important to generate an optimization remark for each loop. Most of
  // these messages are generated as OptimizationRemarkAnalysis. Remarks
  // generated as OptimizationRemark and OptimizationRemarkMissed report, less
  // verbosely, vectorized loops and unvectorized loops that may benefit from
  // vectorization, respectively.
9506 
9507   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
9508     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
9509     return false;
9510   }
9511 
9512   PredicatedScalarEvolution PSE(*SE, *L);
9513 
9514   // Check if it is legal to vectorize the loop.
9515   LoopVectorizationRequirements Requirements(*ORE);
9516   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
9517                                 &Requirements, &Hints, DB, AC, BFI, PSI);
9518   if (!LVL.canVectorize(EnableVPlanNativePath)) {
9519     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
9520     Hints.emitRemarkWithHints();
9521     return false;
9522   }
9523 
9524   // Check the function attributes and profiles to find out if this function
9525   // should be optimized for size.
9526   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
9527       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
9528 
9529   // Entrance to the VPlan-native vectorization path. Outer loops are processed
9530   // here. They may require CFG and instruction level transformations before
9531   // even evaluating whether vectorization is profitable. Since we cannot modify
9532   // the incoming IR, we need to build VPlan upfront in the vectorization
9533   // pipeline.
9534   if (!L->isInnermost())
9535     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
9536                                         ORE, BFI, PSI, Hints);
9537 
9538   assert(L->isInnermost() && "Inner loop expected.");
9539 
9540   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
9541   // count by optimizing for size, to minimize overheads.
9542   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
9543   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
9544     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
9545                       << "This loop is worth vectorizing only if no scalar "
9546                       << "iteration overheads are incurred.");
9547     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
9548       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
9549     else {
9550       LLVM_DEBUG(dbgs() << "\n");
9551       SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
9552     }
9553   }
9554 
9555   // Check the function attributes to see if implicit floats are allowed.
9556   // FIXME: This check doesn't seem possibly correct -- what if the loop is
9557   // an integer loop and the vector instructions selected are purely integer
9558   // vector instructions?
9559   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
9560     reportVectorizationFailure(
9561         "Can't vectorize when the NoImplicitFloat attribute is used",
9562         "loop not vectorized due to NoImplicitFloat attribute",
9563         "NoImplicitFloat", ORE, L);
9564     Hints.emitRemarkWithHints();
9565     return false;
9566   }
9567 
9568   // Check if the target supports potentially unsafe FP vectorization.
9569   // FIXME: Add a check for the type of safety issue (denormal, signaling)
9570   // for the target we're vectorizing for, to make sure none of the
9571   // additional fp-math flags can help.
9572   if (Hints.isPotentiallyUnsafe() &&
9573       TTI->isFPVectorizationPotentiallyUnsafe()) {
9574     reportVectorizationFailure(
9575         "Potentially unsafe FP op prevents vectorization",
9576         "loop not vectorized due to unsafe FP support.",
9577         "UnsafeFP", ORE, L);
9578     Hints.emitRemarkWithHints();
9579     return false;
9580   }
9581 
9582   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
9583   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
9584 
9585   // If an override option has been passed in for interleaved accesses, use it.
9586   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
9587     UseInterleaved = EnableInterleavedMemAccesses;
9588 
9589   // Analyze interleaved memory accesses.
9590   if (UseInterleaved) {
9591     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
9592   }
9593 
9594   // Use the cost model.
9595   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
9596                                 F, &Hints, IAI);
9597   CM.collectValuesToIgnore();
9598 
9599   // Use the planner for vectorization.
9600   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE);
9601 
9602   // Get user vectorization factor and interleave count.
9603   ElementCount UserVF = Hints.getWidth();
9604   unsigned UserIC = Hints.getInterleave();
9605 
9606   // Plan how to best vectorize, return the best VF and its cost.
9607   Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
9608 
9609   VectorizationFactor VF = VectorizationFactor::Disabled();
9610   unsigned IC = 1;
9611 
9612   if (MaybeVF) {
9613     VF = *MaybeVF;
9614     // Select the interleave count.
9615     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
9616   }
9617 
9618   // Identify the diagnostic messages that should be produced.
9619   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
9620   bool VectorizeLoop = true, InterleaveLoop = true;
9621   if (Requirements.doesNotMeet(F, L, Hints)) {
9622     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
9623                          "requirements.\n");
9624     Hints.emitRemarkWithHints();
9625     return false;
9626   }
9627 
9628   if (VF.Width.isScalar()) {
9629     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
9630     VecDiagMsg = std::make_pair(
9631         "VectorizationNotBeneficial",
9632         "the cost-model indicates that vectorization is not beneficial");
9633     VectorizeLoop = false;
9634   }
9635 
9636   if (!MaybeVF && UserIC > 1) {
9637     // Tell the user interleaving was avoided up-front, despite being explicitly
9638     // requested.
9639     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
9640                          "interleaving should be avoided up front\n");
9641     IntDiagMsg = std::make_pair(
9642         "InterleavingAvoided",
9643         "Ignoring UserIC, because interleaving was avoided up front");
9644     InterleaveLoop = false;
9645   } else if (IC == 1 && UserIC <= 1) {
9646     // Tell the user interleaving is not beneficial.
9647     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
9648     IntDiagMsg = std::make_pair(
9649         "InterleavingNotBeneficial",
9650         "the cost-model indicates that interleaving is not beneficial");
9651     InterleaveLoop = false;
9652     if (UserIC == 1) {
9653       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
9654       IntDiagMsg.second +=
9655           " and is explicitly disabled or interleave count is set to 1";
9656     }
9657   } else if (IC > 1 && UserIC == 1) {
    // Tell the user that interleaving is beneficial, but it is explicitly
    // disabled.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is beneficial but is explicitly "
                         "disabled.\n");
9661     IntDiagMsg = std::make_pair(
9662         "InterleavingBeneficialButDisabled",
9663         "the cost-model indicates that interleaving is beneficial "
9664         "but is explicitly disabled or interleave count is set to 1");
9665     InterleaveLoop = false;
9666   }
9667 
9668   // Override IC if user provided an interleave count.
9669   IC = UserIC > 0 ? UserIC : IC;
9670 
9671   // Emit diagnostic messages, if any.
9672   const char *VAPassName = Hints.vectorizeAnalysisPassName();
9673   if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
9675     ORE->emit([&]() {
9676       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
9677                                       L->getStartLoc(), L->getHeader())
9678              << VecDiagMsg.second;
9679     });
9680     ORE->emit([&]() {
9681       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
9682                                       L->getStartLoc(), L->getHeader())
9683              << IntDiagMsg.second;
9684     });
9685     return false;
9686   } else if (!VectorizeLoop && InterleaveLoop) {
9687     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
9688     ORE->emit([&]() {
9689       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
9690                                         L->getStartLoc(), L->getHeader())
9691              << VecDiagMsg.second;
9692     });
9693   } else if (VectorizeLoop && !InterleaveLoop) {
9694     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
9695                       << ") in " << DebugLocStr << '\n');
9696     ORE->emit([&]() {
9697       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
9698                                         L->getStartLoc(), L->getHeader())
9699              << IntDiagMsg.second;
9700     });
9701   } else if (VectorizeLoop && InterleaveLoop) {
9702     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
9703                       << ") in " << DebugLocStr << '\n');
9704     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
9705   }
9706 
9707   bool DisableRuntimeUnroll = false;
9708   MDNode *OrigLoopID = L->getLoopID();
9709   {
    // Optimistically generate runtime checks. Drop them if they turn out not
    // to be profitable. Limit the scope of Checks, so the cleanup happens
    // immediately after vector code generation is done.
9713     GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
9714                              F->getParent()->getDataLayout());
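    // Runtime checks are only needed if we are actually going to transform
    // the loop, i.e. when vectorizing (VF > 1) or interleaving (IC > 1).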
9715     if (!VF.Width.isScalar() || IC > 1)
9716       Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate());
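    // Tell the planner which VF and interleave count to generate code for.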
9717     LVP.setBestPlan(VF.Width, IC);
9718 
9719     using namespace ore;
9720     if (!VectorizeLoop) {
9721       assert(IC > 1 && "interleave count should not be 1 or 0");
      // If we decided that it is not worthwhile to vectorize the loop, then
      // interleave it instead.
9724       InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
9725                                  &CM, BFI, PSI, Checks);
9726       LVP.executePlan(Unroller, DT);
9727 
9728       ORE->emit([&]() {
9729         return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
9730                                   L->getHeader())
9731                << "interleaved loop (interleaved count: "
9732                << NV("InterleaveCount", IC) << ")";
9733       });
9734     } else {
      // If we decided that it is *worthwhile* to vectorize the loop, then do
      // it.
9736 
9737       // Consider vectorizing the epilogue too if it's profitable.
9738       VectorizationFactor EpilogueVF =
9739           CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
9740       if (EpilogueVF.Width.isVector()) {
9741 
9742         // The first pass vectorizes the main loop and creates a scalar epilogue
9743         // to be vectorized by executing the plan (potentially with a different
9744         // factor) again shortly afterwards.
9745         EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC,
9746                                           EpilogueVF.Width.getKnownMinValue(),
9747                                           1);
9748         EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
9749                                            EPI, &LVL, &CM, BFI, PSI, Checks);
9750 
9751         LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF);
9752         LVP.executePlan(MainILV, DT);
9753         ++LoopsVectorized;
9754 
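        // Re-establish simplified loop form and LCSSA for the remainder loop
        // before vectorizing the epilogue in the second pass.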
9755         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
9756         formLCSSARecursively(*L, *DT, LI, SE);
9757 
9758         // Second pass vectorizes the epilogue and adjusts the control flow
9759         // edges from the first pass.
9760         LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF);
9761         EPI.MainLoopVF = EPI.EpilogueVF;
9762         EPI.MainLoopUF = EPI.EpilogueUF;
9763         EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
9764                                                  ORE, EPI, &LVL, &CM, BFI, PSI,
9765                                                  Checks);
9766         LVP.executePlan(EpilogILV, DT);
9767         ++LoopsEpilogueVectorized;
9768 
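        // As in the non-epilogue path below, disable runtime unrolling of the
        // remainder loop if no runtime safety checks were added.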
9769         if (!MainILV.areSafetyChecksAdded())
9770           DisableRuntimeUnroll = true;
9771       } else {
9772         InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
9773                                &LVL, &CM, BFI, PSI, Checks);
9774         LVP.executePlan(LB, DT);
9775         ++LoopsVectorized;
9776 
9777         // Add metadata to disable runtime unrolling a scalar loop when there
9778         // are no runtime checks about strides and memory. A scalar loop that is
9779         // rarely used is not worth unrolling.
9780         if (!LB.areSafetyChecksAdded())
9781           DisableRuntimeUnroll = true;
9782       }
9783       // Report the vectorization decision.
9784       ORE->emit([&]() {
9785         return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
9786                                   L->getHeader())
9787                << "vectorized loop (vectorization width: "
9788                << NV("VectorizationFactor", VF.Width)
9789                << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
9790       });
9791     }
9792 
9793     if (ORE->allowExtraAnalysis(LV_NAME))
9794       checkMixedPrecision(L, ORE);
9795   }
9796 
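  // If follow-up metadata (LLVMLoopVectorizeFollowupAll /
  // LLVMLoopVectorizeFollowupEpilogue) was attached to the original loop ID,
  // transfer it to the remainder loop; otherwise mark the remainder loop as
  // already vectorized and, if required, disable its runtime unrolling.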
9797   Optional<MDNode *> RemainderLoopID =
9798       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
9799                                       LLVMLoopVectorizeFollowupEpilogue});
9800   if (RemainderLoopID.hasValue()) {
9801     L->setLoopID(RemainderLoopID.getValue());
9802   } else {
9803     if (DisableRuntimeUnroll)
9804       AddRuntimeUnrollDisableMetaData(L);
9805 
9806     // Mark the loop as already vectorized to avoid vectorizing again.
9807     Hints.setAlreadyVectorized();
9808   }
9809 
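  // In assert-enabled builds, check that we have not broken the containing
  // function.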
9810   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
9811   return true;
9812 }
9813 
9814 LoopVectorizeResult LoopVectorizePass::runImpl(
9815     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
9816     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
9817     DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
9818     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
9819     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
9820   SE = &SE_;
9821   LI = &LI_;
9822   TTI = &TTI_;
9823   DT = &DT_;
9824   BFI = &BFI_;
9825   TLI = TLI_;
9826   AA = &AA_;
9827   AC = &AC_;
9828   GetLAA = &GetLAA_;
9829   DB = &DB_;
9830   ORE = &ORE_;
9831   PSI = PSI_;
9832 
9833   // Don't attempt if
9834   // 1. the target claims to have no vector registers, and
9835   // 2. interleaving won't help ILP.
9836   //
9837   // The second condition is necessary because, even if the target has no
9838   // vector registers, loop vectorization may still enable scalar
9839   // interleaving.
9840   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
9841       TTI->getMaxInterleaveFactor(1) < 2)
9842     return LoopVectorizeResult(false, false);
9843 
9844   bool Changed = false, CFGChanged = false;
9845 
9846   // The vectorizer requires loops to be in simplified form.
9847   // Since simplification may add new inner loops, it has to run before the
9848   // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
9850   // vectorized.
9851   for (auto &L : *LI)
9852     Changed |= CFGChanged |=
9853         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
9854 
9855   // Build up a worklist of inner-loops to vectorize. This is necessary as
9856   // the act of vectorizing or partially unrolling a loop creates new loops
9857   // and can invalidate iterators across the loops.
9858   SmallVector<Loop *, 8> Worklist;
9859 
9860   for (Loop *L : *LI)
9861     collectSupportedLoops(*L, LI, ORE, Worklist);
9862 
9863   LoopsAnalyzed += Worklist.size();
9864 
9865   // Now walk the identified inner loops.
9866   while (!Worklist.empty()) {
9867     Loop *L = Worklist.pop_back_val();
9868 
9869     // For the inner loops we actually process, form LCSSA to simplify the
9870     // transform.
9871     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
9872 
9873     Changed |= CFGChanged |= processLoop(L);
9874   }
9875 
  // Report whether anything was changed and whether the CFG was modified.
9877   return LoopVectorizeResult(Changed, CFGChanged);
9878 }
9879 
9880 PreservedAnalyses LoopVectorizePass::run(Function &F,
9881                                          FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
                        : nullptr;
9895 
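  // Build a callback that lazily computes the LoopAccessInfo for a given loop
  // through the inner LoopAnalysisManager.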
  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA,  AC,  DT,      LI,  SE,
                                      TLI, TTI, nullptr, MSSA};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve LoopInfo/DominatorTree analyses with outer
  // loop vectorization. Until this is addressed, mark these analyses as
  // preserved only for the non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for the VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  PA.preserve<BasicAA>();
  PA.preserve<GlobalsAA>();
  if (!Result.MadeCFGChange)
    PA.preserveSet<CFGAnalyses>();
  return PA;
}
9926